In [2]:
import zipfile
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import datetime
from dateutil import tz
# NOTE(review): hard-coded absolute path to one developer's machine. The relative paths
# used later in the notebook (e.g. 'data/interim/csv_flight.zip') are resolved against
# this directory, so the notebook cannot run elsewhere without editing this line.
os.chdir('/Users/a.kholodov/Documents/02. Personal/20. Education/50. Universities/Springboard/Springboard_git/Springboard _repo/CS2-flights-delay-REPO')

# pd.set_option('display.max_rows', 130)
# pd.set_option('display.width', 200)

# Time zones of airports keyed by IATA code.
# The remote tzmap file is tab-separated and has no header row: column 0 (used as the
# index) holds the IATA code, column 1 the time zone. Only the resulting lookup
# dictionary {IATA code: time zone} is kept in memory.
IATAtz = pd.read_csv(
    'https://raw.githubusercontent.com/hroptatyr/dateutils/tzmaps/iata.tzmap',
    sep='\t',
    header=None,
    index_col=0,
)[1].to_dict()

In [3]:
# Columns to read from the flight CSV files and the dtype used to parse each of them.
data_types = dict(
    FlightDate='str',
    Flight_Number_Reporting_Airline=np.int16,
    # airports
    Origin='category',
    Dest='category',
    # departure
    CRSDepTime=np.int16,
    DepTime=np.float32,
    DepDelay=np.float32,
    # arrival
    CRSArrTime=np.int16,
    ArrTime=np.float32,
    ArrDelay=np.float32,
    # boolean flags stored as numbers
    Cancelled=np.int8,
    Diverted=np.int8,
    # durations
    CRSElapsedTime=np.float32,
    ActualElapsedTime=np.float32,
    AirTime=np.float32,
)

In [4]:
def load_data_from(zip_file, data_file, field_type=None):
    '''
    Read one csv-file stored inside a zip archive into a DataFrame.

    zip_file   - path and name of the source zip-file containing the csv files
    data_file  - path and name of the csv-file inside the archive
    field_type - dictionary {column name: dtype}. When given, only these columns
                 are loaded and they are parsed with the listed dtypes. When None,
                 every column is loaded and pandas infers the dtypes.

    Returns the DataFrame with 'FlightDate' converted to datetime and the flag
    columns 'Cancelled', 'Diverted' and 'DivReachedDest' converted to bool.
    Each conversion is applied only if the column is present; missing
    'DivReachedDest' values are treated as False.
    '''
    # An explicit mapping restricts both the columns read and their dtypes
    if field_type is not None:
        read_options = {'usecols': list(field_type), 'dtype': field_type}
    else:
        read_options = {'low_memory': False}

    # reading the file
    with zipfile.ZipFile(zip_file) as zip_source:
        with zip_source.open(data_file) as file:
            df = pd.read_csv(file, header=0, **read_options)

    # Converting dates and boolean
    if 'FlightDate' in df.columns:
        df['FlightDate'] = pd.to_datetime(df['FlightDate'])
    if 'DivReachedDest' in df.columns:
        # NaN would turn into True under astype('bool'), so replace it with 0 first
        df['DivReachedDest'] = df['DivReachedDest'].fillna(0)
    for flag in ('Cancelled', 'Diverted', 'DivReachedDest'):
        if flag in df.columns:
            df[flag] = df[flag].astype('bool')
    return df

In [5]:
# Frequently used groups of columns for the analysis
Date_details = ['FlightDate']
Route_datails = ['Origin', 'Dest']
ElapsedTime_details = ['CRSElapsedTime', 'ActualElapsedTime']

# Scheduled (CRS) / actual clock times as loaded from the source
DepTime_details = ['CRSDepTime', 'DepTime']
ArrTime_details = ['CRSArrTime', 'ArrTime']

# The same clock times after conversion to minutes (columns with the '_min' suffix)
DepTime_min_details = [name + '_min' for name in DepTime_details]
ArrTime_min_details = [name + '_min' for name in ArrTime_details]

# Scheduled departure, arrival and duration, in both representations
CRS_details = [DepTime_details[0], ArrTime_details[0], ElapsedTime_details[0]]
CRS_min_details = [DepTime_min_details[0], ArrTime_min_details[0], ElapsedTime_details[0]]

In [6]:
# Load a single report file from the archive to experiment with
source_zip = 'data/interim/csv_flight.zip'
source_path = 'csv_flight/report_'

flights = load_data_from(
    zip_file=source_zip,
    data_file=source_path + '2014_3.csv',
    field_type=data_types,
)

In [None]:
# Overview of the loaded frame: columns, dtypes, non-null counts and memory usage
flights.info()

In [8]:
# Boolean masks: flights from SEA to JFK, and flights in the opposite direction
SEA_JFK = flights['Origin'].eq('SEA') & flights['Dest'].eq('JFK')
JFK_SEA = flights['Origin'].eq('JFK') & flights['Dest'].eq('SEA')

In [None]:
# Date and scheduled (CRS) times of the first ten SEA -> JFK flights
print(flights.loc[SEA_JFK, Date_details + CRS_details].head(10))

# The mask is not used after this cell
del SEA_JFK

In [None]:
# Date and scheduled (CRS) times of the first ten JFK -> SEA flights
print(flights.loc[JFK_SEA, Date_details + CRS_details].head(10))

# The mask is not used after this cell
del JFK_SEA

In [None]:
# Mask: scheduled (CRS) arrival time is less than or equal to scheduled departure time
CRSArrTime_less_than_CRSDepTime = flights['CRSArrTime'].le(flights['CRSDepTime'])

# Mask: scheduled (CRS) duration shorter than 60 minutes
less_than_60_min = flights['CRSElapsedTime'].lt(60)

# Short flights that are scheduled to arrive "earlier" than they depart:
# used to evaluate whether it is possible to 'travel in time', i.e. to arrive
# the same day at an earlier clock time
print(
    flights.loc[CRSArrTime_less_than_CRSDepTime & less_than_60_min, Date_details + CRS_details]
    .sort_values('CRSElapsedTime')
    .head()
)

del CRSArrTime_less_than_CRSDepTime

### Examining the CRS Elapsed time

In [12]:
# [TRANSFORMATION]
# Convert arrival and departure times (CRS and actual) from HHMM numbers to minutes:
# hours * 60 + minutes. Each result goes to a new column with the '_min' suffix.
for _hhmm_col in ('CRSArrTime', 'CRSDepTime', 'ArrTime', 'DepTime'):
    flights[_hhmm_col + '_min'] = flights[_hhmm_col] // 100 * 60 + flights[_hhmm_col] % 100

In [None]:
# Mask on times expressed in MINUTES: scheduled arrival is less than or equal to
# scheduled departure
CRSArr_less_than_CRSDep_min = flights['CRSArrTime_min'].le(flights['CRSDepTime_min'])

# Flights shorter than 60 minutes that are scheduled to arrive "earlier" than they
# depart, shortest first - the START of the table
flights.loc[CRSArr_less_than_CRSDep_min & less_than_60_min, Date_details + CRS_min_details] \
    .sort_values('CRSElapsedTime') \
    .head()

In [None]:
# The same selection, shortest first - the END of the table
flights.loc[CRSArr_less_than_CRSDep_min & less_than_60_min, Date_details + CRS_min_details] \
    .sort_values('CRSElapsedTime') \
    .tail()

In [15]:
# [TRANSFORMATION]
# Scheduled arrival time minus scheduled departure time, both in minutes
flights['CRS_Arr_minus_Dep'] = flights['CRSArrTime_min'].sub(flights['CRSDepTime_min'])

In [None]:
# Distribution of the difference between CRS arrival and departure times,
# bin edges every 100 minutes starting at -1440
bins = np.arange(start=-1440, stop=1440, step=100)
plt.hist(x=flights['CRS_Arr_minus_Dep'], bins=bins, density=True)
plt.show()

In [None]:
# Closer look at the negative differences: bin edges every 60 minutes starting at -540,
# with one tick per bin edge
bins = np.arange(start=-540, stop=60, step=60)
plt.hist(x=flights['CRS_Arr_minus_Dep'], bins=bins, density=True)
plt.xticks(ticks=bins)
plt.show()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the difference
# between CRS arrival and departure times, in minutes
flights['CRS_Arr_minus_Dep'].describe()

In [None]:
# Are there any differences strictly between -480 and -60 minutes?
diff_between_480_60 = flights['CRS_Arr_minus_Dep'].gt(-480) & flights['CRS_Arr_minus_Dep'].lt(-60)
print(flights.loc[diff_between_480_60])

del diff_between_480_60

It is possible to ‘travel in time’ by at most 60 minutes. All larger negative differences occur when a flight takes off before midnight and lands the next day. I can therefore use a threshold of -60 minutes: when the arrival time is at least 60 minutes earlier than the departure time, the arrival date is the departure date plus one day.

In [20]:
# [TRANSFORMATION]
# Creating two scheduled (CRS) datetime fields, timezone-naive at this point:
#   CRSDepDT_tz - flight date + scheduled departure time
#   CRSArrDT_tz - scheduled arrival datetime. Its date is the flight date moved
#                 one day forward when arrival minus departure is -60 minutes or less,
#                 one day back when arrival minus departure is 1380 minutes or more
flights['CRSDepDT_tz'] = pd.to_datetime(flights['FlightDate']) \
                         + pd.to_timedelta(flights['CRSDepTime_min'], unit='min')

# Masks for scheduled flights arriving on the next or on the previous day
CRS_Arrived_next_day = flights['CRSArrTime_min'] - flights['CRSDepTime_min'] <= -60
CRS_Arrived_previous_day = flights['CRSArrTime_min'] - flights['CRSDepTime_min'] >= 1380

# Arrival datetime = flight date, shifted by +1 / -1 / 0 days according to the masks
# (a row can satisfy at most one of them), plus the scheduled arrival time
flights['CRSArrDT_tz'] = flights['FlightDate'] \
    + pd.to_timedelta(CRS_Arrived_next_day.astype('int8')
                      - CRS_Arrived_previous_day.astype('int8'), unit='D') \
    + pd.to_timedelta(flights['CRSArrTime_min'], unit='min')

In [None]:
# Check the scheduled departure and arrival datetimes
_crs_check_fields = ['FlightDate', 'CRSDepTime', 'CRSDepDT_tz', 'DepDelay', 'CRSArrDT_tz', 'CRSArrTime']
print('Flights arrived next day after departure\n',
      flights.loc[CRS_Arrived_next_day, _crs_check_fields].head())
print('Flights arrived the same day as departured\n',
      flights.loc[~CRS_Arrived_next_day, _crs_check_fields].head())

In [22]:
# [TRANSFORMATION]

# Creating two fields (timezone-naive at this point):
#   ActDepDT_tz - actual departure datetime
#   ActArrDT_tz - actual arrival datetime; its date is moved one day forward when
#                 ArrTime_min - DepTime_min is -60 or less, and one day back when
#                 it is 1380 or more

# Filters for flights having DepTime_min and ArrTime_min
# These filters are effectively for not cancelled flights, but they differ from each other
# because some flights are cancelled after departure: they have a departure time but didn't fly
DepTime_not_NA = ~flights['DepTime_min'].isna()
ArrTime_not_NA = ~flights['ArrTime_min'].isna()

# Series with the 'day shift' caused by the departure delay:
# floor((scheduled departure minute + departure delay) / 1440) days.
# NOTICE: some flights have a -1 day shift because they had a small negative delay
# while having a scheduled departure time several minutes after midnight
# NOTE(review): if DepTime can be 2400, DepTime_min is 1440 and the day would be
# counted twice (once here, once via DepTime_min) - TODO confirm against the data
day_deltas_due_to_delay = pd.to_timedelta((flights['CRSDepTime_min'] + flights['DepDelay']) // 1440, 'd')

# Calculating actual departure datetime: flight date + day shift + actual departure time
flights.loc[DepTime_not_NA, 'ActDepDT_tz'] = flights[DepTime_not_NA]['FlightDate'] \
                                            + day_deltas_due_to_delay[DepTime_not_NA] \
                                            + pd.to_timedelta(flights[DepTime_not_NA]['DepTime_min'], 'm')


# Calculating actual arrival datetime

# Filter for flights arrived next or previous day (relative to the actual departure day).
# Rows with a missing time compare as False in both masks
Arrived_next_day = flights['ArrTime_min'] - flights['DepTime_min']  <= -60
Arrived_previous_day = flights['ArrTime_min'] - flights['DepTime_min']  >= 1380

# For all arrived flights the arrival date at first equals the (delay-shifted) departure date
flights.loc[ArrTime_not_NA, 'ActArrDT_tz'] = flights[ArrTime_not_NA]['FlightDate'] \
                                            + day_deltas_due_to_delay[ArrTime_not_NA] 
# Adding one day to the date if flight arrived next day
flights.loc[ArrTime_not_NA & Arrived_next_day, 'ActArrDT_tz'] = flights.loc[ArrTime_not_NA & Arrived_next_day, 'ActArrDT_tz'] \
                                                            + datetime.timedelta(1)
# Subtracting one day from the date if flight arrived the day before departure
flights.loc[ArrTime_not_NA & Arrived_previous_day, 'ActArrDT_tz'] = flights.loc[ArrTime_not_NA & Arrived_previous_day, 'ActArrDT_tz'] \
                                                            + datetime.timedelta(-1)
# Finally adding actual arrival time to get actual arrival datetime
flights.loc[ArrTime_not_NA, 'ActArrDT_tz'] = flights.loc[ArrTime_not_NA, 'ActArrDT_tz'] \
                                                            + pd.to_timedelta(flights.loc[ArrTime_not_NA, 'ArrTime_min'], 'm')

In [None]:
# Check the actual departure and arrival datetimes
check_fields = ['FlightDate', 'DepTime', 'ActDepDT_tz', 'DepDelay', 'ActArrDT_tz', 'ArrTime']
print('Flights arrived next day after departure\n',
      flights.loc[Arrived_next_day, check_fields].head())
print('Flights arrived the same day as departured\n',
      flights.loc[~Arrived_next_day, check_fields].head())

In [24]:
# [TRANSFORMATION]
# Function to add to the series of datetime a timezone specified by airports IATA codes
def add_local_tz(df, dt_field, IATA_code_field):
    '''
    Attach to every naive datetime the time zone of the airport in the same row.

    df              - DataFrame holding both columns
    dt_field        - name of the column with timezone-naive datetimes
    IATA_code_field - name of the column with airport IATA codes; each code is
                      looked up in the IATAtz dictionary

    Returns a list with one localized value per row, in row order.
    Local times that do not exist in the target zone are shifted forward
    (nonexistent='shift_forward').
    Raises KeyError if an airport code is not present in IATAtz.
    '''
    # Walk over the two needed columns only: DataFrame.iterrows() would build a
    # complete Series for every row, which is very slow on a large frame
    return [stamp.tz_localize(tz=tz.gettz(IATAtz[code]), nonexistent='shift_forward')
            for stamp, code in zip(df[dt_field], df[IATA_code_field])]

In [25]:
# [TRANSFORMATION]
# Adding local timezones to the departure and arrival datetimes: departures get the
# time zone of the origin airport, arrivals the time zone of the destination airport.
# The columns are overwritten in place.
# NOTE(review): presumably this cell cannot be re-run on already localized columns -
# re-run the cells that build the naive datetimes first.
for _dt_col, _airport_col in (('CRSDepDT_tz', 'Origin'),
                              ('CRSArrDT_tz', 'Dest'),
                              ('ActDepDT_tz', 'Origin'),
                              ('ActArrDT_tz', 'Dest')):
    flights[_dt_col] = add_local_tz(flights, _dt_col, _airport_col)

In [None]:
# Check the localized CRS fields: first for flights whose scheduled arrival time (min)
# is not later than the scheduled departure time (min), then for all the others
check_fields_CRS = ['FlightDate', 'CRSDepDT_tz', 'CRSDepTime', 'CRSArrDT_tz', 'CRSArrTime', 'CRSElapsedTime']
print(flights.loc[CRSArr_less_than_CRSDep_min, check_fields_CRS].head())
print(flights.loc[~CRSArr_less_than_CRSDep_min, check_fields_CRS].head())

In [None]:
# Check the localized ACTUAL fields: first for flights that arrived the next day,
# then for all the others
check_fields_CRS = ['FlightDate', 'ActDepDT_tz', 'DepTime', 'ActArrDT_tz', 'ArrTime', 'ActualElapsedTime']
print(flights.loc[Arrived_next_day, check_fields_CRS].head())
print(flights.loc[~Arrived_next_day, check_fields_CRS].head())

In [28]:
# Function to convert datetime with localized timezone to UTC timezone
def convert_to_UTC(df, dt_field):
    '''
    Convert a column of timezone-aware datetimes to UTC.

    df       - DataFrame holding the column
    dt_field - name of the column with timezone-aware datetimes

    Returns the result of pd.to_datetime on the converted values, one entry
    per row in row order.
    NOTE(review): missing values (NaN/NaT) are presumably not supported - the
    notebook filters them out before calling this function.
    '''
    # Only one column is needed, so iterate over it directly instead of using
    # DataFrame.iterrows(), which builds a complete Series for every row
    return pd.to_datetime([stamp.astimezone(tz.UTC) for stamp in df[dt_field]])

In [29]:
# Scheduled (CRS) departure and arrival datetimes converted to UTC
flights['CRSDep_UTC'] = convert_to_UTC(flights, 'CRSDepDT_tz')
flights['CRSArr_UTC'] = convert_to_UTC(flights, 'CRSArrDT_tz')

# Actual departure: only rows that have a value are converted, because astimezone
# doesn't work with NaN/NaT
nonNA_Dep = flights['ActDepDT_tz'].notna()
flights.loc[nonNA_Dep, 'ActDep_UTC'] = convert_to_UTC(flights.loc[nonNA_Dep], 'ActDepDT_tz')

# Actual arrival: same restriction to rows that have a value
nonNA_Arr = flights['ActArrDT_tz'].notna()
flights.loc[nonNA_Arr, 'ActArr_UTC'] = convert_to_UTC(flights.loc[nonNA_Arr], 'ActArrDT_tz')

In [None]:
# Check CRS: localized datetimes next to their UTC counterparts
fields_to_check_UTC_CRS = ['CRSDepDT_tz', 'CRSDep_UTC', 'CRSArrDT_tz', 'CRSArr_UTC']
print(flights.loc[:, fields_to_check_UTC_CRS].head())

# Check ACTUAL: localized datetimes next to their UTC counterparts
fields_to_check_UTC_Act = ['ActDepDT_tz', 'ActDep_UTC', 'ActArrDT_tz', 'ActArr_UTC']
print(flights.loc[:, fields_to_check_UTC_Act].head())

In [31]:
# [TRANSFORMATION]
# Elapsed time in minutes calculated from the UTC datetimes (arrival minus departure)
flights['UTCElapsedTime_CRS'] = flights['CRSArr_UTC'].sub(flights['CRSDep_UTC']) \
                                                     .dt.total_seconds().div(60)

# Actual elapsed time: only for rows that have an actual arrival datetime
flights.loc[nonNA_Arr, 'UTCElapsedTime_Act'] = pd.to_timedelta(
    flights.loc[nonNA_Arr, 'ActArr_UTC'] - flights.loc[nonNA_Arr, 'ActDep_UTC']
).dt.total_seconds() / 60



In [32]:
# [TRANSFORMATION]
# UTC-based elapsed time minus the elapsed time reported in the dataset,
# for scheduled (CRS) and for actual times
flights['diff_CRS'] = flights['UTCElapsedTime_CRS'].sub(flights['CRSElapsedTime'])
flights['diff_Act'] = flights['UTCElapsedTime_Act'].sub(flights['ActualElapsedTime'])

In [None]:
# Examine the flights having a difference between UTC-based and CRS elapsed times
diff_not_zero = flights.loc[flights['diff_CRS'] != 0,
                            fields_to_check_UTC_CRS + ['CRSElapsedTime', 'UTCElapsedTime_CRS']]
print('There are', len(diff_not_zero), 'flights where CRS Elapsed time differ from CRS Elapsed time culculated based on UTC')
# Printed explicitly: a bare expression is rendered only when it is the LAST
# statement of a cell, and here `del` follows it
print(diff_not_zero.head())

del diff_not_zero

In [None]:
# How many flights have each distinct value of the scheduled elapsed-time difference
flights['diff_CRS'].value_counts()

There are 76 flights in the dataset where the UTC-based and CRS elapsed times differ.


Let’s take a detailed look at one of these flights: the first one from the table above, with ID = 86022, which has a -60 minute difference between UTC and CRS elapsed times.

In [None]:
# Let's take a look at the first flight with a difference
# NOTE(review): .iloc selects by POSITION; it matches the ID shown in the table only
# while the frame keeps its default 0..n-1 index - confirm, or use .loc for labels
flights.iloc[86022]

First, we see that this flight took place during the spring 2014 DST, and the difference between UTC and CRS elapsed times is exactly 60 minutes. But which time is correct? Let’s take a look at this flight on other days.

In [None]:
# All flights with number 103 departing from SEA: scheduled arrival and elapsed times
flights.loc[flights['Flight_Number_Reporting_Airline'].eq(103) & flights['Origin'].eq('SEA'),
            ['CRSArrDT_tz', 'CRSElapsedTime', 'UTCElapsedTime_CRS', 'diff_CRS']]

It seems that the only flight with a difference is the one that landed on 9 March 2014, and that the UTC-based elapsed time is the correct one (CRSElapsedTime has the wrong value on the DST transition day). Let's examine the other flights with a difference of 60 minutes.

In [None]:
# Scheduled flights whose UTC-based elapsed time is exactly 60 minutes shorter
# than the reported CRSElapsedTime
diff_60_min = flights.loc[flights['diff_CRS'] == -60,
                          ['CRSDepDT_tz', 'CRSArrDT_tz', 'CRSElapsedTime', 'UTCElapsedTime_CRS', 'diff_CRS']]
print(len(diff_60_min), 'records')
# Printed explicitly: a bare expression followed by `del` would never be rendered
print(diff_60_min)

del diff_60_min

We can see that all flights with a -60 minute difference between UTC and CRS elapsed times took place during the spring transition to DST. There are 13 such records in the dataset.

We can conclude that the CRS elapsed time is incorrect for all 13 flights that spanned the transition to DST.

This explains the discrepancy between UTC and CRS elapsed times for 13 out of the 76 flights in the dataset. But what about the remaining flights with a similar difference?

In [None]:
# Summary statistics of the UTC-based scheduled elapsed time, in minutes
flights['UTCElapsedTime_CRS'].describe()

There is another interesting fact: some flights also have a negative UTC elapsed time! How is this possible?

In [None]:
# Flights with a negative UTC-based scheduled elapsed time
error_less_0 = flights.loc[flights['UTCElapsedTime_CRS'].lt(0)]
print(len(error_less_0))
error_less_0[['CRSDepDT_tz', 'Origin',
              'CRSArrDT_tz', 'Dest',
              'CRSElapsedTime', 'UTCElapsedTime_CRS', 'diff_CRS']]


There are 31 flights with a negative UTC elapsed time. Previously, we analyzed the differences between UTC and CRS elapsed times and found that 31 flights had a -1440 minute difference. These are the same flights. A difference of 1440 minutes is equivalent to 24 hours.

All 31 flights were from HNL to GUM, which is interesting because HNL is in the UTC-10 timezone, while GUM is in the UTC+10 timezone. These two airports are located across the International Date Line (IDL).

In [40]:
# UTC and local datetimes of the flights with a negative UTC-based elapsed time.
# Printed explicitly: a bare expression followed by `del` would never be rendered
print(error_less_0[['CRSDep_UTC', 'CRSDepDT_tz', 'Origin', 'CRSArr_UTC', 'CRSArrDT_tz', 'Dest', 'diff_CRS']].head())

del error_less_0

The easiest way to correct this is to find all flights whose UTC arrival is earlier than their UTC departure (i.e. with a negative UTC elapsed time) and add 24 x 60 = 1440 minutes to the UTC elapsed time.

There are other flights which have the "opposite" difference: they differ by +1440 minutes.

In [None]:
# Scheduled flights whose UTC-based elapsed time is 1440 minutes (24 hours) longer
# than the reported CRSElapsedTime
error_plus_1440 = flights[flights['diff_CRS'] == 1440]
print(len(error_plus_1440))
# Printed explicitly: a bare expression followed by `del` would never be rendered
print(error_plus_1440[['CRSDep_UTC', 'CRSDepDT_tz', 'Origin', 'CRSArr_UTC', 'CRSArrDT_tz', 'Dest', 'diff_CRS']])

del error_plus_1440

All these flights are return flights from GUM to HNL across the International Date Line (IDL). The remedy is similar: subtract 24 x 60 (1440) minutes from the UTC elapsed time.

There is just one flight left whose difference has not been examined yet.

In [None]:
# The flight(s) whose UTC-based scheduled elapsed time is 36 minutes shorter than
# the reported CRSElapsedTime
error_minus_36 = flights[flights['diff_CRS'] == -36]
print(len(error_minus_36), 'records')
# Transposed so that the fields are listed one per line.
# Printed explicitly: a bare expression followed by `del` would never be rendered
print(error_minus_36[['Flight_Number_Reporting_Airline', 'CRSDep_UTC', 'CRSDepDT_tz', 'Origin', 'CRSArr_UTC',
                      'CRSArrDT_tz', 'Dest', 'CRSElapsedTime', 'UTCElapsedTime_CRS',
                      'ActualElapsedTime', 'diff_CRS', 'Cancelled', 'Diverted']].T)

del error_minus_36

In [None]:
# All flights with number 317 from DEN to PSP: reported vs UTC-based scheduled elapsed time
flights.loc[flights['Flight_Number_Reporting_Airline'].eq(317)
            & flights['Origin'].eq('DEN')
            & flights['Dest'].eq('PSP'),
            ['CRSElapsedTime', 'UTCElapsedTime_CRS', 'diff_CRS']]

In this case we see that:
1. the CRS elapsed time of this flight is also inconsistent with its own CRS departure and CRS arrival times
2. all other flights on this route have equal CRS and UTC elapsed times (129 minutes)

So this is most likely an error in the data.

### Checking ACTUAL time

In [None]:
# How many flights have each distinct value of the actual elapsed-time difference
flights['diff_Act'].value_counts()

It appears that the actual elapsed time shows the same differences as the scheduled elapsed time between the two types of flight duration: one calculated using UTC arrival and departure times, and the other based on the ActualElapsedTime provided in the dataset. Let’s take a closer look.

The total number of discrepancies is 78, with the following breakdown:
*	+/-1440 minutes: 60 records, likely related to flights crossing the International Date Line (IDL) from HNL to GUM and back.
*	-60 minutes: 15 records, possibly due to flights occurring during the transition to Daylight Saving Time (DST) in the spring.
*	Other: 3 records with small, random differences, which could be attributed to minor errors.

In [None]:
# Flights whose UTC-based actual elapsed time differs from the reported one by
# exactly 1440 minutes in either direction
error_1440 = flights[flights['diff_Act'].isin([1440, -1440])]
print('There are:', len(error_1440), 'records with differenc +/-1440 min (equal to 24 hours)')
# Airports on EITHER end of these flights: union of origins and destinations
# (sorted so that the output is stable between runs)
print('Which airports are presented among these flights:',
      *sorted(set(error_1440['Origin']).union(error_1440['Dest'])))

del error_1440

All these flights are again between two airports on opposite sides of the IDL. We already know this problem and how to fix it.

In [46]:
# Flights whose UTC-based actual elapsed time is exactly 60 minutes shorter than the
# reported ActualElapsedTime.
# .copy() makes this an independent frame, so that columns can be added to it later
# without a SettingWithCopyWarning
error_60 = flights.loc[flights['diff_Act'] == -60,
                       ['Flight_Number_Reporting_Airline', 'ActDepDT_tz', 'ActArrDT_tz',
                        'ActDep_UTC', 'ActArr_UTC', 'Origin', 'Dest',
                        'ActualElapsedTime', 'UTCElapsedTime_Act']].copy()

In [None]:
# Reference moments around 02:00 on 2014-03-09: one second before it for the origin,
# one second after it for the destination; each is localized to the airport's time zone
# by add_local_tz (which shifts non-existent local times forward)
error_60['DST_Origin'] = pd.Timestamp("2014-03-09 01:59:59")
error_60['DST_Dest'] = pd.Timestamp("2014-03-09 02:00:01")
error_60['DST_Origin'] = add_local_tz(error_60, 'DST_Origin', 'Origin')
error_60['DST_Dest'] = add_local_tz(error_60, 'DST_Dest', 'Dest')

# Did every flight depart before the reference moment at its origin?
print(error_60['ActDepDT_tz'].lt(error_60['DST_Origin']).all())
# Flights that did NOT arrive after the reference moment at their destination
print(error_60.loc[~error_60['ActArrDT_tz'].gt(error_60['DST_Dest'])].T)

All these flights departed before the transition to DST and arrived after it. This is the reason they have a wrong ActualElapsedTime. It is worth mentioning that their arrival time is correct.


In [None]:
# Remaining discrepancies of the actual elapsed time: non-zero differences
# strictly between -60 and 10 minutes
other = flights.loc[flights['diff_Act'].gt(-60)
                    & flights['diff_Act'].lt(10)
                    & flights['diff_Act'].ne(0)]
print(len(other))
# Transposed so that the fields are listed one per line
other[['Flight_Number_Reporting_Airline',
       'ActDepDT_tz', 'ActArrDT_tz',
       'Origin', 'Dest',
       'ActDep_UTC', 'ActArr_UTC',
       'diff_Act', 'ActualElapsedTime', 'UTCElapsedTime_Act']].T

First of all, we notice that all these differences appear again on the spring DST start date.
Secondly, it looks like the first two flights have minor errors that we can disregard, and we can use UTC elapsed time instead of Actual elapsed time. However, the third flight has a significant difference. Let’s examine it in more detail.

In [None]:
# All flights with number 5702 from ORD to CAE: actual datetimes and elapsed times
flights.loc[flights['Flight_Number_Reporting_Airline'].eq(5702)
            & flights['Origin'].eq('ORD')
            & flights['Dest'].eq('CAE'),
            ['Flight_Number_Reporting_Airline',
             'ActDepDT_tz', 'ActArrDT_tz',
             'Origin', 'Dest',
             'ActDep_UTC', 'ActArr_UTC',
             'diff_Act', 'ActualElapsedTime', 'UTCElapsedTime_Act']]

We observe that the same flight on other days has a duration close to the actual elapsed time recorded on March 9th. The elapsed time based on UTC, however, is significantly different from the real duration, which suggests that either the actual departure or arrival time (or both) is incorrect. There are several options to address this, such as adjusting the departure or arrival time based on the average duration of similar flights. However, because this error is insignificant (1 out of almost 500k flights in total, and 1 of 25 flights on this route in the same month), the simplest solution is to drop this record in the next stage.