**Clean Flights 2019**
<br/>Perform cleaning on the 2019 dataframe and then export a clean csv.

In [1]:
import numpy as np
import pandas as pd

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# increase display size to enable viewing of all data columns

pd.options.display.max_columns = 35

In [4]:
# read in the 2019 flights csv and ensure everything loads correctly

flights2019_df = pd.read_csv('flights2019.csv', index_col=0)
flights2019_df

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay
0,4,2019-08-01,DL,N354NW,ATL,"Atlanta, GA",GA,DFW,"Dallas/Fort Worth, TX",TX,1954,2025.0,31.0,1.0,2.0,17.0,2042.0,0,,731,3,1.0,0.0,0.0,0.0,16.0,2114,2131.0,17.0
1,4,2019-08-01,DL,N320US,DFW,"Dallas/Fort Worth, TX",TX,ATL,"Atlanta, GA",GA,1709,1706.0,-3.0,0.0,-1.0,19.0,1725.0,0,,731,3,,,,,,2024,2012.0,-12.0
2,4,2019-08-01,DL,N931DN,IAH,"Houston, TX",TX,ATL,"Atlanta, GA",GA,1749,1829.0,40.0,1.0,2.0,12.0,1841.0,0,,689,3,17.0,0.0,0.0,0.0,20.0,2102,2139.0,37.0
3,4,2019-08-01,DL,N851DN,PDX,"Portland, OR",OR,SLC,"Salt Lake City, UT",UT,1310,1306.0,-4.0,0.0,-1.0,13.0,1319.0,0,,630,3,,,,,,1601,1558.0,-3.0
4,4,2019-08-01,DL,N775DE,SLC,"Salt Lake City, UT",UT,PDX,"Portland, OR",OR,829,820.0,-9.0,0.0,-1.0,16.0,836.0,0,,630,3,,,,,,926,911.0,-15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7422032,5,2019-11-08,DL,N978AT,HOU,"Houston, TX",TX,ATL,"Atlanta, GA",GA,1324,1320.0,-4.0,0.0,-1.0,14.0,1334.0,0,,696,3,,,,,,1624,1605.0,-19.0
7422033,5,2019-11-08,DL,N6716C,CHS,"Charleston, SC",SC,ATL,"Atlanta, GA",GA,700,719.0,19.0,1.0,1.0,10.0,729.0,0,,259,2,,,,,,817,820.0,3.0
7422034,5,2019-11-08,DL,N851DN,MSP,"Minneapolis, MN",MN,BWI,"Baltimore, MD",MD,1800,1754.0,-6.0,0.0,-1.0,22.0,1816.0,0,,936,4,,,,,,2130,2116.0,-14.0
7422035,5,2019-11-08,DL,N352NB,GEG,"Spokane, WA",WA,SLC,"Salt Lake City, UT",UT,1015,1009.0,-6.0,0.0,-1.0,11.0,1020.0,0,,546,3,,,,,,1258,1237.0,-21.0


In [5]:
# summary statistics for the dataframe

flights2019_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
DayOfWeek,7422037.0,3.937155,1.995814,1.0,2.0,4.0,6.0,7.0
CRSDepTime,7422037.0,1330.258507,492.985466,1.0,913.0,1321.0,1736.0,2359.0
DepTime,7291951.0,1334.606311,507.199818,1.0,914.0,1327.0,1746.0,2400.0
DepDelay,7291927.0,10.923267,48.95964,-82.0,-5.0,-2.0,7.0,2710.0
DepDel15,7291927.0,0.186804,0.389754,0.0,0.0,0.0,0.0,1.0
DepartureDelayGroups,7291927.0,0.080959,2.276868,-2.0,-1.0,-1.0,0.0,12.0
TaxiOut,7288060.0,17.389349,10.003982,1.0,11.0,15.0,20.0,227.0
WheelsOff,7288060.0,1358.26439,508.810907,1.0,930.0,1340.0,1801.0,2400.0
Cancelled,7422037.0,0.018179,0.133598,0.0,0.0,0.0,0.0,1.0
Distance,7422037.0,800.544232,592.511847,31.0,369.0,640.0,1034.0,5095.0


In [6]:
# check to see if there any NaN values in the data

flights2019_df.isna().sum()

DayOfWeek                     0
FlightDate                    0
Reporting_Airline             0
Tail_Number               17837
Origin                        0
OriginCityName                0
OriginState                   0
Dest                          0
DestCityName                  0
DestState                     0
CRSDepTime                    0
DepTime                  130086
DepDelay                 130110
DepDel15                 130110
DepartureDelayGroups     130110
TaxiOut                  133977
WheelsOff                133977
Cancelled                     0
CancellationCode        7287112
Distance                      0
DistanceGroup                 0
CarrierDelay            6032784
WeatherDelay            6032784
NASDelay                6032784
SecurityDelay           6032784
LateAircraftDelay       6032784
CRSArrTime                    0
ArrTime                  137646
ArrDelay                 153805
dtype: int64

In [7]:
# list of all the columns in the data dataframe with missing values and the percent that's missing

round(np.mean(flights2019_df[flights2019_df.columns[flights2019_df.isnull().any()]].isna(), axis = 0) * 100,2)

Tail_Number              0.24
DepTime                  1.75
DepDelay                 1.75
DepDel15                 1.75
DepartureDelayGroups     1.75
TaxiOut                  1.81
WheelsOff                1.81
CancellationCode        98.18
CarrierDelay            81.28
WeatherDelay            81.28
NASDelay                81.28
SecurityDelay           81.28
LateAircraftDelay       81.28
ArrTime                  1.85
ArrDelay                 2.07
dtype: float64

In [8]:
# show flights that are missing the tail number and were cancelled

flights2019_df[(flights2019_df['Cancelled'] == 1) & (flights2019_df['Tail_Number'].isna())]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay
40202,6,2019-08-24,DL,,ORD,"Chicago, IL",IL,LGA,"New York, NY",NY,1145,,,,,,,1,A,733,3,,,,,,1507,,
93619,6,2019-08-03,YV,,ELP,"El Paso, TX",TX,PHX,"Phoenix, AZ",AZ,829,,,,,,,1,A,347,2,,,,,,859,,
93772,3,2019-08-07,YV,,MRY,"Monterey, CA",CA,PHX,"Phoenix, AZ",AZ,608,,,,,,,1,A,598,3,,,,,,801,,
94611,2,2019-08-20,YV,,DFW,"Dallas/Fort Worth, TX",TX,ICT,"Wichita, KS",KS,1228,,,,,,,1,A,328,2,,,,,,1348,,
94642,2,2019-08-20,YV,,ICT,"Wichita, KS",KS,DFW,"Dallas/Fort Worth, TX",TX,1423,,,,,,,1,A,328,2,,,,,,1549,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7212228,2,2019-11-26,YV,,DFW,"Dallas/Fort Worth, TX",TX,CRP,"Corpus Christi, TX",TX,1430,,,,,,,1,A,354,2,,,,,,1552,,
7212229,3,2019-11-27,YV,,DFW,"Dallas/Fort Worth, TX",TX,CRP,"Corpus Christi, TX",TX,1430,,,,,,,1,A,354,2,,,,,,1552,,
7221600,2,2019-11-05,YV,,JAN,"Jackson/Vicksburg, MS",MS,DFW,"Dallas/Fort Worth, TX",TX,1625,,,,,,,1,A,408,2,,,,,,1803,,
7222311,3,2019-11-06,YV,,FSD,"Sioux Falls, SD",SD,DFW,"Dallas/Fort Worth, TX",TX,700,,,,,,,1,A,737,3,,,,,,927,,


In [9]:
# fill in 'unknown' for the NaN tail numbers
# these are all flights that were cancelled, and we already have a small number of cancelled flights, so don't want to delete them

flights2019_df['Tail_Number'].fillna('unknown', inplace=True)
flights2019_df.shape

(7422037, 29)

In [10]:
# check to ensure that tail number was filled in appropriately

flights2019_df.isna().sum()

DayOfWeek                     0
FlightDate                    0
Reporting_Airline             0
Tail_Number                   0
Origin                        0
OriginCityName                0
OriginState                   0
Dest                          0
DestCityName                  0
DestState                     0
CRSDepTime                    0
DepTime                  130086
DepDelay                 130110
DepDel15                 130110
DepartureDelayGroups     130110
TaxiOut                  133977
WheelsOff                133977
Cancelled                     0
CancellationCode        7287112
Distance                      0
DistanceGroup                 0
CarrierDelay            6032784
WeatherDelay            6032784
NASDelay                6032784
SecurityDelay           6032784
LateAircraftDelay       6032784
CRSArrTime                    0
ArrTime                  137646
ArrDelay                 153805
dtype: int64

In [11]:
# check to see how many cancelled flights there were in 2019

flights2019_df[flights2019_df['Cancelled'] == 1].shape

(134925, 29)

134,925 cancelled flights, but 130,086 with no departure time -- there are 4,839 (see below) that need updating.

In [12]:
# look at the flights that are both cancelled and have a departure time

flights2019_df[(flights2019_df['Cancelled'] ==1) & (flights2019_df['DepTime'])]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay
49969,3,2019-08-07,DL,N358NB,DTW,"Detroit, MI",MI,PHL,"Philadelphia, PA",PA,1414,1408.0,-6.0,0.0,-1.0,19.0,1427.0,1,C,453,2,,,,,,1556,,
50470,3,2019-08-07,DL,N984DL,PNS,"Pensacola, FL",FL,ATL,"Atlanta, GA",GA,1559,1602.0,3.0,0.0,0.0,10.0,1612.0,1,A,271,2,,,,,,1820,,
64816,3,2019-08-28,DL,N685DA,JFK,"New York, NY",NY,STT,"Charlotte Amalie, VI",VI,900,903.0,3.0,0.0,0.0,84.0,1027.0,1,B,1623,7,,,,,,1307,,
64885,3,2019-08-28,DL,N551NW,ATL,"Atlanta, GA",GA,STT,"Charlotte Amalie, VI",VI,940,1012.0,32.0,1.0,2.0,21.0,1033.0,1,B,1599,7,,,,,,1325,,
64961,3,2019-08-28,DL,N690DL,ATL,"Atlanta, GA",GA,STT,"Charlotte Amalie, VI",VI,1107,1104.0,-3.0,0.0,-1.0,11.0,1115.0,1,B,1599,7,,,,,,1452,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7365137,1,2019-11-11,AA,N301PA,DFW,"Dallas/Fort Worth, TX",TX,DEN,"Denver, CO",CO,1900,1858.0,-2.0,0.0,-1.0,,,1,A,641,3,,,,,,2016,,
7368189,2,2019-11-12,AA,N198AA,DFW,"Dallas/Fort Worth, TX",TX,PHL,"Philadelphia, PA",PA,849,904.0,15.0,1.0,1.0,,,1,A,1303,6,,,,,,1256,,
7370399,5,2019-11-01,B6,N273JB,JFK,"New York, NY",NY,BTV,"Burlington, VT",VT,735,741.0,6.0,0.0,0.0,11.0,752.0,1,B,266,2,,,,,,850,,
7373537,2,2019-11-05,B6,N283JB,ORH,"Worcester, MA",MA,MCO,"Orlando, FL",FL,1608,2132.0,324.0,1.0,12.0,,,1,A,1091,5,,,,,,1913,,


These don't have arrival times, so it is safe to assume that they were actually cancelled, and should not have a departure time.

In [13]:
# change the departure time to 0 for all flights that are cancelled and have a depature time
# and check that there are now no flights with this combination

flights2019_df['DepTime'][(flights2019_df['Cancelled'] == 1) & (flights2019_df['DepTime'])] = 0
flights2019_df[(flights2019_df['Cancelled'] ==1) & (flights2019_df['DepTime'])]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay


In [14]:
# check for flights that have a depature time greater than 0, but no departure delay

flights2019_df[(flights2019_df['DepTime'] > 0 ) & (flights2019_df['DepDelay'].isna())]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay


In [15]:
# review remaining flights with NaN values in the DepDelay column

flights2019_df[(flights2019_df['DepDelay'].isna())]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay
893,4,2019-08-01,DL,N103DU,EWR,"Newark, NJ",NJ,DTW,"Detroit, MI",MI,600,,,,,,,1,A,488,2,,,,,,750,,
1126,4,2019-08-01,DL,N334DN,LGA,"New York, NY",NY,MIA,"Miami, FL",FL,700,,,,,,,1,A,1096,5,,,,,,1007,,
1289,4,2019-08-01,DL,N382DN,LGA,"New York, NY",NY,ATL,"Atlanta, GA",GA,959,,,,,,,1,A,762,4,,,,,,1231,,
40202,6,2019-08-24,DL,unknown,ORD,"Chicago, IL",IL,LGA,"New York, NY",NY,1145,,,,,,,1,A,733,3,,,,,,1507,,
49419,3,2019-08-07,DL,N374DX,ATL,"Atlanta, GA",GA,BOS,"Boston, MA",MA,1829,,,,,,,1,C,946,4,,,,,,2112,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7390903,7,2019-11-17,B6,N645JB,RSW,"Fort Myers, FL",FL,BOS,"Boston, MA",MA,1842,,,,,,,1,A,1249,5,,,,,,2142,,
7402106,2,2019-11-12,DL,N968AT,SYR,"Syracuse, NY",NY,DTW,"Detroit, MI",MI,600,,,,,,,1,A,374,2,,,,,,745,,
7402423,2,2019-11-12,DL,N320NB,ALB,"Albany, NY",NY,DTW,"Detroit, MI",MI,555,,,,,,,1,A,489,2,,,,,,741,,
7402676,2,2019-11-12,DL,N355DN,MKE,"Milwaukee, WI",WI,DTW,"Detroit, MI",MI,632,,,,,,,1,A,237,1,,,,,,850,,


In [16]:
# fill in 0 for the remaining NaN values in the DepTime, DepDelay, DepDel15 and DepartureDelayGroups columns

flights2019_df['DepTime'].fillna(0, inplace=True)
flights2019_df['DepDelay'].fillna(0, inplace=True)
flights2019_df['DepDel15'].fillna(0, inplace=True)
flights2019_df['DepartureDelayGroups'].fillna(0, inplace=True)
flights2019_df.isna().sum()

DayOfWeek                     0
FlightDate                    0
Reporting_Airline             0
Tail_Number                   0
Origin                        0
OriginCityName                0
OriginState                   0
Dest                          0
DestCityName                  0
DestState                     0
CRSDepTime                    0
DepTime                       0
DepDelay                      0
DepDel15                      0
DepartureDelayGroups          0
TaxiOut                  133977
WheelsOff                133977
Cancelled                     0
CancellationCode        7287112
Distance                      0
DistanceGroup                 0
CarrierDelay            6032784
WeatherDelay            6032784
NASDelay                6032784
SecurityDelay           6032784
LateAircraftDelay       6032784
CRSArrTime                    0
ArrTime                  137646
ArrDelay                 153805
dtype: int64

In [17]:
# look at the flights that are both cancelled and have a departure time

flights2019_df[(flights2019_df['Cancelled'] ==1) & (flights2019_df['ArrTime'])]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay


In [18]:
# inspect the columns with NaN values for arrival time that are not cancelled

flights2019_df[(flights2019_df['Cancelled'] == 0) & (flights2019_df['ArrTime'].isna())]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay
47884,2,2019-08-06,DL,N348NB,RSW,"Fort Myers, FL",FL,LGA,"New York, NY",NY,1959,2328.0,209.0,1.0,12.0,47.0,15.0,0,,1080,5,,,,,,2251,,
50386,3,2019-08-07,DL,N384DN,MSP,"Minneapolis, MN",MN,BWI,"Baltimore, MD",MD,1253,1251.0,-2.0,0.0,-1.0,17.0,1308.0,0,,936,4,,,,,,1618,,
51567,3,2019-08-07,DL,N317US,DTW,"Detroit, MI",MI,LGA,"New York, NY",NY,1359,48.0,649.0,1.0,12.0,14.0,102.0,0,,502,3,,,,,,1550,,
52390,3,2019-08-07,DL,N354DN,FLL,"Fort Lauderdale, FL",FL,LGA,"New York, NY",NY,1144,1147.0,3.0,0.0,0.0,24.0,1211.0,0,,1076,5,,,,,,1440,,
54178,4,2019-08-08,DL,N112DU,DFW,"Dallas/Fort Worth, TX",TX,LGA,"New York, NY",NY,1405,1459.0,54.0,1.0,3.0,21.0,1520.0,0,,1389,6,,,,,,1840,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7365406,6,2019-11-16,AA,N817NN,ORD,"Chicago, IL",IL,DCA,"Washington, DC",VA,707,703.0,-4.0,0.0,-1.0,29.0,732.0,0,,612,3,,,,,,1000,,
7370272,5,2019-11-01,B6,N353JB,MCO,"Orlando, FL",FL,HPN,"White Plains, NY",NY,1700,2048.0,228.0,1.0,12.0,18.0,2106.0,0,,972,4,,,,,,1934,,
7376357,6,2019-11-09,B6,N294JB,TPA,"Tampa, FL",FL,HPN,"White Plains, NY",NY,1812,2048.0,156.0,1.0,10.0,15.0,2103.0,0,,1032,5,,,,,,2050,,
7376455,6,2019-11-09,B6,N304JB,FLL,"Fort Lauderdale, FL",FL,HPN,"White Plains, NY",NY,1808,2103.0,175.0,1.0,11.0,19.0,2122.0,0,,1097,5,,,,,,2059,,


In [19]:
# there are 2,721 rows with flights that are not cancelled, and have a departure time, but no arrival time
# delete these rows

flights2019_df = flights2019_df.drop(flights2019_df[(flights2019_df['Cancelled'] == 0) & (flights2019_df['ArrTime'].isna())].index)
flights2019_df[(flights2019_df['Cancelled'] == 0) & (flights2019_df['ArrTime'].isna())]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay


In [20]:
# replace NaN with 0 for all flights that are cancelled 
# and check that there are now no flights with this combination

flights2019_df['ArrTime'][(flights2019_df['Cancelled'] == 1) & (flights2019_df['ArrTime'].isna())] = 0
flights2019_df['ArrDelay'][(flights2019_df['Cancelled'] == 1) & (flights2019_df['ArrDelay'].isna())] = 0
flights2019_df[(flights2019_df['Cancelled'] == 1) & (flights2019_df['ArrTime'])]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay


In [21]:
# check the status of NaNs

flights2019_df.isna().sum()

DayOfWeek                     0
FlightDate                    0
Reporting_Airline             0
Tail_Number                   0
Origin                        0
OriginCityName                0
OriginState                   0
Dest                          0
DestCityName                  0
DestState                     0
CRSDepTime                    0
DepTime                       0
DepDelay                      0
DepDel15                      0
DepartureDelayGroups          0
TaxiOut                  133977
WheelsOff                133977
Cancelled                     0
CancellationCode        7284391
Distance                      0
DistanceGroup                 0
CarrierDelay            6030063
WeatherDelay            6030063
NASDelay                6030063
SecurityDelay           6030063
LateAircraftDelay       6030063
CRSArrTime                    0
ArrTime                       0
ArrDelay                  16159
dtype: int64

In [22]:
# look at the remaining flights with NaN values for ArrDelay

flights2019_df[(flights2019_df['ArrDelay'].isna()) & (flights2019_df['ArrTime'])]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay
16,4,2019-08-01,DL,N928AT,ATL,"Atlanta, GA",GA,GSP,"Greer, SC",SC,1654,1702.0,8.0,0.0,0.0,37.0,1739.0,0,,153,1,,,,,,1757,2157.0,
317,4,2019-08-01,DL,N684DA,SLC,"Salt Lake City, UT",UT,ATL,"Atlanta, GA",GA,1105,1115.0,10.0,0.0,0.0,13.0,1128.0,0,,1590,7,,,,,,1646,2130.0,
975,4,2019-08-01,DL,N967DL,ATL,"Atlanta, GA",GA,SRQ,"Sarasota/Bradenton, FL",FL,1503,1526.0,23.0,1.0,1.0,36.0,1602.0,0,,444,2,,,,,,1641,1930.0,
1007,4,2019-08-01,DL,N922AT,MSN,"Madison, WI",WI,ATL,"Atlanta, GA",GA,1523,1520.0,-3.0,0.0,-1.0,10.0,1530.0,0,,707,3,,,,,,1835,1958.0,
1201,4,2019-08-01,DL,N988AT,DTW,"Detroit, MI",MI,CLT,"Charlotte, NC",NC,1730,1729.0,-1.0,0.0,-1.0,10.0,1739.0,0,,500,3,,,,,,1920,2155.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7417696,4,2019-11-07,DL,N920DN,ATL,"Atlanta, GA",GA,ORD,"Chicago, IL",IL,1111,1110.0,-1.0,0.0,-1.0,14.0,1124.0,0,,606,3,,,,,,1216,1714.0,
7420086,4,2019-11-07,DL,N966DL,ATL,"Atlanta, GA",GA,JAN,"Jackson/Vicksburg, MS",MS,2255,2306.0,11.0,0.0,0.0,13.0,2319.0,0,,341,2,,,,,,2320,519.0,
7420500,5,2019-11-08,DL,N121DU,FAI,"Fairbanks, AK",AK,SEA,"Seattle, WA",WA,55,322.0,147.0,1.0,9.0,24.0,346.0,0,,1533,7,,,,,,546,1111.0,
7420979,5,2019-11-08,DL,N669DN,MCO,"Orlando, FL",FL,SEA,"Seattle, WA",WA,612,609.0,-3.0,0.0,-1.0,15.0,624.0,0,,2554,11,,,,,,944,1129.0,


In [23]:
# fill in these NaN values with the difference between the expected arrival time (CRSArrTime) and the actual arrival time (ArrTime)

flights2019_df['ArrDelay'].fillna((flights2019_df['ArrTime'] - flights2019_df['CRSArrTime']), inplace=True)
flights2019_df[(flights2019_df['ArrDelay'].isna()) & (flights2019_df['ArrTime'])]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay


In [24]:
# check the status of NaNs

flights2019_df.isna().sum()

DayOfWeek                     0
FlightDate                    0
Reporting_Airline             0
Tail_Number                   0
Origin                        0
OriginCityName                0
OriginState                   0
Dest                          0
DestCityName                  0
DestState                     0
CRSDepTime                    0
DepTime                       0
DepDelay                      0
DepDel15                      0
DepartureDelayGroups          0
TaxiOut                  133977
WheelsOff                133977
Cancelled                     0
CancellationCode        7284391
Distance                      0
DistanceGroup                 0
CarrierDelay            6030063
WeatherDelay            6030063
NASDelay                6030063
SecurityDelay           6030063
LateAircraftDelay       6030063
CRSArrTime                    0
ArrTime                       0
ArrDelay                      0
dtype: int64

In [25]:
# fill in the TaxiOut column with the median value 

flights2019_df['TaxiOut'].fillna(flights2019_df['TaxiOut'].median(), inplace=True)
flights2019_df.isna().sum()

DayOfWeek                     0
FlightDate                    0
Reporting_Airline             0
Tail_Number                   0
Origin                        0
OriginCityName                0
OriginState                   0
Dest                          0
DestCityName                  0
DestState                     0
CRSDepTime                    0
DepTime                       0
DepDelay                      0
DepDel15                      0
DepartureDelayGroups          0
TaxiOut                       0
WheelsOff                133977
Cancelled                     0
CancellationCode        7284391
Distance                      0
DistanceGroup                 0
CarrierDelay            6030063
WeatherDelay            6030063
NASDelay                6030063
SecurityDelay           6030063
LateAircraftDelay       6030063
CRSArrTime                    0
ArrTime                       0
ArrDelay                      0
dtype: int64

In [26]:
# fill in WheelsOff with the CRSDepTime + the TaxiOut time

flights2019_df['WheelsOff'].fillna((flights2019_df['CRSDepTime'] + flights2019_df['TaxiOut']), inplace=True)
flights2019_df.isna().sum()

DayOfWeek                     0
FlightDate                    0
Reporting_Airline             0
Tail_Number                   0
Origin                        0
OriginCityName                0
OriginState                   0
Dest                          0
DestCityName                  0
DestState                     0
CRSDepTime                    0
DepTime                       0
DepDelay                      0
DepDel15                      0
DepartureDelayGroups          0
TaxiOut                       0
WheelsOff                     0
Cancelled                     0
CancellationCode        7284391
Distance                      0
DistanceGroup                 0
CarrierDelay            6030063
WeatherDelay            6030063
NASDelay                6030063
SecurityDelay           6030063
LateAircraftDelay       6030063
CRSArrTime                    0
ArrTime                       0
ArrDelay                      0
dtype: int64

In [27]:
# check the shapes of the whole dataframe as well as just the cancelled flights

display(flights2019_df.shape)
display(flights2019_df[flights2019_df['Cancelled'] == 1].shape)

(7419316, 29)

(134925, 29)

134,925 cancelled + 7,284,391 NaN Cancelation Code = 7,419,316 total --> correct

In [28]:
# replace NaN with 0 for CancellationCode and the reasons for Delay

flights2019_df['CancellationCode'].fillna(0, inplace=True)
flights2019_df['CarrierDelay'].fillna(0, inplace=True)
flights2019_df['WeatherDelay'].fillna(0, inplace=True)
flights2019_df['NASDelay'].fillna(0, inplace=True)
flights2019_df['SecurityDelay'].fillna(0, inplace=True)
flights2019_df['LateAircraftDelay'].fillna(0, inplace=True)
flights2019_df.isna().sum()

DayOfWeek               0
FlightDate              0
Reporting_Airline       0
Tail_Number             0
Origin                  0
OriginCityName          0
OriginState             0
Dest                    0
DestCityName            0
DestState               0
CRSDepTime              0
DepTime                 0
DepDelay                0
DepDel15                0
DepartureDelayGroups    0
TaxiOut                 0
WheelsOff               0
Cancelled               0
CancellationCode        0
Distance                0
DistanceGroup           0
CarrierDelay            0
WeatherDelay            0
NASDelay                0
SecurityDelay           0
LateAircraftDelay       0
CRSArrTime              0
ArrTime                 0
ArrDelay                0
dtype: int64

In [29]:
# add a boolean column for whether a flight is delayed or not (based on whether it arrives late)

flights2019_df['Delayed'] = 0

flights2019_df.loc[flights2019_df['ArrDelay'] <= 0, 'Delayed'] = 0
flights2019_df.loc[flights2019_df['ArrDelay'] > 0, 'Delayed'] = 1

flights2019_df['Delayed'].value_counts()

0    4905121
1    2514195
Name: Delayed, dtype: int64

In [30]:
# display the clean dataframe

flights2019_df

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay,Delayed
0,4,2019-08-01,DL,N354NW,ATL,"Atlanta, GA",GA,DFW,"Dallas/Fort Worth, TX",TX,1954,2025.0,31.0,1.0,2.0,17.0,2042.0,0,0,731,3,1.0,0.0,0.0,0.0,16.0,2114,2131.0,17.0,1
1,4,2019-08-01,DL,N320US,DFW,"Dallas/Fort Worth, TX",TX,ATL,"Atlanta, GA",GA,1709,1706.0,-3.0,0.0,-1.0,19.0,1725.0,0,0,731,3,0.0,0.0,0.0,0.0,0.0,2024,2012.0,-12.0,0
2,4,2019-08-01,DL,N931DN,IAH,"Houston, TX",TX,ATL,"Atlanta, GA",GA,1749,1829.0,40.0,1.0,2.0,12.0,1841.0,0,0,689,3,17.0,0.0,0.0,0.0,20.0,2102,2139.0,37.0,1
3,4,2019-08-01,DL,N851DN,PDX,"Portland, OR",OR,SLC,"Salt Lake City, UT",UT,1310,1306.0,-4.0,0.0,-1.0,13.0,1319.0,0,0,630,3,0.0,0.0,0.0,0.0,0.0,1601,1558.0,-3.0,0
4,4,2019-08-01,DL,N775DE,SLC,"Salt Lake City, UT",UT,PDX,"Portland, OR",OR,829,820.0,-9.0,0.0,-1.0,16.0,836.0,0,0,630,3,0.0,0.0,0.0,0.0,0.0,926,911.0,-15.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7422032,5,2019-11-08,DL,N978AT,HOU,"Houston, TX",TX,ATL,"Atlanta, GA",GA,1324,1320.0,-4.0,0.0,-1.0,14.0,1334.0,0,0,696,3,0.0,0.0,0.0,0.0,0.0,1624,1605.0,-19.0,0
7422033,5,2019-11-08,DL,N6716C,CHS,"Charleston, SC",SC,ATL,"Atlanta, GA",GA,700,719.0,19.0,1.0,1.0,10.0,729.0,0,0,259,2,0.0,0.0,0.0,0.0,0.0,817,820.0,3.0,1
7422034,5,2019-11-08,DL,N851DN,MSP,"Minneapolis, MN",MN,BWI,"Baltimore, MD",MD,1800,1754.0,-6.0,0.0,-1.0,22.0,1816.0,0,0,936,4,0.0,0.0,0.0,0.0,0.0,2130,2116.0,-14.0,0
7422035,5,2019-11-08,DL,N352NB,GEG,"Spokane, WA",WA,SLC,"Salt Lake City, UT",UT,1015,1009.0,-6.0,0.0,-1.0,11.0,1020.0,0,0,546,3,0.0,0.0,0.0,0.0,0.0,1258,1237.0,-21.0,0


In [31]:
# export the clean dataframe with no NaN values to a csv

flights2019_df.to_csv('flights2019clean.csv')