**Clean Flights 2017**
<br/>Perform cleaning on the 2017 dataframe and then export a clean csv.

In [1]:
import numpy as np
import pandas as pd

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# increase display size to enable viewing of all data columns

pd.options.display.max_columns = 35

In [4]:
# read in the 2017 flights csv and ensure everything loads correctly

flights2017_df = pd.read_csv('flights2017.csv', index_col=0)
flights2017_df

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay
0,6,2017-07-01,AS,N559AS,DCA,"Washington, DC",VA,SEA,"Seattle, WA",WA,800,750.0,-10.0,0.0,-1.0,17.0,807.0,0,,2329,10,,,,,,1053,1021.0,-32.0
1,6,2017-07-01,AS,N513AS,SEA,"Seattle, WA",WA,DCA,"Washington, DC",VA,1335,1330.0,-5.0,0.0,-1.0,15.0,1345.0,0,,2329,10,,,,,,2153,2129.0,-24.0
2,6,2017-07-01,AS,N588AS,DCA,"Washington, DC",VA,SEA,"Seattle, WA",WA,1852,1856.0,4.0,0.0,0.0,9.0,1905.0,0,,2329,10,,,,,,2157,2107.0,-50.0
3,6,2017-07-01,AS,N538AS,SEA,"Seattle, WA",WA,DCA,"Washington, DC",VA,800,840.0,40.0,1.0,2.0,9.0,849.0,0,,2329,10,36.0,0.0,0.0,0.0,0.0,1617,1653.0,36.0
4,6,2017-07-01,AS,N536AS,DCA,"Washington, DC",VA,LAX,"Los Angeles, CA",CA,900,854.0,-6.0,0.0,-1.0,13.0,907.0,0,,2311,10,,,,,,1138,1115.0,-23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5674616,2,2017-09-05,UA,N456UA,EWR,"Newark, NJ",NJ,LAS,"Las Vegas, NV",NV,842,949.0,67.0,1.0,4.0,24.0,1013.0,0,,2227,9,20.0,0.0,0.0,0.0,27.0,1110,1157.0,47.0
5674617,2,2017-09-05,UA,N809UA,RDU,"Raleigh/Durham, NC",NC,DEN,"Denver, CO",CO,1623,1616.0,-7.0,0.0,-1.0,21.0,1637.0,0,,1436,6,,,,,,1817,1828.0,11.0
5674618,2,2017-09-05,UA,N69816,DEN,"Denver, CO",CO,SAT,"San Antonio, TX",TX,1526,1523.0,-3.0,0.0,-1.0,12.0,1535.0,0,,794,4,,,,,,1835,1817.0,-18.0
5674619,2,2017-09-05,UA,N17752,EWR,"Newark, NJ",NJ,DFW,"Dallas/Fort Worth, TX",TX,605,557.0,-8.0,0.0,-1.0,24.0,621.0,0,,1372,6,,,,,,845,827.0,-18.0


In [5]:
# summary statistics for the dataframe

flights2017_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
DayOfWeek,5674621.0,3.939507,1.991768,1.0,2.0,4.0,6.0,7.0
CRSDepTime,5674621.0,1329.968832,490.93757,1.0,912.0,1323.0,1735.0,2359.0
DepTime,5594313.0,1333.704391,504.398714,1.0,914.0,1327.0,1743.0,2400.0
DepDelay,5594278.0,9.725734,43.865896,-234.0,-5.0,-2.0,6.0,2755.0
DepDel15,5594278.0,0.181229,0.385208,0.0,0.0,0.0,0.0,1.0
DepartureDelayGroups,5594278.0,0.025027,2.149665,-2.0,-1.0,-1.0,0.0,12.0
TaxiOut,5592476.0,16.779942,9.365385,0.0,11.0,14.0,20.0,183.0
WheelsOff,5592480.0,1355.793697,506.222749,1.0,930.0,1340.0,1758.0,2400.0
Cancelled,5674621.0,0.014572,0.119834,0.0,0.0,0.0,0.0,1.0
Distance,5674621.0,856.689109,624.488107,31.0,391.0,680.0,1097.0,4983.0


In [6]:
# check to see if there any NaN values in the data

flights2017_df.isna().sum()

DayOfWeek                     0
FlightDate                    0
Reporting_Airline             0
Tail_Number               12511
Origin                        0
OriginCityName                0
OriginState                   0
Dest                          0
DestCityName                  0
DestState                     0
CRSDepTime                    0
DepTime                   80308
DepDelay                  80343
DepDel15                  80343
DepartureDelayGroups      80343
TaxiOut                   82145
WheelsOff                 82141
Cancelled                     0
CancellationCode        5591928
Distance                      0
DistanceGroup                 0
CarrierDelay            4645148
WeatherDelay            4645148
NASDelay                4645148
SecurityDelay           4645148
LateAircraftDelay       4645148
CRSArrTime                    0
ArrTime                   84674
ArrDelay                  95211
dtype: int64

In [7]:
# list of all the columns in the data dataframe with missing values and the percent that's missing

round(np.mean(flights2017_df[flights2017_df.columns[flights2017_df.isnull().any()]].isna(), axis = 0) * 100,2)

Tail_Number              0.22
DepTime                  1.42
DepDelay                 1.42
DepDel15                 1.42
DepartureDelayGroups     1.42
TaxiOut                  1.45
WheelsOff                1.45
CancellationCode        98.54
CarrierDelay            81.86
WeatherDelay            81.86
NASDelay                81.86
SecurityDelay           81.86
LateAircraftDelay       81.86
ArrTime                  1.49
ArrDelay                 1.68
dtype: float64

In [8]:
# this shows that all flights missing the tail numbers were cancelled

flights2017_df[(flights2017_df['Cancelled'] == 1) & (flights2017_df['Tail_Number'].isna())]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay
34539,5,2017-07-28,OO,,DHN,"Dothan, AL",AL,ATL,"Atlanta, GA",GA,1059,,,,,,,1,A,170,1,,,,,,1258,,
76501,1,2017-07-10,OO,,ORD,"Chicago, IL",IL,FWA,"Fort Wayne, IN",IN,732,,,,,,,1,B,157,1,,,,,,935,,
76591,1,2017-07-10,OO,,FWA,"Fort Wayne, IN",IN,ORD,"Chicago, IL",IL,1005,,,,,,,1,B,157,1,,,,,,1009,,
76747,1,2017-07-10,OO,,TVC,"Traverse City, MI",MI,ORD,"Chicago, IL",IL,1320,,,,,,,1,B,224,1,,,,,,1339,,
77231,1,2017-07-10,OO,,ORD,"Chicago, IL",IL,TVC,"Traverse City, MI",MI,1040,,,,,,,1,B,224,1,,,,,,1250,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5674594,2,2017-09-05,UA,,IAH,"Houston, TX",TX,BOS,"Boston, MA",MA,1940,,,,,,,1,C,1597,7,,,,,,27,,
5674595,2,2017-09-05,UA,,PDX,"Portland, OR",OR,IAH,"Houston, TX",TX,1230,,,,,,,1,C,1825,8,,,,,,1835,,
5674603,2,2017-09-05,UA,,IAH,"Houston, TX",TX,PHX,"Phoenix, AZ",AZ,909,,,,,,,1,C,1009,5,,,,,,950,,
5674606,2,2017-09-05,UA,,IAH,"Houston, TX",TX,LAX,"Los Angeles, CA",CA,910,,,,,,,1,C,1379,6,,,,,,1040,,


In [9]:
# fill in 'unknown' for the NaN tail numbers
# these are all flights that were cancelled, and we already have a small number of cancelled flights, so don't want to delete them

flights2017_df['Tail_Number'].fillna('unknown', inplace=True)
flights2017_df.shape

(5674621, 29)

In [10]:
# check to ensure that tail number was filled in appropriately

flights2017_df.isna().sum()

DayOfWeek                     0
FlightDate                    0
Reporting_Airline             0
Tail_Number                   0
Origin                        0
OriginCityName                0
OriginState                   0
Dest                          0
DestCityName                  0
DestState                     0
CRSDepTime                    0
DepTime                   80308
DepDelay                  80343
DepDel15                  80343
DepartureDelayGroups      80343
TaxiOut                   82145
WheelsOff                 82141
Cancelled                     0
CancellationCode        5591928
Distance                      0
DistanceGroup                 0
CarrierDelay            4645148
WeatherDelay            4645148
NASDelay                4645148
SecurityDelay           4645148
LateAircraftDelay       4645148
CRSArrTime                    0
ArrTime                   84674
ArrDelay                  95211
dtype: int64

In [11]:
# check to see how many cancelled flights there were in 2017

flights2017_df[flights2017_df['Cancelled'] == 1].shape

(82693, 29)

82,693 cancelled flights, but 80,308 with no departure time -- there are 2,385 (see below) that need updating.

In [12]:
# look at the flights that are both cancelled and have a departure time

flights2017_df[(flights2017_df['Cancelled'] ==1) & (flights2017_df['DepTime'])]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay
3604,5,2017-07-07,AS,N526AS,LAX,"Los Angeles, CA",CA,PDX,"Portland, OR",OR,830,832.0,2.0,0.0,0.0,,,1,A,834,4,,,,,,1048,,
4618,7,2017-07-09,AS,N320AS,BNA,"Nashville, TN",TN,SEA,"Seattle, WA",WA,1510,1502.0,-8.0,0.0,-1.0,,,1,A,1978,8,,,,,,1801,,
4936,1,2017-07-10,AS,N618AS,ANC,"Anchorage, AK",AK,BET,"Bethel, AK",AK,620,609.0,-11.0,0.0,-1.0,12.0,621.0,1,B,399,2,,,,,,733,,
4957,1,2017-07-10,AS,N524AS,SIT,"Sitka, AK",AK,SEA,"Seattle, WA",WA,1357,1704.0,187.0,1.0,12.0,,,1,A,861,4,,,,,,1705,,
5368,1,2017-07-10,AS,N407AS,BWI,"Baltimore, MD",MD,SEA,"Seattle, WA",WA,1840,1851.0,11.0,0.0,0.0,,,1,A,2335,10,,,,,,2141,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5651439,1,2017-09-11,UA,N818UA,ORD,"Chicago, IL",IL,CHS,"Charleston, SC",SC,734,737.0,3.0,0.0,0.0,20.0,757.0,1,B,760,4,,,,,,1047,,
5662110,5,2017-09-08,UA,N77431,IAD,"Washington, DC",VA,SFO,"San Francisco, CA",CA,1935,1925.0,-10.0,0.0,-1.0,,,1,A,2419,10,,,,,,2235,,
5670557,3,2017-09-20,UA,N17244,SFO,"San Francisco, CA",CA,DEN,"Denver, CO",CO,2030,2141.0,71.0,1.0,4.0,,,1,A,967,4,,,,,,2359,,
5671096,2,2017-09-19,UA,N78506,BZN,"Bozeman, MT",MT,DEN,"Denver, CO",CO,714,718.0,4.0,0.0,0.0,,,1,A,524,3,,,,,,852,,


These don't have arrival times, so it is safe to assume that they were actually cancelled, and should not have a departure time.

In [13]:
# change the departure time to 0 for all flights that are cancelled and have a depature time
# and check that there are now no flights with this combination

flights2017_df['DepTime'][(flights2017_df['Cancelled'] == 1) & (flights2017_df['DepTime'])] = 0
flights2017_df[(flights2017_df['Cancelled'] ==1) & (flights2017_df['DepTime'])]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay


In [14]:
# check for flights that have a depature time greater than 0, but no departure delay

flights2017_df[(flights2017_df['DepTime'] > 0 ) & (flights2017_df['DepDelay'].isna())]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay
2159957,7,2017-10-29,F9,N713FR,MCO,"Orlando, FL",FL,PVD,"Providence, RI",RI,1915,1925.0,,,,175.0,2220.0,0,,1072,5,,,,,,2205,,


In [15]:
# there is only one flight in this situation, so manually fill in the appropriate values for DepDelay, DepDel15, and DepatureDelayGroups

flights2017_df['DepDelay'][(flights2017_df['DepTime'] > 0 ) & (flights2017_df['DepDelay'].isna())] = 10
flights2017_df['DepDel15'][(flights2017_df['DepTime'] > 0 ) & (flights2017_df['DepDel15'].isna())] = 0
flights2017_df['DepartureDelayGroups'][(flights2017_df['DepTime'] > 0 ) & (flights2017_df['DepartureDelayGroups'].isna())] = 0

In [16]:
# review remaining flights with NaN values in the DepDelay column

flights2017_df[(flights2017_df['DepDelay'].isna())]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay
581,7,2017-07-02,AS,N765AS,BRW,"Barrow, AK",AK,FAI,"Fairbanks, AK",AK,1150,,,,,,,1,B,503,3,,,,,,1310,,
1157,1,2017-07-03,AS,N763AS,PSG,"Petersburg, AK",AK,WRG,"Wrangell, AK",AK,1509,,,,,,,1,B,31,1,,,,,,1532,,
1389,1,2017-07-03,AS,N615AS,SEA,"Seattle, WA",WA,SAN,"San Diego, CA",CA,1355,,,,,,,1,A,1050,5,,,,,,1642,,
1394,1,2017-07-03,AS,N644AS,SAN,"San Diego, CA",CA,SEA,"Seattle, WA",WA,800,,,,,,,1,A,1050,5,,,,,,1055,,
1490,1,2017-07-03,AS,N546AS,DFW,"Dallas/Fort Worth, TX",TX,SEA,"Seattle, WA",WA,1920,,,,,,,1,A,1660,7,,,,,,2133,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5674594,2,2017-09-05,UA,unknown,IAH,"Houston, TX",TX,BOS,"Boston, MA",MA,1940,,,,,,,1,C,1597,7,,,,,,27,,
5674595,2,2017-09-05,UA,unknown,PDX,"Portland, OR",OR,IAH,"Houston, TX",TX,1230,,,,,,,1,C,1825,8,,,,,,1835,,
5674603,2,2017-09-05,UA,unknown,IAH,"Houston, TX",TX,PHX,"Phoenix, AZ",AZ,909,,,,,,,1,C,1009,5,,,,,,950,,
5674606,2,2017-09-05,UA,unknown,IAH,"Houston, TX",TX,LAX,"Los Angeles, CA",CA,910,,,,,,,1,C,1379,6,,,,,,1040,,


In [17]:
# fill in 0 for the remaining NaN values in the DepTime, DepDelay, DepDel15 and DepartureDelayGroups columns

flights2017_df['DepTime'].fillna(0, inplace=True)
flights2017_df['DepDelay'].fillna(0, inplace=True)
flights2017_df['DepDel15'].fillna(0, inplace=True)
flights2017_df['DepartureDelayGroups'].fillna(0, inplace=True)
flights2017_df.isna().sum()

DayOfWeek                     0
FlightDate                    0
Reporting_Airline             0
Tail_Number                   0
Origin                        0
OriginCityName                0
OriginState                   0
Dest                          0
DestCityName                  0
DestState                     0
CRSDepTime                    0
DepTime                       0
DepDelay                      0
DepDel15                      0
DepartureDelayGroups          0
TaxiOut                   82145
WheelsOff                 82141
Cancelled                     0
CancellationCode        5591928
Distance                      0
DistanceGroup                 0
CarrierDelay            4645148
WeatherDelay            4645148
NASDelay                4645148
SecurityDelay           4645148
LateAircraftDelay       4645148
CRSArrTime                    0
ArrTime                   84674
ArrDelay                  95211
dtype: int64

In [18]:
# look at the flights that are both cancelled and have a departure time

flights2017_df[(flights2017_df['Cancelled'] ==1) & (flights2017_df['ArrTime'])]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay
4216847,1,2017-03-06,NK,N509NK,OAK,"Oakland, CA",CA,LAX,"Los Angeles, CA",CA,2115,0.0,0.0,0.0,0.0,,,1,C,337,2,,,,,,1139,1130.0,-9.0
4216896,1,2017-03-13,NK,N588NK,FLL,"Fort Lauderdale, FL",FL,EWR,"Newark, NJ",NJ,2050,0.0,0.0,0.0,0.0,,,1,B,1065,5,,,,,,1407,1358.0,-9.0
4216927,1,2017-03-06,NK,N615NK,DEN,"Denver, CO",CO,LAS,"Las Vegas, NV",NV,1131,0.0,0.0,0.0,0.0,,,1,C,628,3,,,,,,1651,1613.0,-38.0
4216931,1,2017-03-06,NK,N615NK,LAS,"Las Vegas, NV",NV,LAX,"Los Angeles, CA",CA,1313,0.0,0.0,0.0,0.0,,,1,C,236,1,,,,,,1651,1611.0,-40.0
4216935,1,2017-03-06,NK,N615NK,LAS,"Las Vegas, NV",NV,DEN,"Denver, CO",CO,755,0.0,0.0,0.0,0.0,,,1,C,628,3,,,,,,1651,1811.0,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4695729,7,2017-03-12,AA,N4UBAA,DEN,"Denver, CO",CO,ORD,"Chicago, IL",IL,1417,0.0,0.0,0.0,0.0,,,1,A,888,4,,,,,,950,1025.0,35.0
4695733,7,2017-03-12,AA,N566AA,ORD,"Chicago, IL",IL,DEN,"Denver, CO",CO,1200,0.0,0.0,0.0,0.0,,,1,A,888,4,,,,,,720,718.0,-2.0
4696598,7,2017-03-12,AA,N3HGAA,ABQ,"Albuquerque, NM",NM,DFW,"Dallas/Fort Worth, TX",TX,1500,0.0,0.0,0.0,0.0,,,1,A,569,3,,,,,,1750,1753.0,3.0
4696672,7,2017-03-12,AA,N4XMAA,ORD,"Chicago, IL",IL,DFW,"Dallas/Fort Worth, TX",TX,1025,0.0,0.0,0.0,0.0,,,1,A,802,4,,,,,,815,814.0,-1.0


In [19]:
# change the arrival time and arrival delay to 0 for all flights that are cancelled and have an arrival time
# and check that there are now no flights with this combination

flights2017_df['ArrTime'][(flights2017_df['Cancelled'] == 1) & (flights2017_df['ArrTime'])] = 0
flights2017_df['ArrDelay'][(flights2017_df['Cancelled'] == 1) & (flights2017_df['ArrDelay'])] = 0
flights2017_df[(flights2017_df['Cancelled'] == 1) & (flights2017_df['ArrTime'])]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay


In [20]:
# inspect the columns with NaN values for arrival time that are not cancelled

flights2017_df[(flights2017_df['Cancelled'] == 0) & (flights2017_df['ArrTime'].isna())]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay
583,7,2017-07-02,AS,N765AS,SCC,"Deadhorse, AK",AK,BRW,"Barrow, AK",AK,1006,1005.0,-1.0,0.0,-1.0,6.0,1011.0,0,,204,1,,,,,,1054,,
1156,1,2017-07-03,AS,N763AS,JNU,"Juneau, AK",AK,PSG,"Petersburg, AK",AK,1341,1406.0,25.0,1.0,1.0,19.0,1425.0,0,,123,1,,,,,,1424,,
2513,3,2017-07-05,AS,N318AS,SEA,"Seattle, WA",WA,LAS,"Las Vegas, NV",NV,2150,2145.0,-5.0,0.0,-1.0,20.0,2205.0,0,,867,4,,,,,,15,,
3061,4,2017-07-06,AS,N529AS,PDX,"Portland, OR",OR,SNA,"Santa Ana, CA",CA,1540,1903.0,203.0,1.0,12.0,21.0,1924.0,0,,859,4,,,,,,1802,,
3732,5,2017-07-07,AS,N563AS,DCA,"Washington, DC",VA,PDX,"Portland, OR",OR,1720,1710.0,-10.0,0.0,-1.0,28.0,1738.0,0,,2350,10,,,,,,2023,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5650422,1,2017-09-11,UA,N17229,SFO,"San Francisco, CA",CA,SNA,"Santa Ana, CA",CA,2102,2221.0,79.0,1.0,5.0,21.0,2242.0,0,,372,2,,,,,,2230,,
5670034,3,2017-09-20,UA,N76529,IAH,"Houston, TX",TX,SNA,"Santa Ana, CA",CA,2108,2107.0,-1.0,0.0,-1.0,14.0,2121.0,0,,1346,6,,,,,,2230,,
5671890,2,2017-09-19,UA,N14250,SFO,"San Francisco, CA",CA,BUR,"Burbank, CA",CA,2015,2009.0,-6.0,0.0,-1.0,16.0,2025.0,0,,326,2,,,,,,2134,,
5673914,2,2017-09-05,UA,N54711,EWR,"Newark, NJ",NJ,SNA,"Santa Ana, CA",CA,1833,2122.0,169.0,1.0,11.0,75.0,2237.0,0,,2434,10,,,,,,2133,,


In [21]:
# there are 10,318 rows with flights that are not cancelled, and have a departure time, but no arrival time
# delete these rows

flights2017_df = flights2017_df.drop(flights2017_df[(flights2017_df['Cancelled'] == 0) & (flights2017_df['ArrTime'].isna())].index)
flights2017_df[(flights2017_df['Cancelled'] == 0) & (flights2017_df['ArrTime'].isna())]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay


In [22]:
# replace NaN with 0 for all flights that are cancelled 
# and check that there are now no flights with this combination

flights2017_df['ArrTime'][(flights2017_df['Cancelled'] == 1) & (flights2017_df['ArrTime'].isna())] = 0
flights2017_df['ArrDelay'][(flights2017_df['Cancelled'] == 1) & (flights2017_df['ArrDelay'].isna())] = 0
flights2017_df[(flights2017_df['Cancelled'] == 1) & (flights2017_df['ArrTime'])]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay


In [23]:
# check the status of NaNs

flights2017_df.isna().sum()

DayOfWeek                     0
FlightDate                    0
Reporting_Airline             0
Tail_Number                   0
Origin                        0
OriginCityName                0
OriginState                   0
Dest                          0
DestCityName                  0
DestState                     0
CRSDepTime                    0
DepTime                       0
DepDelay                      0
DepDel15                      0
DepartureDelayGroups          0
TaxiOut                   82145
WheelsOff                 82141
Cancelled                     0
CancellationCode        5581610
Distance                      0
DistanceGroup                 0
CarrierDelay            4636458
WeatherDelay            4636458
NASDelay                4636458
SecurityDelay           4636458
LateAircraftDelay       4636458
CRSArrTime                    0
ArrTime                       0
ArrDelay                  10531
dtype: int64

In [24]:
# look at the remaining flights with NaN values for ArrDelay

flights2017_df[(flights2017_df['ArrDelay'].isna()) & (flights2017_df['ArrTime'])]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay
452,6,2017-07-01,AS,N247AK,PDX,"Portland, OR",OR,MCO,"Orlando, FL",FL,640,639.0,-1.0,0.0,-1.0,18.0,657.0,0,,2534,11,,,,,,1507,1807.0,
1133,1,2017-07-03,AS,N577AS,ANC,"Anchorage, AK",AK,SCC,"Deadhorse, AK",AK,737,727.0,-10.0,0.0,-1.0,8.0,735.0,0,,627,3,,,,,,916,1019.0,
2550,3,2017-07-05,AS,N481AS,SEA,"Seattle, WA",WA,DFW,"Dallas/Fort Worth, TX",TX,1220,1212.0,-8.0,0.0,-1.0,24.0,1236.0,0,,1660,7,,,,,,1820,2058.0,
4740,7,2017-07-09,AS,N585AS,SEA,"Seattle, WA",WA,DFW,"Dallas/Fort Worth, TX",TX,700,654.0,-6.0,0.0,-1.0,15.0,709.0,0,,1660,7,,,,,,1256,1529.0,
4900,1,2017-07-10,AS,N525AS,LAX,"Los Angeles, CA",CA,DCA,"Washington, DC",VA,1230,1250.0,20.0,1.0,1.0,12.0,1302.0,0,,2311,10,,,,,,2054,2322.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5673978,2,2017-09-05,UA,N852UA,ORD,"Chicago, IL",IL,LGA,"New York, NY",NY,1500,1458.0,-2.0,0.0,-1.0,30.0,1528.0,0,,733,3,,,,,,1818,2300.0,
5673987,2,2017-09-05,UA,N63890,SAN,"San Diego, CA",CA,EWR,"Newark, NJ",NJ,1125,1201.0,36.0,1.0,2.0,20.0,1221.0,0,,2425,10,,,,,,2005,2359.0,
5674252,2,2017-09-05,UA,N69819,LAS,"Las Vegas, NV",NV,EWR,"Newark, NJ",NJ,1120,1118.0,-2.0,0.0,-1.0,31.0,1149.0,0,,2227,9,,,,,,1927,2301.0,
5674261,2,2017-09-05,UA,N24729,PHX,"Phoenix, AZ",AZ,EWR,"Newark, NJ",NJ,1036,1218.0,102.0,1.0,6.0,19.0,1237.0,0,,2133,9,,,,,,1832,2239.0,


In [25]:
# fill in these NaN values with the difference between the expected arrival time (CRSArrTime) and the actual arrival time (ArrTime)

flights2017_df['ArrDelay'].fillna((flights2017_df['ArrTime'] - flights2017_df['CRSArrTime']), inplace=True)
flights2017_df[(flights2017_df['ArrDelay'].isna()) & (flights2017_df['ArrTime'])]

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay


In [26]:
# check the status of NaNs

flights2017_df.isna().sum()

DayOfWeek                     0
FlightDate                    0
Reporting_Airline             0
Tail_Number                   0
Origin                        0
OriginCityName                0
OriginState                   0
Dest                          0
DestCityName                  0
DestState                     0
CRSDepTime                    0
DepTime                       0
DepDelay                      0
DepDel15                      0
DepartureDelayGroups          0
TaxiOut                   82145
WheelsOff                 82141
Cancelled                     0
CancellationCode        5581610
Distance                      0
DistanceGroup                 0
CarrierDelay            4636458
WeatherDelay            4636458
NASDelay                4636458
SecurityDelay           4636458
LateAircraftDelay       4636458
CRSArrTime                    0
ArrTime                       0
ArrDelay                      0
dtype: int64

In [27]:
# fill in the TaxiOut column with the median value 

flights2017_df['TaxiOut'].fillna(flights2017_df['TaxiOut'].median(), inplace=True)
flights2017_df.isna().sum()

DayOfWeek                     0
FlightDate                    0
Reporting_Airline             0
Tail_Number                   0
Origin                        0
OriginCityName                0
OriginState                   0
Dest                          0
DestCityName                  0
DestState                     0
CRSDepTime                    0
DepTime                       0
DepDelay                      0
DepDel15                      0
DepartureDelayGroups          0
TaxiOut                       0
WheelsOff                 82141
Cancelled                     0
CancellationCode        5581610
Distance                      0
DistanceGroup                 0
CarrierDelay            4636458
WeatherDelay            4636458
NASDelay                4636458
SecurityDelay           4636458
LateAircraftDelay       4636458
CRSArrTime                    0
ArrTime                       0
ArrDelay                      0
dtype: int64

In [28]:
# fill in WheelsOff with the CRSDepTime + the TaxiOut time

flights2017_df['WheelsOff'].fillna((flights2017_df['CRSDepTime'] + flights2017_df['TaxiOut']), inplace=True)
flights2017_df.isna().sum()

DayOfWeek                     0
FlightDate                    0
Reporting_Airline             0
Tail_Number                   0
Origin                        0
OriginCityName                0
OriginState                   0
Dest                          0
DestCityName                  0
DestState                     0
CRSDepTime                    0
DepTime                       0
DepDelay                      0
DepDel15                      0
DepartureDelayGroups          0
TaxiOut                       0
WheelsOff                     0
Cancelled                     0
CancellationCode        5581610
Distance                      0
DistanceGroup                 0
CarrierDelay            4636458
WeatherDelay            4636458
NASDelay                4636458
SecurityDelay           4636458
LateAircraftDelay       4636458
CRSArrTime                    0
ArrTime                       0
ArrDelay                      0
dtype: int64

In [29]:
# check the shapes of the whole dataframe and just the cancelled flights

display(flights2017_df.shape)
display(flights2017_df[flights2017_df['Cancelled'] == 1].shape)

(5664303, 29)

(82693, 29)

82,693 cancelled + 5,581,610 NaN Cancelation Code = 5,664,303 total

In [30]:
# replace NaN with 0 for CancellationCode and the reason for Delay

flights2017_df['CancellationCode'].fillna(0, inplace=True)
flights2017_df['CarrierDelay'].fillna(0, inplace=True)
flights2017_df['WeatherDelay'].fillna(0, inplace=True)
flights2017_df['NASDelay'].fillna(0, inplace=True)
flights2017_df['SecurityDelay'].fillna(0, inplace=True)
flights2017_df['LateAircraftDelay'].fillna(0, inplace=True)
flights2017_df.isna().sum()

DayOfWeek               0
FlightDate              0
Reporting_Airline       0
Tail_Number             0
Origin                  0
OriginCityName          0
OriginState             0
Dest                    0
DestCityName            0
DestState               0
CRSDepTime              0
DepTime                 0
DepDelay                0
DepDel15                0
DepartureDelayGroups    0
TaxiOut                 0
WheelsOff               0
Cancelled               0
CancellationCode        0
Distance                0
DistanceGroup           0
CarrierDelay            0
WeatherDelay            0
NASDelay                0
SecurityDelay           0
LateAircraftDelay       0
CRSArrTime              0
ArrTime                 0
ArrDelay                0
dtype: int64

In [31]:
# add a boolean column for whether a flight is delayed or not

flights2017_df['Delayed'] = 0

flights2017_df.loc[flights2017_df['ArrDelay'] <= 0, 'Delayed'] = 0
flights2017_df.loc[flights2017_df['ArrDelay'] > 0, 'Delayed'] = 1

flights2017_df['Delayed'].value_counts()

0    3713977
1    1950326
Name: Delayed, dtype: int64

In [32]:
# display the clean dataframe

flights2017_df

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay,Delayed
0,6,2017-07-01,AS,N559AS,DCA,"Washington, DC",VA,SEA,"Seattle, WA",WA,800,750.0,-10.0,0.0,-1.0,17.0,807.0,0,0,2329,10,0.0,0.0,0.0,0.0,0.0,1053,1021.0,-32.0,0
1,6,2017-07-01,AS,N513AS,SEA,"Seattle, WA",WA,DCA,"Washington, DC",VA,1335,1330.0,-5.0,0.0,-1.0,15.0,1345.0,0,0,2329,10,0.0,0.0,0.0,0.0,0.0,2153,2129.0,-24.0,0
2,6,2017-07-01,AS,N588AS,DCA,"Washington, DC",VA,SEA,"Seattle, WA",WA,1852,1856.0,4.0,0.0,0.0,9.0,1905.0,0,0,2329,10,0.0,0.0,0.0,0.0,0.0,2157,2107.0,-50.0,0
3,6,2017-07-01,AS,N538AS,SEA,"Seattle, WA",WA,DCA,"Washington, DC",VA,800,840.0,40.0,1.0,2.0,9.0,849.0,0,0,2329,10,36.0,0.0,0.0,0.0,0.0,1617,1653.0,36.0,1
4,6,2017-07-01,AS,N536AS,DCA,"Washington, DC",VA,LAX,"Los Angeles, CA",CA,900,854.0,-6.0,0.0,-1.0,13.0,907.0,0,0,2311,10,0.0,0.0,0.0,0.0,0.0,1138,1115.0,-23.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5674616,2,2017-09-05,UA,N456UA,EWR,"Newark, NJ",NJ,LAS,"Las Vegas, NV",NV,842,949.0,67.0,1.0,4.0,24.0,1013.0,0,0,2227,9,20.0,0.0,0.0,0.0,27.0,1110,1157.0,47.0,1
5674617,2,2017-09-05,UA,N809UA,RDU,"Raleigh/Durham, NC",NC,DEN,"Denver, CO",CO,1623,1616.0,-7.0,0.0,-1.0,21.0,1637.0,0,0,1436,6,0.0,0.0,0.0,0.0,0.0,1817,1828.0,11.0,1
5674618,2,2017-09-05,UA,N69816,DEN,"Denver, CO",CO,SAT,"San Antonio, TX",TX,1526,1523.0,-3.0,0.0,-1.0,12.0,1535.0,0,0,794,4,0.0,0.0,0.0,0.0,0.0,1835,1817.0,-18.0,0
5674619,2,2017-09-05,UA,N17752,EWR,"Newark, NJ",NJ,DFW,"Dallas/Fort Worth, TX",TX,605,557.0,-8.0,0.0,-1.0,24.0,621.0,0,0,1372,6,0.0,0.0,0.0,0.0,0.0,845,827.0,-18.0,0


In [33]:
# export the clean dataframe with no NaN values to a csv

flights2017_df.to_csv('flights2017clean.csv')