**EDA Flights 2018**
<br/>Read in the clean 2018 flight csv and perform further EDA.
<br/>Also run a basic logistic regression model and results for 2017.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score

from sklearn.preprocessing import StandardScaler

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# increase display size to enable viewing of all data columns

pd.options.display.max_rows = 360
pd.options.display.max_columns = 35

In [4]:
# read in the clean flights csv

flights2018_df = pd.read_csv('flights2018clean.csv', index_col=0)
flights2018_df

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay,Delayed
0,5,2018-08-31,UA,N75432,EWR,"Newark, NJ",NJ,DEN,"Denver, CO",CO,1601,1559.0,-2.0,0.0,-1.0,17.0,1616.0,0,0,1605,7,0.0,0.0,0.0,0.0,0.0,1818,1802.0,-16.0,0
1,5,2018-08-31,UA,N461UA,ORD,"Chicago, IL",IL,GEG,"Spokane, WA",WA,1932,1926.0,-6.0,0.0,-1.0,45.0,2011.0,0,0,1498,6,0.0,0.0,0.0,0.0,0.0,2128,2137.0,9.0,1
2,5,2018-08-31,UA,N76514,SNA,"Santa Ana, CA",CA,DEN,"Denver, CO",CO,1327,1636.0,189.0,1.0,12.0,20.0,1656.0,0,0,846,4,0.0,54.0,5.0,0.0,135.0,1642,1956.0,194.0,1
3,5,2018-08-31,UA,N76533,LAX,"Los Angeles, CA",CA,SFO,"San Francisco, CA",CA,600,550.0,-10.0,0.0,-1.0,16.0,606.0,0,0,337,2,0.0,0.0,0.0,0.0,0.0,730,710.0,-20.0,0
4,5,2018-08-31,UA,N35204,SFO,"San Francisco, CA",CA,SAN,"San Diego, CA",CA,2140,2141.0,1.0,1.0,0.0,31.0,2212.0,0,0,447,2,0.0,0.0,0.0,0.0,0.0,2311,2317.0,6.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7213441,4,2018-06-21,DL,N970AT,ORD,"Chicago, IL",IL,ATL,"Atlanta, GA",GA,1335,1354.0,19.0,1.0,1.0,33.0,1427.0,0,0,606,3,0.0,19.0,18.0,0.0,0.0,1640,1717.0,37.0,1
7213442,4,2018-06-21,DL,N927AT,BNA,"Nashville, TN",TN,MSP,"Minneapolis, MN",MN,1425,1417.0,-8.0,0.0,-1.0,14.0,1431.0,0,0,695,3,0.0,0.0,0.0,0.0,0.0,1642,1614.0,-28.0,0
7213443,4,2018-06-21,DL,N927AT,MSP,"Minneapolis, MN",MN,BNA,"Nashville, TN",TN,1125,1120.0,-5.0,0.0,-1.0,14.0,1134.0,0,0,695,3,0.0,0.0,0.0,0.0,0.0,1333,1317.0,-16.0,0
7213444,4,2018-06-21,DL,N320NB,AUS,"Austin, TX",TX,SLC,"Salt Lake City, UT",UT,1905,1931.0,26.0,1.0,1.0,9.0,1940.0,0,0,1086,5,0.0,0.0,0.0,0.0,0.0,2058,2111.0,13.0,1


In [5]:
# set the FlightDate to be a datecolumn and extract each of day, month and year separately, create own column for each
# drop the orignal FlightDate column and check that the new columns are correct

pd.to_datetime(flights2018_df['FlightDate'])
flights2018_df['Day'] = pd.DatetimeIndex(flights2018_df['FlightDate']).day
flights2018_df['Month'] = pd.DatetimeIndex(flights2018_df['FlightDate']).month
flights2018_df['Year'] = pd.DatetimeIndex(flights2018_df['FlightDate']).year

flights2018_df.drop(['FlightDate'], axis=1, inplace=True)

flights2018_df.head()

Unnamed: 0,DayOfWeek,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay,Delayed,Day,Month,Year
0,5,UA,N75432,EWR,"Newark, NJ",NJ,DEN,"Denver, CO",CO,1601,1559.0,-2.0,0.0,-1.0,17.0,1616.0,0,0,1605,7,0.0,0.0,0.0,0.0,0.0,1818,1802.0,-16.0,0,31,8,2018
1,5,UA,N461UA,ORD,"Chicago, IL",IL,GEG,"Spokane, WA",WA,1932,1926.0,-6.0,0.0,-1.0,45.0,2011.0,0,0,1498,6,0.0,0.0,0.0,0.0,0.0,2128,2137.0,9.0,1,31,8,2018
2,5,UA,N76514,SNA,"Santa Ana, CA",CA,DEN,"Denver, CO",CO,1327,1636.0,189.0,1.0,12.0,20.0,1656.0,0,0,846,4,0.0,54.0,5.0,0.0,135.0,1642,1956.0,194.0,1,31,8,2018
3,5,UA,N76533,LAX,"Los Angeles, CA",CA,SFO,"San Francisco, CA",CA,600,550.0,-10.0,0.0,-1.0,16.0,606.0,0,0,337,2,0.0,0.0,0.0,0.0,0.0,730,710.0,-20.0,0,31,8,2018
4,5,UA,N35204,SFO,"San Francisco, CA",CA,SAN,"San Diego, CA",CA,2140,2141.0,1.0,1.0,0.0,31.0,2212.0,0,0,447,2,0.0,0.0,0.0,0.0,0.0,2311,2317.0,6.0,1,31,8,2018


In [6]:
# reorder the columns to move day, month and year next to DayofWeek

flights2018_df = flights2018_df[['DayOfWeek',
                                 'Day',
                                 'Month',
                                 'Year',
                                 'Reporting_Airline',
                                 'Tail_Number',
                                 'Origin',
                                 'OriginCityName',
                                 'OriginState',
                                 'Dest',
                                 'DestCityName',
                                 'DestState',
                                 'CRSDepTime',
                                 'DepTime',
                                 'DepDelay',
                                 'DepDel15',
                                 'DepartureDelayGroups',
                                 'TaxiOut',
                                 'WheelsOff',
                                 'Cancelled',
                                 'CancellationCode',
                                 'Distance',
                                 'DistanceGroup',
                                 'CarrierDelay',
                                 'WeatherDelay',
                                 'NASDelay',
                                 'SecurityDelay',
                                 'LateAircraftDelay',
                                 'CRSArrTime',
                                 'ArrTime',
                                 'ArrDelay',
                                 'Delayed']]

flights2018_df.head()

Unnamed: 0,DayOfWeek,Day,Month,Year,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay,Delayed
0,5,31,8,2018,UA,N75432,EWR,"Newark, NJ",NJ,DEN,"Denver, CO",CO,1601,1559.0,-2.0,0.0,-1.0,17.0,1616.0,0,0,1605,7,0.0,0.0,0.0,0.0,0.0,1818,1802.0,-16.0,0
1,5,31,8,2018,UA,N461UA,ORD,"Chicago, IL",IL,GEG,"Spokane, WA",WA,1932,1926.0,-6.0,0.0,-1.0,45.0,2011.0,0,0,1498,6,0.0,0.0,0.0,0.0,0.0,2128,2137.0,9.0,1
2,5,31,8,2018,UA,N76514,SNA,"Santa Ana, CA",CA,DEN,"Denver, CO",CO,1327,1636.0,189.0,1.0,12.0,20.0,1656.0,0,0,846,4,0.0,54.0,5.0,0.0,135.0,1642,1956.0,194.0,1
3,5,31,8,2018,UA,N76533,LAX,"Los Angeles, CA",CA,SFO,"San Francisco, CA",CA,600,550.0,-10.0,0.0,-1.0,16.0,606.0,0,0,337,2,0.0,0.0,0.0,0.0,0.0,730,710.0,-20.0,0
4,5,31,8,2018,UA,N35204,SFO,"San Francisco, CA",CA,SAN,"San Diego, CA",CA,2140,2141.0,1.0,1.0,0.0,31.0,2212.0,0,0,447,2,0.0,0.0,0.0,0.0,0.0,2311,2317.0,6.0,1


In [7]:
# drop columns with redundant, 'give-away', or unuseful information

flights2018_df.drop(['Tail_Number', 
                     'OriginCityName', 
                     'DestCityName',
                     'DepTime', 
                     'DepDel15', 
                     'DepartureDelayGroups', 
                     'TaxiOut', 
                     'WheelsOff',
                     'DistanceGroup',
                     'CarrierDelay',
                     'WeatherDelay',
                     'NASDelay',
                     'SecurityDelay',
                     'LateAircraftDelay',
                     'CancellationCode'], inplace=True, axis=1)
flights2018_df.head()

Unnamed: 0,DayOfWeek,Day,Month,Year,Reporting_Airline,Origin,OriginState,Dest,DestState,CRSDepTime,DepDelay,Cancelled,Distance,CRSArrTime,ArrTime,ArrDelay,Delayed
0,5,31,8,2018,UA,EWR,NJ,DEN,CO,1601,-2.0,0,1605,1818,1802.0,-16.0,0
1,5,31,8,2018,UA,ORD,IL,GEG,WA,1932,-6.0,0,1498,2128,2137.0,9.0,1
2,5,31,8,2018,UA,SNA,CA,DEN,CO,1327,189.0,0,846,1642,1956.0,194.0,1
3,5,31,8,2018,UA,LAX,CA,SFO,CA,600,-10.0,0,337,730,710.0,-20.0,0
4,5,31,8,2018,UA,SFO,CA,SAN,CA,2140,1.0,0,447,2311,2317.0,6.0,1


In [8]:
# create dummy variables for the airlines

airline_df = pd.get_dummies(flights2018_df['Reporting_Airline'], drop_first=True)
airline_df

Unnamed: 0,AA,AS,B6,DL,EV,F9,G4,HA,MQ,NK,OH,OO,UA,VX,WN,YV,YX
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7213441,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
7213442,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
7213443,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
7213444,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
# create dummy variables for the origin airports

origin_df = pd.get_dummies(flights2018_df['Origin'], drop_first=True)
origin_df = origin_df.add_prefix('oa_')
origin_df

Unnamed: 0,oa_ABI,oa_ABQ,oa_ABR,oa_ABY,oa_ACK,oa_ACT,oa_ACV,oa_ACY,oa_ADK,oa_ADQ,oa_AEX,oa_AGS,oa_AKN,oa_ALB,oa_ALO,oa_AMA,oa_ANC,...,oa_TUS,oa_TVC,oa_TWF,oa_TXK,oa_TYR,oa_TYS,oa_UIN,oa_USA,oa_VEL,oa_VLD,oa_VPS,oa_WRG,oa_WYS,oa_XNA,oa_YAK,oa_YNG,oa_YUM
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7213441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7213442,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7213443,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7213444,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
# create dummy variables for the orign states

origin_state_df = pd.get_dummies(flights2018_df['OriginState'], drop_first=True)
origin_state_df = origin_state_df.add_prefix('os_')
origin_state_df

Unnamed: 0,os_AL,os_AR,os_AZ,os_CA,os_CO,os_CT,os_FL,os_GA,os_HI,os_IA,os_ID,os_IL,os_IN,os_KS,os_KY,os_LA,os_MA,...,os_OR,os_PA,os_PR,os_RI,os_SC,os_SD,os_TN,os_TT,os_TX,os_UT,os_VA,os_VI,os_VT,os_WA,os_WI,os_WV,os_WY
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7213441,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7213442,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
7213443,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7213444,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [11]:
# create dummy variables for the destination aiports

dest_df = pd.get_dummies(flights2018_df['Dest'], drop_first=True)
dest_df = dest_df.add_prefix('da_')
dest_df

Unnamed: 0,da_ABI,da_ABQ,da_ABR,da_ABY,da_ACK,da_ACT,da_ACV,da_ACY,da_ADK,da_ADQ,da_AEX,da_AGS,da_AKN,da_ALB,da_ALO,da_AMA,da_ANC,...,da_TUS,da_TVC,da_TWF,da_TXK,da_TYR,da_TYS,da_UIN,da_USA,da_VEL,da_VLD,da_VPS,da_WRG,da_WYS,da_XNA,da_YAK,da_YNG,da_YUM
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7213441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7213442,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7213443,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7213444,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
# create dummy variables for the destination states

dest_state_df = pd.get_dummies(flights2018_df['DestState'], drop_first=True)
dest_state_df = dest_state_df.add_prefix('ds_')
dest_state_df

Unnamed: 0,ds_AL,ds_AR,ds_AZ,ds_CA,ds_CO,ds_CT,ds_FL,ds_GA,ds_HI,ds_IA,ds_ID,ds_IL,ds_IN,ds_KS,ds_KY,ds_LA,ds_MA,...,ds_OR,ds_PA,ds_PR,ds_RI,ds_SC,ds_SD,ds_TN,ds_TT,ds_TX,ds_UT,ds_VA,ds_VI,ds_VT,ds_WA,ds_WI,ds_WV,ds_WY
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7213441,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7213442,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7213443,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
7213444,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [13]:
# add the dummy variable dataframes back to the original dataframe

flights2018_df = pd.concat([flights2018_df, airline_df, origin_df, origin_state_df, dest_df, dest_state_df], axis=1) 
flights2018_df.drop(['Reporting_Airline', 'Origin', 'OriginState', 'Dest', 'DestState'], inplace=True, axis=1)
flights2018_df

Unnamed: 0,DayOfWeek,Day,Month,Year,CRSDepTime,DepDelay,Cancelled,Distance,CRSArrTime,ArrTime,ArrDelay,Delayed,AA,AS,B6,DL,EV,...,ds_OR,ds_PA,ds_PR,ds_RI,ds_SC,ds_SD,ds_TN,ds_TT,ds_TX,ds_UT,ds_VA,ds_VI,ds_VT,ds_WA,ds_WI,ds_WV,ds_WY
0,5,31,8,2018,1601,-2.0,0,1605,1818,1802.0,-16.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,5,31,8,2018,1932,-6.0,0,1498,2128,2137.0,9.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,5,31,8,2018,1327,189.0,0,846,1642,1956.0,194.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,5,31,8,2018,600,-10.0,0,337,730,710.0,-20.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,31,8,2018,2140,1.0,0,447,2311,2317.0,6.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7213441,4,21,6,2018,1335,19.0,0,606,1640,1717.0,37.0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7213442,4,21,6,2018,1425,-8.0,0,695,1642,1614.0,-28.0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7213443,4,21,6,2018,1125,-5.0,0,695,1333,1317.0,-16.0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
7213444,4,21,6,2018,1905,26.0,0,1086,2058,2111.0,13.0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [14]:
# move the target column to the last position

cancel_df = flights2018_df.pop('Delayed')
flights2018_df['Delayed'] = cancel_df
flights2018_df

Unnamed: 0,DayOfWeek,Day,Month,Year,CRSDepTime,DepDelay,Cancelled,Distance,CRSArrTime,ArrTime,ArrDelay,AA,AS,B6,DL,EV,F9,...,ds_PA,ds_PR,ds_RI,ds_SC,ds_SD,ds_TN,ds_TT,ds_TX,ds_UT,ds_VA,ds_VI,ds_VT,ds_WA,ds_WI,ds_WV,ds_WY,Delayed
0,5,31,8,2018,1601,-2.0,0,1605,1818,1802.0,-16.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,5,31,8,2018,1932,-6.0,0,1498,2128,2137.0,9.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
2,5,31,8,2018,1327,189.0,0,846,1642,1956.0,194.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,5,31,8,2018,600,-10.0,0,337,730,710.0,-20.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,31,8,2018,2140,1.0,0,447,2311,2317.0,6.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7213441,4,21,6,2018,1335,19.0,0,606,1640,1717.0,37.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
7213442,4,21,6,2018,1425,-8.0,0,695,1642,1614.0,-28.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7213443,4,21,6,2018,1125,-5.0,0,695,1333,1317.0,-16.0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
7213444,4,21,6,2018,1905,26.0,0,1086,2058,2111.0,13.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1


In [15]:
# take a 1% sample to run an initial model on

sampleflights2018_df = flights2018_df.sample(frac=0.01)
sampleflights2018_df

Unnamed: 0,DayOfWeek,Day,Month,Year,CRSDepTime,DepDelay,Cancelled,Distance,CRSArrTime,ArrTime,ArrDelay,AA,AS,B6,DL,EV,F9,...,ds_PA,ds_PR,ds_RI,ds_SC,ds_SD,ds_TN,ds_TT,ds_TX,ds_UT,ds_VA,ds_VI,ds_VT,ds_WA,ds_WI,ds_WV,ds_WY,Delayed
5055877,2,20,11,2018,600,-6.0,0,660,806,734.0,-32.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6369418,6,1,12,2018,600,-4.0,0,1053,850,839.0,-11.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6740234,3,6,6,2018,1425,-5.0,0,488,1715,1704.0,-11.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1085028,1,24,9,2018,1408,-5.0,0,957,1653,1638.0,-15.0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3348043,2,23,10,2018,810,-1.0,0,1034,1057,1056.0,-1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3686508,1,16,4,2018,555,33.0,0,1242,728,818.0,50.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5648882,5,20,7,2018,943,0.0,0,1086,1330,1323.0,-7.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1064713,6,22,9,2018,2215,-4.0,0,224,2318,2310.0,-8.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3618365,5,6,4,2018,1620,1082.0,0,1107,1929,1314.0,1065.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [16]:
# create a dataframe of Jan-Oct to be used as training data

train_df = sampleflights2018_df[sampleflights2018_df['Month'] <= 10]
train_df

Unnamed: 0,DayOfWeek,Day,Month,Year,CRSDepTime,DepDelay,Cancelled,Distance,CRSArrTime,ArrTime,ArrDelay,AA,AS,B6,DL,EV,F9,...,ds_PA,ds_PR,ds_RI,ds_SC,ds_SD,ds_TN,ds_TT,ds_TX,ds_UT,ds_VA,ds_VI,ds_VT,ds_WA,ds_WI,ds_WV,ds_WY,Delayed
6740234,3,6,6,2018,1425,-5.0,0,488,1715,1704.0,-11.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1085028,1,24,9,2018,1408,-5.0,0,957,1653,1638.0,-15.0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3348043,2,23,10,2018,810,-1.0,0,1034,1057,1056.0,-1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
279067,7,26,8,2018,1549,3.0,0,859,1730,1718.0,-12.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2234035,2,13,3,2018,905,-2.0,0,1532,1102,1048.0,-14.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3686508,1,16,4,2018,555,33.0,0,1242,728,818.0,50.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5648882,5,20,7,2018,943,0.0,0,1086,1330,1323.0,-7.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1064713,6,22,9,2018,2215,-4.0,0,224,2318,2310.0,-8.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3618365,5,6,4,2018,1620,1082.0,0,1107,1929,1314.0,1065.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [17]:
# create a dataframe of Nov-Dec to be used as validation data

validation_df = sampleflights2018_df[sampleflights2018_df['Month'] >= 11]
validation_df

Unnamed: 0,DayOfWeek,Day,Month,Year,CRSDepTime,DepDelay,Cancelled,Distance,CRSArrTime,ArrTime,ArrDelay,AA,AS,B6,DL,EV,F9,...,ds_PA,ds_PR,ds_RI,ds_SC,ds_SD,ds_TN,ds_TT,ds_TX,ds_UT,ds_VA,ds_VI,ds_VT,ds_WA,ds_WI,ds_WV,ds_WY,Delayed
5055877,2,20,11,2018,600,-6.0,0,660,806,734.0,-32.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6369418,6,1,12,2018,600,-4.0,0,1053,850,839.0,-11.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5285994,4,22,11,2018,1404,24.0,0,345,1516,1532.0,16.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5003977,1,26,11,2018,930,-1.0,0,239,1035,1029.0,-6.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
6354306,3,5,12,2018,1820,4.0,0,2521,2147,2151.0,4.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6069715,5,21,12,2018,1329,-1.0,0,372,1451,1437.0,-14.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
6351416,3,19,12,2018,1730,-3.0,0,1608,2044,2029.0,-15.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6509596,7,2,12,2018,1425,-2.0,0,813,1546,1609.0,23.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
6324946,2,11,12,2018,1700,-3.0,0,678,1820,1823.0,3.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [18]:
# split the data in X (independent variables) and y (dependent variable)

X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:,-1]

X_validation = validation_df.iloc[:, :-1]
y_validation = validation_df.iloc[:,-1]

In [19]:
# scale the data

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_validation = scaler.transform(X_validation)

In [20]:
# run an inital logistic regression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print(logreg.score(X_train, y_train))
print(logreg.score(X_validation, y_validation))

0.9918332863981977
0.9853504812196576


In [21]:
y_pred = logreg.predict(X_validation)
print(classification_report(y_validation, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      7556
           1       0.99      0.97      0.98      4185

    accuracy                           0.99     11741
   macro avg       0.99      0.98      0.98     11741
weighted avg       0.99      0.99      0.99     11741

