**EDA Flights 2017**
<br/>Read in the clean 2017 flight csv and perform further EDA.
<br/>Also run a basic logistic regression model and results for 2017.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score

from sklearn.preprocessing import StandardScaler

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# increase display size to enable viewing of all data columns

pd.options.display.max_rows = 350
pd.options.display.max_columns = 35

In [4]:
# read in the clean flights csv

flights2017_df = pd.read_csv('flights2017clean.csv', index_col=0)
flights2017_df

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay,Delayed
0,6,2017-07-01,AS,N559AS,DCA,"Washington, DC",VA,SEA,"Seattle, WA",WA,800,750.0,-10.0,0.0,-1.0,17.0,807.0,0,0,2329,10,0.0,0.0,0.0,0.0,0.0,1053,1021.0,-32.0,0
1,6,2017-07-01,AS,N513AS,SEA,"Seattle, WA",WA,DCA,"Washington, DC",VA,1335,1330.0,-5.0,0.0,-1.0,15.0,1345.0,0,0,2329,10,0.0,0.0,0.0,0.0,0.0,2153,2129.0,-24.0,0
2,6,2017-07-01,AS,N588AS,DCA,"Washington, DC",VA,SEA,"Seattle, WA",WA,1852,1856.0,4.0,0.0,0.0,9.0,1905.0,0,0,2329,10,0.0,0.0,0.0,0.0,0.0,2157,2107.0,-50.0,0
3,6,2017-07-01,AS,N538AS,SEA,"Seattle, WA",WA,DCA,"Washington, DC",VA,800,840.0,40.0,1.0,2.0,9.0,849.0,0,0,2329,10,36.0,0.0,0.0,0.0,0.0,1617,1653.0,36.0,1
4,6,2017-07-01,AS,N536AS,DCA,"Washington, DC",VA,LAX,"Los Angeles, CA",CA,900,854.0,-6.0,0.0,-1.0,13.0,907.0,0,0,2311,10,0.0,0.0,0.0,0.0,0.0,1138,1115.0,-23.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5674616,2,2017-09-05,UA,N456UA,EWR,"Newark, NJ",NJ,LAS,"Las Vegas, NV",NV,842,949.0,67.0,1.0,4.0,24.0,1013.0,0,0,2227,9,20.0,0.0,0.0,0.0,27.0,1110,1157.0,47.0,1
5674617,2,2017-09-05,UA,N809UA,RDU,"Raleigh/Durham, NC",NC,DEN,"Denver, CO",CO,1623,1616.0,-7.0,0.0,-1.0,21.0,1637.0,0,0,1436,6,0.0,0.0,0.0,0.0,0.0,1817,1828.0,11.0,1
5674618,2,2017-09-05,UA,N69816,DEN,"Denver, CO",CO,SAT,"San Antonio, TX",TX,1526,1523.0,-3.0,0.0,-1.0,12.0,1535.0,0,0,794,4,0.0,0.0,0.0,0.0,0.0,1835,1817.0,-18.0,0
5674619,2,2017-09-05,UA,N17752,EWR,"Newark, NJ",NJ,DFW,"Dallas/Fort Worth, TX",TX,605,557.0,-8.0,0.0,-1.0,24.0,621.0,0,0,1372,6,0.0,0.0,0.0,0.0,0.0,845,827.0,-18.0,0


In [5]:
# set the FlightDate to be a datecolumn and extract each of day, month and year separately, create own column for each
# drop the orignal FlightDate column and check that the new columns are correct

pd.to_datetime(flights2017_df['FlightDate'])
flights2017_df['Day'] = pd.DatetimeIndex(flights2017_df['FlightDate']).day
flights2017_df['Month'] = pd.DatetimeIndex(flights2017_df['FlightDate']).month
flights2017_df['Year'] = pd.DatetimeIndex(flights2017_df['FlightDate']).year

flights2017_df.drop(['FlightDate'], axis=1, inplace=True)

flights2017_df.head()

Unnamed: 0,DayOfWeek,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay,Delayed,Day,Month,Year
0,6,AS,N559AS,DCA,"Washington, DC",VA,SEA,"Seattle, WA",WA,800,750.0,-10.0,0.0,-1.0,17.0,807.0,0,0,2329,10,0.0,0.0,0.0,0.0,0.0,1053,1021.0,-32.0,0,1,7,2017
1,6,AS,N513AS,SEA,"Seattle, WA",WA,DCA,"Washington, DC",VA,1335,1330.0,-5.0,0.0,-1.0,15.0,1345.0,0,0,2329,10,0.0,0.0,0.0,0.0,0.0,2153,2129.0,-24.0,0,1,7,2017
2,6,AS,N588AS,DCA,"Washington, DC",VA,SEA,"Seattle, WA",WA,1852,1856.0,4.0,0.0,0.0,9.0,1905.0,0,0,2329,10,0.0,0.0,0.0,0.0,0.0,2157,2107.0,-50.0,0,1,7,2017
3,6,AS,N538AS,SEA,"Seattle, WA",WA,DCA,"Washington, DC",VA,800,840.0,40.0,1.0,2.0,9.0,849.0,0,0,2329,10,36.0,0.0,0.0,0.0,0.0,1617,1653.0,36.0,1,1,7,2017
4,6,AS,N536AS,DCA,"Washington, DC",VA,LAX,"Los Angeles, CA",CA,900,854.0,-6.0,0.0,-1.0,13.0,907.0,0,0,2311,10,0.0,0.0,0.0,0.0,0.0,1138,1115.0,-23.0,0,1,7,2017


In [6]:
# reorder the columns to move day, month and year next to DayofWeek

flights2017_df = flights2017_df[['DayOfWeek',
                                 'Day',
                                 'Month',
                                 'Year',
                                 'Reporting_Airline',
                                 'Tail_Number',
                                 'Origin',
                                 'OriginCityName',
                                 'OriginState',
                                 'Dest',
                                 'DestCityName',
                                 'DestState',
                                 'CRSDepTime',
                                 'DepTime',
                                 'DepDelay',
                                 'DepDel15',
                                 'DepartureDelayGroups',
                                 'TaxiOut',
                                 'WheelsOff',
                                 'Cancelled',
                                 'CancellationCode',
                                 'Distance',
                                 'DistanceGroup',
                                 'CarrierDelay',
                                 'WeatherDelay',
                                 'NASDelay',
                                 'SecurityDelay',
                                 'LateAircraftDelay',
                                 'CRSArrTime',
                                 'ArrTime',
                                 'ArrDelay',
                                 'Delayed']]

flights2017_df.head()

Unnamed: 0,DayOfWeek,Day,Month,Year,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay,Delayed
0,6,1,7,2017,AS,N559AS,DCA,"Washington, DC",VA,SEA,"Seattle, WA",WA,800,750.0,-10.0,0.0,-1.0,17.0,807.0,0,0,2329,10,0.0,0.0,0.0,0.0,0.0,1053,1021.0,-32.0,0
1,6,1,7,2017,AS,N513AS,SEA,"Seattle, WA",WA,DCA,"Washington, DC",VA,1335,1330.0,-5.0,0.0,-1.0,15.0,1345.0,0,0,2329,10,0.0,0.0,0.0,0.0,0.0,2153,2129.0,-24.0,0
2,6,1,7,2017,AS,N588AS,DCA,"Washington, DC",VA,SEA,"Seattle, WA",WA,1852,1856.0,4.0,0.0,0.0,9.0,1905.0,0,0,2329,10,0.0,0.0,0.0,0.0,0.0,2157,2107.0,-50.0,0
3,6,1,7,2017,AS,N538AS,SEA,"Seattle, WA",WA,DCA,"Washington, DC",VA,800,840.0,40.0,1.0,2.0,9.0,849.0,0,0,2329,10,36.0,0.0,0.0,0.0,0.0,1617,1653.0,36.0,1
4,6,1,7,2017,AS,N536AS,DCA,"Washington, DC",VA,LAX,"Los Angeles, CA",CA,900,854.0,-6.0,0.0,-1.0,13.0,907.0,0,0,2311,10,0.0,0.0,0.0,0.0,0.0,1138,1115.0,-23.0,0


In [7]:
# number of airlines

display(flights2017_df['Reporting_Airline'].nunique())

12

In [8]:
# check to see if origin, origin city and origin state have a one-to-one relationship

display(flights2017_df['Origin'].nunique())
display(flights2017_df['OriginCityName'].nunique())
display(flights2017_df['OriginState'].nunique())

319

315

52

In [9]:
# check to see if destination, destination city and destination state have a one-to-one relationship

display(flights2017_df['Dest'].nunique())
display(flights2017_df['DestCityName'].nunique())
display(flights2017_df['DestState'].nunique())

320

316

52

In [10]:
# drop columns with redundant, 'give-away', or unuseful information

flights2017_df.drop(['Tail_Number', 
                     'OriginCityName', 
                     'DestCityName',
                     'DepTime', 
                     'DepDel15', 
                     'DepartureDelayGroups', 
                     'TaxiOut', 
                     'WheelsOff',
                     'DistanceGroup',
                     'CarrierDelay',
                     'WeatherDelay',
                     'NASDelay',
                     'SecurityDelay',
                     'LateAircraftDelay',
                     'CancellationCode',
                     'Cancelled'], inplace=True, axis=1)
flights2017_df.head()

Unnamed: 0,DayOfWeek,Day,Month,Year,Reporting_Airline,Origin,OriginState,Dest,DestState,CRSDepTime,DepDelay,Distance,CRSArrTime,ArrTime,ArrDelay,Delayed
0,6,1,7,2017,AS,DCA,VA,SEA,WA,800,-10.0,2329,1053,1021.0,-32.0,0
1,6,1,7,2017,AS,SEA,WA,DCA,VA,1335,-5.0,2329,2153,2129.0,-24.0,0
2,6,1,7,2017,AS,DCA,VA,SEA,WA,1852,4.0,2329,2157,2107.0,-50.0,0
3,6,1,7,2017,AS,SEA,WA,DCA,VA,800,40.0,2329,1617,1653.0,36.0,1
4,6,1,7,2017,AS,DCA,VA,LAX,CA,900,-6.0,2311,1138,1115.0,-23.0,0


In [11]:
# create dummy variables for the airlines

airline_df = pd.get_dummies(flights2017_df['Reporting_Airline'], drop_first=True)
airline_df

Unnamed: 0,AS,B6,DL,EV,F9,HA,NK,OO,UA,VX,WN
0,1,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
5674616,0,0,0,0,0,0,0,0,1,0,0
5674617,0,0,0,0,0,0,0,0,1,0,0
5674618,0,0,0,0,0,0,0,0,1,0,0
5674619,0,0,0,0,0,0,0,0,1,0,0


In [12]:
# create dummy variables for the origin airports

origin_df = pd.get_dummies(flights2017_df['Origin'], drop_first=True)
origin_df = origin_df.add_prefix('oa_')
origin_df

Unnamed: 0,oa_ABI,oa_ABQ,oa_ABR,oa_ABY,oa_ACK,oa_ACT,oa_ACV,oa_ACY,oa_ADK,oa_ADQ,oa_AEX,oa_AGS,oa_AKN,oa_ALB,oa_ALO,oa_AMA,oa_ANC,...,oa_TTN,oa_TUL,oa_TUS,oa_TVC,oa_TWF,oa_TXK,oa_TYR,oa_TYS,oa_UIN,oa_UST,oa_VLD,oa_VPS,oa_WRG,oa_WYS,oa_XNA,oa_YAK,oa_YUM
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5674616,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5674617,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5674618,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5674619,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
# create dummy variables for the orign states

origin_state_df = pd.get_dummies(flights2017_df['OriginState'], drop_first=True)
origin_state_df = origin_state_df.add_prefix('os_')
origin_state_df

Unnamed: 0,os_AL,os_AR,os_AZ,os_CA,os_CO,os_CT,os_FL,os_GA,os_HI,os_IA,os_ID,os_IL,os_IN,os_KS,os_KY,os_LA,os_MA,...,os_OR,os_PA,os_PR,os_RI,os_SC,os_SD,os_TN,os_TT,os_TX,os_UT,os_VA,os_VI,os_VT,os_WA,os_WI,os_WV,os_WY
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5674616,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5674617,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5674618,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5674619,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [14]:
# create dummy variables for the destination aiports

dest_df = pd.get_dummies(flights2017_df['Dest'], drop_first=True)
dest_df = dest_df.add_prefix('da_')
dest_df

Unnamed: 0,da_ABI,da_ABQ,da_ABR,da_ABY,da_ACK,da_ACT,da_ACV,da_ACY,da_ADK,da_ADQ,da_AEX,da_AGS,da_AKN,da_ALB,da_ALO,da_AMA,da_ANC,...,da_TTN,da_TUL,da_TUS,da_TVC,da_TWF,da_TXK,da_TYR,da_TYS,da_UIN,da_UST,da_VLD,da_VPS,da_WRG,da_WYS,da_XNA,da_YAK,da_YUM
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5674616,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5674617,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5674618,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5674619,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [15]:
# create dummy variables for the destination states

dest_state_df = pd.get_dummies(flights2017_df['DestState'], drop_first=True)
dest_state_df = dest_state_df.add_prefix('ds_')
dest_state_df

Unnamed: 0,ds_AL,ds_AR,ds_AZ,ds_CA,ds_CO,ds_CT,ds_FL,ds_GA,ds_HI,ds_IA,ds_ID,ds_IL,ds_IN,ds_KS,ds_KY,ds_LA,ds_MA,...,ds_OR,ds_PA,ds_PR,ds_RI,ds_SC,ds_SD,ds_TN,ds_TT,ds_TX,ds_UT,ds_VA,ds_VI,ds_VT,ds_WA,ds_WI,ds_WV,ds_WY
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5674616,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5674617,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5674618,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
5674619,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [16]:
# add the dummy variable dataframes back to the original dataframe

flights2017_df = pd.concat([flights2017_df, airline_df, origin_df, origin_state_df, dest_df, dest_state_df], axis=1) 
flights2017_df.drop(['Reporting_Airline', 'Origin', 'OriginState', 'Dest', 'DestState'], inplace=True, axis=1)
flights2017_df

Unnamed: 0,DayOfWeek,Day,Month,Year,CRSDepTime,DepDelay,Distance,CRSArrTime,ArrTime,ArrDelay,Delayed,AS,B6,DL,EV,F9,HA,...,ds_OR,ds_PA,ds_PR,ds_RI,ds_SC,ds_SD,ds_TN,ds_TT,ds_TX,ds_UT,ds_VA,ds_VI,ds_VT,ds_WA,ds_WI,ds_WV,ds_WY
0,6,1,7,2017,800,-10.0,2329,1053,1021.0,-32.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,6,1,7,2017,1335,-5.0,2329,2153,2129.0,-24.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,6,1,7,2017,1852,4.0,2329,2157,2107.0,-50.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,6,1,7,2017,800,40.0,2329,1617,1653.0,36.0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,6,1,7,2017,900,-6.0,2311,1138,1115.0,-23.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5674616,2,5,9,2017,842,67.0,2227,1110,1157.0,47.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5674617,2,5,9,2017,1623,-7.0,1436,1817,1828.0,11.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5674618,2,5,9,2017,1526,-3.0,794,1835,1817.0,-18.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
5674619,2,5,9,2017,605,-8.0,1372,845,827.0,-18.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [17]:
# move the target column to the last position

cancel_df = flights2017_df.pop('Delayed')
flights2017_df['Delayed'] = cancel_df
flights2017_df

Unnamed: 0,DayOfWeek,Day,Month,Year,CRSDepTime,DepDelay,Distance,CRSArrTime,ArrTime,ArrDelay,AS,B6,DL,EV,F9,HA,NK,...,ds_PA,ds_PR,ds_RI,ds_SC,ds_SD,ds_TN,ds_TT,ds_TX,ds_UT,ds_VA,ds_VI,ds_VT,ds_WA,ds_WI,ds_WV,ds_WY,Delayed
0,6,1,7,2017,800,-10.0,2329,1053,1021.0,-32.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,6,1,7,2017,1335,-5.0,2329,2153,2129.0,-24.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,6,1,7,2017,1852,4.0,2329,2157,2107.0,-50.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,6,1,7,2017,800,40.0,2329,1617,1653.0,36.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
4,6,1,7,2017,900,-6.0,2311,1138,1115.0,-23.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5674616,2,5,9,2017,842,67.0,2227,1110,1157.0,47.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5674617,2,5,9,2017,1623,-7.0,1436,1817,1828.0,11.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5674618,2,5,9,2017,1526,-3.0,794,1835,1817.0,-18.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
5674619,2,5,9,2017,605,-8.0,1372,845,827.0,-18.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [18]:
# take a 1% sample to run an initial model on

sampleflights2017_df = flights2017_df.sample(frac=0.01)
sampleflights2017_df

Unnamed: 0,DayOfWeek,Day,Month,Year,CRSDepTime,DepDelay,Distance,CRSArrTime,ArrTime,ArrDelay,AS,B6,DL,EV,F9,HA,NK,...,ds_PA,ds_PR,ds_RI,ds_SC,ds_SD,ds_TN,ds_TT,ds_TX,ds_UT,ds_VA,ds_VI,ds_VT,ds_WA,ds_WI,ds_WV,ds_WY,Delayed
529910,1,5,6,2017,2245,223.0,1524,412,749.0,217.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
3134223,3,24,5,2017,1355,-4.0,264,1418,1416.0,-2.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2486960,7,12,11,2017,800,43.0,1390,1305,1405.0,60.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
2514322,4,23,11,2017,640,-1.0,189,735,731.0,-4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
5196064,7,13,8,2017,1545,-12.0,1721,1812,1748.0,-24.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3191944,3,3,5,2017,1330,-6.0,337,1459,1449.0,-10.0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5520821,7,24,9,2017,1710,-5.0,946,2000,1953.0,-7.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
397085,3,19,7,2017,600,-7.0,1067,754,741.0,-13.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2739451,1,6,11,2017,600,-3.0,461,740,726.0,-14.0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
# create a dataframe of Jan-Oct to be used as training data

train_df = sampleflights2017_df[sampleflights2017_df['Month'] <= 10]
train_df

Unnamed: 0,DayOfWeek,Day,Month,Year,CRSDepTime,DepDelay,Distance,CRSArrTime,ArrTime,ArrDelay,AS,B6,DL,EV,F9,HA,NK,...,ds_PA,ds_PR,ds_RI,ds_SC,ds_SD,ds_TN,ds_TT,ds_TX,ds_UT,ds_VA,ds_VI,ds_VT,ds_WA,ds_WI,ds_WV,ds_WY,Delayed
529910,1,5,6,2017,2245,223.0,1524,412,749.0,217.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
3134223,3,24,5,2017,1355,-4.0,264,1418,1416.0,-2.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5196064,7,13,8,2017,1545,-12.0,1721,1812,1748.0,-24.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
5461002,4,21,9,2017,800,-8.0,1846,1032,1032.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2949225,7,7,5,2017,1055,-15.0,201,1202,1135.0,-27.0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3736321,2,3,1,2017,1040,36.0,1476,1600,1628.0,28.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
3191944,3,3,5,2017,1330,-6.0,337,1459,1449.0,-10.0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5520821,7,24,9,2017,1710,-5.0,946,2000,1953.0,-7.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
397085,3,19,7,2017,600,-7.0,1067,754,741.0,-13.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [20]:
# create a dataframe of Nov-Dec to be used as validation data

validation_df = sampleflights2017_df[sampleflights2017_df['Month'] >= 11]
validation_df

Unnamed: 0,DayOfWeek,Day,Month,Year,CRSDepTime,DepDelay,Distance,CRSArrTime,ArrTime,ArrDelay,AS,B6,DL,EV,F9,HA,NK,...,ds_PA,ds_PR,ds_RI,ds_SC,ds_SD,ds_TN,ds_TT,ds_TX,ds_UT,ds_VA,ds_VI,ds_VT,ds_WA,ds_WI,ds_WV,ds_WY,Delayed
2486960,7,12,11,2017,800,43.0,1390,1305,1405.0,60.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
2514322,4,23,11,2017,640,-1.0,189,735,731.0,-4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2859310,4,9,11,2017,1222,6.0,1269,1720,1720.0,0.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2674223,5,10,11,2017,2200,105.0,358,2332,1303.0,-1029.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2775781,1,27,11,2017,1610,-4.0,808,1745,1718.0,-27.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1336065,2,26,12,2017,1855,-4.0,404,2025,2021.0,-4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1072991,1,18,12,2017,1330,14.0,1670,1552,1549.0,-3.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2848275,6,18,11,2017,1510,-1.0,2611,1840,1828.0,-12.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1073651,1,18,12,2017,1509,27.0,782,1638,1647.0,9.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [21]:
# split the data in X (independent variables) and y (dependent variable)

X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:,-1]

X_validation = validation_df.iloc[:, :-1]
y_validation = validation_df.iloc[:,-1]

In [22]:
# scale the data

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_validation = scaler.transform(X_validation)

In [23]:
# run an inital logistic regression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print(logreg.score(X_train, y_train))
print(logreg.score(X_validation, y_validation))

0.9889694295619288
0.982316534040672


In [24]:
y_pred = logreg.predict(X_validation)
print(classification_report(y_validation, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      6364
           1       0.99      0.95      0.97      2684

    accuracy                           0.98      9048
   macro avg       0.99      0.97      0.98      9048
weighted avg       0.98      0.98      0.98      9048

