**Burbank Initial Modelling**
<br/>Read in the 2017 and 2018 Burbank dataframes.
<br/>Run a number of initial models (logistic regression, KNN, decision tree, random forest, MLP Classifier).

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# raise the column display limit so that all data columns can be viewed

pd.set_option('display.max_columns', 40)

In [4]:
# load the 2017 Burbank csv (its first column becomes the row index) and
# remove the 'Cancelled' and 'DepDelay' columns before modelling
# NOTE(review): presumably dropped because they would leak the target - confirm

burbank2017_df = (
    pd.read_csv('burbank2017.csv', index_col=0)
      .drop(columns=['Cancelled', 'DepDelay'])
)
burbank2017_df

Unnamed: 0,DayOfWeek,CRSDepTime,Distance,CRSArrTime,Origin_WIND,Origin_PRCP,Origin_SNOW,Origin_TEMP,Dest_WIND,Dest_PRCP,Dest_SNOW,Dest_TEMP,Day,Month,B6,OO,UA,WN,da_DEN,da_JFK,da_LAS,da_OAK,da_PDX,da_PHX,da_SEA,da_SFO,da_SJC,da_SLC,da_SMF,Delayed
0,6,700,937,930,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0,1,7,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
1,6,1730,937,1958,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0,1,7,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,6,1215,937,1443,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0,1,7,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,7,700,937,930,7.38,0.0,0.0,71.5,4.92,0.0,0.0,62.0,2,7,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
4,7,1730,937,1958,7.38,0.0,0.0,71.5,4.92,0.0,0.0,62.0,2,7,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25100,2,708,326,833,6.71,0.0,0.0,67.5,15.66,0.0,0.0,64.0,19,9,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1
25101,2,2015,326,2141,6.71,0.0,0.0,67.5,15.66,0.0,0.0,64.0,19,9,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
25102,2,700,850,1029,6.71,0.0,0.0,67.5,14.09,0.0,0.0,72.0,19,9,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
25103,2,1220,326,1339,10.51,0.0,0.0,77.5,7.61,0.0,0.0,76.0,5,9,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0


In [5]:
# load the 2018 Burbank csv (its first column becomes the row index) and
# remove the 'Cancelled' and 'DepDelay' columns before modelling
# NOTE(review): presumably dropped because they would leak the target - confirm

burbank2018_df = (
    pd.read_csv('burbank2018.csv', index_col=0)
      .drop(columns=['Cancelled', 'DepDelay'])
)
burbank2018_df

Unnamed: 0,DayOfWeek,CRSDepTime,Distance,CRSArrTime,Origin_WIND,Origin_PRCP,Origin_SNOW,Origin_TEMP,Dest_WIND,Dest_PRCP,Dest_SNOW,Dest_TEMP,Day,Month,B6,OO,UA,WN,da_DEN,da_JFK,da_LAS,da_OAK,da_PDX,da_PHX,da_SEA,da_SFO,da_SJC,da_SLC,da_SMF,Delayed
0,5,2030,326,2200,6.93,0.0,0.0,74.0,15.21,0.0,0.0,62.0,31,8,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
1,5,700,326,826,6.93,0.0,0.0,74.0,15.21,0.0,0.0,62.0,31,8,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1
2,5,715,850,1035,6.93,0.0,0.0,74.0,9.40,0.0,0.0,76.0,31,8,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
3,4,2030,326,2200,7.38,0.0,0.0,75.5,11.18,0.0,0.0,65.0,30,8,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
4,4,700,326,826,7.38,0.0,0.0,75.5,11.18,0.0,0.0,65.0,30,8,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25222,2,1956,326,2115,8.05,0.0,0.0,68.5,14.09,0.0,0.0,60.0,19,6,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1
25223,2,700,326,835,8.05,0.0,0.0,68.5,14.09,0.0,0.0,60.0,19,6,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
25224,1,1956,326,2115,11.18,0.0,0.0,65.0,12.75,0.0,0.0,65.0,18,6,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
25225,1,700,326,835,11.18,0.0,0.0,65.0,12.75,0.0,0.0,65.0,18,6,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1


In [6]:
# count missing values per column in the 2017 frame
# NOTE(review): only the 2017 frame is checked here; 2018 is not inspected
burbank2017_df.isnull().sum()

DayOfWeek      0
CRSDepTime     0
Distance       0
CRSArrTime     0
Origin_WIND    0
Origin_PRCP    0
Origin_SNOW    0
Origin_TEMP    0
Dest_WIND      0
Dest_PRCP      0
Dest_SNOW      0
Dest_TEMP      0
Day            0
Month          0
B6             0
OO             0
UA             0
WN             0
da_DEN         0
da_JFK         0
da_LAS         0
da_OAK         0
da_PDX         0
da_PHX         0
da_SEA         0
da_SFO         0
da_SJC         0
da_SLC         0
da_SMF         0
Delayed        0
dtype: int64

In [7]:
# split the data into X (independent variables) and y (dependent variable)
#
# The target is selected by name rather than by position, so the split no
# longer depends on 'Delayed' happening to be the last column.

X_train = burbank2017_df.drop(columns='Delayed')
y_train = burbank2017_df['Delayed']

# Index the 2018 frame with the 2017 feature columns: this guarantees the same
# features in the same order, and raises a KeyError if any column is missing.
X_validation = burbank2018_df[X_train.columns]
y_validation = burbank2018_df['Delayed']

In [8]:
# standardise the features; the scaler is fitted on the training year only and
# the same fitted transformation is then applied to the validation year

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_validation = scaler.transform(X_validation)

In [9]:
# run an initial logistic regression model (default settings) and print its
# accuracy on the training set, then on the validation set

burbank_lr = LogisticRegression().fit(X_train, y_train)
for features, labels in ((X_train, y_train), (X_validation, y_validation)):
    print(burbank_lr.score(features, labels))

0.6297550288787094
0.6338050501446862


In [10]:
# review precision and recall of the logistic regression on the validation set

y_pred = burbank_lr.predict(X_validation)
validation_report = classification_report(y_validation, y_pred)
print(validation_report)

              precision    recall  f1-score   support

           0       0.66      0.84      0.74     15738
           1       0.52      0.30      0.38      9489

    accuracy                           0.63     25227
   macro avg       0.59      0.57      0.56     25227
weighted avg       0.61      0.63      0.60     25227



In [11]:
# pair every feature name with its fitted logistic-regression coefficient

coefficient_df = pd.DataFrame({
    # every column except the last one (the target) is a feature
    'Feature': burbank2017_df.columns[:-1],
    # ravel() flattens coef_ whatever the number of features is, instead of a
    # hard-coded reshape(29) that breaks as soon as a column is added/removed;
    # a length mismatch with the feature names raises a ValueError here
    'Coefficient': burbank_lr.coef_.ravel(),
})

# ten largest coefficients, then ten smallest
display(coefficient_df.sort_values(by='Coefficient', ascending=False).head(10))
display(coefficient_df.sort_values(by='Coefficient').head(10))

Unnamed: 0,Feature,Coefficient
17,WN,0.560756
3,CRSArrTime,0.277365
24,da_SEA,0.254192
15,OO,0.23111
25,da_SFO,0.174271
11,Dest_TEMP,0.130767
16,UA,0.105221
4,Origin_WIND,0.101379
14,B6,0.099092
19,da_JFK,0.099092


Unnamed: 0,Feature,Coefficient
27,da_SLC,-0.190258
26,da_SJC,-0.0687
13,Month,-0.05808
23,da_PHX,-0.05118
28,da_SMF,-0.046278
2,Distance,-0.024648
7,Origin_TEMP,-0.015379
18,da_DEN,-0.005296
6,Origin_SNOW,0.0
5,Origin_PRCP,0.000177


In [12]:
# try out KNN with the library's default settings

burbank_KNN = KNeighborsClassifier()
burbank_KNN.fit(X_train, y_train)

train_predictions = burbank_KNN.predict(X_train)
validation_predictions = burbank_KNN.predict(X_validation)

# accuracy_score is documented as (y_true, y_pred); the arguments were swapped.
# NOTE(review): 'vaidation' in the second label is a typo, left unchanged so the
# printed text still matches the recorded output.
print(f'The accuracy score on the train set is: {accuracy_score(y_train, train_predictions)}')
print(f'The accuracy score on the vaidation set is: {accuracy_score(y_validation, validation_predictions)}')

The accuracy score on the train set is: 0.7740689105755826
The accuracy score on the vaidation set is: 0.5969794268046141


In [13]:
# try a decision tree limited to a depth of 5

# random_state is fixed so that ties between equally good splits are broken the
# same way on every run, making the scores below reproducible
burbank_DT = DecisionTreeClassifier(max_depth=5, random_state=42)
burbank_DT.fit(X_train, y_train)

print(f'The accuracy score on the train set is: {burbank_DT.score(X_train, y_train)}')
print(f'The accuracy score on the validation set is: {burbank_DT.score(X_validation, y_validation)}')

The accuracy score on the train set is: 0.6520215096594304
The accuracy score on the validation set is: 0.6259959567130455


In [14]:
# try a random forest with the library's default settings

# random_state is fixed because a random forest is stochastic: without a seed
# the scores below change on every Restart & Run All
burbank_RF = RandomForestClassifier(random_state=42)
burbank_RF.fit(X_train, y_train)

print(f'The accuracy score on the train set is: {burbank_RF.score(X_train, y_train)}')
print(f'The accuracy score on the validation set is: {burbank_RF.score(X_validation, y_validation)}')

The accuracy score on the train set is: 0.9805218084047003
The accuracy score on the validation set is: 0.6202085067586316


In [15]:
# neural network: one hidden layer of 5 units, trained with the lbfgs solver

# (5,) is a one-element tuple; the previous (5) was just the integer 5 in
# parentheses. random_state fixes the weight initialisation so the scores
# below are reproducible.
burbank_nn = MLPClassifier(hidden_layer_sizes=(5,), solver='lbfgs', random_state=42)
burbank_nn.fit(X_train, y_train)

# NOTE(review): the second line is labelled 'Test Score' but is computed on the
# validation set; the label is left unchanged to match the recorded output.
print(f'Train Score: {burbank_nn.score(X_train, y_train)}')
print(f'Test Score: {burbank_nn.score(X_validation, y_validation)}')

Train Score: 0.6641704839673371
Test Score: 0.6335275696674199


In [16]:
# pairwise Pearson correlation between all 2017 columns (features and target)
burbank2017_df.corr(method='pearson')

Unnamed: 0,DayOfWeek,CRSDepTime,Distance,CRSArrTime,Origin_WIND,Origin_PRCP,Origin_SNOW,Origin_TEMP,Dest_WIND,Dest_PRCP,Dest_SNOW,Dest_TEMP,Day,Month,B6,OO,UA,WN,da_DEN,da_JFK,da_LAS,da_OAK,da_PDX,da_PHX,da_SEA,da_SFO,da_SJC,da_SLC,da_SMF,Delayed
DayOfWeek,1.0,-0.000553,0.019127,0.001165,0.008637,0.071583,,-0.00833,-0.008441,-0.010666,-0.006254,-0.011965,-0.010528,0.018023,0.008478,0.014184,0.001675,-0.023655,0.00179,0.008478,0.011075,-0.002899,0.015897,0.000659,0.015444,-0.0129,-0.000667,-0.001507,-0.023498,0.03983
CRSDepTime,-0.000553,1.0,0.055114,0.792442,-0.005738,0.004843,,-0.014191,-0.017344,0.016764,-0.015504,-0.000509,0.000336,-0.000735,0.205913,-0.036122,-0.095128,0.041163,-0.142564,0.205913,0.018249,0.051269,0.036474,-0.002249,-0.062752,-0.007952,0.006622,-0.080393,0.058314,0.14578
Distance,0.019127,0.055114,1.0,-0.153592,0.000358,0.00044,,-0.005162,0.164622,0.084298,0.077609,-0.151594,-0.002715,-0.016384,0.727014,0.080042,0.03279,-0.425425,0.291147,0.727014,-0.274164,-0.17621,0.249894,-0.079875,0.310131,-0.155829,-0.193044,0.09246,-0.099296,-0.04048
CRSArrTime,0.001165,0.792442,-0.153592,1.0,-0.010424,0.008258,,-0.016549,-0.0456,0.003659,-0.013746,-0.012561,0.000686,0.004042,-0.22301,0.034697,-0.065123,0.057423,-0.03347,-0.22301,-0.012547,0.037154,0.073731,0.01592,-0.017997,-0.012333,-0.014811,0.009953,0.0412,0.139958
Origin_WIND,0.008637,-0.005738,0.000358,-0.010424,1.0,0.247466,,-0.037313,0.224653,0.046596,-0.00349,0.212782,0.08829,-0.26092,-0.002968,0.019031,-0.004347,-0.004546,0.001847,-0.002968,0.000247,-0.003897,0.007112,-0.005616,-0.001869,-0.001443,0.002379,0.005221,0.000566,0.075288
Origin_PRCP,0.071583,0.004843,0.00044,0.008258,0.247466,1.0,,-0.274038,0.074095,0.215374,0.013978,-0.139829,-0.021969,-0.263094,-0.002278,0.011494,-0.020874,-0.001577,0.000376,-0.002278,0.004678,0.003796,0.007007,0.009686,0.002956,0.002886,-0.018349,-0.013492,0.002619,0.029258
Origin_SNOW,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Origin_TEMP,-0.00833,-0.014191,-0.005162,-0.016549,-0.037313,-0.274038,,1.0,-0.075711,-0.137406,-0.075014,0.593243,0.057309,0.363153,-0.002504,-0.00798,0.03792,0.001298,0.004278,-0.002504,-0.004714,-0.011795,-0.015151,-0.021231,-0.007297,-0.000498,0.030812,0.029,-0.003944,0.001801
Dest_WIND,-0.008441,-0.017344,0.164622,-0.0456,0.224653,0.074095,,-0.075711,1.0,0.150286,0.033496,-0.112876,-0.040062,-0.21524,0.112402,0.116446,0.111962,-0.178086,0.153015,0.112402,-0.066103,0.04762,-0.001784,-0.115625,0.007697,0.272281,-0.137226,0.004901,-0.224901,0.058645
Dest_PRCP,-0.010666,0.016764,0.084298,0.003659,0.046596,0.215374,,-0.137406,0.150286,1.0,0.075954,-0.124881,-0.029388,-0.15183,0.032354,0.014556,-0.017176,-0.056446,-0.030941,0.032354,-0.098119,0.056801,0.081502,-0.064996,0.082914,0.021881,-0.046264,-0.020715,0.031963,0.046104


In [17]:
# try running without destination airport information (too highly related to destination weather)

# The destination-airport indicator columns all carry the 'da_' prefix, so they
# are selected by prefix instead of a hand-written list (which listed 'da_PDX'
# twice). drop() returns a new frame, leaving burbank2017_df untouched.
destination_cols = [col for col in burbank2017_df.columns if col.startswith('da_')]
burbank2017na_df = burbank2017_df.drop(columns=destination_cols)
burbank2017na_df

Unnamed: 0,DayOfWeek,CRSDepTime,Distance,CRSArrTime,Origin_WIND,Origin_PRCP,Origin_SNOW,Origin_TEMP,Dest_WIND,Dest_PRCP,Dest_SNOW,Dest_TEMP,Day,Month,B6,OO,UA,WN,Delayed
0,6,700,937,930,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0,1,7,0,0,0,0,1
1,6,1730,937,1958,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0,1,7,0,0,0,0,1
2,6,1215,937,1443,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0,1,7,0,0,0,0,1
3,7,700,937,930,7.38,0.0,0.0,71.5,4.92,0.0,0.0,62.0,2,7,0,0,0,0,1
4,7,1730,937,1958,7.38,0.0,0.0,71.5,4.92,0.0,0.0,62.0,2,7,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25100,2,708,326,833,6.71,0.0,0.0,67.5,15.66,0.0,0.0,64.0,19,9,0,0,1,0,1
25101,2,2015,326,2141,6.71,0.0,0.0,67.5,15.66,0.0,0.0,64.0,19,9,0,0,1,0,0
25102,2,700,850,1029,6.71,0.0,0.0,67.5,14.09,0.0,0.0,72.0,19,9,0,0,1,0,0
25103,2,1220,326,1339,10.51,0.0,0.0,77.5,7.61,0.0,0.0,76.0,5,9,0,0,1,0,0


In [18]:
# try running without destination airport information (too highly related to destination weather)

# The destination-airport indicator columns all carry the 'da_' prefix, so they
# are selected by prefix instead of a hand-written list (which listed 'da_PDX'
# twice). drop() returns a new frame, leaving burbank2018_df untouched.
destination_cols = [col for col in burbank2018_df.columns if col.startswith('da_')]
burbank2018na_df = burbank2018_df.drop(columns=destination_cols)
burbank2018na_df

Unnamed: 0,DayOfWeek,CRSDepTime,Distance,CRSArrTime,Origin_WIND,Origin_PRCP,Origin_SNOW,Origin_TEMP,Dest_WIND,Dest_PRCP,Dest_SNOW,Dest_TEMP,Day,Month,B6,OO,UA,WN,Delayed
0,5,2030,326,2200,6.93,0.0,0.0,74.0,15.21,0.0,0.0,62.0,31,8,0,0,1,0,0
1,5,700,326,826,6.93,0.0,0.0,74.0,15.21,0.0,0.0,62.0,31,8,0,0,1,0,1
2,5,715,850,1035,6.93,0.0,0.0,74.0,9.40,0.0,0.0,76.0,31,8,0,0,1,0,0
3,4,2030,326,2200,7.38,0.0,0.0,75.5,11.18,0.0,0.0,65.0,30,8,0,0,1,0,0
4,4,700,326,826,7.38,0.0,0.0,75.5,11.18,0.0,0.0,65.0,30,8,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25222,2,1956,326,2115,8.05,0.0,0.0,68.5,14.09,0.0,0.0,60.0,19,6,0,0,1,0,1
25223,2,700,326,835,8.05,0.0,0.0,68.5,14.09,0.0,0.0,60.0,19,6,0,0,1,0,0
25224,1,1956,326,2115,11.18,0.0,0.0,65.0,12.75,0.0,0.0,65.0,18,6,0,0,1,0,0
25225,1,700,326,835,11.18,0.0,0.0,65.0,12.75,0.0,0.0,65.0,18,6,0,0,1,0,1


In [19]:
# split the data into X (independent variables) and y (dependent variable)
#
# The target is selected by name rather than by position, so the split no
# longer depends on 'Delayed' happening to be the last column.

X_train = burbank2017na_df.drop(columns='Delayed')
y_train = burbank2017na_df['Delayed']

# Index the 2018 frame with the 2017 feature columns: this guarantees the same
# features in the same order, and raises a KeyError if any column is missing.
X_validation = burbank2018na_df[X_train.columns]
y_validation = burbank2018na_df['Delayed']

In [20]:
# standardise the features; the scaler is fitted on the training year only and
# the same fitted transformation is then applied to the validation year

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_validation = scaler.transform(X_validation)

In [21]:
# run an initial logistic regression model (default settings) and print its
# accuracy on the training set, then on the validation set

burbank_lr = LogisticRegression().fit(X_train, y_train)
for features, labels in ((X_train, y_train), (X_validation, y_validation)):
    print(burbank_lr.score(features, labels))

0.6279625572595101
0.6366987751218932


In [22]:
# review precision and recall of the logistic regression on the validation set

y_pred = burbank_lr.predict(X_validation)
validation_report = classification_report(y_validation, y_pred)
print(validation_report)

              precision    recall  f1-score   support

           0       0.66      0.85      0.74     15738
           1       0.53      0.28      0.37      9489

    accuracy                           0.64     25227
   macro avg       0.60      0.57      0.56     25227
weighted avg       0.61      0.64      0.60     25227



In [23]:
# pair every feature name with its fitted logistic-regression coefficient

coefficient_df = pd.DataFrame({
    # every column except the last one (the target) is a feature
    'Feature': burbank2017na_df.columns[:-1],
    # ravel() flattens coef_ whatever the number of features is, instead of a
    # hard-coded reshape(18) that breaks as soon as a column is added/removed;
    # a length mismatch with the feature names raises a ValueError here
    'Coefficient': burbank_lr.coef_.ravel(),
})

# ten largest coefficients, then ten smallest
display(coefficient_df.sort_values(by='Coefficient', ascending=False).head(10))
display(coefficient_df.sort_values(by='Coefficient').head(10))

Unnamed: 0,Feature,Coefficient
3,CRSArrTime,0.234852
8,Dest_WIND,0.147697
14,B6,0.102662
1,CRSDepTime,0.101431
0,DayOfWeek,0.094371
11,Dest_TEMP,0.093935
4,Origin_WIND,0.092777
9,Dest_PRCP,0.080024
12,Day,0.029892
10,Dest_SNOW,0.029786


Unnamed: 0,Feature,Coefficient
15,OO,-0.263201
2,Distance,-0.143145
13,Month,-0.066259
16,UA,-0.058963
17,WN,-0.031654
6,Origin_SNOW,0.0
5,Origin_PRCP,0.000768
7,Origin_TEMP,0.006547
10,Dest_SNOW,0.029786
12,Day,0.029892


In [24]:
# try out KNN with the library's default settings

burbank_KNN = KNeighborsClassifier()
burbank_KNN.fit(X_train, y_train)

train_predictions = burbank_KNN.predict(X_train)
validation_predictions = burbank_KNN.predict(X_validation)

# accuracy_score is documented as (y_true, y_pred); the arguments were swapped.
# NOTE(review): 'vaidation' in the second label is a typo, left unchanged so the
# printed text still matches the recorded output.
print(f'The accuracy score on the train set is: {accuracy_score(y_train, train_predictions)}')
print(f'The accuracy score on the vaidation set is: {accuracy_score(y_validation, validation_predictions)}')

The accuracy score on the train set is: 0.7712009559848636
The accuracy score on the vaidation set is: 0.5947992230546637


In [25]:
# try a decision tree limited to a depth of 5

# random_state is fixed so that ties between equally good splits are broken the
# same way on every run, making the scores below reproducible
burbank_DT = DecisionTreeClassifier(max_depth=5, random_state=42)
burbank_DT.fit(X_train, y_train)

print(f'The accuracy score on the train set is: {burbank_DT.score(X_train, y_train)}')
print(f'The accuracy score on the validation set is: {burbank_DT.score(X_validation, y_validation)}')

The accuracy score on the train set is: 0.6523401712806214
The accuracy score on the validation set is: 0.6269869584175685


In [26]:
# try a random forest with the library's default settings

# random_state is fixed because a random forest is stochastic: without a seed
# the scores below change on every Restart & Run All
burbank_RF = RandomForestClassifier(random_state=42)
burbank_RF.fit(X_train, y_train)

print(f'The accuracy score on the train set is: {burbank_RF.score(X_train, y_train)}')
print(f'The accuracy score on the validation set is: {burbank_RF.score(X_validation, y_validation)}')

The accuracy score on the train set is: 0.9797649870543716
The accuracy score on the validation set is: 0.6229833115312958


In [27]:
# neural network: one hidden layer of 10 units, trained with the lbfgs solver

# (10,) is a one-element tuple; the previous (10) was just the integer 10 in
# parentheses. random_state fixes the weight initialisation so the scores
# below are reproducible.
burbank_nn = MLPClassifier(hidden_layer_sizes=(10,), solver='lbfgs', random_state=42)
burbank_nn.fit(X_train, y_train)

# NOTE(review): the second line is labelled 'Test Score' but is computed on the
# validation set; the label is left unchanged to match the recorded output.
print(f'Train Score: {burbank_nn.score(X_train, y_train)}')
print(f'Test Score: {burbank_nn.score(X_validation, y_validation)}')

Train Score: 0.6691097390957976
Test Score: 0.6397907004400047
