In [1]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from sklearn.pipeline                import Pipeline
from sklearn.model_selection         import train_test_split, GridSearchCV
from sklearn.linear_model            import LogisticRegression
from sklearn.ensemble                import BaggingClassifier,RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier,GradientBoostingClassifier

from sklearn.tree                    import DecisionTreeClassifier
from sklearn.svm                     import SVC

import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
from sklearn.preprocessing import StandardScaler



from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

In [3]:
data = pd.read_csv('~/ga/projects/capstone_data/data/data_model.csv')

In [4]:
data.columns

Index(['Unnamed: 0', 'month', 'day_of_month', 'day_of_week', 'fl_date',
       'op_carrier', 'op_carrier_fl_num', 'origin', 'origin_city_name',
       'origin_state_abr', 'origin_state_nm', 'dest', 'dest_city_name',
       'dest_state_abr', 'dest_state_nm', 'dep_delay', 'arr_delay', 'distance',
       'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay', 'carrier', 'total_delay'],
      dtype='object')

In [5]:
data.drop(columns = ['Unnamed: 0'], inplace=True)

In [6]:
data['delay_indicator'] = np.where(data['total_delay']>14,1,0)

In [7]:
data['delay_indicator'].value_counts(normalize=True)

0    0.818538
1    0.181462
Name: delay_indicator, dtype: float64

In [8]:
data_negative = data[data['delay_indicator'] == 1]

data_pozitive = data[data['delay_indicator'] == 0].sample(data_negative.shape[0])

data_balanced = pd.concat([data_pozitive,data_negative])
data_balanced.shape

(644232, 25)

In [9]:
X = data_balanced.drop(columns=['delay_indicator', 'total_delay','dep_delay','arr_delay'])
y = data_balanced['delay_indicator']

In [10]:
X = pd.get_dummies(X,columns = ['origin','dest','carrier'],drop_first=True )

In [11]:
X = X._get_numeric_data()
X.shape

(644232, 717)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [13]:
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

## FFNN ##

In [14]:
model = Sequential()

model.add(Dense(32,
         input_shape = (717,),
         activation = 'relu'))

model.add(Dense(32,
               activation='relu'))

model.add(Dense(1,
               activation = 'sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [18]:
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=512,
    verbose=2
)

Train on 483174 samples, validate on 161058 samples
Epoch 1/10
483174/483174 - 42s - loss: 0.6658 - accuracy: 0.7316 - val_loss: 0.3154 - val_accuracy: 0.9764
Epoch 2/10
483174/483174 - 28s - loss: 0.1787 - accuracy: 0.9646 - val_loss: 0.1198 - val_accuracy: 0.9847
Epoch 3/10
483174/483174 - 19s - loss: 0.1065 - accuracy: 0.9751 - val_loss: 0.0666 - val_accuracy: 0.9858
Epoch 4/10
483174/483174 - 19s - loss: 0.1128 - accuracy: 0.9724 - val_loss: 0.5135 - val_accuracy: 0.8027
Epoch 5/10
483174/483174 - 18s - loss: 0.1030 - accuracy: 0.9760 - val_loss: 0.0652 - val_accuracy: 0.9858
Epoch 6/10
483174/483174 - 20s - loss: 0.0698 - accuracy: 0.9857 - val_loss: 0.0657 - val_accuracy: 0.9858
Epoch 7/10
483174/483174 - 19s - loss: 0.0953 - accuracy: 0.9783 - val_loss: 0.0828 - val_accuracy: 0.9856
Epoch 8/10
483174/483174 - 18s - loss: 0.1048 - accuracy: 0.9755 - val_loss: 0.0642 - val_accuracy: 0.9858
Epoch 9/10
483174/483174 - 20s - loss: 0.0921 - accuracy: 0.9795 - val_loss: 0.0870 - val_ac