Random Forest and Decision Trees did not end up being useful for this dataset.  14 Models were tested and those 14, 13 produced an R2 less than 85%.  Not terrible but not nothing to inspire confidence for predictions.

The one model to break the cycle was prediction Departure Delays that were 15 minutes or more.  Utilizing the previous best model with 18 features, this model produced an R2 of 98.665% by far the best score of any model tested of all the different procedures so far.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

import warnings
warnings.simplefilter('ignore')

# RNG used for seeding
rng = int(np.random.randint(low=1, high=2000, size=1))

In [2]:
# Read in first quarter dataset
delays_df = pd.read_csv("Delay_first_quarter.csv")

In [3]:
# Do some additional cleaning
delays_df = delays_df.fillna(0)

In [4]:
# Dummy variables for flights of east coast origin/destination 
# Dummy variables for flights of west coast origin/destination -> both fixed

delays_df['EAST_COAST_ORIGIN'] = 1*np.ravel(delays_df["ORIGIN_LONGITUDE"] >= -83)
delays_df['EAST_COAST_DEST'] = 1*np.ravel(delays_df["DEST_LONGITUDE"] >= -83)
delays_df['WEST_COAST_ORIGIN'] = 1*np.ravel(delays_df["ORIGIN_LONGITUDE"] <= -114)
delays_df['WEST_COAST_DEST'] = 1*np.ravel(delays_df["DEST_LONGITUDE"] <= -114)

In [5]:
# Create more dummy variables for categorical data:

Weekday = {
           "Monday": 1,
           "Tueday": 2, 
           "Wednesday": 3, 
           "Thursday" : 4,
           "Friday": 5,
           "Saturday": 6,
           "Sunday": 7
          }

Airline = {
        "UA": 1,
        "AA": 2,
        "9E": 3,
        "B6": 4,
        "EV": 5,
        "F9": 6,
        "G4": 7,
        "HA": 8,
        "MQ": 9,
        "NK": 10,
        "OH": 11,
        "OO": 12,
        "VX": 13,
        "WN": 14,
        "YV": 15,
        "YX": 16,
        "AS": 17,
        "DL": 18
}

In [6]:
delays_df['WEEKDAY_DUMMY'] = delays_df['WEEKDAY'].apply(  \
                            lambda x: next((y for z, y in Weekday.items() if x in z), 0))

In [7]:
delays_df['AIRLINE_DUMMY'] = delays_df['OP_CARRIER'].apply(  \
                            lambda x: next((y for z, y in Airline.items() if x in z), 0))

In [8]:
# Make a proper categorical delay dummy variable
# Get Month into a categorical value
# This uses less memory than using lambda.

delay_dict = {0: "On Time", 1: "Delayed"}
delays_df["DEPARTURE_DELAY_DUMMY"]=delays_df["DEPARTURE_DELAY"].copy()
delays_df["DEPARTURE_DELAY"].replace(delay_dict, inplace = True)

In [9]:
arrive_dict = {0: "On Time", 1: "Delayed"}
delays_df["ARRIVAL_DELAY_DUMMY"]=delays_df["ARRIVAL_DELAY"].copy()
delays_df["ARRIVAL_DELAY"].replace(delay_dict, inplace = True)

In [10]:
Delay_dict = {range(-2000, 1): "On Time",
               range(1, 16): 'Small Delay', 
               range(16, 2000): 'Long Delay',}

In [11]:
delays_df['DEPARTURE_DELAY_TEST'] = delays_df['DEP_DELAY'].apply(  \
                            lambda x: next((y for z, y in Delay_dict.items() if x in z), 0))

In [12]:
delays_df['ARRIVAL_DELAY_TEST'] = delays_df['ARR_DELAY'].apply(  \
                            lambda x: next((y for z, y in Delay_dict.items() if x in z), 0))

FIRST DEPARTURE MODEL  - BOTH DECISION TREE (68%) AND RANDOM FOREST (68.24)%

In [13]:
# As this follows a similar path to logistic regression, the same initial steps will be used.
# Start with first model

# Departure delay logistic ML model -> 10 features

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEPARTURE_TIME_OF_DAY_DUMMY", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
               "OP_CARRIER_FL_NUM", "TAXI_OUT", "WHEELS_OFF","WEATHER_DELAY",]]

y = delays_df["DEPARTURE_DELAY"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 10) (1683475, 1)


In [14]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)

In [15]:
# Set up the random forest classifier

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=rng, oob_score=True)
clf.fit(X_train, y_train) 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=263, verbose=0,
                       warm_start=False)

In [16]:
print(clf.feature_importances_)

[1.45541134e-02 3.14812073e-07 2.99546628e-01 1.78531886e-01
 3.16942750e-05 1.59597439e-02 1.36454548e-03 5.46013824e-02
 2.83306557e-01 1.52103135e-01]


In [18]:
from sklearn.metrics import accuracy_score
predictions = clf.predict(X_test)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_test, predictions)*100}")

Accuracy Score: 68.05371742751308


In [19]:
# Um good at predicting on time but not delays?
pd.crosstab(y_transposed, predictions, rownames=["Actual Delays"], colnames=["Predicted Delays"])

Predicted Delays,Delayed,On Time
Actual Delays,Unnamed: 1_level_1,Unnamed: 2_level_1
Delayed,3606,134446
On Time,6,282811


In [24]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

clf = DecisionTreeClassifier(criterion = "entropy", random_state=rng)
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=552, splitter='best')

In [25]:
y_transposed = (np.transpose(y_test)).flatten()

In [26]:
predictions = clf.predict(X_test)

In [27]:
print(f"Accuracy Score: {accuracy_score(y_test, predictions)*100}")

Accuracy Score: 72.70148193380838


In [28]:
pd.crosstab(y_transposed, predictions, rownames=["Actual Delays"], colnames=["Predicted Delays"])

Predicted Delays,Delayed,On Time
Actual Delays,Unnamed: 1_level_1,Unnamed: 2_level_1
Delayed,79726,58326
On Time,56565,226252


In [None]:
SECOND DEPARTURE DELAY 4 FEATURES -> RANDOM FOREST (68%)

In [20]:
# A complete stripdown to see if any useful analysis can be done with this method on delay data.
# Only 4 features

X = delays_df[["DEP_TIME", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", "WEATHER_DELAY",]]

y = delays_df["DEPARTURE_DELAY"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 4) (1683475, 1)


In [21]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)

In [22]:
# Set up the random forest classifier

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=rng, oob_score=True)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=263, verbose=0,
                       warm_start=False)

In [23]:
# Hey look at that.  Numbers greater than 0.00001
print(clf.feature_importances_)

[0.58423113 0.00058848 0.09236184 0.32281855]


In [24]:
predictions = clf.predict(X_test)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_test, predictions)*100}")

Accuracy Score: 68.24926521079006


In [25]:
# As predicted, not useful.
pd.crosstab(y_transposed, predictions, rownames=["Actual Delays"], colnames=["Predicted Delays"])

Predicted Delays,Delayed,On Time
Actual Delays,Unnamed: 1_level_1,Unnamed: 2_level_1
Delayed,4429,133623
On Time,6,282811


ARRIVAL DELAYS WITH 18 FEATURES -> RANDOM FOREST (84.3%)

In [26]:
# Trying out Arrival data before utilizing a PCA.

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEP_DELAY", "DEPARTURE_TIME_OF_DAY_DUMMY", \
              "OP_CARRIER_FL_NUM", "TAXI_OUT", "AIR_TIME", "TAXI_IN", \
              "ARRIVAL_TIME_OF_DAY_DUMMY", "CARRIER_DELAY", "DISTANCE", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
       "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY",]]
y = delays_df["ARRIVAL_DELAY"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 18) (1683475, 1)


In [27]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)

In [30]:
# Set up the random forest classifier

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=rng, oob_score=True)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=263, verbose=0,
                       warm_start=False)

In [31]:
print(clf.feature_importances_)

[2.38248330e-03 0.00000000e+00 2.21831389e-02 3.46906273e-01
 7.59596161e-03 0.00000000e+00 7.62767784e-02 1.04203195e-02
 4.41071562e-03 8.00889291e-04 1.30628755e-01 0.00000000e+00
 0.00000000e+00 1.57298067e-04 2.27499549e-02 1.78827198e-01
 0.00000000e+00 1.96660234e-01]


In [32]:
# Well that is better
predictions = clf.predict(X_test)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_test, predictions)*100}")

Accuracy Score: 84.297251638871


In [33]:
pd.crosstab(y_transposed, predictions, rownames=["Actual Delays"], colnames=["Predicted Delays"])

Predicted Delays,Delayed,On Time
Actual Delays,Unnamed: 1_level_1,Unnamed: 2_level_1
Delayed,72321,65388
On Time,700,282460


ARRIVAL DELAYS WITH 8 FEATURES -> RANDOM FOREST (82.62%)

In [34]:
# Strip features down further.

X = delays_df[["DEP_TIME", "DEP_DELAY", "DEPARTURE_TIME_OF_DAY_DUMMY", \
              "OP_CARRIER_FL_NUM", "AIR_TIME",
              "ARRIVAL_TIME_OF_DAY_DUMMY","WEEKDAY_DUMMY", "AIRLINE_DUMMY", ]]
y = delays_df["ARRIVAL_DELAY"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 8) (1683475, 1)


In [35]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)

In [36]:
# Set up the random forest classifier

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=rng, oob_score=True)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=263, verbose=0,
                       warm_start=False)

In [37]:
print(clf.feature_importances_)

[0.20364205 0.44558092 0.08063153 0.0218261  0.11859745 0.05763265
 0.00220724 0.06988208]


In [38]:
# Stripping out the variables made the model worse.
predictions = clf.predict(X_test)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_test, predictions)*100}")

Accuracy Score: 82.61573078558887


In [39]:
pd.crosstab(y_transposed, predictions, rownames=["Actual Delays"], colnames=["Predicted Delays"])

Predicted Delays,Delayed,On Time
Actual Delays,Unnamed: 1_level_1,Unnamed: 2_level_1
Delayed,73709,64000
On Time,9165,273995


ARRIVAL DELAYS WITH 3 FEATURES -> RANDOM FOREST (83.67%)

In [40]:
# Strip features down further. Going with the feature importance recommendation.

X = delays_df[["DEP_TIME", "DEP_DELAY", "AIR_TIME"]]
y = delays_df["ARRIVAL_DELAY"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 3) (1683475, 1)


In [41]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)

In [42]:
# Set up the random forest classifier

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=rng, oob_score=True)
clf.fit(X_test, y_test)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=263, verbose=0,
                       warm_start=False)

In [43]:
print(clf.feature_importances_)

[0.18859445 0.61788045 0.1935251 ]


In [44]:
# Well this is the best model so far.  This may be the best we can get.
predictions = clf.predict(X_test)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_test, predictions)*100}")

Accuracy Score: 83.67425493443328


In [64]:
pd.crosstab(y_transposed, predictions, rownames=["Actual Delays"], colnames=["Predicted Delays"])

Predicted Delays,Delayed,On Time
Actual Delays,Unnamed: 1_level_1,Unnamed: 2_level_1
Delayed,80028,57681
On Time,10663,272497


DEPARTURE DELAY 10 FEATURES DECISION TREE WITH PCA -> (79.4%)

In [45]:
# Try out best departure and arrival models with pca
# Start with Departure delay logistic ML model -> 10 features

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEPARTURE_TIME_OF_DAY_DUMMY", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
               "OP_CARRIER_FL_NUM", "TAXI_OUT", "WHEELS_OFF","WEATHER_DELAY",]]

y = delays_df["DEPARTURE_DELAY"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 10) (1683475, 1)


In [46]:
from sklearn.decomposition import PCA
pca = PCA(.95)
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [47]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)

In [58]:
X_train_transformed = pca.transform(X_train)

In [60]:
X_test_transformed = pca.transform(X_test)

In [59]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

clf = DecisionTreeClassifier(random_state=rng)
clf.fit(X_train_transformed, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=263, splitter='best')

In [61]:
predictions = clf.predict(X_test_transformed)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_transposed, predictions)*100}")

Accuracy Score: 79.39596406482778


In [62]:
pd.crosstab(y_transposed, predictions, rownames=["Actual Delays"], colnames=["Predicted Delays"])

Predicted Delays,Delayed,On Time
Actual Delays,Unnamed: 1_level_1,Unnamed: 2_level_1
Delayed,91909,46143
On Time,40573,242244


ARRIVAL DELAYS 3 FEATURES DECISION TREE WITH PCA (69.97%)

In [66]:
# Try out the PCA with arrival delays using the best previous model (3 features)

X = delays_df[["DEP_TIME", "DEP_DELAY", "AIR_TIME"]]
y = delays_df["ARRIVAL_DELAY"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 3) (1683475, 1)


In [67]:
from sklearn.decomposition import PCA
pca = PCA(.95)
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [68]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)

In [69]:
X_train_transformed = pca.transform(X_train)

In [70]:
X_test_transformed = pca.transform(X_test)

In [71]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

clf = DecisionTreeClassifier(random_state=rng)
clf.fit(X_train_transformed, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=263, splitter='best')

In [72]:
predictions = clf.predict(X_test_transformed)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_transposed, predictions)*100}")

Accuracy Score: 69.97450513105028


In [73]:
pd.crosstab(y_transposed, predictions, rownames=["Actual Delays"], colnames=["Predicted Delays"])

Predicted Delays,Delayed,On Time
Actual Delays,Unnamed: 1_level_1,Unnamed: 2_level_1
Delayed,67488,70221
On Time,56147,227013


ARRIVAL MODEL 18 FEATURES DECISION TREE WITH PCA (76.06%)

In [74]:
# Try out the original arrival model with 18 features as it was indifferent to the one with 3 features in accuracy

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEP_DELAY", "DEPARTURE_TIME_OF_DAY_DUMMY", \
              "OP_CARRIER_FL_NUM", "TAXI_OUT", "AIR_TIME", "TAXI_IN", \
              "ARRIVAL_TIME_OF_DAY_DUMMY", "CARRIER_DELAY", "DISTANCE", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
       "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY",]]
y = delays_df["ARRIVAL_DELAY"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 18) (1683475, 1)


In [75]:
from sklearn.decomposition import PCA
pca = PCA(.95)
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [76]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)

In [77]:
X_train_transformed = pca.transform(X_train)

In [78]:
X_test_transformed = pca.transform(X_test)

In [80]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

clf = DecisionTreeClassifier(random_state=rng)
clf.fit(X_train_transformed, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=263, splitter='best')

In [81]:
predictions = clf.predict(X_test_transformed)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_transposed, predictions)*100}")

Accuracy Score: 76.06119718962432


In [82]:
pd.crosstab(y_transposed, predictions, rownames=["Actual Delays"], colnames=["Predicted Delays"])

Predicted Delays,Delayed,On Time
Actual Delays,Unnamed: 1_level_1,Unnamed: 2_level_1
Delayed,84424,53285
On Time,47466,235694


DEPATURE DELAYS 10 FEATURES RANDOM FOREST WITH GRID SEARCH (68.42%)

In [83]:
# Try out the grid search with the random forest classifier as a final test
# Departure Delay with 10 features

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEPARTURE_TIME_OF_DAY_DUMMY", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
               "OP_CARRIER_FL_NUM", "TAXI_OUT", "WHEELS_OFF","WEATHER_DELAY",]]

y = delays_df["DEPARTURE_DELAY"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 10) (1683475, 1)


In [56]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)

In [73]:
# Set up the random forest classifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=rng, oob_score=True)

grid_search = GridSearchCV(clf, {'n_estimators': [1,6,11]}, cv = 2, scoring = "roc_auc", return_train_score=True)

grid_search.fit(X_test, y_test)

GridSearchCV(cv=2, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=2,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=True, random_state=1767,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=None, param_grid={'n_estimators': [1,

In [74]:
# This is about the same as the original model.
clf_model = grid_search.best_estimator_
predictions = clf_model.predict(X_test)
prediction_p = clf_model.predict_proba(X_test)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_transposed, predictions)*100}")

Accuracy Score: 68.42319106420287


In [75]:
pd.crosstab(y_transposed, predictions, rownames=["Actual Delays"], colnames=["Predicted Delays"])

Predicted Delays,Delayed,On Time
Actual Delays,Unnamed: 1_level_1,Unnamed: 2_level_1
Delayed,5164,132888
On Time,9,282808


ARRIVAL DELAYS 3 FEATURES RANDOM FOREST WITH GRID SEARCH (83.6%)

In [87]:
# Try out the random forest classifier with grid search on the best arrival delay model.

X = delays_df[["DEP_TIME", "DEP_DELAY", "AIR_TIME"]]
y = delays_df["ARRIVAL_DELAY"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 3) (1683475, 1)


In [88]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)

In [91]:
# Set up the random forest classifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=rng, oob_score=True)

grid_search = GridSearchCV(clf, {'n_estimators': [1,6,11]}, return_train_score=True)

grid_search.fit(X_test, y_test)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=2,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=True, random_state=1767,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=None, param_grid={'n_estimators'

In [90]:
# This is about the same as the original model as well.
clf_model = grid_search.best_estimator_
predictions = clf_model.predict(X_test)
prediction_p = clf_model.predict_proba(X_test)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_transposed, predictions)*100}")

Accuracy Score: 83.60653790134222


In [92]:
pd.crosstab(y_transposed, predictions, rownames=["Actual Delays"], colnames=["Predicted Delays"])

Predicted Delays,Delayed,On Time
Actual Delays,Unnamed: 1_level_1,Unnamed: 2_level_1
Delayed,79541,58168
On Time,10827,272333


DEPARTURE DELAYS 15+ MINUTES DECSION TREE WITH PCA (98.665%)

In [84]:
# Test if the Decision Tree with PCA can predict departure delays of 15 minutes+

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEPARTURE_TIME_OF_DAY_DUMMY", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
               "OP_CARRIER_FL_NUM", "TAXI_OUT", "WHEELS_OFF","WEATHER_DELAY",]]

y = delays_df["DEPARTURE_DELAY_OVER_15_MINUTES"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 10) (1683475, 1)


In [85]:
from sklearn.decomposition import PCA
pca = PCA(.95)
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [86]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)

In [87]:
X_train_transformed = pca.transform(X_train)

In [88]:
X_test_transformed = pca.transform(X_test)

In [89]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

clf = DecisionTreeClassifier(random_state=rng)
clf.fit(X_train_transformed, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=263, splitter='best')

In [90]:
predictions = clf.predict(X_test_transformed)
prediction_p = clf.predict_proba(X_test_transformed)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_transposed, predictions)*100}")

Accuracy Score: 98.6551634831741


In [91]:
pd.crosstab(y_transposed, predictions, rownames=["Actual Delays"], colnames=["Predicted Delays"])

Predicted Delays,0,1
Actual Delays,Unnamed: 1_level_1,Unnamed: 2_level_1
0,413909,2764
1,2896,1300


ARRIVAL DELAYS 15+ MINUTES DECISION TRESS WITH PCA (100%)

In [92]:
# Try it out for arrival delays of 15 minutes+

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEP_DELAY", "DEPARTURE_TIME_OF_DAY_DUMMY", \
              "OP_CARRIER_FL_NUM", "TAXI_OUT", "AIR_TIME", "TAXI_IN", \
              "ARRIVAL_TIME_OF_DAY_DUMMY", "CARRIER_DELAY", "DISTANCE", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
       "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY",]]
y = delays_df["ARRIVAL_DELAY_OVER_15_MINUTES"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 18) (1683475, 1)


In [93]:
from sklearn.decomposition import PCA
pca = PCA(.95)
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [94]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)

In [95]:
X_train_transformed = pca.transform(X_train)

In [96]:
X_test_transformed = pca.transform(X_test)

In [97]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

clf = DecisionTreeClassifier(random_state=rng)
clf.fit(X_train_transformed, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=263, splitter='best')

In [98]:
predictions = clf.predict(X_test_transformed)
prediction_p = clf.predict_proba(X_test_transformed)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_transposed, predictions)*100}")

Accuracy Score: 77.2632339278968


In [99]:
pd.crosstab(y_transposed, predictions, rownames=["Actual Delays"], colnames=["Predicted Delays"])

Predicted Delays,0,1
Actual Delays,Unnamed: 1_level_1,Unnamed: 2_level_1
0,256111,49061
1,46631,69066


ARRIVAL DELAY 8 FEATURES DECISION TREE WITH PCA

In [123]:
# Since the departure delay 15+ minutes model was by far the best, more arrival delay models will be tested to try to match it

X = delays_df[["DEP_TIME", "DEP_DELAY", "DEPARTURE_TIME_OF_DAY_DUMMY", \
              "OP_CARRIER_FL_NUM", "AIR_TIME",
              "ARRIVAL_TIME_OF_DAY_DUMMY","WEEKDAY_DUMMY", "AIRLINE_DUMMY", ]]
y = delays_df["ARRIVAL_DELAY"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 8) (1683475, 1)


In [124]:
from sklearn.decomposition import PCA
pca = PCA(.95)
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [125]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)

In [126]:
X_train_transformed = pca.transform(X_train)
X_test_transformed = pca.transform(X_test)

In [127]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

clf = DecisionTreeClassifier(random_state=rng)
clf.fit(X_train_transformed, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=263, splitter='best')

In [128]:
predictions = clf.predict(X_test_transformed)
prediction_p = clf.predict_proba(X_test_transformed)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_transposed, predictions)*100}")

Accuracy Score: 72.4897771040395


In [122]:
pd.crosstab(y_transposed, predictions, rownames=["Actual Delays"], colnames=["Predicted Delays"])

Predicted Delays,Delayed,On Time
Actual Delays,Unnamed: 1_level_1,Unnamed: 2_level_1
Delayed,67488,70221
On Time,56147,227013


DEPARTURE DELAY 3 CATEGORY SPLIT DECISION TREE (66.5%)

In [103]:
# A final test for predictions of on times, small delays (<15 minutes) and long delays (15+ minutes)

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEPARTURE_TIME_OF_DAY_DUMMY", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
               "OP_CARRIER_FL_NUM", "TAXI_OUT", "WHEELS_OFF","WEATHER_DELAY",]]

y = delays_df["DEPARTURE_DELAY_TEST"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 10) (1683475, 1)


In [104]:
y = pd.factorize(delays_df["DEPARTURE_DELAY_TEST"])[0]

In [105]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)

In [106]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

clf = DecisionTreeClassifier(random_state=rng)
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=263, splitter='best')

In [107]:
predictions = clf.predict(X_test)
prediction_p = clf.predict_proba(X_test)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_transposed, predictions)*100}")

Accuracy Score: 66.49693847729341


In [108]:
# Figure out why there is a 4th dummy variable
pd.crosstab(y_transposed, predictions, rownames=["Actual Delays"], colnames=["Predicted Delays"])

Predicted Delays,0,1,2,3
Actual Delays,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,224018,24099,34700,0
1,23910,32119,13042,3
2,32663,12586,23728,0
3,0,1,0,0


ARRIVAL DELAY 3 CATEGORIES DECISION TREE (100%)

In [109]:
# Test out on Arrival Delays with 3 categories

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEP_DELAY", "DEPARTURE_TIME_OF_DAY_DUMMY", \
              "OP_CARRIER_FL_NUM", "TAXI_OUT", "AIR_TIME", "TAXI_IN", \
              "ARRIVAL_TIME_OF_DAY_DUMMY", "CARRIER_DELAY", "DISTANCE", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
       "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY",]]
y = delays_df["ARRIVAL_DELAY_TEST"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 18) (1683475, 1)


In [110]:
y = pd.factorize(delays_df["ARRIVAL_DELAY_TEST"])[0]

In [111]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)

In [112]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

clf = DecisionTreeClassifier(random_state=rng)
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=263, splitter='best')

In [113]:
predictions = clf.predict(X_test)
prediction_p = clf.predict_proba(X_test)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_transposed, predictions)*100}")

Accuracy Score: 85.85902026521318


In [114]:
pd.crosstab(y_transposed, predictions, rownames=["Actual Delays"], colnames=["Predicted Delays"])

Predicted Delays,0,1,2,3
Actual Delays,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,252944,14,30202,0
1,23,69600,152,0
2,29019,105,38809,0
3,0,0,0,1


In [115]:
Decision_Trees_ML_outcomes = {
    "Model": ["Depature Delay", "", "", "", "", "Departure Delay 15+ Minutes", "Departure Delay 3 Categories", \
              "Arrival Delay", "", "", "", "", "", "Arrival Delay 15+ Minutes", "Arrival Delay 3 Categories"],
    
    "Model Type": ["Decision Tree", "Random Forest", "Random Forest", \
                   "Decision Tree with PCA", "Random Forest with Grid Search", \
                   "Decision Tree with PCA", "Decision Tree"
                   "Random Forest", "Random Forest", "Random Forest", \
                   "Decision Tree with PCA", "Decision Tree with PCA", "Forest with Grid Search", \
                   "Random Forest", "Random Forest", "Random Forest"],
    
    "Features": [10, 10, 4, 10, 10, 10, 10, 18, 8, 3, 3, 18, 3, 18, 18],
    
    "Test Data R2": [.68, .6824, .6823, .799, .6842, .98665, .665, .843, .8262, .8367, .6997, .7606, .836, .7726, .8586]
    
                }

Decision_Trees_ML_outcomes_df = pd.DataFrame(Decision_Trees_ML_outcomes)
Decision_Trees_ML_outcomes_df

Unnamed: 0,Model,Model Type,Features,Test Data R2
0,Depature Delay,Decision Tree,10,0.68
1,,Random Forest,10,0.6824
2,,Random Forest,4,0.6823
3,,Decision Tree with PCA,10,0.799
4,,Random Forest with Grid Search,10,0.6842
5,Departure Delay 15+ Minutes,Decision Tree with PCA,10,0.98665
6,Departure Delay 3 Categories,Decision TreeRandom Forest,10,0.665
7,Arrival Delay,Random Forest,18,0.843
8,,Random Forest,8,0.8262
9,,Decision Tree with PCA,3,0.8367
