Logistic regression analysis produced some interesting results.  A rather simplified model using 10 featured for departure
delays produced a fantastic model with an R2 of 95.86% while this same model using departure delays over 15 minutes produced 
a model R2 of 99%!

The same cannot be said with regards to arrival delays.  The best R2 that could be achieved was 88.54% using a model with
18 features.  While this is not bad, a 90% R2 would be prefered.  To make things worse, this model produced a R2 of 78.36% 
when used for arrival delays over 15 minutes.  Many models were tested but the 18 feature model produced the best R2.  It is
unfortunate that it had the opposite effect when arrival delays of 15 minutes was used.

In [85]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

import warnings
warnings.simplefilter('ignore')

# RNG used for seeding
rng = int(np.random.randint(low=1, high=2000, size=1))

In [2]:
# Read in first quarter dataset
delays_df = pd.read_csv("Delay_first_quarter.csv")

In [3]:
delays_df = delays_df.fillna(0)

In [4]:
# Create more dummy variables for categorical data:

Weekday = {
           "Monday": 1,
           "Tueday": 2, 
           "Wednesday": 3, 
           "Thursday" : 4,
           "Friday": 5,
           "Saturday": 6,
           "Sunday": 7
          }

Airline = {
        "UA": 1,
        "AA": 2,
        "9E": 3,
        "B6": 4,
        "EV": 5,
        "F9": 6,
        "G4": 7,
        "HA": 8,
        "MQ": 9,
        "NK": 10,
        "OH": 11,
        "OO": 12,
        "VX": 13,
        "WN": 14,
        "YV": 15,
        "YX": 16,
        "AS": 17,
        "DL": 18
}

In [5]:
delays_df['WEEKDAY_DUMMY'] = delays_df['WEEKDAY'].apply(  \
                            lambda x: next((y for z, y in Weekday.items() if x in z), 0))

In [6]:
delays_df['AIRLINE_DUMMY'] = delays_df['OP_CARRIER'].apply(  \
                            lambda x: next((y for z, y in Airline.items() if x in z), 0))

In [7]:
# Dummy variables for flights of east coast origin/destination 
# Dummy variables for flights of west coast origin/destination -> both fixed

delays_df['EAST_COAST_ORIGIN'] = 1*np.ravel(delays_df["ORIGIN_LONGITUDE"] >= -83)
delays_df['EAST_COAST_DEST'] = 1*np.ravel(delays_df["DEST_LONGITUDE"] >= -83)
delays_df['WEST_COAST_ORIGIN'] = 1*np.ravel(delays_df["ORIGIN_LONGITUDE"] <= -114)
delays_df['WEST_COAST_DEST'] = 1*np.ravel(delays_df["DEST_LONGITUDE"] <= -114)

In [8]:
# Fix east/west coast
delays_df.columns

Index(['FL_DATE', 'DAY', 'MONTH', 'WEEKDAY', 'OP_CARRIER', 'AIRLINE',
       'OP_CARRIER_FL_NUM', 'ORIGIN', 'ORIGIN_AIRPORT', 'ORIGIN_CITY',
       'ORIGIN_STATE', 'ORIGIN_LATITUDE', 'ORIGIN_LONGITUDE',
       'EAST_COAST_ORIGIN', 'WEST_COAST_ORIGIN', 'CRS_DEP_TIME', 'DEP_TIME',
       'DEPARTURE_TIME_OF_DAY', 'DEPARTURE_TIME_OF_DAY_DUMMY', 'DEP_DELAY',
       'DEPARTURE_DELAY', 'DEPARTURE_DELAY_OVER_15_MINUTES',
       'DEPARTURE_DELAY_OVER_30_MINUTES', 'DEPARTURE_DELAY_OVER_45_MINUTES',
       'DEPARTURE_DELAY_OVER_60_MINUTES', 'TAXI_OUT', 'WHEELS_OFF', 'AIR_TIME',
       'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'DISTANCE', 'WHEELS_ON',
       'TAXI_IN', 'DEST', 'DEST_AIRPORT', 'DEST_CITY', 'DEST_STATE',
       'DEST_LATITUDE', 'DEST_LONGITUDE', 'EAST_COAST_DEST', 'WEST_COAST_DEST',
       'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY', 'ARRIVAL_DELAY',
       'ARRIVAL_DELAY_OVER_15_MINUTES', 'ARRIVAL_DELAY_OVER_30_MINUTES',
       'ARRIVAL_DELAY_OVER_45_MINUTES', 'ARRIVAL_DELAY_OVER_60_M

In [9]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
delays_df = delays_df.select_dtypes(include=numerics)

In [11]:
delays_df.columns

Index(['DAY', 'MONTH', 'OP_CARRIER_FL_NUM', 'ORIGIN_LATITUDE',
       'ORIGIN_LONGITUDE', 'EAST_COAST_ORIGIN', 'WEST_COAST_ORIGIN',
       'CRS_DEP_TIME', 'DEP_TIME', 'DEPARTURE_TIME_OF_DAY_DUMMY', 'DEP_DELAY',
       'DEPARTURE_DELAY', 'DEPARTURE_DELAY_OVER_15_MINUTES',
       'DEPARTURE_DELAY_OVER_30_MINUTES', 'DEPARTURE_DELAY_OVER_45_MINUTES',
       'DEPARTURE_DELAY_OVER_60_MINUTES', 'TAXI_OUT', 'WHEELS_OFF', 'AIR_TIME',
       'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'DISTANCE', 'WHEELS_ON',
       'TAXI_IN', 'DEST_LATITUDE', 'DEST_LONGITUDE', 'EAST_COAST_DEST',
       'WEST_COAST_DEST', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY',
       'ARRIVAL_DELAY', 'ARRIVAL_DELAY_OVER_15_MINUTES',
       'ARRIVAL_DELAY_OVER_30_MINUTES', 'ARRIVAL_DELAY_OVER_45_MINUTES',
       'ARRIVAL_DELAY_OVER_60_MINUTES', 'ARRIVAL_DELAY_OVER_60_MINUTES.1',
       'ARRIVAL_TIME_OF_DAY_DUMMY', 'CANCELLED', 'DIVERTED', 'CARRIER_DELAY',
       'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELA

In [30]:
# Add a categorical dummy for departure delay
delays_df['IS_DEPARTURE_DELAYED'] = np.ravel(delays_df["DEP_DELAY"] < 0)

In [31]:
# Departure delay logistic ML model -> 16 features

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEPARTURE_TIME_OF_DAY_DUMMY", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
              "EAST_COAST_ORIGIN", "WEST_COAST_ORIGIN", "OP_CARRIER_FL_NUM", "TAXI_OUT", "WHEELS_OFF", "CARRIER_DELAY", \
       "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY"]]


y = delays_df["IS_DEPARTURE_DELAYED"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 16) (1683475, 1)


In [33]:
# Test for significant variables in proposed model
# Flight number is not significant.

from sklearn.feature_selection import f_regression
pd.set_option('display.max_columns', None)

delays_sig = f_regression(X, y)
delays_sig = pd.DataFrame(delays_sig)
info_column = pd.DataFrame({"Test Statistic": ["F", "p-value"]})
delays_sig = info_column.join(delays_sig)
delays_sig

Unnamed: 0,Test Statistic,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,F,11869.371573,75.74547,11971.215899,26413.427028,25.10707,3291.657044,107.4975,308.6042,0.178121,1632.590007,10113.470897,43594.391263,5474.409782,30672.903311,484.1086,95956.794844
1,p-value,0.0,3.229733e-18,0.0,0.0,5.423911e-07,0.0,3.469488e-25,4.461418e-69,0.672993,0.0,0.0,0.0,0.0,0.0,2.8242689999999997e-107,0.0


In [38]:
# Departure delay logistic ML model -> 15 features without flight number variable

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEPARTURE_TIME_OF_DAY_DUMMY", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
              "EAST_COAST_ORIGIN", "WEST_COAST_ORIGIN", "TAXI_OUT", "WHEELS_OFF", "CARRIER_DELAY", \
       "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY"]]
y = delays_df["IS_DEPARTURE_DELAYED"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 15) (1683475, 1)


In [39]:
# Test for significant variables in proposed model
# All variables are significant

from sklearn.feature_selection import f_regression
pd.set_option('display.max_columns', None)

delays_sig = f_regression(X, y)
delays_sig = pd.DataFrame(delays_sig)
info_column = pd.DataFrame({"Test Statistic": ["F", "p-value"]})
delays_sig = info_column.join(delays_sig)
delays_sig

Unnamed: 0,Test Statistic,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,F,11869.371573,75.74547,11971.215899,26413.427028,25.10707,3291.657044,107.4975,308.6042,1632.590007,10113.470897,43594.391263,5474.409782,30672.903311,484.1086,95956.794844
1,p-value,0.0,3.229733e-18,0.0,0.0,5.423911e-07,0.0,3.469488e-25,4.461418e-69,0.0,0.0,0.0,0.0,0.0,2.8242689999999997e-107,0.0


In [114]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)

In [115]:
# Create the logistic regression model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(penalty='l2')
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [116]:
# Fit the model
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [118]:
# Check the model R2 scores

print(f"Training Data R2: {round((classifier.score(X_train, y_train)), 4)*100}%")
print(f"Testing Data R2: {round((classifier.score(X_test, y_test)), 4)*100}%")

Training Data R2: 76.14%
Testing Data R2: 76.13%


In [119]:
# Make predictions and check against test data
# y_test was coming back as a 2D array which had to be changed into a 1D array for anaylsis
# Overall not terrible but we can do better.

predictions = classifier.predict(X_test)
predict_df = pd.DataFrame({"Predictions": predictions[:100], "Test": ((np.transpose(y_test)).flatten())[:100]})
predict_df["Different"] = 1*np.ravel(predict_df["Predictions"] == predict_df["Test"])
correct = predict_df.loc[predict_df["Different"]==1,:]
print(f"Prediction Accurany: {round(((len(correct))/len(predict_df)), 4)*100}%")

Prediction Accurany: 76.0%


In [120]:
# Create the logistic regression model -> try lasso instead of ridge
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(penalty='l1')
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [121]:
# Fit the model
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [122]:
# Check the model R2 scores

print(f"Training Data R2: {round((classifier.score(X_train, y_train)), 4)*100}%")
print(f"Testing Data R2: {round((classifier.score(X_test, y_test)), 4)*100}%")

Training Data R2: 76.17%
Testing Data R2: 76.16000000000001%


In [123]:
# Make predictions and check against test data
# y_test was coming back as a 2D array which had to be changed into a 1D array for anaylsis
# Using lasso over ridge did not change the result.  Time to simplify the model.

predictions = classifier.predict(X_test)
predict_df = pd.DataFrame({"Predictions": predictions[:100], "Test": ((np.transpose(y_test)).flatten())[:100]})
predict_df["Different"] = 1*np.ravel(predict_df["Predictions"] == predict_df["Test"])
correct = predict_df.loc[predict_df["Different"]==1,:]
print(f"Prediction Accurany: {round(((len(correct))/len(predict_df)), 4)*100}%")

Prediction Accurany: 76.0%


In [124]:
# Simpler departure delay logistic ML model -> 10 features

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEPARTURE_TIME_OF_DAY_DUMMY", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
               "OP_CARRIER_FL_NUM", "TAXI_OUT", "WHEELS_OFF","WEATHER_DELAY",]]


y = delays_df["IS_DEPARTURE_DELAYED"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 10) (1683475, 1)


In [125]:
# Test for significant variables in proposed model
# All variables are significant

from sklearn.feature_selection import f_regression
pd.set_option('display.max_columns', None)

delays_sig = f_regression(X, y)
delays_sig = pd.DataFrame(delays_sig)
info_column = pd.DataFrame({"Test Statistic": ["F", "p-value"]})
delays_sig = info_column.join(delays_sig)
delays_sig

Unnamed: 0,Test Statistic,0,1,2,3,4,5,6,7,8,9
0,F,11869.371573,75.74547,11971.215899,26413.427028,25.10707,3291.657044,0.178121,1632.590007,10113.470897,5474.409782
1,p-value,0.0,3.229733e-18,0.0,0.0,5.423911e-07,0.0,0.672993,0.0,0.0,0.0


In [170]:
# Create the logistic regression model -> try lasso instead of ridge
# Split for train and test datasets
# Fit the model

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
classifier.fit(X_train, y_train)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)
classifier = LogisticRegression(penalty='l2')
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [172]:
# Fit the model & check the model R2 scores

classifier.fit(X_train, y_train)
print(f"Training Data R2: {round((classifier.score(X_train, y_train)), 4)*100}%")
print(f"Testing Data R2: {round((classifier.score(X_test, y_test)), 4)*100}%")

Training Data R2: 95.87%
Testing Data R2: 95.86%


In [173]:
# Simplifying produced much better results.

predictions = classifier.predict(X_test)
predict_df = pd.DataFrame({"Predictions": predictions[:100], "Test": ((np.transpose(y_test)).flatten())[:100]})
predict_df["Different"] = 1*np.ravel(predict_df["Predictions"] == predict_df["Test"])
correct = predict_df.loc[predict_df["Different"]==1,:]
print(f"Prediction Accurany: {round(((len(correct))/len(predict_df)), 4)*100}%")

Prediction Accurany: 96.0%


In [174]:
# Simpler departure delay logistic ML model -> 10 features
# This uses departure delays over 15 minutes for y

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEPARTURE_TIME_OF_DAY_DUMMY", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
               "OP_CARRIER_FL_NUM", "TAXI_OUT", "WHEELS_OFF","WEATHER_DELAY",]]


y = delays_df["DEPARTURE_DELAY_OVER_15_MINUTES"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 10) (1683475, 1)


In [175]:
# Test for significant variables in proposed model
# All variables are significant

from sklearn.feature_selection import f_regression
pd.set_option('display.max_columns', None)

delays_sig = f_regression(X, y)
delays_sig = pd.DataFrame(delays_sig)
info_column = pd.DataFrame({"Test Statistic": ["F", "p-value"]})
delays_sig = info_column.join(delays_sig)
delays_sig

Unnamed: 0,Test Statistic,0,1,2,3,4,5,6,7,8,9
0,F,300.2312,501.7404,569.1862,921.2482,10.553183,415.933,94.2125,187.469,570.9858,36.913
1,p-value,2.973395e-67,4.1266069999999995e-111,8.857314000000001e-126,2.676752e-202,0.00116,1.922637e-92,2.837863e-22,1.1404979999999999e-42,3.597254e-126,1.235462e-09


In [176]:
# Create the logistic regression model -> try lasso instead of ridge
# Split for train and test datasets
# Fit the model

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
classifier.fit(X_train, y_train)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)
classifier = LogisticRegression(penalty='l2')
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [177]:
# Fit the model & check the model R2 scores

classifier.fit(X_train, y_train)
print(f"Training Data R2: {round((classifier.score(X_train, y_train)), 4)*100}%")
print(f"Testing Data R2: {round((classifier.score(X_test, y_test)), 4)*100}%")

Training Data R2: 99.0%
Testing Data R2: 99.0%


In [178]:
# Even better results for departures over 15 minutes.

predictions = classifier.predict(X_test)
predict_df = pd.DataFrame({"Predictions": predictions[:100], "Test": ((np.transpose(y_test)).flatten())[:100]})
predict_df["Different"] = 1*np.ravel(predict_df["Predictions"] == predict_df["Test"])
correct = predict_df.loc[predict_df["Different"]==1,:]
print(f"Prediction Accurany: {round(((len(correct))/len(predict_df)), 4)*100}%")

Prediction Accurany: 96.0%


In [179]:
# Arrival delay logistic ML model -> 22 features

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEPARTURE_DELAY", "DEPARTURE_TIME_OF_DAY_DUMMY", \
              "OP_CARRIER_FL_NUM", "TAXI_OUT", "WHEELS_OFF", "AIR_TIME", "WHEELS_ON", "TAXI_IN", \
              "ARRIVAL_TIME_OF_DAY_DUMMY", "CARRIER_DELAY", "DISTANCE", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
       "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY","EAST_COAST_ORIGIN", "WEST_COAST_ORIGIN"]]
y = delays_df["ARRIVAL_DELAY"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 22) (1683475, 1)


In [180]:
# Test for significant variables in proposed model
# All variables are significant

from sklearn.feature_selection import f_regression
pd.set_option('display.max_columns', None)

delays_sig = f_regression(X, y)
delays_sig = pd.DataFrame(delays_sig)
info_column = pd.DataFrame({"Test Statistic": ["F", "p-value"]})
delays_sig = info_column.join(delays_sig)
delays_sig

Unnamed: 0,Test Statistic,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
0,F,3193.528491,236.9648,40550.192998,769455.694014,8602.841077,2084.429974,191508.925232,43997.948904,3612.654486,22453.786242,35658.509333,9813.590464,60852.660928,248.6169,393.0774,65.37229,7571.812112,105310.596669,667.2119,135148.41474,16.657393,111.1714
1,p-value,0.0,1.820468e-53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.24791e-56,1.8111509999999999e-87,6.204522e-16,0.0,0.0,4.312451e-147,0.0,4.5e-05,5.437112e-26


In [181]:
# Create the logistic regression model -> try lasso instead of ridge
# Split for train and test datasets
# Fit the model

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
classifier.fit(X_train, y_train)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)
classifier = LogisticRegression(penalty='l2')
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [182]:
# Fit the model & check the model R2 scores

classifier.fit(X_train, y_train)
print(f"Training Data R2: {round((classifier.score(X_train, y_train)), 4)*100}%")
print(f"Testing Data R2: {round((classifier.score(X_test, y_test)), 4)*100}%")

Training Data R2: 86.64%
Testing Data R2: 86.6%


In [183]:
# Not bad.  Much better than the original 15 feature model for departure delays.  Simplifying should help.

predictions = classifier.predict(X_test)
predict_df = pd.DataFrame({"Predictions": predictions[:100], "Test": ((np.transpose(y_test)).flatten())[:100]})
predict_df["Different"] = 1*np.ravel(predict_df["Predictions"] == predict_df["Test"])
correct = predict_df.loc[predict_df["Different"]==1,:]
print(f"Prediction Accurany: {round(((len(correct))/len(predict_df)), 4)*100}%")

Prediction Accurany: 92.0%


In [186]:
# Simple arrival delay logistic ML model -> 8 features

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEP_DELAY", "AIR_TIME", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
       "WEATHER_DELAY"]]
y = delays_df["ARRIVAL_DELAY"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 8) (1683475, 1)


In [187]:
# Test for significant variables in proposed model
# All variables are significant

from sklearn.feature_selection import f_regression
pd.set_option('display.max_columns', None)

delays_sig = f_regression(X, y)
delays_sig = pd.DataFrame(delays_sig)
info_column = pd.DataFrame({"Test Statistic": ["F", "p-value"]})
delays_sig = info_column.join(delays_sig)
delays_sig

Unnamed: 0,Test Statistic,0,1,2,3,4,5,6,7
0,F,3193.528491,236.9648,40550.192998,311052.234297,3612.654486,393.0774,65.37229,7571.812112
1,p-value,0.0,1.820468e-53,0.0,0.0,0.0,1.8111509999999999e-87,6.204522e-16,0.0


In [188]:
# Create the logistic regression model -> try lasso instead of ridge
# Split for train and test datasets
# Fit the model

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
classifier.fit(X_train, y_train)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)
classifier = LogisticRegression(penalty='l2')
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [189]:
# Fit the model & check the model R2 scores

classifier.fit(X_train, y_train)
print(f"Training Data R2: {round((classifier.score(X_train, y_train)), 4)*100}%")
print(f"Testing Data R2: {round((classifier.score(X_test, y_test)), 4)*100}%")

Training Data R2: 83.57%
Testing Data R2: 83.48%


In [190]:
# Better results but still not over 90%.

predictions = classifier.predict(X_test)
predict_df = pd.DataFrame({"Predictions": predictions[:100], "Test": ((np.transpose(y_test)).flatten())[:100]})
predict_df["Different"] = 1*np.ravel(predict_df["Predictions"] == predict_df["Test"])
correct = predict_df.loc[predict_df["Different"]==1,:]
print(f"Prediction Accurany: {round(((len(correct))/len(predict_df)), 4)*100}%")

Prediction Accurany: 89.0%


In [191]:
# Simple arrival delay logistic ML model -> 8 features
# Try delays over 15 minutes.

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEP_DELAY", "AIR_TIME", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
       "WEATHER_DELAY"]]
y = delays_df["ARRIVAL_DELAY_OVER_15_MINUTES"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 8) (1683475, 1)


In [192]:
# Test for significant variables in proposed model
# All variables are significant.

from sklearn.feature_selection import f_regression
pd.set_option('display.max_columns', None)

delays_sig = f_regression(X, y)
delays_sig = pd.DataFrame(delays_sig)
info_column = pd.DataFrame({"Test Statistic": ["F", "p-value"]})
delays_sig = info_column.join(delays_sig)
delays_sig

Unnamed: 0,Test Statistic,0,1,2,3,4,5,6,7
0,F,3861.090768,3308.693263,538.8344,70059.393367,14644.128026,229.4719,617.8351,1390.975
1,p-value,0.0,0.0,3.530173e-119,0.0,0.0,7.833001e-52,2.340468e-136,2.560997e-304


In [None]:
# Create the logistic regression model -> try lasso instead of ridge
# Split for train and test datasets
# Fit the model

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
classifier.fit(X_train, y_train)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)
classifier = LogisticRegression(penalty='l2')
classifier

In [195]:
# Fit the model & check the model R2 scores

classifier.fit(X_train, y_train)
print(f"Training Data R2: {round((classifier.score(X_train, y_train)), 4)*100}%")
print(f"Testing Data R2: {round((classifier.score(X_test, y_test)), 4)*100}%")

Training Data R2: 74.37%
Testing Data R2: 74.31%


In [196]:
# Model is not good for arrival delays over 15 minutes.

predictions = classifier.predict(X_test)
predict_df = pd.DataFrame({"Predictions": predictions[:100], "Test": ((np.transpose(y_test)).flatten())[:100]})
predict_df["Different"] = 1*np.ravel(predict_df["Predictions"] == predict_df["Test"])
correct = predict_df.loc[predict_df["Different"]==1,:]
print(f"Prediction Accurany: {round(((len(correct))/len(predict_df)), 4)*100}%")

Prediction Accurany: 80.0%


In [221]:
# Final simple arrival delay logistic ML model -> 18 features
# Find an arrival model that matches the departure R2 scores.

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEP_DELAY", "DEPARTURE_TIME_OF_DAY_DUMMY", \
              "OP_CARRIER_FL_NUM", "TAXI_OUT", "AIR_TIME", "TAXI_IN", \
              "ARRIVAL_TIME_OF_DAY_DUMMY", "CARRIER_DELAY", "DISTANCE", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
       "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY",]]
y = delays_df["ARRIVAL_DELAY"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 18) (1683475, 1)


In [222]:
# Test for significant variables in proposed model
# All variables are significant.

from sklearn.feature_selection import f_regression
pd.set_option('display.max_columns', None)

delays_sig = f_regression(X, y)
delays_sig = pd.DataFrame(delays_sig)
info_column = pd.DataFrame({"Test Statistic": ["F", "p-value"]})
delays_sig = info_column.join(delays_sig)
delays_sig

Unnamed: 0,Test Statistic,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,F,3193.528491,236.9648,40550.192998,311052.234297,8602.841077,2084.429974,191508.925232,3612.654486,35658.509333,9813.590464,60852.660928,248.6169,393.0774,65.37229,7571.812112,105310.596669,667.2119,135148.41474
1,p-value,0.0,1.820468e-53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.24791e-56,1.8111509999999999e-87,6.204522e-16,0.0,0.0,4.312451e-147,0.0


In [223]:
# Create the logistic regression model -> try lasso instead of ridge
# Split for train and test datasets
# Fit the model

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
classifier.fit(X_train, y_train)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)
classifier = LogisticRegression(penalty='l2')
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [224]:
# Fit the model & check the model R2 scores

classifier.fit(X_train, y_train)
print(f"Training Data R2: {round((classifier.score(X_train, y_train)), 4)*100}%")
print(f"Testing Data R2: {round((classifier.score(X_test, y_test)), 4)*100}%")

Training Data R2: 88.58%
Testing Data R2: 88.53999999999999%


In [212]:
# After trying out many feature combinations, this one produces the best R2 scores.

predictions = classifier.predict(X_test)
predict_df = pd.DataFrame({"Predictions": predictions[:100], "Test": ((np.transpose(y_test)).flatten())[:100]})
predict_df["Different"] = 1*np.ravel(predict_df["Predictions"] == predict_df["Test"])
correct = predict_df.loc[predict_df["Different"]==1,:]
print(f"Prediction Accurany: {round(((len(correct))/len(predict_df)), 4)*100}%")

Prediction Accurany: 95.0%


In [225]:
# Final simple arrival delay logistic ML model -> 18 features
# Arrival delays of 15 minutes are used for y.

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEP_DELAY", "DEPARTURE_TIME_OF_DAY_DUMMY", \
              "OP_CARRIER_FL_NUM", "TAXI_OUT", "AIR_TIME", "TAXI_IN", \
              "ARRIVAL_TIME_OF_DAY_DUMMY", "CARRIER_DELAY", "DISTANCE", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
       "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY",]]
y = delays_df["ARRIVAL_DELAY_OVER_15_MINUTES"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 18) (1683475, 1)


In [226]:
# Test for significant variables in proposed model
# All variables are significant.

from sklearn.feature_selection import f_regression
pd.set_option('display.max_columns', None)

delays_sig = f_regression(X, y)
delays_sig = pd.DataFrame(delays_sig)
info_column = pd.DataFrame({"Test Statistic": ["F", "p-value"]})
delays_sig = info_column.join(delays_sig)
delays_sig

Unnamed: 0,Test Statistic,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,F,3861.090768,3308.693263,538.8344,70059.393367,2046.853302,3079.679044,49600.332554,14644.128026,8653.753384,1316.441,10898.595508,19095.709664,229.4719,617.8351,1390.975,18474.396958,122.9797,23387.104217
1,p-value,0.0,0.0,3.530173e-119,0.0,0.0,0.0,0.0,0.0,0.0,3.910628e-288,0.0,0.0,7.833001e-52,2.340468e-136,2.560997e-304,0.0,1.412076e-28,0.0


In [227]:
# Create the logistic regression model -> try lasso instead of ridge
# Split for train and test datasets
# Fit the model

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
classifier.fit(X_train, y_train)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y)
classifier = LogisticRegression(penalty='l2')
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [228]:
# Fit the model & check the model R2 scores

classifier.fit(X_train, y_train)
print(f"Training Data R2: {round((classifier.score(X_train, y_train)), 4)*100}%")
print(f"Testing Data R2: {round((classifier.score(X_test, y_test)), 4)*100}%")

Training Data R2: 78.39%
Testing Data R2: 78.36%


In [229]:
# While 78% is not bad, more models would need to be tested for arrival delays of 15 minutes+.

predictions = classifier.predict(X_test)
predict_df = pd.DataFrame({"Predictions": predictions[:100], "Test": ((np.transpose(y_test)).flatten())[:100]})
predict_df["Different"] = 1*np.ravel(predict_df["Predictions"] == predict_df["Test"])
correct = predict_df.loc[predict_df["Different"]==1,:]
print(f"Prediction Accurany: {round(((len(correct))/len(predict_df)), 4)*100}%")

Prediction Accurany: 82.0%


In [233]:
Logistic_Regression_ML_outcomes = {
    "Model Type": ["Depature Delay", "", "Departure Delay > 15 minutes", \
                   "Arrival Delay", "", "Arrival Delay > 15 minutes" \
                  ],
    
    "Features": [15, 10, 10, 22, 18, 18],
    
    "Train Data R2": [.7614, .9587, .99, .8664, .8858, .7839],
    
    
    "Test Data R2": [.7613, .9586, .99, .866, .8854, .7836]
    
                }

Logistic_Regression_ML_outcomes_df = pd.DataFrame(Logistic_Regression_ML_outcomes)
Logistic_Regression_ML_outcomes_df

Unnamed: 0,Model Type,Features,Train Data R2,Test Data R2
0,Depature Delay,15,0.7614,0.7613
1,,10,0.9587,0.9586
2,Departure Delay > 15 minutes,10,0.99,0.99
3,Arrival Delay,22,0.8664,0.866
4,,18,0.8858,0.8854
5,Arrival Delay > 15 minutes,18,0.7839,0.7836
