Stochastic Gradient Boosting has some pretty nice effects with our flight delay data.  For departure delays, our best model utilized 10 features with delays of 15+ minutes which produced another great R2 score, this time at 99%!.  SGB with scaling provided another excellent model for departure delays.

However, arrival delays once again fell short of this standard.  While better than any previous machine learning methods outside of straigh linear/logistic regression, the best R2 score achieved was 88.96% which was found with a model for arrival departures of 15+ minutes with 26 features.  While 88.96% is not bad, a threshold of 90% at the minimum is preferred so other methods will have to be tried to finally achieve this.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import time

import warnings
warnings.simplefilter('ignore')

# RNG used for seeding
rng = int(np.random.randint(low=1, high=2000, size=1))

# xgboost has similar problems to tensorflow when installing so it will not be utilizied. Same with parfit.
# from xgboost import XGBClassifier
# import parfit.parfit as pf

In [2]:
# Read in first quarter dataset
delays_df = pd.read_csv("Delay_first_quarter1.csv")

In [3]:
# Do some additional cleaning
delays_df = delays_df.fillna(0)

In [4]:
# Dummy variables for flights of east coast origin/destination 
# Dummy variables for flights of west coast origin/destination -> both fixed

delays_df['EAST_COAST_ORIGIN'] = 1*np.ravel(delays_df["ORIGIN_LONGITUDE"] >= -83)
delays_df['EAST_COAST_DEST'] = 1*np.ravel(delays_df["DEST_LONGITUDE"] >= -83)
delays_df['WEST_COAST_ORIGIN'] = 1*np.ravel(delays_df["ORIGIN_LONGITUDE"] <= -114)
delays_df['WEST_COAST_DEST'] = 1*np.ravel(delays_df["DEST_LONGITUDE"] <= -114)

In [5]:
# Create more dummy variables for categorical data:

Weekday = {
           "Monday": 1,
           "Tueday": 2, 
           "Wednesday": 3, 
           "Thursday" : 4,
           "Friday": 5,
           "Saturday": 6,
           "Sunday": 7
          }

Airline = {
        "UA": 1,
        "AA": 2,
        "9E": 3,
        "B6": 4,
        "EV": 5,
        "F9": 6,
        "G4": 7,
        "HA": 8,
        "MQ": 9,
        "NK": 10,
        "OH": 11,
        "OO": 12,
        "VX": 13,
        "WN": 14,
        "YV": 15,
        "YX": 16,
        "AS": 17,
        "DL": 18
}

In [6]:
delays_df['WEEKDAY_DUMMY'] = delays_df['WEEKDAY'].apply(  \
                            lambda x: next((y for z, y in Weekday.items() if x in z), 0))

In [7]:
delays_df['AIRLINE_DUMMY'] = delays_df['OP_CARRIER'].apply(  \
                            lambda x: next((y for z, y in Airline.items() if x in z), 0))

In [8]:
# Make a proper categorical delay dummy variable
# Get Month into a categorical value
# This uses less memory than using lambda.

delay_dict = {0: "On Time", 1: "Delayed"}
delays_df["DEPARTURE_DELAY_DUMMY"]=delays_df["DEPARTURE_DELAY"].copy()
delays_df["DEPARTURE_DELAY"].replace(delay_dict, inplace = True)

In [9]:
arrive_dict = {0: "On Time", 1: "Delayed"}
delays_df["ARRIVAL_DELAY_DUMMY"]=delays_df["ARRIVAL_DELAY"].copy()
delays_df["ARRIVAL_DELAY"].replace(delay_dict, inplace = True)

In [14]:
Delay_dict = {range(-3000, 1): "On Time",
               range(1, 16): 'Small Delay', 
               range(16, 3000): 'Long Delay',}

In [15]:
start = time.time()
delays_df['DEPARTURE_DELAY_TEST'] = delays_df['DEP_DELAY'].apply(  \
                            lambda x: next((y for z, y in Delay_dict.items() if x in z), 0))
end = time.time()
print(f"{end-start} seconds")

405.1665036678314 seconds


In [16]:
start = time.time()
delays_df['ARRIVAL_DELAY_TEST'] = delays_df['ARR_DELAY'].apply(  \
                            lambda x: next((y for z, y in Delay_dict.items() if x in z), 0))
end = time.time()
print(f"{end-start} seconds")

453.0730800628662 seconds


In [17]:
# Output new csv as these additional variables take way too long each time jupyter is fired up
delays_df.to_csv("Delay_first_quarter1.csv",index=False)

DEPARTURE DELAY 10 FEATURES SCALED SGB (73.316%)

In [4]:
# Try out Stochastic Gradient Boosting on the previous best Departure Delay model
# Departure delay logistic ML model -> 10 features

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEPARTURE_TIME_OF_DAY_DUMMY", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
               "OP_CARRIER_FL_NUM", "TAXI_OUT", "WHEELS_OFF","WEATHER_DELAY",]]

y = delays_df["DEPARTURE_DELAY_DUMMY"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 10) (1683475, 1)


In [12]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

In [13]:
# Try out a min/max scaler for the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [25]:
# Set up and fit the Gradient Booster

from sklearn.ensemble import GradientBoostingClassifier
start = time.time()

gbc = GradientBoostingClassifier(n_estimators=100, random_state=rng)
gbc.fit(X_train, y_train)
end = time.time()
print(f"{end-start} seconds")

326.1881728172302 seconds


In [27]:
# Meh as usual
from sklearn.metrics import accuracy_score
predictions = gbc.predict(X_test)
prediction_p = gbc.predict_proba(X_test)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_transposed, predictions)*100}")

Accuracy Score: 73.33540840499063


In [28]:
pd.crosstab(y_transposed, predictions, rownames=["Actual Delays"], colnames=["Predicted Delays"])

Predicted Delays,0,1
Actual Delays,Unnamed: 1_level_1,Unnamed: 2_level_1
0,266054,16287
1,95936,42592


DEPARTURE DELAY 15+ MINUTES 10 FEATURES SCALED SGB (99%)

In [29]:
# Try out Stochastic Gradient Boosting on the previous best Departure Delay model with delays of 15+ minutes
# Departure delay logistic ML model -> 10 features

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEPARTURE_TIME_OF_DAY_DUMMY", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
               "OP_CARRIER_FL_NUM", "TAXI_OUT", "WHEELS_OFF","WEATHER_DELAY",]]

y = delays_df["DEPARTURE_DELAY_OVER_15_MINUTES"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 10) (1683475, 1)


In [30]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

In [31]:
# Try out a min/max scaler for the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [32]:
# Set up and fit the Gradient Booster

from sklearn.ensemble import GradientBoostingClassifier
start = time.time()

gbc = GradientBoostingClassifier(n_estimators=100, random_state=rng)
gbc.fit(X_train, y_train)
end = time.time()
print(f"{end-start} seconds")

317.8484616279602 seconds


In [34]:
# Well that was unexpected but we will take it.  99% is tough to argue with here.
from sklearn.metrics import accuracy_score
start = time.time()

predictions = gbc.predict(X_test)
prediction_p = gbc.predict_proba(X_test)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_transposed, predictions)*100}")
end = time.time()
print(f"{end-start} seconds")

Accuracy Score: 99.01014329874617
2.774925708770752 seconds


ARRIVAL DELAYS 18 FEATURES SCALED SGB (88.83%)

In [35]:
# Try out the same procedure for the best model for arrival delays.

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEP_DELAY", "DEPARTURE_TIME_OF_DAY_DUMMY", \
              "OP_CARRIER_FL_NUM", "TAXI_OUT", "AIR_TIME", "TAXI_IN", \
              "ARRIVAL_TIME_OF_DAY_DUMMY", "CARRIER_DELAY", "DISTANCE", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
       "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY",]]
y = delays_df["ARRIVAL_DELAY"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 18) (1683475, 1)


In [36]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split
start = time.time()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
end = time.time()
print(f"{end-start} seconds")

1.0429489612579346 seconds


In [37]:
# Try out a min/max scaler for the data
from sklearn.preprocessing import MinMaxScaler
start = time.time()

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
end = time.time()
print(f"{end-start} seconds")

1.5062346458435059 seconds


In [38]:
# Set up and fit the Gradient Booster

from sklearn.ensemble import GradientBoostingClassifier
start = time.time()

gbc = GradientBoostingClassifier(n_estimators=100, random_state=rng)
gbc.fit(X_train, y_train)
end = time.time()
print(f"{end-start} seconds")

451.72132897377014 seconds


In [39]:
# This is much better for arrival delays than what we got with the decision trees!  Perhaps we are to something.
from sklearn.metrics import accuracy_score
start = time.time()

predictions = gbc.predict(X_test)
prediction_p = gbc.predict_proba(X_test)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_transposed, predictions)*100}")
end = time.time()
print(f"{end-start} seconds")

Accuracy Score: 88.8768239048255
6.374078750610352 seconds


ARRIVAL DELAYS 15+ MINUTES 18 FEATURES SCALED SGB (80.98%)

In [63]:
# Try out the same procedure for the best model for arrival delays.

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEP_DELAY", "DEPARTURE_TIME_OF_DAY_DUMMY", \
              "OP_CARRIER_FL_NUM", "TAXI_OUT", "AIR_TIME", "TAXI_IN", \
              "ARRIVAL_TIME_OF_DAY_DUMMY", "CARRIER_DELAY", "DISTANCE", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
       "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY",]]
y = delays_df["ARRIVAL_DELAY_OVER_15_MINUTES"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 18) (1683475, 1)


In [64]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split
start = time.time()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
end = time.time()
print(f"{end-start} seconds")

0.9321613311767578 seconds


In [65]:
# Try out a min/max scaler for the data
from sklearn.preprocessing import MinMaxScaler
start = time.time()

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
end = time.time()
print(f"{end-start} seconds")

1.2436063289642334 seconds


In [66]:
# Set up and fit the Gradient Booster

from sklearn.ensemble import GradientBoostingClassifier
start = time.time()

gbc = GradientBoostingClassifier(n_estimators=100, random_state=rng)
gbc.fit(X_train, y_train)
end = time.time()
print(f"{end-start} seconds")

380.3071200847626 seconds


In [67]:
# And it went done to 80.98%.
from sklearn.metrics import accuracy_score
start = time.time()

predictions = gbc.predict(X_test)
prediction_p = gbc.predict_proba(X_test)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_transposed, predictions)*100}")
end = time.time()
print(f"{end-start} seconds")

Accuracy Score: 80.98030503553362
4.571773052215576 seconds


In [68]:
pd.crosstab(y_transposed, predictions, rownames=["Actual Delays"], colnames=["Predicted Delays"])

Predicted Delays,0,1
Actual Delays,Unnamed: 1_level_1,Unnamed: 2_level_1
0,277290,27816
1,52232,63531


ARRIVAL DELAY 8 FEATURES SCALED SGB (84.04%)

In [4]:
# Try to get an arrival delay model over 90%

X = delays_df[["DEP_TIME", "DEP_DELAY", "DEPARTURE_TIME_OF_DAY_DUMMY", \
              "OP_CARRIER_FL_NUM", "AIR_TIME",
              "ARRIVAL_TIME_OF_DAY_DUMMY","WEEKDAY_DUMMY", "AIRLINE_DUMMY", ]]
y = delays_df["ARRIVAL_DELAY"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 8) (1683475, 1)


In [5]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split
start = time.time()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
end = time.time()
print(f"{end-start} seconds")

0.6772623062133789 seconds


In [6]:
# Try out a min/max scaler for the data
from sklearn.preprocessing import MinMaxScaler
start = time.time()

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
end = time.time()
print(f"{end-start} seconds")

0.8187565803527832 seconds


In [7]:
# Set up and fit the Gradient Booster

from sklearn.ensemble import GradientBoostingClassifier
start = time.time()

gbc = GradientBoostingClassifier(n_estimators=100, random_state=rng)
gbc.fit(X_train, y_train)
end = time.time()
print(f"{end-start} seconds")

243.99114513397217 seconds


In [8]:
# And it went done to 84.034%.
from sklearn.metrics import accuracy_score
start = time.time()

predictions = gbc.predict(X_test)
prediction_p = gbc.predict_proba(X_test)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_transposed, predictions)*100}")
end = time.time()
print(f"{end-start} seconds")

Accuracy Score: 84.03374921887809
6.459548234939575 seconds


ARRIVAL DELAY 3 FEATURES SCALED SGB (83.87%)

In [24]:
# Try again to get an arrival delay model over 90%

X = delays_df[["DEP_TIME", "DEP_DELAY", "AIR_TIME"]]
y = delays_df["ARRIVAL_DELAY"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 3) (1683475, 1)


In [25]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split
start = time.time()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
end = time.time()
print(f"{end-start} seconds")

0.46752142906188965 seconds


In [26]:
# Try out a min/max scaler for the data
from sklearn.preprocessing import MinMaxScaler
start = time.time()

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
end = time.time()
print(f"{end-start} seconds")

0.13906121253967285 seconds


In [27]:
# Set up and fit the Gradient Booster

from sklearn.ensemble import GradientBoostingClassifier
start = time.time()

gbc = GradientBoostingClassifier(n_estimators=100, random_state=rng)
gbc.fit(X_train, y_train)
end = time.time()
print(f"{end-start} seconds")

170.77050852775574 seconds


In [28]:
# And it went done to 83.869%.
from sklearn.metrics import accuracy_score
start = time.time()

predictions = gbc.predict(X_test)
prediction_p = gbc.predict_proba(X_test)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_transposed, predictions)*100}")
end = time.time()
print(f"{end-start} seconds")

Accuracy Score: 83.86908990683561
6.020639896392822 seconds


In [29]:
pd.crosstab(y_transposed, predictions, rownames=["Actual Delays"], colnames=["Predicted Delays"])

Predicted Delays,Delayed,On Time
Actual Delays,Unnamed: 1_level_1,Unnamed: 2_level_1
Delayed,81686,56099
On Time,11791,271293


ARRIVAL DELAY 26b FEATURES SCALED SGB

In [34]:
# Try out a final model for arrival delays.  Seems more features the better.

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEP_DELAY", "DEPARTURE_TIME_OF_DAY_DUMMY", \
              "OP_CARRIER_FL_NUM", "TAXI_OUT", "AIR_TIME", "TAXI_IN", "WHEELS_ON", "WHEELS_OFF",\
              "ARRIVAL_TIME_OF_DAY_DUMMY", "CARRIER_DELAY", "DISTANCE", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
       "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY", "CANCELLED", "DIVERTED", \
              "EAST_COAST_ORIGIN", "WEST_COAST_ORIGIN", "EAST_COAST_DEST", "WEST_COAST_DEST",]]
y = delays_df["ARRIVAL_DELAY"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 26) (1683475, 1)


In [35]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split
start = time.time()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
end = time.time()
print(f"{end-start} seconds")

1.2902185916900635 seconds


In [36]:
# Try out a min/max scaler for the data
from sklearn.preprocessing import MinMaxScaler
start = time.time()

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
end = time.time()
print(f"{end-start} seconds")

1.980269432067871 seconds


In [37]:
# Set up and fit the Gradient Booster

from sklearn.ensemble import GradientBoostingClassifier
start = time.time()

gbc = GradientBoostingClassifier(n_estimators=100, random_state=rng)
gbc.fit(X_train, y_train)
end = time.time()
print(f"{end-start} seconds")

467.5918173789978 seconds


In [38]:
# This is the best of the bunch for arrival delays
from sklearn.metrics import accuracy_score
start = time.time()

predictions = gbc.predict(X_test)
prediction_p = gbc.predict_proba(X_test)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_transposed, predictions)*100}")
end = time.time()
print(f"{end-start} seconds")

Accuracy Score: 88.95570830828595
7.048933982849121 seconds


In [None]:
pd.crosstab(y_transposed, predictions, rownames=["Actual Delays"], colnames=["Predicted Delays"])

DEPARTURE DELAY 3 CATEGORIES SCALED SGB (70.52%)

In [10]:
# Try the 3 categorical y for Departure Delays

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEPARTURE_TIME_OF_DAY_DUMMY", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
               "OP_CARRIER_FL_NUM", "TAXI_OUT", "WHEELS_OFF","WEATHER_DELAY",]]

y = delays_df["DEPARTURE_DELAY_TEST"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 10) (1683475, 1)


In [11]:
y = pd.factorize(delays_df["DEPARTURE_DELAY_TEST"])[0]

In [12]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split
start = time.time()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
end = time.time()
print(f"{end-start} seconds")

0.5515913963317871 seconds


In [13]:
# Try out a min/max scaler for the data
from sklearn.preprocessing import MinMaxScaler
start = time.time()

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
end = time.time()
print(f"{end-start} seconds")

0.9294767379760742 seconds


In [14]:
# Set up and fit the Gradient Booster

from sklearn.ensemble import GradientBoostingClassifier
start = time.time()

gbc = GradientBoostingClassifier(n_estimators=100, random_state=rng)
gbc.fit(X_train, y_train)
end = time.time()
print(f"{end-start} seconds")

933.300167798996 seconds


In [15]:
# 15 minutes for 70.52%....
from sklearn.metrics import accuracy_score
start = time.time()

predictions = gbc.predict(X_test)
prediction_p = gbc.predict_proba(X_test)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_transposed, predictions)*100}")
end = time.time()
print(f"{end-start} seconds")

Accuracy Score: 70.52622074802373
11.630414724349976 seconds


In [16]:
pd.crosstab(y_transposed, predictions, rownames=["Actual Delays"], colnames=["Predicted Delays"])

Predicted Delays,0,1,2
Actual Delays,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,280489,1174,791
1,53169,14803,1259
2,65522,2131,1531


ARRIVAL DELAY 3 CATEGORIES SCALED SGB

In [17]:
# Try the 3 categorical y for Arrival Delays

X = delays_df[["DAY", "MONTH", "DEP_TIME", "DEPARTURE_TIME_OF_DAY_DUMMY", "WEEKDAY_DUMMY", "AIRLINE_DUMMY", \
               "OP_CARRIER_FL_NUM", "TAXI_OUT", "WHEELS_OFF","WEATHER_DELAY",]]

y = delays_df["ARRIVAL_DELAY_TEST"].values.reshape(-1, 1)
print(X.shape, y.shape)

(1683475, 10) (1683475, 1)


In [18]:
y = pd.factorize(delays_df["ARRIVAL_DELAY_TEST"])[0]

In [19]:
# Split for train and test datasets
from sklearn.model_selection import train_test_split
start = time.time()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
end = time.time()
print(f"{end-start} seconds")

0.5445461273193359 seconds


In [20]:
# Try out a min/max scaler for the data
from sklearn.preprocessing import MinMaxScaler
start = time.time()

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
end = time.time()
print(f"{end-start} seconds")

0.9118351936340332 seconds


In [21]:
# Set up and fit the Gradient Booster

from sklearn.ensemble import GradientBoostingClassifier
start = time.time()

gbc = GradientBoostingClassifier(n_estimators=100, random_state=rng)
gbc.fit(X_train, y_train)
end = time.time()
print(f"{end-start} seconds")

913.2641911506653 seconds


In [22]:
# Another 15 minutes for 72.271%...

from sklearn.metrics import accuracy_score
start = time.time()

predictions = gbc.predict(X_test)
prediction_p = gbc.predict_proba(X_test)
y_transposed = (np.transpose(y_test)).flatten()
print(f"Accuracy Score: {accuracy_score(y_transposed, predictions)*100}")
end = time.time()
print(f"{end-start} seconds")

Accuracy Score: 72.27094416552418
10.930592060089111 seconds


In [23]:
pd.crosstab(y_transposed, predictions, rownames=["Actual Delays"], colnames=["Predicted Delays"])

Predicted Delays,0,1,2
Actual Delays,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,277940,2590,2554
1,44885,21990,3074
2,58797,4803,4236


In [39]:
SGB_ML_outcomes = {
    "Model": ["Depature Delay", "Departure Delay 15+ Minutes", "Departure Delay 3 Categories", \
              "Arrival Delay", "", "", "", "Arrival Delay 15+ Minutes", "Arrival Delay 3 Categories"],
    
    "Model Type": ["Scaled", "Scaled", "Scaled", "Scaled", "Scaled", "Scaled", "Scaled", "Scaled", "Scaled", ],
    
    "Features": [10, 10, 10, 18, 18, 8, 3, 26, 18],
    
    "Test Data R2": [.7316, .99, .7052, .8883, .8098, .8404, .8387, .8896, .72271]
    
                }

SGB_ML_outcomes_df = pd.DataFrame(SGB_ML_outcomes)
SGB_ML_outcomes_df

Unnamed: 0,Model,Model Type,Features,Test Data R2
0,Depature Delay,Scaled,10,0.7316
1,Departure Delay 15+ Minutes,Scaled,10,0.99
2,Departure Delay 3 Categories,Scaled,10,0.7052
3,Arrival Delay,Scaled,18,0.8883
4,,Scaled,18,0.8098
5,,Scaled,8,0.8404
6,,Scaled,3,0.8387
7,Arrival Delay 15+ Minutes,Scaled,26,0.8896
8,Arrival Delay 3 Categories,Scaled,18,0.72271
