In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from data_cleaning import clean_flights_df, clean_passengers_df, avg_passengers, clean_fuel_df, avg_carrier_arr_delay, avg_fuel_use, avg_taxi_time

### Import datasets

In [2]:
flights_df = clean_flights_df('data/flights_sample.csv')
flights_df.head()

Unnamed: 0,mkt_unique_carrier,op_unique_carrier,origin,dest,crs_dep_time,dep_time,dep_delay,taxi_out,taxi_in,crs_arr_time,arr_time,arr_delay,crs_elapsed_time,actual_elapsed_time,distance,year,month,day_of_month,day_of_week,dep_hour
0,UA,UA,SFO,LAX,630,627.0,-3.0,16.0,13.0,816,753.0,-23.0,106.0,86.0,337,2019,7,17,2,6
1,WN,WN,BWI,BOS,1340,1400.0,20.0,36.0,5.0,1510,1609.0,59.0,90.0,129.0,369,2018,6,9,5,13
2,B6,B6,BDL,MCO,600,551.0,-9.0,16.0,9.0,906,845.0,-21.0,186.0,174.0,1050,2019,11,6,2,6
3,WN,WN,LAS,OMA,950,958.0,8.0,16.0,2.0,1425,1417.0,-8.0,155.0,139.0,1099,2019,12,24,1,9
4,WN,WN,MCO,MDW,2045,2042.0,-3.0,11.0,5.0,2235,2231.0,-4.0,170.0,169.0,990,2018,1,14,6,20


In [3]:
flights_df = flights_df.sample(frac=.1, random_state=58)
flights_df.reset_index(inplace=True)
flights_df.shape

(468152, 21)

In [4]:
flights_df.head()

Unnamed: 0,index,mkt_unique_carrier,op_unique_carrier,origin,dest,crs_dep_time,dep_time,dep_delay,taxi_out,taxi_in,...,arr_time,arr_delay,crs_elapsed_time,actual_elapsed_time,distance,year,month,day_of_month,day_of_week,dep_hour
0,891643,WN,WN,SDF,MDW,2120,2126.0,6.0,6.0,3.0,...,2124.0,-6.0,70.0,58.0,271,2018,6,25,0,21
1,3506629,DL,OO,PDX,SEA,1314,1306.0,-8.0,38.0,18.0,...,1437.0,20.0,63.0,91.0,129,2018,7,30,0,13
2,4059215,UA,EV,EWR,PVD,2100,2054.0,-6.0,38.0,5.0,...,2211.0,1.0,70.0,77.0,160,2019,8,5,0,21
3,3284365,AA,AA,JFK,SAN,1930,1950.0,20.0,38.0,4.0,...,2312.0,18.0,384.0,382.0,2446,2018,11,19,0,19
4,1958211,AA,AA,BWI,CLT,700,653.0,-7.0,30.0,3.0,...,828.0,-19.0,107.0,95.0,361,2019,11,26,1,7


In [5]:
passengers_df = clean_passengers_df('data/passengers.csv')
fuel_df = clean_fuel_df('data/fuel_consumption.csv')

### Encode Categorical Variables used in analysis

In [6]:
cat_cols = ['origin', 'dest']

enc = OneHotEncoder(drop='first')
encoded = enc.fit_transform(flights_df[cat_cols])
encoded_df = pd.DataFrame.sparse.from_spmatrix(encoded)
print(encoded_df.shape)
encoded_df.head()

(468152, 747)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,737,738,739,740,741,742,743,744,745,746
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
flights_enc = pd.concat([flights_df, encoded_df], axis=1)
print(flights_enc.shape)

(468152, 768)


In [8]:
flights_enc.drop('index', axis=1, inplace=True)
flights_enc.head()

Unnamed: 0,mkt_unique_carrier,op_unique_carrier,origin,dest,crs_dep_time,dep_time,dep_delay,taxi_out,taxi_in,crs_arr_time,...,737,738,739,740,741,742,743,744,745,746
0,WN,WN,SDF,MDW,2120,2126.0,6.0,6.0,3.0,2130,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,DL,OO,PDX,SEA,1314,1306.0,-8.0,38.0,18.0,1417,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,UA,EV,EWR,PVD,2100,2054.0,-6.0,38.0,5.0,2210,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,AA,AA,JFK,SAN,1930,1950.0,20.0,38.0,4.0,2254,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,AA,AA,BWI,CLT,700,653.0,-7.0,30.0,3.0,847,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Train Test Split and Feature Engineering

In [9]:
X_train, X_test, y_train, y_test = train_test_split(flights_enc, flights_enc['arr_delay'], test_size=.3, random_state=58)

In [10]:
# add average passengers and avg fuel use columns

X_train = avg_passengers(X_train, passengers_df)
X_train = avg_fuel_use(X_train, fuel_df)

In [11]:
# create average taxi in and out columns.
taxi_dep_mean_dict = round(X_train.groupby(X_train['dep_time']//100)['taxi_out'].mean(), 2).to_dict()
taxi_arr_mean_dict = round(X_train.groupby(X_train['arr_time']//100)['taxi_in'].mean(), 2).to_dict()

X_train['arr_hour'] = X_train['arr_time'] // 100
X_train['taxi_dep_mean'] = X_train['dep_hour'].map(taxi_dep_mean_dict)
X_train['taxi_arr_mean'] = X_train['arr_hour'].map(taxi_arr_mean_dict)
X_train.drop('arr_hour', axis=1, inplace=True)

In [12]:
# create avg_carrier_delay column
avg_carrier_arr_delay_dict = round(X_train.groupby('op_unique_carrier')['arr_delay'].mean(), 2).to_dict()
X_train['avg_carrier_arr_delay'] = X_train['op_unique_carrier'].map(avg_carrier_arr_delay_dict)

In [13]:
X_train.columns.tolist()

['mkt_unique_carrier',
 'op_unique_carrier',
 'origin',
 'dest',
 'crs_dep_time',
 'dep_time',
 'dep_delay',
 'taxi_out',
 'taxi_in',
 'crs_arr_time',
 'arr_time',
 'arr_delay',
 'crs_elapsed_time',
 'actual_elapsed_time',
 'distance',
 'year',
 'month',
 'day_of_month',
 'day_of_week',
 'dep_hour',
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,

In [14]:
# drop columns from train data that won't be used in model
X_train.drop(columns=['mkt_unique_carrier','op_unique_carrier', 'dest', 'origin', 'dep_time', 'dep_delay', 'taxi_in', 'taxi_out', 'arr_time', 'arr_delay', 'actual_elapsed_time'], inplace=True)
X_train.shape

(327706, 762)

#### Add Engineered features to test data

In [15]:
X_test = avg_passengers(X_test, passengers_df)
X_test = avg_fuel_use(X_test, fuel_df)

In [16]:
# map average delays and taxi times from training data onto test data
X_test['taxi_dep_mean'] = X_test['dep_hour'].map(taxi_dep_mean_dict)

X_test['arr_hour'] = X_test['crs_arr_time'] // 100
X_test['taxi_arr_mean'] = X_test['arr_hour'].map(taxi_arr_mean_dict)
X_test.drop('arr_hour', axis=1, inplace=True)


In [17]:
X_test['avg_carrier_arr_delay'] = X_test['op_unique_carrier'].map(avg_carrier_arr_delay_dict)

In [18]:
# check if engineered features created any null values
X_train.isna().sum().sum()

1

In [19]:
X_train[X_train.isna().any(axis=1)]

Unnamed: 0,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,year,month,day_of_month,day_of_week,dep_hour,0,...,743,744,745,746,monthly_avg_passengers,avg_monthly_fuel_gallons,avg_monthly_fuel_cost,taxi_dep_mean,taxi_arr_mean,avg_carrier_arr_delay
287991,1935,2302,327.0,2029,2018,11,24,5,19,0.0,...,0.0,0.0,0.0,0.0,,19589589.0,33303519.0,17.53,6.82,11.84


In [20]:
X_train.dropna(inplace=True)


In [21]:
y_train.drop(index=287991, inplace=True)

In [22]:
X_test.isna().sum().sum()

0

In [23]:
# drop columns from test data that won't be used in model
X_test.drop(columns=['mkt_unique_carrier', 'op_unique_carrier', 'dest', 'origin', 'dep_time', 'dep_delay', 'taxi_in', 'taxi_out', 'arr_time', 'arr_delay', 'actual_elapsed_time'], inplace=True)
print(X_test.shape)
print(X_train.shape)

(140446, 762)
(327705, 762)


In [24]:
print(X_test.shape, y_test.shape)
print(X_train.shape, y_train.shape)

(140446, 762) (140446,)
(327705, 762) (327705,)


In [25]:
# change all column names to strings
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

### Scale Numeric Features

In [26]:
X_train.reset_index(inplace=True, drop=True)
X_train.head()

Unnamed: 0,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,year,month,day_of_month,day_of_week,dep_hour,0,...,743,744,745,746,monthly_avg_passengers,avg_monthly_fuel_gallons,avg_monthly_fuel_cost,taxi_dep_mean,taxi_arr_mean,avg_carrier_arr_delay
0,2012,2125,73.0,185,2018,10,9,1,20,0.0,...,0.0,0.0,0.0,0.0,5384.0,300692722.0,524396865.0,18.09,7.98,7.22
1,1750,1915,85.0,388,2019,8,2,4,17,0.0,...,0.0,0.0,0.0,0.0,2409.0,331639036.0,569413868.0,17.87,8.5,7.22
2,951,1141,110.0,534,2019,12,2,0,9,0.0,...,0.0,0.0,0.0,0.0,2500.0,268893739.0,531949462.0,18.46,7.35,0.39
3,2230,2313,43.0,74,2019,8,25,6,22,0.0,...,0.0,0.0,0.0,0.0,1241.0,337147231.0,612541493.0,17.17,6.82,7.25
4,905,1018,73.0,255,2019,10,10,3,9,0.0,...,0.0,0.0,0.0,0.0,664.0,284918524.0,578736044.0,18.46,7.87,7.25


In [27]:
num_cols = ['crs_dep_time', 'crs_arr_time', 'crs_elapsed_time', 'distance', 'year', 'month', 'day_of_month', 'day_of_week', 'dep_hour', 'monthly_avg_passengers','avg_monthly_fuel_gallons', 'avg_monthly_fuel_cost', 'taxi_dep_mean', 'taxi_arr_mean', 'avg_carrier_arr_delay']
X_train_num = X_train[num_cols]

In [28]:
scaler = MinMaxScaler()
X_train_num_scaled = pd.DataFrame(scaler.fit_transform(X_train_num))

In [29]:
# replace unscaled num columns with scaled ones
X_train_scaled = pd.concat([X_train.drop(columns=num_cols), X_train_num_scaled], axis=1)
X_train_scaled.shape

(327705, 762)

In [30]:
X_train_scaled.isna().sum().sum()

0

In [31]:
# apply scaler to test data
X_test.reset_index(inplace=True, drop=True)
X_test_num = X_test[num_cols]

X_test_num_scaled = pd.DataFrame(scaler.transform(X_test_num))
X_test_scaled = pd.concat([X_test.drop(columns=num_cols), X_test_num_scaled], axis=1)
X_test_scaled.shape

(140446, 762)

In [32]:
X_test_scaled.isna().sum().sum()

0

## Model Training

### RidgeCV Model

In [43]:
X_train_scaled.columns = X_train_scaled.columns.astype(str)
X_test_scaled.columns = X_test_scaled.columns.astype(str)

In [39]:
from sklearn.linear_model import RidgeCV

rcv = RidgeCV()

rcv.fit(X_train_scaled, y_train)



In [41]:
import pickle
filename = 'Saved_models/Ben_RCVmodel1.sav'
pickle.dump(rcv, open(filename, 'wb'))

In [45]:
y_pred = rcv.predict(X_test_scaled)



In [47]:
print('RMSE: ', mean_squared_error(y_test, y_pred, squared=False))
print('R^2: ', r2_score(y_test, y_pred))

RMSE:  49.15911244406843
R^2:  0.013547816451841443


### AdaBoost Model

In [50]:
from sklearn.ensemble import AdaBoostRegressor

abr = AdaBoostRegressor()
abr.fit(X_train, y_train)



In [51]:
import pickle
filename = 'Saved_models/Ben_ABRmodel1.sav'
pickle.dump(abr, open(filename, 'wb'))

In [52]:
y_pred = abr.predict(X_test)
print('RMSE: ', mean_squared_error(y_test, y_pred, squared=False))
print('R^2: ', r2_score(y_test, y_pred))



RMSE:  51.59243906775994
R^2:  -0.08652592169270412


### XGBoost Model

In [None]:
# This is done with the data before encoding or scaling.

cats = X_train.select_dtypes(exclude=np.number).columns.tolist()

for col in cats:
    X_train[col] = X_train[col].astype('category')
    X_test[col] = X_test[col].astype('category')

import xgboost as xgb

dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

params = {'objective':'reg:squarederror', 'tree_method':'hist'}
n=1000
evals=[(dtrain_reg, 'train'), (dtest_reg, 'validation')]

model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=n,
    evals=evals,
    verbose_eval=25,
    early_stopping_rounds=50
)

y_pred = model.predict(dtest_reg)

print(mean_squared_error(y_test, y_pred, squared=False))
print(r2_score(y_test, y_pred))

import pickle
filename = 'Ben_XGBmodel1.sav'
pickle.dump(model, open(filename, 'wb'))

### ElasticNet with GridSearch

In [33]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV

elnet = ElasticNet()
params = {'alpha': [.001, .01, .1, 1], 'l1_ratio': [0, .25, .5, .75, 1 ]}

grid = GridSearchCV(estimator=elnet, param_grid=params)


In [36]:
X_train_scaled.columns=X_train_scaled.columns.astype(str)

In [37]:
grid.fit(X_train_scaled, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [38]:
X_test_scaled.columns = X_test_scaled.columns.astype(str)

y_pred = grid.predict(X_test_scaled)




In [39]:
import pickle
filename = 'Saved_models/Ben_ENmodel1.sav'
pickle.dump(grid, open(filename, 'wb'))

In [40]:
print('RMSE: ', mean_squared_error(y_test, y_pred, squared=False))
print('R^2: ', r2_score(y_test, y_pred))

RMSE:  49.139500751691195
R^2:  0.014334736183719166
