# Modeling 

## Loading modules and data

In [3]:
import numpy as np
import pandas as pd
from scipy import stats
import modules.help_functions as hf
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE

import pickle

from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import seaborn as sns

In [91]:
# read in file
df = pd.read_csv('../data/encoded_training_data_v7.csv')

## XGBOOST

### XGboost Model

In [85]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
data_dmatrix = xgb.DMatrix(data=X_train, label=y_train)

xgb_reg = xgb.XGBRegressor(objective = 'reg:squarederror',
                           max_depth = 8,
                           learning_rate = 0.40,
                           alpha = 67,  
                           n_estimators = 210,
                           colsample_bytree = 0.75,
                           reg_lambda = 47                           
                          )



xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_test)

print(f'MSE(test): {mean_squared_error(y_test, y_pred)}')
print(f'R2(test): {r2_score(y_test, y_pred)}') 
print(f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

MSE(test): 254.28943528355336
R2(test): 0.1364009119288616
MAE(test): 12.336826289482662


['origin_state_PA', 'dest_state_PA']

In [118]:
y_pred.max()

27.358557

In [119]:
y_pred.mean()

-9.536758

In [121]:
np.median(y_pred)

-9.452347

27.358557

In [139]:
pickle.dump(xgb_reg, open('xgboost_regressor_flight_delay_prediction.pkl', 'wb'))

                           max_depth = 8,
                           learning_rate = 0.40,
                           alpha = 67,  
                           n_estimators = 210,
                           colsample_bytree = 0.75,
                           reg_lambda = 47  

MSE(test): 254.28943528355336
R2(test): 0.1364009119288616
MAE(test): 12.336826289482662

### XGboost Cross Validation

In [5]:

import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from numpy import absolute
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=88)
data_dmatrix = xgb.DMatrix(data=X_train,label=y_train)

xgb = xgb.XGBRegressor(max_depth=8
                        )

# do the split
cv = KFold(n_splits=5)

# perform validation
scorers = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']
scores = cross_validate(xgb, X_train, y_train, scoring=scorers, cv=cv, n_jobs=5)

# Display the CV result
mse = absolute(scores['test_neg_mean_squared_error']).mean()
r2 = scores['test_r2'].mean()
mae = absolute(scores['test_neg_mean_absolute_error'].mean())
print(f'MSE(train_cv): {mse}',
      f'R2(train_cv): {r2}',
      f'MAE(train_cv): {mae}')

### XGBoost Grid Search

In [141]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
data_dmatrix = xgb.DMatrix(data=X_train, label=y_train)

xgb_estimator = xgb.XGBRegressor()


params = {
       'learning_rate': [0.4],
       'max_depth': [8],
       'alpha': [67],
       'n_estimators': [210],
       'colsample_bytree': [0.75],
       'reg_lambda': [47],
#        'subsample': [0.85],
    
#     'learning_rate': np.arange(0.39, 0.41, 0.05),
#     'max_depth': np.arange(2, 7, 1),
#     'alpha': np.arange(70, 75, 1),
#     'n_estimators': np.arange(110, 240, 100),
#     'colsample_bytree': np.arange(0.4, 0.6, 0.1),
#     'reg_lambda': np.arange(46, 47, 0.2),
#     'subsample': np.arange(0.5, 0.8, 0.1)
    }

# scorers = ['r2']
grid_search_xg = GridSearchCV(xgb_estimator, 
                              param_grid=params, 
                              cv=5, 
                              n_jobs=7, 
                              error_score='raise', 
#                               scoring=scorers,
#                               refit='r2',
                              verbose=True
                              )

grid_search_xg.fit(X_train, y_train)

# display the training results
print('\nTRAINING RESULTS: \n')
print('Best Training Score: ', grid_search_xg.best_score_, '\n')
print('Best Training Params: ', grid_search_xg.best_params_, '\n')

# predict the test set
y_pred = grid_search_xg.predict(X_test)

# display the test results
print('\nTESTING RESULTS: \n')
print(f'MSE(test): {mean_squared_error(y_test, y_pred)}\n')
print(f'R2(test): {r2_score(y_test, y_pred)}\n')
print(f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

Fitting 5 folds for each of 1 candidates, totalling 5 fits

TRAINING RESULTS: 

Best Training Score:  0.11872217665583107 

Best Training Params:  {'alpha': 67, 'colsample_bytree': 0.75, 'learning_rate': 0.4, 'max_depth': 9, 'n_estimators': 215, 'reg_lambda': 47} 


TESTING RESULTS: 

MSE(test): 258.2332949830316

R2(test): 0.12655033237352886

MAE(test): 12.458929761800258


TRAINING RESULTS: 

Best Training Score:  0.12707881846818264 

Best Training Params:  {'alpha': 67, 'colsample_bytree': 0.75, 'learning_rate': 0.4, 'max_depth': 8, 'n_estimators': 210, 'reg_lambda': 47} 


TESTING RESULTS: 

MSE(test): 256.0030885554617

R2(test): 0.1387974194963002

MAE(test): 12.371004241235651

TRAINING RESULTS: 

Best Training Score:  0.12976675549019118 

Best Training Params:  {'alpha': 67, 'colsample_bytree': 0.75, 'learning_rate': 0.4, 'max_depth': 8, 'n_estimators': 150, 'reg_lambda': 47} 


TESTING RESULTS: 

MSE(test): 257.5834202293785

R2(test): 0.1350875364320151

MAE(test): 12.428264899925443

TRAINING RESULTS: 

Best Training Score:  0.12712034854855964 

Best Training Params:  {'alpha': 67, 'colsample_bytree': 0.75, 'learning_rate': 0.4, 'max_depth': 8, 'n_estimators': 98, 'reg_lambda': 47} 


TESTING RESULTS: 

MSE(test): 254.77896373729288

R2(test): 0.13461637547722827

MAE(test): 12.39622662771655

TRAINING RESULTS: 

Best Training Score:  0.12695598410296655 

Best Training Params:  {'alpha': 67, 'colsample_bytree': 0.75, 'learning_rate': 0.4, 'max_depth': 8, 'n_estimators': 95, 'reg_lambda': 47} 


TESTING RESULTS: 

MSE(test): 254.5986314126004

R2(test): 0.13475513521485982

MAE(test): 12.37124815463066

TRAINING RESULTS: 

Best Training Score:  0.12740440831402772 

Best Training Params:  {'alpha': 67, 'colsample_bytree': 0.75, 'learning_rate': 0.4, 'max_depth': 8, 'n_estimators': 95, 'reg_lambda': 48} 


TESTING RESULTS: 

MSE(test): 255.33450792199608

R2(test): 0.13356728212474045

MAE(test): 12.367355220133263

TRAINING RESULTS: 

Best Training Score:  0.11666210635790775 

Best Training Params:  {'alpha': 66, 'colsample_bytree': 0.75, 'learning_rate': 0.4, 'max_depth': 8, 'n_estimators': 95, 'subsample': 0.7999999999999999} 


TESTING RESULTS: 

MSE(test): 259.1570317268451

R2(test): 0.13046664778186456

MAE(test): 12.464682652738391

TRAINING RESULTS: 

Best Training Score:  0.12021755236706036 

Best Training Params:  {'alpha': 66, 'colsample_bytree': 0.75, 'learning_rate': 0.4, 'max_depth': 8, 'n_estimators': 95, 'subsample': 0.9} 


TESTING RESULTS: 

MSE(test): 256.1074340949077

R2(test): 0.12963970581453366

MAE(test): 12.389870936478864

Fitting 5 folds for each of 2 candidates, totalling 10 fits

TRAINING RESULTS: 

Best Training Score:  0.12751638841283003 

Best Training Params:  {'alpha': 66, 'colsample_bytree': 0.75, 'learning_rate': 0.4, 'max_depth': 8, 'n_estimators': 89, 'reg_lambda': 50} 


TESTING RESULTS: 

MSE(test): 258.0654389585468

R2(test): 0.13102962457716028

MAE(test): 12.442337417718852

Fitting 5 folds for each of 3 candidates, totalling 15 fits

TRAINING RESULTS: 

Best Training Score:  0.12923086543391069 

Best Training Params:  {'alpha': 66, 'colsample_bytree': 0.75, 'learning_rate': 0.4, 'max_depth': 8, 'n_estimators': 89, 'reg_lambda': 50} 


TESTING RESULTS: 

MSE(test): 255.50967649063617

R2(test): 0.1276314184474704

MAE(test): 12.381488035703404

Best Training Params:  {'alpha': 66, 'colsample_bytree': 0.7, 'learning_rate': 0.4, 'max_depth': 8, 'n_estimators': 89} 


TESTING RESULTS: 

MSE(test): 258.93913444282003

R2(test): 0.13097904773991476

MAE(test): 12.446733791732251

Best Training Params:  {'alpha': 66, 'learning_rate': 0.4, 'max_depth': 8, 'n_estimators': 89} 


TESTING RESULTS: 

MSE(test): 257.8733703365043

R2(test): 0.13188754725021345

MAE(test): 12.443249307597307

Best Training Params:  {'alpha': 82, 'learning_rate': 0.4, 'max_depth': 6} 


TESTING RESULTS: 

MSE(test): 258.33438266944904

R2(test): 0.12594918248523412

MAE(test): 12.461076971570323

TRAINING RESULTS: 

Best Training Score:  0.1271900214900052 

Best Training Params:  {'alpha': 67, 'colsample_bytree': 0.799, 'learning_rate': 0.5, 'max_depth': 9, 'n_estimators': 84, 'reg_lambda': 50} 


TESTING RESULTS: 

MSE(test): 256.1002528751888

R2(test): 0.13422303610153852

### XGboost Unseen Data Testing

In [112]:
df_t = pd.read_csv('../data/testing_test_data_v2.csv')
df_t.drop(columns=['origin_state_PA', 'dest_state_PA'], inplace=True, errors='ignore')
test_data = df_t.to_numpy()
y_pred = xgb_reg.predict(test_data)
df_flights_test = pd.read_csv('../data/flights_test.csv')
s_pred = pd.Series(y_pred, name='predicted_delay')
df_results = pd.concat([df_flights_test, s_pred], axis=1)
df_results.to_csv('../data/submission.csv', index=False)

## AdaBoost Regressor

In [25]:
from sklearn.ensemble import AdaBoostRegressor 
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=88)

ada = AdaBoostRegressor()

ada.fit(X_train, y_train)
y_pred = ada.predict(X_test)

print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}', 
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

MSE(test): 299.1954934737365 R2(test): -0.016231253066080198 MAE(test): 14.059891568429437


## SVM

In [None]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

svr = SVR()

svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)

print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}', 
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

# Random Forest

In [27]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=88)

rfr = RandomForestRegressor()

rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)

print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}', 
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

MSE(test): 317.7557421592085 R2(test): -0.07927199127997642 MAE(test): 13.726514526508495


## Bagging Regressor

In [28]:
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=88)

bag = BaggingRegressor()

bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)

print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}', 
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

MSE(test): 335.0010462906602 R2(test): -0.1378464598441198 MAE(test): 14.09878593390384


## GradientBoosting Regressor

In [29]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=88)

gb_reg = GradientBoostingRegressor()

gb_reg.fit(X_train, y_train)
y_pred = gb_reg.predict(X_test)

print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}', 
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

MSE(test): 272.3731765263293 R2(test): 0.07487197995768824 MAE(test): 12.850633238016195


## Voting Regressor

In [30]:
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=88)

v_reg = VotingRegressor(estimators=)

v_reg.fit(X_train, y_train)
y_pred = v_reg.predict(X_test)

print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}', 
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

TypeError: __init__() missing 1 required positional argument: 'estimators'

## HistGradientBoosting Regressor

In [40]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=88)

hgb_reg = HistGradientBoostingRegressor(learning_rate=0.1,
                                        max_iter=100,
                                        max_leaf_nodes=31,
                                        max_depth=20,
                                        min_samples_leaf=20,
                                        l2_regularization=0.1)

hgb_reg.fit(X_train, y_train)
y_pred = hgb_reg.predict(X_test)

print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}', 
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

MSE(test): 263.30785151482485 R2(test): 0.10566277325785911 MAE(test): 12.628034138924694


## Stacking Regressor

In [38]:
# DONT' RUN
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=88)

st_reg = StackingRegressor(estimators=)

st_reg.fit(X_train, y_train)
y_pred = st_reg.predict(X_test)

print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}', 
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

TypeError: 'numpy.int32' object is not iterable