## Feature Engineering 

In [17]:
import numpy as np
import pandas as pd
from scipy import stats
import modules.help_functions as hf
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE

import pickle

from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import seaborn as sns

In [18]:
# read in file
df = pd.read_csv('../data/encoded_training_data_v5.csv')
# df_delays = hf.get_avg_dest_delay(df, ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay'])

### XGBOOST

In [19]:
# XGboost Model
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=88)
data_dmatrix = xgb.DMatrix(data=X_train, label=y_train)

xgb = xgb.XGBRegressor(learning_rate = 0.51,
                       max_depth = 7,                        
                       alpha = 60,
                       n_estimators = 84, 
                       n_jobs = -1, 
                       subsample = 0.85,
                       colsample_bytree = 0.85,
                       reg_lambda = 59.1,
                       gamma = 0,
                       min_child_weight = 1,
                       max_delta_step = 0,
                       sampling_method = 'uniform'        
                      )


xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}', 
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

MSE(test): 255.1673464202419 R2(test): 0.13331237317933997 MAE(test): 12.401466715820652


In [20]:
# XGboost Cross Validation
# import xgboost as xgb
# from sklearn.metrics import mean_squared_error, r2_score
# from sklearn.model_selection import train_test_split
# from numpy import absolute
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import RepeatedKFold
# from sklearn.model_selection import KFold
# from sklearn.model_selection import cross_validate

# y = df.arr_delay.to_numpy()
# X = df.drop(columns=['arr_delay']).to_numpy()
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=88)
# data_dmatrix = xgb.DMatrix(data=X_train,label=y_train)

# xgb = xgb.XGBRegressor(
#                         )

# # do the split
# cv = KFold(n_splits=5)

# # perform validation
# scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2', 'neg_root_mean_squared_error']
# scores = cross_validate(xgb, X_train, y_train, scoring=scoring, cv=cv, n_jobs=-1)

In [21]:
# Display the CV result
# mse = absolute(scores['test_neg_mean_squared_error']).mean()
# r2 = scores['test_r2'].mean()
# mae = absolute(scores['test_neg_mean_absolute_error'].mean())
# print(f'MSE(train_cv): {mse}',
#       f'R2(train_cv): {r2}',
#       f'MAE(train_cv): {mae}')

In [22]:
# Grid Search for XBBoost
# DON'T RUN
# from sklearn.model_selection import GridSearchCV
# from xgboost import XGBRegressor

# y = df.arr_delay.to_numpy()
# X = df.drop(columns=['arr_delay']).to_numpy()
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# xgb = XGBRegressor()

# params = {
#     'learning_rate': np.arange(1, 1.1, 0.01),
#     'max_depth': np.arange(6, 9, 1),
#     'alpha': np.arange(50, 55, 1),
#     'n_estimators': np.arange(25, 30, 1),
#     'colsample_bytree': np.arange(0.7, 0.9, 0.05)
# }

# xgb_grid_search = GridSearchCV(xgb, 
#                                param_grid=params, 
#                                cv=5, 
#                                n_jobs=6, 
#                                error_score='raise', 
#                                verbose=True)
# xgb_grid_search.fit(X_train, y_train)

### SVM 

In [23]:
# # Grid Search for estimator SVM
# import xgboost as xgb
# from sklearn.metrics import mean_squared_error, r2_score
# from sklearn.model_selection import train_test_split
# from sklearn.svm import SVR

# y = df.arr_delay.to_numpy()
# X = df.drop(columns=['arr_delay']).to_numpy()
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# svm = svm.SVR()

# params = {
#     'C': np.arange(1, 2.5, 0.1),
#     'probability': [True, False],
# }

# svm_grid_search = GridSearchCV(svm, 
#                            param_grid=params, 
#                            cv=5, 
#                            n_jobs=-1, 
#                            error_score='raise', 
#                            verbose=True)
# svm_grid_search.fit(X_train, y_train)

In [24]:
# svm model
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

#Create a svm Classifier
svm_reg = SVR(verbose=True)

#Train the model using the training sets
svm_reg.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = svm_reg.predict(X_test)
print(f'MSE: {mean_squared_error(y_test, y_pred)}', f'R2: {r2_score(y_test, y_pred)}')

[LibSVM]MSE: 272.4362563578315 R2: 0.07465772640905877


## AdaBoost Regressor

In [25]:
from sklearn.ensemble import AdaBoostRegressor 
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=88)

ada = AdaBoostRegressor()

ada.fit(X_train, y_train)
y_pred = ada.predict(X_test)

print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}', 
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

MSE(test): 299.1954934737365 R2(test): -0.016231253066080198 MAE(test): 14.059891568429437


## SVM

In [26]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=88)

svr = SVR()

svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)

print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}', 
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

ModuleNotFoundError: No module named 'skearn'

# Random Forest

In [27]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=88)

rfr = RandomForestRegressor()

rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)

print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}', 
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

MSE(test): 317.7557421592085 R2(test): -0.07927199127997642 MAE(test): 13.726514526508495


## Bagging Regressor

In [28]:
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=88)

bag = BaggingRegressor()

bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)

print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}', 
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

MSE(test): 335.0010462906602 R2(test): -0.1378464598441198 MAE(test): 14.09878593390384


## GradientBoosting Regressor

In [29]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=88)

gb_reg = GradientBoostingRegressor()

gb_reg.fit(X_train, y_train)
y_pred = gb_reg.predict(X_test)

print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}', 
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

MSE(test): 272.3731765263293 R2(test): 0.07487197995768824 MAE(test): 12.850633238016195


## Voting Regressor

In [30]:
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=88)

v_reg = VotingRegressor(estimators=)

v_reg.fit(X_train, y_train)
y_pred = v_reg.predict(X_test)

print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}', 
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

TypeError: __init__() missing 1 required positional argument: 'estimators'

## HistGradientBoosting Regressor

In [40]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=88)

hgb_reg = HistGradientBoostingRegressor(learning_rate=0.1,
                                        max_iter=100,
                                        max_leaf_nodes=31,
                                        max_depth=20,
                                        min_samples_leaf=20,
                                        l2_regularization=0.1)

hgb_reg.fit(X_train, y_train)
y_pred = hgb_reg.predict(X_test)

print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}', 
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

MSE(test): 263.30785151482485 R2(test): 0.10566277325785911 MAE(test): 12.628034138924694


## Stacking Regressor

In [38]:
# DONT' RUN
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=88)

st_reg = StackingRegressor(estimators=)

st_reg.fit(X_train, y_train)
y_pred = st_reg.predict(X_test)

print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}', 
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

TypeError: 'numpy.int32' object is not iterable