# XGBoost Model

## Import modules

In [1]:
import numpy as np
import pandas as pd
import sklearn
import pickle

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import xgboost as xgb

## Load the data set

In [2]:
# Read the encoded training data from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../data/encoded_training_data_v7.csv')

## Train Test Split

In [3]:
# Separate the target column (`arr_delay`) from the remaining feature columns
# and convert both to numpy arrays.
y = df['arr_delay'].to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()

# Hold out 20% of the rows as a test set. A fixed random_state makes the split
# identical on every "Restart Kernel -> Run All", so the metrics reported
# further down can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)


## Grid Search

In [1]:
# Cross-validated grid search over XGBoost hyper-parameters, followed by an
# evaluation of the refitted best estimator on the held-out test split.
# GridSearchCV is fitted on the numpy arrays directly, so no xgb.DMatrix is
# built here.

# instantiate a xgboost regressor; the grid below supplies the tuned values
xgb_estimator = xgb.XGBRegressor()

# Hyper-parameter grid: each key maps to the list of candidate values to try.
# This single-point grid is the combination listed as "Best Training Params"
# in the Result section and used by the Final Model cell. Add more values to
# any list to widen the search (the number of fits grows multiplicatively).
params = {
    'learning_rate': [0.4],
    'max_depth': [8],
    'alpha': [67],
    'n_estimators': [210],
    'colsample_bytree': [0.75],
    'reg_lambda': [47],
}

scorers = ['r2']  # metrics computed for every candidate during CV

# perform the search
grid_search_xg = GridSearchCV(xgb_estimator,
                              param_grid=params,
                              cv=5,                 # 5-fold cross-validation
                              n_jobs=-1,            # use all available cores
                              error_score='raise',  # fail loudly if a fit errors
                              scoring=scorers,
                              refit='r2',           # metric used to pick and refit the best candidate
                              verbose=True
                              )
# fit the model
grid_search_xg.fit(X_train, y_train)

# display the training results
print('\nTRAINING RESULTS: \n')
print('Best Training Score: ', grid_search_xg.best_score_, '\n')
print('Best Training Params: ', grid_search_xg.best_params_, '\n')

# Perform the test with the model:
# predict the test set with the refitted best estimator
y_pred = grid_search_xg.predict(X_test)

# display the test results
print('\nTESTING RESULTS: \n')
print(f'MSE(test): {mean_squared_error(y_test, y_pred)}\n')
print(f'R2(test): {r2_score(y_test, y_pred)}\n')
print(f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

## Final Model

In [None]:

# Train the final regressor with fixed hyper-parameters (the same values that
# appear as "Best Training Params" in the Result section) and report its error
# on the held-out test split. The scikit-learn style estimator is fitted on the
# numpy arrays directly, so no xgb.DMatrix is built here.
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror',
                           max_depth=8,
                           learning_rate=0.40,
                           alpha=67,
                           n_estimators=210,
                           colsample_bytree=0.75,
                           reg_lambda=47
                           )

# fit on the training split and predict the test split
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_test)

# display the test metrics
print(f'MSE(test): {mean_squared_error(y_test, y_pred)}')
print(f'R2(test): {r2_score(y_test, y_pred)}')
print(f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

## Result

```
TRAINING RESULTS: 

Best Training Score:  0.12707881846818264 

Best Training Params:  {'alpha': 67, 'colsample_bytree': 0.75, 'learning_rate': 0.4, 'max_depth': 8, 'n_estimators': 210, 'reg_lambda': 47} 


TESTING RESULTS: 

MSE(test): 256.0030885554617

R2(test): 0.1387974194963002

MAE(test): 12.371004241235651
```

## Save the Model

In [None]:
# Persist the fitted regressor to disk. The context manager ensures the file
# handle is flushed and closed even if pickling raises.
with open('xgboost_regressor_flight_delay_prediction.pkl', 'wb') as model_file:
    pickle.dump(xgb_reg, model_file)