# XGBoost

In [1]:
import os
import pickle
import random
import time

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from numpy import sqrt
from sklearn.metrics import mean_squared_error,mean_absolute_error,explained_variance_score,r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, cross_validate, RepeatedKFold

In [2]:
# Load the dataset from CSV; the first line of the file holds the column names.
df = pd.read_csv('../data/df_pkill.csv', sep=',', header=0)

In [3]:
# Show the loaded frame as the cell's rich output.
df

Unnamed: 0,alt_sht,vel_sht,pit_sht,alt_tgt,vel_tgt,hdg_tgt,rgt_tgt,dist,delay,turn_dg,pkill
0,1900.094833,516.257515,-9.115002,9411.674491,389.059516,-179.508724,28.173016,22.107204,24.962478,5.107690,0.087
1,1668.030968,384.114554,-9.291737,6424.170046,538.332386,150.564222,21.635788,42.957053,27.889110,140.635249,0.016
2,1430.046650,552.946387,-16.802029,11292.969864,536.495037,-71.258760,47.099787,17.904767,28.814680,13.967480,0.070
3,1819.812543,524.681447,2.326836,3983.583185,354.232941,-147.655375,-25.450868,38.858097,25.736859,47.110727,0.020
4,1348.600786,367.370819,13.801087,8269.417066,533.015720,85.274320,-47.226769,27.176054,17.912385,137.619306,0.024
...,...,...,...,...,...,...,...,...,...,...,...
2855050,44376.436551,544.473934,-4.773440,16868.508237,620.848721,83.157352,32.501000,32.909110,21.978916,71.510008,0.031
2855051,44396.062176,555.458863,-12.651975,44331.857296,626.047542,28.146378,-26.005787,42.962424,27.934929,145.538436,0.024
2855052,44373.716110,609.965112,-24.517640,16754.178594,580.015564,168.638639,-16.550910,43.350364,15.081863,79.534893,0.043
2855053,44432.347726,617.846742,11.024131,24860.378262,658.322238,-145.516952,14.465779,40.760772,26.462078,84.072836,0.111


In [4]:
# Features are every column except the target `pkill`.
# `Unnamed: 0` is the row index that was written into the CSV and read back as
# an ordinary column; it is a row counter, not a measured quantity, so it must
# not be fed to the model. `errors='ignore'` keeps this cell working if the
# CSV is later re-exported without that column.
X = df.drop(columns=['pkill']).drop(columns=['Unnamed: 0'], errors='ignore')

# Regression target.
y = df['pkill']

## Train Test Split

In [5]:
# Hold out 20% of the rows as the test set, working on plain NumPy arrays so
# the folds can later be selected by integer position. The split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X.to_numpy(),
    y.to_numpy(),
    test_size=0.2,
    random_state=42,
)

## Training Process

### Hyperparameters

In [6]:
# Search space for the random search: each key is passed to XGBRegressor as a
# keyword argument and each value holds the candidates one is drawn from.
# Key order is significant because the draws are made in iteration order.
params = dict(
    n_estimators=[1_000_000],
    max_depth=[10, 12, 14, 16, 18, 20],
    learning_rate=[0.01, 0.1, 0.2, 0.3, 0.4],
    subsample=np.arange(0.5, 1.0, 0.1),
    colsample_bytree=np.arange(0.5, 1.0, 0.1),
    colsample_bylevel=np.arange(0.5, 1.0, 0.1),
    min_child_weight=[1, 3, 5],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    n_jobs=[-1],
)

# Number of random configurations to evaluate.
n_iter = 50
# Early-stopping rounds handed to the model's fit call.
patience = 10

In [7]:
def save_dict(dic, path_to_save, filename):
    """Pickle ``dic`` to ``<path_to_save>/<filename>.pkl``.

    Parameters
    ----------
    dic : object
        The dictionary (or any picklable object) to store.
    path_to_save : str
        Target directory, with or without a trailing path separator. It is
        created, including parents, when it does not exist. An empty string
        means the current working directory.
    filename : str
        File name without extension; ``.pkl`` is appended.
    """
    if path_to_save:
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(path_to_save, exist_ok=True)

    # os.path.join places the file inside the directory even when the caller
    # omits the trailing separator (plain concatenation would write
    # "<dir><filename>.pkl" next to the directory instead).
    target = os.path.join(path_to_save, f'{filename}.pkl')

    # The context manager closes the handle even if pickling raises.
    with open(target, "wb") as a_file:
        pickle.dump(dic, a_file)
    
def save_dict_csv(dic, path_to_save, filename):
    """Write ``dic`` to ``<path_to_save>/<filename>.csv`` as ``key,value`` lines.

    Keys and values are formatted with ``%s``; no quoting or escaping is
    applied, so entries containing commas or newlines are written verbatim.

    Parameters
    ----------
    dic : dict
        Mapping to store, one line per item in the dictionary's own order.
    path_to_save : str
        Target directory, with or without a trailing path separator. It is
        created, including parents, when it does not exist. An empty string
        means the current working directory.
    filename : str
        File name without extension; ``.csv`` is appended.
    """
    if path_to_save:
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(path_to_save, exist_ok=True)

    # os.path.join places the file inside the directory even when the caller
    # omits the trailing separator.
    target = os.path.join(path_to_save, f'{filename}.csv')

    with open(target, 'w') as f:
        f.writelines("%s,%s\n" % (key, value) for key, value in dic.items())

In [None]:
# Random search over `params`. Every iteration draws one configuration, fits it
# once per fold of a 5-fold split of the training data, scores each fitted
# model on the held-out test set, and writes to disk:
#   ./hyperparameters/<n>.csv  the drawn configuration
#   ./results/<n>.csv          per-fold metrics plus mean / std / sum rows
#   ./models/<n>.dat           the model fitted on the last fold

# Iterations below this number are skipped; lower it to 1 to run the full search.
first_iter = 43

metric_columns = ['fold', 'mae', 'mse', 'rmse', 'r2', 'training_time', 'inference_time']

for n in range(first_iter, n_iter + 1):
    print(f'\nInteraciton #{n}:')

    # Draw one candidate per hyperparameter. The generator is seeded with the
    # iteration number so iteration n always gets the same configuration, no
    # matter where the loop is resumed.
    rng = random.Random(n)
    params_chosen = {name: rng.choice(candidates) for name, candidates in params.items()}
    print(params_chosen)
    # Saving the hyperparameters
    save_dict_csv(params_chosen, './hyperparameters/', f'{n}')

    # Seeded shuffle: every configuration is evaluated on the same folds and a
    # re-run reproduces them.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)

    fold_rows = []
    for fold_no, (train, val) in enumerate(kfold.split(X_train, y_train), start=1):
        # A fresh model per fold so nothing carries over between folds.
        model = xgb.XGBRegressor(**params_chosen)

        print('------------------------------------------------------------------------')
        print(f'Training for fold {fold_no} ...')

        # Fit on the training part of the fold; the validation part is the last
        # eval_set entry and therefore the one early stopping monitors.
        # NOTE(review): passing eval_metric / early_stopping_rounds to fit() is
        # deprecated since XGBoost 1.6 in favour of the constructor - confirm
        # against the installed version before upgrading.
        st_wall = time.time()
        model.fit(X_train[train], y_train[train], eval_metric=['rmse'], verbose=50, eval_set=[(X_train[train], y_train[train]), (X_train[val], y_train[val])], early_stopping_rounds=patience)
        wall_time = time.time() - st_wall
        print('Training Execution time:', wall_time, 'seconds')

        # NOTE(review): scores are computed on X_test, not on the fold's
        # validation part; comparing configurations on these numbers uses the
        # test set for model selection - confirm this is intended.
        st_wall_inf = time.time()
        y_pred = model.predict(X_test)
        wall_time_inf = time.time() - st_wall_inf
        print('Inference Execution time:', wall_time_inf, 'seconds')

        # MSE is computed once and reused for the RMSE.
        mse = mean_squared_error(y_test, y_pred)
        fold_rows.append([
            fold_no,
            mean_absolute_error(y_test, y_pred),
            mse,
            sqrt(mse),
            r2_score(y_test, y_pred),
            wall_time,
            wall_time_inf,
        ])

    # Build the per-fold table once, then append summary rows computed from the
    # fold rows only (so no positional slicing around earlier summary rows).
    metrics = pd.DataFrame(fold_rows, columns=metric_columns)
    fold_stats = metrics.drop(columns='fold')
    summary = pd.DataFrame(
        [
            ['mean', *fold_stats.mean()],
            ['std', *fold_stats.std()],
            ['sum', *fold_stats.sum()],
        ],
        columns=metric_columns,
    )
    metrics = pd.concat([metrics, summary], ignore_index=True).set_index('fold')

    results_dir = './results/'
    os.makedirs(results_dir, exist_ok=True)
    metrics.to_csv(f'{results_dir}{n}.csv')

    models_dir = './models/'
    os.makedirs(models_dir, exist_ok=True)

    # Only the model from the last fold is still bound to `model` here, so
    # that is the one persisted for this configuration.
    joblib.dump(model, f'{models_dir}{n}.dat')


Interaciton #43:
{'n_estimators': 1000000, 'max_depth': 16, 'learning_rate': 0.1, 'subsample': 0.8999999999999999, 'colsample_bytree': 0.5, 'colsample_bylevel': 0.7999999999999999, 'min_child_weight': 1, 'gamma': 0.1, 'n_jobs': -1}
------------------------------------------------------------------------
Training for fold 1 ...
[0]	validation_0-rmse:32.17345	validation_1-rmse:32.37174
[50]	validation_0-rmse:10.44451	validation_1-rmse:16.74049
[100]	validation_0-rmse:7.51325	validation_1-rmse:15.81255
[150]	validation_0-rmse:6.15356	validation_1-rmse:15.24820
[200]	validation_0-rmse:5.63837	validation_1-rmse:15.06811
[250]	validation_0-rmse:5.37177	validation_1-rmse:15.01190
[300]	validation_0-rmse:5.15599	validation_1-rmse:14.94900
[350]	validation_0-rmse:4.91374	validation_1-rmse:14.90232
[400]	validation_0-rmse:4.71363	validation_1-rmse:14.85876
[429]	validation_0-rmse:4.61369	validation_1-rmse:14.83681
Training Execution time: 1020.9230089187622 seconds
Inference Execution time: 1.7