# XGBoost

## Libraries

In [2]:
from sklearnex import patch_sklearn 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
patch_sklearn()
from xgboost import XGBClassifier
from collections import Counter
from imblearn.over_sampling import SMOTE, ADASYN, SMOTENC
from imblearn.combine import SMOTETomek, SMOTEENN 
import matplotlib.pyplot as plt
from numpy import where
import time

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
df_train = pd.read_csv('../data/df_train.csv')
df_test = pd.read_csv('../data/df_test.csv')

X_train = df_train.drop('kill', axis=1)
y_train = df_train['kill']
X_test = df_test.drop(['kill'], axis=1)
y_test = df_test['kill']

X_train = X_train.values
y_train = y_train.values
X_test = X_test.values
y_test = y_test.values

In [4]:
def fit_and_print(model, X_train, y_train):
    model.fit(X_train, y_train)  
    y_pred = model.predict(X_test)
    print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))  
    print("Classification Report: \n", classification_report(y_test, y_pred))  
    print("Accuracy: ", round(accuracy_score(y_test, y_pred),3))
    print("Precision:", round(precision_score(y_test, y_pred),3))
    print("Recall:", round(recall_score(y_test, y_pred),3))
    print("f1: ", round(f1_score(y_test, y_pred),3))

In [5]:
model = XGBClassifier(random_state=42, use_label_encoder=False)

In [6]:
fit_and_print(model, X_train, y_train)

Confusion Matrix: 
 [[19815   406]
 [ 2075   745]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.91      0.98      0.94     20221
           1       0.65      0.26      0.38      2820

    accuracy                           0.89     23041
   macro avg       0.78      0.62      0.66     23041
weighted avg       0.87      0.89      0.87     23041

Accuracy:  0.892
Precision: 0.647
Recall: 0.264
f1:  0.375


## GridSearch

In [7]:
param_grid = {
        'gamma': [0.5, 1, 1.5],
        'subsample': [0.6, 0.8],
        'colsample_bytree': [0.6, 0.8],
        'max_depth': [3, 4, 5]
}

In [8]:
# Create a based model
xgboost = XGBClassifier(random_state=42, use_label_encoder=False)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = xgboost, param_grid = param_grid, cv = 5, n_jobs = -1, verbose = 10)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 5/5; 7/36] START colsample_bytree=0.6, gamma=1, max_depth=3, subsample=0.6..
[CV 5/5; 7/36] END colsample_bytree=0.6, gamma=1, max_depth=3, subsample=0.6;, score=0.895 total time=12.4min
[CV 5/5; 10/36] START colsample_bytree=0.6, gamma=1, max_depth=4, subsample=0.8.
[CV 5/5; 10/36] END colsample_bytree=0.6, gamma=1, max_depth=4, subsample=0.8;, score=0.895 total time=17.8min
[CV 4/5; 20/36] START colsample_bytree=0.8, gamma=0.5, max_depth=3, subsample=0.8
[CV 4/5; 20/36] END colsample_bytree=0.8, gamma=0.5, max_depth=3, subsample=0.8;, score=0.895 total time=12.1min
[CV 4/5; 26/36] START colsample_bytree=0.8, gamma=1, max_depth=3, subsample=0.8.
[CV 4/5; 26/36] END colsample_bytree=0.8, gamma=1, max_depth=3, subsample=0.8;, score=0.895 total time=12.2min
[CV 4/5; 32/36] START colsample_bytree=0.8, gamma=1.5, max_depth=3, subsample=0.8
[CV 4/5; 32/36] END colsample_bytree=0.8, gamma=1.5, max_depth=3, subsample=0.8;, scor

GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=42,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, u

In [9]:
grid_search.best_params_

{'colsample_bytree': 0.8, 'gamma': 1.5, 'max_depth': 5, 'subsample': 0.8}

In [10]:
best_grid = grid_search.best_estimator_
best_grid

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=1.5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=40, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [11]:
grid_search.best_score_

0.895829663386053

In [12]:
fit_and_print(best_grid,X_train,y_train)

Confusion Matrix: 
 [[19830   391]
 [ 2101   719]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.98      0.94     20221
           1       0.65      0.25      0.37      2820

    accuracy                           0.89     23041
   macro avg       0.78      0.62      0.65     23041
weighted avg       0.87      0.89      0.87     23041

Accuracy:  0.892
Precision: 0.648
Recall: 0.255
f1:  0.366


In [13]:
def calculate_pred_and_inf_time(best_grid, X_test):
    # get the start time
    st_wall_inf = time.time()

    # Generate generalization metrics
    grid_predictions = best_grid.predict(X_test)

    # get the end time
    et_wall_inf = time.time()

    # get execution time
    wall_time_inf = et_wall_inf - st_wall_inf
    print(f'Inference Time: {1000*wall_time_inf:.3f} miliseconds')

calculate_pred_and_inf_time(best_grid, X_test)

Inference Time: 17.076 miliseconds


## Resampling

### SMOTE

In [14]:
# Oversample and plot imbalanced dataset with SMOTE

# summarize class distribution
counter = Counter(y_train)
print(counter)
# transform the dataset
oversample = SMOTE(random_state=42)
X_train_rel, y_train_rel = oversample.fit_resample(X_train, y_train)
# summarize the new class distribution
counter = Counter(y_train_rel)
print(counter)

fit_and_print(best_grid, X_train_rel, y_train_rel)

calculate_pred_and_inf_time(best_grid, X_test)

Counter({0: 114988, 1: 15577})
[CV 3/5; 7/36] START colsample_bytree=0.6, gamma=1, max_depth=3, subsample=0.6..
[CV 3/5; 7/36] END colsample_bytree=0.6, gamma=1, max_depth=3, subsample=0.6;, score=0.896 total time=12.7min
[CV 1/5; 11/36] START colsample_bytree=0.6, gamma=1, max_depth=5, subsample=0.6.
[CV 1/5; 11/36] END colsample_bytree=0.6, gamma=1, max_depth=5, subsample=0.6;, score=0.893 total time=21.7min
[CV 4/5; 22/36] START colsample_bytree=0.8, gamma=0.5, max_depth=4, subsample=0.8
[CV 4/5; 22/36] END colsample_bytree=0.8, gamma=0.5, max_depth=4, subsample=0.8;, score=0.895 total time=17.2min
[CV 2/5; 30/36] START colsample_bytree=0.8, gamma=1, max_depth=5, subsample=0.8.
[CV 2/5; 30/36] END colsample_bytree=0.8, gamma=1, max_depth=5, subsample=0.8;, score=0.896 total time=20.2min
Counter({0: 114988, 1: 114988})
[CV 1/5; 7/36] START colsample_bytree=0.6, gamma=1, max_depth=3, subsample=0.6..
[CV 1/5; 7/36] END colsample_bytree=0.6, gamma=1, max_depth=3, subsample=0.6;, score=0

### ADASYN

In [15]:
# Oversample and plot imbalanced dataset with ADASYN

# summarize class distribution
counter = Counter(y_train)
print(counter)
# transform the dataset
oversample = ADASYN(random_state=42)
X_train_rel, y_train_rel = oversample.fit_resample(X_train, y_train)
# summarize the new class distribution
counter = Counter(y_train_rel)
print(counter)

fit_and_print(best_grid, X_train_rel, y_train_rel)

calculate_pred_and_inf_time(best_grid, X_test)

Counter({0: 114988, 1: 15577})
Counter({0: 114988, 1: 112141})
[CV 5/5; 1/36] START colsample_bytree=0.6, gamma=0.5, max_depth=3, subsample=0.6
[CV 5/5; 1/36] END colsample_bytree=0.6, gamma=0.5, max_depth=3, subsample=0.6;, score=0.895 total time=13.5min
[CV 3/5; 12/36] START colsample_bytree=0.6, gamma=1, max_depth=5, subsample=0.8.
[CV 3/5; 12/36] END colsample_bytree=0.6, gamma=1, max_depth=5, subsample=0.8;, score=0.897 total time=20.1min
[CV 4/5; 21/36] START colsample_bytree=0.8, gamma=0.5, max_depth=4, subsample=0.6
[CV 4/5; 21/36] END colsample_bytree=0.8, gamma=0.5, max_depth=4, subsample=0.6;, score=0.895 total time=17.5min
[CV 1/5; 30/36] START colsample_bytree=0.8, gamma=1, max_depth=5, subsample=0.8.
[CV 1/5; 30/36] END colsample_bytree=0.8, gamma=1, max_depth=5, subsample=0.8;, score=0.894 total time=20.8min
[CV 2/5; 1/36] START colsample_bytree=0.6, gamma=0.5, max_depth=3, subsample=0.6
[CV 2/5; 1/36] END colsample_bytree=0.6, gamma=0.5, max_depth=3, subsample=0.6;, sco

### SMOTE and TL

In [18]:
# Oversample and plot imbalanced dataset with SMOTE and TL

# summarize class distribution
counter = Counter(y_train)
print(counter)
# transform the dataset
oversample = SMOTETomek(random_state=42)
X_train_rel, y_train_rel = oversample.fit_resample(X_train, y_train)
# summarize the new class distribution
counter = Counter(y_train_rel)
print(counter)

fit_and_print(best_grid, X_train_rel, y_train_rel)

calculate_pred_and_inf_time(best_grid, X_test)

Counter({0: 114988, 1: 15577})
Counter({0: 106266, 1: 106266})
Confusion Matrix: 
 [[17283  2938]
 [ 1104  1716]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.85      0.90     20221
           1       0.37      0.61      0.46      2820

    accuracy                           0.82     23041
   macro avg       0.65      0.73      0.68     23041
weighted avg       0.87      0.82      0.84     23041

Accuracy:  0.825
Precision: 0.369
Recall: 0.609
f1:  0.459
Inference Time: 6.176 miliseconds
[CV 3/5; 3/36] START colsample_bytree=0.6, gamma=0.5, max_depth=4, subsample=0.6
[CV 3/5; 3/36] END colsample_bytree=0.6, gamma=0.5, max_depth=4, subsample=0.6;, score=0.897 total time=11.2min
[CV 1/5; 10/36] START colsample_bytree=0.6, gamma=1, max_depth=4, subsample=0.8.
[CV 1/5; 10/36] END colsample_bytree=0.6, gamma=1, max_depth=4, subsample=0.8;, score=0.894 total time=17.5min
[CV 5/5; 19/36] START colsample_bytree=0.8, gamma=0.5, m

### SMOTE and ENN

In [17]:
# Oversample and plot imbalanced dataset with SMOTE and ENN

# summarize class distribution
counter = Counter(y_train)
print(counter)
# transform the dataset
oversample = SMOTEENN(random_state=42)
X_train_rel, y_train_rel = oversample.fit_resample(X_train, y_train)
# summarize the new class distribution
counter = Counter(y_train_rel)
print(counter)

fit_and_print(best_grid, X_train_rel, y_train_rel)

calculate_pred_and_inf_time(best_grid, X_test)

Counter({0: 114988, 1: 15577})
Counter({1: 79510, 0: 69097})
Confusion Matrix: 
 [[16255  3966]
 [  849  1971]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.95      0.80      0.87     20221
           1       0.33      0.70      0.45      2820

    accuracy                           0.79     23041
   macro avg       0.64      0.75      0.66     23041
weighted avg       0.87      0.79      0.82     23041

Accuracy:  0.791
Precision: 0.332
Recall: 0.699
f1:  0.45
Inference Time: 6.791 miliseconds
[CV 4/5; 2/36] START colsample_bytree=0.6, gamma=0.5, max_depth=3, subsample=0.8
[CV 4/5; 2/36] END colsample_bytree=0.6, gamma=0.5, max_depth=3, subsample=0.8;, score=0.895 total time=13.3min
[CV 1/5; 12/36] START colsample_bytree=0.6, gamma=1, max_depth=5, subsample=0.8.
[CV 1/5; 12/36] END colsample_bytree=0.6, gamma=1, max_depth=5, subsample=0.8;, score=0.894 total time=20.9min
[CV 1/5; 22/36] START colsample_bytree=0.8, gamma=0.5, max_