# Random Forest

## Libraries

In [18]:
from sklearnex import patch_sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neural_network import MLPClassifier
# Import order is kept as-is on purpose: patch_sklearn() runs before the estimator imports below.
patch_sklearn()
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import time
from collections import Counter
from imblearn.over_sampling import SMOTE, ADASYN, SMOTENC
from imblearn.combine import SMOTETomek, SMOTEENN
import matplotlib.pyplot as plt
from numpy import where

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [19]:
# Load the separate train and test tables.
df_train = pd.read_csv('../data/df_train.csv')
df_test = pd.read_csv('../data/df_test.csv')

# 'kill' is the label column; every remaining column is used as a feature.
# Features and labels are converted straight to NumPy arrays.
X_train = df_train.drop(columns='kill').values
y_train = df_train['kill'].values
X_test = df_test.drop(columns=['kill']).values
y_test = df_test['kill'].values

In [20]:
def fit_and_print(model, X_train, y_train, X_eval=None, y_eval=None):
    """Fit ``model`` and print its classification metrics on an evaluation set.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place,
        so any previous fit held by the same object is replaced.
    X_train, y_train : array-like
        Training features and labels passed to ``model.fit``.
    X_eval, y_eval : array-like, optional
        Features and true labels to evaluate on. When both are omitted the
        notebook-level ``X_test`` / ``y_test`` are used, which keeps existing
        three-argument calls working unchanged.

    Returns
    -------
    None
        Results are printed only.

    Raises
    ------
    ValueError
        If only one of ``X_eval`` / ``y_eval`` is supplied.
    """
    # A half-specified pair would score predictions against labels from a different set.
    if (X_eval is None) != (y_eval is None):
        raise ValueError("X_eval and y_eval must be provided together")

    # Resolve the evaluation data before fitting so a missing global fails
    # immediately instead of after a potentially slow fit.
    if X_eval is None:
        X_eval, y_eval = X_test, y_test

    model.fit(X_train, y_train)
    y_pred = model.predict(X_eval)

    print("Confusion Matrix: \n", confusion_matrix(y_eval, y_pred))
    print("Classification Report: \n", classification_report(y_eval, y_pred))
    # Headline metrics rounded to three decimals.
    print("Accuracy: ", round(accuracy_score(y_eval, y_pred),3))
    print("Precision:", round(precision_score(y_eval, y_pred),3))
    print("Recall:", round(recall_score(y_eval, y_pred),3))
    print("f1: ", round(f1_score(y_eval, y_pred),3))

In [21]:
# Baseline: a random forest with library-default hyperparameters and a fixed seed.
model = RandomForestClassifier(random_state=42)
# NOTE(review): fit_and_print() fits whatever model it is given, so passing `model`
# to it repeats this fit; as the cell's last expression, this call also displays the estimator.
model.fit(X_train,y_train)

RandomForestClassifier(random_state=42)

In [22]:
# Fit the baseline forest on the training data and print its test-set metrics.
fit_and_print(model, X_train, y_train)

Confusion Matrix: 
 [[19836   385]
 [ 2074   746]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.91      0.98      0.94     20221
           1       0.66      0.26      0.38      2820

    accuracy                           0.89     23041
   macro avg       0.78      0.62      0.66     23041
weighted avg       0.88      0.89      0.87     23041

Accuracy:  0.893
Precision: 0.66
Recall: 0.265
f1:  0.378


## GridSearch

In [23]:
# Search space for the random forest: 1 x 9 x 3 x 3 = 81 combinations.
param_grid = dict(
    bootstrap=[True],
    max_features=list(range(2, 11)),  # 2 through 10 inclusive
    min_samples_leaf=[3, 5, 8],
    min_samples_split=[4, 8, 12],
)

In [24]:
# Base estimator whose hyperparameters are tuned; the seed is fixed.
rf = RandomForestClassifier(random_state=42)

# Exhaustive search over param_grid with 5-fold cross-validation.
# n_jobs=-1 runs the fits in parallel; verbose=10 logs each individual fit.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=10,
)

# Run the search on the training data.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'bootstrap': [True],
                         'max_features': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'min_samples_leaf': [3, 5, 8],
                         'min_samples_split': [4, 8, 12]},
             verbose=10)

In [25]:
# Hyperparameter combination selected by the grid search.
grid_search.best_params_

{'bootstrap': True,
 'max_features': 5,
 'min_samples_leaf': 8,
 'min_samples_split': 4}

In [26]:
# Keep a handle on the search's best estimator. This is a reference to the same
# object as grid_search.best_estimator_, not a copy, so refitting one refits the other.
best_grid = grid_search.best_estimator_
best_grid

RandomForestClassifier(max_features=5, min_samples_leaf=8, min_samples_split=4,
                       random_state=42)

In [27]:
# Cross-validated score of the selected combination. No `scoring` was passed to
# GridSearchCV, so this is the estimator's default score (accuracy for classifiers).
grid_search.best_score_

0.89670279171294

In [28]:
# Time one prediction pass of the tuned model over the test set.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() follows the system clock and can jump if it is adjusted.
st_wall_inf = time.perf_counter()

# Predict on the test set
grid_predictions = best_grid.predict(X_test)

et_wall_inf = time.perf_counter()

# Elapsed time in seconds
wall_time_inf = et_wall_inf - st_wall_inf
print(f'Inference Time: {wall_time_inf:.3f} seconds')

Inference Time: 0.061 seconds


In [29]:
# Fit the tuned estimator on the original training data and print its test-set metrics.
fit_and_print(best_grid,X_train,y_train)

Confusion Matrix: 
 [[19882   339]
 [ 2081   739]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.91      0.98      0.94     20221
           1       0.69      0.26      0.38      2820

    accuracy                           0.89     23041
   macro avg       0.80      0.62      0.66     23041
weighted avg       0.88      0.89      0.87     23041

Accuracy:  0.895
Precision: 0.686
Recall: 0.262
f1:  0.379


In [30]:
def calculate_pred_and_inf_time(best_grid, X_test):
    """Time a single ``predict`` call and print the elapsed time in milliseconds.

    Parameters
    ----------
    best_grid : object with a ``predict(X)`` method
        Model whose inference time is measured; expected to be fitted already.
    X_test : array-like
        Samples passed to ``best_grid.predict`` in one call.

    Returns
    -------
    None
        The predictions are discarded; only the timing line is printed.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for measuring
    # short intervals; time.time() follows the system clock and can jump if it is adjusted.
    start = time.perf_counter()
    best_grid.predict(X_test)
    elapsed = time.perf_counter() - start

    # Seconds -> milliseconds. The message text is unchanged so previously
    # recorded output stays directly comparable.
    print(f'Inference Time: {1000*elapsed:.3f} miliseconds')

# Time a single prediction pass of the tuned model over the test set.
calculate_pred_and_inf_time(best_grid, X_test)

Inference Time: 160.834 miliseconds


## Resampling

### SMOTE

In [31]:
# Oversample the training set with SMOTE, then fit and evaluate on the test set

# summarize class distribution
counter = Counter(y_train)
print(counter)
# transform the dataset
oversample = SMOTE(random_state=42)
X_train_rel, y_train_rel = oversample.fit_resample(X_train, y_train)
# summarize the new class distribution
counter = Counter(y_train_rel)
print(counter)

# fit_and_print() fits the estimator it receives in place. Train an unfitted clone
# with the tuned hyperparameters so best_grid (the grid-search estimator) is not overwritten.
smote_model = clone(best_grid)
fit_and_print(smote_model, X_train_rel, y_train_rel)

calculate_pred_and_inf_time(smote_model, X_test)

Counter({0: 114988, 1: 15577})
Counter({0: 114988, 1: 114988})
Confusion Matrix: 
 [[18123  2098]
 [ 1330  1490]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.93      0.90      0.91     20221
           1       0.42      0.53      0.47      2820

    accuracy                           0.85     23041
   macro avg       0.67      0.71      0.69     23041
weighted avg       0.87      0.85      0.86     23041

Accuracy:  0.851
Precision: 0.415
Recall: 0.528
f1:  0.465
Inference Time: 108.346 miliseconds


### ADASYN

In [32]:
# Oversample the training set with ADASYN, then fit and evaluate on the test set

# summarize class distribution
counter = Counter(y_train)
print(counter)
# transform the dataset
oversample = ADASYN(random_state=42)
X_train_rel, y_train_rel = oversample.fit_resample(X_train, y_train)
# summarize the new class distribution
counter = Counter(y_train_rel)
print(counter)

# fit_and_print() fits the estimator it receives in place. Train an unfitted clone
# with the tuned hyperparameters so best_grid (the grid-search estimator) is not overwritten.
adasyn_model = clone(best_grid)
fit_and_print(adasyn_model, X_train_rel, y_train_rel)

calculate_pred_and_inf_time(adasyn_model, X_test)

Counter({0: 114988, 1: 15577})
Counter({0: 114988, 1: 112141})
Confusion Matrix: 
 [[17902  2319]
 [ 1267  1553]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.93      0.89      0.91     20221
           1       0.40      0.55      0.46      2820

    accuracy                           0.84     23041
   macro avg       0.67      0.72      0.69     23041
weighted avg       0.87      0.84      0.85     23041

Accuracy:  0.844
Precision: 0.401
Recall: 0.551
f1:  0.464
Inference Time: 175.762 miliseconds


### SMOTE and Tomek Links (TL)

In [33]:
# Resample the training set with SMOTE + Tomek links, then fit and evaluate on the test set

# summarize class distribution
counter = Counter(y_train)
print(counter)
# transform the dataset
oversample = SMOTETomek(random_state=42)
X_train_rel, y_train_rel = oversample.fit_resample(X_train, y_train)
# summarize the new class distribution
counter = Counter(y_train_rel)
print(counter)

# fit_and_print() fits the estimator it receives in place. Train an unfitted clone
# with the tuned hyperparameters so best_grid (the grid-search estimator) is not overwritten.
smote_tomek_model = clone(best_grid)
fit_and_print(smote_tomek_model, X_train_rel, y_train_rel)

calculate_pred_and_inf_time(smote_tomek_model, X_test)

Counter({0: 114988, 1: 15577})
Counter({0: 106266, 1: 106266})
Confusion Matrix: 
 [[18021  2200]
 [ 1306  1514]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.93      0.89      0.91     20221
           1       0.41      0.54      0.46      2820

    accuracy                           0.85     23041
   macro avg       0.67      0.71      0.69     23041
weighted avg       0.87      0.85      0.86     23041

Accuracy:  0.848
Precision: 0.408
Recall: 0.537
f1:  0.463
Inference Time: 350.045 miliseconds


### SMOTE and Edited Nearest Neighbours (ENN)

In [34]:
# Resample the training set with SMOTE + ENN, then fit and evaluate on the test set

# summarize class distribution
counter = Counter(y_train)
print(counter)
# transform the dataset
oversample = SMOTEENN(random_state=42)
X_train_rel, y_train_rel = oversample.fit_resample(X_train, y_train)
# summarize the new class distribution
counter = Counter(y_train_rel)
print(counter)

# fit_and_print() fits the estimator it receives in place. Train an unfitted clone
# with the tuned hyperparameters so best_grid (the grid-search estimator) is not overwritten.
smote_enn_model = clone(best_grid)
fit_and_print(smote_enn_model, X_train_rel, y_train_rel)

calculate_pred_and_inf_time(smote_enn_model, X_test)

Counter({0: 114988, 1: 15577})
Counter({1: 79510, 0: 69097})
Confusion Matrix: 
 [[16759  3462]
 [  947  1873]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.95      0.83      0.88     20221
           1       0.35      0.66      0.46      2820

    accuracy                           0.81     23041
   macro avg       0.65      0.75      0.67     23041
weighted avg       0.87      0.81      0.83     23041

Accuracy:  0.809
Precision: 0.351
Recall: 0.664
f1:  0.459
Inference Time: 35.179 miliseconds
[CV 3/5; 7/81] START bootstrap=True, max_features=2, min_samples_leaf=8, min_samples_split=4
[CV 3/5; 7/81] END bootstrap=True, max_features=2, min_samples_leaf=8, min_samples_split=4;, score=0.896 total time=  14.8s
[CV 5/5; 9/81] START bootstrap=True, max_features=2, min_samples_leaf=8, min_samples_split=12
[CV 5/5; 9/81] END bootstrap=True, max_features=2, min_samples_leaf=8, min_samples_split=12;, score=0.896 total time=  15.2s
[CV 5/