# Naive Bayes

## Libraries

In [25]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from collections import Counter
from imblearn.over_sampling import SMOTE, ADASYN, SMOTENC
from imblearn.combine import SMOTETomek, SMOTEENN 
import matplotlib.pyplot as plt
from numpy import where
import time

## Read the data from csv

In [26]:
# Load the pre-split training and test tables.
df_train = pd.read_csv('../data/df_train.csv')
df_test = pd.read_csv('../data/df_test.csv')

# Separate the 'kill' target column from the remaining feature columns and
# hand plain NumPy arrays to the estimators.
X_train = df_train.drop('kill', axis=1).values
y_train = df_train['kill'].values
X_test = df_test.drop(['kill'], axis=1).values
y_test = df_test['kill'].values

In [27]:
# Standardise the features. The scaler's statistics are learned from the
# training split only and then re-used, unchanged, on the test split.
# (MinMaxScaler is imported above as an alternative scaler.)
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Training and fitting a Gaussian Naive Bayes model on the training set.

In [28]:
# Baseline: Gaussian Naive Bayes with default hyper-parameters,
# trained on the scaled training split and applied to the test split.
model = GaussianNB().fit(X_train, y_train)
predictions = model.predict(X_test)

In [29]:
def fit_and_print(model, X_train, y_train, X_eval=None, y_eval=None):
    """Fit ``model`` on the training data and print its evaluation metrics.

    The estimator is fitted in place via ``model.fit``, so the object passed
    in is modified by this call. Predictions are then made on the evaluation
    features and compared with the evaluation labels.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Estimator to (re)fit and evaluate.
    X_train, y_train :
        Features and labels forwarded to ``model.fit``.
    X_eval, y_eval : optional
        Features and labels to evaluate on. When both are omitted the
        notebook-level ``X_test`` / ``y_test`` are used, which is what this
        helper always did before these parameters existed. They must be
        supplied together.

    Raises
    ------
    ValueError
        If only one of ``X_eval`` / ``y_eval`` is supplied.

    Returns
    -------
    None
        Everything is printed: confusion matrix, classification report, and
        accuracy, precision, recall and f1 rounded to three decimals.
    """
    if (X_eval is None) != (y_eval is None):
        raise ValueError("X_eval and y_eval must be provided together")
    if X_eval is None:
        # Backward-compatible default: evaluate on the notebook's test split.
        X_eval, y_eval = X_test, y_test

    model.fit(X_train, y_train)
    y_pred = model.predict(X_eval)
    print("Confusion Matrix: \n", confusion_matrix(y_eval, y_pred))
    print("Classification Report: \n", classification_report(y_eval, y_pred))
    print("Accuracy: ", round(accuracy_score(y_eval, y_pred),3))
    print("Precision:", round(precision_score(y_eval, y_pred),3))
    print("Recall:", round(recall_score(y_eval, y_pred),3))
    print("f1: ", round(f1_score(y_eval, y_pred),3))

In [30]:
# Refit the baseline model (it was already fitted above) and print its test-set metrics.
fit_and_print(model,X_train,y_train)

Confusion Matrix: 
 [[20087   134]
 [ 2543   277]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.89      0.99      0.94     20221
           1       0.67      0.10      0.17      2820

    accuracy                           0.88     23041
   macro avg       0.78      0.55      0.55     23041
weighted avg       0.86      0.88      0.84     23041

Accuracy:  0.884
Precision: 0.674
Recall: 0.098
f1:  0.171


### Classification Report

Scikit-learn provides a convenience report for classification problems that gives a quick idea of a model's performance across several measures.

The classification_report() function displays the precision, recall, f1-score and support for each class.

In [31]:
# Per-class precision / recall / f1 / support for the baseline predictions made earlier.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.89      0.99      0.94     20221
           1       0.67      0.10      0.17      2820

    accuracy                           0.88     23041
   macro avg       0.78      0.55      0.55     23041
weighted avg       0.86      0.88      0.84     23041



## GridSearch

In [32]:
# Preview the candidate values: 100 points spaced evenly on a log scale from 1e0 down to 1e-9.
np.logspace(0,-9, num=100)

array([1.00000000e+00, 8.11130831e-01, 6.57933225e-01, 5.33669923e-01,
       4.32876128e-01, 3.51119173e-01, 2.84803587e-01, 2.31012970e-01,
       1.87381742e-01, 1.51991108e-01, 1.23284674e-01, 1.00000000e-01,
       8.11130831e-02, 6.57933225e-02, 5.33669923e-02, 4.32876128e-02,
       3.51119173e-02, 2.84803587e-02, 2.31012970e-02, 1.87381742e-02,
       1.51991108e-02, 1.23284674e-02, 1.00000000e-02, 8.11130831e-03,
       6.57933225e-03, 5.33669923e-03, 4.32876128e-03, 3.51119173e-03,
       2.84803587e-03, 2.31012970e-03, 1.87381742e-03, 1.51991108e-03,
       1.23284674e-03, 1.00000000e-03, 8.11130831e-04, 6.57933225e-04,
       5.33669923e-04, 4.32876128e-04, 3.51119173e-04, 2.84803587e-04,
       2.31012970e-04, 1.87381742e-04, 1.51991108e-04, 1.23284674e-04,
       1.00000000e-04, 8.11130831e-05, 6.57933225e-05, 5.33669923e-05,
       4.32876128e-05, 3.51119173e-05, 2.84803587e-05, 2.31012970e-05,
       1.87381742e-05, 1.51991108e-05, 1.23284674e-05, 1.00000000e-05,
      

In [33]:
# Search space for GaussianNB: 100 log-spaced var_smoothing values in [1e-9, 1].
param_grid = dict(var_smoothing=np.logspace(0, -9, num=100))

In [34]:
# 5-fold cross-validated grid search over var_smoothing, using every CPU core.
# No `scoring` is passed, so the estimator's default scorer (accuracy for a
# classifier) decides which candidate wins.
# NOTE(review): the training labels are heavily imbalanced (about 12% positives,
# see the Counter output in the resampling section), so accuracy may not be the
# right selection criterion; consider scoring='f1' or 'recall' and confirm.
grid = GridSearchCV(
    estimator=GaussianNB(),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [35]:
# Run the exhaustive search: 100 candidates x 5 folds = 500 fits, so this may take a while.
grid.fit(X_train,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


GridSearchCV(cv=5, estimator=GaussianNB(), n_jobs=-1,
             param_grid={'var_smoothing': array([1.00000000e+00, 8.11130831e-01, 6.57933225e-01, 5.33669923e-01,
       4.32876128e-01, 3.51119173e-01, 2.84803587e-01, 2.31012970e-01,
       1.87381742e-01, 1.51991108e-01, 1.23284674e-01, 1.00000000e-01,
       8.11130831e-02, 6.57933225e-02, 5.33669923e-02, 4.32876128e-02,
       3.51119173e-02, 2.8480358...
       1.23284674e-07, 1.00000000e-07, 8.11130831e-08, 6.57933225e-08,
       5.33669923e-08, 4.32876128e-08, 3.51119173e-08, 2.84803587e-08,
       2.31012970e-08, 1.87381742e-08, 1.51991108e-08, 1.23284674e-08,
       1.00000000e-08, 8.11130831e-09, 6.57933225e-09, 5.33669923e-09,
       4.32876128e-09, 3.51119173e-09, 2.84803587e-09, 2.31012970e-09,
       1.87381742e-09, 1.51991108e-09, 1.23284674e-09, 1.00000000e-09])},
             verbose=2)

In [36]:
# Parameter setting that obtained the best mean cross-validation score.
grid.best_params_

{'var_smoothing': 0.0023101297000831605}

In [37]:
# Keep a handle on the winning estimator. This is the same object the search
# holds as best_estimator_, not a copy, so refitting it later also changes
# what `grid` predicts with.
best_grid = grid.best_estimator_
best_grid

GaussianNB(var_smoothing=0.0023101297000831605)

In [38]:
# Mean cross-validated score of the best candidate (default scorer, since no `scoring` was passed).
grid.best_score_

0.8860567533412477

Then you can re-run predictions on this grid object just like you would with a normal model.

In [39]:
# Predicting through the search object delegates to its best estimator.
grid_predictions = grid.predict(X_test)

In [40]:
# Confusion matrix of the tuned model on the test split.
print(confusion_matrix(y_test,grid_predictions))

[[20089   132]
 [ 2550   270]]


In [41]:
# Per-class precision / recall / f1 / support of the tuned model on the test split.
print(classification_report(y_test,grid_predictions))

              precision    recall  f1-score   support

           0       0.89      0.99      0.94     20221
           1       0.67      0.10      0.17      2820

    accuracy                           0.88     23041
   macro avg       0.78      0.54      0.55     23041
weighted avg       0.86      0.88      0.84     23041



In [42]:
# NOTE: this cell re-declares the fit_and_print helper that was already defined
# earlier in the notebook; from here on this later definition is the one in effect.
def fit_and_print(model, X_train, y_train, X_eval=None, y_eval=None):
    """Fit ``model`` on the training data and print its evaluation metrics.

    The estimator is fitted in place via ``model.fit``, so the object passed
    in is modified by this call. Predictions are then made on the evaluation
    features and compared with the evaluation labels.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Estimator to (re)fit and evaluate.
    X_train, y_train :
        Features and labels forwarded to ``model.fit``.
    X_eval, y_eval : optional
        Features and labels to evaluate on. When both are omitted the
        notebook-level ``X_test`` / ``y_test`` are used, which is what this
        helper always did before these parameters existed. They must be
        supplied together.

    Raises
    ------
    ValueError
        If only one of ``X_eval`` / ``y_eval`` is supplied.

    Returns
    -------
    None
        Everything is printed: confusion matrix, classification report, and
        accuracy, precision, recall and f1 rounded to three decimals.
    """
    if (X_eval is None) != (y_eval is None):
        raise ValueError("X_eval and y_eval must be provided together")
    if X_eval is None:
        # Backward-compatible default: evaluate on the notebook's test split.
        X_eval, y_eval = X_test, y_test

    model.fit(X_train, y_train)
    y_pred = model.predict(X_eval)
    print("Confusion Matrix: \n", confusion_matrix(y_eval, y_pred))
    print("Classification Report: \n", classification_report(y_eval, y_pred))
    print("Accuracy: ", round(accuracy_score(y_eval, y_pred),3))
    print("Precision:", round(precision_score(y_eval, y_pred),3))
    print("Recall:", round(recall_score(y_eval, y_pred),3))
    print("f1: ", round(f1_score(y_eval, y_pred),3))

In [43]:
# Refit the tuned estimator on the training split and print its test-set metrics.
fit_and_print(best_grid,X_train,y_train)

Confusion Matrix: 
 [[20089   132]
 [ 2550   270]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.89      0.99      0.94     20221
           1       0.67      0.10      0.17      2820

    accuracy                           0.88     23041
   macro avg       0.78      0.54      0.55     23041
weighted avg       0.86      0.88      0.84     23041

Accuracy:  0.884
Precision: 0.672
Recall: 0.096
f1:  0.168


In [44]:
def calculate_pred_and_inf_time(best_grid, X_test):
    """Time one ``best_grid.predict(X_test)`` call and print it in milliseconds.

    Parameters
    ----------
    best_grid : object exposing ``predict(X)``
        Already-fitted estimator whose inference time is measured.
    X_test :
        Samples passed to ``predict``; the predictions themselves are discarded.

    Returns
    -------
    None
        The elapsed time is printed, not returned.
    """
    # perf_counter() is monotonic and has the highest available resolution.
    # time.time() follows the system clock, which can be adjusted mid-run and
    # is too coarse on some platforms for a measurement of a few milliseconds.
    st_wall_inf = time.perf_counter()

    # Only the duration matters here, so the predictions are not kept.
    best_grid.predict(X_test)

    et_wall_inf = time.perf_counter()

    wall_time_inf = et_wall_inf - st_wall_inf
    print(f'Inference Time: {1000*wall_time_inf:.3f} miliseconds')

# Measure how long the tuned estimator takes to predict the whole test split.
calculate_pred_and_inf_time(best_grid, X_test)

Inference Time: 4.193 miliseconds


## Resampling

### SMOTE

In [45]:
# Oversample the imbalanced training set with SMOTE, then refit and evaluate.
# (No plot is produced in this cell.)

# summarize class distribution before resampling
counter = Counter(y_train)
print(counter)
# resample the training split only; the test split is left untouched
oversample = SMOTE(random_state=42)
X_train_rel, y_train_rel = oversample.fit_resample(X_train, y_train)
# summarize the new class distribution
counter = Counter(y_train_rel)
print(counter)

# NOTE: fit_and_print calls model.fit, so best_grid is refitted in place on the
# resampled data and no longer holds the model trained on the original split.
fit_and_print(best_grid, X_train_rel, y_train_rel)

# time inference of the estimator that was just refitted
calculate_pred_and_inf_time(best_grid, X_test)

Counter({0: 114988, 1: 15577})
Counter({0: 114988, 1: 114988})
Confusion Matrix: 
 [[12825  7396]
 [  873  1947]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.63      0.76     20221
           1       0.21      0.69      0.32      2820

    accuracy                           0.64     23041
   macro avg       0.57      0.66      0.54     23041
weighted avg       0.85      0.64      0.70     23041

Accuracy:  0.641
Precision: 0.208
Recall: 0.69
f1:  0.32
Inference Time: 4.999 miliseconds


### ADASYN

In [46]:
# Oversample the imbalanced training set with ADASYN, then refit and evaluate.
# (No plot is produced in this cell.)

# summarize class distribution before resampling
counter = Counter(y_train)
print(counter)
# resample the training split only; the test split is left untouched
oversample = ADASYN(random_state=42)
X_train_rel, y_train_rel = oversample.fit_resample(X_train, y_train)
# summarize the new class distribution (ADASYN's result need not be exactly balanced)
counter = Counter(y_train_rel)
print(counter)

# NOTE: fit_and_print calls model.fit, so best_grid is refitted in place on the
# resampled data and no longer holds the model trained on the original split.
fit_and_print(best_grid, X_train_rel, y_train_rel)

# time inference of the estimator that was just refitted
calculate_pred_and_inf_time(best_grid, X_test)

Counter({0: 114988, 1: 15577})
Counter({1: 119427, 0: 114988})
Confusion Matrix: 
 [[11735  8486]
 [  745  2075]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.58      0.72     20221
           1       0.20      0.74      0.31      2820

    accuracy                           0.60     23041
   macro avg       0.57      0.66      0.51     23041
weighted avg       0.85      0.60      0.67     23041

Accuracy:  0.599
Precision: 0.196
Recall: 0.736
f1:  0.31
Inference Time: 3.270 miliseconds


### SMOTE and Tomek Links (TL)

In [47]:
# Resample the imbalanced training set with SMOTETomek (SMOTE combined with
# Tomek-link cleaning), then refit and evaluate. (No plot is produced in this cell.)

# summarize class distribution before resampling
counter = Counter(y_train)
print(counter)
# resample the training split only; the test split is left untouched
# (`oversample` is kept as the variable name although this is a combined sampler)
oversample = SMOTETomek(random_state=42)
X_train_rel, y_train_rel = oversample.fit_resample(X_train, y_train)
# summarize the new class distribution
counter = Counter(y_train_rel)
print(counter)

# NOTE: fit_and_print calls model.fit, so best_grid is refitted in place on the
# resampled data and no longer holds the model trained on the original split.
fit_and_print(best_grid, X_train_rel, y_train_rel)

# time inference of the estimator that was just refitted
calculate_pred_and_inf_time(best_grid, X_test)

Counter({0: 114988, 1: 15577})
Counter({0: 111748, 1: 111748})
Confusion Matrix: 
 [[12822  7399]
 [  872  1948]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.63      0.76     20221
           1       0.21      0.69      0.32      2820

    accuracy                           0.64     23041
   macro avg       0.57      0.66      0.54     23041
weighted avg       0.85      0.64      0.70     23041

Accuracy:  0.641
Precision: 0.208
Recall: 0.691
f1:  0.32
Inference Time: 3.305 miliseconds


### SMOTE and Edited Nearest Neighbours (ENN)

In [48]:
# Resample the imbalanced training set with SMOTEENN (SMOTE combined with
# Edited Nearest Neighbours cleaning), then refit and evaluate.
# (No plot is produced in this cell.)

# summarize class distribution before resampling
counter = Counter(y_train)
print(counter)
# resample the training split only; the test split is left untouched
# (`oversample` is kept as the variable name although this is a combined sampler)
oversample = SMOTEENN(random_state=42)
X_train_rel, y_train_rel = oversample.fit_resample(X_train, y_train)
# summarize the new class distribution (the cleaning step can leave the classes unequal)
counter = Counter(y_train_rel)
print(counter)

# NOTE: fit_and_print calls model.fit, so best_grid is refitted in place on the
# resampled data and no longer holds the model trained on the original split.
fit_and_print(best_grid, X_train_rel, y_train_rel)

# time inference of the estimator that was just refitted
calculate_pred_and_inf_time(best_grid, X_test)

Counter({0: 114988, 1: 15577})
Counter({1: 96382, 0: 81992})
Confusion Matrix: 
 [[11504  8717]
 [  726  2094]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.57      0.71     20221
           1       0.19      0.74      0.31      2820

    accuracy                           0.59     23041
   macro avg       0.57      0.66      0.51     23041
weighted avg       0.85      0.59      0.66     23041

Accuracy:  0.59
Precision: 0.194
Recall: 0.743
f1:  0.307
Inference Time: 3.218 miliseconds
[CV] END ..................var_smoothing=0.23101297000831597; total time=   1.0s
[CV] END .................................var_smoothing=0.01; total time=   0.5s
[CV] END ................var_smoothing=0.0015199110829529332; total time=   0.7s
[CV] END ................var_smoothing=0.0003511191734215131; total time=   0.6s
[CV] END ................var_smoothing=6.579332246575683e-05; total time=   0.5s
[CV] END ...............var_smoothing=1