In [95]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [96]:
data = pd.read_csv('eventmatrixlabel.csv')

In [97]:
data.shape

(575061, 50)

In [98]:
# Changing Anomaly = 1, Normal = 0
data['Label'] = pd.factorize(data['Label'])[0]

In [99]:
# Counting % of anomalies
np.round(np.mean(data['Label']),5)*100

2.928

Only 3% of our rows are Anomaly, we thus have a very unbalanced dataset. In order to take this into account, we must use appropriate metrics to evaluate the performance of our models. 

In [100]:
# Keep in mind that accuracy will be high even if does not identify anomaly properly (because of unbalanced data)
# true: Anomaly, false: Normal

# precision : Of those predicted true, proportion that are  true
# Recall/Sensitivity : Of actual trues, proportion predicted true
# Specificity : Of actual falses, proportion predicted false --> not important to us

metrics = {'balanced_accuracy': metrics.make_scorer(metrics.balanced_accuracy_score), 
           'precision': metrics.make_scorer(metrics.precision_score),
          'recall' : metrics.make_scorer(metrics.recall_score),
          'f1' : metrics.make_scorer(metrics.f1_score),
          'log-loss' : metrics.make_scorer(metrics.log_loss)}

In [101]:
# Splitting the dependent and independent variables
y = data[['Label']]
x = data.drop(['Label', 'BlockId'], axis=1)

In [102]:
x.head()

Unnamed: 0,0567184d,06d16156,09a53393,0f86472a,124068c6,13eb7010,234302e6,2e68ccc3,2ecc047e,2f85639c,...,d63ef163,d6b7b743,dba996ef,e024fa48,e3df2680,f0d1ff15,f266840a,f79898ae,fcd37a6d,ff00cd08
0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0


In [103]:
y.head()

Unnamed: 0,Label
0,0
1,0
2,0
3,0
4,0


In [104]:
# Splitting data into training and testing set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state = 2021)

In [105]:
print(f"% of anomalies in training set: {np.round(np.mean(y_train['Label'])*100,3)}")
print(f"% of anomalies in testing set: {np.round(np.mean(y_test['Label'])*100,3)}")

% of anomalies in training set: 2.934
% of anomalies in testing set: 2.906


### Default Random Forest

In [106]:
# Lets fit a rf with default parameters
rf_default = RandomForestClassifier(random_state = 2021)

In [107]:
# Define evaluation procedure
cv_rf_default = cross_validate(rf_default, X = x_train, y = y_train,scoring = metrics, n_jobs = -1, cv = 5, verbose = 1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   23.4s finished


In [108]:
pd.DataFrame.from_dict(cv_rf_default).set_axis(['Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5'], axis=0)

Unnamed: 0,fit_time,score_time,test_balanced_accuracy,test_precision,test_recall,test_f1,test_log-loss
Fold_1,21.208864,0.890105,0.998658,0.997037,0.997406,0.997222,0.005631
Fold_2,21.080899,0.93504,0.999371,0.995201,0.998888,0.997041,0.006006
Fold_3,20.350469,0.94946,0.999764,0.996677,0.99963,0.998151,0.003754
Fold_4,20.586932,0.977003,0.999225,0.997779,0.998518,0.998148,0.003754
Fold_5,20.66993,0.88399,0.999809,0.999629,0.999629,0.999629,0.000751


In [109]:
# Average of all folds
pd.DataFrame.from_dict(cv_rf_default).set_axis(['Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5'], axis=0).mean(axis = 0)

fit_time                  20.779419
score_time                 0.927119
test_balanced_accuracy     0.999366
test_precision             0.997265
test_recall                0.998814
test_f1                    0.998038
test_log-loss              0.003979
dtype: float64

### Random Forest with Bootstrap Class Weighting

In [82]:
# Lets place a heavier penalty on misclassifying the minority class - Anomaly, using Random Forest With Bootstrap Class Weighting
rf = RandomForestClassifier(n_estimators = 100, class_weight='balanced_subsample', random_state = 2021)

In [83]:
# Define evaluation procedure
cv_rf = cross_validate(rf, X = x_train, y = y_train,scoring = metrics, n_jobs = -1, cv = 5, verbose = 1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   33.3s finished


In [88]:
pd.DataFrame.from_dict(cv_rf).set_axis(['Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5'], axis=0)

Unnamed: 0,fit_time,score_time,test_balanced_accuracy,test_precision,test_recall,test_f1,test_log-loss
Fold_1,30.957188,0.893914,0.998658,0.997037,0.997406,0.997222,0.005631
Fold_2,30.527564,1.068531,0.999371,0.995201,0.998888,0.997041,0.006006
Fold_3,30.79019,0.971906,0.999955,0.997046,1.0,0.998521,0.003003
Fold_4,30.60748,1.02153,0.999405,0.99741,0.998888,0.998149,0.003754
Fold_5,30.854199,0.944896,0.999809,0.999629,0.999629,0.999629,0.000751


In [89]:
# Average of all folds
pd.DataFrame.from_dict(cv_rf).set_axis(['Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5'], axis=0).mean(axis = 0)

fit_time                  30.747324
score_time                 0.980155
test_balanced_accuracy     0.999440
test_precision             0.997265
test_recall                0.998963
test_f1                    0.998112
test_log-loss              0.003829
dtype: float64

## Reference
https://machinelearningmastery.com/bagging-and-random-forest-for-imbalanced-classification/