# Binary classifier: detects attacks (1) vs. benign activity (0), trained on the merged CIC-IDS2017 all-attacks dataset

In [1]:
import sys
import os

# The notebook lives one level below the project root; expose the root on
# sys.path so the project-local `utils` package can be imported.
PROJECT_ROOT = os.path.abspath("..")

# Guard the append so re-running this cell (or Run All on a live kernel) does
# not keep stacking duplicate entries onto sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to the project's utils/ code are picked up
# without restarting the kernel.
# (Keep comments on their own lines; text after a magic is passed to the magic.)
%load_ext autoreload
%autoreload 2

In [5]:
from utils.model_utils import split_data, train_model, evaluate_model, save_model
from utils import data_preparation

In [7]:
# Path of the CSV to load, relative to the notebook's working directory.
dataset_csv = '../datasets/cic-ids2017_allattacks_binary_clean.csv'

# load_dataset reports the source path and the resulting frame's shape when it runs.
data = data_preparation.load_dataset(dataset_csv)

Dataset loaded from: ../datasets/cic-ids2017_allattacks_binary_clean.csv — shape: (2574151, 9)


In [8]:
# Preview the first rows to sanity-check the feature columns and the 0/1 `label` column.
data.head()

Unnamed: 0,ack_flag,dst_port,fin_flag,flow_duration,psh_flag,syn_flag,bwd_pkts,fwd_pkts,label
0,1,54865,0,3,0,0,0,2,0
1,1,55054,0,109,0,0,1,1,0
2,1,55055,0,52,0,0,1,1,0
3,1,46236,0,34,0,0,1,1,0
4,1,54863,0,3,0,0,0,2,0


In [9]:
# Split the frame into train/test features and targets, with 'label' as the target column.
# NOTE(review): the split ratio and any random seed are decided inside
# utils.model_utils.split_data (the recorded reports show 772,246 test rows out of
# 2,574,151, i.e. roughly 30%) — confirm the split is seeded so metrics are reproducible.
X_train, X_test, y_train, y_test = split_data(data, 'label')

# Random Forest

In [20]:
# Fit the model that train_model builds for model_type='rf' on the training split.
# Hyperparameters are chosen inside utils.model_utils, not in this notebook.
rf_model = train_model(X_train, y_train, model_type='rf')

In [21]:
# Score the fitted model on the held-out split; evaluate_model prints a
# classification report, the confusion matrix and the ROC AUC.
evaluate_model(rf_model, X_test, y_test)

=== Classification Report ===
              precision    recall  f1-score   support

           0       0.99      0.99      0.99    644482
           1       0.97      0.97      0.97    127764

    accuracy                           0.99    772246
   macro avg       0.98      0.98      0.98    772246
weighted avg       0.99      0.99      0.99    772246

=== Confusion Matrix ===
[[641094   3388]
 [  3369 124395]]
ROC AUC: 0.9954


In [24]:
# Persist the fitted model under the given file name; save_model prints the
# destination and returns it (hence the echoed path in the cell output).
save_model(rf_model, name="rf_sharedcols_v1.pkl")

Model saved to: models/rf_sharedcols_v1.pkl


'models/rf_sharedcols_v1.pkl'

# XGBoost

In [7]:
# Fit the model that train_model builds for model_type='xgb' on the training split.
# NOTE(review): the execution counts recorded for this section (7-9) collide with the
# data-loading cells and precede the random-forest cells (20-24), so the stored outputs
# come from different kernel sessions — Restart & Run All before comparing models.
# NOTE(review): the recorded warning says "use_label_encoder" is not used by the
# installed XGBoost; the argument is presumably passed in utils.model_utils — confirm
# and drop it there.
xgb_model = train_model(X_train, y_train, model_type='xgb')

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [8]:
# Score the fitted model on the held-out split; evaluate_model prints a
# classification report, the confusion matrix and the ROC AUC.
evaluate_model(xgb_model, X_test, y_test)

=== Classification Report ===
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    644482
           1       0.98      0.97      0.98    127764

    accuracy                           0.99    772246
   macro avg       0.99      0.98      0.99    772246
weighted avg       0.99      0.99      0.99    772246

=== Confusion Matrix ===
[[642409   2073]
 [  3510 124254]]
ROC AUC: 0.9994


In [9]:
# Persist the fitted model under the given file name; save_model prints the
# destination and returns it.
# NOTE(review): the recorded output contains an absolute local path that includes a
# username — clear the cell outputs before sharing or committing the notebook.
save_model(xgb_model, name="xgb_sharedcols_v1.pkl")

Model saved to: C:\Users\kkita\Projects\cyberthreat-ml-analysis\utils\..\models\xgb_sharedcols_v1.pkl


'C:\\Users\\kkita\\Projects\\cyberthreat-ml-analysis\\utils\\..\\models\\xgb_sharedcols_v1.pkl'

# Logistic Regression

In [11]:
# Fit the model that train_model builds for model_type='lr' on the training split.
# NOTE(review): the recorded evaluation shows recall of only 0.40 on the attack class.
# If train_model does not standardise features for 'lr', the very different scales of
# columns such as dst_port and flow_duration may be hurting it — confirm in
# utils.model_utils.
lr_model = train_model(X_train, y_train, model_type='lr')

In [12]:
# Score the fitted model on the held-out split; evaluate_model prints a
# classification report, the confusion matrix and the ROC AUC.
evaluate_model(lr_model, X_test, y_test)

=== Classification Report ===
              precision    recall  f1-score   support

           0       0.89      0.98      0.93    644482
           1       0.77      0.40      0.52    127764

    accuracy                           0.88    772246
   macro avg       0.83      0.69      0.73    772246
weighted avg       0.87      0.88      0.86    772246

=== Confusion Matrix ===
[[628971  15511]
 [ 76970  50794]]
ROC AUC: 0.7338


In [14]:
# Persist the fitted model under the given file name; save_model prints the
# destination and returns it.
# NOTE(review): the recorded output contains an absolute local path that includes a
# username — clear the cell outputs before sharing or committing the notebook.
save_model(lr_model, name="lr_sharedcols_v1.pkl")

Model saved to: C:\Users\kkita\Projects\cyberthreat-ml-analysis\utils\..\models\lr_sharedcols_v1.pkl


'C:\\Users\\kkita\\Projects\\cyberthreat-ml-analysis\\utils\\..\\models\\lr_sharedcols_v1.pkl'

# Support Vector Machine (work in progress; the cells below have not been run)

In [None]:
# Fit the model that train_model builds for model_type='svm' on the training split.
# NOTE(review): this cell has no recorded execution. The training split is large
# (the recorded reports show 772,246 rows in the test split alone), and kernel SVMs
# typically scale poorly with sample count — confirm what train_model uses for 'svm'
# and consider a linear SVM or a subsample before running.
svm_model = train_model(X_train, y_train, model_type='svm')

In [None]:
# Score the fitted SVM on the held-out split (not yet executed; depends on the
# training cell for `svm_model`).
evaluate_model(svm_model, X_test, y_test)

In [None]:
# Persist the fitted SVM under the given file name (not yet executed; depends on the
# training cell for `svm_model`).
save_model(svm_model, name="svm_sharedcols_v1.pkl")

In [None]:
# NOTE(review): leftover scratch cell — it only prints an empty line and can be deleted.
print()