In [1]:
import re

import joblib
import pandas as pd
import xgboost as xgb
from sklearn import ensemble, metrics, model_selection, tree

from ads_helpers import encode_labels, txt_to_df

In [2]:
## extract features from the datasets
## combine datasets
### 0 = no attack
### 1 = attack detected

data_dir = 'test_datasets'
dataset_filenames = {'attack_free': 'Attack_free_dataset.txt',
                     'dos_attack': 'DoS_attack_dataset.txt',
                     'fuzzy_attack': 'Fuzzy_attack_dataset.txt',
                     'impersonation_attack': 'Impersonation_attack_dataset.txt'}

data_dfs = {}
for dset in dataset_filenames:
    attack_label = 0 if dset == 'attack_free' else 1
    dset_df = txt_to_df(dataset_filenames, data_dir, dset)
    dset_df['Attack'] = attack_label
    data_dfs[f'{dset}_df'] = dset_df
attack_df = pd.concat(data_dfs.values(), keys=data_dfs.keys())

## preprocess non-numerical features
attack_df = encode_labels(attack_df, label_lst=['ID', 'Data'])

In [3]:
## build ensemble model
## modify and train model

target_key = 'Attack'
training_keys = [s for s in list(attack_df.keys()) if s != target_key]

x = attack_df[training_keys]
y = attack_df[target_key]

num_trees = 30
random_state = 42
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=0.3, random_state=random_state)

# initialize models
dt_model = tree.DecisionTreeClassifier(random_state=random_state)
rf_model = ensemble.RandomForestClassifier(n_estimators=num_trees, random_state=random_state)
xgb_model = xgb.XGBClassifier(n_estimators=num_trees, random_state=random_state)

# combine predictions from multiple models with ensemble method --> majority voting scheme
# each model "votes" for its predicted class and the class with the most votes becomes the final prediction
voting_classifier = ensemble.VotingClassifier(estimators=[('decision_tree', dt_model),
                                                          ('random_forest', rf_model),
                                                          ('xg_boost', xgb_model)],
                                              voting='hard') # hard for majority voting

voting_classifier.fit(x_train, y_train)

## export model
joblib.dump(voting_classifier, 'ads_model.pkl')


['ads_model.pkl']

In [4]:
## evaluate model

pred = voting_classifier.predict(x_test)
conf_matrix = metrics.confusion_matrix(y_test, pred)

tp = conf_matrix.diagonal() # true positives
tn = conf_matrix.sum() - conf_matrix.sum(axis=0) - conf_matrix.sum(axis=1) + tp # true negatives
fp = conf_matrix.sum(axis=1) - tp # false positives
fn = conf_matrix.sum(axis=0) - tp # false negatives

accuracy = (tp + tn) / (tp + tn + fp + fn)
detection_rate = tp / (tp + fn)
false_alarm_rate = fp / (tn + fp)
f1 = 2*tp / (2*tp + fp + fn)

for i, val in enumerate(set(y_test)):
    if val == 0:
        attack_type = 'No Attack'
    elif val == 1:
        attack_type = 'Attack Detected'

    print(f"Evaluating predictions for {attack_type}...")
    print(f"Model Accuracy: {accuracy[i]}")
    print(f"Model Detection Rate: {detection_rate[i]}")
    print(f"Model False Alarm Rate: {false_alarm_rate[i]}")
    print(f"Model F1 Score: {f1[i]}\n")



Evaluating predictions for No Attack...
Model Accuracy: 0.9490279126494185
Model Detection Rate: 0.9454654080868119
Model False Alarm Rate: 0.047130969076870866
Model F1 Score: 0.9506092715695438

Evaluating predictions for Attack Detected...
Model Accuracy: 0.9490279126494185
Model Detection Rate: 0.9528690309231291
Model False Alarm Rate: 0.054534591913188074
Model F1 Score: 0.9473419426013963

