In [1]:
import pickle
import time

import joblib
import pandas as pd
import xgboost as xgb
from sklearn import ensemble, metrics, model_selection, tree

from ads_helpers import encode_labels, txt_to_df

In [2]:
## extract features from the datasets
## combine datasets
### 0 = no attack
### 1 = attack detected

data_dir = 'test_datasets'
dataset_filenames = {'attack_free': 'Attack_free_dataset.txt',
                     'dos_attack': 'DoS_attack_dataset.txt',
                     'fuzzy_attack': 'Fuzzy_attack_dataset.txt',
                     'impersonation_attack': 'Impersonation_attack_dataset.txt'}

data_dfs = {}
for dset in dataset_filenames:
    attack_label = 0 if dset == 'attack_free' else 1
    dset_df = txt_to_df(dataset_filenames, data_dir, dset)
    dset_df['Attack'] = attack_label
    data_dfs[f'{dset}_df'] = dset_df
data_df = pd.concat(data_dfs.values(), keys=data_dfs.keys())

## preprocess non-numerical features
data_df = encode_labels(data_df, label_lst=['ID', 'Data'])


In [6]:
## build ensemble model
## modify and train model

target_key = 'Attack'
training_keys = [s for s in list(data_df.keys()) if s != target_key]

x = data_df[training_keys]
y = data_df[target_key]
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=0.1, random_state=42)

start_time = time.time()

# initialize models
dt_model = tree.DecisionTreeClassifier(random_state=42, criterion='entropy', max_leaf_nodes=1000, min_samples_leaf=2) #2750
rf_model = ensemble.RandomForestClassifier(n_estimators=30, random_state=42, max_leaf_nodes=1000, max_features=None) #7500
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)

# combine predictions from multiple models with ensemble method --> majority voting scheme
# each model "votes" for its predicted class and the class with the most votes becomes the final prediction
voting_classifier = ensemble.VotingClassifier(estimators=[('decision_tree', dt_model),
                                                          ('random_forest', rf_model),
                                                          ('xg_boost', xgb_model)],
                                              voting='hard') # hard for majority voting
voting_classifier.fit(x_train, y_train)

end_time = time.time()
print(f"Model training time = {end_time - start_time} seconds\n")


Model training time = 143.4360671043396 seconds



In [7]:
## evaluate model
pred = voting_classifier.predict(x_test)
conf_matrix = metrics.confusion_matrix(y_test, pred)

tp = conf_matrix.diagonal() # true positives
tn = conf_matrix.sum() - conf_matrix.sum(axis=0) - conf_matrix.sum(axis=1) + tp # true negatives
fp = conf_matrix.sum(axis=1) - tp # false positives
fn = conf_matrix.sum(axis=0) - tp # false negatives

accuracy = (tp + tn) / (tp + tn + fp + fn)
detection_rate = tp / (tp + fn)
false_alarm_rate = fp / (tn + fp)
f1 = 2*tp / (2*tp + fp + fn)

for i, val in enumerate(set(y_test)):
    if val == 0:
        attack_type = 'No Attack'
    elif val == 1:
        attack_type = 'Attack Detected'

    print(f"Evaluating predictions for {attack_type}...")
    print(f"Model Accuracy: {accuracy[i] * 100} %")
    print(f"Model Detection Rate: {detection_rate[i] * 100} %")
    print(f"Model False Alarm Rate: {false_alarm_rate[i] * 100} %")
    print(f"Model F1 Score: {f1[i] * 100} %\n")


Evaluating predictions for No Attack...
Model Accuracy: 96.44950405770965 %
Model Detection Rate: 95.10964049071706 %
Model False Alarm Rate: 2.047770583474358 %
Model F1 Score: 96.58964568125887 %

Evaluating predictions for Attack Detected...
Model Accuracy: 96.44950405770965 %
Model Detection Rate: 97.95222941652564 %
Model False Alarm Rate: 4.890359509282949 %
Model F1 Score: 96.29735118199943 %



In [9]:
## export model in lightweight format
model_name = 'ads_model'

with open(f'{model_name}.pkl', 'wb') as f:
    pickle.dump(voting_classifier, f)
