In [1]:
import re

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import (datasets, ensemble, metrics, model_selection,
                     preprocessing, tree)

In [2]:
# need to do
## extract features from datasets
## label each dataset by error type
## combine datasets
## preprocess non-numerical features
## modify and train model
## evaluate model
# -----------------------------------------------------------------------
## export model
## apply exported model to each dataset to test error detection


In [3]:
## extract features from the datasets
## assign labels to each dataset by error type
## combine datasets
### 0 = attack_free_df
### 1 = dos_attack_df
### 2 = fuzzy_attack_df
### 3 = imperson_attack_df

data_dir = 'test_datasets'
dataset_filenames = {'attack_free': 'Attack_free_dataset.txt',
                     'dos_attack': 'DoS_attack_dataset.txt',
                     'fuzzy_attack': 'Fuzzy_attack_dataset.txt',
                     'imperson_attack': 'Impersonation_attack_dataset.txt'}

re_pattern = r'Timestamp:\s+(\d+\.\d+)\s+ID:\s+(\w+)\s+(\w+)\s+DLC:\s+(\d+)\s+((?:\w{2}\s+)+)'
null_pattern = r'Timestamp:\s+(\d+\.\d+)\s+ID:\s+(\w+)\s+(\w+)\s+DLC:\s+(\d+)'

data_dfs = {}
for i, dset in enumerate(dataset_filenames):
    timestamps = []
    ids = []
    dlcs = []
    data_fields = []
    with open(f'{data_dir}/{dataset_filenames[dset]}', 'r') as f:
        for line in f:
            re_match = re.match(re_pattern, line)
            if re_match:
                timestamp, msg_id, _, dlc, data_field = re_match.groups()
                timestamps.append(float(timestamp))
                ids.append(msg_id)
                dlcs.append(int(dlc))
                data_fields.append(data_field.strip())
            else:
                re_match = re.match(null_pattern, line)
                if re_match:
                    timestamp, msg_id, _, dlc = re_match.groups()
                    timestamps.append(float(timestamp))
                    ids.append(msg_id)
                    dlcs.append(int(dlc))
                    data_fields.append('')


    data_dfs[f'{dset}_df'] = pd.DataFrame({'Timestamp': timestamps,
                                           'ID': ids,
                                           'DLC': dlcs,
                                           'Data': data_fields,
                                           'Attack': i})

attack_df = pd.concat(data_dfs.values(), keys=data_dfs.keys())

In [4]:
## preprocess non-numerical features

label_encoder = preprocessing.LabelEncoder()
attack_df['ID'] = label_encoder.fit_transform(attack_df['ID'])
attack_df['Data'] = label_encoder.fit_transform(attack_df['Data'])

In [5]:
## build ensemble model
## modify and train model

x = attack_df[['Timestamp', 'ID', 'DLC', 'Data']]
y = attack_df['Attack']

num_trees = 30
random_state = 42
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=0.3, random_state=random_state)

# initialize models
dt_model = tree.DecisionTreeClassifier(random_state=random_state)
rf_model = ensemble.RandomForestClassifier(n_estimators=num_trees, random_state=random_state)
xgb_model = xgb.XGBClassifier(n_estimators=num_trees, random_state=random_state)

# combine predictions from multiple models with ensemble method --> majority voting scheme
# each model "votes" for its predicted class and the class with the most votes becomes the final prediction
voting_classifier = ensemble.VotingClassifier(estimators=[('decision_tree', dt_model),
                                                          ('random_forest', rf_model),
                                                          ('xg_boost', xgb_model)],
                                              voting='hard') # hard for majority voting

voting_classifier.fit(x_train, y_train)
pred = voting_classifier.predict(x_test)


In [6]:
## evaluate model

conf_matrix = metrics.confusion_matrix(y_test, pred)

tp = conf_matrix.diagonal() # true positives
tn = conf_matrix.sum() - conf_matrix.sum(axis=0) - conf_matrix.sum(axis=1) + tp # true negatives
fp = conf_matrix.sum(axis=1) - tp # false positives
fn = conf_matrix.sum(axis=0) - tp # false negatives

accuracy = (tp + tn) / (tp + tn + fp + fn)
detection_rate = tp / (tp + fn)
false_alarm_rate = fp / (tn + fp)
f1 = 2*tp / (2*tp + fp + fn)

for i, val in enumerate(set(y_test)):
    if val == 0:
        attack_type = 'Attack Free'
    elif val == 1:
        attack_type = 'DoS Attack'
    elif val == 2:
        attack_type = 'Fuzzy Attack'
    elif val == 3:
        attack_type = 'Impersonation Attack'

    print(f"Evaluating predictions for {attack_type}...")
    print(f"Model Accuracy: {accuracy[i]}")
    print(f"Model Detection Rate: {detection_rate[i]}")
    print(f"Model False Alarm Rate: {false_alarm_rate[i]}")
    print(f"Model F1 Score: {f1[i]}\n")



Evaluating predictions for Attack Free...
Model Accuracy: 0.9467244976994752
Model Detection Rate: 0.9357318876293741
Model False Alarm Rate: 0.04099040379931211
Model F1 Score: 0.9488209771872117

Evaluating predictions for DoS Attack...
Model Accuracy: 0.9577502543293797
Model Detection Rate: 0.8730369090606673
Model False Alarm Rate: 0.02906680137435238
Model F1 Score: 0.8476828782270521

Evaluating predictions for Fuzzy Attack...
Model Accuracy: 0.9682305033409632
Model Detection Rate: 0.8963306886777909
Model False Alarm Rate: 0.021793004714536074
Model F1 Score: 0.8730247655130989

Evaluating predictions for Impersonation Attack...
Model Accuracy: 1.0
Model Detection Rate: 1.0
Model False Alarm Rate: 0.0
Model F1 Score: 1.0

