In [1]:
import re

import joblib
import pandas as pd
import xgboost as xgb
from sklearn import (ensemble, metrics, model_selection, preprocessing, svm,
                     tree)
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# need to do
## extract features from datasets
## label each dataset by error type
## combine datasets
## preprocess non-numerical features
## modify and train model
## evaluate model
## export model


In [3]:
## extract features from the datasets
## assign labels to each dataset by error type
## combine datasets
### 0 = no attack
### 1 = attack detected

data_dir = 'test_datasets'
dataset_filenames = {'attack_free': 'Attack_free_dataset.txt',
                     'dos_attack': 'DoS_attack_dataset.txt',
                     'fuzzy_attack': 'Fuzzy_attack_dataset.txt',
                     'imperson_attack': 'Impersonation_attack_dataset.txt'}

re_pattern = r'Timestamp:\s+(\d+\.\d+)\s+ID:\s+(\w+)\s+(\w+)\s+DLC:\s+(\d+)\s+((?:\w{2}\s+)+)'
null_pattern = r'Timestamp:\s+(\d+\.\d+)\s+ID:\s+(\w+)\s+(\w+)\s+DLC:\s+(\d+)'

data_dfs = {}
for dset in dataset_filenames:
    attack_label = 0 if dset == 'attack_free' else 1
    timestamps = []
    ids = []
    dlcs = []
    data_fields = []
    with open(f'{data_dir}/{dataset_filenames[dset]}', 'r') as f:
        for line in f:
            re_match = re.match(re_pattern, line)
            if re_match:
                timestamp, msg_id, _, dlc, data_field = re_match.groups()
                timestamps.append(float(timestamp))
                ids.append(msg_id)
                dlcs.append(int(dlc))
                data_fields.append(data_field.strip())
            else:
                re_match = re.match(null_pattern, line)
                if re_match:
                    timestamp, msg_id, _, dlc = re_match.groups()
                    timestamps.append(float(timestamp))
                    ids.append(msg_id)
                    dlcs.append(int(dlc))
                    data_fields.append('')


    data_dfs[f'{dset}_df'] = pd.DataFrame({'Timestamp': timestamps,
                                           'ID': ids,
                                           'DLC': dlcs,
                                           'Data': data_fields,
                                           'Attack': attack_label})

attack_df = pd.concat(data_dfs.values(), keys=data_dfs.keys())

In [4]:
## preprocess non-numerical features

label_encoder = preprocessing.LabelEncoder()
attack_df['ID'] = label_encoder.fit_transform(attack_df['ID'])

# use TF-IDF to represent features of Data in a vector space
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(attack_df['Data'])
df_tfidf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
attack_df = pd.concat([attack_df.reset_index(drop=True), df_tfidf], axis=1)


In [13]:
## build ensemble model
## modify and train model

training_keys = set(attack_df.keys())
exclude_keys = set(['Attack', 'Data'])
training_keys_mod = list(training_keys - exclude_keys)
target_key = 'Attack'

x = attack_df[training_keys_mod]
y = attack_df[target_key]

num_trees = 30
random_state = 42
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=0.3, random_state=random_state)

# initialize models
dt_model = tree.DecisionTreeClassifier(random_state=random_state)
rf_model = ensemble.RandomForestClassifier(n_estimators=num_trees, random_state=random_state)
xgb_model = xgb.XGBClassifier(n_estimators=num_trees, random_state=random_state)

# combine predictions from multiple models with ensemble method --> majority voting scheme
# each model "votes" for its predicted class and the class with the most votes becomes the final prediction
voting_classifier = ensemble.VotingClassifier(estimators=[('decision_tree', dt_model),
                                                          ('random_forest', rf_model),
                                                          ('xg_boost', xgb_model)],
                                              voting='hard') # hard for majority voting

voting_classifier.fit(x_train, y_train)

## export model
joblib.dump(voting_classifier, 'ads_model.pkl')


['ads_model.pkl']

In [14]:
## evaluate model

pred = voting_classifier.predict(x_test)
conf_matrix = metrics.confusion_matrix(y_test, pred)

tp = conf_matrix.diagonal() # true positives
tn = conf_matrix.sum() - conf_matrix.sum(axis=0) - conf_matrix.sum(axis=1) + tp # true negatives
fp = conf_matrix.sum(axis=1) - tp # false positives
fn = conf_matrix.sum(axis=0) - tp # false negatives

accuracy = (tp + tn) / (tp + tn + fp + fn)
detection_rate = tp / (tp + fn)
false_alarm_rate = fp / (tn + fp)
f1 = 2*tp / (2*tp + fp + fn)

for i, val in enumerate(set(y_test)):
    if val == 0:
        attack_type = 'No Attack'
    elif val == 1:
        attack_type = 'Attack Detected'

    print(f"Evaluating predictions for {attack_type}...")
    print(f"Model Accuracy: {accuracy[i]}")
    print(f"Model Detection Rate: {detection_rate[i]}")
    print(f"Model False Alarm Rate: {false_alarm_rate[i]}")
    print(f"Model F1 Score: {f1[i]}\n")



Evaluating predictions for No Attack...
Model Accuracy: 0.9426089859194007
Model Detection Rate: 0.9403254266059144
Model False Alarm Rate: 0.05494098969566259
Model F1 Score: 0.9443229857027601

Evaluating predictions for Attack Detected...
Model Accuracy: 0.9426089859194007
Model Detection Rate: 0.9450590103043374
Model False Alarm Rate: 0.05967457339408559
Model F1 Score: 0.9407861043293513

