# Analysis of Results of Anomaly Detection with Filtering by Binary Classifier
Results of the GEE VAE autoencoder (anomaly detection) combined with a binary classifier that filters the traffic.

## Environment Setting
Import libraries/packages/modules

In [None]:
import pandas as pd
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

## Load Results and Run Analysis

In [None]:
#anomaly threshold applied to the MSE from anomaly detection

#MSE statistics of the training background traffic:
#mean = 0.003405, std = 0.005413
train_mean = 0.003405
train_std = 0.005413

#number of standard deviations added to the mean; candidate values: 1/3, 1/2, 1, 2, 3
coef = 1
threshold = train_mean + coef * train_std

### Definitions

In [None]:
#function for enhancing classification results by results from anomaly detection
def preprocess_results(ad_data, cf_data, threshold):
    """Add the anomaly-detection MSE to the classifier results and flag anomalies.

    Rows that the classifier predicted as background (prediction '0') and whose
    MSE is strictly greater than ``threshold`` get their prediction replaced by
    the string 'anomaly'. All other predictions are left as they are.

    Args:
        ad_data: DataFrame with an 'mse' column (anomaly-detection results).
        cf_data: DataFrame with a 'predictions' column holding string values
            (classification results). It is modified in place: an 'mse' column
            is added and 'predictions' is updated.
        threshold: MSE value above which a background prediction becomes 'anomaly'.

    Returns:
        The same object as ``cf_data``, after modification.
    """
    #no copy is taken on purpose: the caller's frame is extended in place
    data = cf_data
    #the assigned Series is aligned to data on the index
    data['mse'] = ad_data['mse']
    #a single .loc assignment instead of chained indexing, which pandas does not
    #guarantee to write through (and never does with Copy-on-Write enabled)
    above_threshold = (data['predictions'] == '0') & (data['mse'] > threshold)
    data.loc[above_threshold, 'predictions'] = 'anomaly'
    return data

In [None]:
#function for binarizing labels and predictions (adds blabels and bpredictions columns to an input dataframe with labels and predictions columns)
def binarize_results(data):
    """Add binary 'blabels' and 'bpredictions' columns to ``data`` in place.

    * 'blabels' is 0 where 'labels' equals 'background' and 1 otherwise.
    * 'bpredictions' is 0 where 'predictions' equals the string '0' and 1
      otherwise (so both '1' and 'anomaly' map to 1).

    The columns are computed with vectorised comparisons rather than chained
    indexing assignments, which pandas does not guarantee to write through
    (and never does with Copy-on-Write enabled).

    Args:
        data: DataFrame with string-valued 'labels' and 'predictions' columns.

    Returns:
        The same object as ``data``, with the two columns added.
    """
    #binarize labels
    data['blabels'] = (data['labels'] != 'background').astype(int)
    #binarize predictions
    data['bpredictions'] = (data['predictions'] != '0').astype(int)
    return data

In [None]:
#function for printing AUC, classification report, and confusion matrix for given data (dataframe with blabels and bpredictions), title is a string to differentiate the printed results
def print_result(data, title):
    """Print evaluation metrics of the binary predictions in ``data``.

    Prints the ROC AUC (computed from the binary predictions), the
    classification report with 4 digits, the confusion matrix and the number
    of rows whose prediction is 'anomaly', followed by an empty line. When
    'blabels' holds fewer than two distinct values, only a notice is printed.

    Args:
        data: DataFrame with 'blabels', 'bpredictions' and 'predictions' columns.
        title: string used to tell the printed results apart.
    """
    #the metrics below need both classes to be present
    if len(data['blabels'].unique()) < 2:
        print(title + ' subset contains a single label')
        return

    y_true = data['blabels'].to_list()
    y_pred = data['bpredictions'].to_list()
    print('AUC (' + title + '):', metrics.roc_auc_score(y_true, y_pred))
    print(metrics.classification_report(y_true, y_pred, digits=4))
    print(metrics.confusion_matrix(y_true, y_pred))

    anomaly_count = int((data['predictions'] == 'anomaly').sum())
    print('Anomalies by AD:', anomaly_count)
    print()

In [None]:
#AUC ROC (based on MSE) from dataframe with binary labels (blabels) and MSE
def plot_roc(data):
    """Plot the ROC curve obtained when the MSE is used as the score.

    The legend shows the area under the curve; the dashed black diagonal
    from (0, 0) to (1, 1) is drawn for reference.

    Args:
        data: DataFrame with 'blabels' (binary labels) and 'mse' columns.

    Returns:
        The matplotlib figure with the curve.
    """
    fig, ax = plt.subplots(figsize=(5, 5))

    y_true = data['blabels'].to_list()
    scores = data['mse'].to_list()
    #the thresholds returned by roc_curve are not needed for the plot
    fpr, tpr, _ = metrics.roc_curve(y_true, scores)
    roc_auc = metrics.auc(fpr, tpr)

    ax.plot([0, 1], [0, 1], 'k--')
    ax.plot(fpr, tpr, label=f'MSE (AUC = {roc_auc: .4f})')
    ax.set(xlabel='False positive rate', ylabel='True positive rate')
    ax.legend(loc='lower right')
    fig.show()
    return fig

In [None]:
#confusion matrix from dataframe with binary labels and predictions (blabels and bpredictions)
def plot_cm(data):
    """Plot the confusion matrix of the binary labels against the binary predictions.

    Args:
        data: DataFrame with 'blabels' and 'bpredictions' columns.

    Returns:
        The matplotlib figure created by the confusion matrix display.
    """
    y_true = data['blabels'].to_list()
    y_pred = data['bpredictions'].to_list()
    matrix = metrics.confusion_matrix(y_true, y_pred)
    cm_display = metrics.ConfusionMatrixDisplay(matrix, display_labels=['0', '1'])
    cm_display.plot()
    return cm_display.figure_

In [None]:
#KDE of MSE from dataframe with labels and MSE, MSE of 1.0 (filtered attacks) omitted from plotting
def plot_kde(data):
    """Plot kernel density estimates of the MSE for background and other flows.

    Rows whose MSE equals exactly 1.0 are left out of both curves.

    Args:
        data: DataFrame with 'labels' and 'mse' columns.

    Returns:
        The matplotlib figure with both curves.
    """
    plotted = data['mse'] != 1.0
    is_background = data['labels'] == 'background'
    background_mse = data.loc[is_background & plotted, 'mse'].tolist()
    other_mse = data.loc[~is_background & plotted, 'mse'].tolist()

    fig, ax = plt.subplots(figsize=(5, 5))
    sns.kdeplot(background_mse, ax=ax, label='Background MSE')
    sns.kdeplot(other_mse, ax=ax, label='Anomaly-filtered MSE')
    ax.legend(loc='lower right')
    fig.show()
    return fig

In [None]:
#histogram of MSE from dataframe with labels and MSE, MSE of 1.0 (filtered attacks) omitted from plotting
def plot_hist(data):
    """Plot side-by-side MSE histograms for background and other flows.

    Rows whose MSE equals exactly 1.0 are left out. The histogram uses 100
    bins over the MSE range 0 to 0.06 and the y-axis is cut off at 100.

    Args:
        data: DataFrame with 'labels' and 'mse' columns.

    Returns:
        The matplotlib figure with the histogram.
    """
    plotted = data['mse'] != 1.0
    is_background = data['labels'] == 'background'
    background_mse = data.loc[is_background & plotted, 'mse'].tolist()
    other_mse = data.loc[~is_background & plotted, 'mse'].tolist()

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.hist(
        [background_mse, other_mse],
        bins=100,
        range=(0, 0.06),
        label=['Background', 'Anomaly-filtered'],
    )
    ax.set_ylim(0, 100)
    ax.set_xlabel("MSE")
    #NOTE(review): hist is called without density=True, so the bars are counts
    #although the axis is labelled "Density" - confirm which one is intended
    ax.set_ylabel("Density")
    ax.legend(loc='upper right')
    fig.show()
    return fig

### Binary classifier trained on all classes

In [None]:
%%time
#load anomaly detection results (with MSE) and results of the classifier trained on all classes
ad_results = pd.read_feather('results_ad_test.feather.with_mse')
cf_results = pd.read_feather('results_bb_test.feather')

#combine classification and anomaly detection results
results = preprocess_results(ad_results, cf_results, threshold)
print(results['predictions'].value_counts())

#binarize labels and predictions (the columns are added to results in place)
binarize_results(results)
print(results['blabels'].value_counts())
print(results['bpredictions'].value_counts())

print('Threshold = ', threshold)

#metrics on the whole set
print_result(results, 'all')

#metrics with the rows labelled 'blacklist' left out
results_without_blacklist = results[results['labels'] != 'blacklist']
print_result(results_without_blacklist, 'without blacklist')

### Binary classifier trained without blacklist

In [None]:
%%time
#load anomaly detection results (with MSE) and results of the classifier trained without blacklist
ad_results = pd.read_feather('results_ad_test.feather.with_mse')
cf_results = pd.read_feather('results_bb_without_blacklist_test.feather')

#combine classification and anomaly detection results
results = preprocess_results(ad_results, cf_results, threshold)
print(results['predictions'].value_counts())

#binarize labels and predictions (the columns are added to results in place)
binarize_results(results)
print(results['blabels'].value_counts())
print(results['bpredictions'].value_counts())

print('Threshold = ', threshold)

#metrics on the whole set
print_result(results, 'all')

#metrics with the rows labelled 'blacklist' left out
results_without_blacklist = results[results['labels'] != 'blacklist']
print_result(results_without_blacklist, 'without blacklist')

#metrics for background combined with one other label at a time
for attack_label in ('dos', 'nerisbotnet', 'anomaly-spam', 'scan11', 'scan44'):
    print_result(
        results[(results['labels'] == 'background') | (results['labels'] == attack_label)],
        'background + ' + attack_label,
    )

In [None]:
#modify for plotting (set MSE of classified attacks to 1.0)
#results_without_blacklist is a filtered selection of results; take an explicit copy so that
#the assignment changes this frame only and the MSE values in results stay untouched
results_without_blacklist = results_without_blacklist.copy()
#single .loc assignment instead of chained indexing, which pandas does not guarantee to
#write through (and never does with Copy-on-Write enabled)
results_without_blacklist.loc[results_without_blacklist['predictions'] == '1', 'mse'] = 1.0

In [None]:
#plot ROC (MSE used as the score) for the results without blacklist
fig = plot_roc(results_without_blacklist)
#uncomment to save the figure to a PDF file
#fig.savefig('bcf-ad_auc-roc.pdf')

In [None]:
#plot confusion matrix of binary labels and predictions for the results without blacklist
fig = plot_cm(results_without_blacklist)
#uncomment to save the figure to a PDF file
#fig.savefig('bcf-ad_cm.pdf')

In [None]:
#plot KDE of MSE for the results without blacklist (rows with MSE of 1.0 are omitted by plot_kde)
fig = plot_kde(results_without_blacklist)
#NOTE(review): plot_kde already calls fig.show() before returning, so this call repeats it
fig.show()
#uncomment to save the figure to a PDF file
#fig.savefig('bcf-ad_kde.pdf')

In [None]:
#KDE of background MSE against the MSE of non-background, non-blacklist rows predicted as '1'
#NOTE(review): the background values come from results_without_blacklist, so they include
#any MSE overwritten with 1.0 for plotting - confirm this is intended
normal_recon_error = results_without_blacklist.loc[
    results_without_blacklist['labels'] == 'background', 'mse'
].tolist()
#taken from results, whose MSE values are the ones combined in by preprocess_results
malicious_recon_error_filtered = results.loc[
    (results['labels'] != 'blacklist')
    & (results['labels'] != 'background')
    & (results['predictions'] == '1'),
    'mse',
].tolist()

fig, ax = plt.subplots(figsize=(5, 5))
sns.kdeplot(normal_recon_error, ax=ax, label='Background MSE')
sns.kdeplot(malicious_recon_error_filtered, ax=ax, label='Detected anomalies')
ax.legend(loc='lower right')

fig.show()

In [None]:
#plot histogram of MSE for the results without blacklist (rows with MSE of 1.0 are omitted by plot_hist)
fig = plot_hist(results_without_blacklist)
#NOTE(review): plot_hist already calls fig.show() before returning, so this call repeats it
fig.show()
#uncomment to save the figure to a PDF file
#fig.savefig('bcf-ad_hist.pdf')

### Binary classifier trained without blacklist and dos

In [None]:
%%time
#load anomaly detection results and results of the classifier trained without blacklist and dos
ad_results = pd.read_feather('results_ad.feather')
cf_results = pd.read_feather('results_bb_without_blacklist_dos.feather')

#combine classification and anomaly detection results
results = preprocess_results(ad_results, cf_results, threshold)
print(results['predictions'].value_counts())

#binarize labels and predictions (the columns are added to results in place)
binarize_results(results)
print(results['blabels'].value_counts())
print(results['bpredictions'].value_counts())

print('Threshold = ', threshold)

#metrics on the whole set
print_result(results, 'all')

#metrics with the rows labelled 'blacklist' left out
results_without_blacklist = results[results['labels'] != 'blacklist']
print_result(results_without_blacklist, 'without blacklist')

#metrics for background combined with dos only
print_result(
    results[(results['labels'] == 'background') | (results['labels'] == 'dos')],
    'background + dos',
)

In [None]:
#modify for plotting (set MSE of classified attacks to 1.0)
#results_without_blacklist is a filtered selection of results; take an explicit copy so that
#the assignment changes this frame only and the MSE values in results stay untouched
results_without_blacklist = results_without_blacklist.copy()
#single .loc assignment instead of chained indexing, which pandas does not guarantee to
#write through (and never does with Copy-on-Write enabled)
results_without_blacklist.loc[results_without_blacklist['predictions'] == '1', 'mse'] = 1.0

#ROC, confusion matrix, KDE and histogram for the results without blacklist
#(fig is rebound by every call and ends up holding the histogram figure)
fig = plot_roc(results_without_blacklist)
fig = plot_cm(results_without_blacklist)
fig = plot_kde(results_without_blacklist)
fig = plot_hist(results_without_blacklist)