In [37]:
import sys
import os

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score
sys.path.append(os.path.dirname(os.getcwd()))

from dbn_nslkdd.dbn.models import UnsupervisedDBN

from pathlib import Path

In [38]:
DATA_PATH = Path('..') / 'data'
RAW_DATA_PATH = DATA_PATH / 'raw'
PROCESSED_DATA_PATH = DATA_PATH / 'processed_v3'

MODELS_PATH = Path('..') / 'models'

LOAD_MODEL = False

# Load Data

In [39]:
shap_values_train = np.fromfile(PROCESSED_DATA_PATH / 'shap_values_train_from_xgboost.npy')
shap_values_train = shap_values_train.reshape(22500, 41)
shap_values_test = np.fromfile(PROCESSED_DATA_PATH / 'shap_values_test_from_xgboost.npy')
shap_values_test = shap_values_test.reshape(22500, 41)

# Scale Data

In [40]:
scaler = MinMaxScaler()
shap_values_train_scaled = scaler.fit(shap_values_train).transform(shap_values_train)
shap_values_test_scaled = scaler.fit(shap_values_test).transform(shap_values_test)

# Run DBN

In [59]:
N_EPOCHS_RBM = 100

dbn = UnsupervisedDBN(hidden_layers_structure=[41, 41],
                      batch_size=10,
                      learning_rate_rbm=0.06,
                      n_epochs_rbm=N_EPOCHS_RBM,
                      activation_function='sigmoid')

In [60]:
if LOAD_MODEL:
    dbn = UnsupervisedDBN.load(MODELS_PATH / 'dbn_unsupervised_with_shap_v1.model')
else:
    dbn.fit(shap_values_train_scaled)

dbn.save(MODELS_PATH / 'dbn_unsupervised_with_shap_v1.model')

[START] Pre-training step:
>> Epoch 1 finished 	RBM Reconstruction error 0.232008
>> Epoch 2 finished 	RBM Reconstruction error 0.198123
>> Epoch 3 finished 	RBM Reconstruction error 0.194166
>> Epoch 4 finished 	RBM Reconstruction error 0.184610
>> Epoch 5 finished 	RBM Reconstruction error 0.172055
>> Epoch 6 finished 	RBM Reconstruction error 0.166961
>> Epoch 7 finished 	RBM Reconstruction error 0.165338
>> Epoch 8 finished 	RBM Reconstruction error 0.171749
>> Epoch 9 finished 	RBM Reconstruction error 0.162174
>> Epoch 10 finished 	RBM Reconstruction error 0.173525
>> Epoch 11 finished 	RBM Reconstruction error 0.157442
>> Epoch 12 finished 	RBM Reconstruction error 0.165406
>> Epoch 13 finished 	RBM Reconstruction error 0.164437
>> Epoch 14 finished 	RBM Reconstruction error 0.152687
>> Epoch 15 finished 	RBM Reconstruction error 0.159074
>> Epoch 16 finished 	RBM Reconstruction error 0.157349
>> Epoch 17 finished 	RBM Reconstruction error 0.157821
>> Epoch 18 finished 	RBM Reco

>> Epoch 48 finished 	RBM Reconstruction error 0.027871
>> Epoch 49 finished 	RBM Reconstruction error 0.026712
>> Epoch 50 finished 	RBM Reconstruction error 0.028929
>> Epoch 51 finished 	RBM Reconstruction error 0.027869
>> Epoch 52 finished 	RBM Reconstruction error 0.029998
>> Epoch 53 finished 	RBM Reconstruction error 0.027336
>> Epoch 54 finished 	RBM Reconstruction error 0.028204
>> Epoch 55 finished 	RBM Reconstruction error 0.025614
>> Epoch 56 finished 	RBM Reconstruction error 0.028064
>> Epoch 57 finished 	RBM Reconstruction error 0.027034
>> Epoch 58 finished 	RBM Reconstruction error 0.025557
>> Epoch 59 finished 	RBM Reconstruction error 0.026425
>> Epoch 60 finished 	RBM Reconstruction error 0.027043
>> Epoch 61 finished 	RBM Reconstruction error 0.028857
>> Epoch 62 finished 	RBM Reconstruction error 0.027666
>> Epoch 63 finished 	RBM Reconstruction error 0.029799
>> Epoch 64 finished 	RBM Reconstruction error 0.029933
>> Epoch 65 finished 	RBM Reconstruction error 0

In [63]:
K = 10
reconstructed_shap_values = dbn.reconstruct_k(shap_values_test_scaled, K)

In [89]:
thresh_hold = 0.07

ARE = abs(reconstructed_shap_values - shap_values_test_scaled)

are_df = pd.DataFrame(ARE)

are_avg_list = pd.DataFrame(are_df.mean(axis=1)).values.tolist()

are_avg_list = [1 if item[0] > thresh_hold else 0 for item in are_avg_list]
y_test_to_10K_df = y_test.iloc[:22500,:]

are_avg_df = pd.DataFrame(are_avg_list)
are_avg_df.to_csv(PROCESSED_DATA_PATH / 'dbn_model_results.csv')
np.save(PROCESSED_DATA_PATH / 'dbn_model_results.npy', are_avg_list, allow_pickle=True)
print(f'accuracy_score: {accuracy_score(y_test_to_10K_df, are_avg_df)}')
print(f'precision_score: {precision_score(y_test_to_10K_df, are_avg_df)}')
print(f'recall_score {recall_score(y_test_to_10K_df, are_avg_df)}')

accuracy_score: 0.6691555555555555
precision_score: 0.8287956831003188
recall_score 0.5277213806028425


In [136]:
import matplotlib.pyplot as plt

results = {'10': {'ARE': 0.837, 'XGBoost': 0.80, 'ensemble': 0.9},
           '50': {'ARE': 0.853, 'XGBoost': 0.80, 'ensemble': 0.91},
          '100': {'ARE': 0.669, 'XGBoost': 0.80, 'ensemble': 0.89}}

results_df = pd.DataFrame(results)

results_df.transpose().plot.bar()



plt.legend(loc=(1.04, 0))
plt.title('accuracy vs number of epcohs')
plt.xlabel('number of epcohs')
plt.ylabel('accuracy')

for k in ['10', '50', '100']:
    for index, value in enumerate(results[k]):
        plt.text(value, index, str(value))
# ax = barh.plot.barh()
# ax.axhline(0, color='grey', linewidth=0.8)
# ax.bar_label(ax.containers[0], ax.containers[1])
# ax.bar_label(ax.containers[1])

plt.show()
# plt.show()

ConversionError: Failed to convert value(s) to axis units: 'ARE'

<Figure size 640x480 with 1 Axes>