In [23]:
import sys
import os

from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report
import scikitplot as skplt


from xgboost import XGBClassifier

sys.path.append(os.path.dirname(os.getcwd()))

from dbn_nslkdd.dbn.models import UnsupervisedDBN


In [24]:
DATA_PATH = Path('..') / 'data'
RAW_DATA_PATH = DATA_PATH / 'raw'
PROCESSED_DATA_PATH = DATA_PATH / 'processed_v3'

MODELS_PATH = Path('..') / 'models'

In [45]:
X_train = pd.read_csv(PROCESSED_DATA_PATH / 'X_train.csv')
X_val   = pd.read_csv(PROCESSED_DATA_PATH / 'X_val.csv')
y_train = pd.read_csv(PROCESSED_DATA_PATH / 'y_train.csv', index_col=False)
y_val   = pd.read_csv(PROCESSED_DATA_PATH / 'y_val.csv')
X_test  = pd.read_csv(PROCESSED_DATA_PATH / 'X_test.csv')
y_test  = pd.read_csv(PROCESSED_DATA_PATH / 'y_test.csv', index_col=False)

y_pred_val_xgboost = np.load(PROCESSED_DATA_PATH / 'y_pred_val_xgboost.npy', allow_pickle=True).astype(np.bool_)
y_pred_test_xgboost = np.load(PROCESSED_DATA_PATH / 'y_pred_test_xgboost.npy', allow_pickle=True).astype(np.bool_)
y_pred_val_dbn = np.load(PROCESSED_DATA_PATH / 'dbn_model_val_results.npy', allow_pickle=True).astype(np.bool_)
y_pred_test_dbn = np.load(PROCESSED_DATA_PATH / 'dbn_model_test_results.npy', allow_pickle=True).astype(np.bool_)

combined_models_val_OR = np.logical_or(y_pred_val_xgboost, y_pred_val_dbn)
combined_models_test_OR = np.logical_or(y_pred_test_xgboost, y_pred_test_dbn)

In [46]:
report_val = classification_report(y_pred_val_xgboost, combined_models_val_OR)
report_test = classification_report(y_pred_test_xgboost, combined_models_test_OR)
# print(report_val)
print(report_test)



              precision    recall  f1-score   support

       False       1.00      0.84      0.92     10601
        True       0.81      1.00      0.89      6942

    accuracy                           0.91     17543
   macro avg       0.90      0.92      0.90     17543
weighted avg       0.92      0.91      0.91     17543



In [20]:
from sklearn import metrics
from sklearn.metrics import roc_auc_score


import matplotlib.pyplot as plt

y_train = y_train.applymap(lambda x: x[0] if isinstance(x, list) else x).iloc[:22500]
y_test  = y_test.applymap(lambda x: x[0] if isinstance(x, list) else x).iloc[:22500]
combined_models_OR_tf = pd.DataFrame(combined_models_OR).applymap(lambda x: x[0] if isinstance(x, list) else x).iloc[:22500]

# score = roc_auc_score(y_test, combined_models_OR_tf)
# print(f"ROC AUC: {score:.4f}")

# fpr, tpr, _ = metrics.roc_curve(y_test,  combined_models_OR_tf)
# auc = metrics.roc_auc_score(y_test, combined_models_OR_tf)

# #create ROC curve
# plt.plot(fpr,tpr,label="AUC="+str(auc))
# plt.ylabel('True Positive Rate')
# plt.xlabel('False Positive Rate')
# plt.legend(loc=4)
# plt.show()

from sklearn.metrics import roc_curve
from sklearn.metrics import RocCurveDisplay

fpr, tpr, _ = metrics.roc_curve(y_test, combined_models_OR_tf)
auc = round(metrics.roc_auc_score(y_test, combined_models_OR_tf), 4)
plt.plot(fpr,tpr,label="ensemble models, AUC="+str(auc))

fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_dbn)
auc = round(metrics.roc_auc_score(y_test, y_pred_dbn), 4)
plt.plot(fpr,tpr,label="dbn, AUC="+str(auc))

fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_xgboost)
auc = round(metrics.roc_auc_score(y_test, y_pred_xgboost), 4)
plt.plot(fpr,tpr,label="XGBoost, AUC="+str(auc))

plt.title('ROC Curve')

#add legend
plt.legend()


ValueError: Found input variables with inconsistent numbers of samples: [17543, 5000]

In [97]:
from sklearn.metrics import classification_report
 
print(classification_report(y_test, combined_models_OR_tf))

              precision    recall  f1-score   support

           0       0.87      0.93      0.90      9694
           1       0.94      0.89      0.92     12806

    accuracy                           0.91     22500
   macro avg       0.90      0.91      0.91     22500
weighted avg       0.91      0.91      0.91     22500



In [1]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)

tsne_results = tsne.fit_transform(data_subset)

df_subset['tsne-2d-one'] = tsne_results[:,0]
df_subset['tsne-2d-two'] = tsne_results[:,1]

plt.figure(figsize=(16,10))
sns.scatterplot(
    x="tsne-2d-one", y="tsne-2d-two",
    hue="y",
    palette=sns.color_palette("hls", 10),
    data=df_subset,
    legend="full",
    alpha=0.3
)