In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
from timeit import timeit
from IPython.display import display
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score, f1_score
from sklearn.model_selection import cross_val_score, train_test_split
from joblib import dump, load
from imblearn.under_sampling import RandomUnderSampler

In [None]:
from compiledataset import load_dataset, compile_dataset

# Root directory that holds one sub-folder per dataset. It can be overridden
# with the DATASETS_PATH environment variable so the notebook also runs on
# machines where the data lives elsewhere; the default is the original location.
PATH = os.environ.get("DATASETS_PATH", "/home/hampus/miun/master_thesis/Datasets")

# Maps a dataset label to its rows (list of per-row dicts) for compile_dataset.
datasets = {}

# dataset: pd.DataFrame = load_dataset(PATH + "/ORNL", "data_a.csv")
# dataset["remarks"] = "No DLC available"
# datasets["ROAD"] = dataset.to_dict("records")

dataset: pd.DataFrame = load_dataset(PATH + "/Survival", "data.csv")
dataset["remarks"] = "-"
datasets["Survival"] = dataset.to_dict("records")

# dataset: pd.DataFrame = load_dataset(PATH + "/Hisingen", "data.csv")
# dataset["remarks"] = "-"
# datasets["Hisingen"] = dataset.to_dict("records")


df = compile_dataset(datasets)
# Drop the listed columns; errors="ignore" skips any that are not present.
df.drop(columns=["d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "data", "ID", "DLC", "t"], inplace=True, errors="ignore")

dataset = None # Release memory, as it isn't used for now
datasets = None

display(df)

In [None]:
# Disabled export step: would drop the "t", "ID", "data" and "type" columns
# (ignoring missing ones) and write the frame to "datasets.zip" as CSV.
# df.drop(columns=["t", "ID", "data", "type"], inplace=True, errors="ignore")
# df.to_csv("datasets.zip", index=False)

In [None]:
# Disabled clean-up of the exported file: would reload "datasets.zip", remove
# the "Unnamed: 0" column (presumably a previously saved index - TODO confirm)
# and write the file back without an index.
# df = pd.read_csv("datasets.zip")
# display(df)
# df.drop(columns="Unnamed: 0", inplace=True)
# df.to_csv("datasets.zip", index=False)

In [None]:
from plot_tools import plot_correlation_matrix

# Leave these identifier/metadata columns out of the correlation plot;
# errors="ignore" tolerates columns that were already removed earlier.
excluded_from_correlation = ["dataset", "type", "name", "ID", "DLC", "t"]
plot_correlation_matrix(df.drop(columns=excluded_from_correlation, errors="ignore"))

In [None]:
# Keep every row whose "type" is anything other than "masq".
keep_rows = df["type"].ne("masq")
df = df.loc[keep_rows]
# df = df.loc[(df["type"] == "fuzz") | (df["type"] == "none")]

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Columns that hold metadata or the target and therefore must not be scaled.
NON_FEATURE_COLUMNS = ("type", "dataset", "name", "Label")
feature_columns = [column for column in df.columns if column not in NON_FEATURE_COLUMNS]

# The scaling statistics come from the rows with Label == 0 only.
# NOTE(review): the scaler is fitted before the train/test split performed
# later in the notebook, so rows that end up in the test set also contribute
# to the statistics - confirm this is intended.
scaler = StandardScaler(copy=True).fit(df.loc[df["Label"] == 0, feature_columns])

# Replace the feature columns with their scaled values. Assigning whole
# columns (rather than writing through df.loc[:, ...]) lets pandas swap in the
# float result even if a feature column was stored with an integer dtype.
df[feature_columns] = scaler.transform(df[feature_columns])

display(df)

In [None]:
# The "name" column is used purely as the stratification key of the split,
# so it is separated from the remaining columns first.
y_train = df["name"]
X_train = df.drop(columns="name")

df = None # Release memory

# Shuffled 70/30 split with a fixed seed, stratified on the "name" values
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=0,
    shuffle=True,
    stratify=y_train,
)

# Re-attach "name" as a regular column and make "Label" the target instead.
rejoined: pd.DataFrame = pd.concat([X_train, y_train], axis="columns")
X_train = rejoined.drop(columns="Label")
y_train = rejoined["Label"]

rejoined = pd.concat([X_test, y_test], axis="columns")
X_test = rejoined.drop(columns="Label")
y_test = rejoined["Label"]

rejoined = None # Release memory

In [None]:
# Randomly under-sample the training split (fixed seed); the test split is
# left untouched.
rus = RandomUnderSampler(random_state=0)
X_train, y_train = rus.fit_resample(X_train, y_train)

# minlength=2 guarantees a slot for both label values, so the report below
# prints 0 instead of raising IndexError when a split contains no row with
# label 1.
bintr = np.bincount(y_train, minlength=2)
binte = np.bincount(y_test, minlength=2)
print(f"Labels\t\tTraining\tTesting\nNormal\t\t{bintr[0]}\t\t{binte[0]}\nAttack\t\t{bintr[1]}\t\t{binte[1]}")

In [None]:
# Disabled: would restrict the test set to rows whose "type" is "fuzz" or
# "none" and re-select the labels by the remaining row index.
# X_test = X_test.loc[(X_test["type"] == "fuzz") | (X_test["type"] == "none")]
# y_test = y_test.loc[X_test.index]

In [None]:
# Values of the "dataset" column used for fitting and for evaluation.
TRAIN_DATASET = "Survival"
TEST_DATASET = "ROAD"

# Positional boolean masks select each matching row exactly once, even when
# the row index contains repeated labels (label-based .loc[index] would
# return such rows several times).
train_mask = (X_train["dataset"] == TRAIN_DATASET).to_numpy()
test_mask = (X_test["dataset"] == TEST_DATASET).to_numpy()

# Fail here with a clear message instead of later inside scikit-learn when a
# requested dataset was not loaded (e.g. its loading lines are commented out).
if not train_mask.any():
    raise ValueError(f"No training rows found for dataset {TRAIN_DATASET!r}; was it loaded?")
if not test_mask.any():
    raise ValueError(f"No test rows found for dataset {TEST_DATASET!r}; was it loaded?")

X_train = X_train.loc[train_mask]
y_train = y_train.loc[train_mask]

X_test = X_test.loc[test_mask]
y_test = y_test.loc[test_mask]

# Kept for any later use of the selected row labels.
train_index = X_train.index
test_index = X_test.index

In [None]:
# Remove the metadata columns before the frames are handed to the classifier.
metadata_columns = ["type", "dataset", "name"]
X_train = X_train.drop(columns=metadata_columns)
X_test = X_test.drop(columns=metadata_columns)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# warm_start=True keeps the fitted trees on later fit() calls, which the
# notebook relies on further down to append trees to this forest.
forest_settings = dict(
    n_estimators=20,
    random_state=0,
    max_leaf_nodes=300,
    max_features="log2",
    warm_start=True,
)
clf = RandomForestClassifier(**forest_settings)
clf.fit(X_train, y_train)

In [None]:
# 10-fold cross-validated F1 on the training data: one score per fold, so a
# mean and a spread are meaningful here.
scores = cross_val_score(clf, X_train, y_train, scoring='f1', cv=10, n_jobs=-1)
print("Training F1: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std()))

pred = clf.predict(X_test)

# f1_score and cohen_kappa_score each return one number for the whole test
# set. There is no spread to report, and calling .mean()/.std() on the result
# fails when a plain Python float is returned, so the value is printed as is.
f1_scores = f1_score(y_test, pred, average='weighted')
print("Testing F1:  %0.4f" % f1_scores)

kappa_scores = cohen_kappa_score(y_test, pred)
print("Kappa score:  %0.4f" % kappa_scores)

Plot test data predictions

In [None]:
from plot_tools import plot_confusion_matrix

# Confusion matrix of the test labels against the predictions in `pred`.
test_plot_title = "Survival, All Attacks, RF\nrate\n(# of instances)"
plot_confusion_matrix(y_test, pred, test_plot_title)

Plot train data predictions

In [None]:
# Predict on the rows the forest was fitted on; pred_train is reused further
# down to find the misclassified training rows.
pred_train = clf.predict(X_train)

train_plot_title = "Survival, All Attacks, RF\nrate\n(# of instances)"
plot_confusion_matrix(y_train, pred_train, train_plot_title)

Add trees trained on the false positives (FPs) and false negatives (FNs)

In [None]:
# Training rows the current forest gets wrong (false positives and false negatives).
misclassified = y_train != pred_train
X_hard, y_hard = X_train.loc[misclassified], y_train.loc[misclassified]

# fit() derives the forest's class set from the labels it is given, so the
# misclassified subset must contain every class the existing trees know.
# Otherwise the old and the new trees would disagree on the number of classes.
missing_classes = set(clf.classes_) - set(y_hard.unique())
if missing_classes:
    raise ValueError(
        f"Misclassified training rows lack class(es) {sorted(missing_classes)}; "
        "cannot extend the warm-started forest on them"
    )

# Raising n_estimators from 20 to 30 makes the warm-started fit grow 10 new
# trees on the misclassified rows while keeping the 20 existing trees.
clf.set_params(n_estimators=30)
clf.fit(X_hard, y_hard)

Plot test data predictions again

In [None]:
# Test-set predictions of the forest after the additional trees were fitted.
pred_test = clf.predict(X_test)

extended_plot_title = "Survival, All Attacks, RF\nrate\n(# of instances)"
plot_confusion_matrix(y_test, pred_test, extended_plot_title)

In [None]:
import shap
from shap_tools import *


# Explain the current forest, i.e. the one that includes the additional trees.
exp = shap.TreeExplainer(clf)

# # Make sure that the ingested SHAP model (a TreeEnsemble object) makes the
# # same predictions as the original model
# assert np.abs(exp.model.predict(X_test_sample) - clf.predict_proba(X_test_sample)).max() < 1e-4

# # Make sure the SHAP values sum up to the model output (this is the local accuracy property)
# assert np.abs(exp.expected_value + exp.shap_values(X_test_sample).sum(1) - clf.predict_proba(X_test_sample)).max() < 1e-4


# Split the test rows by whether the explained forest classifies them
# correctly. pred_test holds the predictions of the current forest; `pred`
# was produced before the additional trees were added and would not match
# the model wrapped by `exp`.
shap_false = get_explanation(exp, X_test.loc[y_test != pred_test])
shap_true = get_explanation(exp, X_test.loc[y_test == pred_test])

plot_beeswarm(shap_false)

# shap_values = explainer(X_test.sample(1000, random_state=0))
# shap_values = shap.Explanation(shap_values[:, :, 1], feature_names=X_test.columns)

# shap.summary_plot(shap_values)


# shap.waterfall_plot(shap.Explanation(values=shap_values[int("which_class")][row], 
#                                          base_values=explainer.expected_value[int(which_class)], 
#                                          data=X_test.iloc[row],  # added this line
#                                          feature_names=X_test.columns.tolist()))
# shap.force_plot(explainer.expected_value[1], shap_values[1], features=X_test[:1], feature_names=X_test.columns)

# shap.plots.scatter(shap_values[:,"ones_w"])
# shap.summary_plot(shap_values[1], X_test.columns)

# plt.show()

In [None]:
# Plot the explanation built for the test rows whose prediction equals the label.
plot_beeswarm(shap_true)