In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
from timeit import timeit
from IPython.display import display
import matplotlib
# matplotlib.use("pgf")
from matplotlib import pyplot as plt
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np
from collections import Counter
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import cohen_kappa_score, confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from joblib import dump, load
from imblearn.under_sampling import RandomUnderSampler
import shap
import seaborn as sns

In [None]:
from compiledataset import load_dataset, compile_dataset

# Root folder that holds one sub-directory per dataset. It can be overridden
# with the DATASETS_DIR environment variable so the notebook runs on other
# machines; the fallback is the path this notebook was originally written with.
PATH = os.environ.get("DATASETS_DIR", "/home/hampus/miun/master_thesis/Datasets")

# Maps a dataset name to its rows as a list of per-row dicts; this is the
# structure handed to compile_dataset below.
datasets = {}

# dataset: pd.DataFrame = load_dataset(PATH + "/ORNL", "data_a.csv")
# dataset["remarks"] = "No DLC available"
# datasets["ROAD"] = dataset.to_dict("records")

dataset: pd.DataFrame = load_dataset(os.path.join(PATH, "Survival"), "data.csv")
dataset["remarks"] = "-"
datasets["Survival"] = dataset.to_dict("records")

# dataset: pd.DataFrame = load_dataset(PATH + "/Hisingen", "data.csv")
# dataset["remarks"] = "-"
# datasets["Hisingen"] = dataset.to_dict("records")


df = compile_dataset(datasets)
# Reassign instead of dropping in place so the cell has no hidden mutation;
# errors="ignore" tolerates any of these columns being absent.
df = df.drop(columns=["data", "data_dec", "ID", "DLC", "t"], errors="ignore")
# df = df.drop(columns=["d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"], errors="ignore")

dataset = None # Release memory, as it isn't used for now
datasets = None

display(df)

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler

# Columns that carry metadata or the target; they are excluded from scaling.
non_feature_columns = {"name", "class", "dataset", "type", "Label"}

# Walk the frame's own column order (rather than going through a set) so the
# feature list is identical on every run.
feature_columns = [col for col in df.columns if col not in non_feature_columns]

# RobustScaler centers and scales every column independently, so a single fit
# over all feature columns gives the same result as refitting one scaler per
# column. The statistics come from the rows labelled 0 only and are then
# applied to every row.
# NOTE(review): the scaler is fitted before the train/test split, so rows that
# later land in the test set contribute to the scaling statistics - confirm
# this is acceptable for the evaluation.
scaler = RobustScaler().fit(df.loc[df["Label"] == 0, feature_columns])
df[feature_columns] = scaler.transform(df[feature_columns])

display(df)

In [None]:
# Separate the target column from the remaining columns.
y_train = df["Label"]
X_train = df.drop(columns="Label")

df = None # Release memory

# 70/30 shuffled split with a fixed seed. The split is stratified on the
# "name" column, so each distinct value of "name" keeps its proportion in both
# the training and the test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=0,
    shuffle=True,
    stratify=X_train["name"],
)

In [None]:
# Bookkeeping columns that must not be fed to the model ("name" was needed up
# to this point because the split is stratified on it).
metadata_columns = ["type", "dataset", "name", "class"]

# Reassign instead of dropping in place, and tolerate already-removed columns,
# so that re-running this cell is a no-op instead of a KeyError.
X_train = X_train.drop(columns=metadata_columns, errors="ignore")
X_test = X_test.drop(columns=metadata_columns, errors="ignore")

In [None]:
# Partition each split by label (0 = normal, 1 = anomaly).
# The masks are converted to plain NumPy arrays so rows are selected by
# position. A label-based lookup (.loc[index_labels]) would return every row
# sharing a label once per occurrence, multiplying rows if the index ever
# contains repeated labels.
X_train_normal = X_train.loc[(y_train == 0).to_numpy()]
X_train_anomaly = X_train.loc[(y_train == 1).to_numpy()]

X_test_normal = X_test.loc[(y_test == 0).to_numpy()]
X_test_anomaly = X_test.loc[(y_test == 1).to_numpy()]

In [None]:
# Draw the first three normal training rows as line plots, one line per row,
# with the feature columns along the x-axis.
for row_position in range(3):
    plt.plot(X_train_normal.iloc[row_position])
plt.title("Normal Data")
plt.show()

In [None]:
# Draw the first three anomalous training rows as line plots, one line per
# row, with the feature columns along the x-axis.
for row_position in range(3):
    plt.plot(X_train_anomaly.iloc[row_position])
plt.title("Anomaly Data")
plt.show()

Autoencoder

In [None]:
import keras
from keras import Sequential, layers, callbacks
from keras.models import Model

class AutoEncoder(Model):
    """Symmetric fully connected autoencoder with a 2-unit bottleneck.

    The encoder narrows the input through Dense layers of 10, 7, 5 and 2 units;
    the decoder widens it again through 5, 7 and 10 units and ends in an output
    layer of ``n_features`` units.

    Args:
        n_features: Width of the output layer. It has to equal the number of
            input columns for the reconstruction loss to be computable.
            Defaults to 13, the value that was previously hard-coded.
        output_activation: Activation of the output layer. Defaults to
            "sigmoid", the value that was previously hard-coded.

    Raises:
        ValueError: If ``n_features`` is smaller than 1.
    """

    def __init__(self, *, n_features=13, output_activation="sigmoid"):
        super().__init__()
        if n_features < 1:
            raise ValueError("n_features must be a positive integer")

        self.encoder = Sequential(
            [layers.Dense(units, activation="relu") for units in (10, 7, 5, 2)]
        )

        # NOTE(review): a sigmoid output can only produce values in (0, 1),
        # while the notebook feeds features transformed by RobustScaler, which
        # does not bound them to that range. Consider
        # output_activation="linear"; this would change the reconstruction
        # errors and therefore the thresholds chosen further down.
        hidden_decoder_layers = [
            layers.Dense(units, activation="relu") for units in (5, 7, 10)
        ]
        output_layer = layers.Dense(n_features, activation=output_activation)
        self.decoder = Sequential(hidden_decoder_layers + [output_layer])

    def call(self, x):
        """Encode ``x`` to the bottleneck and return its reconstruction."""
        return self.decoder(self.encoder(x))

# input_img = keras.Input(shape=(6,))
# encoded = layers.Dense(4, activation='relu')(input_img)
# encoded = layers.Dense(2, activation='relu')(encoded)
# encoded = layers.Dense(1, activation='relu')(encoded)

# decoded = layers.Dense(2, activation='relu')(encoded)
# decoded = layers.Dense(4, activation='relu')(decoded)
# decoded = layers.Dense(6, activation='sigmoid')(decoded)

# autoencoder = keras.Model(input_img, decoded)
# autoencoder.compile(optimizer='adam', loss='msle')

# Build the autoencoder and train it to reproduce its own input, using the
# normal (label 0) rows only.
model = AutoEncoder()
model.compile(optimizer='adam', loss="mean_squared_error")

# Stop once val_loss has failed to improve by at least 1e-4 for 10 consecutive
# epochs, and roll back to the weights of the best epoch.
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0.0001,
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

# NOTE(review): the validation data is the normal part of the test split, so
# early stopping and best-weight selection see data that is later used for
# evaluation - confirm this is intended.
history = model.fit(
    x=X_train_normal,
    y=X_train_normal,
    batch_size=120,
    epochs=25,
    shuffle=True,
    validation_data=(X_test_normal, X_test_normal),
    callbacks=[early_stopping],
)

# history = autoencoder.fit(X_train, X_train,
#     epochs=100,
#     batch_size=256,
#     shuffle=True,
#     validation_data=(X_test, X_test),
#     callbacks=[early_stop]
# ).history

In [None]:
# Training loss and validation loss per epoch, as recorded by model.fit.
for history_key, legend_label in (('loss', 'Train'), ('val_loss', 'Test')):
    plt.plot(history.history[history_key], linewidth=2, label=legend_label)
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(loc='upper right')
plt.show()

In [None]:
# Reconstruct every test row and score it by its mean squared reconstruction
# error across the feature columns.
X_test_pred = model.predict(X_test)
mse = ((X_test - X_test_pred) ** 2).mean(axis=1)

# One row per test sample: its reconstruction error next to its true label.
error_df = pd.DataFrame({'Reconstruction_error': mse, 'True_class': y_test})

In [None]:
# Decision threshold on the reconstruction error. Kept equal to the value used
# for the predictions in the evaluation cell, so the red line drawn here is the
# boundary that actually separates predicted normal from predicted anomaly.
threshold_fixed = 52

fig, ax = plt.subplots()
# One marker series per true class (1 = anomaly, anything else = normal).
for true_class, class_rows in error_df.groupby('True_class'):
    ax.plot(class_rows.index, class_rows["Reconstruction_error"],
            marker='o', ms=3.5, linestyle='',
            label="Anomaly" if true_class == 1 else "Normal")
# axhline spans the full width of the axes regardless of the final x-limits,
# unlike a line drawn between the x-limits read before autoscaling settles.
ax.axhline(threshold_fixed, color="r", zorder=100, label='Threshold')
ax.legend()
ax.set_yscale('log')
ax.set_title("Reconstruction error for normal and anomaly data")
ax.set_ylabel("Reconstruction error")
ax.set_xlabel("Data point index")
plt.show()

In [None]:
from plot_tools import plot_confusion_matrix

# A sample is flagged as an anomaly (1) when its reconstruction error is
# strictly greater than the threshold, otherwise it is predicted normal (0).
threshold_fixed = 52
y_pred = (error_df["Reconstruction_error"] > threshold_fixed).astype(int).tolist()
error_df['pred'] = y_pred

plot_confusion_matrix(y_test, y_pred, "Confusion matrix for the Autoencoder")

# Both metrics are single scalars for one prediction run, so there is no
# spread to report. The previous "+/-" part always printed 0 and relied on the
# return value being a NumPy scalar exposing .mean()/.std().
f1_scores = f1_score(y_test, y_pred, average='weighted')
print("Testing F1:  %0.4f" % f1_scores)

kappa_scores = cohen_kappa_score(y_test, y_pred)
print("Kappa score:  %0.4f" % kappa_scores)

# # print Accuracy, precision and recall
# print(" Accuracy: ",accuracy_score(error_df['True_class'], error_df['pred']))
# print(" Recall: ",recall_score(error_df['True_class'], error_df['pred']))
# print(" Precision: ",precision_score(error_df['True_class'], error_df['pred']))