In [None]:
from pandas import read_csv
from autoencoder.autoencoder import AnomalyDetector
from turtleIsolationForests.preprocessFeatures import minmax_preprocess_features
from turtleIsolationForests.printResults import calc_confusion
from tensorflow.keras.losses import mae
from matplotlib import pyplot as plt
import numpy as np

In [None]:
# Read the pre-split train/test CSVs; the first CSV column becomes the index.
test_dataframe = read_csv("./eda_simple_classification/network_data_mod_test.csv", index_col=0)
train_dataframe = read_csv("./eda_simple_classification/network_data_mod_train.csv", index_col=0)

# Preprocess both frames in one call; the helper returns features and labels
# for each split.
(X_train, X_test,
 train_labels, test_labels) = minmax_preprocess_features(train_dataframe, test_dataframe)

# The label vectors are used directly as boolean row masks.
# NOTE(review): judging by the variable names, True marks a normal point and
# False an anomaly -- confirm against minmax_preprocess_features.
X_test_anomalies = X_test[~test_labels]
X_test_normals = X_test[test_labels]
# The autoencoder is trained only on the rows selected by train_labels.
X_train_ae = X_train[train_labels]

In [None]:
# Build the project's autoencoder and configure training with the Adam
# optimizer and a mean-absolute-error reconstruction loss.
autoenc = AnomalyDetector()
autoenc.compile(loss='mae', optimizer='adam')

In [None]:
# Per-epoch histories; the training loops append one value to each list per epoch.
losses, val_losses = [], []                    # training / validation loss reported by fit()
test_normal_losses, test_anomaly_losses = [], []  # mean test reconstruction loss per group

In [None]:
# Train one epoch per fit() call so the test reconstruction loss can be
# recorded after every epoch. `i` is the number of epochs completed so far and
# is deliberately left as a notebook global: the later training cell continues
# counting from it.
i = 0
while i < 5000:
    next_epoch = i + 1
    # Input and target are the same array: the model learns to reconstruct it.
    history = autoenc.fit(
        X_train_ae,
        X_train_ae,
        initial_epoch=i,
        epochs=next_epoch,
        validation_split=0.2,
        shuffle=True,
    )

    # Reconstruction error on the test rows selected as normal.
    reconstructions_n = autoenc.predict(X_test_normals)
    test_loss_normal = mae(X_test_normals, reconstructions_n)
    test_loss_normal_m = np.mean(test_loss_normal)

    # Reconstruction error on the remaining (anomalous) test rows.
    reconstructions_a = autoenc.predict(X_test_anomalies)
    test_loss_anomaly = mae(X_test_anomalies, reconstructions_a)
    test_loss_anomaly_m = np.mean(test_loss_anomaly)

    print(f"test loss on normals: {test_loss_normal_m!s}, test loss on anomalies: {test_loss_anomaly_m!s}")

    # Each fit() call ran exactly one epoch, so its history holds one entry.
    losses.append(history.history["loss"][0])
    val_losses.append(history.history["val_loss"][0])
    test_normal_losses.append(test_loss_normal_m)
    test_anomaly_losses.append(test_loss_anomaly_m)
    i = next_epoch

In [None]:
# Overlay the four per-epoch histories on a single set of axes.
curves = [
    ("Loss", losses),
    ("Validation Loss", val_losses),
    ("Test Loss Normals", test_normal_losses),
    ("Test Loss Anomalies", test_anomaly_losses),
]
for curve_label, curve_values in curves:
    plt.plot(curve_values, label=curve_label)

plt.title("Training, Validation, and Test Loss over epochs")
plt.xlabel("Epochs")
plt.ylabel("Loss")
# Fixed y-range; values above 0.08 are clipped from view.
plt.ylim(0, .08)
plt.legend()
plt.show()

In [None]:
# Compare the distributions of the test losses left behind by the most recent
# training iteration (test_loss_normal / test_loss_anomaly are overwritten on
# every epoch of the training loop).
plt.violinplot([test_loss_normal, test_loss_anomaly], showmeans=True, showmedians=False)
plt.title("Distributions of test losses on normal and anomalous points at epoch 5000")
# Name each violin on the x axis (violinplot's default positions are 1 and 2).
# A label-only legend is not useful here: the violin bodies share one colour
# by default, so the legend entries would be indistinguishable.
plt.xticks([1, 2], ["test loss normal", "test loss anomaly"])
plt.ylabel("Loss")
# The original referenced plt.show without calling it, so the cell's last
# expression was the function object instead of a rendered figure.
plt.show()

In [None]:
from time import time
from turtleIsolationForests.printResults import calc_confusion, get_auroc_value, calc_f1, print_by_result

# Score the whole test set with the autoencoder as trained so far and report
# confusion counts, precision/recall/F1, AUROC and prediction time.
test_labels_np = test_labels.to_numpy()

# Decision threshold: midpoint between the latest mean test loss on normals and
# on anomalies. Both list entries are already scalars (each is an np.mean
# result), so the np.median() wrappers that used to surround them were no-ops
# and have been dropped; the threshold value is unchanged.
# NOTE(review): if a midpoint between the *medians* of the per-sample losses
# was intended, use np.median(test_loss_normal) / np.median(test_loss_anomaly).
# NOTE(review): this threshold is derived from labelled test data and then
# evaluated on that same test set, so the metrics below are likely optimistic.
# An alternative tried previously was the last training loss: losses[-1].
autoenc.threshold = (test_normal_losses[-1] + test_anomaly_losses[-1]) / 2

# Start the clock after the threshold is set so the figure printed as
# "test set prediction time" covers only the prediction call.
start_time = time()
ae_scores, ae_predictions = autoenc.pipeline_predict(X_test, test_labels_np)
ae_time = time() - start_time

ae_TA, ae_FA, ae_FN, ae_TN = calc_confusion(ae_predictions, test_labels_np)
ae_auroc = get_auroc_value(ae_scores, test_labels_np)
ae_precision, ae_recall, ae_f1 = calc_f1(ae_TA, ae_FA, ae_FN, ae_TN)

print("Autoencoder Results")
print_by_result(ae_TA, ae_FA, ae_FN, ae_TN, ae_precision, ae_recall, ae_f1)
print("auroc: " + str(ae_auroc))
print("test set prediction time: " + str(ae_time))

In [None]:
# Continue training from wherever the global epoch counter `i` was left, one
# epoch per fit() call, until 10000 epochs have been completed in total. The
# test reconstruction loss is recorded after every epoch.
while i < 10000:
    next_epoch = i + 1
    # Input and target are the same array: the model learns to reconstruct it.
    history = autoenc.fit(
        X_train_ae,
        X_train_ae,
        initial_epoch=i,
        epochs=next_epoch,
        validation_split=0.2,
        shuffle=True,
    )

    # Reconstruction error on the test rows selected as normal.
    reconstructions_n = autoenc.predict(X_test_normals)
    test_loss_normal = mae(X_test_normals, reconstructions_n)
    test_loss_normal_m = np.mean(test_loss_normal)

    # Reconstruction error on the remaining (anomalous) test rows.
    reconstructions_a = autoenc.predict(X_test_anomalies)
    test_loss_anomaly = mae(X_test_anomalies, reconstructions_a)
    test_loss_anomaly_m = np.mean(test_loss_anomaly)

    print(f"test loss on normals: {test_loss_normal_m!s}, test loss on anomalies: {test_loss_anomaly_m!s}")

    # Each fit() call ran exactly one epoch, so its history holds one entry.
    losses.append(history.history["loss"][0])
    val_losses.append(history.history["val_loss"][0])
    test_normal_losses.append(test_loss_normal_m)
    test_anomaly_losses.append(test_loss_anomaly_m)
    i = next_epoch

In [None]:
# Overlay the four per-epoch histories (now covering every epoch trained so
# far) on a single set of axes.
curves = [
    ("Loss", losses),
    ("Validation Loss", val_losses),
    ("Test Loss Normals", test_normal_losses),
    ("Test Loss Anomalies", test_anomaly_losses),
]
for curve_label, curve_values in curves:
    plt.plot(curve_values, label=curve_label)

plt.title("Training, Validation, and Test Loss over epochs")
plt.xlabel("Epochs")
plt.ylabel("Loss")
# Fixed y-range; values above 0.08 are clipped from view.
plt.ylim(0, .08)
plt.legend()
plt.show()

In [None]:
# Compare the distributions of the test losses left behind by the most recent
# training iteration (test_loss_normal / test_loss_anomaly are overwritten on
# every epoch of the training loop).
plt.violinplot([test_loss_normal, test_loss_anomaly], showmeans=True, showmedians=False)
plt.title("Distributions of test losses on normal and anomalous points at epoch 10000")
# Name each violin on the x axis (violinplot's default positions are 1 and 2).
# A label-only legend is not useful here: the violin bodies share one colour
# by default, so the legend entries would be indistinguishable.
plt.xticks([1, 2], ["test loss normal", "test loss anomaly"])
plt.ylabel("Loss")
# The original referenced plt.show without calling it, so the cell's last
# expression was the function object instead of a rendered figure.
plt.show()

In [None]:
# Re-score the whole test set with the further-trained autoencoder and report
# confusion counts, precision/recall/F1, AUROC and prediction time.

# Decision threshold: the most recent training loss. losses[-1] is already a
# scalar, so the np.mean() that used to wrap it was a no-op and was dropped.
# NOTE(review): the earlier evaluation cell uses a different rule (midpoint of
# the latest mean test losses on normals and anomalies), so the two result
# sets are not directly comparable -- confirm which threshold is intended.
autoenc.threshold = losses[-1]

# Start the clock after the threshold is set so the figure printed as
# "test set prediction time" covers only the prediction call.
start_time = time()
ae_scores, ae_predictions = autoenc.pipeline_predict(X_test, test_labels_np)
ae_time = time() - start_time

ae_TA, ae_FA, ae_FN, ae_TN = calc_confusion(ae_predictions, test_labels_np)
ae_auroc = get_auroc_value(ae_scores, test_labels_np)
ae_precision, ae_recall, ae_f1 = calc_f1(ae_TA, ae_FA, ae_FN, ae_TN)

print("Autoencoder Results")
print_by_result(ae_TA, ae_FA, ae_FN, ae_TN, ae_precision, ae_recall, ae_f1)
print("auroc: " + str(ae_auroc))
print("test set prediction time: " + str(ae_time))