In [None]:
from time import perf_counter
from time import time

from pandas import read_csv, DataFrame

from autoencoder.autoencoder import AnomalyDetector
from turtleIsolationForests.isolationForest import IsolationForest
from turtleIsolationForests.extendedIsolationForest import ExtendedIsolationForest
from turtleIsolationForests.sciForest import SCIsolationForest
from turtleIsolationForests.FBIF import FBIsolationForest
from turtleIsolationForests.preprocessFeatures import minmax_preprocess_features, minmax_preprocess_denoised_features, minmax_preprocess_z_features
from turtleIsolationForests.printResults import calc_confusion, calc_f1, print_by_result, get_auroc_value
from pipelineUtils.runPipeline import addZToData

In [None]:
# Raw train/test splits; the first CSV column is used as the row index.
train_dataframe = read_csv(
    "./eda_simple_classification/network_data_mod_train.csv",
    index_col=0,
)
test_dataframe = read_csv(
    "./eda_simple_classification/network_data_mod_test.csv",
    index_col=0,
)
X_train, X_test, train_labels, test_labels = minmax_preprocess_features(
    train_dataframe,
    test_dataframe,
)

# Separate training file for the autoencoder, preprocessed by its own helper.
robust_ae_train_dataframe = read_csv(
    './Robust_Autoencoder_Cleaned_Training_Normals.csv',
    index_col=0,
)
X_train_ae = minmax_preprocess_denoised_features(robust_ae_train_dataframe)

# NumPy versions of the labels: used later for boolean masking and by the
# metric helpers.
test_labels_np = test_labels.to_numpy()
train_labels_np = train_labels.to_numpy()

# Mean of the training labels, i.e. the anomaly fraction
# (assumes labels are 0/1 with 1 = anomaly -- TODO confirm).
contamination = sum(train_labels) / len(train_labels)
contamination

In [None]:
# Number of epochs passed to each autoencoder's pipeline_fit call.
epochs = 1200
# Number of independently trained autoencoders, and of pipeline runs per forest.
repeats = 3

In [None]:
def initAutoenc():
    """Return a new AnomalyDetector compiled with the 'adam' optimizer and 'mae' loss."""
    detector = AnomalyDetector()
    detector.compile(loss='mae', optimizer='adam')
    return detector

# Build every detector first, then fit them one after another on the features
# produced by minmax_preprocess_denoised_features.
autoencs = [initAutoenc() for _ in range(repeats)]
for autoenc in autoencs:
    autoenc.pipeline_fit(X_train_ae, epochs=epochs)




In [None]:
from tensorflow.keras.losses import mae
from tensorflow.math import less
import numpy as np
def pipeline_predict(autoenc, test_data, contamination, recall_boost=0.3):
    """Score ``test_data`` with the autoencoder and flag its highest-loss rows.

    The anomaly score of a row is the ``mae`` between the row and its
    reconstruction. A row is flagged when its score is strictly greater than
    a percentile threshold computed over the scores of ``test_data`` itself,
    so the threshold is relative to the batch being scored.

    Parameters
    ----------
    autoenc
        Model exposing ``predict(test_data)`` that returns reconstructions.
    test_data
        Samples to score, in a form accepted by ``autoenc.predict`` and ``mae``.
    contamination : float
        Expected anomaly fraction, in [0, 1].
    recall_boost : float, default 0.3
        Corrective factor that raises the flagged fraction to
        ``contamination + recall_boost * (1 - contamination)`` in order to
        increase autoencoder recall. ``0`` flags exactly ``contamination`` of
        the rows. The resulting fraction must stay within [0, 1], otherwise
        ``np.percentile`` rejects the requested percentile.

    Returns
    -------
    tuple
        ``(scores, predictions)`` as NumPy arrays: the per-row losses and the
        boolean flags (True = anomaly).
    """
    reconstructions = autoenc.predict(test_data)
    test_loss = mae(test_data, reconstructions)
    # Kept in a separate name so the caller-supplied contamination is not
    # silently overwritten inside the function.
    flagged_fraction = contamination + recall_boost * (1 - contamination)
    threshold = np.percentile(test_loss, 100 - 100 * flagged_fraction)
    # threshold < loss -> True, i.e. an anomaly (1 = anomaly, same as the data labels).
    predictions = less(threshold, test_loss)
    return test_loss.numpy(), predictions.numpy()

def timed_pipeline_predict(autoenc):
    """Run ``pipeline_predict`` on the notebook-level ``X_test`` and time the call.

    Uses the notebook-level ``contamination`` as the expected anomaly fraction.

    Returns
    -------
    tuple
        ``(scores, predictions, seconds)`` where ``seconds`` is the elapsed
        time of the prediction call.
    """
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # duration cannot be distorted by system clock adjustments.
    started = perf_counter()
    ae_scores, ae_predictions = pipeline_predict(autoenc, X_test, contamination=contamination)
    ae_time = perf_counter() - started
    return ae_scores, ae_predictions, ae_time

# Stage 1: one (scores, predictions, seconds) tuple per trained autoencoder,
# in the same order as `autoencs`.
autoenc_score_predictions_time = list(map(timed_pipeline_predict, autoencs))

# Column count of the original feature matrix. addZAndScale treats every column
# from this position onward as the part to rescale after addZToData.
pre_Z_cols = X_train.shape[1]
def addZAndScale(i):
    """Build the stage-2 (forest) train and test inputs for the ``i``-th autoencoder.

    The training input is all of ``X_train`` passed through ``addZToData``.
    The test input is only the ``X_test`` rows that the ``i``-th autoencoder
    flagged in stage 1, passed through the same helper. In both results the
    columns from position ``pre_Z_cols`` onward are overwritten with the output
    of ``minmax_preprocess_z_features``.

    NOTE(review): train and test Z-columns are scaled by separate calls;
    confirm that this is intended rather than reusing the train scaling.

    Returns
    -------
    tuple
        ``(X_train_forest, X_test_forest)``.
    """
    detector = autoencs[i]
    stage1_flags = autoenc_score_predictions_time[i][1]
    # Same selection as `.iloc[:, pre_Z_cols:]`, named once and reused.
    z_block = (slice(None), slice(pre_Z_cols, None))

    train_with_z = addZToData(X_train, detector)
    train_with_z.iloc[z_block] = minmax_preprocess_z_features(train_with_z.iloc[z_block])

    test_with_z = addZToData(X_test[stage1_flags], detector)
    test_with_z.iloc[z_block] = minmax_preprocess_z_features(test_with_z.iloc[z_block])

    return train_with_z, test_with_z

# One (train, test) forest-input pair per autoencoder, indexed like `autoencs`.
X_data_forests = list(map(addZAndScale, range(repeats)))

In [None]:
def run_pipeline_with_pretrained_autoenc(i, iForest, intermediatePrint = True):
    """Run and report the two-stage pipeline for the ``i``-th pretrained autoencoder.

    Stage 1 reuses the cached autoencoder scores, predictions and timing from
    ``autoenc_score_predictions_time[i]``. Stage 2 fits ``iForest`` on the
    ``i``-th forest training input and predicts only on the test rows that
    stage 1 flagged.

    Combined confusion counts: rows that stage 1 did not flag never reach the
    forest, so the autoencoder's FN/TN counts are final. Flagged rows are
    re-judged by the forest, so TA/FA come from the forest alone and its FN/TN
    are added to the autoencoder's.

    Parameters
    ----------
    i : int
        Index into ``autoenc_score_predictions_time`` and ``X_data_forests``.
    iForest
        Forest object exposing ``fit(X, labels)`` and ``predict(X, labels)``
        returning ``(scores, predictions)``. It is refit on every call.
    intermediatePrint : bool, default True
        When true, also print per-stage metrics, AUROC and prediction time.

    Returns
    -------
    None
        Results are printed only.
    """
    ae_scores, ae_predictions, ae_time = autoenc_score_predictions_time[i]
    ae_TA, ae_FA, ae_FN, ae_TN = calc_confusion(ae_predictions, test_labels_np)
    if intermediatePrint:
        _print_stage_results(
            "Autoencoder Results",
            (ae_TA, ae_FA, ae_FN, ae_TN),
            ae_scores,
            test_labels_np,
            ae_time,
        )

    X_train_forest, X_test_forest = X_data_forests[i]
    iForest.fit(X_train_forest, train_labels_np)
    # Labels of only those test rows that stage 1 passed on to the forest.
    test_labels_forest_np = test_labels_np[ae_predictions]
    # perf_counter is monotonic, so the duration is immune to clock adjustments.
    start_time = perf_counter()
    if_scores, if_predictions = iForest.predict(X_test_forest, test_labels_forest_np)
    if_time = perf_counter() - start_time
    if_TA, if_FA, if_FN, if_TN = calc_confusion(if_predictions, test_labels_forest_np)
    if intermediatePrint:
        _print_stage_results(
            "Isolation Forest Results",
            (if_TA, if_FA, if_FN, if_TN),
            if_scores,
            test_labels_forest_np,
            if_time,
        )

    TA = if_TA
    FA = if_FA
    FN = ae_FN + if_FN
    TN = ae_TN + if_TN
    precision, recall, f1 = calc_f1(TA, FA, FN, TN)
    print("Pipeline Results:")
    print_by_result(TA, FA, FN, TN, precision, recall, f1)
    print("Stage 1 prediction time: " + str(ae_time))
    # Vectorised mean of the boolean flags. The printed value is a fraction
    # in [0, 1]; the label text is kept unchanged for output compatibility.
    print("Percentage of data passed to stage 2: " + str(ae_predictions.mean()))
    print("Stage 2 prediction time: " + str(if_time))


def _print_stage_results(title, confusion, scores, labels, elapsed):
    """Print one stage's title, confusion-based metrics, AUROC and prediction time."""
    TA, FA, FN, TN = confusion
    print(title)
    precision, recall, f1 = calc_f1(TA, FA, FN, TN)
    auroc = get_auroc_value(scores, labels)
    print_by_result(TA, FA, FN, TN, precision, recall, f1)
    print("auroc: " + str(auroc))
    print("test set prediction time: " + str(elapsed))
    print("")

In [None]:
def run_pipelines_with_pretrained_autoencs(runs, iForest):
    """Run the two-stage pipeline for autoencoder indices ``0 .. runs - 1``.

    The same ``iForest`` object is handed to every run.
    """
    for run_index in range(runs):
        run_pipeline_with_pretrained_autoenc(run_index, iForest)

In [None]:
# IsolationForest as the stage-2 detector; no fixed seed is passed.
iForest = IsolationForest(contamination=contamination, random_state=None)
run_pipelines_with_pretrained_autoencs(repeats, iForest)

In [None]:
# ExtendedIsolationForest as the stage-2 detector; no fixed seed is passed.
eif = ExtendedIsolationForest(contamination=contamination, random_state=None)
run_pipelines_with_pretrained_autoencs(repeats, eif)

In [None]:
# SCIsolationForest as the stage-2 detector; no fixed seed is passed.
scif = SCIsolationForest(
    num_hyperplanes_per_split=10,
    num_attributes_per_split=5,
    contamination=contamination,
    random_state=None,
)
run_pipelines_with_pretrained_autoencs(repeats, scif)

In [None]:
# FBIsolationForest as the stage-2 detector; no fixed seed is passed.
fbif = FBIsolationForest(
    c1=1.0,
    c2=1.0,
    contamination=contamination,
    random_state=None,
)
run_pipelines_with_pretrained_autoencs(repeats, fbif)