In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

In [None]:
import autoencoder.aecExtraFeatures as Z_calculations

def addZToPrediction(model, data_point):
    """Return `data_point` with its autoencoder-derived Z features appended.

    The input is passed through `model.encoder` and `model.decoder`; the input,
    the reconstruction and the latent code are handed to
    `Z_calculations.getZVector`, and its result is concatenated to the input
    along axis 1.

    Args:
        model: object exposing `encoder` and `decoder` callables.
        data_point: input batch. Concatenation on axis 1 requires rank >= 2;
            presumably a single row of shape (1, n_features) -- confirm
            against callers.

    Returns:
        A float32 tensor holding the input columns followed by the Z columns.
    """
    latent = model.encoder(data_point)
    decoded = model.decoder(latent)

    # The Z vector is wrapped in a one-element list so that the converted
    # tensor gains a leading axis before the axis-1 concatenation.
    z_tensor = tf.convert_to_tensor(
        [Z_calculations.getZVector(data_point, decoded, latent)],
        dtype=tf.float32,
    )
    point_tensor = tf.convert_to_tensor(data_point, dtype=tf.float32)

    return tf.concat([point_tensor, z_tensor], axis=1)

In [None]:
def isAnomaly(data_point, model_1, model_2, threshold):
    """Two-stage anomaly check: autoencoder screen, then second-model confirmation.

    Args:
        data_point: a single input row (batch of one). The `if not ...` test
            below needs a single boolean, so batches of more than one row are
            not supported.
        model_1: autoencoder; called directly to reconstruct `data_point` and
            passed to `addZToPrediction` to build the augmented row.
        model_2: second-stage model; only its `predict` method is used.
        threshold: reconstruction-error cut-off for the autoencoder stage.

    Returns:
        False when the autoencoder's reconstruction error does not exceed
        `threshold`; otherwise whatever `model_2.predict` returns for the
        Z-augmented row.
    """
    reconstruction_error = tf.keras.losses.mae(model_1(data_point), data_point)

    # A point is suspicious when its reconstruction error EXCEEDS the
    # threshold (threshold < error) -- the same direction predict() uses.
    flagged_by_autoencoder = tf.math.less(threshold, reconstruction_error)

    # If the autoencoder sees nothing out of the ordinary, stop here.
    if not flagged_by_autoencoder:
        return False

    data_point = addZToPrediction(model_1, data_point)

    # The autoencoder saw something unusual: let the second model confirm.
    # NOTE(review): the isolation forests in this notebook are called as
    # predict(data, labels) -- confirm that model_2 accepts a single argument.
    return model_2.predict(data_point)

In [None]:
from turtleIsolationForests.preprocessFeatures import preprocess_features

# Load the train and test CSVs; the first CSV column becomes the index.
train_dataframe = pd.read_csv("eda_simple_classification/network_data_mod_train.csv", index_col=0)
test_dataframe = pd.read_csv("eda_simple_classification/network_data_mod_test.csv", index_col=0)

# preprocess_features is a project helper; its four results are unpacked as
# the train/test features and the train/test labels.
train_data, test_data, train_labels, test_labels = preprocess_features(train_dataframe, test_dataframe)

In [None]:
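# Optional: uncomment the next line to keep only the first 1000 rows of each split for a quick smoke run.
#train_data, test_data, train_labels, test_labels = train_data[:1000], test_data[:1000], train_labels[:1000], test_labels[:1000]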

In [None]:
# Sanity check: print the length of each split and of its label vector,
# in the order train data, train labels, test data, test labels.
for split in (train_data, train_labels, test_data, test_labels):
    print(len(split))

In [None]:
# Convert the pandas label objects to plain NumPy arrays.
np_train_labels = train_labels.to_numpy()
np_test_labels = test_labels.to_numpy()

In [None]:
# Convert the pandas feature objects to plain NumPy arrays.
np_train_data = train_data.to_numpy()
np_test_data = test_data.to_numpy()

In [None]:
# Cast the features to float32 TensorFlow tensors. Despite the `np_` prefix,
# these names hold tf tensors from here on.
np_train_data = tf.cast(np_train_data, tf.float32)
np_test_data = tf.cast(np_test_data, tf.float32)

In [None]:
# Cast the label vectors to boolean so they can be used directly as row masks.
np_train_labels = np_train_labels.astype(bool)
np_test_labels = np_test_labels.astype(bool)

# Rows whose label is True are collected as the "normal" sets ...
# NOTE(review): further down, the same test labels are stored in a column
# named 'is_anomaly', and contamination is computed as the share of training
# labels equal to 0 -- confirm which label value actually means "normal".
normal_train_data = np_train_data[np_train_labels]
normal_test_data = np_test_data[np_test_labels]

# ... and rows whose label is False as the "anomalous" sets.
anomalous_train_data = np_train_data[~np_train_labels]
anomalous_test_data = np_test_data[~np_test_labels]

In [None]:
from autoencoder.autoencoder import AnomalyDetector
# Instantiate the project's autoencoder model with its default arguments.
autoencoder = AnomalyDetector()

In [None]:
# Train with the Adam optimizer and mean-absolute-error loss -- the same error
# measure that is later used for the anomaly threshold.
autoencoder.compile(optimizer='adam', loss='mae')

In [None]:
# Display the encoder's layer list as the cell output (inspection only).
autoencoder.encoder.layers

In [None]:
# Fit the autoencoder to reproduce its own input, using only the rows selected
# as normal. Validation runs on the float32 tensor `np_test_data`, i.e. the
# same conversion the training input went through, rather than on the raw
# `test_data` DataFrame. The validation set contains every test row, not only
# the normal ones.
history = autoencoder.fit(normal_train_data, normal_train_data,
          epochs=200,
          validation_data=(np_test_data, np_test_data),
          shuffle=True)

In [None]:
# Training vs. validation loss per epoch, read from the Keras History object.
fig, ax = plt.subplots()
ax.plot(history.history["loss"])
ax.plot(history.history["val_loss"])
ax.set_title("Training Loss & Validation Loss over epochs")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
# ax.set_ylim([0, .0008])  # uncomment to zoom in on the low-loss region
ax.legend(["Loss", "Validation Loss"])
plt.show()

In [None]:
# Reconstruct the normal training rows and measure how far each
# reconstruction is from its input.
reconstructions = autoencoder.predict(normal_train_data)
train_loss = tf.keras.losses.mae(reconstructions, normal_train_data)

# Threshold = mean training error plus a multiple of its standard deviation.
# The multiple is currently 0, so the threshold is simply the mean.
loss_mean, loss_std = np.mean(train_loss), np.std(train_loss)
threshold = loss_mean + 0 * loss_std
print("Threshold: ", threshold)

In [None]:
def predict(model, data, threshold):
    """Flag every row of `data` whose reconstruction error exceeds `threshold`.

    Args:
        model: callable that reconstructs `data`.
        data: input batch fed to `model`.
        threshold: error cut-off.

    Returns:
        Boolean tensor, True where `threshold` is strictly less than the mean
        absolute error between the row and its reconstruction (i.e. the row
        is predicted anomalous).
    """
    reconstruction_error = tf.keras.losses.mae(data, model(data))
    return tf.math.less(threshold, reconstruction_error)

def getLoss(model, data):
    """Return the mean absolute error between `data` and `model(data)`.

    The error is computed with `tf.keras.losses.mae`, which reduces over the
    last axis, so the result serves as an anomaly score per input row.
    """
    return tf.keras.losses.mae(data, model(data))

def print_stats(predictions, labels):
    """Print accuracy, precision and recall of `predictions` against `labels`.

    Each metric is called as metric(labels, predictions) and printed on its
    own line, in the order accuracy, precision, recall.
    """
    report = (
        ("Accuracy = {}", accuracy_score),
        ("Precision = {}", precision_score),
        ("Recall = {}", recall_score),
    )
    for template, metric in report:
        print(template.format(metric(labels, predictions)))

In [None]:
%%timeit
%%capture

# Timed call only: the returned predictions are discarded here and recomputed
# in the following cell.
predict(autoencoder, np_test_data, threshold)

In [None]:
%%capture

# Boolean flag per test row: True where the reconstruction error exceeds the
# threshold (predicted anomalous).
test_predictions = predict(autoencoder, np_test_data, threshold)

### Ones are anomalous predictions

In [None]:
# Number of test rows the autoencoder flagged as anomalous (prediction == 1).
anomalous_positions = np.where(test_predictions.numpy() == 1)[0]
print(len(anomalous_positions))

### Zeros are normal predictions

In [None]:
# Number of test rows the autoencoder left unflagged (prediction == 0).
normal_positions = np.where(test_predictions.numpy() == 0)[0]
print(len(normal_positions))

In [None]:
%%capture

# Collect the autoencoder stage's outputs for the whole test set in one frame:
#   scores      -- reconstruction error per row (from getLoss)
#   predictions -- thresholded boolean flag per row
#   is_anomaly  -- the boolean test labels
# NOTE(review): the same labels were used earlier as the mask selecting the
# "normal" rows -- confirm that True really means anomaly here.
auc_df = pd.DataFrame()
auc_df["scores"] = getLoss(autoencoder, np_test_data)
auc_df["predictions"] = test_predictions
auc_df['is_anomaly'] = np_test_labels

### Stats for whole dataset

In [None]:
from turtleIsolationForests.printResults import print_results
from turtleIsolationForests.printResults import get_auroc_value

# Autoencoder-only metrics over the full test set.
predicted_flags = auc_df["predictions"].to_numpy()
true_flags = auc_df['is_anomaly'].to_numpy()
print_results(predicted_flags, true_flags)

# AUROC is computed from the raw reconstruction-error scores, not from the
# thresholded predictions.
autoencoder_auroc = get_auroc_value(auc_df["scores"], auc_df['is_anomaly'])
print("auroc: " + str(autoencoder_auroc))

In [None]:
def addZToData(data, model):
    """Build a DataFrame of Z-augmented rows, one per row of `data`.

    Each row is passed individually through `addZToPrediction`, and the result
    is appended to a flat copy of the original row.

    NOTE(review): `addZToPrediction` already returns the original features
    followed by the Z features, so the original feature values appear twice in
    every output row. That layout is preserved here because the downstream
    models are fitted and evaluated on frames built by this function.

    Args:
        data: 2-D tensor of input rows; must support `len`, slicing and
            `.numpy()` on a row.
        model: autoencoder handed to `addZToPrediction`.

    Returns:
        pandas DataFrame with one row per input row and a default integer
        index. An empty `data` yields an empty DataFrame.
    """
    rows = []
    for i in range(len(data)):
        # Slice rather than index so the model receives a batch of one row.
        augmented = addZToPrediction(model, data[i:i + 1])
        # Flatten to 1-D whatever the feature count is (no hard-coded width).
        original = data[i].numpy().reshape(-1)
        # np.append flattens `augmented` before concatenating.
        rows.append(np.append(original, augmented))

    return pd.DataFrame(rows)

In [None]:
# Z-augmented version of the full training set (normal and anomalous rows).
# addZToData calls the autoencoder once per row, so this cell is slow on
# large inputs.
train_data_with_Z_df = addZToData(np_train_data, autoencoder)

In [None]:
# Display the augmented training frame as the cell output (inspection only).
train_data_with_Z_df

In [None]:
# Make sure the test labels are a NumPy array before they are indexed with
# the prediction mask below.
# NOTE(review): they already come from `.to_numpy().astype(bool)`, so this
# looks redundant -- confirm.
np_test_labels = np.array(np_test_labels)

In [None]:
# Keep only the test rows the autoencoder flagged; these are the rows handed
# to the second-stage forests.
predicted_anomalous = np_test_data[test_predictions]

In [None]:
# True labels of those flagged rows, selected with the same mask so that they
# stay aligned with `predicted_anomalous`.
predicted_anomalous_labels = np_test_labels[test_predictions]

In [None]:
# Z-augmented frame for the flagged test rows, built the same way as the
# training frame.
anomalous_test_data_with_Z_df = addZToData(predicted_anomalous, autoencoder)

In [None]:
# Share of training rows whose label equals 0, passed to every forest as its
# `contamination` argument. Computed as the mean of the boolean comparison
# (vectorised) instead of a Python-level sum over the Series.
# NOTE(review): elsewhere the rows with a truthy label are treated as
# "normal" -- confirm that label 0 is the anomalous class.
contamination = (train_labels == 0).mean()

In [None]:
def getFinalPredictions(first_predictions, second_predictions):
    """Merge second-stage predictions into the first-stage frame and print the results.

    Rows where `first_predictions["predictions"]` is True are overwritten, in
    order, with `second_predictions["predictions"]`; every other row keeps its
    first-stage value. Both inputs are copied first, so neither is modified.
    Nothing is returned: the merged predictions are only passed to
    `print_results` together with the `is_anomaly` column.

    Args:
        first_predictions: DataFrame with "predictions" and "is_anomaly"
            columns.
        second_predictions: DataFrame with a "predictions" column holding one
            row per True entry of `first_predictions["predictions"]`, in the
            same order.
    """
    merged = first_predictions.copy()
    second_stage = second_predictions.copy()

    # Index labels of the rows flagged by the first stage.
    flagged_rows = merged.index[merged["predictions"] == True]

    # Re-label the second-stage rows with those index labels so that the
    # assignment below aligns row for row.
    second_stage.index = flagged_rows
    merged.loc[flagged_rows, "predictions"] = second_stage["predictions"]

    print_results(merged["predictions"], merged["is_anomaly"])

In [None]:
from turtleIsolationForests.extendedIsolationForest import ExtendedIsolationForest
from turtleIsolationForests.printResults import print_results

# Extended Isolation Forest, fitted on the Z-augmented training frame.
# random_state=None passes no fixed seed -- presumably results vary between
# runs; confirm against the project implementation.
eif = ExtendedIsolationForest(contamination = contamination, random_state = None)
eif.fit(train_data_with_Z_df, train_labels)

In [None]:
%%timeit

# Timed call only; the result is discarded.
eif.predict(anomalous_test_data_with_Z_df, predicted_anomalous_labels)

In [None]:
import pandas as pd
# Second stage: score only the rows the autoencoder flagged.
eif_scores, eif_predictions = eif.predict(anomalous_test_data_with_Z_df, predicted_anomalous_labels)

# Same three-column layout as auc_df, restricted to the flagged rows.
eif_df = pd.DataFrame()
eif_df["scores"] = eif_scores
eif_df["predictions"] = eif_predictions
eif_df['is_anomaly'] = predicted_anomalous_labels

# Combine autoencoder and EIF verdicts and print the resulting metrics.
getFinalPredictions(auc_df, eif_df)

In [None]:
# AUROC of the EIF scores, measured on the autoencoder-flagged rows only.
eif_auroc = get_auroc_value(eif_df["scores"], predicted_anomalous_labels)
print("EIF AUROC: " + str(eif_auroc))

In [None]:
from turtleIsolationForests.sciForest import SCIsolationForest

# SCiForest variant with 5 hyperplanes and 5 attributes per split, fitted on
# the Z-augmented training frame. random_state=None passes no fixed seed --
# presumably results vary between runs; confirm.
scif = SCIsolationForest(contamination = contamination, num_hyperplanes_per_split=5, num_attributes_per_split=5, random_state = None)
scif.fit(train_data_with_Z_df, train_labels)

In [None]:
%%timeit

# Timed call only; the result is discarded.
scif.predict(anomalous_test_data_with_Z_df, predicted_anomalous_labels)

In [None]:
# Second stage: score only the rows the autoencoder flagged.
scif_scores, scif_predictions = scif.predict(anomalous_test_data_with_Z_df, predicted_anomalous_labels)

# Same three-column layout as auc_df, restricted to the flagged rows.
scif_df = pd.DataFrame()
scif_df["scores"] = scif_scores
scif_df["predictions"] = scif_predictions
scif_df['is_anomaly'] = predicted_anomalous_labels

# Combine autoencoder and SCiForest verdicts and print the resulting metrics.
getFinalPredictions(auc_df, scif_df)

In [None]:
# AUROC of the SCiForest scores, measured on the autoencoder-flagged rows only.
scif_auroc = get_auroc_value(scif_df["scores"], predicted_anomalous_labels)
print("SCIF AUROC: " + str(scif_auroc))

In [None]:
from turtleIsolationForests.isolationForest import IsolationForest

# Project Isolation Forest, fitted on the Z-augmented training frame.
# random_state=None passes no fixed seed -- presumably results vary between
# runs; confirm.
isoforest = IsolationForest(contamination = contamination, random_state = None)
isoforest.fit(train_data_with_Z_df, train_labels)

In [None]:
%%timeit

# Timed call only; the result is discarded.
isoforest.predict(anomalous_test_data_with_Z_df,  predicted_anomalous_labels)

In [None]:
# Evaluate the fitted forest on its own training set. print_results takes
# (predictions, labels), so the predictions are obtained from predict(),
# which returns (scores, predictions) as in the test-set cells, instead of
# handing print_results the model object.
isoforest_train_scores, train_predictions = isoforest.predict(train_data_with_Z_df, train_labels)
print_results(train_predictions, train_labels)

In [None]:
# Second stage: score only the rows the autoencoder flagged.
isoforest_scores, isoforest_predictions = isoforest.predict(anomalous_test_data_with_Z_df, predicted_anomalous_labels)

# Same three-column layout as auc_df, restricted to the flagged rows.
isoforest_df = pd.DataFrame()
isoforest_df["scores"] = isoforest_scores
isoforest_df["predictions"] = isoforest_predictions
isoforest_df['is_anomaly'] = predicted_anomalous_labels

# Combine autoencoder and Isolation Forest verdicts and print the resulting metrics.
getFinalPredictions(auc_df, isoforest_df)

In [None]:
# AUROC of the Isolation Forest scores, measured on the autoencoder-flagged rows only.
isoforest_auroc = get_auroc_value(isoforest_df["scores"], predicted_anomalous_labels)
print("IF AUROC: " + str(isoforest_auroc))

In [None]:
from turtleIsolationForests.FBIF import FBIsolationForest

# FBIsolationForest variant, fitted on the Z-augmented training frame.
# random_state=None passes no fixed seed -- presumably results vary between
# runs; confirm.
fbif = FBIsolationForest(contamination = contamination, random_state = None)
fbif.fit(train_data_with_Z_df, train_labels)

In [None]:
%%timeit

# Timed call only; the result is discarded.
fbif.predict(anomalous_test_data_with_Z_df, predicted_anomalous_labels)

In [None]:
# Second stage: score only the rows the autoencoder flagged.
fbif_scores, fbif_predictions = fbif.predict(anomalous_test_data_with_Z_df, predicted_anomalous_labels)

# Same three-column layout as auc_df, restricted to the flagged rows.
fbif_df = pd.DataFrame()
fbif_df["scores"] = fbif_scores
fbif_df["predictions"] = fbif_predictions
fbif_df['is_anomaly'] = predicted_anomalous_labels

# Combine autoencoder and FBIF verdicts and print the resulting metrics.
getFinalPredictions(auc_df, fbif_df)

In [None]:
# AUROC of the FBIF scores, measured on the autoencoder-flagged rows only.
fbif_auroc = get_auroc_value(fbif_df["scores"], predicted_anomalous_labels)
print("FBIF AUROC: " + str(fbif_auroc))