In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

In [2]:
import autoencoder.aecExtraFeatures as Z_calculations

def addZToPrediction(model, data_point):
    reconstruction = model.decoder(model.encoder(data_point))

    Z_features = [Z_calculations.getZVector(data_point, reconstruction)]

    Z_features_tensor = tf.convert_to_tensor(Z_features, dtype=tf.float32)
    data_point = tf.convert_to_tensor(data_point, dtype=tf.float32)

    data_point = tf.concat([data_point, Z_features_tensor], 1)

    return data_point

In [3]:
def isAnomaly(data_point, model_1, model_2, threshold):

    # need autoencoder to return boolean isAnomaly
    isAnomaly = tf.math.less(tf.keras.losses.mae(model_1(data), data), threshold)

    # if the autoencoder doesn't find anything out of the ordinary, return False
    if not isAnomaly:
        return False

    data_point = addZToPrediction(model_1, data_point)

    # if the autoencoder sees something weird, run it through the isolation forest to make sure
    return model_2.predict(data_point)

In [4]:
train_data = pd.read_csv('eda_simple_classification/network_data_mod_train.csv')
test_data = pd.read_csv('eda_simple_classification/network_data_mod_test.csv')

frames = [train_data, test_data]

dataframe  = pd.concat(frames)
raw_data = dataframe.values

In [5]:
# The last element contains the labels
labels = raw_data[:, -1]

data = raw_data[:, 0:-1]

train_data, test_data, train_labels, test_labels = train_test_split(
    data, labels, test_size=0.2, random_state=34
)

In [6]:
min_val = tf.reduce_min(train_data)
max_val = tf.reduce_max(train_data)

train_data = (train_data - min_val) / (max_val - min_val)
test_data = (test_data - min_val) / (max_val - min_val)

train_data = tf.cast(train_data, tf.float32)
test_data = tf.cast(test_data, tf.float32)

In [7]:
train_labels = train_labels.astype(bool)
test_labels = test_labels.astype(bool)

normal_train_data = train_data[train_labels]
normal_test_data = test_data[test_labels]

anomalous_train_data = train_data[~train_labels]
anomalous_test_data = test_data[~test_labels]

In [8]:
from autoencoder.autoencoder import AnomalyDetector
autoencoder = AnomalyDetector()

In [9]:
autoencoder.compile(optimizer='adam', loss='mae')

In [10]:
history = autoencoder.fit(normal_train_data, normal_train_data,
          epochs=5,
          validation_data=(normal_test_data, normal_test_data),
          shuffle=True)

Epoch 1/5
Tensor("anomaly_detector/sequential/dense_2/Relu:0", shape=(None, 8), dtype=float32)
Tensor("anomaly_detector/sequential_1/dense_5/Sigmoid:0", shape=(None, 47), dtype=float32)
Tensor("anomaly_detector/sequential/dense_2/Relu:0", shape=(None, 8), dtype=float32)
Tensor("anomaly_detector/sequential_1/dense_5/Sigmoid:0", shape=(None, 47), dtype=float32)
Tensor("anomaly_detector/sequential_1/dense_5/Sigmoid:0", shape=(None, 47), dtype=float32)
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [11]:
reconstructions = autoencoder.predict(train_data)
train_loss = tf.keras.losses.mae(reconstructions, train_data)
threshold = np.mean(train_loss) + np.std(train_loss)
print("Threshold: ", threshold)

Tensor("anomaly_detector/sequential/dense_2/Relu:0", shape=(None, 8), dtype=float32)
Tensor("anomaly_detector/sequential_1/dense_5/Sigmoid:0", shape=(None, 47), dtype=float32)
Threshold:  0.0011032214


In [12]:
train_data_with_Z = []
for i in range(1, len(normal_train_data)):
    train_data_with_Z.append(addZToPrediction(autoencoder, normal_train_data[i-1:i]))

In [13]:
train_data[:1]

<tf.Tensor: shape=(1, 47), dtype=float32, numpy=
array([[4.5528370e-01, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 7.9382717e-06, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        6.0330867e-04, 1.2304322e-03, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 7.9382717e-06, 0.0000000e+00,
        7.9382723e-08, 2.0242594e-03, 2.0242594e-03, 7.9382717e-06,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00]], dtype=float32)>

In [14]:
data_point = train_data_with_Z[:1]

In [15]:
train_data_with_Z = train_data_with_Z[:100]

In [51]:
rf_train_data_with_Z = []

In [53]:
for i in range(len(train_data_with_Z)):
    rf_train_data_with_Z.append([item.numpy() for sublist in data_point for item in sublist])

In [68]:
np.array([list(range(49))]).shape

(1, 49)

In [70]:
rf_train_data_with_Z[:][0][0].shape

(49,)

In [30]:
contamination = sum(train_labels == 0) / len(train_labels)
contamination

0.4811678856690766

In [31]:
labels

array([1., 1., 0., ..., 0., 1., 0.])

In [33]:
train_data_with_Z_df = pd.DataFrame(train_data_with_Z)

In [34]:
train_data_with_Z_df

Unnamed: 0,0
0,"[0.4552837, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,"[0.4552837, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,"[0.4552837, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,"[0.4552837, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,"[0.4552837, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
...,...
95,"[0.4552837, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
96,"[0.4552837, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
97,"[0.4552837, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
98,"[0.4552837, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [34]:
from turtleIsolationForests.extendedIsolationForest import ExtendedIsolationForest

model = ExtendedIsolationForest(contamination = contamination, random_state = None)
model.fit(train_data_with_Z_df)
model

AttributeError: 'list' object has no attribute 'sample'