In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, losses
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.models import Model

2023-02-06 17:26:16.899812: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
aecModel = None
eifModel = None

"""Sample code - may look different in the future"""

# Framework for Autoencoder predict function
def aecPredict(data, threshold):
    reconstructions = aecModel(data)
    loss = tf.keras.losses.mae(reconstructions, data)

    # return prediction
    return tf.math.less(loss, threshold)

# Framework for Extended Isolation Forest predict function
def eifPredict(data, aecFinalLayer):
    # append our final autencoder model layer to the data and predict
    data.append(aecFinalLayer)
    return eifModel.predict(data)

def isAnomaly(data):
    threshold = None

    # need autoencoder to return boolean isAnomaly and list of features
    isAnomaly = aecPredict(data, threshold)

    # if the autoencoder doesn't find anything out of the ordinary, return False
    if not isAnomaly:
        return False

    # if the autoencoder sees something weird, run it through the isolation forest to make sure
    return eifPredict(data, aecModel.finalLayer)


In [3]:
"""
Questions:
How do we obtain "windows" from our datasets?
What should k be for our data? Look at paper
How do we obtain our threshold? It would be a line between the average True point and average False point
"""

def pak(anomaly_segment_list, ground_truth, threshold,  k):
    allAboveThreshold = True

    for item in anomaly_segment_list:
        if item <= threshold:
            allAboveThreshold = False

    if allAboveThreshold:
        return True

    numCorrectlyDetected = 0

    for i in range(len(anomaly_segment_list)):
        if anomaly_segment_list[i] == ground_truth[i]:
            numCorrectlyDetected += 1

    return numCorrectlyDetected / len(anomaly_segment_list) > k

In [4]:
train_data = pd.read_csv('eda_simple_classification/network_data_mod_train.csv')
test_data = pd.read_csv('eda_simple_classification/network_data_mod_test.csv')

frames = [train_data, test_data]

dataframe  = pd.concat(frames)
raw_data = dataframe.values

In [5]:
# The last element contains the labels
labels = raw_data[:, -1]

data = raw_data[:, 0:-1]

train_data, test_data, train_labels, test_labels = train_test_split(
    data, labels, test_size=0.2, random_state=34
)

In [6]:
min_val = tf.reduce_min(train_data)
max_val = tf.reduce_max(train_data)

train_data = (train_data - min_val) / (max_val - min_val)
test_data = (test_data - min_val) / (max_val - min_val)

train_data = tf.cast(train_data, tf.float32)
test_data = tf.cast(test_data, tf.float32)

2023-02-06 17:26:29.428745: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
train_labels = train_labels.astype(bool)
test_labels = test_labels.astype(bool)

normal_train_data = train_data[train_labels]
normal_test_data = test_data[test_labels]

anomalous_train_data = train_data[~train_labels]
anomalous_test_data = test_data[~test_labels]

In [8]:

class AnomalyDetector(Model):
  def __init__(self):
    super(AnomalyDetector, self).__init__()
    self.encoder = tf.keras.Sequential([
      layers.Dense(32, activation="relu"), #zc
      layers.Dense(16, activation="relu"),
      layers.Dense(8, activation="relu")])

    self.decoder = tf.keras.Sequential([
      layers.Dense(16, activation="relu"),
      layers.Dense(32, activation="relu"),
      layers.Dense(47, activation="sigmoid")]) #zed

  def call(self, x):
    encoded = self.encoder(x)
    print(encoded)
    decoded = self.decoder(encoded)
    print(decoded)
    return decoded

In [9]:
autoencoder = AnomalyDetector()

In [10]:
autoencoder.compile(optimizer='adam', loss='mae')

In [11]:
history = autoencoder.fit(normal_train_data, normal_train_data,
          epochs=5,
          validation_data=(normal_test_data, normal_test_data),
          shuffle=True)

Epoch 1/5
Tensor("anomaly_detector/sequential/dense_2/Relu:0", shape=(None, 8), dtype=float32)
Tensor("anomaly_detector/sequential_1/dense_5/Sigmoid:0", shape=(None, 47), dtype=float32)
Tensor("anomaly_detector/sequential/dense_2/Relu:0", shape=(None, 8), dtype=float32)
Tensor("anomaly_detector/sequential_1/dense_5/Sigmoid:0", shape=(None, 47), dtype=float32)
Tensor("anomaly_detector/sequential_1/dense_5/Sigmoid:0", shape=(None, 47), dtype=float32)
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [12]:
reconstructions = autoencoder.predict(normal_train_data)
train_loss = tf.keras.losses.mae(reconstructions, normal_train_data)

Tensor("anomaly_detector/sequential/dense_2/Relu:0", shape=(None, 8), dtype=float32)
Tensor("anomaly_detector/sequential_1/dense_5/Sigmoid:0", shape=(None, 47), dtype=float32)


In [13]:
threshold = np.mean(train_loss) + np.std(train_loss)
print("Threshold: ", threshold)

Threshold:  0.00084734126


In [14]:
def predict(model, data, threshold):
  reconstructions = model(data)
  loss = tf.keras.losses.mae(reconstructions, data)
  return tf.math.less(loss, threshold)

In [15]:
preds = predict(autoencoder, test_data, threshold)

tf.Tensor(
[[1.1646488  0.         3.4084232  ... 2.0006201  1.019398   0.        ]
 [1.1531345  0.         1.9199445  ... 0.881355   0.935874   0.        ]
 [1.1536148  0.         2.1248932  ... 1.0357742  0.94654834 0.        ]
 ...
 [1.1650895  0.         3.3887477  ... 1.9860327  1.0185463  0.        ]
 [1.1686077  0.         3.9579866  ... 2.4143417  1.0501136  0.        ]
 [1.159325   0.         2.679181   ... 1.4523392  0.9787563  0.        ]], shape=(29704, 8), dtype=float32)
tf.Tensor(
[[6.6435272e-01 9.3528607e-10 1.3717155e-09 ... 2.0812418e-09
  2.4275397e-09 2.8831264e-09]
 [1.0204186e-01 3.9981942e-06 2.2685240e-06 ... 2.9166879e-06
  3.8471912e-06 4.8897759e-06]
 [1.4444372e-01 1.2676114e-06 8.2057483e-07 ... 1.0792702e-06
  1.3993307e-06 1.7618109e-06]
 ...
 [6.5579379e-01 1.0420544e-09 1.5088842e-09 ... 2.2846027e-09
  2.6691072e-09 3.1731635e-09]
 [8.7409121e-01 3.9734816e-11 8.4996926e-11 ... 1.3890124e-10
  1.5713267e-10 1.7895756e-10]
 [3.2793447e-01 5.6134315e-08 

In [39]:
data_point = normal_train_data[:1]
data_point

<tf.Tensor: shape=(1, 47), dtype=float32, numpy=
array([[4.5528370e-01, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 7.9382717e-06, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        6.0330867e-04, 1.2304322e-03, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 7.9382717e-06, 0.0000000e+00,
        7.9382723e-08, 2.0242594e-03, 2.0242594e-03, 7.9382717e-06,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00]], dtype=float32)>

In [40]:
encoded_data = autoencoder.encoder(data_point)

In [41]:
encoded_data

<tf.Tensor: shape=(1, 8), dtype=float32, numpy=
array([[1.1617663 , 0.        , 2.9265873 , 2.445588  , 0.        ,
        1.6382828 , 0.99282753, 0.        ]], dtype=float32)>

In [42]:
decoded_data = autoencoder.decoder(encoded_data)

In [43]:
decoded_data

<tf.Tensor: shape=(1, 47), dtype=float32, numpy=
array([[4.3940300e-01, 1.3973949e-08, 1.5061506e-08, 1.0552014e-08,
        1.7473852e-08, 2.1859215e-08, 1.8635976e-08, 1.7040120e-08,
        2.3410042e-08, 2.5621022e-08, 1.3952103e-07, 1.4892878e-08,
        1.4344020e-08, 3.0197867e-08, 1.6959351e-08, 1.7427986e-08,
        1.8848855e-08, 7.3073714e-08, 3.4542477e-08, 3.9594838e-08,
        2.2048861e-08, 1.9965208e-08, 2.4226885e-08, 1.3661312e-08,
        1.3048875e-08, 1.6178312e-08, 1.1852753e-08, 1.8646322e-08,
        5.5552459e-06, 2.7627437e-05, 1.8125210e-08, 1.0177120e-08,
        1.7655600e-08, 2.0234639e-08, 1.6106350e-07, 2.1260393e-08,
        1.7201305e-08, 9.7254536e-04, 1.7271625e-03, 2.0627638e-07,
        1.5970231e-08, 1.6651642e-08, 1.1500732e-08, 1.9974234e-08,
        2.1661364e-08, 2.6294675e-08, 3.1926223e-08]], dtype=float32)>