In [1]:
import tensorflow as tf
# tf.enable_eager_execution()
import numpy as np
import datetime
import os
import argparse
import matplotlib.pyplot as plt
import pandas as pd
import json
import glob
import tqdm
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
tf.__version__

'1.15.0'

In [2]:
from AAE import AAE
from CAAE import CAAE
from utils import *

In [3]:
input_dim = 29 * 29
n_l1 = 1000
n_l2 = 1000
z_dim = 10
batch_size = 100
n_epochs = 500
# learning_rate = 0.001
supervised_lr = 0.0001
reconstruction_lr = 0.0001
regularization_lr = 0.0001
beta1 = 0.9
n_labels = 2

# model = AAE(input_dim, n_l1, n_l2, z_dim, n_labels)
model = CAAE(n_labels = n_labels, z_dim = z_dim)

In [4]:
# Placeholders for input data and the targets
x_input = tf.placeholder(dtype=tf.float32, shape=[batch_size, input_dim], name='Input')
x_input_l = tf.placeholder(dtype=tf.float32, shape=[batch_size, input_dim], name='Labeled_Input')
y_input = tf.placeholder(dtype=tf.float32, shape=[batch_size, n_labels], name='Labels')
x_target = tf.placeholder(dtype=tf.float32, shape=[batch_size, input_dim], name='Target')
real_distribution = tf.placeholder(dtype=tf.float32, shape=[batch_size, z_dim], name='Real_distribution')
categorial_distribution = tf.placeholder(dtype=tf.float32, shape=[batch_size, n_labels],
                                         name='Categorical_distribution')
manual_decoder_input = tf.placeholder(dtype=tf.float32, shape=[1, z_dim + n_labels], name='Decoder_input')
learning_rate = tf.placeholder(tf.float32, shape=[])

In [5]:
# Reconstruction Phase
# Encoder try to predict both label and latent space of the input, which will be feed into Decoder to reconstruct the input
# The process is optimized by autoencoder_loss which is the MSE of the decoder_output and the orginal input
with (tf.variable_scope(tf.get_variable_scope())):
    encoder_output_label, encoder_output_latent = model.encoder(x_input)
    decoder_input = tf.concat([encoder_output_label, encoder_output_latent], 1)
    decoder_output = model.decoder(decoder_input)

autoencoder_loss = tf.reduce_mean(tf.square(x_target - decoder_output))
autoencoder_optimizer = tf.train.AdamOptimizer(learning_rate=reconstruction_lr, beta1=beta1).minimize(autoencoder_loss)


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Please use `layer.__call__` method instead.



In [6]:
# Regularization Phase
# Train both 2 discriminator of gaussian and categorical to detect the output from encoder
with (tf.variable_scope(tf.get_variable_scope())):
    # Discriminator for gaussian
    d_g_real = model.discriminator_gauss(real_distribution)
    d_g_fake = model.discriminator_gauss(encoder_output_latent, reuse=True)
# Need to seperate dicriminator of gaussian and categorical
with (tf.variable_scope(tf.get_variable_scope())):
    # Discrimnator for categorical
    d_c_real = model.discriminator_categorical(categorial_distribution)
    d_c_fake = model.discriminator_categorical(encoder_output_label, reuse=True)

# Discriminator gaussian loss 
dc_g_loss_real = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(d_g_real), logits=d_g_real))
dc_g_loss_fake = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.zeros_like(d_g_fake), logits=d_g_fake))
dc_g_loss = dc_g_loss_real + dc_g_loss_fake

# Discriminator categorical loss
dc_c_loss_real = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(d_c_real), logits=d_c_real))
dc_c_loss_fake = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.zeros_like(d_c_fake), logits=d_c_fake))
dc_c_loss = dc_c_loss_fake + dc_c_loss_real

all_variables = tf.trainable_variables()
dc_g_var = [var for var in all_variables if 'dc_g_' in var.name]
dc_c_var = [var for var in all_variables if 'dc_c_' in var.name]
discriminator_g_optimizer = tf.train.AdamOptimizer(learning_rate=regularization_lr/5,
                                                       beta1=beta1).minimize(dc_g_loss, var_list=dc_g_var)
discriminator_c_optimizer = tf.train.AdamOptimizer(learning_rate=regularization_lr/5,
                                                       beta1=beta1).minimize(dc_c_loss, var_list=dc_c_var)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [7]:
# Generator loss
generator_g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(d_g_fake), logits=d_g_fake))
generator_c_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(d_c_fake), logits=d_c_fake))
generator_loss = generator_g_loss + generator_c_loss

en_var = [var for var in all_variables if 'e_' in var.name]
generator_optimizer = tf.train.AdamOptimizer(learning_rate=regularization_lr, beta1=beta1).minimize(generator_loss, var_list=en_var)

In [8]:
# Semi-Supervised Classification Phase
# Train encoder with a small amount of label samples
with tf.variable_scope(tf.get_variable_scope()):
    encoder_output_label_s, encoder_output_latent_s = model.encoder(x_input_l, reuse=True, supervised=True)
    
# Classification accuracy of encoder
output_label = tf.argmax(encoder_output_label_s, 1)
correct_pred = tf.equal(output_label, tf.argmax(y_input, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    
supervised_encoder_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_input, logits=encoder_output_label_s))
supervised_encoder_optimizer = tf.train.AdamOptimizer(learning_rate=supervised_lr, beta1=beta1).minimize(supervised_encoder_loss, var_list=en_var)


Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



# Train in supervised with latent labeled data

In [None]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()
# data_path = './Data/*/'
n_labeled = 10000
labels = ['DoS', 'Fuzzy', 'gear', 'RPM', 'Normal']
unknown_attack = ''
normal_label_path = ['./Data/Normal/train_label']
attack_label_paths = ['./Data/{}/train_label'.format(l) for l in labels[:-1]]

train_normal = data_from_tfrecord(normal_label_path, batch_size, 1)
train_attack = data_from_tfrecord(attack_label_paths, batch_size, 1)
with tf.Session() as sess:
    results_path = './Results/all/2021-07-19 17:33:01.052325_10_0.0001_100_1000_0.9_Semi_Supervised/'
    saver.restore(sess, save_path=tf.train.latest_checkpoint(results_path + '/Saved_models'))
    n_batches = int(n_labeled / batch_size)
    X = np.empty((0, 10), float)
    y = np.empty((0), int)
    for b in tqdm.tqdm(range(1, n_batches + 1)):
        batch_x_l, batch_y_l = data_stream(train_normal, sess)
        batch_label = np.argmax(batch_y_l, axis=1)
        batch_encoded_x = sess.run(encoder_latent, feed_dict={x_input_l: batch_x_l})
        
        X = np.append(X, batch_encoded_x, axis=0)
        y = np.append(y, batch_label, axis=0)
        
        batch_x_l, batch_y_l = data_stream(train_attack, sess)
        batch_label = np.argmax(batch_y_l, axis=1)
        batch_encoded_x = sess.run(encoder_latent, feed_dict={x_input_l: batch_x_l})
        
        X = np.append(X, batch_encoded_x, axis=0)
        y = np.append(y, batch_label, axis=0)
        
    print(X.shape, y.shape)

In [96]:
model = tf.keras.models.Sequential([
  tf.keras.layers.Dense(100, input_dim=10, activation='relu'),
  tf.keras.layers.Dense(100, activation='relu'),
  tf.keras.layers.Dense(2, activation='softmax')
])

In [97]:
model.compile(optimizer='adam',
              loss= tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [98]:
model.fit(X, y, epochs=100, batch_size=100)

Train on 20000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/10

<tensorflow.python.keras.callbacks.History at 0x7fa29659ab70>

# Train K-means with latent labeled data

In [46]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()
# data_path = './Data/*/'
n_labeled = 37372
labels = ['Fuzzy', 'gear', 'RPM', 'Normal']
unknown_attack = ''
train_label_paths = ['./Data/{}/train_label'.format(l) for l in labels if l is not unknown_attack]
train_label = data_from_tfrecord(train_label_paths, batch_size, 1)

with tf.variable_scope(tf.get_variable_scope()):
    encoder_label, encoder_latent = model.encoder(x_input_l, reuse=True, supervised=False)

with tf.Session() as sess:
    results_path = './Results/unknown/DoS/CNN_2021-07-23 11:29:55.406382_10_0.0001_64_500_0.9_Semi_Supervised/'
    saver.restore(sess, save_path=tf.train.latest_checkpoint(results_path + '/Saved_models'))
    n_batches = int(n_labeled / batch_size)
    X = np.empty((0, 10), float)
    y = np.empty((0), int)
    for b in tqdm.tqdm(range(1, n_batches + 1)):
        batch_x_l, batch_y_l = data_stream(train_label, sess)
        batch_label = np.argmax(batch_y_l, axis=1)
        batch_encoded_x = sess.run(encoder_latent, feed_dict={x_input_l: batch_x_l})
        X = np.append(X, batch_encoded_x, axis=0)
        y = np.append(y, batch_label, axis=0)
    print(X.shape, y.shape)

INFO:tensorflow:Restoring parameters from ./Results/unknown/DoS/CNN_2021-07-23 11:29:55.406382_10_0.0001_64_500_0.9_Semi_Supervised//Saved_models/-291500


100%|██████████| 373/373 [00:01<00:00, 327.86it/s]

(37300, 10) (37300,)





In [47]:
kmeans = KMeans(n_clusters=5).fit(X)

In [48]:
def get_class(clt, y):
    mask = (kmeans.labels_ == clt)
    same_cluster = y[mask]
    try:
        prob_attack = np.count_nonzero(same_cluster)/same_cluster.shape[0]
    except:
        print(clt, same_cluster.shape)
    return int(prob_attack >= 0.7)

# Train anomoly detection by latent space

In [33]:
from sklearn.neighbors import LocalOutlierFactor

In [34]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()
# data_path = './Data/*/'
n_labeled = 24693
# Only use labeled normal data
train_label_paths = ['./Data/Normal/train_label']
train_label = data_from_tfrecord(train_label_paths, batch_size, 1)

with tf.variable_scope(tf.get_variable_scope()):
    encoder_label, encoder_latent = model.encoder(x_input_l, reuse=True, supervised=False)
    
with tf.Session() as sess:
    results_path = './Results/unknown/DoS/CNN_2021-07-23 11:29:55.406382_10_0.0001_64_500_0.9_Semi_Supervised//'
    saver.restore(sess, save_path=tf.train.latest_checkpoint(results_path + '/Saved_models'))
    n_batches = int(n_labeled / batch_size)
    X = np.empty((0, 12), float)
    y = np.empty((0), int)
    for b in tqdm.tqdm(range(1, n_batches + 1)):
        batch_x_l, batch_y_l = data_stream(train_label, sess)
        batch_label = np.argmax(batch_y_l, axis=1)
        #print(batch_x_l.shape)
        batch_encoded_l, batch_encoded_z = sess.run([encoder_label, encoder_latent], feed_dict={x_input_l: batch_x_l})
        batch_encoded_x = np.concatenate((batch_encoded_l, batch_encoded_z), axis=1)
        X = np.append(X, batch_encoded_x, axis=0)
        y = np.append(y, batch_label, axis=0)
    print(X.shape, y.shape)

INFO:tensorflow:Restoring parameters from ./Results/unknown/DoS/CNN_2021-07-23 11:29:55.406382_10_0.0001_64_500_0.9_Semi_Supervised///Saved_models/-291500


100%|██████████| 246/246 [00:00<00:00, 304.19it/s]

(24600, 12) (24600,)





In [35]:
lof = LocalOutlierFactor(novelty=True)
lof.fit(X)

LocalOutlierFactor(novelty=True)

# Final prediction

In [49]:
# labels = ['DoS', 'Fuzzy', 'gear', 'RPM', 'Normal']
labels = ['DoS', 'Normal']
unknown_attack = ''
test_size = 0
for f in ['./Data/{}/datainfo.txt'.format(l) for l in labels]:
    data_read = json.load(open(f))
    test_size += data_read['test']
test_size

58532

In [54]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()
# labels = ['DoS', 'Fuzzy', 'gear', 'RPM', 'Normal']
# labels = ['DoS', 'Normal']
# test_paths = glob.glob(data_path + 'test')
with tf.Session() as sess:
    #attack = 'gear'
    data_path = ['./Data/{}/'.format(a) for a in labels]
    print(data_path)
    results_path = './Results/unknown/DoS/CNN_2021-07-23 11:29:55.406382_10_0.0001_64_500_0.9_Semi_Supervised/'
    #data_info = json.load(open(data_path + 'datainfo.txt'))
    
    # Get the latest results folder
    #all_results = os.listdir(results_path)
    #all_results.sort()
    
    #saver.restore(sess, save_path=tf.train.latest_checkpoint(results_path + '/' +
    #                                                         all_results[1] + '/Saved_models/'))
    saver.restore(sess, save_path=tf.train.latest_checkpoint(results_path + '/Saved_models'))
    
    test = data_from_tfrecord([p + 'test' for p in data_path], batch_size, 1)
    
    num_batches = int(test_size / batch_size)
    y_true = np.empty((0), int)
    y_pred = np.empty((0), int)
    total_prob = np.empty((0), float)
    total_latent = np.empty((0, z_dim + 2), float)
    
    with tf.variable_scope(tf.get_variable_scope()):
        encoder_label, _ = model.encoder(x_input_l, reuse=True, supervised=False)
    
    for _ in tqdm.tqdm(range(num_batches)):
        x_test, y_test = data_stream(test, sess)
        batch_pred, batch_latent = sess.run([encoder_output_label_s, encoder_output_latent_s], feed_dict={x_input_l: x_test})
        batch_encoded_label = sess.run(encoder_label, feed_dict={x_input_l: x_test})
        batch_encoded = np.concatenate((batch_encoded_label, batch_latent), axis=1)
        total_latent = np.append(total_latent, batch_encoded, axis=0)
        batch_label = np.argmax(y_test, axis=1).reshape((batch_size))
        prob = np.max(batch_pred, axis=1).reshape((batch_size))
        batch_pred = np.argmax(batch_pred, axis=1).reshape((batch_size))
        y_pred = np.append(y_pred, batch_pred, axis=0)
        y_true = np.append(y_true, batch_label, axis=0)  
        total_prob = np.append(total_prob, prob, axis=0)

['./Data/DoS/', './Data/Normal/']
INFO:tensorflow:Restoring parameters from ./Results/unknown/DoS/CNN_2021-07-23 11:29:55.406382_10_0.0001_64_500_0.9_Semi_Supervised//Saved_models/-291500


100%|██████████| 585/585 [00:03<00:00, 182.77it/s]


In [55]:
evaluate(y_true, y_pred)

1 5613
8 52878
False negative rate:  0.9998218738867118
Error rate:  0.09608547008547008
Precision:  0.1111111111111111
Recall:  0.0001781261132881573
F1 score:  0.000355682020273774


In [56]:
%%time 
normal_indices = (y_pred == 0).nonzero()
normal_latent = total_latent[normal_indices]

CPU times: user 0 ns, sys: 3.21 ms, total: 3.21 ms
Wall time: 2.41 ms


In [57]:
pred_cluster = kmeans.predict(normal_latent[:, :10])
f = lambda x: get_class(x, y)
vf = np.vectorize(f)
pred_label = vf(pred_cluster)
y_pred[normal_indices] = pred_label

In [58]:
evaluate(y_true, y_pred)

1 5613
8 52878
False negative rate:  0.9998218738867118
Error rate:  0.09608547008547008
Precision:  0.1111111111111111
Recall:  0.0001781261132881573
F1 score:  0.000355682020273774


In [None]:
pred = model.predict(normal_latent)
pred = np.argmax(pred, axis=1)
y_pred[normal_indices] = pred

In [103]:
evaluate(y_true, y_pred)

32540 246
23183 29731
False negative rate:  0.007503202586469835
Error rate:  0.27338389731621937
Precision:  0.5839599447265941
Recall:  0.9924967974135301
F1 score:  0.7352924561344044


In [36]:
pred = lof.predict(normal_latent)
label = np.where(pred < 0, 1, 0)
y_pred[normal_indices] = label

In [37]:
evaluate(y_true, y_pred)

26 5591
131 52752
False negative rate:  0.9953711945878583
Error rate:  0.09781196581196581
Precision:  0.16560509554140126
Recall:  0.004628805412141723
F1 score:  0.009005888465535176
