In [1]:
import tensorflow as tf
# tf.enable_eager_execution()
import numpy as np
import datetime
import os
import argparse
import matplotlib.pyplot as plt
import pandas as pd
import json
import glob
import tqdm
from sklearn.metrics import confusion_matrix
tf.__version__

'1.15.0'

In [2]:
from model import encoder, decoder, discriminator_gauss, discriminator_categorical
from utils import read_tfrecord, data_from_tfrecord, data_stream

In [3]:
# Run this when training with all data.
# Sum the per-split sample counts recorded in every class directory's datainfo.txt.
data_info = {
    "train_unlabel": 0,
    "train_label": 0,
    "validation": 0,
    "test": 0
}
labels = ['DoS', 'Fuzzy', 'gear', 'RPM', 'Normal']
for f in ['./Data/{}/datainfo.txt'.format(l) for l in labels]:
    # Use a context manager so each file handle is closed as soon as it is parsed.
    with open(f) as info_file:
        data_read = json.load(info_file)
    for key in data_info.keys():
        data_info[key] += data_read[key]

attack = 'all' # DoS, Fuzzy, gear, RPM, all
results_path = './Results/{}/'.format(attack)
# Glob pattern matching every class directory; the data cells append the split name to it.
data_path = './Data/*/'

print('Data info: ', data_info)

Data info:  {'train_unlabel': 10798551, 'train_label': 800000, 'validation': 2485403, 'test': 2485403}


In [2]:
# Run this when training with one attack held out as "unknown".
data_info = {
    "train_unlabel": 0,
    "train_label": 0,
    "validation": 0,
    "test": 0
}
attacks = ['DoS', 'gear', 'RPM', 'Fuzzy']
unknown_attack_idx = 0
results_path = './Results/unknown_attack/{}'.format(attacks[unknown_attack_idx])
# Attacks that keep their labels: every attack except the held-out one.
# Compare by value (`!=`); identity (`is not`) on strings only works by accident.
known_attacks = [x for x in attacks if x != attacks[unknown_attack_idx]]
# Put all into unlabel (the held-out attack still contributes unlabeled samples)
train_unlabel_paths = ['./Data/{}/train_unlabel'.format(x) for x in attacks]
train_label_paths = ['./Data/{}/train_label'.format(x) for x in known_attacks]
val_paths = ['./Data/{}/val'.format(x) for x in known_attacks]

print('Train unlabel paths: ', train_unlabel_paths)
print('Train label paths: ', train_label_paths)
print('Validation paths: ', val_paths)

# NOTE(review): the counts below are summed over the known attacks only, so
# data_info['train_unlabel'] does not include the held-out attack's unlabeled samples
# even though train_unlabel_paths does - confirm this is intended.
data_info_paths = ['./Data/{}/datainfo.txt'.format(x) for x in known_attacks]
for f in data_info_paths:
    # Use a context manager so each file handle is closed as soon as it is parsed.
    with open(f) as info_file:
        data_read = json.load(info_file)
    for key in data_info.keys():
        data_info[key] += data_read[key]

print('Data info:', data_info)

Train unlabel paths:  ['./Data/DoS/train_unlabel', './Data/gear/train_unlabel', './Data/RPM/train_unlabel', './Data/Fuzzy/train_unlabel']
Train label paths:  ['./Data/gear/train_label', './Data/RPM/train_label', './Data/Fuzzy/train_label']
Validation paths:  ['./Data/gear/val', './Data/RPM/val', './Data/Fuzzy/val']
Data info: {'train_unlabel': 6503420, 'train_label': 722604, 'validation': 1806508, 'test': 3871088}


In [3]:
# Load the sample counts and validation path of the held-out (unknown) attack.
print('Unknown attack: ', attacks[unknown_attack_idx])
# Use a context manager so the file handle is closed once the JSON is parsed.
with open('./Data/{}/datainfo.txt'.format(attacks[unknown_attack_idx])) as info_file:
    data_info_unknown_attack = json.load(info_file)
val_unknown_attack_path = './Data/{}/val'.format(attacks[unknown_attack_idx])
print('Data info: ', data_info_unknown_attack)
validation_unknown_size = data_info_unknown_attack['validation']

Unknown attack:  DoS
Data info:  {'train_unlabel': 1662780, 'train_label': 184754, 'validation': 461884, 'test': 1099723}


## Ideas to improve:

1) Tune n_l1, n_l2 and z_dim
2) Use different learning rates for the Generator, Regularization and Semi-supervised phases
3) Train the Discriminators with a smaller learning rate
4) Consider regularizing the autoencoder: sparsity penalty or variational

# Hyperparameter

In [15]:
# Flattened input width; presumably one 29 x 29 frame per sample - TODO confirm against
# the code that writes the TFRecords.
input_dim = 29 * 29
# n_l1 / n_l2 are not referenced by any cell visible in this notebook; presumably the
# hidden-layer widths used inside model.py - verify there.
n_l1 = 1000
n_l2 = 1000
# Width of the latent vector (second dimension of the Real_distribution placeholder).
z_dim = 10
batch_size = 100
n_epochs = 500
# learning_rate = 0.001
# Each training phase gets its own Adam learning rate.
supervised_lr = 0.001
reconstruction_lr = 0.0006
regularization_lr = 0.0008
# beta1 passed to every AdamOptimizer built in this notebook.
beta1 = 0.9
# Width of the label vector (second dimension of the Labels placeholder).
n_labels = 2
# Sample counts taken from the data_info dict built by the data-selection cell.
n_labeled = data_info['train_label']
validation_size = data_info['validation']

# Define data

In [None]:
# For unknown attack
# Build the four data streams in one pass; the calls run in the listed order.
# NOTE(review): the last entry is a single path string while the others are lists of
# paths - confirm data_from_tfrecord accepts both forms.
train_unlabel, train_label, validation, validation_unknown = (
    data_from_tfrecord(source_paths, batch_size, n_epochs)
    for source_paths in (
        train_unlabel_paths,
        train_label_paths,
        val_paths,
        val_unknown_attack_path,
    )
)

In [5]:
# For all
# One stream per split; each split is collected from every class directory matched by data_path.
train_unlabel, train_label, validation = (
    data_from_tfrecord(glob.glob(data_path + split_name), batch_size, n_epochs)
    for split_name in ('train_unlabel', 'train_label', 'val')
)

Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.


In [13]:
# train = data_from_tfrecord('./Data/TFRecord/Normal_DoS', 100, 1)
# init = tf.global_variables_initializer()
# with tf.Session() as sess:
#     sess.run(init)
#     x_l, y_l = data_helper(train_unlabel, sess)
#     print(x_l.shape, y_l.shape)

In [6]:
# Placeholders for input data and the targets
def _float_input(rows, cols, name):
    """Create a float32 placeholder with the fixed shape [rows, cols]."""
    return tf.placeholder(dtype=tf.float32, shape=[rows, cols], name=name)


# Unlabeled batch and its reconstruction target.
x_input = _float_input(batch_size, input_dim, 'Input')
# Labeled batch and its label vectors.
x_input_l = _float_input(batch_size, input_dim, 'Labeled_Input')
y_input = _float_input(batch_size, n_labels, 'Labels')
x_target = _float_input(batch_size, input_dim, 'Target')
# Samples handed to the two discriminators as the "real" side.
real_distribution = _float_input(batch_size, z_dim, 'Real_distribution')
categorial_distribution = _float_input(batch_size, n_labels, 'Categorical_distribution')
# Single-row input whose width is the label width plus the latent width.
manual_decoder_input = _float_input(1, z_dim + n_labels, 'Decoder_input')

# Loss

In [7]:
# Reconstruction Phase
# The encoder predicts both a label vector and a latent vector for the input; the two are
# concatenated and fed into the decoder to reconstruct the input.
# The phase is optimized by autoencoder_loss, the mean squared error between decoder_output
# and x_target (the training loop feeds the same batch as x_input and x_target).
with (tf.variable_scope(tf.get_variable_scope())):
    encoder_output_label, encoder_output_latent = encoder(x_input)
    # Label part first, latent part second, joined along the feature axis.
    decoder_input = tf.concat([encoder_output_label, encoder_output_latent], 1)
    decoder_output = decoder(decoder_input)

autoencoder_loss = tf.reduce_mean(tf.square(x_target - decoder_output))
# No var_list is given, so minimize() is not restricted to a subset of the trainable variables.
autoencoder_optimizer = tf.train.AdamOptimizer(learning_rate=reconstruction_lr, beta1=beta1).minimize(autoencoder_loss)




In [8]:
# Regularization Phase
# Train two discriminators, one for the gaussian (latent) output and one for the
# categorical (label) output of the encoder, to tell encoder outputs from fed-in samples.
with (tf.variable_scope(tf.get_variable_scope())):
    # Discriminator for gaussian: the second call passes reuse=True.
    d_g_real = discriminator_gauss(real_distribution)
    d_g_fake = discriminator_gauss(encoder_output_latent, reuse=True)
# The gaussian and categorical discriminators are built in separate scope blocks
with (tf.variable_scope(tf.get_variable_scope())):
    # Discriminator for categorical: the second call passes reuse=True.
    d_c_real = discriminator_categorical(categorial_distribution)
    d_c_fake = discriminator_categorical(encoder_output_label, reuse=True)

# Discriminator gaussian loss: fed-in samples are labeled 1, encoder outputs are labeled 0.
# The discriminator outputs are used as logits.
dc_g_loss_real = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(d_g_real), logits=d_g_real))
dc_g_loss_fake = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.zeros_like(d_g_fake), logits=d_g_fake))
dc_g_loss = dc_g_loss_real + dc_g_loss_fake

# Discriminator categorical loss: same labeling scheme as the gaussian discriminator.
dc_c_loss_real = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(d_c_real), logits=d_c_real))
dc_c_loss_fake = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.zeros_like(d_c_fake), logits=d_c_fake))
dc_c_loss = dc_c_loss_fake + dc_c_loss_real

# Variables are assigned to each discriminator by a substring match on the variable name.
# NOTE(review): this relies on model.py naming the discriminator variables with
# 'dc_g_' / 'dc_c_' - verify there.
all_variables = tf.trainable_variables()
dc_g_var = [var for var in all_variables if 'dc_g_' in var.name]
dc_c_var = [var for var in all_variables if 'dc_c_' in var.name]
# Both discriminators use one fifth of the regularization learning rate and only update
# their own variables (var_list).
discriminator_g_optimizer = tf.train.AdamOptimizer(learning_rate=regularization_lr/5,
                                                       beta1=beta1).minimize(dc_g_loss, var_list=dc_g_var)
discriminator_c_optimizer = tf.train.AdamOptimizer(learning_rate=regularization_lr/5,
                                                       beta1=beta1).minimize(dc_c_loss, var_list=dc_c_var)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [9]:
# Generator loss
# The encoder plays the generator role: its outputs (scored as d_g_fake / d_c_fake) are
# pushed towards the all-ones target that the discriminators assign to fed-in samples.
generator_g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(d_g_fake), logits=d_g_fake))
generator_c_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(d_c_fake), logits=d_c_fake))
generator_loss = generator_g_loss + generator_c_loss

# NOTE(review): substring match - every trainable variable whose name contains 'e_'
# anywhere is selected. Presumably only encoder variables carry that prefix in model.py;
# verify that no decoder/discriminator variable name also contains 'e_'.
en_var = [var for var in all_variables if 'e_' in var.name]
# Only the variables in en_var are updated by the generator step.
generator_optimizer = tf.train.AdamOptimizer(learning_rate=regularization_lr, beta1=beta1).minimize(generator_loss, var_list=en_var)

In [10]:
# Semi-Supervised Classification Phase
# Train the encoder on the labeled samples fed through x_input_l / y_input.
with tf.variable_scope(tf.get_variable_scope()):
    # reuse=True: the encoder is called again on the labeled input; only its label
    # output is kept here.
    encoder_output_label_, _ = encoder(x_input_l, reuse=True, supervised=True)

# Classification accuracy of encoder: predicted class index vs. index of the fed label vector
output_label = tf.argmax(encoder_output_label_, 1)
correct_pred = tf.equal(output_label, tf.argmax(y_input, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# encoder_output_label_ is used as logits here.
# NOTE(review): tf.nn.softmax_cross_entropy_with_logits is deprecated in favour of the
# _v2 variant (see the warning printed below this cell).
supervised_encoder_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_input, logits=encoder_output_label_))
# Only the variables in en_var are updated by the supervised step.
supervised_encoder_optimizer = tf.train.AdamOptimizer(learning_rate=supervised_lr, beta1=beta1).minimize(supervised_encoder_loss, var_list=en_var)


Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [11]:
def form_results():
    """
    Create the folders for one run: tensorboard files, saved models and log files.

    The run folder is placed under the notebook-level ``results_path`` and is named after
    the current timestamp and the ``z_dim``, ``supervised_lr``, ``batch_size``,
    ``n_epochs`` and ``beta1`` settings. Missing parent directories (including
    ``results_path`` itself) are created, and folders that already exist are reused.

    :return: three strings pointing to tensorboard, saved models and log paths respectively.
    """
    # NOTE(review): str(datetime.now()) contains spaces and colons, which some file
    # systems reject in path names.
    folder_name = "/{0}_{1}_{2}_{3}_{4}_{5}_Semi_Supervised". \
        format(datetime.datetime.now(), z_dim, supervised_lr, batch_size, n_epochs, beta1)
    run_dir = results_path + folder_name
    tensorboard_path = run_dir + '/Tensorboard'
    saved_model_path = run_dir + '/Saved_models/'
    log_path = run_dir + '/log'
    # makedirs creates results_path and the run folder on the way; exist_ok keeps the
    # call safe when part of the tree is already there.
    for path in (tensorboard_path, saved_model_path, log_path):
        os.makedirs(path, exist_ok=True)
    return tensorboard_path, saved_model_path, log_path

In [12]:
def next_batch(x, y, batch_size):
    """
    Return a random batch, drawn without replacement, from paired inputs.

    Indices are sampled from the actual number of rows in ``x`` rather than from the
    notebook-level ``n_labeled``, so the function works for arrays of any length.

    :param x: array of samples, indexed along its first axis
    :param y: array of labels, aligned with x along the first axis
    :param batch_size: integer, maximum number of rows to return
    :return: (x_batch, y_batch) with min(batch_size, len(x)) rows each
    :raises ValueError: if x and y do not have the same number of rows
    """
    n_samples = len(x)
    if len(y) != n_samples:
        raise ValueError("x and y must have the same number of rows")
    # A single permutation gives the same indices for both arrays, so pairs stay aligned.
    random_index = np.random.permutation(n_samples)[:batch_size]
    return x[random_index], y[random_index]

# Training

In [13]:
def get_val_acc(val_size, batch_size, tfdata):
    """
    Classify a validation stream with the encoder and compute binary metrics.

    Reads the notebook-level ``sess``, ``output_label``, ``x_input_l`` and ``y_input``,
    so it must be called while the training session is open.

    :param val_size: number of validation samples; only the full batches
        (val_size / batch_size, rounded down) are evaluated
    :param batch_size: number of samples fetched per step
    :param tfdata: data handle passed to data_stream together with the session
    :return: (accuracy, precision, recall, f1), with class index 1 as the positive class
    """
    y_true, y_pred = [], []
    num_batches = int(val_size/batch_size)
    for j in tqdm.tqdm(range(num_batches)):
        # data_stream is the batch reader imported from utils and used by the training
        # loop; the previous name `data_helper` is not defined in this notebook.
        batch_x_l, batch_y_l = data_stream(tfdata, sess)
        batch_pred = sess.run(output_label, feed_dict={x_input_l: batch_x_l, y_input: batch_y_l})

        # Labels arrive as one vector per sample; argmax turns them into class indices.
        batch_label = np.argmax(batch_y_l, axis=1)
        y_pred += batch_pred.tolist()
        y_true += batch_label.tolist()

    avg_acc = np.equal(y_true, y_pred).mean()

    # labels=[0, 1] forces a 2x2 matrix even when only one class shows up in the data,
    # so the four-way unpack below cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    # NOTE(review): a zero denominator (e.g. no positive predictions) yields nan/inf here.
    fnr = fn/(tp + fn)
    err = (fn + fp) / (tp + tn + fp + fn)
    precision = tp/(tp + fp)
    recall = 1 - fnr
    f1 = (2 * precision * recall) / (precision + recall)

    return avg_acc, precision, recall, f1

In [None]:
init = tf.global_variables_initializer()

# Tensorboard visualization
tf.summary.scalar(name='Autoencoder Loss', tensor=autoencoder_loss)
tf.summary.scalar(name='Discriminator gauss Loss', tensor=dc_g_loss)
tf.summary.scalar(name='Discriminator categorical Loss', tensor=dc_c_loss)
tf.summary.scalar(name='Generator Loss', tensor=generator_loss)
tf.summary.scalar(name='Supervised Encoder Loss', tensor=supervised_encoder_loss)
# tf.summary.scalar(name='Supervised Encoder Accuracy', tensor=accuracy)
tf.summary.histogram(name='Encoder Gauss Distribution', values=encoder_output_latent)
tf.summary.histogram(name='Real Gauss Distribution', values=real_distribution)
tf.summary.histogram(name='Encoder Categorical Distribution', values=encoder_output_label)
tf.summary.histogram(name='Real Categorical Distribution', values=categorial_distribution)

summary_op = tf.summary.merge_all()

accuracies = []
# Saving the model
saver = tf.train.Saver()
step = 0
# Early stopping
# NOTE(review): the early-stopping bookkeeping below is declared but never read by the
# loop in this cell.
save_sess = None
best_acc = 1
stop = False
last_improvement = 0
require_improvement = 20

with tf.Session() as sess:
    tensorboard_path, saved_model_path, log_path = form_results()
    sess.run(init)
    writer = tf.summary.FileWriter(logdir=tensorboard_path, graph=sess.graph)
    try:
        for epoch in range(n_epochs):
            if epoch == 150:
                # NOTE(review): the optimizers were already built from these Python
                # numbers, so dividing them here is not expected to change the rate the
                # existing Adam ops use. Feed the rate through a placeholder/variable
                # when building the optimizers to make this decay effective.
                supervised_lr /= 10
                reconstruction_lr /= 10
                regularization_lr /= 10
            n_batches = int(n_labeled / batch_size)
            print("------------------Epoch {}/{}------------------".format(epoch, n_epochs))
            for b in tqdm.tqdm(range(1, n_batches + 1)):
                # "Real" samples for the discriminators: a gaussian scaled by 5 and
                # uniformly drawn one-hot labels over the n_labels classes.
                z_real_dist = np.random.randn(batch_size, z_dim) * 5.
                real_cat_dist = np.random.randint(low=0, high=n_labels, size=batch_size)
                real_cat_dist = np.eye(n_labels)[real_cat_dist]

                batch_x_ul, _ = data_stream(train_unlabel, sess)
                # NOTE(review): a labeled batch is fetched every step but only trained
                # on every 10th step (see below).
                batch_x_l, batch_y_l = data_stream(train_label, sess)

                # Reconstruction, both discriminators, then the generator, all on the
                # same unlabeled batch.
                sess.run(autoencoder_optimizer, feed_dict={x_input: batch_x_ul, x_target: batch_x_ul})
                sess.run(discriminator_g_optimizer,
                         feed_dict={x_input: batch_x_ul, x_target: batch_x_ul, real_distribution: z_real_dist})
                sess.run(discriminator_c_optimizer,
                         feed_dict={x_input: batch_x_ul, x_target: batch_x_ul,
                                    categorial_distribution: real_cat_dist})
                sess.run(generator_optimizer, feed_dict={x_input: batch_x_ul, x_target: batch_x_ul})

                if b % 10 == 0:
                    sess.run(supervised_encoder_optimizer, feed_dict={x_input_l: batch_x_l, y_input: batch_y_l})
                if b % 100 == 0:
                    a_loss, d_g_loss, d_c_loss, g_loss, s_loss, summary = sess.run(
                        [autoencoder_loss, dc_g_loss, dc_c_loss, generator_loss, supervised_encoder_loss,
                         summary_op],
                        feed_dict={x_input: batch_x_ul, x_target: batch_x_ul,
                                   real_distribution: z_real_dist, y_input: batch_y_l, x_input_l: batch_x_l,
                                   categorial_distribution: real_cat_dist})
                    writer.add_summary(summary, global_step=step)
                    # Every entry ends with a newline so the log has one value per line.
                    with open(log_path + '/log.txt', 'a') as log:
                        log.write("Epoch: {}, iteration: {}\n".format(epoch, b))
                        log.write("Autoencoder Loss: {}\n".format(a_loss))
                        log.write("Discriminator Gauss Loss: {}\n".format(d_g_loss))
                        log.write("Discriminator Categorical Loss: {}\n".format(d_c_loss))
                        log.write("Generator Loss: {}\n".format(g_loss))
                        log.write("Supervised Loss: {}\n".format(s_loss))
                step += 1

            # Validation metrics every 10th epoch; a checkpoint is written every epoch.
            if (epoch + 1) % 10 == 0:
                print("Runing on validation...")
                acc_known, precision_known, recall_known, f1_known = get_val_acc(validation_size, batch_size, validation)
                print("Accuracy on Known attack: {}".format(acc_known))
                print("Precision on Known attack: {}".format(precision_known))
                print("Recall on Known attack: {}".format(recall_known))
                print("F1 on Known attack: {}".format(f1_known))
            saver.save(sess, save_path=saved_model_path, global_step=step)
    finally:
        # Flush pending summaries and release the event file even if training is interrupted.
        writer.close()

INFO:tensorflow:Summary name Autoencoder Loss is illegal; using Autoencoder_Loss instead.
INFO:tensorflow:Summary name Discriminator gauss Loss is illegal; using Discriminator_gauss_Loss instead.
INFO:tensorflow:Summary name Discriminator categorical Loss is illegal; using Discriminator_categorical_Loss instead.
INFO:tensorflow:Summary name Generator Loss is illegal; using Generator_Loss instead.
INFO:tensorflow:Summary name Supervised Encoder Loss is illegal; using Supervised_Encoder_Loss instead.
INFO:tensorflow:Summary name Encoder Gauss Distribution is illegal; using Encoder_Gauss_Distribution instead.
INFO:tensorflow:Summary name Real Gauss Distribution is illegal; using Real_Gauss_Distribution instead.
INFO:tensorflow:Summary name Encoder Categorical Distribution is illegal; using Encoder_Categorical_Distribution instead.
INFO:tensorflow:Summary name Real Categorical Distribution is illegal; using Real_Categorical_Distribution instead.


  0%|          | 0/8000 [00:00<?, ?it/s]

------------------Epoch 0/500------------------


 94%|█████████▍| 7536/8000 [01:54<00:05, 89.76it/s]