In [1]:
import tensorflow as tf
# tf.enable_eager_execution()
import numpy as np
import datetime
import os
import argparse
import matplotlib.pyplot as plt
import pandas as pd
import json
import glob
import tqdm
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
tf.__version__

'1.15.0'

In [2]:
from model import encoder, decoder, discriminator_gauss, discriminator_categorical
from utils import read_tfrecord, data_from_tfrecord, data_stream

In [3]:
# --- Data and batching ---
input_dim = 29 ** 2            # 841 input features per sample (presumably a flattened 29x29 frame -- confirm)
n_labels = 2                   # width of the label / categorical vectors
batch_size, n_epochs = 100, 500

# --- Network sizes ---
# n_l1 / n_l2 are not referenced in this notebook; presumably hidden-layer widths used by model.py -- confirm.
n_l1 = n_l2 = 1000
z_dim = 10                     # width of the latent vector produced by the encoder

# --- Adam settings: one learning rate per training phase, shared beta1 ---
# (a single shared `learning_rate = 0.001` was used previously)
supervised_lr, reconstruction_lr, regularization_lr = 1e-3, 6e-4, 8e-4
beta1 = 0.9

In [4]:
# Graph inputs. All placeholders are float32; every one except the manual decoder
# input has a fixed leading dimension of `batch_size`.
_sample_shape = [batch_size, input_dim]
_label_shape = [batch_size, n_labels]

# Batch fed to the encoder in the reconstruction phase, and its reconstruction target
x_input = tf.placeholder(tf.float32, _sample_shape, name='Input')
x_target = tf.placeholder(tf.float32, _sample_shape, name='Target')

# Labeled batch and its labels for the supervised phase (labels presumably one-hot -- confirm)
x_input_l = tf.placeholder(tf.float32, _sample_shape, name='Labeled_Input')
y_input = tf.placeholder(tf.float32, _label_shape, name='Labels')

# "Real" samples shown to the gaussian and categorical discriminators
real_distribution = tf.placeholder(tf.float32, [batch_size, z_dim], name='Real_distribution')
categorial_distribution = tf.placeholder(tf.float32, _label_shape, name='Categorical_distribution')

# Single hand-crafted decoder input (label part + latent part); not used elsewhere in this notebook
manual_decoder_input = tf.placeholder(tf.float32, [1, z_dim + n_labels], name='Decoder_input')

In [5]:
# Reconstruction phase.
# The encoder maps the input batch to two outputs (a label part and a latent part). They are
# concatenated along axis 1 and fed to the decoder, which produces the reconstruction.
# The phase is optimized with autoencoder_loss: the mean squared error between x_target and
# the decoder output.
with (tf.variable_scope(tf.get_variable_scope())):
    encoder_output_label, encoder_output_latent = encoder(x_input)
    decoder_input = tf.concat([encoder_output_label, encoder_output_latent], 1)
    decoder_output = decoder(decoder_input)

autoencoder_loss = tf.reduce_mean(tf.square(x_target - decoder_output))
# No var_list is passed, so minimize() falls back to the graph's trainable-variables collection.
autoencoder_optimizer = tf.train.AdamOptimizer(learning_rate=reconstruction_lr, beta1=beta1).minimize(autoencoder_loss)




In [6]:
# Regularization phase.
# Train the two discriminators (gaussian and categorical) to tell the supplied "real"
# distributions apart from the corresponding encoder outputs.
with (tf.variable_scope(tf.get_variable_scope())):
    # Gaussian discriminator: real samples vs. the encoder's latent output.
    # reuse=True on the second call presumably shares the weights created by the first -- confirm in model.py.
    d_g_real = discriminator_gauss(real_distribution)
    d_g_fake = discriminator_gauss(encoder_output_latent, reuse=True)
# The gaussian and categorical discriminators are built in separate scope blocks
with (tf.variable_scope(tf.get_variable_scope())):
    # Categorical discriminator: real categorical samples vs. the encoder's label output
    d_c_real = discriminator_categorical(categorial_distribution)
    d_c_fake = discriminator_categorical(encoder_output_label, reuse=True)

# Gaussian discriminator loss: real inputs are labeled 1, encoder outputs are labeled 0
dc_g_loss_real = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(d_g_real), logits=d_g_real))
dc_g_loss_fake = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.zeros_like(d_g_fake), logits=d_g_fake))
dc_g_loss = dc_g_loss_real + dc_g_loss_fake

# Categorical discriminator loss: same labeling scheme as above
dc_c_loss_real = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(d_c_real), logits=d_c_real))
dc_c_loss_fake = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.zeros_like(d_c_fake), logits=d_c_fake))
dc_c_loss = dc_c_loss_fake + dc_c_loss_real

# Select each discriminator's variables by a substring of the variable name.
# NOTE(review): this assumes model.py names them with 'dc_g_' / 'dc_c_' -- confirm; a mismatch
# leaves the list empty.
all_variables = tf.trainable_variables()
dc_g_var = [var for var in all_variables if 'dc_g_' in var.name]
dc_c_var = [var for var in all_variables if 'dc_c_' in var.name]
# Both discriminators are trained at one fifth of regularization_lr and only update their own variables.
discriminator_g_optimizer = tf.train.AdamOptimizer(learning_rate=regularization_lr/5,
                                                       beta1=beta1).minimize(dc_g_loss, var_list=dc_g_var)
discriminator_c_optimizer = tf.train.AdamOptimizer(learning_rate=regularization_lr/5,
                                                       beta1=beta1).minimize(dc_c_loss, var_list=dc_c_var)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [7]:
# Generator loss.
# The encoder acts as the generator: its outputs (d_g_fake / d_c_fake logits) are labeled 1 here,
# so minimizing this loss pushes both discriminators to classify encoder outputs as "real".
generator_g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(d_g_fake), logits=d_g_fake))
generator_c_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(d_c_fake), logits=d_c_fake))
generator_loss = generator_g_loss + generator_c_loss

# Variables updated by the generator step, selected by name substring from `all_variables`
# (built in the regularization cell).
# NOTE(review): 'e_' is a loose substring -- any variable whose name contains 'e_' is selected.
# Confirm against model.py that this matches the encoder variables only.
en_var = [var for var in all_variables if 'e_' in var.name]
generator_optimizer = tf.train.AdamOptimizer(learning_rate=regularization_lr, beta1=beta1).minimize(generator_loss, var_list=en_var)

In [8]:
# Semi-supervised classification phase.
# Train the encoder on a small amount of labeled samples. The encoder is called again on the
# labeled placeholder with reuse=True (presumably sharing the weights built in the
# reconstruction phase) and supervised=True (presumably returning raw logits -- confirm in model.py).
with tf.variable_scope(tf.get_variable_scope()):
    encoder_output_label_, encoder_latent = encoder(x_input_l, reuse=True, supervised=True)

# Classification accuracy of the encoder: fraction of samples whose arg-max class matches the label
output_label = tf.argmax(encoder_output_label_, 1)
correct_pred = tf.equal(output_label, tf.argmax(y_input, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# softmax_cross_entropy_with_logits is deprecated in TF 1.x. The _v2 op with stop_gradient on the
# labels is what the deprecated function does internally, so the loss and gradients are unchanged
# and only the deprecation warning goes away.
supervised_encoder_loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=tf.stop_gradient(y_input), logits=encoder_output_label_))
# Only the variables in en_var (selected by name in the generator-loss cell) are updated.
supervised_encoder_optimizer = tf.train.AdamOptimizer(learning_rate=supervised_lr, beta1=beta1).minimize(
    supervised_encoder_loss, var_list=en_var)


Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [9]:
def evaluate(y_true, y_pred):
    """Print binary-classification metrics for predictions against ground truth.

    Labels are expected to be 0 (negative) and 1 (positive). The confusion
    matrix is always built over both classes, so the function also works when
    only one class occurs in the inputs.

    Args:
        y_true: array-like of ground-truth labels (0 or 1).
        y_pred: array-like of predicted labels (0 or 1), same length as y_true.

    Prints the false negative rate, error rate, precision, recall and F1 score.
    A metric whose denominator is zero is printed as nan.
    """
    def _ratio(numerator, denominator):
        # An undefined ratio is reported as nan explicitly rather than through a NumPy divide warning.
        return numerator / denominator if denominator else float('nan')

    # labels=[0, 1] forces a 2x2 matrix; without it a single-class input yields a 1x1 matrix
    # and the four-way unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    fnr = _ratio(fn, tp + fn)
    err = _ratio(fn + fp, tp + tn + fp + fn)
    precision = _ratio(tp, tp + fp)
    recall = 1 - fnr
    f1score = _ratio(2 * precision * recall, precision + recall)
    print('False negative rate: ', fnr)
    print('Error rate: ', err)
    print('Precision: ', precision)
    print('Recall: ', recall)
    print('F1 score: ', f1score)

# Train with K-means

In [10]:
# Encode the labeled training set with the restored encoder and fit K-means on the latent vectors.
# `init` is not run here (weights come from the checkpoint); the name is kept because other cells
# may reference it.
init = tf.global_variables_initializer()
saver = tf.train.Saver()
data_path = './Data/*/'
n_labeled = 800000
n_clusters = 5
checkpoint_dir = './Results/all/2021-07-12 11:07:24.409270_10_0.001_100_500_0.9_Semi_Supervised/Saved_models'
train_label = data_from_tfrecord(glob.glob(data_path + 'train_label'), batch_size, 1)

# Collect per-batch results in lists and concatenate once at the end; growing an array with
# np.append inside the loop copies the whole array on every iteration (quadratic in n_batches).
latent_batches = []
label_batches = []
with tf.Session() as sess:
    saver.restore(sess, save_path=tf.train.latest_checkpoint(checkpoint_dir))
    n_batches = n_labeled // batch_size
    for _ in tqdm.tqdm(range(n_batches)):
        batch_x_l, batch_y_l = data_stream(train_label, sess)
        label_batches.append(np.argmax(batch_y_l, axis=1))
        latent_batches.append(sess.run(encoder_latent, feed_dict={x_input_l: batch_x_l}))

# X: latent vectors, one row per sample; y: integer class index per sample.
# Cast to the dtypes the previous np.append-based accumulation produced.
X = np.concatenate(latent_batches, axis=0).astype(float)
y = np.concatenate(label_batches, axis=0).astype(int)
assert X.shape == (n_batches * batch_size, z_dim), X.shape
print(X.shape, y.shape)

# Fixed seed so the cluster assignment is reproducible across kernel restarts.
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)

Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.
INFO:tensorflow:Restoring parameters from ./Results/all/2021-07-12 11:07:24.409270_10_0.001_100_500_0.9_Semi_Supervised/Saved_models/-4000000


 17%|█▋        | 1359/8000 [00:03<00:17, 374.34it/s]


KeyboardInterrupt: 

In [None]:
def get_class(clt, y, threshold=0.7):
    """Map a K-means cluster id to a binary class via the labels of its training members.

    Uses the module-level fitted `kmeans` model: `kmeans.labels_` must be aligned
    element-wise with `y` (the labels of the samples K-means was fitted on).

    Args:
        clt: cluster id to classify.
        y: array-like of integer labels for the fitted samples; non-zero entries
            count as the positive ("attack") class.
        threshold: minimum fraction of non-zero labels in the cluster for the
            cluster to be classified as 1. Defaults to 0.7.

    Returns:
        1 if at least `threshold` of the cluster's members have a non-zero label,
        otherwise 0. A cluster with no members returns 0.
    """
    mask = (kmeans.labels_ == clt)
    same_cluster = np.asarray(y)[mask]
    n_members = same_cluster.shape[0]
    if n_members == 0:
        # Previously a bare `except` printed this diagnostic and then fell through to an
        # unbound `prob_attack`. With no evidence for "attack", report the negative class.
        print(clt, same_cluster.shape)
        return 0
    prob_attack = np.count_nonzero(same_cluster) / n_members
    return int(prob_attack >= threshold)

# Final prediction

In [None]:
# Run the restored encoder over the test set and collect, per sample, the true label, the
# predicted label and the latent vector.
# `init` is not run here (weights come from the checkpoint); the name is kept because other cells
# may reference it.
init = tf.global_variables_initializer()
saver = tf.train.Saver()
results_path = './Results/all/2021-07-12 11:07:24.409270_10_0.001_100_500_0.9_Semi_Supervised/'
test_size = int(1e6)

# Test TFRecords are discovered the same way the training cell discovers 'train_label' files.
# The previous code built the paths from an undefined `labels` list and rebound `data_path`
# (a glob string in the training cell) to a list.
# NOTE(review): confirm that every './Data/*/test' file belongs in the evaluation.
test_paths = sorted(glob.glob('./Data/*/test'))
if not test_paths:
    raise FileNotFoundError("No test TFRecord files match './Data/*/test'")

true_batches = []
pred_batches = []
latent_batches = []
with tf.Session() as sess:
    saver.restore(sess, save_path=tf.train.latest_checkpoint(os.path.join(results_path, 'Saved_models')))
    test = data_from_tfrecord(test_paths, batch_size, 1)

    num_batches = test_size // batch_size
    for _ in range(num_batches):
        # data_stream is the helper imported from utils (the undefined `data_helper` was called here before).
        x_test, y_test = data_stream(test, sess)
        # Reuse the tensors built in the semi-supervised cell instead of adding a second copy of the
        # encoder ops to the graph and shadowing `output_label`.
        batch_logits, batch_latent = sess.run([encoder_output_label_, encoder_latent],
                                              feed_dict={x_input_l: x_test})
        latent_batches.append(batch_latent)
        true_batches.append(np.argmax(y_test, axis=1))
        pred_batches.append(np.argmax(batch_logits, axis=1))

# Concatenate once instead of np.append per batch (which copies the whole array every iteration).
y_true = np.concatenate(true_batches, axis=0).astype(int)
y_pred = np.concatenate(pred_batches, axis=0).astype(int)
total_latent = np.concatenate(latent_batches, axis=0).astype(float)
assert total_latent.shape == (num_batches * batch_size, z_dim), total_latent.shape
        

In [None]:
# Second-stage refinement: samples the encoder predicted as normal (0) are assigned to their
# nearest K-means cluster and take that cluster's class (see get_class).
# Note: y_pred is updated in place.
normal_indices = (y_pred == 0).nonzero()
normal_latent = total_latent[normal_indices]
if normal_latent.shape[0] > 0:
    pred_cluster = kmeans.predict(normal_latent)
    # get_class scans every training cluster assignment, so call it once per distinct cluster
    # and broadcast the result, rather than once per test sample via np.vectorize.
    unique_clusters, inverse = np.unique(pred_cluster, return_inverse=True)
    cluster_class = np.array([get_class(c, y) for c in unique_clusters], dtype=y_pred.dtype)
    pred_label = cluster_class[inverse]
    y_pred[normal_indices] = pred_label
else:
    # Nothing was predicted as normal; kmeans.predict rejects empty input, so skip the refinement.
    pred_cluster = np.empty((0,), dtype=int)
    pred_label = np.empty((0,), dtype=int)