In [None]:
%matplotlib notebook

import time

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.stats.stats import pearsonr
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.utils import shuffle
import sklearn

In [None]:
# read dataset
def load_data(train_path='data/KDDTrain+.csv', test_path='data/KDDTest+.csv',
              random_state=None):
    """
    Loads the train/test CSV files and prepares them for the network.

    Rows are shuffled, the label column (second-to-last; the last column is
    ignored) is binarised ('normal' -> 0, anything else -> 1), the symbolic
    feature columns 1, 2 and 3 are label-encoded with the categories seen in
    the training set, every feature is standardised with the training-set
    mean and standard deviation, and labels are one-hot encoded.

    Parameters:
    -----------
    train_path: str
        headerless CSV file holding the training examples
    test_path: str
        headerless CSV file holding the test examples
    random_state: int or None
        seed forwarded to sklearn.utils.shuffle; None leaves the shuffle unseeded

    Returns:
    --------
    train_data_x, train_data_y, test_data_x, test_data_y: numpy.ndarray
        float feature matrices of shape (n_examples, n_features) and one-hot
        label matrices of shape (n_examples, 2)

    Raises:
    -------
    ValueError
        if a symbolic column of the test set holds a value unseen in training
    """
    # Imported here so the submodule is loaded explicitly; a bare
    # `import sklearn` does not guarantee `sklearn.preprocessing` is available.
    from sklearn.preprocessing import LabelEncoder

    train_data = shuffle(pd.read_csv(train_path, header=None), random_state=random_state)
    test_data = shuffle(pd.read_csv(test_path, header=None), random_state=random_state)

    symbolic_cols = [1, 2, 3]

    # Integer class ids: 0 for 'normal', 1 for everything else.
    train_labels = (train_data.iloc[:, -2] != 'normal').values.astype(int)
    test_labels = (test_data.iloc[:, -2] != 'normal').values.astype(int)

    # Fresh object arrays so the symbolic columns can be overwritten in place
    # without touching the DataFrames.
    train_data_x = np.array(train_data.iloc[:, :-2], dtype=object)
    test_data_x = np.array(test_data.iloc[:, :-2], dtype=object)

    for col in symbolic_cols:
        enc = LabelEncoder()
        train_data_x[:, col] = enc.fit_transform(train_data_x[:, col])
        # The encoder is fitted on training data only, so both sets share ids.
        test_data_x[:, col] = enc.transform(test_data_x[:, col])

    # Everything is numeric now; leave object dtype so the statistics below
    # are computed with real floating-point arithmetic.
    train_data_x = train_data_x.astype(np.float64)
    test_data_x = test_data_x.astype(np.float64)

    # Standardise with statistics of the training FEATURES (not the labels).
    means = np.mean(train_data_x, axis=0, keepdims=True)
    stds = np.std(train_data_x, axis=0, keepdims=True)
    # A constant column has std 0; divide by 1 so it becomes 0 instead of NaN.
    stds[stds == 0] = 1.0

    train_data_x = (train_data_x - means) / stds
    test_data_x = (test_data_x - means) / stds

    # convert labels to 2 classes (2 cols per example)
    train_data_y = np.eye(2)[train_labels]
    test_data_y = np.eye(2)[test_labels]

    return train_data_x, train_data_y, test_data_x, test_data_y

In [None]:
def create_symbolic_mapping(data, columns):
    """
    Collects the distinct values found in the given columns of `data`.

    Parameters:
    -----------
    data: iterable of indexable rows
        each row is indexed with the entries of `columns`
    columns: sequence
        column indices (or keys) to inspect

    Returns:
    --------
    list of lists, one per entry of `columns`, holding that column's distinct
    values in order of first appearance. The result is also printed.
    """
    values = []
    for col in columns:
        seen = []
        for row in data:
            candidate = row[col]
            if candidate in seen:
                continue
            seen.append(candidate)
        values.append(seen)
    print(values)
    return values


In [None]:
def random_test_train_split(data, test_size=0.2, random_state=None):
    """
    Randomly splits the data into training and test sets.

    The second-to-last column is used as the label; the last column is
    dropped and every column before those two is a feature.

    Parameters:
    -----------
    data: pandas.DataFrame
        consolidated dataset returned by load_data() or normalize_data()
    test_size: float or int
        forwarded to sklearn's train_test_split (default 0.2)
    random_state: int or None
        seed forwarded to train_test_split; None leaves the split unseeded

    Returns:
    --------
    X_train, X_test, y_train, y_test: pandas.DataFrame
        feature and label frames (labels keep their single column)
    """
    # Positional selection, so duplicate or non-default column labels are safe.
    X = data.iloc[:, :-2]
    y = data.iloc[:, -2:-1]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

In [None]:
def normalize_data(data):
    """
    Centers each column around 0 and scales it by 1/std

    Mean and standard deviation are computed per column (population std,
    ddof=0). The axis is given explicitly because relying on the implicit
    axis of np.mean/np.std on a DataFrame yields per-column values or a
    single scalar depending on the installed pandas version.

    Parameters:
    -----------
    data: pandas.DataFrame
        consolidated dataset returned by load_data(); all columns numeric

    Returns:
    --------
    pandas.DataFrame with the same index and columns as `data`

    TODO: handle categorical variables
    """
    col_means = data.mean(axis=0)
    col_stds = data.std(axis=0, ddof=0)
    # A constant column has std 0; divide by 1 so it becomes 0 instead of NaN.
    col_stds = col_stds.replace(0, 1.0)
    return (data - col_means) / col_stds

In [None]:
def forwardprop(X):
    """
    Builds the fully connected network on top of X and returns its output tensor.

    Layout: 128 ReLU units -> 64 ReLU units -> 2 sigmoid units. Both hidden
    layers use tf.contrib.layers.batch_norm as their normalizer; the output
    layer has none. No softmax is applied here since TensorFlow's
    softmax_cross_entropy_with_logits() does that internally.

    NOTE(review): the output layer applies a sigmoid, so the returned values
    lie in (0, 1) rather than being raw logits, yet they are passed as
    `logits` to softmax_cross_entropy_with_logits(). The training cell also
    thresholds these outputs at 0.5, so the activation is kept as-is here --
    confirm which behaviour is intended.
    """
    def dense(inputs, units, activation, normalizer):
        # Shared layer settings; only size, activation and normalizer vary.
        return tf.contrib.layers.fully_connected(
            inputs, units,
            activation_fn=activation,
            normalizer_fn=normalizer,
            normalizer_params=None,
            weights_initializer=tf.contrib.layers.xavier_initializer(),
            weights_regularizer=tf.contrib.layers.l2_regularizer(0.0),
            biases_initializer=tf.zeros_initializer(),
            biases_regularizer=None,
            reuse=None,
            variables_collections=None,
            outputs_collections=None,
            trainable=True,
            scope=None,
        )

    batch_norm = tf.contrib.layers.batch_norm
    hidden_1 = dense(X, 128, tf.nn.relu, batch_norm)
    hidden_2 = dense(hidden_1, 64, tf.nn.relu, batch_norm)
    return dense(hidden_2, 2, tf.nn.sigmoid, None)

In [None]:
# Load the encoded, standardised features and one-hot labels for both splits.
train_X, train_y, test_X, test_y = load_data()

In [None]:
# Layer's sizes
# Input width = number of feature columns, output width = number of label columns
x_size, y_size = train_X.shape[1], train_y.shape[1]


In [None]:
# Sanity check: (examples, features) then (examples, classes), one per line
print(train_X.shape, train_y.shape, sep='\n')

In [None]:
# Symbols
# Graph inputs; the leading None leaves the batch dimension unconstrained
X = tf.placeholder(tf.float32, shape=(None, x_size))
y = tf.placeholder(tf.float32, shape=(None, y_size))

In [None]:
# Forward propagation: build the network on the input placeholder.
yhat   = forwardprop(X)
# Unused: class predictions could be taken as the arg-max over the two outputs.
# predict = tf.argmax(yhat, axis=1)

In [None]:
# Backward propagation
# `cost` holds one cross-entropy value per example in the fed batch.
cost = tf.nn.softmax_cross_entropy_with_logits(logits=yhat, labels=y)
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
updates = optimizer.minimize(cost)


In [None]:
# Run mini-batch training and report accuracy every 10th epoch
NUM_EPOCHS = 1000
BATCH_SIZE = 2000

# NOTE(review): this device scope only covers ops created inside it (the
# initializer); the network graph was built in earlier cells -- confirm
# whether CPU pinning of the whole model was intended.
with tf.device('/cpu:0'):
    # The context manager closes the session even if training raises.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Ceiling division: the trailing partial batch is trained on as well,
        # and at least one step runs when the data is smaller than one batch.
        num_iters = (len(train_X) + BATCH_SIZE - 1) // BATCH_SIZE

        for epoch in range(NUM_EPOCHS):
            tic = time.time()
            for i in range(num_iters):
                idx1 = BATCH_SIZE * i
                idx2 = idx1 + BATCH_SIZE
                _, loss = sess.run([updates, cost],
                                   feed_dict={X: train_X[idx1:idx2], y: train_y[idx1:idx2]})

            # Full-dataset evaluation is only needed on the epochs that are reported.
            if epoch % 10 != 0:
                continue

            pred_train = sess.run(yhat, feed_dict={X: train_X})
            pred_test = sess.run(yhat, feed_dict={X: test_X})

            # Per-example accuracy: the highest-scoring output must match the
            # one-hot label (works for sigmoid outputs and raw logits alike).
            train_accuracy = np.mean(np.argmax(pred_train, axis=1) == np.argmax(train_y, axis=1))
            test_accuracy = np.mean(np.argmax(pred_test, axis=1) == np.argmax(test_y, axis=1))
            toc = time.time()

            print('time:', toc - tic)
            # `loss` is the per-example loss vector of the last batch of this epoch.
            print("Epoch = %d, train acc. = %.2f, test acc. = %.2f, loss: %.2f"
                  % (epoch + 1, train_accuracy, test_accuracy, np.mean(loss)))
