In [None]:
# Get datasets from: https://drive.google.com/open?id=16w02wuMOqoLm6-YlM-rhAC1uZthO-2A3
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then hand the application-default credentials
# to PyDrive so the notebook can read files from Google Drive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Download the dataset archive from Google Drive by file id.
# Only large_95_100.zip is fetched; the two alternative archives below are
# left commented out and can be enabled instead when needed.
# downloaded = drive.CreateFile({'id': '1pghUuYyaYsuwDCA7Ow1Y601kGl6qPboL'})
# downloaded.GetContentFile('large_10_1000.zip')
# downloaded = drive.CreateFile({'id': '19wcJs9TdIwhT85TGdsaw1GV1ypbLnOpK'})
# downloaded.GetContentFile('small_10_100.zip')
downloaded = drive.CreateFile({'id': '1f-o1ygRlTZtvhbC9naP721khFJicbp-n'})
# Save the Drive file locally under the name the unzip step expects.
downloaded.GetContentFile('large_95_100.zip')

# Unpack the downloaded archive into the working directory
# (-o overwrites existing files without prompting, so the cell can be re-run).
# NOTE(review): data_path is later set to "undefended/" - presumably the
# directory this archive extracts to; confirm against the archive contents.
# !unzip -o small_10_100.zip
# !unzip -o large_10_1000.zip
!unzip -o large_95_100.zip

In [None]:
%tensorflow_version 2.x
%load_ext tensorboard
from itertools import groupby, takewhile

import numpy as np
import tensorflow as tf
from keras.utils.vis_utils import plot_model
from sklearn.utils import shuffle
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import RMSprop, Adam, Adamax
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical
#tf.logging.set_verbosity(tf.logging.ERROR)


In [None]:

# Global definitions: module-level settings read by get_data() and by the
# LSTM model wrapper defined further down.
data_path="undefended/"  # directory holding the trace files, named "<site>-<instance>"

num_sites=95               # number of sites to load (max 95)
num_instances=100          # number of instances to load per site (max 100)
file_ext=""                # trace file extension, appended verbatim to the file name
max_length = 200          # length every condensed direction sequence is padded or
                          # truncated to; also reused as the number of LSTM units


In [None]:
def condense(x):
  """
  Run-length encode a sequence of packet directions.

  Zero entries are dropped first. Every run of consecutive equal values is
  then collapsed into a single signed count (run length times the value), so
  ``[-1, -1, -1, 1, 1, 1, 1]`` becomes ``[-3, 4]``.

  The sequence is processed in a single pass, so any iterable is accepted
  (lists, numpy arrays, generators) and the cost is linear in its length.

  :param x: iterable of numbers; expected to hold +1 / -1 directions, with 0
      treated as "no packet" and ignored.
  :return: list with one entry per run, in the original order.
  """
  nonzero = (value for value in x if value != 0)
  # groupby yields (value, run) for each maximal run of equal neighbours;
  # the run length is counted without materialising the run.
  return [sum(1 for _ in run) * value for value, run in groupby(nonzero)]


def get_data():
    """
    Load all trace files and build the labelled, condensed direction matrix.

    The module-level settings `data_path`, `num_sites`, `num_instances`,
    `file_ext` and `max_length` are read at call time. Trace files are named
    ``<site>-<instance><file_ext>`` inside `data_path`. Every non-blank line
    is tab-separated and the sign of its second column gives the packet
    direction: values greater than 0 map to +1, everything else to -1.
    Blank lines (e.g. a trailing newline at the end of a file) are skipped.

    The direction sequence of each file is condensed with `condense`, so
    packets in the form [-1,-1,-1,1,1,1,1] are stored as [-3,4]. This shortens
    the sequences to make LSTM classification easier. The condensed sequence
    is then zero-padded or truncated to exactly `max_length` entries.
    Statistics on how much the condensation shortened the sequences are
    printed.

    :return: a numpy ndarray of dimension (m x (max_length + 1)), where `m`
        is `num_sites * num_instances`. The first `max_length` columns hold
        the condensed directions; the last column holds the class label,
        which is the website number the instance belongs to.
    :raises OSError: if a trace file cannot be opened.
    :raises IndexError: if a non-blank line has no second column.
    :raises ValueError: if the second column of a line is not numeric.
    """

    # read data from files
    print("loading data...")
    data = []
    len_changes = []   # per file: raw length minus condensed length
    lens = []          # per file: condensed length before padding/truncation
    for site in range(num_sites):
        for instance in range(num_instances):
            file_name = f"{site}-{instance}"
            directions = []
            with open(data_path + file_name + file_ext, "r") as file_pt:
                for line in file_pt:
                    stripped = line.strip()
                    if not stripped:
                        # tolerate blank lines instead of failing on x[1]
                        continue
                    fields = stripped.split('\t')
                    directions.append(1 if float(fields[1]) > 0 else -1)

            # Condense the sequence and record how much shorter it became
            old_len = len(directions)
            directions = condense(directions)
            new_len = len(directions)
            lens.append(new_len)
            len_changes.append(old_len - new_len)

            # Force the row to exactly max_length entries: truncate long
            # sequences, zero-pad short ones
            directions = directions[:max_length]
            directions.extend([0] * (max_length - len(directions)))

            # last column is the class label
            data.append(directions + [site])

    # Display min/max/average length changes
    if len_changes:
        print(f"Maximum Length Change: {max(len_changes)}")
        print(f"Minimum Length Change: {min(len_changes)}")
        print(f"Average Length Change: {np.mean(len_changes)}")
        print(f"Average Condensed Length: {np.mean(lens)}")
    else:
        # nothing was requested (num_sites or num_instances is 0)
        print("no trace files loaded")
    print("done")
    return np.array(data)


def split_data(X, Y, fraction=0.80, balance_dist=False):
    """
    Shuffle the samples and split them into a training and a test set.

    :param X: a numpy ndarray of dimension (m x n) containing data samples
    :param Y: a numpy ndarray of dimension (m,) containing the labels for X
    :param fraction: a value between 0 and 1; this share of the data becomes
        the training set and the remainder becomes the test set.
    :param balance_dist: boolean value. If true, the split is performed per
        class, so every class contributes `fraction` of its own samples
        (rounded) to the training set.
    :return: X_train, Y_train, X_test, Y_test

    With `balance_dist=True` the set sizes are the sums of the per-class
    splits, which can differ slightly from ``round(m * fraction)`` because
    each class is rounded on its own. The returned arrays of that branch are
    float64, as they have always been.
    """
    X, Y = shuffle(X, Y)

    if not balance_dist:
        split_index = int(round(X.shape[0] * fraction))
        return X[:split_index, :], Y[:split_index], \
            X[split_index:, :], Y[split_index:]

    # Collect row indices per class instead of writing into arrays that were
    # pre-sized from the global split: the per-class rounding does not always
    # add up to the global one, which used to cause shape errors or leave
    # all-zero rows (labelled class 0) in the result.
    train_parts = []
    test_parts = []
    for label in np.unique(Y):
        indices = np.where(Y == label)[0]
        split = int(round(len(indices) * fraction))
        train_parts.append(indices[:split])
        test_parts.append(indices[split:])

    no_rows = np.empty(0, dtype=int)
    train_idx = np.concatenate(train_parts) if train_parts else no_rows
    test_idx = np.concatenate(test_parts) if test_parts else no_rows

    # float64 keeps the dtype this branch has always returned
    X_train = np.asarray(X[train_idx], dtype=np.float64)
    X_test = np.asarray(X[test_idx], dtype=np.float64)
    Y_train = np.asarray(Y[train_idx], dtype=np.float64)
    Y_test = np.asarray(Y[test_idx], dtype=np.float64)

    # rows are grouped by class at this point, so mix them again
    X_train, Y_train = shuffle(X_train, Y_train)
    X_test, Y_test = shuffle(X_test, Y_test)
    return X_train, Y_train, X_test, Y_test

class LSTM:
    """
    Thin wrapper around a Keras ``Sequential`` classifier made of a single
    LSTM layer followed by a softmax ``Dense`` layer.

    The compiled Keras model is exposed as ``self.model``.

    NOTE: this class has the same name as the Keras ``LSTM`` layer imported at
    the top of the notebook and shadows it once this cell has run. The layer
    is therefore always referenced as ``tf.keras.layers.LSTM`` in here.
    """

    def __init__(self, num_features, num_classes):
        """
        Build and compile the network, write a diagram of it to
        ``model_plot.png`` and print its summary.

        :param num_features: number of time steps per sample; the model input
            shape is ``(num_features, 1)``.
        :param num_classes: number of output classes (websites), i.e. the
            width of the softmax layer.
        """
        # Start from an empty stack and add the layers one by one.
        model = Sequential()

        # LSTM layer
        # - units: size of the LSTM output / hidden state; taken from the
        #   global max_length so it can be tuned together with the input length
        # - recurrent_dropout=0.5: dropout on the recurrent connections
        #   (not on the inputs)
        # - tanh is the Keras default activation; set explicitly for clarity
        model.add(
            tf.keras.layers.LSTM(
                units=max_length,
                return_sequences=False,  # only the last step feeds the classifier
                input_shape=(num_features, 1),
                recurrent_dropout=0.5,
                activation="tanh"
            )
        )

        # Fully connected output layer with one unit per website; softmax
        # turns the outputs into a probability distribution over the classes.
        model.add(
            tf.keras.layers.Dense(
                units=num_classes,
                activation='softmax'
            )
        )

        # Compile the model; labels are expected one-hot encoded
        model.compile(
            loss=tf.keras.losses.categorical_crossentropy,
            optimizer=Adam(learning_rate=0.001),
            metrics=['accuracy'])
        self.model = model

        # Use the tf.keras plotting helper so it matches the tf.keras model
        # built above (rather than the one from the standalone keras package).
        tf.keras.utils.plot_model(model, to_file='model_plot.png',
                                  show_shapes=True)

        # summary() prints the table itself and returns None, so it must not
        # be wrapped in print().
        self.model.summary()

    def fit(self, x_train, y_train, batch_size, epochs, verbose):
        """
        Train the model on the given data.

        20% of the training data is held out as validation data. Training
        stops early once the validation loss has not improved for 3
        consecutive epochs. TensorBoard logs are written to ``./graph``.

        :param x_train: training samples, shaped to match the model input.
        :param y_train: one-hot encoded training labels.
        :param batch_size: number of samples per gradient update; larger
            batches need more memory.
        :param epochs: maximum number of passes over the training data.
        :param verbose: Keras verbosity level, passed through to ``fit``.
        :return: the Keras ``History`` object returned by ``Model.fit``.
        """
        tboard_cb = TensorBoard(log_dir='./graph', histogram_freq=0,
                        write_graph=True, write_images=True)
        early_stopping_cb = EarlyStopping(monitor="val_loss", patience=3)

        return self.model.fit(
            x_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            # This fraction of the training data is set apart, not trained
            # on, and used to evaluate loss and metrics after each epoch.
            validation_split=0.2,
            verbose=verbose,
            callbacks=[tboard_cb, early_stopping_cb]
        )

In [None]:
def main():
    """
    Run the whole experiment: load the traces, split them into training and
    test data, train the LSTM classifier and print its test accuracy and loss.
    """

    # Load the data and create X and Y matrices (last column is the label)
    data = get_data()
    num_features = data.shape[1] - 1
    X = data[:, :num_features]
    Y = data[:, -1]

    # Fix the number of classes from the full label set. Letting
    # to_categorical infer it separately for each split can give train and
    # test labels (and the model output) different widths when a split does
    # not contain the highest label.
    num_classes = int(Y.max()) + 1

    # split the data into training and test set
    X_train, Y_train, X_test, Y_test = split_data(X, Y, 0.85, balance_dist=True)
    # add the trailing feature axis: (samples, time steps) -> (samples, time steps, 1)
    X_train = np.expand_dims(X_train, axis=2)
    X_test = np.expand_dims(X_test, axis=2)
    Y_train = to_categorical(Y_train, num_classes=num_classes)
    Y_test = to_categorical(Y_test, num_classes=num_classes)

    # instantiate the LSTM model and train on the data
    model = LSTM(num_features, num_classes)
    model.fit(X_train, Y_train, batch_size=64, epochs=50, verbose=1)

    # Evaluate the trained LSTM model on test data and print the results;
    # evaluate() returns [loss, accuracy] for the compiled metrics
    score = model.model.evaluate(X_test, Y_test, batch_size=100)
    print("Test accuracy: ", round(score[1]*100, 2))
    print("Test loss: ", round(score[0], 2))

    # TODO: per-class report. Predict with the wrapped Keras model
    # (model.model.predict) and compare the argmax of the predictions with
    # the argmax of Y_test, e.g. via sklearn.metrics.classification_report.

In [None]:
# Run the full load / train / evaluate pipeline when this cell is executed.
if __name__ == '__main__':
    main()

In [None]:
# Open TensorBoard on the directory the TensorBoard callback logs to during training.
%tensorboard --logdir graph/