# LSTM Forecasting Model

O'Reilly Machine Learning and Security
* https://github.com/oreilly-mlsec/mlsec.net

Chapter 3 Resources
* https://github.com/oreilly-mlsec/book-resources/blob/master/chapter3/lstm-anomaly-detection.ipynb

In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers.core import Dense, Activation, Dropout
from utils import *

## Data Loading and Standardization

In [None]:
# Load the capture and turn every packet into a numeric feature row.
print("loading data...")
reader = read_tcpdump_file('data/week1_friday.tcpdump')
packets = np.array(list(featurize_packets(reader)))

# Standardize each feature column to zero mean / unit variance (z-score).
print("standardizing data... (impossible given protocol is invariable)")
means = packets.mean(axis=0)
stds = packets.std(axis=0)
# A feature that never varies has a standard deviation of 0; dividing by it
# would fill the whole column with NaN/inf and poison the training loss.
# Dividing by 1 instead leaves such a column at all zeros after centering.
stds = np.where(stds == 0, 1.0, stds)
packets -= means
packets /= stds

print("done.")

# TODO: cull non-continuous variables OR manually define loss

In [None]:
# Quick eyeball check of the standardized data: the element-wise difference
# between the first two rows, followed by the second row itself.
print(packets[0] - packets[1], packets[1], sep="\n")

## Hyper-parameters

In [None]:
# Training schedule handed to model.fit().
epochs, batch_size = 10, 50

# Window geometry: each n-gram holds `sequence_length` consecutive packets
# (the first sequence_length - 1 are the model input, the last one is the
# target) and every packet is described by `features` values.
# NOTE(review): `features` presumably has to match the row width produced by
# featurize_packets -- confirm against utils.
sequence_length, features = 4, 14

## Class Definition

In [None]:
class ForecastModel(object):
    """Stacked-LSTM model that forecasts the next feature row of a sequence.

    The input data is cut into overlapping windows ("n-grams") of
    ``sequence_length`` consecutive rows.  The first ``sequence_length - 1``
    rows of a window are the network input and the final row is the target.

    Attributes set by ``__init__``:
        sequence_length: number of rows per window.
        features: number of values per row (network input and output width).
        model: the compiled Keras ``Sequential`` network.

    Attributes set by ``prepare_data``:
        x_train, y_train, x_test, y_test: window inputs and targets.
    """

    def __init__(self, seq_len=None, n_features=None):
        """Create and compile the network.

        Args:
            seq_len: window length; defaults to the module-level
                ``sequence_length`` hyper-parameter.
            n_features: values per row; defaults to the module-level
                ``features`` hyper-parameter.
        """
        # The module-level hyper-parameters are only consulted when the
        # caller did not supply a value, so ``ForecastModel()`` behaves as
        # before.
        self.sequence_length = sequence_length if seq_len is None else seq_len
        self.features = features if n_features is None else n_features
        self.model = self.generate_model()

    def generate_model(self):
        """Build and compile the three-layer LSTM regression network.

        Returns:
            A compiled ``Sequential`` model taking input of shape
            ``(sequence_length - 1, features)`` and emitting ``features``
            linear outputs, trained with mean squared error and RMSprop.
        """
        model = Sequential()

        # The first two LSTM layers return the full sequence so the next
        # LSTM layer still receives one vector per time step.
        model.add(LSTM(input_shape=(self.sequence_length - 1, self.features),
                       units=32,
                       return_sequences=True))
        model.add(Dropout(0.2))

        model.add(LSTM(units=128,
                       return_sequences=True))
        model.add(Dropout(0.2))

        # The last LSTM layer collapses the sequence into a single vector.
        model.add(LSTM(units=100,
                       return_sequences=False))
        model.add(Dropout(0.2))

        # Linear read-out: one unbounded regression output per feature.
        model.add(Dense(units=self.features))
        model.add(Activation('linear'))

        model.compile(loss='mean_squared_error', optimizer='rmsprop')

        return model

    def _make_windows(self, data, start, end):
        """Return every full window of ``sequence_length`` rows in data[start:end].

        Args:
            data: sequence of rows (anything ``np.asarray`` accepts).
            start: index of the first row that may be used (inclusive).
            end: index one past the last row that may be used (exclusive).

        Returns:
            Array whose first axis indexes the windows and whose second axis
            has length ``sequence_length``.

        Raises:
            ValueError: if the range falls outside ``data`` or is too short
                to contain a single window.
        """
        rows = np.asarray(data)
        width = self.sequence_length

        if not 0 <= start <= end <= len(rows):
            raise ValueError(
                'range [%d, %d) is outside the data (%d rows)'
                % (start, end, len(rows)))

        # +1 so the window that ends exactly on row ``end - 1`` is included.
        count = end - start - width + 1
        if count <= 0:
            raise ValueError(
                'range [%d, %d) is shorter than one window of %d rows'
                % (start, end, width))

        return np.array([rows[i:i + width]
                         for i in range(start, start + count)])

    def prepare_data(self, data, train_start, train_end, test_start, test_end):
        """Slice ``data`` into train and test windows and store inputs/targets.

        Both ranges are half-open: ``[train_start, train_end)`` and
        ``[test_start, test_end)``.  Results are stored on the instance as
        ``x_train``/``y_train`` and ``x_test``/``y_test``.

        Raises:
            ValueError: if either range is outside ``data`` or too short to
                hold one window.
        """
        print('creating train n-grams...')
        train_grams = self._make_windows(data, train_start, train_end)
        print('train data shape : ', train_grams.shape)

        # All rows but the last are the input; the last row is the target.
        self.x_train = train_grams[:, :-1]
        self.y_train = train_grams[:, -1]

        print('creating test n-grams...')
        test_grams = self._make_windows(data, test_start, test_end)
        print('test data shape : ', test_grams.shape)

        self.x_test = test_grams[:, :-1]
        self.y_test = test_grams[:, -1]

    def run(self, data, train_start=0, train_end=10000,
            test_start=10000, test_end=15000):
        """Prepare the windows and fit the network on the training split.

        Args:
            data: sequence of feature rows.
            train_start, train_end: half-open row range used for training.
            test_start, test_end: half-open row range held out for testing
                (prepared and stored, but not evaluated yet).
        """
        self.prepare_data(data, train_start, train_end, test_start, test_end)
        # batch_size and epochs are the module-level hyper-parameters.
        self.model.fit(self.x_train, self.y_train,
                       batch_size=batch_size, epochs=epochs)

        # TODO: implement train vs. test
        # TODO: implement test statistics
        
        
        

In [None]:
# Build the forecaster, then window the standardized packets and fit on them.
model = ForecastModel()
model.run(data=packets)