# 1. Data Processing

In [1]:
#a
import sys
import os
import json
import pandas
import numpy
import optparse

from keras.callbacks import TensorBoard
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout, SimpleRNN
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from collections import OrderedDict

Using TensorFlow backend.


In [3]:
#b
dataframe = pandas.read_csv("Downloads/dev-access.csv", engine='python', quotechar='|', header=None)

In [4]:
#c
dataset = dataframe.values

In [5]:
#d
dataset.shape

(26773, 2)

In [6]:
#e
X = dataset[:,0]

In [7]:
#f
Y = dataset[:,1]

In [8]:
#g
for index, item in enumerate(X):
    # Quick hack to space out json elements
    reqJson = json.loads(item, object_pairs_hook=OrderedDict)
    del reqJson['timestamp']
    del reqJson['headers']
    del reqJson['source']
    del reqJson['route']
    del reqJson['responsePayload']
    X[index] = json.dumps(reqJson, separators=(',', ':'))

In [9]:
#h
tokenizer = Tokenizer(filters='\t\n', char_level=True)
tokenizer.fit_on_texts(X)

# we will need this later
num_words = len(tokenizer.word_index)+1
X = tokenizer.texts_to_sequences(X)

In [10]:
#i
max_log_length = 1024
X_processed = sequence.pad_sequences(X, maxlen=max_log_length)

In [12]:
#j
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_processed, Y, test_size=0.25, random_state=101)

# 2. Model 1 - RNN

In [14]:
#a-f
m1 = Sequential()
m1.add(Embedding(input_dim = num_words, output_dim = 32, input_length = max_log_length))

m1.add(SimpleRNN(units = 32,activation = 'relu'))

m1.add(Dense(units = 1, activation='relu'))

m1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

m1.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1024, 32)          2016      
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 32)                2080      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 4,129
Trainable params: 4,129
Non-trainable params: 0
_________________________________________________________________


In [15]:
#g
m1.fit(X_train, y_train, validation_split = 0.25, epochs=3, batch_size = 128)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 15059 samples, validate on 5020 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x157e2acc0>

In [19]:
#h
eva1 = m1.evaluate(X_test, y_test, batch_size=128)
print('Test loss for Model 1:', eva1[0])
print('Test accuracy for Model 1:', eva1[1])

Test loss for Model 1: 0.129257250464796
Test accuracy for Model 1: 0.6903197169303894


# 3. Model 2 - LSTM + Dropout Layers:

In [20]:
#a-c
m2 = Sequential()
m2.add(Embedding(input_dim = num_words, output_dim = 32, input_length = max_log_length))

m2.add(LSTM(units = 64, recurrent_dropout = 0.5))
m2.add(Dropout(rate=0.5))
m2.add(Dense(units = 1, activation='relu'))

m2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

m2.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1024, 32)          2016      
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 26,913
Trainable params: 26,913
Non-trainable params: 0
_________________________________________________________________


In [21]:
#d
m2.fit(X_train, y_train, validation_split = 0.25, epochs=3, batch_size = 128)

Train on 15059 samples, validate on 5020 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x15821aa90>

In [22]:
#e
eva2 = m2.evaluate(X_test, y_test, batch_size=128)
print('Test loss for Model 2:', eva2[0])
print('Test accuracy for Model 2:', eva2[1])

Test loss for Model 2: 1.081118873011934
Test accuracy for Model 2: 0.5123991370201111


# 4. Recurrent Neural Net Model 3: Build Your Own

In [23]:
#a-c
m3 = Sequential() 

m3.add(Embedding(input_dim = num_words, output_dim = 32, input_length = max_log_length))
m3.add(Dropout(rate=0.5))
m3.add(LSTM(units=64, recurrent_dropout=0.5))
m3.add(Dense(units=1, activation='tanh'))
m3.add(Dropout(rate=0.5))
m3.add(Dense(units=1, activation='softmax'))

m3.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
m3.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 1024, 32)          2016      
_________________________________________________________________
dropout_3 (Dropout)          (None, 1024, 32)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
_________________________________________________________________
dropout_4 (Dropout)          (None, 1)                 0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 2         
Total params: 26,915
Trainable params: 26,915
Non-trainable params: 0
__________________________________________________

In [24]:
#d
m3.fit(X_train, y_train, validation_split = 0.25, epochs=3, batch_size = 128)

Train on 15059 samples, validate on 5020 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x15b638dd8>

In [25]:
#e
eva3 = m3.evaluate(X_test, y_test, batch_size=128)
print('Test loss for Model 3:', eva3[0])
print('Test accuracy for Model 3:', eva3[1])

Test loss for Model 3: 7.731687462005752
Test accuracy for Model 3: 0.4929787814617157


# Conceptual Questions: 

5) Explain the difference between the relu activation function and the sigmoid activation function.

Answer: A sigmoid activation function has a output range of 0-1, where any input larger than 1 will be transformed to 1 and any inputs smaller than 0 will be transformed to 0. The values around the midpoint are very sensitive while the values around two endpoints tend to respond very less to the changes in x. This means the gradient at the two endpoints is very small, which raise the problem of gradient vanishing. With gradient vanishing, the network refuses to learn further and get stuck easily. 

On the other hand, relu activation function outputs the input if positive and outputs 0 if negative, and thus is has a range of 0 to infinite. This output help relu activation function to avoid gradient vanishing problem. In addition, unlike sigmoid activation function (which is differentiable the entire function), relu activation function is a piecewise function. This help the relu activation function to be less computationally challenging than sigmoid, but it brings the issue that since all the negative values become 0, it decrease the ability for the network to fi or train from the data properly (especially the negative input values).

6) Describe what one epoch actually is (epoch was a parameter used in the .fit() method).

Answer: One epoch means each sample in the training data has passes through (forward and backward) the entire network once. In other words, number of epochs defines the number of times that the network has work through the entire training data. We often set number of epoch to be large, which allows the network to learn until the error is sufficiently minimized since as number of epochs increase, more number of times the weights are updated in the network as the network performance change from underfitting to optimal, then to overfitting.  

7) Explain how dropout works (you can look at the keras code and/or documentation) for (a) training, and (b) test data sets.

Answer:

Dropout is a regularization tecnique in nerual network where the network will randomly dropping out nodes during training. This technique helps prevent overfitting, improve generalization error, and help the network trains faster as we are dropping some nodes. Dropped nodes will lose its connectivity to the prior and upcoming layer. For instance, in this assignment, we set our dropout rate to be 0.5 in our training phase, which means that a node has a 50% chance of being dropped in any given training iteration. In testing data, to deal with the dropped nodes during training process, we weight each connected by (1-0.5 (dropout_rate)) to balance the dropped nodes in training. 


8) Explain why problems such as this homework assignment are better modeled with RNNs than CNNs. What type of problem will CNNs outperform RNNs on?

Answer:

RNNs has the property of "memorizing" the previous layers, as the output of the layers are fed into the network as an input for upcoming layer. With this property, RNNs is preferrable for time series data and sequential data as the order of these data makes a different in training the network. Therefore, in this assignment, working with sequential data that its order plays a significant role, RNNs is a better choice as compared to CNNs. CNNs are more preferrable for problems where we need to map image data to an output variable, such as assignment 7. As CNNs has the ability to develope an internal representation of an image. Therefore, CNNs are the top choice for image classification or computer vision. 


9) Explain what RNN problem is solved using LSTM and briefly describe how.

Answer:

RNN suffers from a problem of short-term memory. As mentioned above, RNNs has the property of memorizing the information from prior states. However, if a sequence is long, it has difficulties to carry information from earlier states as compared to the laster states. In other words, information from earlier states will have smaller gradient update and slowly not carrying this information as more and more sequences/states are added in to the network. Intuitively, we want our RNN to carry useful information despite whether this information came in early states or later states. LSTM solved this problem by having internal mechanisms called gates to control which data in the sequence is important to keep or throw away. The forget gate decides what is the useful information to keep from prior steps; the input gate decides what information is useful to add form the current state; the output gate decides what the next hidden state should be. With the forget gate, it allows the network to better control the gradients value at each time step, and by using the appropriate parameters update for the forget gate, it allows LSTM to decide what information should or should not be forgotten.  
