# Lab 7 - Recurrent Neural Networks
##### By. Liam Lowsley-Williams & Fernando Vazquez

## Business Understanding

### Introduction

### Motivations

### Objectives

### Evaluation

## Init

In [1]:
from distutils.version import LooseVersion
import warnings
import tensorflow as tf

# Check TensorFlow Version
assert LooseVersion(tf.__version__) >= LooseVersion('1.0'), 'Please use TensorFlow version 1.0 or newer.  You are using {}'.format(tf.__version__)
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please ensure you have installed TensorFlow correctly')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

TensorFlow Version: 2.0.0
Default GPU Device: /device:GPU:0


In [36]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Input, Model
from keras.layers import Dense
from keras.layers import SimpleRNN, LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping

from sklearn import metrics as mt
from matplotlib import pyplot as plt
%matplotlib inline

import pickle
import numpy as np
import pandas as pd

# graph visualization
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

keras.__version__

'2.3.1'

## Data Preperation

In [3]:
data = pd.read_csv("reviews.csv")
data

Unnamed: 0,ReviewTitle,ReviewBody,ReviewStar,Product
0,Honest review of an edm music lover\n,No doubt it has a great bass and to a great ex...,3,boAt Rockerz 255
1,Unreliable earphones with high cost\n,"This earphones are unreliable, i bought it be...",1,boAt Rockerz 255
2,Really good and durable.\n,"i bought itfor 999,I purchased it second time,...",4,boAt Rockerz 255
3,stopped working in just 14 days\n,Its sound quality is adorable. overall it was ...,1,boAt Rockerz 255
4,Just Awesome Wireless Headphone under 1000...😉\n,Its Awesome... Good sound quality & 8-9 hrs ba...,5,boAt Rockerz 255
...,...,...,...,...
14332,Good\n,Good\n,4,JBL T110BT
14333,Amazing Product\n,An amazing product but a bit costly.\n,5,JBL T110BT
14334,Not bad\n,Sound\n,1,JBL T110BT
14335,a good product\n,the sound is good battery life is good but the...,5,JBL T110BT


In [4]:
data_title = data["ReviewTitle"]
data_body = data["ReviewBody"]
y = data["ReviewStar"]

In [5]:
X_data = data["ReviewTitle"].map(str) + " " + data["ReviewBody"]
X_data

0        Honest review of an edm music lover\n No doubt...
1        Unreliable earphones with high cost\n This  ea...
2        Really good and durable.\n i bought itfor 999,...
3        stopped working in just 14 days\n Its sound qu...
4        Just Awesome Wireless Headphone under 1000...😉...
                               ...                        
14332                                        Good\n Good\n
14333    Amazing Product\n An amazing product but a bit...
14334                                    Not bad\n Sound\n
14335    a good product\n the sound is good battery lif...
14336    Average headphones , n overrated name\n M writ...
Length: 14337, dtype: object

5074

In [18]:
%%time
NUM_TOP_WORDS = None
MAX_ART_LEN = X_data.map(lambda x: len(x.split())).max() # maximum and minimum number of words
NUM_CLASSES = 5

tokenizer = Tokenizer(num_words=NUM_TOP_WORDS)
tokenizer.fit_on_texts(X_data)
sequences = tokenizer.texts_to_sequences(X_data)

word_index = tokenizer.word_index
NUM_TOP_WORDS = len(word_index) if NUM_TOP_WORDS==None else NUM_TOP_WORDS
top_words = min((len(word_index),NUM_TOP_WORDS))
print('Found %s unique tokens. Distilled to %d top words.' % (len(word_index),top_words))

X = pad_sequences(sequences, maxlen=MAX_ART_LEN)
# X = pad_sequences(sequences)

y_ohe = keras.utils.to_categorical(y)
y_ohe = y_ohe[:,1:]

print('Shape of data tensor:', X.shape)
print('Shape of label tensor:', y_ohe.shape)
print(np.max(X))

Found 12156 unique tokens. Distilled to 12156 top words.
Shape of data tensor: (14337, 868)
Shape of label tensor: (14337, 5)
12156
Wall time: 630 ms


In [19]:
X

array([[   0,    0,    0, ...,  149,   37,  133],
       [   0,    0,    0, ..., 1634,    9,  301],
       [   0,    0,    0, ...,  150,   31,   45],
       ...,
       [   0,    0,    0, ...,   11,   70,    6],
       [   0,    0,    0, ...,   18,    3,    9],
       [   0,    0,    0, ...,   82,  298,  208]])

In [20]:
from sklearn.model_selection import train_test_split
# Split it into train / test subsets
X_train, X_test, y_train, y_test = train_test_split(X, y_ohe, test_size=0.2,
                                                              stratify=y, 
                                                              random_state=42)

print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)
print(np.sum(y_train,axis=0))
print(np.sum(y_test,axis=0))

(11469, 868) (11469, 5)
(2868, 868) (2868, 5)
[1994.  751. 1203. 2551. 4970.]
[ 499.  188.  300.  638. 1243.]


## Evaluation

### Model 1

In [45]:
max_review_length = MAX_ART_LEN
EMBED_SIZE = 150
embedding_layer = Embedding(X_train.shape[1],
                            EMBED_SIZE,
                            input_length=MAX_ART_LEN)

rnn = Sequential()
rnn.add(embedding_layer)
rnn.add(SimpleRNN(100,dropout=0.2, recurrent_dropout=0.2))
rnn.add(Dense(NUM_CLASSES, activation='sigmoid'))
rnn.compile(loss='categorical_crossentropy', 
              optimizer='rmsprop', 
              metrics=['accuracy'])
print(rnn.summary())

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 868, 150)          130200    
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 100)               25100     
_________________________________________________________________
dense_7 (Dense)              (None, 5)                 505       
Total params: 155,805
Trainable params: 155,805
Non-trainable params: 0
_________________________________________________________________
None


In [46]:
rnn.fit(X_train, y_train, 
        epochs=15, 
        batch_size=64, 
        validation_data=(X_test, y_test), 
        callbacks=[EarlyStopping(monitor='val_loss', patience=3)]
       )

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 11469 samples, validate on 2868 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15


<keras.callbacks.callbacks.History at 0x196383b7648>

In [47]:
rnn.save_weights('model_1_weights.h5')

### Model 2

In [33]:
%%time
# NUM_TOP_WORDS = None
# tokenizer = Tokenizer(num_words=NUM_TOP_WORDS)
# word_index = tokenizer.word_index

EMBED_SIZE = 200
# the embed size should match the file you load glove from
embeddings_index = {}
f = open('./glove/glove.6B.200d.txt', encoding="utf8")
# save key/array pairs of the embeddings
#  the key of the dictionary is the word, the array is the embedding
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

# now fill in the matrix, using the ordering from the
#  keras word tokenizer from before
embedding_matrix = np.zeros((len(word_index) + 1, EMBED_SIZE))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print(embedding_matrix.shape)


Found 400000 word vectors.
(12157, 200)
Wall time: 20.8 s


In [34]:
embedding_layer = Embedding(len(word_index) + 1,
                            EMBED_SIZE,
                            weights=[embedding_matrix],
                            input_length=MAX_ART_LEN,
                            trainable=False)

In [43]:
rnn2 = Sequential()
rnn2.add(embedding_layer)
rnn2.add(LSTM(300,dropout=0.2, recurrent_dropout=0.2))
rnn2.add(Dense(NUM_CLASSES, activation='sigmoid'))
rnn2.compile(loss='categorical_crossentropy', 
              optimizer='rmsprop', 
              metrics=['accuracy'])
print(rnn.summary())

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 868, 200)          2431400   
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               120400    
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 505       
Total params: 2,552,305
Trainable params: 2,552,305
Non-trainable params: 0
_________________________________________________________________
None


In [44]:
rnn2.fit(X_train, 
        y_train, 
        epochs=15, 
        batch_size=64,
        validation_data=(X_test, y_test),
        callbacks=[EarlyStopping(monitor='val_loss', patience=2)]
       )

Train on 11469 samples, validate on 2868 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15


<keras.callbacks.callbacks.History at 0x19632e187c8>

In [None]:
rnn2.save_weights('model_2_weights.h5')