# Sentiment Analysis in Keras
### This project uses a Gated Recurrent Unit (GRU) network implemented in Keras to perform sentiment analysis on reviews from the IMDB website.


In [0]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.spatial.distance import cdist
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer

## Data

In [None]:
# Load the labelled training reviews and the unlabelled test reviews.
# Both files are tab-separated with the column names on the first row;
# quoting=3 (csv.QUOTE_NONE) keeps double quotes as literal characters.
train = pd.read_csv(
    "./data/labeledTrainData.tsv",
    delimiter="\t",
    header=0,
    quoting=3,
)
test = pd.read_csv(
    "./data/testData.tsv",
    delimiter="\t",
    header=0,
    quoting=3,
)


In [3]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [4]:
# Preview the first five rows of the test frame.
test.head(n=5)

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [0]:
# Hold out 33% of the labelled reviews for evaluation. Columns are selected
# by name rather than by position so the split does not silently pick the
# wrong column if the file layout changes; random_state pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    train["review"],
    train["sentiment"],
    test_size=0.33,
    random_state=42,
)

# Every labelled review (training part first, then the held-out part); the
# tokenizer vocabulary is built from this combined series.
text = pd.concat([x_train, x_test])

In [6]:
# Peek at the first five entries of the combined review series.
text.head(n=5)

12131    "When I saw previews of this movie I thought t...
12827    "One of the best if not the best rock'n'roll m...
2912     "I have made it my personal mission to go afte...
13762    "Lock Up Your Daughters is one of the best hig...
6369     "This is one movie that will take time to get ...
Name: review, dtype: object

## Tokenise

In [0]:
# Vocabulary size handed to the tokenizer; the same value is reused as the
# embedding layer's input dimension.
num_words = 10000

# Build the word index from all labelled reviews.
# NOTE(review): `text` contains the held-out reviews as well as the training
# ones, so the vocabulary is not derived from training data alone - confirm
# this is intended.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(text)

# Turn each review into a list of integer word indices.
x_train_tokens, x_test_tokens = map(tokenizer.texts_to_sequences,
                                    (x_train, x_test))

## Padding

In [8]:
# Token count of every tokenised review (training and held-out together).
num_tokens = np.array(list(map(len, x_train_tokens + x_test_tokens)))
print("Mean length is ", np.mean(num_tokens), sep="")

Mean length is 224.05292


In [9]:
# Sequence length used for padding/truncation: the mean token count plus two
# standard deviations. The author's rationale is that this covers roughly 95%
# of the reviews (a rule of thumb for normally distributed lengths).
max_tokens = int(num_tokens.mean() + 2 * num_tokens.std())
print(max_tokens)

552


In [0]:
# Bring every sequence to exactly max_tokens entries. With 'pre' for both
# options, short reviews are padded at the front and long reviews lose their
# earliest tokens.
x_train_pad, x_test_pad = (
    pad_sequences(sequences, maxlen=max_tokens, padding='pre', truncating='pre')
    for sequences in (x_train_tokens, x_test_tokens)
)

## Tokenizer Inverse Mapping

In [0]:

# word_index maps word -> integer index; flip it so that indices can be
# decoded back into words.
idx = tokenizer.word_index
inverse_map = {index: word for word, index in idx.items()}

In [0]:
def tokens_to_string(tokens):
    """Decode a sequence of integer tokens back into a space-separated string.

    Tokens equal to 0 are skipped; every other token is looked up in the
    module-level ``inverse_map`` (index -> word).

    Args:
        tokens: iterable of integer token ids.

    Returns:
        The decoded words joined by single spaces ("" if nothing remains).

    Raises:
        KeyError: if a non-zero token has no entry in ``inverse_map``.
    """
    return " ".join(inverse_map[token_id] for token_id in tokens if token_id != 0)

## GRU

In [13]:
# Width of each learned word vector.
embedding_size = 100

# Embedding followed by three stacked GRU layers that narrow from 16 to 4
# units. The first two GRUs return the full sequence so the next GRU receives
# one vector per time step; the last one returns only its final output, which
# the single sigmoid unit turns into a score between 0 and 1.
model = Sequential([
    Embedding(input_dim=num_words,
              output_dim=embedding_size,
              input_length=max_tokens,
              name='layer_embedding'),
    GRU(units=16, return_sequences=True),
    GRU(units=8, return_sequences=True),
    GRU(units=4),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy matches the 0/1 sentiment labels; accuracy is tracked
# as the reported metric.
model.compile(optimizer=Adam(lr=1e-3),
              loss='binary_crossentropy',
              metrics=['accuracy'])

W0718 22:54:57.585732 140679738730368 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0718 22:54:57.616472 140679738730368 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0718 22:54:58.625931 140679738730368 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.o

In [14]:
%%time
# Train for three epochs, holding back 5% of the training data for validation.
# The History object returned by fit() is kept so the per-epoch metrics stay
# available (history.history) instead of being discarded after display.
history = model.fit(
    x_train_pad,
    y_train,
    validation_split=0.05,
    epochs=3,
    batch_size=64,
)

Train on 15912 samples, validate on 838 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
CPU times: user 18min 51s, sys: 1min 10s, total: 20min 1s
Wall time: 10min 43s


<tensorflow.python.keras.callbacks.History at 0x7ff2487dbba8>

In [26]:
%%time
# evaluate() returns the loss followed by the compiled metrics, so index 1
# is the accuracy on the held-out reviews.
result = model.evaluate(x_test_pad, y_test)
print("Accuracy: ", result[1], sep="")

Accuracy: 0.86921215
CPU times: user 1min 7s, sys: 1.66 s, total: 1min 8s
Wall time: 42.3 s


## Predicting

In [0]:
# Encode the unlabelled reviews exactly like the training data: same fitted
# tokenizer, same sequence length, same 'pre' padding and truncation. The
# review column is selected by name so a change in column order cannot feed
# the wrong column (e.g. the ids) to the tokenizer.
tokens = tokenizer.texts_to_sequences(test["review"])
tokens_pad = pad_sequences(tokens, maxlen=max_tokens,
                           padding='pre', truncating='pre')


In [28]:
# Sigmoid outputs for the unlabelled reviews, one row per review.
model.predict(x=tokens_pad)

array([[0.9756211 ],
       [0.03193003],
       [0.42068407],
       ...,
       [0.03210175],
       [0.9671382 ],
       [0.9544081 ]], dtype=float32)