In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from scipy.spatial.distance import cdist

In [2]:
from keras.models import Sequential
from keras.layers import Dense,Embedding,GRU
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [4]:
# NOTE(review): `imdb` is presumably a local helper module for the IMDB review
# data set (it exposes data_dir, maybe_download_and_extract and load_data) — confirm its origin.
import imdb
# Directory the helper is told to use for the data set files.
imdb.data_dir = './IMDB' 

In [5]:
# Fetch and unpack the data set; the recorded cell output shows a download followed by extraction.
imdb.maybe_download_and_extract()

- Download progress: 100.0%
Download finished. Extracting files.
Done.


In [5]:
# Load the review texts (X) and their labels (y) for the training and test splits.
# Judging by the samples displayed in the following cells, each X entry is a raw review
# string and each y entry is a float label such as 1.0.
X_train,y_train = imdb.load_data(train = True)
X_test,y_test = imdb.load_data(train=False)

In [7]:
X_train[1]

'Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school, work, or vote for the matter. Most people think of the homeless as just a lost cause while worrying about things such as racism, the war on Iraq, pressuring kids to succeed, technology, the elections, inflation, or worrying if they\'ll be next to end up on the streets.<br /><br />But what if you were given a bet to live on the streets for a month without the luxuries you once had from a home, the entertainment sets, a bathroom, pictures on the wall, a computer, and everything you once treasure to see what it\'s like to be homeless? That is Goddard Bolt\'s lesson.<br /><br />Mel Brooks (who directs) who stars as Bolt plays a rich man who has everything in the world until deciding to make a bet with a sissy rival (Jeffery Tambor) to see if he can live in the streets for thirty days withou

In [8]:
y_train[1]

1.0

In [10]:
# Tokenizer
# Neural networks cannot process raw text strings. A tokenizer maps each word to a unique integer token taken
# from a vocabulary; here the vocabulary is limited to the 10,000 most frequent words.

number_of_words = 10000
tokenizer = Tokenizer(num_words=number_of_words)

# The tokenizer also cleans the text before mapping it to tokens: it converts the text to lowercase,
# splits it into words on whitespace, removes punctuation, etc.

In [12]:
# Concatenate the training and test reviews; the tokenizer is fitted on this combined corpus.
# NOTE(review): this lets test-set text influence the vocabulary — consider fitting on X_train only; confirm intent.
data_text = X_train + X_test

In [13]:
tokenizer.fit_on_texts(data_text)

In [15]:
tokenizer.word_index
# Tokens are ordered by number of occurrences: each word is mapped to a unique integer index, and the most
# frequent word gets index 1. The mapping covers the whole corpus the tokenizer was fitted on.

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'be': 26,
 'one': 27,
 'he': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'so': 34,
 'who': 35,
 'from': 36,
 'like': 37,
 'or': 38,
 'just': 39,
 'her': 40,
 'out': 41,
 'about': 42,
 'if': 43,
 "it's": 44,
 'has': 45,
 'there': 46,
 'some': 47,
 'what': 48,
 'good': 49,
 'when': 50,
 'more': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'time': 55,
 'my': 56,
 'even': 57,
 'would': 58,
 'she': 59,
 'which': 60,
 'only': 61,
 'really': 62,
 'see': 63,
 'story': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'me': 68,
 'well': 69,
 'were': 70,
 'than': 71,
 'much': 72,
 'we': 73,
 'bad': 74,
 'been': 75,
 'get': 76,
 'do': 77,
 'great': 78,
 'other': 79,
 'will': 80,
 'also': 81,
 'into': 82,
 'p

In [20]:
# Convert every review into a list of integer tokens, one for each word of the review
# that the tokenizer maps to a token.
X_train_tokens, X_test_tokens = (
    tokenizer.texts_to_sequences(reviews) for reviews in (X_train, X_test)
)

In [19]:
np.array(X_train_tokens[1])
# array of tokens for the 2nd review.Each word in the review has been mapped to the token (as seen in word_index).

array([  38,   14,  744, 3506,   45,   75,   32, 1771,   15,  153,   18,
        110,    3, 1344,    5,  343,  143,   20,    1,  920,   12,   70,
        281, 1228,  395,   35,  115,  267,   36,  166,    5,  368,  158,
         38, 2058,   15,    1,  504,   88,   83,  101,    4,    1, 4339,
         14,   39,    3,  432, 1148,  136, 8697,   42,  177,  138,   14,
       2791,    1,  295,   20, 5276,  351,    5, 3029, 2310,    1,   38,
       8697,   43, 3611,   26,  365,    5,  127,   53,   20,    1, 2032,
          7,    7,   18,   48,   43,   22,   70,  358,    3, 2343,    5,
        420,   20,    1, 2032,   15,    3, 3346,  208,    1,   22,  281,
         66,   36,    3,  344,    1,  728,  730,    3, 3864, 1320,   20,
          1, 1543,    3, 1293,    2,  267,   22,  281, 2734,    5,   63,
         48,   44,   37,    5,   26, 4339,   12,    6, 2079,    7,    7,
       3425, 2891,   35, 4446,   35,  405,   14,  297,    3,  986,  128,
         35,   45,  267,    8,    1,  181,  366, 69

In [25]:
# Padding and truncating
# Recurrent networks can handle sequences of arbitrary length, but all sequences within one batch must have
# the same length (different batches may use different lengths).
# There are two ways to achieve this: pad every review with zeros up to the length of the longest review, or
# truncate the longer reviews to a shorter common length. The second option is preferable because it keeps
# a few very long outliers from dictating the sequence length.

# Number of tokens in each review, over the training and test sets together.
tokens = np.array([len(sequence) for sequence in X_train_tokens + X_test_tokens])
print(tokens.shape)

(50000,)


In [26]:
tokens

array([127, 401, 134, ..., 253, 119, 352])

In [27]:
np.mean(tokens)
# The mean length of a review is about 221 tokens.

221.27716

In [29]:
np.max(tokens)
# The longest review has 2209 tokens. Padding all the remaining reviews with zeros up to that length would waste a lot of memory.
# Thus, we truncate the longer reviews and pad the shorter ones, choosing a length that covers most of the reviews.

2209

In [34]:
# Cap the review length at the mean plus two standard deviations of the token counts,
# truncated to an integer.
max_tokens = int(np.mean(tokens) + 2 * np.std(tokens))

In [35]:
# Padding/truncating choice: 'pre' or 'post'
# Sequences can be padded with zeros either before or after the actual tokens. Post-padding has a drawback:
# the RNN cell builds up its memory while reading the real tokens, and if a long run of zeros follows,
# the cell may lose some of the useful information it gathered about the sequence.

# Both splits are padded and truncated at the front, to the same length.
X_train_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=max_tokens, padding='pre', truncating='pre')
    for sequences in (X_train_tokens, X_test_tokens)
)

In [38]:
X_train_pad[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
         38,   14,  744, 3506,   45,   75,   32, 17

In [39]:
# Build the inverse mapping, from integer token back to word.

token_dict = tokenizer.word_index
inverse_dict = {index: word for word, index in token_dict.items()}

In [41]:
def from_tokens_to_text(tokens, index_to_word=None):
    """Convert a sequence of integer tokens back into a space-separated string.

    Token 0 is skipped: it is the value used for padding and has no entry in
    the tokenizer's word index (whose indices start at 1), so padded rows can
    be decoded without raising KeyError.

    Args:
        tokens: iterable of integer tokens (e.g. a list or a 1-D array).
        index_to_word: optional mapping from token to word. Defaults to the
            notebook-level ``inverse_dict``.

    Returns:
        The decoded words joined by single spaces ('' if no words remain).

    Raises:
        KeyError: if a non-zero token is missing from the mapping.
    """
    if index_to_word is None:
        # Fall back to the mapping built from tokenizer.word_index.
        index_to_word = inverse_dict
    words = [index_to_word[token] for token in tokens if token != 0]
    return ' '.join(words)

In [42]:
from_tokens_to_text(X_train_tokens[0])

"high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that satire is much closer to reality than is teachers the to survive the insightful students who can see right through their pathetic the of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately at high a classic line inspector i'm here to sack one of your teachers student welcome to high i expect that many adults of my age think that high is far fetched what a pity that it isn't"

In [43]:
X_train[0]

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [44]:
# RNN Model

In [None]:
# Embedding Layer:
# The embedding layer learns a dense vector of a predefined size for every token; without it, each word would
# need a vector as large as the vocabulary (10,000). The learned values lie roughly between -1 and 1.
# The size is set to 8 here to keep the model small; in general this value is around 100-300.
# These vectors are similar to those in word2vec: they learn similar values for words that are semantically similar.
# The embeddings are learned simultaneously with the rest of the model during training.

In [45]:
# Embedding -> three stacked GRU layers -> one sigmoid unit for binary sentiment.
model = Sequential([
    Embedding(input_dim=number_of_words,
              output_dim=8,
              input_length=max_tokens,
              name='embedding_layer'),
    # The first two GRU layers return the whole sequence so that the next GRU
    # receives one input per time step.
    GRU(units=16, return_sequences=True),
    GRU(units=8, return_sequences=True),
    # The last GRU returns only its final output.
    GRU(units=4),
    Dense(1, activation='sigmoid'),
])
optimizer = Adam(lr=0.001)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, 544, 8)            80000     
_________________________________________________________________
gru_1 (GRU)                  (None, 544, 16)           1200      
_________________________________________________________________
gru_2 (GRU)                  (None, 544, 8)            600       
_________________________________________________________________
gru_3 (GRU)                  (None, 4)                 156       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 5         
Total params: 81,961
Trainable params: 81,961
Non-trainable params: 0
_________________________________________________________________


In [46]:
# Train for 3 epochs with mini-batches of 64, holding out 5% of the training data for validation.
model.fit(X_train_pad,y_train,validation_split=0.05,epochs=3,batch_size=64)

Train on 23750 samples, validate on 1250 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1843063d208>

In [48]:
# Evaluate the trained model on the padded test set and keep the returned values in `result`.
result = model.evaluate(X_test_pad,y_test)



In [57]:
# result[1] is reported as the accuracy; presumably result[0] is the loss, given the
# loss/metrics passed to compile() — confirm.
print("Accuracy: {0:.2%}".format(result[1]))

Accuracy: 86.84%


In [58]:
from keras.models import load_model
# Save the trained model to an HDF5 file so it can be reloaded later without retraining.
model.save('senti_model.h5')

In [59]:
# Drop the in-memory model; it is reloaded from 'senti_model.h5' in the next cell.
del model

In [61]:
# Restore the model from the saved HDF5 file.
model = load_model('senti_model.h5')

In [62]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, 544, 8)            80000     
_________________________________________________________________
gru_1 (GRU)                  (None, 544, 16)           1200      
_________________________________________________________________
gru_2 (GRU)                  (None, 544, 8)            600       
_________________________________________________________________
gru_3 (GRU)                  (None, 4)                 156       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 5         
Total params: 81,961
Trainable params: 81,961
Non-trainable params: 0
_________________________________________________________________


In [63]:
# Prediction on new data

In [68]:
# Three hand-written reviews used to probe the trained model.
rev_list = [
    "The movie was shitty.",
    "It was preposterously fascinating to see the characters dressed like veggies in that awful movie.",
    "I didn't like the movie but actors were good.",
]
rev_1, rev_2, rev_3 = rev_list

# Apply the same preprocessing as for the training data: tokenize, then pad/truncate
# at the front to max_tokens.
tokenized_text = tokenizer.texts_to_sequences(rev_list)
padded_tokens = pad_sequences(
    tokenized_text,
    maxlen=max_tokens,
    padding='pre',
    truncating='pre',
)

In [69]:
model.predict(padded_tokens)

array([[0.81739676],
       [0.41902947],
       [0.7428668 ]], dtype=float32)

In [None]:
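# The model misjudges the first review as positive although it is clearly negative. This could be because the
# model has learned nothing about the review's key negative word or any word close to it — possibly the word
# lies outside the 10,000-word vocabulary, in which case it never reaches the model (to be confirmed).
# The other 2 predictions are reasonable.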