In [6]:
# Credits: https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
# LSTM for sequence classification in the IMDB dataset
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# Seed NumPy's global random number generator so NumPy-driven randomness
# is repeatable from run to run.
# NOTE(review): this seeds NumPy only; the Keras backend presumably keeps its
# own RNG, so results may still vary slightly between runs - confirm if exact
# reproducibility is required.
SEED = 7
numpy.random.seed(SEED)


In [7]:
#Refer: https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification

# Load the IMDB dataset, keeping only the `top_words` most frequent words.
# Every rarer word is replaced by Keras' out-of-vocabulary index (oov_char,
# which defaults to 2) - it is NOT replaced by 0; index 0 is left free for padding.
# `num_words` is the current name of this argument; the legacy Keras 1 spelling
# `nb_words` only triggers a rename warning on older releases and is rejected
# by newer ones.
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)




  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [8]:
# Show the integer-encoded form of the training review stored at index 1:
# each number is the index of one word of the review.
sample_review = X_train[1]
print(sample_review)

[1, 194, 1153, 194, 2, 78, 228, 5, 6, 1463, 4369, 2, 134, 26, 4, 715, 8, 118, 1634, 14, 394, 20, 13, 119, 954, 189, 102, 5, 207, 110, 3103, 21, 14, 69, 188, 8, 30, 23, 7, 4, 249, 126, 93, 4, 114, 9, 2300, 1523, 5, 647, 4, 116, 9, 35, 2, 4, 229, 9, 340, 1322, 4, 118, 9, 4, 130, 4901, 19, 4, 1002, 5, 89, 29, 952, 46, 37, 4, 455, 9, 45, 43, 38, 1543, 1905, 398, 4, 1649, 26, 2, 5, 163, 11, 3215, 2, 4, 1153, 9, 194, 775, 7, 2, 2, 349, 2637, 148, 605, 2, 2, 15, 123, 125, 68, 2, 2, 15, 349, 165, 4362, 98, 5, 4, 228, 9, 43, 2, 1157, 15, 299, 120, 5, 120, 174, 11, 220, 175, 136, 50, 9, 4373, 228, 2, 5, 2, 656, 245, 2350, 5, 4, 2, 131, 152, 491, 18, 2, 32, 2, 1212, 14, 9, 6, 371, 78, 22, 625, 64, 1382, 9, 8, 168, 145, 23, 4, 1690, 15, 16, 4, 1355, 5, 28, 6, 52, 154, 462, 33, 89, 78, 285, 16, 145, 95]


#### How are the words encoded?

    Suppose we have a data corpus of reviews:
    x1: review1, x2: review2, x3:review3, ......
    review1 = ([w1,w2,w3,w4,w5,w6] , y1), review2 = ([w1,w2,w3,w4,w5,w6,w7,w8,w9,w10], y2), .........
    
    1. Build the set of all the words in the complete review corpus (the vocabulary, v).
    2. Compute the frequency of every word (word : frequency).
    3. Sort the words by frequency in descending order.
    4. Give every word its rank as an index number (whenever we see a word, we replace it with its index).
    5. E.g. let is:50k , a:49k , for:20k , this:5k , food:50 , dogs:10 = [1,2,3,4,5,6]
       Then the sentence "This food is for dogs" can be represented as [4,5,1,3,6].

In [9]:
# Each encoded review is a plain Python list of word indices.
print(type(X_train[1]))
# Print how many words the reviews at index 1 and index 2 contain
# (189 and 141 respectively); the lengths differ from review to review.
for review_idx in (1, 2):
    print(len(X_train[review_idx]))

<class 'list'>
189
141


In [10]:
# Bring every review to exactly `max_review_length` word indices:
# longer reviews are truncated, shorter ones are filled up with 0s.
max_review_length = 600
X_train, X_test = (
    sequence.pad_sequences(reviews, maxlen=max_review_length)
    for reviews in (X_train, X_test)
)

# The training data is now one 2-D array: (25000 reviews, 600 word indices per review).
print(X_train.shape)
# The review at index 1 has 189 words, so the other 411 positions hold 0.
# The zeros come first (pre-padding), followed by the review's own words.
print(X_train[1])

(25000, 600)
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    

### Why should we pad?

    Every review is of different length.
    For e.g:  review1: The movie was nice. 
              review2: I liked the movie, it was amazing, 
              review3: It was the worst movie I have seen in my life
    
    We have chosen a fixed review length of 600 words.
    So every review shorter than 600 words is filled with 0s (placed before its own words) to make all the reviews the same length.
    If we don't pad, then we need to send all the words of one review, one at a time, and only then move on to the next review, and so on. This makes the process very slow.
    Once all reviews have the same length, we can do batch updates: several reviews are processed together (the number is given by the batch size), which speeds up the process.

    e.g: review1 = [0,0,0,0,0,w1, w2, w3, w4, w5,.........,w40]   There are 40 words in total, and after padding the length becomes 45
         review2 = [0,0,0,w1, w2, w3, w4, w5,.........,w42]   There are 42 words in total, and after padding the length becomes 45
         
         Instead of sending all the words of a review one after another (which would make the process extremely slow),
         we can specify a batch and send the words in batches, like {x11,x21,x31,x41,x51} , {x12,x22,x32,x42,x52} , ... and so on,
         where xij is the j-th word of review i. Here batch size = 5.
         Sending the words in batches speeds up the process.


In [14]:
# create the model
embedding_vector_length = 32
model = Sequential()

# The Embedding layer converts every integer word index into a trainable dense
# vector, e.g. index 4 -> one 32-dimensional vector. Each review is a sequence of
# max_review_length (600) indices, so the layer outputs a (600, 32) matrix per review.
# Number of embedding parameters = (number of rows in the lookup table * vector size)
#                                = (top_words + 1) * 32 = 5001 * 32 = 160,032
model.add(Embedding(top_words+1, embedding_vector_length, input_length=max_review_length))

# One LSTM layer with 100 units. At every time step the 32-dimensional word vector
# is fed to all 100 units; the layer returns the 100-dimensional output of the
# last time step.
# With m = 32 (input size) and n = 100 (output size), including the bias terms:
#   Number of parameters = 4 * (n*m + n^2 + n) = 4 * (3200 + 10000 + 100) = 53,200
#Refer: https://datascience.stackexchange.com/questions/10615/number-of-parameters-in-an-lstm-model
model.add(LSTM(100))

# A single sigmoid unit produces the probability of a positive review.
# It has 100 weights (one per LSTM output) plus 1 bias = 101 parameters.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# would add a stray "None" line after the table.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 600, 32)           160032    
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 213,333
Trainable params: 213,333
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
# Train on the padded training reviews: 10 passes over the data,
# updating the weights after every mini-batch of 64 reviews.
model.fit(X_train, y_train, batch_size=64, epochs=10)

# Final evaluation of the model on the held-out test reviews.
# The model was compiled with metrics=['accuracy'], so `scores` holds the loss
# first and the accuracy second.
scores = model.evaluate(X_test, y_test, verbose=0)
test_accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % test_accuracy_pct)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 86.52%
