<a href="https://colab.research.google.com/github/harnalashok/deeplearning-sequences/blob/main/2_simple_rnn_IMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Last amended: 07th March, 2021
My folder: /home/ashok/Documents/8.rnn
           github/harnalashok/
Ref: Page

Objectives:
        i)   To use SimpleRNN for sentiment analysis of IMDB reviews
        ii)  To understand the structure of the Embedding layer
        iii) For tokenization, see file:
             8.rnn/3.keras_tokenizer_class.py OR file
             8.rnn/0.document_to_id_conversion.py
             and the quick note at the end of this notebook.

"""


In [31]:

# 1.0 Call libraries
%reset -f
import numpy as np

# 1.1 Import module imdb & other keras modules
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# 1.2 Misc
import matplotlib.pyplot as plt
import time


In [32]:
# 1.3 Display the value of every top-level expression in a cell,
#     not only that of the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [33]:
# 2.1 Define some constants
max_vocabulary = 10000        # words; passed as num_words to imdb.load_data()
max_len_review = 500          # words; every review is padded/truncated to this length

In [4]:
# 2.2 About imdb module (prints the package help text)
help(imdb)

Help on package tensorflow.keras.datasets.imdb in tensorflow.keras.datasets:

NAME
    tensorflow.keras.datasets.imdb - IMDB sentiment classification dataset.

PACKAGE CONTENTS


FILE
    /usr/local/lib/python3.7/dist-packages/tensorflow/keras/datasets/imdb/__init__.py




In [34]:
# 2.3 Get imdb reviews. Limit vocabulary to size max_vocabulary.
#      The reviews are downloaded unless already available at ~/.keras/datasets
# ************
#      See the note at the end of this notebook on how to quickly
#      convert text to integers
# ************
(x_train,y_train),(x_test,y_test) = imdb.load_data(num_words=max_vocabulary)

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [35]:
# 2.4 Our downloaded data file is here:

!ls -la /root/.keras/datasets

total 17064
drwxr-xr-x 2 root root     4096 Mar  7 07:11 .
drwxr-xr-x 3 root root     4096 Mar  7 07:11 ..
-rw-r--r-- 1 root root 17464789 Mar  7 07:11 imdb.npz


In [36]:
# 2.5 About data
#     x_* are 1-D arrays whose elements are python lists (one list per
#     review); y_* hold one 0/1 label per review
type(x_train)      # numpy.ndarray
x_train.shape      # (25000,)  Total 25000 reviews
x_test.shape       # (25000,)  Total 25000 reviews
y_train.shape      # (25000,)  Total 25000 pos/neg labels
y_test.shape       # (25000,)  Total 25000 pos/neg labels

numpy.ndarray

(25000,)

(25000,)

(25000,)

(25000,)

In [37]:
# 2.5.1 Each document is a list of integer word-ids
x_train[:2]       # Have a look at two documents
print("\n\n------------\n\n")
y_train[:4]       # array([1, 0, 0, 1])

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1153, 194, 8255, 78, 228,



------------




array([1, 0, 0, 1])

In [38]:
# 2.5.2 Every review has a different number of words
len(x_train[1])     # 189
print("\n\n------------\n\n")
len(x_train[10])    # 450

189



------------




450

In [39]:
# 2.6 Check max and min length of reviews.
#     Both values are computed directly from the data rather than by
#     starting from guessed bounds: the former start value of 200 for
#     minLen would have been reported as the minimum whenever every
#     review is longer than 200 words.
review_lengths = [len(review) for review in x_train]
maxLen = max(review_lengths, default=0)    # 0 if there are no reviews
minLen = min(review_lengths, default=0)    # 0 if there are no reviews


In [40]:
# 2.6.1 Longest and shortest review (number of words)
maxLen         # 2494
minLen         # 11

2494

11

In [41]:
# 2.7 We want every sequence to have exactly max_len_review entries.
#     Longer reviews will be truncated and shorter ones will be
#     padded with zeros.
# help(sequence.pad_sequences)


In [42]:
# 2.7.1 Pad x_train sequences.
#       pad_sequences() takes a list of lists (each inner list being one
#       sequence) or an array of such lists. Zeros are added in front
#       ('pre') of the shorter sequences.
x_train = sequence.pad_sequences(x_train, maxlen=max_len_review, padding='pre')


In [43]:
# 2.7.2 Recheck again: after padding every review has the same length

type(x_train)          # numpy.ndarray
print("\n\n------------\n\n")
x_train.shape          # (25000, 500) Each sequence becomes one row
print("\n\n------------\n\n")
len(x_train[1])     # 500 (was 189 before padding)
print("\n\n------------\n\n")
len(x_train[10])    # 500 (was 450 before padding)



numpy.ndarray



------------




(25000, 500)



------------




500



------------




500

In [44]:
# 3.0 Model now
model = Sequential()
# 3.1 Embedding layer
model.add(Embedding(max_vocabulary,            # Vocabulary size: decides number of input neurons
                    32,                        # Size of each word vector: neurons in hidden layer
                    input_length= max_len_review) # (optional) Length of each input sequence,
                                                  # i.e. how many times the RNN should loop around.
                                                  # If omitted, decided automatically
                                                  # during 'model.fit()' by considering
                                                  # x_train.shape[1]
                    )


In [45]:
# 3.2
# It is instructive to see the number of parameters
#  in the summary. It shows that the Embedding layer behaves
#   like a two-layer network with max_vocabulary input neurons
#    and an output (hidden) layer of 32 neurons:
#     max_vocabulary * 32 = 320,000 weights
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           320000    
Total params: 320,000
Trainable params: 320,000
Non-trainable params: 0
_________________________________________________________________


In [46]:
# 3.3 Ideally we should be adding not one RNN but as many RNNs as
#     there are timesteps ie sequence length or 'max_len_review'.
#     But we add just one and perform internal looping. Note that
#     internal weights and hence SimpleRNN parameters remain same from one
#     'timestep' to another 'timestep'. You can verify this by
#     changing the value of max_len_review and seeing that the number
#     of parameters in the model summary after adding the following
#     does not change.

model.add(SimpleRNN(32,
                    return_sequences = False   # Make it True only if you
                                               # also add layer #3.4
                    )
                    )   # Output


In [None]:
# 3.4 OPTIONAL: 'RNN' ABOVE 'RNN'. IT WORKS, BUT TAKES TIME.
#     ACCURACY IS SOMEWHAT MORE.
#     A stacked RNN needs the full output sequence of the RNN below it,
#     so make return_sequences = True in 3.3 above before running this
#     cell. The guard skips the layer when that has not been done (and
#     when this cell is run a second time), so that running all cells
#     does not fail on an incompatible input shape.

if getattr(model.layers[-1], "return_sequences", False):
    model.add(SimpleRNN(
                        32,
                        return_sequences = False   # Top-most RNN emits only
                                                   # its last output
                        )
                        )   # Output
else:
    print("3.4 skipped: the last layer does not return sequences")



In [47]:
# Why SimpleRNN adds 2080 parameters?
#     input_features * output_features = 32 * 32  = 1024
#     state_t * output_features        = 32 * 32  = 1024
#     Bias                                            32
#     Total                                         2080
# This total is INDEPENDENT of sequence length or timesteps.
#
# (Written as comments on purpose: a bare string literal would be echoed
#  as cell output when ast_node_interactivity is "all".)
model.summary()


'\nWhy SimpleRNN adds 2080 parameters?\n    input_features * output_features = 32 * 32  = 1024\n    state_t * output_features        = 32 * 32  = 1024\n    Bias                                            32\n    Total                                         2080\nThis total is INDEPENDENT of sequence length or timesteps.\n'

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           320000    
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 32)                2080      
Total params: 322,080
Trainable params: 322,080
Non-trainable params: 0
_________________________________________________________________


In [48]:
# 3.5 Output layer: one sigmoid neuron fed by the 32 RNN outputs
#     (32 weights + 1 bias = 33 parameters, see summary)
model.add(Dense(1, activation = 'sigmoid'))
model.summary()
#help(model.compile)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           320000    
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 322,113
Trainable params: 322,113
Non-trainable params: 0
_________________________________________________________________


In [49]:
# 3.5.1 Compile the model. Accuracy is tracked under the name 'acc',
#       which is the key used later when plotting the learning curve.
model.compile(loss = 'binary_crossentropy',
              optimizer = 'rmsprop',
              metrics = ['acc'])



In [50]:
# 4.0 Train the model and measure how long training takes
epochs = 10
start = time.time()
history = model.fit(x_train,
                    y_train,
                    batch_size = 32,             # Number of samples per gradient update
                    validation_split = 0.2,      # Fraction of training data to be used as validation data
                    epochs = epochs,
                    shuffle = True,              # Shuffle training data before each epoch
                    verbose =1
                    )
end = time.time()
(end-start)/60                                   # Training time in minutes


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


9.425282375017803

In [None]:
# 5.0 Plot how network learns as per epochs
def plot_learning_curve(hist=None):
    """Plot training and validation accuracy against the epoch number.

    Parameters
    ----------
    hist : optional
        Object returned by `model.fit()`. Its `.history` dict must hold
        the per-epoch lists 'acc' and 'val_acc'. When omitted, the
        notebook-level variable `history` is used, so existing calls
        without arguments keep working.

    Returns
    -------
    None. The figure is shown with `plt.show()`.
    """
    record = (history if hist is None else hist).history
    val_acc = record['val_acc']
    tr_acc = record['acc']
    # A distinct local name avoids shadowing the notebook-level
    # 'epochs' (an int) with a range object.
    epoch_axis = range(1, len(val_acc) + 1)
    plt.plot(epoch_axis, val_acc, 'b', label = "Validation accu")
    plt.plot(epoch_axis, tr_acc, 'r', label = "Training accu")
    plt.title("Learning Curve: Training and validation accuracy")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()


In [None]:
# 5.1 Draw the learning curve of the training run above
plot_learning_curve()

In [None]:
# 6.1 Get x_test padded, with the same settings as used for x_train.
#     pad_sequences() takes a list of lists (each inner list being one
#     sequence) or an array of such lists.
x_test = sequence.pad_sequences(x_test, maxlen=max_len_review, padding='pre')

In [None]:
# 6.2 Predict now, then turn the sigmoid outputs into hard 0/1 labels
out = model.predict(x_test)
out[out > 0.5]  = 1      # Above the 0.5 threshold: class 1
out[out <= 0.5] = 0      # Otherwise: class 0
out

In [None]:
# 6.3 Evaluate on the test data
model.evaluate(x_test,y_test)
# 6.3.1 Names of the values returned by evaluate()
model.metrics_names        # ['loss', 'acc']


In [None]:
############ I am done ################

In [None]:

###############################################################
# Here is a quick text-to-integer conversion
#  For more study, please see file: 3.keras_tokenizer_class.py
###############################################################

from tensorflow.keras.preprocessing.text import Tokenizer
texts = ["Sun shines brightly  in June!",
         "Star light shines on water?",
         "Water is flowing.",
         "Flowing water, shines",
         "Sun is star?",
         "World shines",
         "Star also shines",
         "water is life",
         "Sun is energy"]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)    # Build the word index from these texts
tokenizer.word_index       # Index is created based on word-frequencies
                           # Most frequent word gets the least index
tokenizer.texts_to_sequences(texts)   # Convert each text to a sequence of word indices
#########
