In [1]:
import tensorflow as tf
from tensorflow import keras
from keras import Sequential
from keras.layers import Dense, SimpleRNN

In [2]:
# SimpleRNN (Batchsize, timesteps, no of features)

model = Sequential()
model.add(SimpleRNN(3, input_shape=(4,5))) # SimpleRNN(Batch_size, (TimeSteps, no_of_features))
model.add(Dense(1, activation='sigmoid'))
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 3)                 27        
                                                                 
 dense (Dense)               (None, 1)                 4         
                                                                 
Total params: 31 (124.00 Byte)
Trainable params: 31 (124.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [3]:
print(model.get_weights()[0].shape)
model.get_weights()[0]

(5, 3)


array([[-0.8110293 , -0.6211981 , -0.18416983],
       [-0.34915495, -0.08607161, -0.02857989],
       [-0.52748   ,  0.23798114,  0.5137809 ],
       [ 0.22792763, -0.6575132 ,  0.6429303 ],
       [-0.2554695 , -0.7696267 ,  0.27624768]], dtype=float32)

In [4]:
print(model.get_weights()[1].shape)
model.get_weights()[1]

(3, 3)


array([[ 0.32521558,  0.3259725 ,  0.88768053],
       [ 0.94243145, -0.03446494, -0.33261847],
       [-0.07783061,  0.9447509 , -0.3184152 ]], dtype=float32)

In [5]:
print(model.get_weights()[2].shape)
model.get_weights()[2]

(3,)


array([0., 0., 0.], dtype=float32)

In [6]:
print(model.get_weights()[3].shape)
model.get_weights()[3]

(3, 1)


array([[ 0.7860352 ],
       [-0.02757668],
       [-0.8455278 ]], dtype=float32)

In [7]:
print(model.get_weights()[4].shape)
model.get_weights()[4]

(1,)


array([0.], dtype=float32)

In [8]:
print(model.get_weights()[5].shape)
model.get_weights()[5]

IndexError: list index out of range

## Approach 1 -  Integer encoding method

In [9]:
import numpy as np
documents = ['Good Morning', 'Hope you are all doing well', 
             'We are upgrading ourself in advance deep learning', 'India won the match', 'Fight between Israel and Hamaz',
             'America always support Israel', 'Australia won the match against pakistan', 'Microchip company opened in 1984', 'Kohli Kohli', 
             'Modi Ji jai ho', 'Chandrayaan 3 landed in south moon part']



In [10]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(oov_token='<kumar>') #Out of vocablury specification

In [11]:
tokenizer.fit_on_texts(documents)
tokenizer.word_index

{'<kumar>': 1,
 'in': 2,
 'are': 3,
 'won': 4,
 'the': 5,
 'match': 6,
 'israel': 7,
 'kohli': 8,
 'good': 9,
 'morning': 10,
 'hope': 11,
 'you': 12,
 'all': 13,
 'doing': 14,
 'well': 15,
 'we': 16,
 'upgrading': 17,
 'ourself': 18,
 'advance': 19,
 'deep': 20,
 'learning': 21,
 'india': 22,
 'fight': 23,
 'between': 24,
 'and': 25,
 'hamaz': 26,
 'america': 27,
 'always': 28,
 'support': 29,
 'australia': 30,
 'against': 31,
 'pakistan': 32,
 'microchip': 33,
 'company': 34,
 'opened': 35,
 '1984': 36,
 'modi': 37,
 'ji': 38,
 'jai': 39,
 'ho': 40,
 'chandrayaan': 41,
 '3': 42,
 'landed': 43,
 'south': 44,
 'moon': 45,
 'part': 46}

In [12]:
tokenizer.word_counts

OrderedDict([('good', 1),
             ('morning', 1),
             ('hope', 1),
             ('you', 1),
             ('are', 2),
             ('all', 1),
             ('doing', 1),
             ('well', 1),
             ('we', 1),
             ('upgrading', 1),
             ('ourself', 1),
             ('in', 3),
             ('advance', 1),
             ('deep', 1),
             ('learning', 1),
             ('india', 1),
             ('won', 2),
             ('the', 2),
             ('match', 2),
             ('fight', 1),
             ('between', 1),
             ('israel', 2),
             ('and', 1),
             ('hamaz', 1),
             ('america', 1),
             ('always', 1),
             ('support', 1),
             ('australia', 1),
             ('against', 1),
             ('pakistan', 1),
             ('microchip', 1),
             ('company', 1),
             ('opened', 1),
             ('1984', 1),
             ('kohli', 2),
             ('modi', 1),
             ('

In [13]:
tokenizer.document_count

11

In [14]:
documents

['Good Morning',
 'Hope you are all doing well',
 'We are upgrading ourself in advance deep learning',
 'India won the match',
 'Fight between Israel and Hamaz',
 'America always support Israel',
 'Australia won the match against pakistan',
 'Microchip company opened in 1984',
 'Kohli Kohli',
 'Modi Ji jai ho',
 'Chandrayaan 3 landed in south moon part']

In [15]:
#Representing words with tokens
sequences = tokenizer.texts_to_sequences(documents)
sequences

[[9, 10],
 [11, 12, 3, 13, 14, 15],
 [16, 3, 17, 18, 2, 19, 20, 21],
 [22, 4, 5, 6],
 [23, 24, 7, 25, 26],
 [27, 28, 29, 7],
 [30, 4, 5, 6, 31, 32],
 [33, 34, 35, 2, 36],
 [8, 8],
 [37, 38, 39, 40],
 [41, 42, 43, 2, 44, 45, 46]]

In [16]:
#Applying zero padding (Pre or post)
from keras.utils import pad_sequences
sequences = pad_sequences(sequences, padding='post')
sequences

array([[ 9, 10,  0,  0,  0,  0,  0,  0],
       [11, 12,  3, 13, 14, 15,  0,  0],
       [16,  3, 17, 18,  2, 19, 20, 21],
       [22,  4,  5,  6,  0,  0,  0,  0],
       [23, 24,  7, 25, 26,  0,  0,  0],
       [27, 28, 29,  7,  0,  0,  0,  0],
       [30,  4,  5,  6, 31, 32,  0,  0],
       [33, 34, 35,  2, 36,  0,  0,  0],
       [ 8,  8,  0,  0,  0,  0,  0,  0],
       [37, 38, 39, 40,  0,  0,  0,  0],
       [41, 42, 43,  2, 44, 45, 46,  0]], dtype=int32)

## Case study - IMDB movie review sentiment classification dataset

In [25]:
from keras.datasets import imdb
from keras import Sequential
from keras.layers import Dense, SimpleRNN, Embedding, Flatten

In [26]:
(x_train, y_train), (x_test, y_test) = imdb.load_data()
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(25000,) (25000,) (25000,) (25000,)


In [None]:
print(x_train) #Dataset already has tokenised data

[list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32])
 list([1, 194, 1153, 194, 825

In [None]:
x_train[100]

[1,
 13,
 244,
 6,
 87,
 337,
 7,
 628,
 2219,
 5,
 28,
 285,
 15,
 240,
 93,
 23,
 288,
 549,
 18,
 1455,
 673,
 4,
 241,
 534,
 3635,
 8448,
 20,
 38,
 54,
 13,
 258,
 46,
 44,
 14,
 13,
 1241,
 7258,
 12,
 5,
 5,
 51,
 9,
 14,
 45,
 6,
 762,
 7,
 17802,
 1309,
 328,
 5,
 428,
 2473,
 15,
 26,
 1292,
 5,
 3939,
 6728,
 5,
 1960,
 279,
 13,
 92,
 124,
 803,
 52,
 21,
 279,
 14,
 9,
 43,
 6,
 762,
 7,
 595,
 15,
 16,
 28911,
 23,
 4,
 1071,
 467,
 4,
 403,
 7,
 628,
 2219,
 8,
 97,
 6,
 171,
 3596,
 99,
 387,
 72,
 97,
 12,
 788,
 15,
 13,
 161,
 459,
 44,
 4,
 3939,
 1101,
 173,
 21,
 69,
 8,
 401,
 22239,
 4,
 481,
 88,
 61,
 4731,
 238,
 28,
 32,
 11,
 32,
 14,
 9,
 6,
 545,
 1332,
 766,
 5,
 203,
 73,
 28,
 43,
 77,
 317,
 11,
 4,
 22228,
 953,
 270,
 17,
 6,
 3616,
 13,
 545,
 386,
 25,
 92,
 1142,
 129,
 278,
 23,
 14,
 241,
 46,
 7,
 158]

In [None]:
print(len(x_train[0]))
print(len(x_train[1]))
print(len(x_train[5]))  
#Utterance lengths are varying (different timestamp), so padding is required


218
189
43


In [None]:
#Padding is required as we have different lenghts/timestamps for each review
from keras.utils import pad_sequences

x_train = pad_sequences(x_train, padding='post')
y_test = pad_sequences(x_test, padding='post')

In [None]:
print(len(x_train[0]))
print(len(x_train[1]))
print(len(x_train[5]))  

2494
2494
2494


In [None]:
#Perform padding and cap the data to 100, to reduce the complexity (Data is lost here) -- Not recommended in PROD
x_train = pad_sequences(x_train, padding='post', maxlen=200)
y_test = pad_sequences(x_test, padding='post', maxlen=200)

In [None]:
print(len(x_train[0]))
print(len(x_train[1]))
print(len(x_train[5]))  

200
200
200


### Simple RNN model - Approach 1 (Interger Encoding)

In [None]:
model = Sequential()
model.add(SimpleRNN(32, input_shape = (200, 1), return_sequences=False)) #200 max vocab size and 1 sequence, return_sequences=Output at the end
model.add(Dense(1, activation='sigmoid')) #1 Neuron as the review is either positive or negative 
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_1 (SimpleRNN)    (None, 32)                1088      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1121 (4.38 KB)
Trainable params: 1121 (4.38 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
1*32+ 32*32+ 32

1088

In [None]:
32*1 + 1

33

In [None]:
model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_test, y_test))

Epoch 1/10




ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).

## Approach 2 - Embedding method (Dense representation)

In [28]:
model = Sequential()
model.add(Embedding(10000,2))   #Vocab size = 10K(random), each vocab has 2 dimention. Means 2 numbers togather rep a word
model.add(SimpleRNN(32, return_sequences=False)) # Many to one scenario. Sentiment analysis (Many words, one sentiment) specified by return_sequences=F
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 2)           20000     
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 32)                1120      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 21153 (82.63 KB)
Trainable params: 21153 (82.63 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [30]:
model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_test, y_test))

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).