<a href="https://colab.research.google.com/github/jeffyelson/IntroductiontoDeepLearning/blob/main/IDL_06_RNNImplementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt

In [None]:
max_words = 20000
max_len = 200

(train_sequences, train_labels), (test_sequences, test_labels) = tf.keras.datasets.imdb.load_data(num_words=max_words)


def preprocess(sequences, labels):
    return sequences, labels.astype(np.int32)

train_sequences, train_labels = preprocess(train_sequences, train_labels)
test_sequences, test_labels = preprocess(test_sequences, test_labels)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [None]:
vocabulary = tf.keras.datasets.imdb.get_word_index()
char_to_ind = vocabulary
ind_to_char = {ind: char for (char, ind) in vocabulary.items()}

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [None]:
# remember this? doesn't work...
train_data = tf.data.Dataset.from_tensor_slices((train_sequences, train_labels))

ValueError: ignored

In [None]:
# we can create a dataset from a python generator. first, we have to write the generator
# this is a very simple one, but we could execute arbitrary python code in here
# (say, loading files from disk and preparing the loaded inputs somehow)
def gen():
    for sequence, label in zip(train_sequences, train_labels):
        yield sequence, label

In [None]:
# we have to tell TF what to expect from the generator ("Tensor Specification")
train_data = tf.data.Dataset.from_generator(gen, output_signature=(
         tf.TensorSpec(shape=(None,), dtype=tf.int32),
         tf.TensorSpec(shape=(), dtype=tf.int32)))

# regular .batch wouldn't work because the inputs are different length.
# padded batch automatically pads all elements in the batch to the longest length
# per dimension.
# you can also specify different shapes and padding values other than 0.
# padding is always "post"
train_data = train_data.padded_batch(32)

In [None]:
for sequence, label in train_data:
    print(sequence.shape, label.shape)
    input()

(32, 888) (32,)


KeyboardInterrupt: ignored

In [None]:
# we have to tell TF what to expect from the generator ("Tensor Specification")
train_data = tf.data.Dataset.from_generator(gen, output_signature=(
         tf.TensorSpec(shape=(None,), dtype=tf.int32),
         tf.TensorSpec(shape=(), dtype=tf.int32)))

# alternatively, we can use bucketing. the idea is to define buckets for specific
# sequence lengths, and put all sequences in their corresponding bucket.
# when a batch is requested, first a bucket is selected and then all elements of
# the batch are taken from this bucket.
# this guarantees that all elements in a batch are roughly the same length,
# minimizing the amount of padding.

# here is an example with buckets in steps of 50. all sequences above length 500
# end up in the same bucket. same for sequences below length 50.
# do note that I by no means claim that this is a "good" bucketing. play around with it!
buckets = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
bucket_batch_size = [32] * (len(buckets) + 1)
train_data = train_data.bucket_by_sequence_length(lambda sequence, label: tf.shape(sequence)[0],
                                                  bucket_boundaries=buckets, bucket_batch_sizes=bucket_batch_size)
print(bucket_batch_size)

[32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32]


In [None]:
# NOTE!!
# you should probably still remove very long sequences (longer than some cutoff)
# before converting to a dataset

In [None]:
# compare the average batch shapes with the padded_batch example. there, batches are
# often length 800 or so because the longest sequence in the batch happened to
# have that length.
# with bucketing, we get many much smaller batches, meaning more efficient training.
for sequence, label in train_data:
    print(sequence.shape, label.shape)
    input()

(32, 147) (32,)


KeyboardInterrupt: ignored

In [None]:
# here's a very simple toy example for a keras lstm
# the "hidden dimensions" are just randomly chosen. 
# you probably don't want to use a hidden size of 12 =) (but maybe it's actually really good?)


# embedding comes first to replace one-hot vectors. 
#    mask_zero=True to prevent computations on padded time steps.
# then an arbitrary number of RNN layers.
# deeper RNN layers take as input sequence the state sequence of the layer before,
# so all layers except the last one should return_sequences=True
# finally, a Dense layer for the output, since the output computation is *not*
# included in the RNN cells; all cells provided by Keras only compute the states
model = tf.keras.Sequential([tf.keras.layers.Embedding(max_words, 20, mask_zero=True), 
                             tf.keras.layers.LSTM(12, return_sequences=True),
                             tf.keras.layers.LSTM(15),
                             tf.keras.layers.Dense(1)])


# FYI, the third line is the same as the first two lines together.
# the second option can use a much more efficient implementation, it will be SOOO much faster.
# try it yourself!
#rnn_cell = tf.keras.layers.LSTMCell(12)
#rnn = tf.keras.layers.RNN(rnn_cell, return_sequences=False)
rnn = tf.keras.layers.LSTM(12, return_sequences=False)
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 20)          400000    
                                                                 
 lstm (LSTM)                 (None, None, 12)          1584      
                                                                 
 lstm_1 (LSTM)               (None, 15)                1680      
                                                                 
 dense (Dense)               (None, 1)                 16        
                                                                 
Total params: 403,280
Trainable params: 403,280
Non-trainable params: 0
_________________________________________________________________


In [None]:
# calling RNN layers is easy!
one_hot_batch = tf.one_hot(sequence, depth=max_words)
rnn(one_hot_batch)

<tf.Tensor: shape=(32, 12), dtype=float32, numpy=
array([[-2.57647363e-03,  1.14720734e-02,  9.63272154e-03,
        -1.06069241e-02, -9.16691124e-03,  2.82309111e-03,
        -1.08515322e-02,  1.08175557e-02, -5.31226955e-03,
        -4.05588560e-03,  4.98163095e-03,  2.03115423e-03],
       [-2.19507259e-03,  3.69410450e-03, -5.19726053e-03,
         6.80072419e-03,  5.02034416e-03, -5.49666956e-03,
         2.79236771e-03,  2.44621094e-03, -1.27803083e-04,
         7.07393570e-04, -1.50572625e-03,  2.48943339e-03],
       [-2.61177751e-03,  1.88819710e-02,  1.44013604e-02,
        -1.96601860e-02, -1.25700189e-02,  1.01099873e-03,
        -1.73655991e-02,  1.34780919e-02, -8.68466683e-03,
        -5.60008176e-03,  7.07190018e-03,  4.41082334e-03],
       [-2.97777378e-03,  1.81576516e-02,  1.39152529e-02,
        -1.87522210e-02, -1.23918587e-02,  1.52583828e-03,
        -1.70959458e-02,  1.34468116e-02, -8.21813941e-03,
        -5.71518950e-03,  6.46761293e-03,  4.00661118e-03],
  

# Different RNN Implementations

##Model 1 - Basic LSTM

In [None]:
def gen_test():
    for sequence, label in zip(test_sequences, test_labels):
        yield sequence, label

test_data = tf.data.Dataset.from_generator(gen_test, output_signature=(
         tf.TensorSpec(shape=(None,), dtype=tf.int32),
         tf.TensorSpec(shape=(), dtype=tf.int32)))
buckets = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
bucket_batch_size = [32] * (len(buckets) + 1)
test_data = test_data.bucket_by_sequence_length(lambda sequence, label: tf.shape(sequence)[0],
                                                  bucket_boundaries=buckets, bucket_batch_sizes=bucket_batch_size)

In [None]:
rnnModel1 = tf.keras.Sequential([tf.keras.layers.Embedding(max_words, 64 , mask_zero=True), 
                             tf.keras.layers.LSTM(32, return_sequences=True),
                             tf.keras.layers.LSTM(64, return_sequences=True),
                             tf.keras.layers.LSTM(128),
                             tf.keras.layers.Dense(64,activation='relu'),
                             tf.keras.layers.Dense(1)])
rnnModel1.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 64)          1280000   
                                                                 
 lstm_6 (LSTM)               (None, None, 32)          12416     
                                                                 
 lstm_7 (LSTM)               (None, None, 64)          24832     
                                                                 
 lstm_8 (LSTM)               (None, 128)               98816     
                                                                 
 dense_3 (Dense)             (None, 64)                8256      
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,424,385
Trainable params: 1,424,385
No

In [None]:
optimizer = tf.optimizers.Adam()

loss_fn = tf.losses.BinaryCrossentropy(from_logits=True)

In [None]:
rnnModel1.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])


In [None]:
rnnModel1.fit(train_data, batch_size=bucket_batch_size, validation_data=test_data,epochs=5)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f4cb58234d0>

In [None]:
rnnModel1.evaluate(test_data)



[0.5721970200538635, 0.8433600068092346]

##Model 2 - Bidirectional LSTM with maximum sequence length

In [None]:
rnnModel2 = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_words, 64, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])
rnnModel2.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 200, 64)           1280000   
                                                                 
 bidirectional_6 (Bidirectio  (None, 200, 128)         66048     
 nal)                                                            
                                                                 
 bidirectional_7 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 dense_9 (Dense)             (None, 64)                4160      
                                                                 
 dense_10 (Dense)            (None, 1)                 65        
                                                                 
Total params: 1,391,489
Trainable params: 1,391,489
No

In [None]:
rnnModel2.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [None]:
rnnModel2.fit(train_data, batch_size=bucket_batch_size, validation_data=test_data,epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f4c44f6bd50>

In [None]:
rnnModel2.evaluate(test_data)



[0.39166027307510376, 0.8832399845123291]

##Model 3 - Bidirectional RNN without  mask_zero = True



In [None]:
rnnModel3 = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_words, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(32, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])
rnnModel3.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, None, 64)          1280000   
                                                                 
 bidirectional_10 (Bidirecti  (None, None, 64)         6208      
 onal)                                                           
                                                                 
 bidirectional_11 (Bidirecti  (None, 128)              16512     
 onal)                                                           
                                                                 
 dense_13 (Dense)            (None, 64)                8256      
                                                                 
 dense_14 (Dense)            (None, 1)                 65        
                                                                 
Total params: 1,311,041
Trainable params: 1,311,041
No

In [None]:
rnnModel3.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [None]:
rnnModel3.fit(train_data, batch_size=bucket_batch_size, validation_data=test_data,epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f4c45803cd0>

In [None]:
rnnModel3.evaluate(test_data)



[0.9867584109306335, 0.6502400040626526]

The performance without mask_zero seems to be slow. We also see that the model has overfit. The reason could also be large number of neurons.

##Model 4 - Bidirectional RNN with mask_zero = True

In [None]:
rnnModel4 = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_words, 64,mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(20, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(32)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1)
])
rnnModel4.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, None, 64)          1280000   
                                                                 
 bidirectional_12 (Bidirecti  (None, None, 40)         3400      
 onal)                                                           
                                                                 
 bidirectional_13 (Bidirecti  (None, 64)               4672      
 onal)                                                           
                                                                 
 dense_15 (Dense)            (None, 32)                2080      
                                                                 
 dense_16 (Dense)            (None, 1)                 33        
                                                                 
Total params: 1,290,185
Trainable params: 1,290,185
No

In [None]:
rnnModel4.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [None]:
rnnModel4.fit(train_data, batch_size=bucket_batch_size, validation_data=test_data,epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f4c32bf0b90>

In [None]:
rnnModel4.evaluate(test_data)



[0.5568530559539795, 0.8342800140380859]

It is seen that the model performs better and a higher accuracy can be seen

## Model 5 - GRU Implementation

In [None]:
rnnModel5 = tf.keras.Sequential([tf.keras.layers.Embedding(max_words, 64 , mask_zero=True), 
                             tf.keras.layers.GRU(32, return_sequences=True),
                             tf.keras.layers.GRU(64, return_sequences=True),
                             tf.keras.layers.GRU(128),
                             tf.keras.layers.Dense(64,activation='relu'),
                             tf.keras.layers.Dense(1)])
rnnModel5.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, None, 64)          1280000   
                                                                 
 gru (GRU)                   (None, None, 32)          9408      
                                                                 
 gru_1 (GRU)                 (None, None, 64)          18816     
                                                                 
 gru_2 (GRU)                 (None, 128)               74496     
                                                                 
 dense_17 (Dense)            (None, 64)                8256      
                                                                 
 dense_18 (Dense)            (None, 1)                 65        
                                                                 
Total params: 1,391,041
Trainable params: 1,391,041
No

In [None]:
rnnModel5.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [None]:
rnnModel5.fit(train_data, batch_size=bucket_batch_size, validation_data=test_data,epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f4be04693d0>

In [None]:
rnnModel5.evaluate(test_data)



[0.6090323328971863, 0.8679599761962891]

## Model 6 - Bidirectional GRU Implementation

In [None]:
rnnModel6 = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_words, 64, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])
rnnModel6.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 200, 64)           1280000   
                                                                 
 bidirectional_14 (Bidirecti  (None, 200, 128)         49920     
 onal)                                                           
                                                                 
 bidirectional_15 (Bidirecti  (None, 64)               31104     
 onal)                                                           
                                                                 
 dense_19 (Dense)            (None, 64)                4160      
                                                                 
 dense_20 (Dense)            (None, 1)                 65        
                                                                 
Total params: 1,365,249
Trainable params: 1,365,249
N

In [None]:
rnnModel6.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [None]:
rnnModel6.fit(train_data, batch_size=bucket_batch_size, validation_data=test_data,epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f4b6601ed10>

In [None]:
rnnModel6.evaluate(test_data)



[0.5706406831741333, 0.863319993019104]

## Model 7 - Bidirectional LSTM with lower vocabulary

In [None]:
rnnModel7 = tf.keras.Sequential([
    tf.keras.layers.Embedding(5000, 64, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])
rnnModel7.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 200, 64)           320000    
                                                                 
 bidirectional_16 (Bidirecti  (None, 200, 128)         66048     
 onal)                                                           
                                                                 
 bidirectional_17 (Bidirecti  (None, 64)               41216     
 onal)                                                           
                                                                 
 dense_21 (Dense)            (None, 64)                4160      
                                                                 
 dense_22 (Dense)            (None, 1)                 65        
                                                                 
Total params: 431,489
Trainable params: 431,489
Non-t

In [None]:
rnnModel7.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [None]:
rnnModel7.fit(train_data, batch_size=bucket_batch_size, validation_data=test_data,epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f4b69d72410>

In [None]:
rnnModel7.evaluate(test_data)



[0.28544676303863525, 0.8867999911308289]

With lower vocabulary the model performs better