In [24]:
import numpy as np
from numpy import random
from time import perf_counter
import tensorflow as tf

In [25]:
def sigmoid(x):
    """Logistic sigmoid, 1 / (1 + exp(-x)), evaluated element-wise.

    x may be a scalar or a NumPy array; the result has the same shape as x.
    """
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

In [26]:
# Seed NumPy's global generator so the random draws below are reproducible
random.seed(10)

emb = 128                           # Size of each input embedding vector
T = 256                             # Number of time steps in the sequence
h_dim = 16                          # Size of the hidden state vector
h_0 = np.zeros(shape=(h_dim, 1))    # Initial hidden state: an all-zero column vector

In [27]:
# Each weight matrix acts on a vector of length emb + h_dim and produces h_dim values.
# The matrices are drawn first and the biases second, so the seeded generator is
# consumed in the order w1, w2, w3, b1, b2, b3.
w1, w2, w3 = (random.standard_normal((h_dim, emb + h_dim)) for _ in range(3))
b1, b2, b3 = (random.standard_normal((h_dim, 1)) for _ in range(3))

In [28]:
# Input sequence X: T entries, each a standard-normal column vector of shape (emb, 1)
X = random.standard_normal(size=(T, emb, 1))

In [29]:
# Weight lists for the two kinds of cell.
# Vanilla RNN: one weight matrix and one bias.
weights_vanilla = [w1, b1]

# GRU: three weight matrices followed by three biases. w1 and b1 are copied so this
# list does not share those array objects with weights_vanilla.
weights_GRU = [
    w1.copy(), w2, w3,
    b1.copy(), b2, b3,
]

In [30]:
def forward_V_RNN(inputs, weights):
    """Forward propagation for a single vanilla RNN cell.

    inputs  -- pair (x, h_t): the current input and the previous hidden state.
    weights -- pair (wh, bh): the cell's weight matrix and bias.

    Returns (y, h_t). The hidden-to-output transformation is taken to be the
    identity, so y and the new hidden state are the same array.
    """
    x, h_prev = inputs
    wh, bh = weights

    # Stack the previous hidden state on top of the input and apply the affine map
    stacked = np.concatenate([h_prev, x])
    pre_activation = np.dot(wh, stacked) + bh

    # Next hidden state
    h_next = sigmoid(pre_activation)

    # Identity output: y is the hidden state itself
    return h_next, h_next

In [31]:
def forward_GRU(inputs, weights):
    """Forward propagation for a single GRU cell.

    inputs  -- pair (x, h_t): the current input and the previous hidden state.
    weights -- sequence (wu, wr, wc, bu, br, bc): weight matrices for the update
               gate, relevance gate and candidate state, then their biases.

    Returns (y, h_t). The hidden-to-output transformation is taken to be the
    identity, so y and the new hidden state are the same array.
    """
    x, h_prev = inputs
    wu, wr, wc, bu, br, bc = weights

    # Both gates read the same stacked vector [h_prev; x]
    gate_input = np.concatenate([h_prev, x])

    # Update gate
    u = sigmoid(np.dot(wu, gate_input) + bu)

    # Relevance gate
    r = sigmoid(np.dot(wr, gate_input) + br)

    # Candidate hidden state: the relevance gate scales h_prev before stacking
    candidate_input = np.concatenate([r * h_prev, x])
    c = np.tanh(np.dot(wc, candidate_input) + bc)

    # New hidden state: blend of the candidate and the previous state, weighted by u
    h_next = u * c + (1 - u) * h_prev

    # Identity output: y is the hidden state itself
    return h_next, h_next

In [32]:
# Single-step check: run the GRU cell on X[1] starting from the initial state h_0
# and display y, the first element of the returned (y, h_t) pair.
forward_GRU([X[1], h_0], weights_GRU)[0]

array([[ 9.77779014e-01],
       [-9.97986240e-01],
       [-5.19958083e-01],
       [-9.99999886e-01],
       [-9.99707004e-01],
       [-3.02197037e-04],
       [-9.58733503e-01],
       [ 2.10804828e-02],
       [ 9.77365398e-05],
       [ 9.99833090e-01],
       [ 1.63200940e-08],
       [ 8.51874303e-01],
       [ 5.21399924e-02],
       [ 2.15495959e-02],
       [ 9.99878828e-01],
       [ 9.77165472e-01]])

In [33]:
def scan(fn, elems, weights, h_0):
    """Apply a cell function over a sequence, carrying the hidden state along.

    fn      -- callable invoked as fn([x, h], weights); must return (y, new_h).
    elems   -- iterable of per-step inputs.
    weights -- passed unchanged to every call of fn.
    h_0     -- hidden state used for the first step.

    Returns (ys, h_t): the list of per-step outputs and the final hidden state.
    If elems is empty, returns ([], h_0).
    """
    outputs = []
    state = h_0

    for step_input in elems:
        step_output, state = fn([step_input, state], weights)
        outputs.append(step_output)

    return outputs, state

In [34]:
# Simple RNN: time one full pass of the vanilla cell over the sequence X
tic = perf_counter()
ys, h_T = scan(forward_V_RNN, X, weights_vanilla, h_0)
toc = perf_counter()

# perf_counter() measures seconds; convert the elapsed time to milliseconds
RNN_time = 1000 * (toc - tic)
print(f"It took {RNN_time:.2f}ms to run the forward method for the vanilla RNN.")

It took 2.06ms to run the forward method for the vanilla RNN.


In [35]:
# GRU: time one full pass of the GRU cell over the sequence X
tic = perf_counter()
ys, h_T = scan(forward_GRU, X, weights_GRU, h_0)
toc = perf_counter()

# perf_counter() measures seconds; convert the elapsed time to milliseconds
GRU_time = 1000 * (toc - tic)
print(f"It took {GRU_time:.2f}ms to run the forward method for the GRU.")

It took 4.23ms to run the forward method for the GRU.


In [36]:
# Two stacked GRU layers that both return their full output sequence, followed by a
# Dense layer with 10 units.
model_GRU = tf.keras.Sequential(
    [
        tf.keras.layers.GRU(units=256, return_sequences=True, name='GRU_1_returns_seq'),
        tf.keras.layers.GRU(units=128, return_sequences=True, name='GRU_2_returns_seq'),
        # Left disabled in this model:
        # tf.keras.layers.GRU(64, name='GRU_3_returns_last_only'),
        tf.keras.layers.Dense(units=10),
    ]
)

In [37]:
# Dimensions of the random input batch
batch_size = 60
sequence_length = 50
word_vector_length = 40

input_data = tf.random.normal(
    shape=(batch_size, sequence_length, word_vector_length)
)

# Pass the data through the network
prediction = model_GRU(input_data)

# Show the summary of the model
model_GRU.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 GRU_1_returns_seq (GRU)     (60, 50, 256)             228864    
                                                                 
 GRU_2_returns_seq (GRU)     (60, 50, 128)             148224    
                                                                 
 dense_1 (Dense)             (60, 50, 10)              1290      
                                                                 
Total params: 378378 (1.44 MB)
Trainable params: 378378 (1.44 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [38]:
new_word_vector_length = 44  # It was 40 before
# batch_size = 60 and sequence_length = 50 are kept as originally
input_data_1 = tf.random.normal(
    shape=(batch_size, sequence_length, new_word_vector_length)
)

# Pass the data through the network. This should fail (if you ran all the cells above).
# The error is caught and printed because the failure itself is what this cell shows.
try:
    prediction = model_GRU(input_data_1)
except Exception as error:
    print(error)

Exception encountered when calling layer 'sequential_1' (type Sequential).

Input 0 of layer "GRU_1_returns_seq" is incompatible with the layer: expected shape=(None, None, 40), found shape=(60, 50, 44)

Call arguments received by layer 'sequential_1' (type Sequential):
  • inputs=tf.Tensor(shape=(60, 50, 44), dtype=float32)
  • training=None
  • mask=None


In [42]:
new_sequence_length = 55  # Before it was 50
# Keep the batch_size = 60 and word_vector_length = 40 as originally
input_data_2 = tf.random.normal([batch_size, new_sequence_length, word_vector_length])

try:
    prediction = model_GRU(input_data_2)
except Exception as e:
    print(e)
else:
    # Report the shape only when this call succeeded; after a failure `prediction`
    # would still hold the result of an earlier cell. print() is required because a
    # bare expression is displayed only when it is the last statement of a cell.
    print(prediction.shape)

model_GRU.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 GRU_1_returns_seq (GRU)     (60, None, 256)           228864    
                                                                 
 GRU_2_returns_seq (GRU)     (60, None, 128)           148224    
                                                                 
 dense_1 (Dense)             (60, None, 10)            1290      
                                                                 
Total params: 378378 (1.44 MB)
Trainable params: 378378 (1.44 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [43]:
# Three stacked GRU layers followed by a Dense layer with 10 units. The first two
# GRU layers return their full sequence; the third does not set return_sequences.
model_GRU_2 = tf.keras.Sequential(
    [
        tf.keras.layers.GRU(units=256, return_sequences=True, name='GRU_1_returns_seq'),
        tf.keras.layers.GRU(units=128, return_sequences=True, name='GRU_2_returns_seq'),
        tf.keras.layers.GRU(units=64, name='GRU_3_returns_last_only'),
        tf.keras.layers.Dense(units=10),
    ]
)

# Build from an explicit input shape instead of calling the model on data; the
# first two dimensions are left unspecified (None).
model_GRU_2.build(input_shape=[None, None, word_vector_length])

model_GRU_2.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 GRU_1_returns_seq (GRU)     (None, None, 256)         228864    
                                                                 
 GRU_2_returns_seq (GRU)     (None, None, 128)         148224    
                                                                 
 GRU_3_returns_last_only (G  (None, 64)                37248     
 RU)                                                             
                                                                 
 dense_2 (Dense)             (None, 10)                650       
                                                                 
Total params: 414986 (1.58 MB)
Trainable params: 414986 (1.58 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
