<a href="https://colab.research.google.com/github/PhatHuynhTranSon99/Neural-Networks-From-Scratch/blob/main/Gated_Recurrent_Unit_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gated Recurrent Unit from scratch (using only Numpy)

## Library import

In [1]:
import numpy as np
import random

## Utility functions

In [4]:
def softmax(x):
  return np.exp(x) / np.sum(np.exp(x))

def tanh(x):
  return np.tanh(x)

def sigmoid(x):
  return 1 / (1 + np.exp(-x))

## Create sentiment analysis dataset

In [5]:
# Create a dataset for sentiment analysis
train_data = {
  'good': True,
  'bad': False,
  'happy': True,
  'sad': False,
  'not good': False,
  'not bad': True,
  'not happy': False,
  'not sad': True,
  'very good': True,
  'very bad': False,
  'very happy': True,
  'very sad': False,
  'i am happy': True,
  'this is good': True,
  'i am bad': False,
  'this is bad': False,
  'i am sad': False,
  'this is sad': False,
  'i am not happy': False,
  'this is not good': False,
  'i am not bad': True,
  'this is not sad': True,
  'i am very happy': True,
  'this is very good': True,
  'i am very bad': False,
  'this is very sad': False,
  'this is very happy': True,
  'i am good not bad': True,
  'this is good not bad': True,
  'i am bad not good': False,
  'i am good and happy': True,
  'this is not good and not happy': False,
  'i am not at all good': False,
  'i am not at all bad': True,
  'i am not at all happy': False,
  'this is not at all sad': True,
  'this is not at all happy': False,
  'i am good right now': True,
  'i am bad right now': False,
  'this is bad right now': False,
  'i am sad right now': False,
  'i was good earlier': True,
  'i was happy earlier': True,
  'i was bad earlier': False,
  'i was sad earlier': False,
  'i am very bad right now': False,
  'this is very good right now': True,
  'this is very sad right now': False,
  'this was bad earlier': False,
  'this was very good earlier': True,
  'this was very bad earlier': False,
  'this was very happy earlier': True,
  'this was very sad earlier': False,
  'i was good and not bad earlier': True,
  'i was not good and not happy earlier': False,
  'i am not at all bad or sad right now': True,
  'i am not at all good or happy right now': False,
  'this was not happy and not good earlier': False,
}

test_data = {
  'this is happy': True,
  'i am good': True,
  'this is not happy': False,
  'i am not good': False,
  'this is not bad': True,
  'i am not sad': True,
  'i am very good': True,
  'this is very bad': False,
  'i am very sad': False,
  'this is bad not good': False,
  'this is good and happy': True,
  'i am not good and not happy': False,
  'i am not at all sad': True,
  'this is not at all good': False,
  'this is not at all bad': True,
  'this is good right now': True,
  'this is sad right now': False,
  'this is very bad right now': False,
  'this was good earlier': True,
  'i was not happy and not good earlier': False,
}

In [6]:
# Firstly, calculate the vocabulary size of training and test set
word_to_index = {}
current_index = 0

for sentence in train_data:
  # Split sentences to get words
  words = sentence.split()

  # Put into word to index
  for word in words:
    if word not in word_to_index:
      word_to_index[word] = current_index
      current_index += 1

for sentence in test_data:
  # Split sentences to get words
  words = sentence.split()

  # Put into word to index
  for word in words:
    if word not in word_to_index:
      word_to_index[word] = current_index
      current_index += 1

# Function to one-hot-encode word
def encode(word, word_to_index, vocab_size):
  # Create a numpy array
  encoding = np.zeros(vocab_size)

  # Place 1 into position of word
  encoding[word_to_index[word]] = 1

  return encoding

# Build the dataset from each sentences:
def build_dataset(data, word_to_index, vocab_size):
  # Initialize an array
  dataset = []

  # For each sentence
  for sentence in data:
    # Initialize current X and y
    current_X = []
    
    # Get the label 
    label = data[sentence]
    current_y = int(label)

    # Split into words
    words = sentence.split()

    # One-hot-encode each word and put it in the database
    for word in words:
      current_X.append(encode(word, word_to_index, vocab_size))

    # Then add X and Y into dataset
    dataset.append((current_X, current_y))

  return dataset

# Create datasets for training and testing
train_dataset = build_dataset(train_data, word_to_index, vocab_size=current_index)
test_dataset = build_dataset(test_data, word_to_index, vocab_size=current_index)

# Define dimensions
input_size = len(word_to_index)
hidden_size = 64

In [None]:
# Check the training dataset
input, label = train_dataset[20]
# Display
print("X: ", input)
print("Y: ", label)

X:  [array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0.]), array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0.]), array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0.]), array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0.])]
Y:  1


## Define layers

### Softmax layer

In [8]:
class Softmax:
  def __init__(self, input_size, output_size):
    # Cache the size
    self.input_size = input_size
    self.output_size = output_size

    # Create the parameters for learning
    self.W = np.random.randn(self.output_size, self.input_size) / 1000
    self.b = np.zeros((self.output_size,))

  def forward(self, x, y):
    # Cache the input x and output yy
    self.x = x
    self.y = y

    # Perform softmax calculation
    # And cache the result
    z = self.W @ x + self.b
    y_hat = softmax(z)
    self.y_hat = y_hat

    # Calculate the loss
    loss = - np.log(y_hat[y])

    # Return both the loss and z
    return y_hat, loss

  def backward(self, learning_rate=0.01):
    # Calculate dLoss/dz
    dz = - self.y_hat
    dz[self.y] -= 1

    # Calculate dLoss/dW and dLoss/dz
    dW = np.outer(dz, self.x)
    db = dz

    # Perform gradient descent
    self.W -= learning_rate * dW
    self.b -= learning_rate * db

    # Calculate the loss w.r.t to input
    dx = dz @ self.W

    return dx

In [20]:
class LogisticRegression:
  def __init__(self, input_size, learning_rate):
    self.input_size = input_size
    self.learning_rate = learning_rate
    self.u = np.random.randn(input_size)
    self.b_y = 0

  def forward(self, h):
    self.h = h;
    z = self.u.dot(h) + self.b_y
    y_hat = sigmoid(z)
    self.y_hat = y_hat
    return y_hat

  def backward(self, y):
    dz = self.y_hat - y
    du = dz * self.h
    dh = dz * self.u
    db_y = dz

    self.u -= self.learning_rate * du
    self.b_y -= self.learning_rate * db_y

    return dh

In [9]:
# Test softmax layer
x = np.random.randn(5)
softmax_layer = Softmax(5, 3)

# Forward pass
y_hat, loss = softmax_layer.forward(x, 2)

# Check if y_hat sums to 1
# and match the dimension
assert y_hat.sum() == 1
assert y_hat.shape == (3,)

# Backwardpass
dx = softmax_layer.backward()

# Check the dimension of gradient
assert dx.shape == (5,)

## Gated Recurrent Unit layer

In [30]:
class GatedRecurrentUnit:
  def __init__(self, input_size, hidden_size):
    # Cache the input and hidden size
    self.input_size = input_size
    self.hidden_size = hidden_size

    # Create the parameters
    self.W_u = np.random.randn(self.hidden_size, self.hidden_size) / 1000
    self.U_u = np.random.randn(self.hidden_size, self.input_size) / 1000
    self.b_u = np.zeros((self.hidden_size,))

    self.W_r = np.random.randn(self.hidden_size, self.hidden_size) / 1000
    self.U_r = np.random.randn(self.hidden_size, self.input_size) / 1000
    self.b_r = np.zeros((self.hidden_size,))

    self.W_h = np.random.randn(self.hidden_size, self.hidden_size) / 1000
    self.U_h = np.random.randn(self.hidden_size, self.input_size) / 1000
    self.b_h = np.zeros((self.hidden_size,))

  def forward(self, x):
    # Create the first hidden state h_0
    h_0 = np.zeros((self.hidden_size,))

    # Cache the input and hidden state
    self.x_cache = x
    self.state_cache = [{
        "a_t": None,
        "u_t": None,
        "b_t": None,
        "r_t": None,
        "c_t": None,
        "d_t": None,
        "h_t_candidate": None,
        "h_t": h_0
    }]

    # Perform forward propagation through time
    h = h_0
    for t in range(len(self.x_cache)):
      # Perform calculation
      a_t = self.W_u @ h + self.U_u @ x[t] + self.b_u
      u_t = sigmoid(a_t)

      b_t = self.W_r @ h + self.U_r @ x[t] + self.b_r
      r_t = sigmoid(b_t)

      c_t = r_t * h
      d_t = self.W_h @ c_t + self.U_h @ x[t] + self.b_h
      h_t_candidate = tanh(d_t)

      h_t = (1 - u_t) * h + u_t * h_t_candidate

      # Cache h value
      h = h_t
      self.state_cache.append({
          "a_t": a_t,
          "u_t": u_t,
          "b_t": b_t,
          "r_t": r_t,
          "c_t": c_t,
          "d_t": d_t,
          "h_t_candidate": h_t_candidate,
          "h_t": h_t
      })

    # Return the final hidden state (h_T)
    return h

  def backward(self, dh_T, learning_rate=0.01):
    # Save the gradient with respect to the weights and matrices
    dW_u = np.zeros_like(self.W_u)
    dU_u = np.zeros_like(self.U_u)
    db_u = np.zeros_like(self.b_u)

    dW_r = np.zeros_like(self.W_r)
    dU_r = np.zeros_like(self.U_r)
    db_r = np.zeros_like(self.b_r)

    dW_h = np.zeros_like(self.W_h)
    dU_h = np.zeros_like(self.U_h)
    db_h = np.zeros_like(self.b_h)

    # Start backpropagation through time
    dh = dh_T

    for t in reversed(range(len(self.x_cache))):
      # Get input
      x_t = self.x_cache[t]

      # Unwrap the previous state
      state_cache = self.state_cache[t+1]
      a_t = state_cache["a_t"]
      u_t = state_cache["u_t"]
      b_t = state_cache["b_t"]
      r_t = state_cache["r_t"]
      c_t = state_cache["c_t"]
      d_t = state_cache["d_t"]
      h_t_candidate = state_cache["h_t_candidate"]

      # This is h_(t-1)
      prev_cache = self.state_cache[t]
      prev_h_t = prev_cache["h_t"]

      # Calculate the gradient
      du = dh * (h_t_candidate - prev_h_t)
      dh_candidate = dh * u_t
      dh_prev_1 = - dh * u_t

      dd = dh_candidate * (1 - tanh(d_t) ** 2)
      dW_h_current = np.outer(dd, c_t)
      dU_h_current = np.outer(dd, x_t)
      db_h_current = dd
      dc = dd @ self.W_h
      dr = dc * prev_h_t
      dh_prev_2 = dc * r_t

      db = dr * r_t * (1 - r_t)
      dW_r_current = np.outer(db, prev_h_t)
      dU_r_current = np.outer(db, x_t)
      db_r_current = db
      dh_prev_3 = db @ self.W_r

      da = du * u_t * (1 - u_t)
      dW_u_current = np.outer(da, prev_h_t)
      dU_u_current = np.outer(da, x_t)
      db_u_current = da
      dh_prev_4 = da @ self.W_u

      # Accumulate to get previous h gradient
      dh_prev = dh_prev_1 + dh_prev_2 + dh_prev_3 + dh_prev_4

      # Update 
      dW_u += dW_u_current
      dU_u += dU_u_current
      db_u += db_u_current

      dW_r += dW_r_current
      dU_r += dU_r_current
      db_r += db_r_current

      dW_h += dW_h_current
      dU_h += dU_h_current
      db_h += db_h_current

      # Assign new dh
      dh = dh_prev

    # Gradient clipping to prevent
    # gradient explosion
    dW_u = np.clip(dW_u, -1, 1)
    dU_u = np.clip(dU_u, -1, 1)
    db_u = np.clip(db_u, -1, 1)

    dW_r = np.clip(dW_r, -1, 1)
    dU_r = np.clip(dU_r, -1, 1)
    db_r = np.clip(db_r, -1, 1)

    dW_h = np.clip(dW_h, -1, 1)
    dU_h = np.clip(dU_h, -1, 1)
    db_h = np.clip(db_h, -1, 1)

    # Update the parameters
    self.W_u -= learning_rate * dW_u
    self.U_u -= learning_rate * dU_u
    self.b_u -= learning_rate * db_u

    self.W_r -= learning_rate * dW_r
    self.U_r -= learning_rate * dU_r
    self.b_r -= learning_rate * db_r

    self.W_h -= learning_rate * dW_h
    self.U_h -= learning_rate * dU_h
    self.b_h -= learning_rate * db_h

In [11]:
# Test gru layer with training example
input, target = train_dataset[5]

# Define the size
input_size = input[0].shape[0]
hidden_size = 64

# Create layer
gru = GatedRecurrentUnit(
    input_size=input_size,
    hidden_size=64
)

# Forward pass
final_hidden_state = gru.forward(input)

# Check dimension
assert final_hidden_state.shape == (64,)
assert len(gru.x_cache) == len(input)
assert len(gru.state_cache) == len(input) + 1

# Check backward pass
dh_T = np.random.randn(64)
gru.backward(dh_T)

## Training phase

In [24]:
def train(rnn, log, epochs = 2000):
  for i in range(epochs):
    loss = 0
    train_correct = 0
    test_correct = 0

    # Loop through items in train dataset and train
    for x, y in train_dataset:
      h_T = rnn.forward(x)
      y_hat = log.forward(h_T)

      current_loss = - y * np.log(y_hat) - (1 - y) * np.log(1 - y_hat)
      loss += current_loss

      if (y == 0 and y_hat < 0.5) or (y == 1 and y_hat >= 0.5):
        train_correct += 1

      dh_T = log.backward(y)
      rnn.backward(dh_T)

    # Calculate test accuracy
    for x, y in test_dataset:
      h_T = rnn.forward(x)
      y_hat = log.forward(h_T)

      if (y == 0 and y_hat < 0.5) or (y == 1 and y_hat >= 0.5):
        test_correct += 1

    print(f"Epochs: {i}")
    print(f"Loss: {loss / len(train_dataset)}")
    print(f"Train Accuracy: {train_correct / len(train_dataset)}")
    print(f"Test Accuracy: {test_correct / len(test_dataset)}")
    print("--------------------")

In [29]:
# Create GRU and Softmax
gru_layer = GatedRecurrentUnit(
    input_size=input_size,
    hidden_size=64
)

log_layer = LogisticRegression(
    input_size=64,
    learning_rate=0.01
)

# Train
train(gru_layer, log_layer)

Epochs: 279
Loss: 0.014942297125941953
Train Accuracy: 1.0
Test Accuracy: 1.0
--------------------
Epochs: 280
Loss: 0.014782132824850214
Train Accuracy: 1.0
Test Accuracy: 1.0
--------------------
Epochs: 281
Loss: 0.014625460827046787
Train Accuracy: 1.0
Test Accuracy: 1.0
--------------------
Epochs: 282
Loss: 0.014472177426328425
Train Accuracy: 1.0
Test Accuracy: 1.0
--------------------
Epochs: 283
Loss: 0.014322173591104102
Train Accuracy: 1.0
Test Accuracy: 1.0
--------------------
Epochs: 284
Loss: 0.014175343084418687
Train Accuracy: 1.0
Test Accuracy: 1.0
--------------------
Epochs: 285
Loss: 0.014031559943360233
Train Accuracy: 1.0
Test Accuracy: 1.0
--------------------
Epochs: 286
Loss: 0.013890628906226929
Train Accuracy: 1.0
Test Accuracy: 1.0
--------------------
Epochs: 287
Loss: 0.013752715517049185
Train Accuracy: 1.0
Test Accuracy: 1.0
--------------------
Epochs: 288
Loss: 0.013617740621092923
Train Accuracy: 1.0
Test Accuracy: 1.0
--------------------
Epochs: 28

KeyboardInterrupt: ignored