In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np


# GRU

The GRU has a similar structure to the LSTM, but with fewer parameters. It has two gates: a reset gate and an update gate. The reset gate determines how much of the previous hidden state is used when computing the candidate activation, and the update gate determines how the new hidden state is blended from the previous hidden state and that candidate.  

![gru](https://www.researchgate.net/publication/343002752/figure/fig1/AS:914127664979968@1594956427113/The-Architecture-of-basic-Gated-Recurrent-Unit-GRU.ppm)

The GRU update equations are as follows:

Reset gate: $r_t = \sigma(W_r x_t + U_r h_{t-1} + b_r)$  
Update gate: $z_t = \sigma(W_z x_t + U_z h_{t-1} + b_z)$  
Candidate activation: $\tilde{h_t} = \tanh(W x_t + r_t \odot (U h_{t-1}) + b)$  
Hidden state: $h_t = z_t \odot h_{t-1} + (1 - z_t) \odot \tilde{h_t}$  

where:

$x_t$ is the input at time step $t$.    
$h_{t-1}$ is the hidden state at the previous time step.  
$r_t$ is the reset gate at time step $t$.  
$z_t$ is the update gate at time step $t$.  
$\tilde{h_t}$ is the candidate activation at time step $t$.   
$h_t$ is the new hidden state at time step $t$.  
$W_r$, $U_r$, $b_r$, $W_z$, $U_z$, $b_z$, $W$, $U$, and $b$ are learnable parameters.  
Like the LSTM, the GRU can be stacked to create deeper architectures, and it can be bidirectional to capture information from both past and future time steps.  

In [7]:
"""
The model is defined in the GRU class, which inherits from the nn.Module class in PyTorch. The constructor method (__init__) initializes the parameters of the model, including the input size, hidden size, number of layers, bias, and output size. It also creates a list of GRU cells (self.rnn_cell_list), which will be used to compute the hidden state at each time step of the input sequence.

The forward method defines the forward pass of the model. It takes as input a tensor input of shape (batch_size, sequence_length, input_size), and an optional initial hidden state hx. If hx is not provided, it is initialized to a tensor of zeros.

The method then iterates over the time steps of the input sequence, and computes the hidden state at each time step using the GRU cells in self.rnn_cell_list. The hidden state of each layer is stored in the hidden list, which is initialized to hx if provided, or to zeros otherwise.

The `GRU` class inherits from `nn.Module` and defines a GRU architecture for sequence classification. It takes in the following arguments:

- `input_size`: The number of expected features in the input.
- `hidden_size`: The number of features in the hidden state `h`.
- `num_layers`: Number of recurrent layers. E.g., setting `num_layers=2` would mean stacking two GRUs together to form a `stacked GRU`, with the second GRU taking in outputs of the first GRU and producing the final results.
- `bias`: If `False`, then the layer does not use bias weights `b_ih` and `b_hh`. Default: `True`.
- `output_size`: The size of the output layer, which is the number of classes in the classification problem.

The `__init__` method initializes the architecture by creating a list of `GRUCell`s based on the number of layers specified. It also creates a linear layer `fc` to convert the final hidden state into the desired output size. 

The `forward` method takes in the input tensor `input` of shape `(batch_size, sequence_length, input_size)` and an optional initial hidden state `hx`. The method first initializes the initial hidden state `h0` to zeros if `hx` is not provided. 

The method then loops through the time steps of the input tensor, passing the input and hidden states through each layer of the GRU. The hidden states are stored in the `hidden` list, with each element of the list corresponding to the hidden state at each layer. The output at each time step is stored in the `outs` list.

After the loop, the method takes the last output from the `outs` list, applies a linear layer `fc` to it, and returns the final output tensor of shape `(batch_size, output_size)`.

"""

class GRUCell(nn.Module):
    """A single gated recurrent unit (GRU) cell that advances one time step.

    With ``x`` the input and ``h`` the previous hidden state, the cell computes

        r  = sigmoid(x_reset + h_reset)
        z  = sigmoid(x_upd + h_upd)
        n  = tanh(x_new + r * h_new)
        h' = z * h + (1 - z) * n

    where ``x_reset, x_upd, x_new`` are the three equal-width chunks of
    ``x2h(x)`` and ``h_reset, h_upd, h_new`` the three chunks of ``h2h(h)``.
    Fusing the three projections into one linear layer per source keeps the
    cell to two matrix multiplications per step.

    Args:
        input_size: number of features in the input ``x``.
        hidden_size: number of features in the hidden state ``h``.
        bias: if ``False``, neither linear projection has a bias term.
    """

    def __init__(self, input_size, hidden_size, bias=True):
        super().__init__()
        # Store the constructor argument (not the builtin ``input`` function).
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias

        # Each projection yields the reset, update and candidate pre-activations
        # side by side, hence the factor of three.
        self.x2h = nn.Linear(input_size, 3 * hidden_size, bias=bias)
        self.h2h = nn.Linear(hidden_size, 3 * hidden_size, bias=bias)

        self.reset_parameters()

    def reset_parameters(self):
        """Redraw every parameter from U(-1/sqrt(hidden_size), 1/sqrt(hidden_size))."""
        bound = 1.0 / np.sqrt(self.hidden_size)
        for param in self.parameters():
            # nn.init works under no_grad, so autograd never records the in-place fill.
            nn.init.uniform_(param, -bound, bound)

    def forward(self, input, hx=None):
        """Compute the next hidden state.

        Args:
            input: tensor of shape (batch_size, input_size).
            hx: previous hidden state of shape (batch_size, hidden_size).
                When omitted, a zero state is used.

        Returns:
            hy: the new hidden state, of shape (batch_size, hidden_size).
        """
        if hx is None:
            # new_zeros inherits the input's dtype and device.
            hx = input.new_zeros(input.size(0), self.hidden_size)

        gates_from_input = self.x2h(input)
        gates_from_hidden = self.h2h(hx)

        x_reset, x_upd, x_new = gates_from_input.chunk(3, 1)
        h_reset, h_upd, h_new = gates_from_hidden.chunk(3, 1)

        reset_gate = torch.sigmoid(x_reset + h_reset)
        update_gate = torch.sigmoid(x_upd + h_upd)
        # The reset gate scales the already-projected hidden contribution.
        new_gate = torch.tanh(x_new + reset_gate * h_new)

        # update_gate close to 1 keeps the old state; close to 0 adopts the candidate.
        hy = update_gate * hx + (1 - update_gate) * new_gate

        return hy


In [5]:
class GRU(nn.Module):
    """Stacked GRU that classifies a sequence from its final hidden state.

    The network is a stack of ``num_layers`` ``GRUCell`` modules followed by a
    linear layer. At every time step the first cell reads the input features
    and each deeper cell reads the hidden state just produced by the cell
    below it. The top layer's hidden state at the last time step is passed
    through ``fc`` to produce the output.

    Args:
        input_size: number of features per time step of the input.
        hidden_size: number of features in each layer's hidden state.
        num_layers: number of stacked cells; must be at least 1.
        bias: whether the cells use bias terms.
        output_size: number of features produced by the final linear layer.

    Raises:
        ValueError: if ``num_layers`` is smaller than 1.
    """

    def __init__(self, input_size, hidden_size, num_layers, bias, output_size):
        super().__init__()

        if num_layers < 1:
            raise ValueError("num_layers must be at least 1, got %r" % (num_layers,))

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bias = bias
        self.output_size = output_size

        # Layer 0 consumes the raw features; every later layer consumes the
        # hidden state of the layer beneath it.
        self.rnn_cell_list = nn.ModuleList([
            GRUCell(
                self.input_size if layer == 0 else self.hidden_size,
                self.hidden_size,
                self.bias,
            )
            for layer in range(self.num_layers)
        ])
        self.fc = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hx=None):
        """Run the stack over a whole sequence.

        Args:
            input: tensor of shape (batch_size, sequence_length, input_size).
            hx: optional initial hidden state of shape
                (num_layers, batch_size, hidden_size). Zeros when omitted.

        Returns:
            Tensor of shape (batch_size, output_size).

        Raises:
            IndexError: if the sequence length is 0, since there is no last
                time step to read.
        """
        if hx is None:
            # Build the default state from the input so it shares the input's
            # device and dtype; merely having CUDA installed must not move the
            # state to the GPU while the data and the model sit on the CPU.
            h0 = input.new_zeros(self.num_layers, input.size(0), self.hidden_size)
        else:
            h0 = hx

        # One running hidden state per layer.
        hidden = [h0[layer] for layer in range(self.num_layers)]

        outs = []
        for t in range(input.size(1)):
            layer_input = input[:, t, :]
            for layer, cell in enumerate(self.rnn_cell_list):
                hidden[layer] = cell(layer_input, hidden[layer])
                # The fresh state of this layer feeds the next layer up.
                layer_input = hidden[layer]
            # After the inner loop this is the top layer's state at step t.
            outs.append(layer_input)

        # Take only the last time step. Modify for seq to seq.
        # No squeeze here: it would drop the batch axis for a batch of one and
        # the feature axis when hidden_size == 1.
        out = self.fc(outs[-1])

        return out


In [6]:
# Define input tensor of shape (batch_size, sequence_length, input_size)
# torch.rand draws fresh values on every run, so the printed numbers vary between runs.
input_tensor = torch.rand((2, 5, 3))

# Define GRU module with input size=3, hidden size=4, 2 layers, bias=True, output size=2
gru = GRU(input_size=3, hidden_size=4,
            num_layers=2, bias=True, output_size=2)

# Forward pass through the GRU module (no initial hidden state is passed)
output_tensor = gru(input_tensor)

# Print the shape of the output tensor, then its values
print(output_tensor.shape)
print(output_tensor)


torch.Size([2, 2])
tensor([[ 0.0959, -0.3729],
        [ 0.0868, -0.3723]], grad_fn=<AddmmBackward0>)
