<div style="line-height:0.5">
<h1 style="color:#BF66F2 ">  Bidirectional Recurrent Networks in PyTorch 1 </h1>
<h4> Vanilla Elman Network + Bidirectional LSTM + BiGRU. </h4> 
</div>

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 

In [3]:
import torch
import torch.nn as nn  
import torch.optim as optim  
import torch.nn.functional as func
from torch.utils.data import DataLoader
import torchvision.datasets as datasets 
import torchvision.transforms as transforms 

from tqdm import tqdm  

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

In [5]:
""" Hyperparameters """
input_size = 28
sequence_length = 28
num_layers = 2
hidden_size = 256
num_classes = 10
learning_rate = 3e-4
batch_size = 64
num_epochs = 2

<h3 style="color:#BF66F2"> Recap: RNN </h3>
<div style="margin-top: -17px;">

In the RNN bidirectional case, the RNN processes the input sequence both forward and backward in time,      
NOT ONLY FROM LEFT TO RIGHT AS Elman unidirectional Nets.    
It concatenates the output of the forward and backward passes.     
This allows the model to capture information from both past and future time steps, improving its ability to learn patterns in the data.     
In this case, the output of the bidirectional RNN has shape (batch_size, sequence_length, hidden_size * 2), instead of (batch_size, sequence_length, hidden_size). <br>
In fact, in BiRNN case, the output  of each forward and backward pass is concatenated along the hidden state dimension. 

Therefore, the final hidden state should be extracted from the concatenated output tensor before applying the linear transformation.    
This is done by selecting the last time step of the output tensor along the sequence dimension,     
using the indexing operation out[:, -1, :], which results in a tensor of shape (batch_size, hidden_size * 2).    
This tensor is then passed through the linear layer to obtain the final output tensor of shape (batch_size, num_classes).      

In [6]:
class myBiRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(myBiRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        """ Perform a Forward Pass (bidirectional case)
            Details: 
                # Initialize the hidden states as a tensor of zeros with dimension (num_layers *2, batch_size, hidden_size).
                # [No reshape() here, just an indexing operation] => out[:, -1, :] 
                    ==> selects the entire batch of output sequences, but only the last time step of each sequence, and 
                    all elements along the hidden state dimension. 
                    : selects all elements along the first dimension (which corresponds to the batch dimension).
                    -1 selects the last element along the second dimension (which corresponds to the sequence dimension).
                    : selects all elements along the third dimension (which corresponds to the hidden state dimension).

            Returns:
                Output tensor of shape (batch_size *2, output_size) => torch.Tensor.
        """
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])
        return out

In [7]:
class myBSLTM(nn.Module):
    """ GRU with bidirectional=True parameter and Linear layer fullyconnected \\ 
    to map the final hidden state to the output classes.
    """
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(myBSLTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)

        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])

        return out

In [8]:
class myBiGRU(nn.Module):
    """  GRU with bidirectional=True parameter and Linear layer fullyconnected \\ 
    to map the final hidden state to the output classes.
    """
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(myBiGRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
        out, _ = self.gru(x, h0)
        out = self.fc(out[:, -1, :])
        return out

In [9]:
train_dataset = datasets.MNIST(root="dataset/", train=True, transform=transforms.ToTensor(), download=True)
test_dataset = datasets.MNIST(root="dataset/", train=False, transform=transforms.ToTensor(), download=True)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)
test_loader

<torch.utils.data.dataloader.DataLoader at 0x7fcbedf27be0>

<h2 style="color:#BF66F2 "> Example 1: myBiRNN </h1>

In [10]:
""" Initialize network """
model1 = myBiRNN(input_size, hidden_size, num_layers, num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model1.parameters(), lr=learning_rate)

In [11]:
######### Train
for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(tqdm(train_loader)):
        data = data.to(device=device).squeeze(1)
        targets = targets.to(device=device)
        # Forward pass
        scores = model1(data)
        loss = criterion(scores, targets)
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        # Adam update
        optimizer.step()

  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
100%|██████████| 938/938 [01:37<00:00,  9.65it/s]
100%|██████████| 938/938 [02:28<00:00,  6.32it/s]


<h2 style="color:#BF66F2 "> Example 2: myBiGRU </h1>

In [12]:
model2 = myBiRNN(input_size, hidden_size, num_layers, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model2.parameters(), lr=learning_rate)

In [13]:
for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(tqdm(train_loader)):
        data = data.to(device=device).squeeze(1)
        targets = targets.to(device=device)
        # Forward pass
        scores = model2(data)
        loss = criterion(scores, targets)
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        # Adam update
        optimizer.step()

100%|██████████| 938/938 [02:48<00:00,  5.58it/s]
100%|██████████| 938/938 [02:27<00:00,  6.36it/s]


<h2 style="color:#BF66F2 "> Example 3: myBSLTM </h1>

In [14]:
model3 = myBSLTM(input_size, hidden_size, num_layers, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model3.parameters(), lr=learning_rate)

In [15]:
for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(tqdm(train_loader)):
        data = data.to(device=device).squeeze(1)
        targets = targets.to(device=device)
        # Forward pass
        scores = model3(data)
        loss = criterion(scores, targets)
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        # Adam update
        optimizer.step()

100%|██████████| 938/938 [09:02<00:00,  1.73it/s]
100%|██████████| 938/938 [08:33<00:00,  1.83it/s]


In [16]:
""" Train and check accuracy on training and test sets. """
def check_accuracy(loader, model):
    if loader.dataset.train:
        print("Checking accuracy on training data")
    else:
        print("Checking accuracy on test data")

    num_correct = 0
    num_samples = 0
    model.eval()

    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device).squeeze(1)
            y = y.to(device=device)
            
            scores = model(x)
            _, predictions = scores.max(1)
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)

        print(f"Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}")

    model.train()

In [17]:
check_accuracy(train_loader, model1)
check_accuracy(test_loader, model1)

Checking accuracy on training data
Got 56456 / 60000 with accuracy 94.09
Checking accuracy on test data
Got 9424 / 10000 with accuracy 94.24


In [18]:
check_accuracy(train_loader, model2)
check_accuracy(test_loader, model2)

Checking accuracy on training data
Got 56660 / 60000 with accuracy 94.43
Checking accuracy on test data
Got 9462 / 10000 with accuracy 94.62


In [19]:
check_accuracy(train_loader, model3)
check_accuracy(test_loader, model3)

Checking accuracy on training data
Got 58122 / 60000 with accuracy 96.87
Checking accuracy on test data
Got 9648 / 10000 with accuracy 96.48
