# Lab 03 : Gated Recurrent Units (GRUs) with Attention -- exercise

#### Task

Implement a GRU-based seq2seq model with Luong attention (https://arxiv.org/pdf/1508.04025.pdf) and train the model on the French-to-English translation dataset. (Using the function nn.GRU() is allowed, but the attention scheme needs to be implemented **explicitly**.)

The Luong attention algorithm performs the following operations:

1. The encoder generates a set of hidden states, $H = \{\textbf{h}_i\},\ i = 1, 2, \ldots, T$, from the input sentence. The decoder generates a set of hidden states, $S = \{\textbf{s}_t\},\ t = 1, 2, \ldots$.
2. The current decoder hidden state is computed as: $\textbf{s}_t = GRU_{decoder}(\textbf{s}_{t-1}, y_{t-1})$. Here, $\textbf{s}_{t-1}$ denotes the previous hidden decoder state, and $y_{t-1}$ the current input, which is also the expected output for the previous timestep.

3. A dot product between each encoder hidden state $\textbf{h}_i$ and the current decoder hidden state $\textbf{s}_t$ is used to compute the alignment scores: $e_{t,i} = \textbf{s}_t \cdot \textbf{h}_i$.

4. A softmax function is applied to the alignment scores over the source positions, normalizing them into attention weights between 0 and 1 that sum to 1: $\alpha_{t, i} = \text{softmax}(\textbf{e}_t)_i = \frac{\exp(e_{t, i})}{\sum_{j=1}^{T}\exp(e_{t, j})}$.

5. These attention weights together with the encoder hidden states are used to generate a context vector through a weighted sum: $\textbf{c}_t = \sum_{i=1}^T\alpha_{t, i}\textbf{h}_i$.

6. An attentional hidden state is computed based on a weighted concatenation of the context vector and the current decoder hidden state: $\tilde{\textbf{s}_t} = \text{tanh}\big(W_c\big[\textbf{c}_t; \textbf{s}_t\big]\big)$.

7. The decoder produces a final output by feeding it a weighted attentional hidden state: $y_t = \text{softmax}(W_y\tilde{\textbf{s}_t})$.

8. Steps 2-7 are repeated until the end of the sequence.

The attention has to be calculated in parallel via matrix multiplication. For loops **should not** be used.

**Hints:**
1. torch.swapaxes or torch.transpose to convert from [seq_len, bs, hidden_size] to [bs, seq_len, hidden_size].
1. torch.bmm to perform batch matrix multiplication
1. torch.concat to concatenate $c_t$ and $s_t$
1. Training takes around 1 minute per epoch

*Prepared by Liu Xiaokang with the contribution of Chew Kin Whye*


In [None]:
# For Google Colaboratory
import sys, os
# Only runs inside Google Colab: mount Google Drive and move into the lab
# folder so that the relative paths used by later cells resolve there.
if 'google.colab' in sys.modules:
    # mount google drive
    from google.colab import drive
    drive.mount('/content/gdrive')
    # Location of the lab folder inside the mounted Drive; edit if yours differs
    path_to_file = '/content/gdrive/My Drive/CS5242_2025_codes/labs_lecture07/lab03_gru_attention'
    print(path_to_file)
    # change current path to the folder containing "file_name"
    os.chdir(path_to_file)
    # IPython shell escape (notebook only): show the working directory after the chdir
    !pwd

#### Dataset Download

The Europarl parallel corpus is extracted from the proceedings of the European Parliament. It includes versions in 21 European languages.

In this lab, we focus on translation between English and French. Please download the dataset by executing the following cell. (The downloading process may take a while)


In [None]:
import subprocess
import sys

try:
    import wget
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "wget"])
    import wget

import tarfile
import os

# Download and unpack the French-English Europarl v7 corpus into the current
# directory. Both steps are idempotent: the archive is fetched only when it is
# missing, and it is unpacked only when the extracted text files are missing
# (so an interrupted or skipped extraction is repaired on the next run).
url = 'https://www.statmt.org/europarl/v7/fr-en.tgz'
file_path = './'
archive_path = os.path.join(file_path, "fr-en.tgz")
# The two text files that the dataset cell opens after extraction
extracted_files = [os.path.join(file_path, name)
                   for name in ("europarl-v7.fr-en.en", "europarl-v7.fr-en.fr")]

if not os.path.exists(archive_path):
    downloaded_file = wget.download(url, file_path)
else:
    downloaded_file = archive_path

if not all(os.path.exists(path) for path in extracted_files):
    # Extract the .tgz file
    # NOTE(review): extractall trusts the member paths stored in the archive;
    # acceptable for this known corpus, but do not reuse on untrusted archives.
    with tarfile.open(downloaded_file, 'r:gz') as tar:
        tar.extractall(path=file_path)


#### Install Spacy if necessary

In [None]:
# Install the pinned spaCy release only when spaCy is not already importable.
try:
    import spacy
except ImportError:
    # IPython shell escape; this line is only valid inside a notebook
    !pip install spacy==3.7.5
    

#### Dataset class

This cell may take a while to execute as it downloads the tokenizers.

In [None]:
import subprocess
import sys
import spacy
import os
import numpy as np
import pickle
from typing import Iterable, List
from torch.nn.utils.rnn import pad_sequence
import torch
from torch.utils.data import DataLoader, Dataset
from collections import Counter

# Fetch the spaCy pipelines used for tokenization (French first, then English).
# check_call raises CalledProcessError when a download command fails.
for spacy_model in ("fr_core_news_sm", "en_core_web_sm"):
    subprocess.check_call([sys.executable, "-m", "spacy", "download", spacy_model])

# Translation direction: French is the source language, English the target
SRC_LANGUAGE = 'fr'
TGT_LANGUAGE = 'en'

# Custom dataset for french-to-english translation
class CustomDataset(Dataset):
    """French-to-English sentence pairs read from the Europarl v7 text files.

    Opens ``europarl-v7.fr-en.en`` and ``europarl-v7.fr-en.fr`` from the
    current working directory, keeps the pairs whose English line is shorter
    than ``max_len``, and exposes either the first ``train_size`` kept pairs
    (training split) or the ``test_size`` kept pairs that follow them
    (test split).

    Args:
        train: True selects the training split, False the test split.
        train_size: Number of kept pairs in the training split; also the
            offset at which the test split starts.
        test_size: Number of kept pairs in the test split.
        max_len: Exclusive upper bound on the English line length, counted
            in characters of the raw line (trailing newline included), not
            in tokens.

    Items are ``(french_line, english_line)`` tuples of raw strings that
    still carry their trailing newline.
    """

    def __init__(self, train, train_size=10000, test_size=1000, max_len=50):
        self.en_dir = "europarl-v7.fr-en.en"
        self.fr_dir = "europarl-v7.fr-en.fr"
        with open(self.en_dir, "r", encoding="utf8") as f:
            english_lines = f.readlines()
        with open(self.fr_dir, "r", encoding="utf8") as f:
            french_lines = f.readlines()
        # Only select sentences with length less than max_len.
        # (Attribute name keeps its original spelling for compatibility.)
        self.indicies = np.array(
            [i for i, line in enumerate(english_lines) if len(line) < max_len]
        )
        # Slice the index array first so that only the selected pairs are
        # copied, instead of materialising every short sentence and then
        # discarding most of them.
        if train:
            selected = self.indicies[:train_size]
        else:
            selected = self.indicies[train_size:train_size + test_size]
        self.english_data = [english_lines[i] for i in selected]
        self.french_data = [french_lines[i] for i in selected]

    def __len__(self):
        """Return the number of sentence pairs in this split."""
        return len(self.english_data)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as ``(french_line, english_line)``."""
        return self.french_data[idx], self.english_data[idx]

# Instantiate dataset
# dataset = CustomDataset(train=True)
# Use a small subset (2500 training / 250 test pairs) instead of the class defaults
train_size = 2500
test_size = 250
# Training split of the corpus
dataset = CustomDataset(train=True, train_size=train_size, test_size=test_size)


# A simple get_tokenizer function using spaCy directly
def get_tokenizer(model: str, language: str):
    """Build a tokenizer function backed by a spaCy pipeline.

    Args:
        model: Name of the spaCy pipeline handed to ``spacy.load``.
        language: Language code; not used by this function.

    Returns:
        A callable mapping a string to the list of its token texts.
    """
    pipeline = spacy.load(model)

    def tokenize(text: str) -> List[str]:
        token_texts = []
        for token in pipeline(text):
            token_texts.append(token.text)
        return token_texts

    return tokenize

# Tokenizer lookup keyed by language code (source first, then target)
token_transform = {
    SRC_LANGUAGE: get_tokenizer('fr_core_news_sm', SRC_LANGUAGE),
    TGT_LANGUAGE: get_tokenizer('en_core_web_sm', TGT_LANGUAGE),
}

class Vocab:
    """Two-way mapping between tokens and integer indices.

    Tokens that are not in the mapping resolve to ``default_index``; indices
    that are not in the mapping resolve to the token stored at
    ``default_index``, or to ``'<unk>'`` when that index is unmapped too.
    """

    def __init__(self, token_to_index: dict, default_index: int = 0):
        self.token_to_index = token_to_index
        self.default_index = default_index
        # Reverse table used by lookup_token
        reverse = {}
        for tok, idx in token_to_index.items():
            reverse[idx] = tok
        self.index_to_token = reverse

    def __call__(self, tokens: List[str]) -> List[int]:
        """Return the index of every token, in order."""
        ids = []
        for tok in tokens:
            ids.append(self.token_to_index.get(tok, self.default_index))
        return ids

    def __len__(self):
        """Return the number of tokens in the vocabulary."""
        return len(self.token_to_index)

    def set_default_index(self, index: int):
        """Change the index returned for unknown tokens."""
        self.default_index = index

    def lookup_token(self, index: int) -> str:
        """Return the token stored at ``index``, with the fallback described on the class."""
        if index in self.index_to_token:
            return self.index_to_token[index]
        return self.index_to_token.get(self.default_index, '<unk>')


# Function to build a vocabulary from an iterator over token lists
def build_vocab_from_iterator(iterator: Iterable[List[str]], min_freq: int = 1,
                              specials: List[str] = None, special_first: bool = True) -> Vocab:
    """Build a ``Vocab`` from an iterable of token lists.

    Args:
        iterator: Yields one list of tokens per sentence.
        min_freq: Tokens seen fewer than this many times are left out.
        specials: Special symbols that always get an entry, in this order.
            ``None`` means no special symbols.
        special_first: Place the special symbols before (True) or after
            (False) the corpus tokens.

    Returns:
        A ``Vocab`` whose corpus tokens are numbered in sorted order and
        whose default index is that of the first special symbol (0 when
        there are no specials).
    """
    specials = [] if specials is None else list(specials)
    counter = Counter()
    for token_list in iterator:
        counter.update(token_list)
    # Special symbols get their slots from `specials` alone. Leaving them out
    # of the corpus tokens prevents a duplicate entry, which would overwrite
    # the special's index with a later position and leave a gap in the
    # index range (so len(vocab) would no longer bound the largest index).
    reserved = set(specials)
    tokens = sorted(
        token for token, freq in counter.items()
        if freq >= min_freq and token not in reserved
    )
    final_tokens = specials + tokens if special_first else tokens + specials
    # Create token to index mapping
    token_to_index = {token: idx for idx, token in enumerate(final_tokens)}
    # Use the index of the first special token as default (usually <unk>)
    default_index = token_to_index[specials[0]] if specials else 0
    return Vocab(token_to_index, default_index)

# Helper function to yield tokens from the dataset iterator
def yield_tokens(data_iter: Iterable, language: str) -> Iterable[List[str]]:
    """Yield the token list of one side of every sentence pair.

    Each sample is indexed at position 0 for the source language and at
    position 1 for the target language, then tokenized with the tokenizer
    registered for ``language``.
    """
    column_of = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}
    for sample in data_iter:
        tokenizer = token_transform[language]
        raw_text = sample[column_of[language]]
        yield tokenizer(raw_text)

# Build vocabulary for both languages
# Define special symbols and indices
# The order of special_symbols has to match these indices: the vocabularies
# are built with special_first=True, which numbers the specials 0, 1, 2, 3.
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

# One vocabulary per language, built from `dataset`
vocab_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # A fresh iterator per language, since each pass consumes it
    train_iter = iter(dataset)
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                      min_freq=1,
                                                      specials=special_symbols,
                                                      special_first=True)
    # Tokens that are not in the vocabulary map to <unk>
    vocab_transform[ln].set_default_index(UNK_IDX)

# Fix PyTorch's random seed
torch.manual_seed(0)
SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])

# Functions to transform input sentences for training

def sequential_transforms(*transforms):
    """Compose callables left to right into a single function.

    The returned function feeds its argument to the first transform, passes
    every result on to the next transform, and returns the last result.
    With no transforms it returns its argument unchanged.
    """
    def func(txt_input):
        value = txt_input
        position = 0
        while position < len(transforms):
            value = transforms[position](value)
            position += 1
        return value
    return func

def tensor_transform(token_ids: List[int]):
    """Wrap token indices with BOS/EOS markers and return a 1-D long tensor.

    Args:
        token_ids: Token indices of one sentence; may be empty.

    Returns:
        Tensor of dtype ``torch.long`` and length ``len(token_ids) + 2``
        holding ``BOS_IDX``, the given indices, then ``EOS_IDX``.
    """
    # The dtype is explicit because torch.tensor([]) falls back to the default
    # float dtype; an empty sentence would otherwise mix float and integer
    # tensors in the concatenation.
    return torch.cat((torch.tensor([BOS_IDX], dtype=torch.long),
                      torch.tensor(token_ids, dtype=torch.long),
                      torch.tensor([EOS_IDX], dtype=torch.long)))

# Build text_transform dictionary combining tokenization, numericalization and tensor conversion
# Each entry maps a raw sentence string to a tensor of token indices with BOS/EOS added.
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln],  # Tokenization
                                               vocab_transform[ln],  # Numericalization
                                               tensor_transform)     # Add BOS/EOS tokens

def collate_fn(src, tgt):
    """Turn parallel batches of raw sentences into padded index tensors.

    Args:
        src: Source-language sentences (raw strings).
        tgt: Target-language sentences (raw strings), aligned with ``src``.

    Returns:
        ``(src_batch, tgt_batch)``: each sentence has its trailing newlines
        stripped, is run through the language's ``text_transform``, and the
        results are stacked by ``pad_sequence`` with ``PAD_IDX`` as filler.
    """
    encoded_pairs = [
        (text_transform[SRC_LANGUAGE](fr_line.rstrip("\n")),
         text_transform[TGT_LANGUAGE](en_line.rstrip("\n")))
        for fr_line, en_line in zip(src, tgt)
    ]
    src_tensors = [pair[0] for pair in encoded_pairs]
    tgt_tensors = [pair[1] for pair in encoded_pairs]

    padded_src = pad_sequence(src_tensors, padding_value=PAD_IDX)
    padded_tgt = pad_sequence(tgt_tensors, padding_value=PAD_IDX)
    return padded_src, padded_tgt


In [None]:
# Print an example: walk one batch through every preprocessing step and show
# that collate_fn produces the same result in a single call.
batch_size = 8
dataset = CustomDataset(train=True, train_size=train_size, test_size=test_size)
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
# One batch of raw (French, English) sentences
fr_sentence, eng_sentence = next(iter(train_dataloader))
print(f"Raw Inputs: {fr_sentence[0]}\n{eng_sentence[0]}")
# First we split the sentence into tokens
fr_token, eng_token = [token_transform["fr"](i.rstrip("\n")) for i in fr_sentence], [token_transform["en"](i.rstrip("\n")) for i in eng_sentence]
print(f"Tokenized Inputs: {fr_token[0]}\n{eng_token[0]}")
# Next we transform the tokens into numbers
fr_idx, eng_idx = [vocab_transform["fr"](i) for i in fr_token], [vocab_transform["en"](i) for i in eng_token]
print(f"Tokenized Inputs to indicies: {fr_idx[0]}\n{eng_idx[0]}")
# Next, we add the beginning of sentence, end of sentence
fr_pad, eng_pad = [tensor_transform(i) for i in fr_idx], [tensor_transform(i) for i in eng_idx]
print(f"Tokenized Indicies with begin (2) and end token (3): {fr_pad[0]}\n{eng_pad[0]}")
# Lastly, we pad the rest of the sentence
# This also changes the shape from (bs, seq_len) to (seq_len, bs)
fr_pad, eng_pad = pad_sequence(fr_pad, padding_value=PAD_IDX), pad_sequence(eng_pad, padding_value=PAD_IDX)
# Column 0 is the first sentence of the batch, since sequences run along dim 0
print(f"After padding (1): {fr_pad[:, 0]}\n{eng_pad[:, 0]}")

# All the above is combined into collate_fn
x, y = collate_fn(fr_sentence, eng_sentence)
print(f"Same Outputs: {x[:, 0]}\n{y[:, 0]}")


In [None]:
import gc
import pdb
import torch.nn as nn
import time
import math
import utils
# Hyperparameters
num_epochs = 5
hidden_size = 256   # used for the embedding size and the initial hidden state size
my_lr = 1.3         # initial learning rate for SGD
bs = 32             # batch size of the train/test loaders

# Variables
SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
# The loaders yield batches of raw strings; collate_fn is applied manually
# inside the training and evaluation loops.
train_dataset = CustomDataset(train=True, train_size=train_size, test_size=test_size)
train_dataloader = DataLoader(train_dataset, batch_size=bs, shuffle=True)
test_dataset = CustomDataset(train=False, train_size=train_size, test_size=test_size)
test_dataloader = DataLoader(test_dataset, batch_size=bs, shuffle=False)

def eval_on_test_set():

    running_loss=0
    num_batches=0

    h = torch.zeros(1, bs, hidden_size)

    h=h.to(device)

    for x, y in test_dataloader:
        x, y = collate_fn(x, y)
        # Batch size might be different for the last batch
        batch_size = x.size()[1]
        seq_length = y.size()[0] - 1
        # set the initial h to be the zero vector
        h = torch.zeros(1, batch_size, hidden_size)
        # send them to the gpu
        minibatch_data=x.type(torch.LongTensor).to(device)
        minibatch_label=y.type(torch.LongTensor).to(device)
        h=h.to(device)

        # COMPLETE HERE 
        scores = 
        minibatch_label = 
        loss = 
        # COMPLETE HERE 

        loss = criterion(scores ,  minibatch_label )

        # backward pass to compute dL/dR, dL/dV and dL/dW
        loss.backward()

        # update the running loss
        running_loss += loss.detach().item()
        num_batches += 1
        # Collect garbage to prevent OOM
        gc.collect()

    total_loss = running_loss/num_batches
    print('test: exp(loss) = ', math.exp(total_loss)  )
    return math.exp(total_loss)

class LuongAttention(nn.Module):
    """GRU encoder-decoder with Luong attention (exercise skeleton).

    Only the two embedding tables are provided. The GRU layers, the
    projections ``W_c`` and ``W_y`` and the body of ``forward`` are left as
    ``COMPLETE HERE`` placeholders, so the class does not run until they are
    filled in.
    """

    def __init__(self, hidden_size):
        super(LuongAttention, self).__init__()
        # Embeddings map source/target token indices to hidden_size-dimensional vectors
        self.source_embedding = nn.Embedding( SRC_VOCAB_SIZE, hidden_size )
        self.target_embedding = nn.Embedding( TGT_VOCAB_SIZE, hidden_size )
        # COMPLETE HERE 
        self.gru_encoder =
        self.gru_decoder =
        self.W_c =
        self.W_y =
        # COMPLETE HERE 

    def forward(self, x, y, h_init):
        # Shape comments below are examples for a single batch, not fixed sizes
        # x.shape = (15, 32)
        source_seq = self.source_embedding(x)  #(15, 32, 256)
        target_seq = self.target_embedding(y)  # (13, 32, 256)
        # COMPLETE HERE 


        # COMPLETE HERE 

        return 

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

if torch.cuda.is_available():
    print('cuda available with GPU:',torch.cuda.get_device_name(0))

net = LuongAttention( hidden_size )
# Helper from the lab's utils module; presumably reports the parameter count — TODO confirm
utils.display_num_param(net)

print(net)

net = net.to(device)

print('')
# NOTE(review): no ignore_index is set, so <pad> positions in the labels count
# toward the loss unless they are removed before calling criterion — confirm intended.
criterion = nn.CrossEntropyLoss()
def normalize_gradient(net):
    """Rescale the gradients of ``net`` in place to a global L2 norm of 1.

    The global norm is taken over all parameters that currently hold a
    gradient. When it is below 1e-4 the gradients are cleared with
    ``net.zero_grad()`` instead of being divided by a near-zero number.

    Args:
        net: Module whose parameter gradients are normalized.

    Returns:
        float: the global gradient norm measured before rescaling.
    """
    # Parameters that took no part in the last backward pass have grad None;
    # skipping them avoids an AttributeError.
    grads = [p.grad for p in net.parameters() if p.grad is not None]

    grad_norm_sq=0
    for g in grads:
        grad_norm_sq += g.norm()**2

    grad_norm=math.sqrt(grad_norm_sq)

    if grad_norm<1e-4:
        net.zero_grad()
        print('grad norm close to zero')
    else:
        for g in grads:
            g.div_(grad_norm)

    return grad_norm

# Training loop (exercise skeleton): the forward pass inside the try block is
# left as COMPLETE HERE placeholders and must be filled in before this runs.
start=time.time()

for epoch in range(num_epochs):
    # keep the initial learning rate during the first 4 epochs, then divide by 1.1 at every epoch
    if epoch >= 4:
        my_lr = my_lr / 1.1

    # create a new optimizer and give the current learning rate.
    optimizer=torch.optim.SGD( net.parameters() , lr=my_lr )

    # set the running quantities to zero at the beginning of the epoch
    running_loss=0
    num_batches=0

    for x, y in train_dataloader:
        # Set the gradients to zeros
        optimizer.zero_grad()
        # Transform inputs
        x, y = collate_fn(x, y)

        # Batch size might be different for the last batch
        batch_size = x.size()[1]
        seq_length = y.size()[0] - 1
        # set the initial h to be the zero vector
        h = torch.zeros(1, batch_size, hidden_size)
        # send them to the gpu
        minibatch_data=x.type(torch.LongTensor).to(device)
        minibatch_label=y.type(torch.LongTensor).to(device)

        h=h.to(device)

        try:
            # COMPLETE HERE 
            scores = 
            scores = 
            minibatch_label = 
            # COMPLETE HERE 

            loss = criterion(scores ,  minibatch_label )
        except RuntimeError:
            # Debugging aid: drop into the debugger on a runtime error.
            # NOTE(review): if execution continues from here, `loss` is either
            # undefined or left over from the previous batch.
            pdb.set_trace()
        # h is re-created from zeros for every batch, so this detach has no
        # effect on later iterations
        h=h.detach()

        # backward pass to compute dL/dR, dL/dV and dL/dW
        loss.backward()

        # do one step of stochastic gradient descent: R=R-lr(dL/dR), V=V-lr(dL/dV), ...
        # (gradients are first rescaled to unit global norm)
        normalize_gradient(net)
        optimizer.step()
        # update the running loss
        running_loss += loss.detach().item()
        num_batches += 1
        # Collect garbage to prevent OOM
        gc.collect()
    # compute stats for the full training set
    total_loss = running_loss / num_batches
    # elapsed is measured from the start of training, not of this epoch
    elapsed = time.time() - start

    print('')
    print('epoch=',epoch, '\t time=', elapsed,'\t lr=', my_lr, '\t exp(loss)=',  math.exp(total_loss))
    eval_on_test_set()
    

In [None]:
# Greedy decoding demo: translate the first 11 test sentences one at a time
# and print the reference tokens next to the predicted tokens.
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)
# Upper bound on the number of decoding steps per sentence
max_decode_steps = 20

count = 0
# Inference only, so autograd bookkeeping is switched off
with torch.no_grad():
    for x, y in test_dataloader:
        print(x)
        h = torch.zeros(1, 1, hidden_size)

        h=h.to(device)
        x, y = collate_fn(x, y)
        minibatch_data=x.type(torch.LongTensor).to(device)
        # Decoding starts from a single <bos> token
        start_index = torch.tensor([[BOS_IDX]]).type(torch.LongTensor).to(device)
        predictions=start_index
        for _ in range(max_decode_steps):
            # Re-run the model on the whole prefix decoded so far and take the
            # most likely token at every position.
            predictions = net(minibatch_data, predictions, h)
            # assumes the model output can be viewed as (steps, TGT_VOCAB_SIZE, 1) — TODO confirm
            predictions = torch.reshape(predictions, (-1, TGT_VOCAB_SIZE, 1))
            predictions = torch.argmax(predictions, dim=1)
            predictions = torch.cat([start_index, predictions], 0)
            # Stop as soon as the newest token is <eos>
            if predictions[-1].item() == EOS_IDX:
                break
        predictions = predictions.reshape(-1)
        predictions = [vocab_transform[TGT_LANGUAGE].lookup_token(i.item()) for i in predictions]
        print(f"Label: {[vocab_transform[TGT_LANGUAGE].lookup_token(i.item()) for i in y]}")
        print(f"Predicted: {predictions}")
        count += 1
        if count > 10:
            break
        