# **Lab Session: Implement a sequence-to-sequence model for machine translation using an encoder-decoder architecture.**

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
# Add any missing imports here, e.g.,
# from IPython import get_ipython
# from IPython.display import display

# Example dataset class for parallel sentences
class TranslationDataset(Dataset):
    """Parallel corpus of sentence pairs encoded as fixed-length id tensors.

    Sentences are lower-cased, split on whitespace, mapped to vocabulary ids,
    wrapped in ``<sos>`` / ``<eos>`` and padded or truncated to ``max_len``.

    Args:
        src_sentences: Source-language sentences (strings).
        trg_sentences: Target-language sentences, aligned index-by-index with
            ``src_sentences``.
        src_vocab: Mapping from source token to integer id. Must contain the
            special tokens ``<pad>``, ``<sos>``, ``<eos>`` and ``<unk>``.
        trg_vocab: Same as ``src_vocab``, for the target language.
        max_len: Length of every encoded sentence, including the ``<sos>``
            and ``<eos>`` markers.
    """

    def __init__(self, src_sentences, trg_sentences, src_vocab, trg_vocab, max_len=50):
        self.src_sentences = src_sentences
        self.trg_sentences = trg_sentences
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentence pairs (driven by the source side)."""
        return len(self.src_sentences)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as two ``int64`` tensors of length ``max_len``."""
        src = self.tokenize(self.src_sentences[idx], self.src_vocab)
        trg = self.tokenize(self.trg_sentences[idx], self.trg_vocab)
        return (torch.tensor(src, dtype=torch.long),
                torch.tensor(trg, dtype=torch.long))

    def tokenize(self, sentence, vocab):
        """Encode ``sentence`` as a list of exactly ``max_len`` vocabulary ids.

        Unknown tokens map to ``<unk>``. Over-long sentences lose their
        trailing *content* tokens, never the ``<eos>`` marker, so the decoder
        always sees an end-of-sequence target.
        """
        tokens = sentence.lower().strip().split()
        # Reserve two slots for <sos>/<eos> before truncating; slicing after
        # the markers were added would cut <eos> off long sentences.
        content_budget = max(self.max_len - 2, 0)
        unk = vocab['<unk>']
        idxs = [vocab.get(tok, unk) for tok in tokens[:content_budget]]
        idxs = [vocab['<sos>']] + idxs + [vocab['<eos>']]
        # pad
        missing = self.max_len - len(idxs)
        if missing > 0:
            idxs += [vocab['<pad>']] * missing
        # Only trims when max_len < 2, where both markers cannot fit.
        return idxs[:self.max_len]





In [8]:
# Encoder: bidirectional GRU
class Encoder(nn.Module):
    """Bidirectional multi-layer GRU encoder.

    Args:
        input_dim: Size of the source vocabulary.
        emb_dim: Embedding width.
        hid_dim: Hidden size of each GRU direction.
        n_layers: Number of stacked GRU layers.
        dropout: Dropout probability applied to the embeddings and between
            GRU layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers=2, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        # Inter-layer dropout has no effect on a single-layer GRU and only
        # triggers a warning, so it is switched off in that case.
        self.gru = nn.GRU(emb_dim, hid_dim, n_layers,
                          dropout=dropout if n_layers > 1 else 0.0,
                          bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        # Projects the concatenated forward/backward state back to hid_dim.
        self.fc = nn.Linear(hid_dim * 2, hid_dim)

    def forward(self, src):
        """Encode a batch of source id sequences.

        Args:
            src: Token ids, ``[batch, src_len]``.

        Returns:
            A tuple ``(outputs, hidden)`` where ``outputs`` is
            ``[batch, src_len, hid_dim*2]`` and ``hidden`` is
            ``[n_layers, batch, hid_dim]``: the same projected summary state
            copied once per layer, for use as the decoder's initial state.
        """
        embedded = self.dropout(self.embedding(src))  # [batch, src_len, emb_dim]
        outputs, hidden = self.gru(embedded)
        # outputs: [batch, src_len, hid_dim*2]
        # hidden: [n_layers*2, batch, hid_dim]
        # The last two entries are the top layer's forward and backward
        # final states; merge them into a single vector per example.
        top_forward, top_backward = hidden[-2], hidden[-1]
        summary = torch.tanh(self.fc(torch.cat((top_forward, top_backward), dim=1)))  # [batch, hid_dim]
        # One copy per layer; derive the count from the GRU so that any
        # n_layers value yields a correctly shaped initial state.
        return outputs, summary.unsqueeze(0).repeat(self.gru.num_layers, 1, 1)


In [9]:

# Attention mechanism
class Attention(nn.Module):
    """Additive attention over encoder outputs.

    Each decoder layer's hidden state is scored against every source
    position; the per-layer scores are averaged and normalised with a
    softmax over the source dimension.

    Args:
        hid_dim: Decoder hidden size. Encoder outputs are expected to be
            ``hid_dim * 2`` wide.
    """

    def __init__(self, hid_dim):
        super().__init__()
        # Input width = decoder hidden (hid_dim) + encoder output (hid_dim*2),
        # i.e. the last dimension of the concatenation built in forward().
        self.attn = nn.Linear(hid_dim + hid_dim * 2, hid_dim)
        self.v = nn.Linear(hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs, mask=None):
        """Compute attention weights over the source sequence.

        Args:
            hidden: Decoder state, ``[n_layers, batch, hid_dim]``.
            encoder_outputs: ``[batch, src_len, hid_dim*2]``.
            mask: Optional ``[batch, src_len]`` tensor, truthy at positions
                that may be attended to (e.g. non-padding tokens). Masked
                positions get zero weight. If every position of a row is
                masked, that row falls back to uniform weights.

        Returns:
            Attention weights, ``[batch, src_len]``; each row sums to 1.
        """
        src_len = encoder_outputs.shape[1]
        n_layers = hidden.shape[0]

        # expand() creates broadcast views; torch.cat below materialises the
        # combined tensor once, so no intermediate copies are needed.
        # hidden: [n_layers, batch, hid_dim] -> [batch, src_len, n_layers, hid_dim]
        query = hidden.transpose(0, 1).unsqueeze(1).expand(-1, src_len, -1, -1)
        # encoder_outputs: [batch, src_len, hid_dim*2] -> [batch, src_len, n_layers, hid_dim*2]
        keys = encoder_outputs.unsqueeze(2).expand(-1, -1, n_layers, -1)

        energy = torch.tanh(self.attn(torch.cat((query, keys), dim=-1)))
        # energy: [batch, src_len, n_layers, hid_dim]

        scores = self.v(energy).squeeze(-1)  # [batch, src_len, n_layers]
        # Average across layers: one score per source position.
        scores = scores.mean(dim=-1)  # [batch, src_len]

        if mask is not None:
            # Most negative finite value rather than -inf, so a fully masked
            # row yields uniform weights instead of NaN.
            blocked = ~mask.to(torch.bool)
            scores = scores.masked_fill(blocked, torch.finfo(scores.dtype).min)

        return torch.softmax(scores, dim=1)

In [10]:

# Decoder: with attention
class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    Args:
        output_dim: Size of the target vocabulary.
        emb_dim: Embedding width.
        hid_dim: GRU hidden size; encoder outputs are ``hid_dim * 2`` wide.
        n_layers: Number of stacked GRU layers.
        dropout: Dropout probability for the embeddings and between GRU layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers=2, dropout=0.5):
        super().__init__()
        context_dim = hid_dim * 2  # width of one attention-weighted encoder vector
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.gru = nn.GRU(context_dim + emb_dim, hid_dim, n_layers,
                          dropout=dropout, batch_first=True)
        # The output layer sees GRU output + context + embedding side by side.
        self.fc_out = nn.Linear(hid_dim + context_dim + emb_dim, output_dim)
        self.attention = Attention(hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one target position.

        Args:
            input: Previous target token ids, ``[batch]``.
            hidden: Decoder state, ``[n_layers, batch, hid_dim]``.
            encoder_outputs: ``[batch, src_len, hid_dim*2]``.

        Returns:
            ``(prediction, hidden, attn_weights)``: vocabulary scores
            ``[batch, output_dim]``, the updated decoder state, and the
            attention weights ``[batch, src_len]``.
        """
        # Add a length-1 time axis so the token can pass through the GRU.
        token_emb = self.dropout(self.embedding(input.unsqueeze(1)))  # [batch, 1, emb_dim]

        weights = self.attention(hidden, encoder_outputs)  # [batch, src_len]
        # Weighted sum of encoder outputs for every example in the batch.
        context = torch.bmm(weights.unsqueeze(1), encoder_outputs)  # [batch, 1, hid_dim*2]

        gru_in = torch.cat((token_emb, context), dim=2)
        step_out, hidden = self.gru(gru_in, hidden)  # step_out: [batch, 1, hid_dim]

        # Drop the time axis and stack the three feature groups for the classifier.
        features = torch.cat(
            (step_out.squeeze(1), context.squeeze(1), token_emb.squeeze(1)),
            dim=1,
        )
        prediction = self.fc_out(features)  # [batch, output_dim]
        return prediction, hidden, weights


In [11]:
# Seq2Seq wrapper
class Seq2Seq(nn.Module):
    """Ties an encoder and a step-wise decoder together for training.

    Args:
        encoder: Module mapping ``src`` to ``(encoder_outputs, hidden)``.
        decoder: Module mapping ``(input, hidden, encoder_outputs)`` to
            ``(prediction, hidden, attention)``; must expose ``embedding``.
        device: Device on which the output buffer is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg.shape[1] - 1`` steps, starting from ``trg[:, 0]``.

        Args:
            src: Source token ids, ``[batch, src_len]``.
            trg: Target token ids, ``[batch, trg_len]``; column 0 is the
                start token fed to the first decoding step.
            teacher_forcing_ratio: Probability, drawn once per step, of
                feeding the ground-truth token instead of the model's own
                best guess as the next input.

        Returns:
            Scores of shape ``[batch, trg_len, vocab]``. Position 0 is never
            written and stays zero.
        """
        n_examples, n_steps = src.shape[0], trg.shape[1]
        vocab_size = self.decoder.embedding.num_embeddings
        outputs = torch.zeros(n_examples, n_steps, vocab_size, device=self.device)

        encoder_outputs, hidden = self.encoder(src)

        step_input = trg[:, 0]  # <sos>
        for t in range(1, n_steps):
            scores, hidden, _ = self.decoder(step_input, hidden, encoder_outputs)
            outputs[:, t, :] = scores
            # One random draw per step decides the source of the next input.
            if torch.rand(1).item() < teacher_forcing_ratio:
                step_input = trg[:, t]
            else:
                step_input = scores.argmax(1)
        return outputs


In [12]:
# Usage example (training loop)
if __name__ == "__main__":
    # --- Data loading (replace with your own logic) ---
    # For a real corpus you would typically do something like:
    #   from utils import load_data, build_vocab
    #   src_sentences, trg_sentences = load_data("data.txt")
    #   src_vocab = build_vocab(src_sentences)
    #   trg_vocab = build_vocab(trg_sentences)

    # Toy placeholder corpus and vocabularies so the script runs end to end.
    src_sentences = ["I am fine .", "How are you ?"]
    trg_sentences = ["Je vais bien .", "Comment allez-vous ?"]

    # Both vocabularies start with the same four special tokens (ids 0-3);
    # the remaining ids follow the order of the token lists below.
    special_tokens = ['<pad>', '<sos>', '<eos>', '<unk>']
    src_tokens = ['i', 'am', 'fine', '.', 'how', 'are', 'you', '?']
    trg_tokens = ['je', 'vais', 'bien', '.', 'comment', 'allez-vous', '?']
    src_vocab = {tok: idx for idx, tok in enumerate(special_tokens + src_tokens)}
    trg_vocab = {tok: idx for idx, tok in enumerate(special_tokens + trg_tokens)}

    dataset = TranslationDataset(src_sentences, trg_sentences, src_vocab, trg_vocab)
    loader = DataLoader(dataset, batch_size=32, shuffle=True)

    # Prefer the GPU when one is available.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    EMB_DIM = 256
    HID_DIM = 512
    encoder = Encoder(input_dim=len(src_vocab), emb_dim=EMB_DIM, hid_dim=HID_DIM).to(device)
    decoder = Decoder(output_dim=len(trg_vocab), emb_dim=EMB_DIM, hid_dim=HID_DIM).to(device)
    model = Seq2Seq(encoder, decoder, device).to(device)

    optimizer = optim.Adam(model.parameters())
    # Padding positions in the target must not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=trg_vocab['<pad>'])

    model.train()

In [13]:

    # Example: train for 10 epochs, printing the mean batch loss after each one.
    n_epochs = 10
    for epoch in range(1, n_epochs + 1):
        epoch_loss = 0
        for src_batch, trg_batch in loader:
            src_batch = src_batch.to(device)
            trg_batch = trg_batch.to(device)

            optimizer.zero_grad()
            logits = model(src_batch, trg_batch)  # [batch, trg_len, vocab]

            # Skip position 0 (the <sos> slot) on both sides, then flatten
            # to [(batch * (trg_len - 1)), vocab] vs. [(batch * (trg_len - 1))].
            vocab_size = logits.shape[-1]
            flat_logits = logits[:, 1:, :].reshape(-1, vocab_size)
            flat_targets = trg_batch[:, 1:].reshape(-1)

            loss = criterion(flat_logits, flat_targets)
            loss.backward()
            # Clip the global gradient norm before the optimizer update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
            optimizer.step()

            epoch_loss += loss.item()
        print(f"Epoch {epoch} Loss: {epoch_loss/len(loader):.4f}")

Epoch 1 Loss: 2.3255
Epoch 2 Loss: 2.2261
Epoch 3 Loss: 1.9156
Epoch 4 Loss: 1.4333
Epoch 5 Loss: 1.2246
Epoch 6 Loss: 0.8954
Epoch 7 Loss: 0.6293
Epoch 8 Loss: 0.4267
Epoch 9 Loss: 0.2713
Epoch 10 Loss: 0.2876
