# Projects in AI and ML: HW5 Task 3

In [2]:
import numpy as np
def scaled_dot_product_attention(Q, K, V):
    """
    Compute scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Q: Query matrix, shape (..., n_queries, d_k)
    K: Key matrix,   shape (..., n_keys, d_k)
    V: Value matrix, shape (..., n_keys, d_v)

    The leading "..." dimensions (for example batch or head) are optional and
    follow np.matmul broadcasting; plain 2-D matrices behave as before.

    Returns a tuple (output, attention_weights) of shapes
    (..., n_queries, d_v) and (..., n_queries, n_keys). Every row of
    attention_weights sums to 1.
    """
    Q, K, V = np.asarray(Q), np.asarray(K), np.asarray(V)
    d_k = Q.shape[-1]
    # Swap only the last two axes of K. `K.T` reverses *all* axes and np.dot
    # does not batch, so both give wrong shapes for (batch, seq, dim) inputs.
    K_t = np.swapaxes(K, -1, -2) if K.ndim >= 2 else K
    scores = np.matmul(Q, K_t) / np.sqrt(d_k)
    # Subtract the row-wise maximum before exponentiating so large scores
    # cannot overflow; the softmax result is unaffected by this shift.
    attention_weights = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    attention_weights /= np.sum(attention_weights, axis=-1, keepdims=True)
    output = np.matmul(attention_weights, V)
    return output, attention_weights

# Worked example: a single query attending over three key/value pairs
Q = np.array([
    [1, 0, 1],
])
K = np.array([
    [1, 0, 1],
    [0, 1, 0],
    [1, 1, 1],
])
V = np.array([
    [1, 2],
    [0, 3],
    [4, 5],
])

# Run attention, then show the blended values and the weights that produced them
output, attn_weights = scaled_dot_product_attention(Q, K, V)
print("Attention Output:\n", output)
print("Attention Weights:\n", attn_weights)


Attention Output:
 [[2.15968551 3.4319371 ]]
Attention Weights:
 [[0.4319371 0.1361258 0.4319371]]


In [3]:
# Part 2: Seq2Seq with Attention
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class Encoder(nn.Module):
    """Embed source token ids and encode them with a multi-layer LSTM."""

    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout):
        """
        input_dim:  size of the source vocabulary (rows of the embedding table)
        emb_dim:    embedding width
        hidden_dim: LSTM hidden-state width
        n_layers:   number of stacked LSTM layers
        dropout:    dropout probability, applied to the embeddings and between
                    LSTM layers
        """
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, return_outputs=False):
        """
        Encode a batch of source sequences.

        src: integer token ids; the LSTM is not batch_first, so the expected
             layout is (src_len, batch).
        return_outputs: when True, also return the per-time-step outputs of the
             top LSTM layer, which an attention decoder needs as
             `encoder_outputs`. Defaults to False so existing callers that
             unpack `(hidden, cell)` keep working.

        Returns (hidden, cell), or (outputs, hidden, cell) when
        return_outputs is True. `outputs` is (src_len, batch, hidden_dim);
        `hidden` and `cell` are (n_layers, batch, hidden_dim).
        """
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        if return_outputs:
            return outputs, hidden, cell
        return hidden, cell

class Attention(nn.Module):
    """Additive attention that scores each encoder time step against the decoder state."""

    def __init__(self, hidden_dim):
        """
        hidden_dim: width of both the decoder hidden state and each encoder
                    output vector; the two are concatenated, hence the
                    `hidden_dim * 2` input of the scoring layer.
        """
        super().__init__()
        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """
        hidden:          decoder state; only the last entry along dim 0
                         (`hidden[-1]`) is used. Expected shape
                         (n_layers, batch, hidden_dim).
        encoder_outputs: (src_len, batch, hidden_dim).

        Returns attention weights of shape (src_len, batch), normalised with a
        softmax over the source positions (dim 0).
        """
        src_len = encoder_outputs.shape[0]
        # Broadcast the top-layer state across every source position; `expand`
        # creates a view instead of the src_len copies `repeat` would allocate.
        query = hidden[-1].unsqueeze(0).expand(src_len, -1, -1)
        energy = torch.tanh(self.attn(torch.cat((query, encoder_outputs), dim=2)))
        scores = self.v(energy).squeeze(2)
        return F.softmax(scores, dim=0)

class Decoder(nn.Module):
    """Single-step LSTM decoder that conditions on an attention-weighted encoder context."""

    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, dropout, attention):
        """
        output_dim: size of the target vocabulary
        emb_dim:    embedding width
        hidden_dim: LSTM hidden-state width; also the width of each encoder
                    output vector, and therefore of the context vector
        n_layers:   number of stacked LSTM layers
        dropout:    dropout probability
        attention:  module called as attention(hidden, encoder_outputs) that
                    returns weights of shape (src_len, batch)
        """
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        # The context vector is a weighted sum of encoder outputs, so it has
        # hidden_dim features (not 2 * hidden_dim: the encoder in this file is
        # unidirectional). The LSTM therefore sees emb_dim + hidden_dim inputs.
        self.rnn = nn.LSTM(hidden_dim + emb_dim, hidden_dim, n_layers, dropout=dropout)
        # fc_out consumes [LSTM output | context | embedding].
        self.fc_out = nn.Linear(hidden_dim + hidden_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell, encoder_outputs):
        """
        Decode one target time step.

        input:           (batch,) token ids of the previous target token
        hidden, cell:    (n_layers, batch, hidden_dim) LSTM state
        encoder_outputs: (src_len, batch, hidden_dim)

        Returns (prediction, hidden, cell), where prediction is
        (batch, output_dim) unnormalised scores over the target vocabulary.
        """
        input = input.unsqueeze(0)  # (1, batch)
        embedded = self.dropout(self.embedding(input))  # (1, batch, emb_dim)
        attn_weights = self.attention(hidden, encoder_outputs)  # (src_len, batch)
        # bmm needs (batch, 1, src_len) @ (batch, src_len, hidden_dim), so the
        # weights must be put batch-first before adding the singleton axis.
        attn_weights = attn_weights.transpose(0, 1).unsqueeze(1)
        weighted = torch.bmm(attn_weights, encoder_outputs.permute(1, 0, 2))  # (batch, 1, hidden_dim)
        weighted = weighted.permute(1, 0, 2)  # (1, batch, hidden_dim)
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
        prediction = self.fc_out(
            torch.cat((output.squeeze(0), weighted.squeeze(0), embedded.squeeze(0)), dim=1)
        )
        return prediction, hidden, cell

# Define model parameters
# Vocabulary sizes: these become the row counts of the embedding tables
INPUT_DIM = 3000
OUTPUT_DIM = 3000

# Embedding widths, LSTM width/depth, and dropout probability
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
DROPOUT = 0.5

# Build the attention module first because the decoder takes it as an argument
attn = Attention(hidden_dim=HID_DIM)
encoder = Encoder(
    input_dim=INPUT_DIM,
    emb_dim=ENC_EMB_DIM,
    hidden_dim=HID_DIM,
    n_layers=N_LAYERS,
    dropout=DROPOUT,
)
decoder = Decoder(
    output_dim=OUTPUT_DIM,
    emb_dim=DEC_EMB_DIM,
    hidden_dim=HID_DIM,
    n_layers=N_LAYERS,
    dropout=DROPOUT,
    attention=attn,
)


In [4]:
# Part 3: Machine Translation Dataset and Training
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu

# Load dataset
dataset = load_dataset("opus_books", "en-fr")
train_data = dataset["train"]

# Prepare data: walk the training split once and collect both sides of each pair
src_texts = []
trg_texts = []
for example in train_data:
    pair = example["translation"]
    src_texts.append(pair["en"])
    trg_texts.append(pair["fr"])

print(f"Loaded {len(src_texts)} sentence pairs for training.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loaded 127085 sentence pairs for training.


In [5]:
# Part 4: Simplified Transformer Model

import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader, Dataset

class TranslationDataset(Dataset):
    """Pairs of source/target sentences, tokenized on access to fixed-length id tensors."""

    def __init__(self, src_texts, trg_texts, tokenizer, max_len=50):
        """
        src_texts: sequence of source-language sentences
        trg_texts: sequence of target-language sentences, aligned with src_texts
        tokenizer: callable invoked as
                   tokenizer(text, padding=..., max_length=..., truncation=...,
                   return_tensors='pt') whose result exposes `.input_ids`
        max_len:   length every sentence is padded or truncated to
        """
        self.src_texts = src_texts
        self.trg_texts = trg_texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of sentence pairs, taken from the source side."""
        return len(self.src_texts)

    def _encode(self, text):
        """Tokenize one sentence into a 1-D tensor of `max_len` token ids."""
        # Use the Hugging Face tokenizer directly
        encoded = self.tokenizer(
            text,
            padding='max_length',
            max_length=self.max_len,
            truncation=True,
            return_tensors='pt',
        )
        # Drop only the leading batch axis. A bare squeeze() would also remove
        # the sequence axis when max_len == 1 and return a 0-D tensor.
        return encoded.input_ids.squeeze(0)

    def __getitem__(self, idx):
        """Return the (src, trg) token-id tensors for pair `idx`."""
        src = self._encode(self.src_texts[idx])
        trg = self._encode(self.trg_texts[idx])
        return src, trg

class Transformer(nn.Module):
    """Encoder-only Transformer that maps token ids to per-position vocabulary scores."""

    def __init__(self, input_dim, output_dim, emb_dim=64, n_heads=2, n_layers=2, ff_dim=128, dropout=0.1, max_len=100):
        """
        input_dim:  size of the input vocabulary
        output_dim: size of the output vocabulary
        emb_dim:    model/embedding width
        n_heads:    attention heads per layer
        n_layers:   number of encoder layers
        ff_dim:     width of each layer's feed-forward sub-network
        dropout:    dropout probability inside the encoder layers
        max_len:    longest sequence the positional-encoding table supports
        """
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        # Registered as a buffer so it follows the module across .to(device);
        # non-persistent so the state_dict keys stay exactly as before.
        self.register_buffer(
            "positional_encoding",
            self.get_positional_encoding(max_len, emb_dim),
            persistent=False,
        )
        # batch_first=True: inputs are (batch, seq_len), the layout a DataLoader yields.
        self.encoder_layers = nn.TransformerEncoderLayer(
            d_model=emb_dim,
            nhead=n_heads,
            dim_feedforward=ff_dim,
            dropout=dropout,
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(self.encoder_layers, num_layers=n_layers)
        self.fc_out = nn.Linear(emb_dim, output_dim)

    def get_positional_encoding(self, max_len, emb_dim):
        """
        Build the sinusoidal positional-encoding table.

        Returns a float tensor of shape (max_len, emb_dim) with sines in the
        even columns and cosines in the odd columns.
        """
        position = np.arange(max_len)[:, np.newaxis]
        div_term = np.exp(np.arange(0, emb_dim, 2) * -(np.log(10000.0) / emb_dim))
        angles = position * div_term
        pe = np.zeros((max_len, emb_dim))
        pe[:, 0::2] = np.sin(angles)
        # With an odd emb_dim there is one fewer odd column than even column,
        # so trim the cosine block to fit instead of failing on the assignment.
        pe[:, 1::2] = np.cos(angles[:, : emb_dim // 2])
        return torch.tensor(pe, dtype=torch.float)

    def forward(self, src):
        """
        src: integer token ids shaped (batch, seq_len), or (seq_len,) for a
             single unbatched sequence.

        Returns scores shaped like src with a trailing output_dim axis.
        Raises ValueError when seq_len exceeds the positional-encoding table.
        """
        seq_len = src.size(-1)
        table_len = self.positional_encoding.size(0)
        if seq_len > table_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the maximum supported length {table_len}."
            )
        # (seq_len, emb_dim) broadcasts over the leading batch axis, if any.
        embedded = self.embedding(src) + self.positional_encoding[:seq_len]
        encoder_output = self.encoder(embedded)
        output = self.fc_out(encoder_output)
        return output

# Instantiate with the shared vocabulary sizes; every other hyperparameter keeps its default.
# NOTE(review): token ids fed to this model must be < INPUT_DIM — confirm against the tokenizer's vocabulary size.
transformer = Transformer(input_dim=INPUT_DIM, output_dim=OUTPUT_DIM)



In [None]:
# Apply Transformer to Dataset and BLEU Score Evaluation
def calculate_bleu(reference, candidate):
    """
    Sentence-level BLEU of `candidate` against a single `reference`.

    Both arguments are plain strings; they are split on whitespace into tokens
    before being handed to NLTK's sentence_bleu.
    """
    reference_tokens = reference.split()
    candidate_tokens = candidate.split()
    # sentence_bleu takes a list of references, so wrap the single one.
    return sentence_bleu([reference_tokens], candidate_tokens)

# AutoTokenizer was used without ever being imported, which raises NameError on a fresh kernel.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr") # or any other suitable tokenizer

# The embedding table must cover every id the tokenizer can emit, otherwise the
# lookup fails with an index error. Rebuild the (still untrained) model when it
# was created with a smaller vocabulary.
vocab_size = len(tokenizer)
if transformer.embedding.num_embeddings < vocab_size:
    transformer = Transformer(vocab_size, vocab_size)

dataset = TranslationDataset(src_texts[:1000], trg_texts[:1000], tokenizer)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Evaluation only: switch dropout off and skip autograd bookkeeping.
transformer.eval()
with torch.no_grad():
    for src_batch, trg_batch in dataloader:
        output = transformer(src_batch)
        # Decode ids back to text. Joining raw ids with "" produced one giant
        # "word" per sentence, so BLEU had no n-grams to compare.
        pred_sentences = tokenizer.batch_decode(output.argmax(-1).tolist(), skip_special_tokens=True)
        ref_sentences = tokenizer.batch_decode(trg_batch.tolist(), skip_special_tokens=True)
        bleu_scores = [calculate_bleu(ref, pred) for ref, pred in zip(ref_sentences, pred_sentences)]
        avg_bleu = sum(bleu_scores) / len(bleu_scores)
        print(f"Average BLEU Score: {avg_bleu:.4f}")



In general, a Transformer may perform better than the LSTM-based Seq2Seq model because self-attention lets it capture long-range dependencies, but it can also perform worse if it is not trained sufficiently. Additionally, the Transformer may take longer to train, yet it can be faster at inference time.
