# 1. Imports and setup

In [None]:
# Import all necessary libraries
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn.functional as F
import math

In [None]:
#print(torch.cuda.get_device_name(0))

# 2. Multi-Head Attention

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The model dimension ``d_model`` is split evenly across ``num_heads``
    heads of width ``d_k = d_model // num_heads``. Query, key and value
    inputs are each projected by their own linear layer, attended per head,
    concatenated back to ``d_model`` and mixed by a final linear layer.

    Args:
        num_heads: Number of attention heads; must divide ``d_model``.
        d_model: Width of the input and output embeddings.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, num_heads, d_model, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.num_heads = num_heads
        # Each head works on its own d_k-wide slice of the projected vectors.
        self.d_k = d_model // num_heads

        # Three independent d_model -> d_model projections (x @ W.T + b).
        # The full-width projection is computed first and sliced into heads
        # afterwards, so every head sees a different part of Q, K and V.
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)

        # Output projection that mixes the concatenated heads.
        self.out = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)

    def scaled_dot_product_attention(self, Q, K, V, mask=None, dropout=None):
        """Compute softmax(Q K^T / sqrt(d_k)) V.

        Args:
            Q, K, V: Tensors of shape (batch, heads, seq_len, d_k).
            mask: Optional tensor broadcastable to the score shape
                (e.g. (batch, 1, 1, seq_len_k)); positions where
                ``mask == 0`` are excluded from attention.
            dropout: Optional callable applied to the attention weights.

        Returns:
            Tuple ``(output, attn)``: the attended values and the attention
            weights.
        """
        d_k = Q.size(-1)
        # Scaling by sqrt(d_k) keeps the dot products in a range where the
        # softmax does not saturate when d_k is large.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (d_k ** 0.5)

        if mask is not None:
            # Use the most negative finite value rather than -inf: a row whose
            # keys are all masked would otherwise become softmax(-inf, ...) =
            # NaN and poison outputs and gradients. Masked positions still
            # receive a weight of exactly zero whenever the row has at least
            # one visible key.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        attn = F.softmax(scores, dim=-1)

        if dropout is not None:
            attn = dropout(attn)

        output = torch.matmul(attn, V)
        return output, attn

    def forward(self, query, key, value, mask=None):
        """Run multi-head attention.

        Args:
            query, key, value: Tensors of shape (batch, seq_len, d_model).
                For self-attention all three are the same embedding tensor.
            mask: Optional mask forwarded to
                ``scaled_dot_product_attention``.

        Returns:
            Tuple ``(output, attn)`` where ``output`` has shape
            (batch, seq_len_q, d_model) and ``attn`` has shape
            (batch, heads, seq_len_q, seq_len_k).
        """
        batch_size = query.size(0)

        # 1) Linear projections; each result is (batch, seq_len, d_model).
        Q = self.q_linear(query)
        K = self.k_linear(key)
        V = self.v_linear(value)

        # 2) Split into heads: (batch, seq_len, heads, d_k), then move the
        #    head axis forward. The -1 lets PyTorch infer seq_len. Viewing
        #    directly as (batch, heads, seq_len, d_k) would be wrong because
        #    it would reinterpret the memory layout instead of slicing each
        #    token's vector into per-head chunks.
        Q = Q.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = K.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = V.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

        # 3) Attention is computed independently for every head.
        x, attn = self.scaled_dot_product_attention(Q, K, V, mask, self.dropout)

        # 4) Concatenate heads. transpose() only changes strides, and view()
        #    needs contiguous memory, hence the contiguous() copy in between.
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_k)

        # 5) The output projection lets the model learn how to combine and
        #    weight the heads, which are merely side by side after step 4.
        output = self.out(x)

        return output, attn


# 5. Positional Encoding

In [None]:
# The positional encoding matrix is computed once, stored in the module, and reused for every sentence — only the needed portion is added each time.
# These sine–cosine patterns are fixed mathematical signals. They don't depend on the data and don't need to be learned.
# That's why they're stored as a buffer using self.register_buffer('pe', pe): they move with the model (to GPU/CPU) but aren't updated by backpropagation.

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to token embeddings.

    Implemented as an ``nn.Module`` (rather than a plain function) so the
    precomputed table is stored with the model, saved/loaded with it and
    moved between devices together with it.

    Args:
        d_model: Embedding width. Both even and odd values are supported.
        max_len: Largest sequence length the table covers. This is a global
            limit for the module, not a per-sentence setting.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()

        # Column vector of positions [0, 1, ..., max_len - 1]: (max_len, 1).
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # 1 / 10000^(2i / d_model), computed as exp(-2i * ln(10000) / d_model)
        # for the even indices 2i = 0, 2, 4, ...
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # (max_len, ceil(d_model / 2)) grid of angles shared by sin and cos.
        angles = position * div_term

        # One row per position, one column per embedding dimension.
        pe = torch.zeros(max_len, d_model)
        # Even columns (0, 2, 4, ...) take the sine.
        pe[:, 0::2] = torch.sin(angles)
        # Odd columns (1, 3, 5, ...) take the cosine. With an odd d_model
        # there is one fewer odd column than even column, so the angles are
        # trimmed to match; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Add a leading batch axis -> (1, max_len, d_model) so the table
        # broadcasts against inputs of shape (batch_size, seq_len, d_model).
        pe = pe.unsqueeze(0)

        # A buffer is not a learnable parameter but follows the module
        # across devices and is part of its state.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add positional encodings to ``x``.

        Args:
            x: Tensor of shape (batch_size, seq_len, d_model).

        Returns:
            Tensor of the same shape with the first ``seq_len`` rows of the
            encoding table added to every batch element.
        """
        # Slicing the second axis keeps all of the trailing d_model axis.
        x = x + self.pe[:, :x.size(1)]
        return x


# 6. Positionwise Feed Forward layer

In [None]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied to every position independently.

    Args:
        d_model: Input and output width (e.g. 512).
        d_ff: Hidden width of the expansion layer (e.g. 2048).
        dropout: Dropout probability applied after the activation.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        # Expand d_model -> d_ff, then project d_ff -> d_model again.
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        # Randomly zeroes activations during training for regularization.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``linear2(dropout(relu(linear1(x))))``."""
        expanded = self.linear1(x)
        activated = torch.relu(expanded)
        regularized = self.dropout(activated)
        return self.linear2(regularized)


# 7. Encoder Layer

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention followed by a feed-forward network.

    Each sublayer output goes through dropout, is added to the sublayer's
    input (residual connection) and is then layer-normalized.

    Args:
        num_heads: Number of attention heads.
        d_model: Embedding width.
        d_ff: Hidden width of the feed-forward sublayer.
        dropout: Dropout probability, used inside both sublayers and on
            their outputs.
    """

    def __init__(self, num_heads, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(num_heads, d_model, dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        # The constructor argument is only a rate; this is the actual layer.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Apply the block to ``x``; ``mask`` is passed to self-attention."""
        # Calling a module invokes its forward(); query, key and value are
        # all ``x`` because this is self-attention. The weights are unused.
        attended, _weights = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attended))

        # Feed-forward sublayer with the same residual + norm pattern.
        x = self.norm2(x + self.dropout(self.feed_forward(x)))
        return x

# 8. Decoder Layer

In [None]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, encoder-decoder attention, feed-forward.

    Every sublayer output goes through dropout, is added to the sublayer's
    input (residual connection) and is then layer-normalized.

    Args:
        num_heads: Number of attention heads in both attention sublayers.
        d_model: Embedding width.
        d_ff: Hidden width of the feed-forward sublayer.
        dropout: Dropout probability, used inside the sublayers and on
            their outputs.
    """

    def __init__(self, num_heads, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(num_heads, d_model, dropout)
        # Cross-attention: lets the decoder pull information out of the
        # encoder's output.
        self.enc_dec_attn = MultiHeadAttention(num_heads, d_model, dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        """Apply the block.

        Args:
            x: Decoder hidden states.
            enc_output: Encoder output used as keys and values in
                cross-attention.
            src_mask: Mask passed to the cross-attention sublayer.
            tgt_mask: Mask passed to the self-attention sublayer
                (presumably a causal/padding mask - confirm with caller).
        """
        # 1) Self-attention over the decoder's own states.
        self_ctx, _ = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_ctx))

        # 2) Encoder-decoder attention: "while predicting this word, which
        #    encoder words should I pay attention to?" Queries come from the
        #    decoder states, keys and values from the encoder output.
        cross_ctx, _ = self.enc_dec_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(cross_ctx))

        # 3) Position-wise feed-forward network.
        x = self.norm3(x + self.dropout(self.feed_forward(x)))
        return x

# 9. Encoder Stack

In [None]:
class Encoder(nn.Module):
    """Stack of ``num_layers`` encoder layers followed by a final LayerNorm.

    Args:
        num_layers: Number of ``EncoderLayer`` blocks in the stack.
        num_heads: Attention heads per layer.
        d_model: Embedding width.
        d_ff: Hidden width of each layer's feed-forward sublayer.
        dropout: Dropout probability passed to every layer.
    """

    def __init__(self, num_layers, num_heads, d_model, d_ff, dropout=0.1):
        super().__init__()
        # ModuleList registers every layer so its parameters are tracked.
        self.layers = nn.ModuleList(
            EncoderLayer(num_heads, d_model, d_ff, dropout) for _ in range(num_layers)
        )
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        """Pass ``x`` through every layer in order, then normalize."""
        hidden = x
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, mask)
        return self.norm(hidden)

# 10. Decoder Stack

In [None]:
class Decoder(nn.Module):
    """Stack of ``num_layers`` decoder layers followed by a final LayerNorm.

    Args:
        num_layers: Number of ``DecoderLayer`` blocks in the stack.
        num_heads: Attention heads per layer.
        d_model: Embedding width.
        d_ff: Hidden width of each layer's feed-forward sublayer.
        dropout: Dropout probability passed to every layer.
    """

    def __init__(self, num_layers, num_heads, d_model, d_ff, dropout=0.1):
        super().__init__()
        # ModuleList registers every layer so its parameters are tracked.
        self.layers = nn.ModuleList(
            DecoderLayer(num_heads, d_model, d_ff, dropout) for _ in range(num_layers)
        )
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        """Pass ``x`` through every layer with the same encoder output and masks, then normalize."""
        hidden = x
        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden, enc_output, src_mask, tgt_mask)
        return self.norm(hidden)

# 11. Load Dataset

In [None]:
!pip install -q datasets sentencepiece

from datasets import load_dataset

# Fetch the "de-en" configuration of the OPUS Books dataset and take its train split
dataset = load_dataset("opus_books", "de-en")
train = dataset["train"]

# Keep at most the first 50,000 rows (fewer if the split is smaller)
subset_size = min(50000, len(train))
subset = train.select(range(subset_size))

# Read the "translation" column once, then split every record by language key
translations = subset["translation"]
src_texts = [pair["de"] for pair in translations]
tgt_texts = [pair["en"] for pair in translations]

print("Loaded", len(src_texts), "sentence pairs")
print("Example:\n", src_texts[0], "‚Üí", tgt_texts[0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

de-en/train-00000-of-00001.parquet:   0%|          | 0.00/8.80M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51467 [00:00<?, ? examples/s]

Loaded 50000 sentence pairs
Example:
 Source: http://www.zeno.org - Contumax GmbH & Co. KG ‚Üí Source: Project Gutenberg


In [None]:
# Write the German and English sentences interleaved, one per line, into a
# single file so one shared BPE vocabulary can be trained on both languages
with open("train_combined.txt", "w", encoding="utf-8") as combined:
    for de, en in zip(src_texts, tgt_texts):
        combined.write(f"{de.strip()}\n{en.strip()}\n")

print("Combined text file created with", len(src_texts) * 2, "lines")

Combined text file created with 100000 lines


Train SentencePiece BPE Tokenizer (shared vocab)

In [None]:
import sentencepiece as spm

# Options for the shared SentencePiece model trained on the combined corpus
trainer_options = dict(
    input="train_combined.txt",
    model_prefix="bpe",       # output files are named bpe.model / bpe.vocab
    vocab_size=37000,         # same as paper
    model_type="bpe",         # Byte-Pair Encoding
    character_coverage=1.0,   # cover all characters
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
)
spm.SentencePieceTrainer.train(**trainer_options)

print("‚úÖ SentencePiece tokenizer trained! Files generated: bpe.model, bpe.vocab")

‚úÖ SentencePiece tokenizer trained! Files generated: bpe.model, bpe.vocab


Load Tokenizer & Test Encoding/Decoding

In [None]:
# Load the trained model from disk
sp = spm.SentencePieceProcessor(model_file="bpe.model")

# Pick one sentence pair and encode both sides to integer ids
src_example, tgt_example = src_texts[10], tgt_texts[10]
src_ids = sp.encode(src_example, out_type=int)
tgt_ids = sp.encode(tgt_example, out_type=int)

# Show each sentence with the first 20 of its token ids
for text_label, text, ids_label, ids in (
    ("German:", src_example, "‚Üí src_ids:", src_ids),
    ("English:", tgt_example, "‚Üí tgt_ids:", tgt_ids),
):
    print(text_label, text)
    print(ids_label, ids[:20])

# Decoding the ids should give the source sentence back
print("Decoded back (src):", sp.decode(src_ids))

German: »Jane, ich liebe weder Spitzfindigkeiten noch Fragen; außerdem ist es gradezu widerlich, wenn ein Kind ältere Leute in dieser Weise zur Rede stellt.
‚Üí src_ids: [94, 5110, 36864, 147, 2815, 3031, 9272, 17226, 3561, 307, 4557, 36888, 6878, 217, 179, 22058, 357, 28596, 36864, 428]
English: "Jane, I don't like cavillers or questioners; besides, there is something truly forbidding in a child taking up her elders in that manner.
‚Üí tgt_ids: [150, 5110, 36864, 63, 944, 36877, 36851, 547, 9366, 178, 229, 363, 1954, 229, 36888, 4381, 36864, 399, 128, 1041]
Decoded back (src): »Jane, ich liebe weder Spitzfindigkeiten noch Fragen; außerdem ist es gradezu widerlich, wenn ein Kind ältere Leute in dieser Weise zur Rede stellt.


Prepare Tokenized Tensors

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence

def prepare_example(src_text, tgt_text, sp, bos_id=2, eos_id=3):
    """Tokenize one sentence pair into encoder/decoder training tensors.

    Args:
        src_text: Source-language sentence.
        tgt_text: Target-language sentence.
        sp: Tokenizer exposing ``encode(text, out_type=int)`` that returns a
            list of integer token ids.
        bos_id: Id of the start-of-sequence token. The default of 2 matches
            the id this function previously hard-coded.
        eos_id: Id of the end-of-sequence token. The default of 3 matches
            the id this function previously hard-coded.

    Returns:
        Tuple ``(src, tgt_in, tgt_out)`` of 1-D ``torch.long`` tensors:
        ``src`` is ``<bos> + source + <eos>``, ``tgt_in`` is
        ``<bos> + target`` (decoder input) and ``tgt_out`` is
        ``target + <eos>`` (decoder labels, i.e. ``tgt_in`` shifted by one).
    """
    src_ids = sp.encode(src_text, out_type=int)
    tgt_ids = sp.encode(tgt_text, out_type=int)

    src = [bos_id, *src_ids, eos_id]
    tgt_in = [bos_id, *tgt_ids]
    tgt_out = [*tgt_ids, eos_id]

    # Pin the dtype so the tensors are always integer indices.
    return (
        torch.tensor(src, dtype=torch.long),
        torch.tensor(tgt_in, dtype=torch.long),
        torch.tensor(tgt_out, dtype=torch.long),
    )


def collate_fn(batch):
    """Pad a list of ``(src, tgt_in, tgt_out)`` tensor triples into batch tensors.

    Each of the three fields is padded independently with 0 up to the
    longest sequence of that field in the batch, with the batch axis first.

    Returns:
        Tuple ``(src_batch, tgt_in_batch, tgt_out_batch)``.
    """
    src_seqs, tgt_in_seqs, tgt_out_seqs = zip(*batch)

    def pad(sequences):
        # Right-pad with 0 and stack along a leading batch axis.
        return pad_sequence(sequences, batch_first=True, padding_value=0)

    return pad(src_seqs), pad(tgt_in_seqs), pad(tgt_out_seqs)


Build a Small Dataset Loader

In [None]:
from torch.utils.data import DataLoader

# Tokenize only the first 1000 sentence pairs so experiments run quickly
pairs = list(zip(src_texts[:1000], tgt_texts[:1000]))
tokenized_data = [prepare_example(de_text, en_text, sp) for de_text, en_text in pairs]

# Shuffled mini-batches of 32 examples, padded per batch by collate_fn
train_loader = DataLoader(
    tokenized_data,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)

# Look at a single batch to confirm the padded tensor shapes, then stop
for src, tgt_in, tgt_out in train_loader:
    print("Batch shapes ‚Üí src:", src.shape, "tgt_in:", tgt_in.shape, "tgt_out:", tgt_out.shape)
    break

Batch shapes ‚Üí src: torch.Size([32, 121]) tgt_in: torch.Size([32, 97]) tgt_out: torch.Size([32, 97])


# 12. Training loop