# 1. Imports and setup

In [None]:
# Import all necessary libraries
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn.functional as F
import math

In [None]:
#print(torch.cuda.get_device_name(0))

# 2. Multi-Head Attention

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The inputs are projected to queries, keys and values by three independent
    linear layers, split into ``num_heads`` slices of width
    ``d_k = d_model // num_heads``, attended per head, merged back into
    ``d_model`` features and finally mixed by an output projection.

    Args:
        num_heads: Number of attention heads; must divide ``d_model``.
        d_model: Feature size of the inputs and of the output.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, num_heads, d_model, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # width of the slice each head works on

        # Full-width projections: Q, K and V are computed for all heads at
        # once and only afterwards sliced into per-head chunks in forward().
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)

        # Output projection that mixes the concatenated head outputs.
        self.out = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)

    def scaled_dot_product_attention(self, Q, K, V, mask=None, dropout=None):
        """Compute ``softmax(Q K^T / sqrt(d_k)) V``.

        Args:
            Q, K, V: Tensors whose last two dimensions are ``(seq_len, d_k)``;
                forward() passes ``(batch, heads, seq_len, d_k)``.
            mask: Optional tensor broadcastable to the score matrix. Positions
                where ``mask == 0`` are excluded from attention.
            dropout: Optional callable applied to the attention weights.

        Returns:
            Tuple ``(output, attn)``: the weighted values and the attention
            weights.
        """
        d_k = Q.size(-1)
        # Scaling by sqrt(d_k) keeps the dot products in a range where the
        # softmax does not saturate when d_k is large.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (d_k ** 0.5)

        if mask is not None:
            # Fill with the most negative finite value instead of -inf: the
            # masked weights still come out as exactly 0 after the softmax,
            # but a row in which every key is masked yields a uniform
            # distribution rather than NaN (softmax over all -inf is 0/0).
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        attn = F.softmax(scores, dim=-1)

        if dropout is not None:
            attn = dropout(attn)

        output = torch.matmul(attn, V)
        return output, attn

    def forward(self, query, key, value, mask=None):
        """Run multi-head attention.

        Args:
            query, key, value: Tensors of shape ``(batch, seq_len, d_model)``.
                For self-attention all three are the same tensor.
            mask: Optional mask forwarded to scaled_dot_product_attention().

        Returns:
            Tuple ``(output, attn)`` where ``output`` has shape
            ``(batch, seq_len_q, d_model)`` and ``attn`` holds the per-head
            attention weights.
        """
        batch_size = query.size(0)

        # 1) Linear projections, each still (batch, seq_len, d_model).
        Q = self.q_linear(query)
        K = self.k_linear(key)
        V = self.v_linear(value)

        # 2) Split the feature dimension into heads, then move the head axis
        #    in front of the sequence axis: (batch, heads, seq_len, d_k).
        #    The -1 lets PyTorch infer seq_len. Viewing directly as
        #    (batch, heads, seq_len, d_k) would misread the memory layout,
        #    hence view-then-transpose.
        Q = Q.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = K.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = V.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

        # 3) Attention is evaluated independently for every head.
        x, attn = self.scaled_dot_product_attention(Q, K, V, mask, self.dropout)

        # 4) Undo the transpose and merge the heads back into d_model features.
        #    transpose() only changes strides, so contiguous() is required
        #    before view() can reinterpret the layout.
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_k)

        # 5) Final projection: lets the model learn how to combine the heads,
        #    which until now merely sit side by side in the feature dimension.
        output = self.out(x)

        return output, attn


# 5. Positional Encoding

In [None]:
# The positional encoding matrix is computed once, stored on the module, and reused for every sentence — only the needed slice is added each time.
# These sine–cosine patterns are fixed mathematical signals. They don't depend on the data and don't need to be learned.
# That's why they are stored as a buffer via self.register_buffer('pe', pe): they move with the model (to GPU/CPU) but aren't updated by backpropagation.

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to a batch of embeddings.

    The table is built once in the constructor and registered as a buffer, so
    it follows the module across devices and is part of the state dict, but is
    not a trainable parameter.

    Args:
        d_model: Embedding width. Both even and odd values are supported.
        max_len: Number of positions the table covers; inputs passed to
            forward() must not be longer than this.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()

        # One row per position, one column per embedding dimension.
        pe = torch.zeros(max_len, d_model)

        # Column vector of positions [0, 1, ..., max_len - 1], shape (max_len, 1).
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # 1 / 10000^(2i / d_model), computed in log space; one entry per
        # even-indexed column, i.e. ceil(d_model / 2) frequencies.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        angles = position * div_term  # (max_len, ceil(d_model / 2))

        # Sine on even columns (0, 2, 4, ...).
        pe[:, 0::2] = torch.sin(angles)

        # Cosine on odd columns (1, 3, 5, ...). There are only d_model // 2 of
        # them, one fewer than the even columns when d_model is odd, so the
        # angle table is trimmed to fit instead of raising a shape error.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading batch axis -> (1, max_len, d_model) so the table broadcasts
        # against inputs of shape (batch, seq_len, d_model).
        pe = pe.unsqueeze(0)

        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``seq_len`` positions.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, d_model)``.
        """
        # Only the first seq_len rows of the table are needed; the trailing
        # feature dimension is kept in full implicitly.
        x = x + self.pe[:, :x.size(1)]
        return x


# 6. Positionwise Feed Forward layer

In [None]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied to every position independently.

    Args:
        d_model: Input and output feature size.
        d_ff: Width of the hidden layer.
        dropout: Dropout probability applied to the hidden activations.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)  # expand: d_model -> d_ff
        self.linear2 = nn.Linear(d_ff, d_model)  # project back: d_ff -> d_model
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Expand, apply ReLU and dropout, then project back to ``d_model``."""
        hidden = self.linear1(x)
        hidden = F.relu(hidden)
        hidden = self.dropout(hidden)
        return self.linear2(hidden)


# 7. Encoder Layer

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention followed by a feed-forward network.

    Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        num_heads: Number of attention heads.
        d_model: Feature size of the layer's input and output.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability used throughout the block.
    """

    def __init__(self, num_heads, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(num_heads, d_model, dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        # Dropout module shared by both residual branches.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Apply the block to ``x``; ``mask`` is forwarded to the attention."""
        # Self-attention sub-layer: queries, keys and values are all x.
        # The attention weights are returned as well but are not needed here.
        attended, _weights = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attended))

        # Feed-forward sub-layer with the same residual + LayerNorm wrapping.
        transformed = self.feed_forward(x)
        return self.norm2(x + self.dropout(transformed))

# 8. Decoder Layer

In [None]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, encoder-decoder attention, feed-forward.

    Each of the three sub-layers is wrapped as
    ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        num_heads: Number of attention heads in both attention sub-layers.
        d_model: Feature size of the layer's input and output.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability used throughout the block.
    """

    def __init__(self, num_heads, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(num_heads, d_model, dropout)
        # Cross-attention: lets the decoder read from the encoder output.
        self.enc_dec_attn = MultiHeadAttention(num_heads, d_model, dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        """Apply the block.

        Args:
            x: Decoder hidden states.
            enc_output: Encoder output, used as keys and values in the
                encoder-decoder attention.
            src_mask: Mask passed to the encoder-decoder attention.
            tgt_mask: Mask passed to the decoder self-attention; which
                positions each token may see is determined entirely by it.
        """
        # 1) Self-attention over the target sequence, restricted by tgt_mask.
        self_attended, _ = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_attended))

        # 2) Encoder-decoder attention: decoder states are the queries, the
        #    encoder output supplies keys and values ("which source positions
        #    matter for this target position?").
        cross_attended, _ = self.enc_dec_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(cross_attended))

        # 3) Position-wise feed-forward network.
        transformed = self.feed_forward(x)
        return self.norm3(x + self.dropout(transformed))

# 9. Encoder Stack

In [None]:
class Encoder(nn.Module):
    """Stack of ``num_layers`` EncoderLayer blocks followed by a LayerNorm.

    Args:
        num_layers: Number of encoder blocks.
        num_heads, d_model, d_ff, dropout: Forwarded to every EncoderLayer.
    """

    def __init__(self, num_layers, num_heads, d_model, d_ff, dropout=0.1):
        super().__init__()
        blocks = []
        for _ in range(num_layers):
            blocks.append(EncoderLayer(num_heads, d_model, d_ff, dropout))
        self.layers = nn.ModuleList(blocks)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        """Pass ``x`` through every block in order, then normalise the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

# 10. Decoder Stack

In [None]:
class Decoder(nn.Module):
    """Stack of ``num_layers`` DecoderLayer blocks followed by a LayerNorm.

    Args:
        num_layers: Number of decoder blocks.
        num_heads, d_model, d_ff, dropout: Forwarded to every DecoderLayer.
    """

    def __init__(self, num_layers, num_heads, d_model, d_ff, dropout=0.1):
        super().__init__()
        blocks = []
        for _ in range(num_layers):
            blocks.append(DecoderLayer(num_heads, d_model, d_ff, dropout))
        self.layers = nn.ModuleList(blocks)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        """Pass ``x`` through every block, each one also reading ``enc_output``."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, enc_output, src_mask, tgt_mask)
        return self.norm(hidden)

# 11. Load Dataset

In [None]:
!pip install -q datasets sentencepiece

from datasets import load_dataset

# Load the German-English configuration of the OPUS Books dataset.
dataset = load_dataset("opus_books", "de-en")
train = dataset["train"]

# Keep at most the first 50,000 rows; select() takes the row indices to keep.
n_keep = min(50000, len(train))
subset = train.select(range(n_keep))

# Every entry of the "translation" column is a dict holding the "de" and
# "en" versions of one sentence pair.
translations = subset["translation"]
src_texts = [pair["de"] for pair in translations]
tgt_texts = [pair["en"] for pair in translations]

print("Loaded", len(src_texts), "sentence pairs")
print("Example:\n", src_texts[0], "‚Üí", tgt_texts[0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

de-en/train-00000-of-00001.parquet:   0%|          | 0.00/8.80M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51467 [00:00<?, ? examples/s]

Loaded 50000 sentence pairs
Example:
 Source: http://www.zeno.org - Contumax GmbH & Co. KG ‚Üí Source: Project Gutenberg


In [None]:
# Write German and English sentences, interleaved one per line, into a single
# file so that one shared BPE vocabulary can be trained on both languages.
with open("train_combined.txt", "w", encoding="utf-8") as f:
    f.writelines(
        sentence.strip() + "\n"
        for pair in zip(src_texts, tgt_texts)
        for sentence in pair
    )

print("Combined text file created with", len(src_texts) * 2, "lines")

Combined text file created with 100000 lines


Train SentencePiece BPE Tokenizer (shared vocab)

In [None]:
import sentencepiece as spm

# Options for the SentencePiece trainer, collected in one place.
trainer_options = dict(
    input="train_combined.txt",   # corpus with both languages
    model_prefix="bpe",           # output files: bpe.model, bpe.vocab
    vocab_size=37000,             # chosen to match the paper (per the original note)
    model_type="bpe",             # Byte-Pair Encoding
    character_coverage=1.0,       # cover all characters
    # Special-token ids; the rest of the notebook hard-codes 0 = pad,
    # 2 = <sos> and 3 = <eos>, so these must stay in sync with it.
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
)
spm.SentencePieceTrainer.train(**trainer_options)

print("‚úÖ SentencePiece tokenizer trained! Files generated: bpe.model, bpe.vocab")

‚úÖ SentencePiece tokenizer trained! Files generated: bpe.model, bpe.vocab


Load Tokenizer & Test Encoding/Decoding

In [None]:
# Load the trained tokenizer from disk.
sp = spm.SentencePieceProcessor(model_file="bpe.model")

# Pick one sentence pair and encode both sides to token ids.
sample_index = 10
src_example, tgt_example = src_texts[sample_index], tgt_texts[sample_index]
src_ids, tgt_ids = (
    sp.encode(text, out_type=int) for text in (src_example, tgt_example)
)

# Show the text next to its first 20 token ids.
print("German:", src_example)
print("‚Üí src_ids:", src_ids[:20])
print("English:", tgt_example)
print("‚Üí tgt_ids:", tgt_ids[:20])

# Round trip: decoding the ids should reproduce the source sentence.
print("Decoded back (src):", sp.decode(src_ids))

German: ¬ªJane, ich liebe weder Spitzfindigkeiten noch Fragen; au√üerdem ist es gradezu widerlich, wenn ein Kind √§ltere Leute in dieser Weise zur Rede stellt.
‚Üí src_ids: [94, 5110, 36864, 147, 2815, 3031, 9272, 17226, 3561, 307, 4557, 36888, 6878, 217, 179, 22058, 357, 28596, 36864, 428]
English: "Jane, I don't like cavillers or questioners; besides, there is something truly forbidding in a child taking up her elders in that manner.
‚Üí tgt_ids: [150, 5110, 36864, 63, 944, 36877, 36851, 547, 9366, 178, 229, 363, 1954, 229, 36888, 4381, 36864, 399, 128, 1041]
Decoded back (src): ¬ªJane, ich liebe weder Spitzfindigkeiten noch Fragen; au√üerdem ist es gradezu widerlich, wenn ein Kind √§ltere Leute in dieser Weise zur Rede stellt.


Prepare Tokenized Tensors

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence

def prepare_example(src_text, tgt_text, sp):
    """Tokenize one sentence pair into the three tensors used for training.

    Args:
        src_text: Source sentence.
        tgt_text: Target sentence.
        sp: Tokenizer exposing ``encode(text, out_type=int)``.

    Returns:
        Tuple ``(src, tgt_in, tgt_out)`` of 1-D integer tensors:
        ``src`` is ``<sos> + source + <eos>``, ``tgt_in`` is
        ``<sos> + target`` (decoder input) and ``tgt_out`` is
        ``target + <eos>`` (decoder target, shifted by one position).
    """
    sos, eos = 2, 3  # special-token ids used throughout the notebook

    source_ids = sp.encode(src_text, out_type=int)
    target_ids = sp.encode(tgt_text, out_type=int)

    encoder_input = torch.tensor([sos, *source_ids, eos])
    decoder_input = torch.tensor([sos, *target_ids])
    decoder_target = torch.tensor([*target_ids, eos])
    return encoder_input, decoder_input, decoder_target


def collate_fn(batch):
    """Pad a list of ``(src, tgt_in, tgt_out)`` examples into batch tensors.

    Each of the three fields is right-padded with 0 to the longest sequence of
    that field in the batch and returned with the batch dimension first.
    """
    sources, decoder_inputs, decoder_targets = zip(*batch)
    return tuple(
        pad_sequence(field, padding_value=0, batch_first=True)
        for field in (sources, decoder_inputs, decoder_targets)
    )


Build a Small Dataset Loader

In [None]:
from torch.utils.data import DataLoader

# 1) Pair each source sentence with its target sentence, keeping corpus order
pairs = list(zip(src_texts, tgt_texts))

# Drop very long sentences so that training is easier.
def filter_by_length(pairs, sp, max_src_len=60, max_tgt_len=60):
    """Return the pairs whose token counts fit within the given limits.

    Args:
        pairs: Iterable of ``(source_text, target_text)`` pairs.
        sp: Tokenizer exposing ``encode(text, out_type=int)``.
        max_src_len: Maximum number of source tokens (inclusive).
        max_tgt_len: Maximum number of target tokens (inclusive).

    Returns:
        List of ``(source_text, target_text)`` tuples in their original order.
    """
    def _token_count(text):
        return len(sp.encode(text, out_type=int))

    kept = []
    for de, en in pairs:
        src_len, tgt_len = _token_count(de), _token_count(en)
        if src_len > max_src_len or tgt_len > max_tgt_len:
            continue  # at least one side is too long
        kept.append((de, en))
    return kept

# Keep only pairs with at most 60 tokens on either side.
pairs = filter_by_length(pairs, sp, max_src_len=60, max_tgt_len=60)
print(f"After length filter: {len(pairs)} sentence pairs")


# 2) Sequential 90/10 split into train / val (no shuffling before the cut).
split_idx = int(0.9 * len(pairs))
train_pairs, val_pairs = pairs[:split_idx], pairs[split_idx:]

print(f"Total pairs: {len(pairs)}, train: {len(train_pairs)}, val: {len(val_pairs)}")

# 3) (Optional) cap both sets to keep the Colab runtime manageable.
train_pairs, val_pairs = train_pairs[:30000], val_pairs[:3000]

# 4) Turn every text pair into (src, tgt_in, tgt_out) tensors.
train_data = [prepare_example(*pair, sp) for pair in train_pairs]
val_data   = [prepare_example(*pair, sp) for pair in val_pairs]

# Only the training loader shuffles; both pad each batch via collate_fn.
batch_size = 32
loader_kwargs = dict(batch_size=batch_size, collate_fn=collate_fn)
train_loader = DataLoader(train_data, shuffle=True, **loader_kwargs)
val_loader   = DataLoader(val_data, shuffle=False, **loader_kwargs)

# Quick sanity check: print the shapes of the first training batch, if any.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    src, tgt_in, tgt_out = first_batch
    print("Batch shapes ‚Üí src:", src.shape, "tgt_in:", tgt_in.shape, "tgt_out:", tgt_out.shape)


After length filter: 45393 sentence pairs
Total pairs: 45393, train: 40853, val: 4540
Batch shapes ‚Üí src: torch.Size([32, 59]) tgt_in: torch.Size([32, 57]) tgt_out: torch.Size([32, 57])


# Transformer Wrapper

In [None]:
import math
import torch.nn as nn

class Transformer(nn.Module):
    """Encoder-decoder Transformer mapping source token ids to target logits.

    Args:
        src_vocab_size: Size of the source vocabulary.
        tgt_vocab_size: Size of the target vocabulary (also the logit width).
        num_layers: Number of blocks in both the encoder and decoder stacks.
        num_heads: Number of attention heads per block.
        d_model: Embedding / hidden feature size.
        d_ff: Hidden width of the feed-forward sub-layers.
        dropout: Dropout probability used throughout the model.
        max_len: Number of positions covered by the positional encodings.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, num_layers, num_heads, d_model, d_ff, dropout=0.1, max_len=5000):
        super().__init__()

        # Token embeddings for source and target.
        self.src_embed = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embed = nn.Embedding(tgt_vocab_size, d_model)

        # Fixed sinusoidal position signals, one module per side.
        self.pos_encoder = PositionalEncoding(d_model, max_len)
        self.pos_decoder = PositionalEncoding(d_model, max_len)

        # Encoder and decoder stacks.
        self.encoder = Encoder(num_layers, num_heads, d_model, d_ff, dropout)
        self.decoder = Decoder(num_layers, num_heads, d_model, d_ff, dropout)

        # Projection from decoder features to target-vocabulary logits.
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)

        self.dropout = nn.Dropout(dropout)
        self.d_model = d_model

    def forward(self, src, tgt_in, src_mask=None, tgt_mask=None):
        """Compute target-vocabulary logits for every decoder position.

        Args:
            src: Source token ids, ``(batch, src_len)``.
            tgt_in: Decoder input token ids, ``(batch, tgt_len)``.
            src_mask: Mask for attention over source positions.
            tgt_mask: Mask for the decoder self-attention.

        Returns:
            Logits of shape ``(batch, tgt_len, tgt_vocab_size)``.
        """
        # Embeddings are scaled by sqrt(d_model) before the position signal
        # is added.
        scale = math.sqrt(self.d_model)
        src = self.pos_encoder(self.src_embed(src) * scale)
        tgt = self.pos_decoder(self.tgt_embed(tgt_in) * scale)

        # Dropout goes on the SUM of embedding and positional encoding, as in
        # the original Transformer recipe. Applying it before the addition
        # would leave the position signal unregularised. Outputs in eval mode
        # are unaffected because dropout is then the identity.
        src = self.dropout(src)
        tgt = self.dropout(tgt)

        # Encode the source once, then decode against it.
        enc_output = self.encoder(src, src_mask)
        dec_output = self.decoder(tgt, enc_output, src_mask, tgt_mask)

        return self.fc_out(dec_output)

# Attention Mask

In [None]:
def create_padding_mask(seq, pad_token=0):
    """Return a boolean mask that is True wherever ``seq`` is not padding.

    For ``seq`` of shape ``(batch, seq_len)`` the result has shape
    ``(batch, 1, 1, seq_len)``: two singleton axes are inserted after the
    batch axis so the mask broadcasts over heads and query positions.
    """
    is_real_token = seq.ne(pad_token)
    return is_real_token[:, None, None]

def create_subsequent_mask(size, device=None):
    """Return a causal (look-back only) attention mask.

    Entry ``[0, 0, i, j]`` is True when ``j <= i``, i.e. query position ``i``
    may attend to itself and to earlier positions but not to later ones.

    Args:
        size: Sequence length.
        device: Optional device on which to create the mask. Defaults to
            PyTorch's default device, so existing callers that move the
            result with ``.to(device)`` keep working.

    Returns:
        Boolean tensor of shape ``(1, 1, size, size)``; the two leading
        singleton axes let it broadcast against a padding mask of shape
        ``(batch, 1, 1, size)``.
    """
    positions = torch.arange(size, device=device)
    # Rows index the query position, columns the key position: the lower
    # triangle INCLUDING the diagonal is True, everything above it is False.
    mask = positions.unsqueeze(0) <= positions.unsqueeze(1)
    return mask.unsqueeze(0).unsqueeze(1)

# Instantiate the Model

In [None]:
# Instantiate the Model

# Source and target share one tokenizer, hence one vocabulary size.
src_vocab_size = tgt_vocab_size = sp.get_piece_size()

# Use the GPU when one is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Model hyper-parameters, gathered in one place.
model_config = dict(
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    num_layers=3,     # a bit deeper
    num_heads=4,      # fewer heads
    d_model=256,      # smaller d_model -> much faster
    d_ff=1024,
    dropout=0.1,
)
model = Transformer(**model_config).to(device)

print("Model initialized with vocab size:", src_vocab_size)


Model initialized with vocab size: 37000


# Loss & Optimizer

In [None]:
# Slight label smoothing helps MT a lot
import torch.optim as optim
# Cross-entropy with light label smoothing; targets equal to 0 (the padding
# value used by collate_fn) do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0, label_smoothing=0.1)

# Adam with a base LR of 1.0: LambdaLR multiplies the base LR by the value
# returned from lr_lambda, so the effective LR is exactly that value.
optimizer = optim.Adam(model.parameters(), lr=1.0, betas=(0.9, 0.98), eps=1e-9)

warmup_steps = 4000


def lr_lambda(step):
    """Transformer LR schedule: linear warm-up, then inverse-sqrt decay.

    Returns ``d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)``.
    """
    # Clamp to 1 so that step ** -0.5 is defined should the scheduler pass 0.
    step = max(step, 1)
    decay_factor = step ** -0.5
    warmup_factor = step * (warmup_steps ** -1.5)
    return (model.d_model ** -0.5) * min(decay_factor, warmup_factor)


scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

In [None]:
def evaluate_val_loss(model, val_loader, criterion, device="cpu"):
    """Return the mean per-batch loss of ``model`` over ``val_loader``.

    The model is switched to eval mode and left there. The result is the
    unweighted average of the batch losses, or 0.0 if the loader is empty.

    Args:
        model: Callable as ``model(src, tgt_in, src_mask, tgt_mask)``.
        val_loader: Iterable of ``(src, tgt_in, tgt_out)`` batches.
        criterion: Loss taking flattened logits and flattened targets.
        device: Device the batches are moved to.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch in val_loader:
            src, tgt_in, tgt_out = (tensor.to(device) for tensor in batch)

            # Source: hide padding. Target: hide padding and future positions.
            src_mask = create_padding_mask(src)
            causal_mask = create_subsequent_mask(tgt_in.size(1)).to(device)
            tgt_mask = create_padding_mask(tgt_in) & causal_mask

            logits = model(src, tgt_in, src_mask, tgt_mask)

            # Flatten to (batch * tgt_len, vocab) vs (batch * tgt_len,).
            loss = criterion(logits.view(-1, logits.size(-1)), tgt_out.view(-1))
            batch_losses.append(loss.item())

    return sum(batch_losses) / max(1, len(batch_losses))

In [None]:
def greedy_decode(model, src_sentence, sp, max_len=50, device="cpu"):
    """Translate one sentence by repeatedly picking the most likely next token.

    The model is switched to eval mode and left there.

    Args:
        model: Transformer exposing ``src_embed``, ``tgt_embed``,
            ``pos_encoder``, ``pos_decoder``, ``dropout``, ``encoder``,
            ``decoder``, ``fc_out`` and ``d_model``.
        src_sentence: Source text.
        sp: Tokenizer exposing ``encode`` and ``decode``.
        max_len: Maximum number of tokens to generate.
        device: Device on which to run.

    Returns:
        The decoded output text, without ``<sos>`` and a trailing ``<eos>``.
    """
    sos, eos = 2, 3  # special-token ids used throughout the notebook

    model.eval()
    with torch.no_grad():
        scale = math.sqrt(model.d_model)

        # Source as a single-sentence batch: <sos> ... <eos>, shape (1, src_len).
        src_ids = [sos, *sp.encode(src_sentence, out_type=int), eos]
        src = torch.tensor([src_ids], dtype=torch.long, device=device)
        src_mask = create_padding_mask(src)

        # Encode once, with the same embedding pipeline as model.forward.
        src_emb = model.pos_encoder(model.dropout(model.src_embed(src) * scale))
        enc_output = model.encoder(src_emb, src_mask)

        # The generated sequence starts with <sos> and grows by one token per step.
        generated = [sos]
        for _ in range(max_len):
            tgt = torch.tensor([generated], dtype=torch.long, device=device)
            tgt_mask = create_padding_mask(tgt) & create_subsequent_mask(tgt.size(1)).to(device)

            tgt_emb = model.pos_decoder(model.dropout(model.tgt_embed(tgt) * scale))
            dec_output = model.decoder(tgt_emb, enc_output, src_mask, tgt_mask)

            # Only the last position predicts the next token.
            logits = model.fc_out(dec_output[:, -1])  # (1, vocab_size)
            next_token = logits.argmax(-1).item()
            generated.append(next_token)

            if next_token == eos:
                break

        # Strip the leading <sos> and, if present, the final <eos>.
        out_tokens = generated[1:]
        if out_tokens and out_tokens[-1] == eos:
            out_tokens.pop()
        return sp.decode(out_tokens)


In [None]:
!pip install -q sacrebleu
import sacrebleu

def evaluate_bleu(model, val_pairs, sp, device="cpu", num_examples=50, max_len=50):
    """Return the corpus BLEU score of greedy translations.

    Translates the first ``num_examples`` pairs of ``val_pairs`` (fewer if the
    list is shorter) with greedy_decode() and scores the hypotheses against
    the reference targets using sacrebleu.

    Args:
        model: Translation model passed to greedy_decode().
        val_pairs: Indexable collection of ``(source_text, target_text)``.
        sp: Tokenizer passed to greedy_decode().
        device: Device passed to greedy_decode().
        num_examples: Upper bound on the number of pairs to evaluate.
        max_len: Maximum generated length per sentence.
    """
    model.eval()
    count = min(num_examples, len(val_pairs))

    hyps, refs = [], []
    with torch.no_grad():
        for src_text, tgt_text in (val_pairs[idx] for idx in range(count)):
            hyps.append(greedy_decode(model, src_text, sp, max_len=max_len, device=device))
            refs.append(tgt_text)

    # corpus_bleu takes a list of reference streams; there is exactly one here.
    return sacrebleu.corpus_bleu(hyps, [refs]).score


[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m51.8/51.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/104.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m104.1/104.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
def show_sample_translations(model, val_pairs, sp, device="cpu", num_examples=5, max_len=50):
    """Print source, greedy prediction and reference for the first few pairs.

    At most ``num_examples`` pairs of ``val_pairs`` are shown. The model is
    switched to eval mode and left there.
    """
    rule = "-" * 80

    model.eval()
    print("\nSample translations:")
    print(rule)
    with torch.no_grad():
        shown = min(num_examples, len(val_pairs))
        for idx in range(shown):
            src_text, tgt_text = val_pairs[idx]
            pred = greedy_decode(model, src_text, sp, max_len=max_len, device=device)
            print(
                f"[SRC] {src_text}",
                f"[PRED] {pred}",
                f"[TGT] {tgt_text}",
                rule,
                sep="\n",
            )

In [None]:
from tqdm import tqdm

# Training driver: for each epoch, run one pass over train_loader, then report
# validation loss, BLEU on the first 50 validation pairs, and 3 sample translations.
n_epochs = 20  # more epochs now that model is smaller
# device is already defined above when we created the model

# Counts optimizer updates across all epochs; it is only incremented here and
# not read anywhere in this cell.
global_step = 0

for epoch in range(1, n_epochs + 1):
    # ---- Train ----
    # Re-enable training mode every epoch: evaluate_bleu and
    # show_sample_translations below both call model.eval().
    model.train()
    total_train_loss = 0.0

    loop = tqdm(train_loader, leave=True)
    # NOTE(review): tgt_in / tgt_out are presumably the decoder input and the
    # one-step-shifted target produced by the dataset/collate code — confirm there.
    for src, tgt_in, tgt_out in loop:
        src, tgt_in, tgt_out = src.to(device), tgt_in.to(device), tgt_out.to(device)

        # The target mask is the elementwise AND of the padding mask and the
        # subsequent-position mask built for the current target length.
        src_mask = create_padding_mask(src)
        tgt_mask = create_padding_mask(tgt_in) & create_subsequent_mask(tgt_in.size(1)).to(device)

        output = model(src, tgt_in, src_mask, tgt_mask)

        # Collapse all leading dimensions so each row of output_flat lines up
        # with one entry of the flattened 1-D target.
        output_flat = output.view(-1, output.size(-1))
        tgt_flat = tgt_out.view(-1)

        loss = criterion(output_flat, tgt_flat)

        optimizer.zero_grad()
        loss.backward()

        # Gradient clipping: rescale gradients in place so their total norm is at most 1.0
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        # The scheduler is stepped once per batch, not once per epoch.
        scheduler.step()
        global_step += 1

        total_train_loss += loss.item()
        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

    # Mean of the per-batch losses (each batch weighted equally).
    avg_train_loss = total_train_loss / len(train_loader)

    # ---- Validation loss ----
    val_loss = evaluate_val_loss(model, val_loader, criterion, device=device)

    # ---- BLEU on a subset of validation pairs ----
    bleu = evaluate_bleu(model, val_pairs, sp, device=device, num_examples=50, max_len=50)

    # NOTE(review): in the logged run the validation loss bottoms out around
    # epoch 8 and then rises while the train loss keeps falling; consider
    # keeping the best checkpoint or stopping early.
    print(f"\nEpoch {epoch} summary:")
    print(f"  Train loss: {avg_train_loss:.4f}")
    print(f"  Val loss:   {val_loss:.4f}")
    print(f"  BLEU (50 val examples): {bleu:.2f}")

    # Show a few sample translations from the validation set
    show_sample_translations(model, val_pairs, sp, device=device, num_examples=3, max_len=50)


Epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [00:47<00:00, 19.64it/s, loss=6.13]



Epoch 1 summary:
  Train loss: 7.4071
  Val loss:   6.2583
  BLEU (50 val examples): 0.51

Sample translations:
--------------------------------------------------------------------------------
[SRC] ¬ªJa, er f√§hrt nach Paris.
[PRED] "I's a few.
[TGT] 'Yes, he is going to Paris.
--------------------------------------------------------------------------------
[SRC] Er hat gestern eine Stimme geh√∂rt¬´, sagte die Gr√§fin Lydia Iwanowna und blickte dabei Stepan Arkadjewitsch an.
[PRED] He was a little, and the other, and the other.
[TGT] He heard a voice yesterday,' said the Countess, with a look at Oblonsky.
--------------------------------------------------------------------------------
[SRC] ¬ªAch, eine Stimme!¬´ sprach Oblonski ihr nach; er sagte sich, da√ü er in dieser Gesellschaft m√∂glichst vorsichtig sein m√ºsse, wo etwas vorgehe oder vorgehen solle, wozu er noch keinen Schl√ºssel habe.
[PRED] "I's a little, he had been to be a little, he had been to be a little, he had been to b

Epoch 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [00:46<00:00, 19.98it/s, loss=5.84]



Epoch 2 summary:
  Train loss: 5.9766
  Val loss:   5.9151
  BLEU (50 val examples): 0.86

Sample translations:
--------------------------------------------------------------------------------
[SRC] ¬ªJa, er f√§hrt nach Paris.
[PRED] "Yes, he is a little girl.
[TGT] 'Yes, he is going to Paris.
--------------------------------------------------------------------------------
[SRC] Er hat gestern eine Stimme geh√∂rt¬´, sagte die Gr√§fin Lydia Iwanowna und blickte dabei Stepan Arkadjewitsch an.
[PRED] He said to the young girl, and said, and said, and had been a little girl.
[TGT] He heard a voice yesterday,' said the Countess, with a look at Oblonsky.
--------------------------------------------------------------------------------
[SRC] ¬ªAch, eine Stimme!¬´ sprach Oblonski ihr nach; er sagte sich, da√ü er in dieser Gesellschaft m√∂glichst vorsichtig sein m√ºsse, wo etwas vorgehe oder vorgehen solle, wozu er noch keinen Schl√ºssel habe.
[PRED] "You have a little girl, he said, he had bee

Epoch 3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [00:46<00:00, 20.00it/s, loss=5.74]



Epoch 3 summary:
  Train loss: 5.6574
  Val loss:   5.7688
  BLEU (50 val examples): 0.98

Sample translations:
--------------------------------------------------------------------------------
[SRC] ¬ªJa, er f√§hrt nach Paris.
[PRED] "Yes, he was a few minutes.
[TGT] 'Yes, he is going to Paris.
--------------------------------------------------------------------------------
[SRC] Er hat gestern eine Stimme geh√∂rt¬´, sagte die Gr√§fin Lydia Iwanowna und blickte dabei Stepan Arkadjewitsch an.
[PRED] He was a voice, and the voice of the voice, and the voice of the voice of the voice.
[TGT] He heard a voice yesterday,' said the Countess, with a look at Oblonsky.
--------------------------------------------------------------------------------
[SRC] ¬ªAch, eine Stimme!¬´ sprach Oblonski ihr nach; er sagte sich, da√ü er in dieser Gesellschaft m√∂glichst vorsichtig sein m√ºsse, wo etwas vorgehe oder vorgehen solle, wozu er noch keinen Schl√ºssel habe.
[PRED] "Oh, he would have been a voice, 

Epoch 4: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [00:46<00:00, 20.00it/s, loss=5.13]



Epoch 4 summary:
  Train loss: 5.4377
  Val loss:   5.6773
  BLEU (50 val examples): 3.61

Sample translations:
--------------------------------------------------------------------------------
[SRC] ¬ªJa, er f√§hrt nach Paris.
[PRED] "Yes, he was gone.
[TGT] 'Yes, he is going to Paris.
--------------------------------------------------------------------------------
[SRC] Er hat gestern eine Stimme geh√∂rt¬´, sagte die Gr√§fin Lydia Iwanowna und blickte dabei Stepan Arkadjewitsch an.
[PRED] He looked at the voice, and looked at the voice.
[TGT] He heard a voice yesterday,' said the Countess, with a look at Oblonsky.
--------------------------------------------------------------------------------
[SRC] ¬ªAch, eine Stimme!¬´ sprach Oblonski ihr nach; er sagte sich, da√ü er in dieser Gesellschaft m√∂glichst vorsichtig sein m√ºsse, wo etwas vorgehe oder vorgehen solle, wozu er noch keinen Schl√ºssel habe.
[PRED] 'Oh, I have no longer a little thing, and he must be afraid of course, he must

Epoch 5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [00:46<00:00, 20.03it/s, loss=5.24]



Epoch 5 summary:
  Train loss: 5.2724
  Val loss:   5.5929
  BLEU (50 val examples): 3.33

Sample translations:
--------------------------------------------------------------------------------
[SRC] ¬ªJa, er f√§hrt nach Paris.
[PRED] "Yes, he is dead.
[TGT] 'Yes, he is going to Paris.
--------------------------------------------------------------------------------
[SRC] Er hat gestern eine Stimme geh√∂rt¬´, sagte die Gr√§fin Lydia Iwanowna und blickte dabei Stepan Arkadjewitsch an.
[PRED] He was heard a smile, and his voice, and his voice.
[TGT] He heard a voice yesterday,' said the Countess, with a look at Oblonsky.
--------------------------------------------------------------------------------
[SRC] ¬ªAch, eine Stimme!¬´ sprach Oblonski ihr nach; er sagte sich, da√ü er in dieser Gesellschaft m√∂glichst vorsichtig sein m√ºsse, wo etwas vorgehe oder vorgehen solle, wozu er noch keinen Schl√ºssel habe.
[PRED] 'Oh, please, he was still a moment,' said Oblonsky, 'and he spoke to be a li

Epoch 6: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [00:46<00:00, 20.03it/s, loss=5.42]



Epoch 6 summary:
  Train loss: 5.0827
  Val loss:   5.5121
  BLEU (50 val examples): 4.07

Sample translations:
--------------------------------------------------------------------------------
[SRC] ¬ªJa, er f√§hrt nach Paris.
[PRED] "Yes, he is married.
[TGT] 'Yes, he is going to Paris.
--------------------------------------------------------------------------------
[SRC] Er hat gestern eine Stimme geh√∂rt¬´, sagte die Gr√§fin Lydia Iwanowna und blickte dabei Stepan Arkadjewitsch an.
[PRED] He's a good-bye, and he's a good-bye,' said Oblonsky.
[TGT] He heard a voice yesterday,' said the Countess, with a look at Oblonsky.
--------------------------------------------------------------------------------
[SRC] ¬ªAch, eine Stimme!¬´ sprach Oblonski ihr nach; er sagte sich, da√ü er in dieser Gesellschaft m√∂glichst vorsichtig sein m√ºsse, wo etwas vorgehe oder vorgehen solle, wozu er noch keinen Schl√ºssel habe.
[PRED] 'Ah, you must be a good-bye, and he said to himself, 'Oh, you must be a

Epoch 7: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [00:46<00:00, 19.98it/s, loss=4.81]



Epoch 7 summary:
  Train loss: 4.9260
  Val loss:   5.4842
  BLEU (50 val examples): 6.53

Sample translations:
--------------------------------------------------------------------------------
[SRC] ¬ªJa, er f√§hrt nach Paris.
[PRED] "Yes, he is dead, he is dead.
[TGT] 'Yes, he is going to Paris.
--------------------------------------------------------------------------------
[SRC] Er hat gestern eine Stimme geh√∂rt¬´, sagte die Gr√§fin Lydia Iwanowna und blickte dabei Stepan Arkadjewitsch an.
[PRED] He looked at the Countess Lydia Ivanovna and looked at the Countess Nordston.
[TGT] He heard a voice yesterday,' said the Countess, with a look at Oblonsky.
--------------------------------------------------------------------------------
[SRC] ¬ªAch, eine Stimme!¬´ sprach Oblonski ihr nach; er sagte sich, da√ü er in dieser Gesellschaft m√∂glichst vorsichtig sein m√ºsse, wo etwas vorgehe oder vorgehen solle, wozu er noch keinen Schl√ºssel habe.
[PRED] 'Oh, I must be a good fellow,' said Ob

Epoch 8: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [00:46<00:00, 19.97it/s, loss=4.62]



Epoch 8 summary:
  Train loss: 4.7971
  Val loss:   5.4626
  BLEU (50 val examples): 4.31

Sample translations:
--------------------------------------------------------------------------------
[SRC] ¬ªJa, er f√§hrt nach Paris.
[PRED] "Yes, he was silent.
[TGT] 'Yes, he is going to Paris.
--------------------------------------------------------------------------------
[SRC] Er hat gestern eine Stimme geh√∂rt¬´, sagte die Gr√§fin Lydia Iwanowna und blickte dabei Stepan Arkadjewitsch an.
[PRED] He looked at the Countess,' said Oblonsky, looking at the Countess Nordston.
[TGT] He heard a voice yesterday,' said the Countess, with a look at Oblonsky.
--------------------------------------------------------------------------------
[SRC] ¬ªAch, eine Stimme!¬´ sprach Oblonski ihr nach; er sagte sich, da√ü er in dieser Gesellschaft m√∂glichst vorsichtig sein m√ºsse, wo etwas vorgehe oder vorgehen solle, wozu er noch keinen Schl√ºssel habe.
[PRED] 'Oh, he must be a good deal of course,' said Obl

Epoch 9: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [00:47<00:00, 19.81it/s, loss=5.02]



Epoch 9 summary:
  Train loss: 4.6889
  Val loss:   5.4632
  BLEU (50 val examples): 5.11

Sample translations:
--------------------------------------------------------------------------------
[SRC] ¬ªJa, er f√§hrt nach Paris.
[PRED] "Yes, he was going to Paris.
[TGT] 'Yes, he is going to Paris.
--------------------------------------------------------------------------------
[SRC] Er hat gestern eine Stimme geh√∂rt¬´, sagte die Gr√§fin Lydia Iwanowna und blickte dabei Stepan Arkadjewitsch an.
[PRED] He's heard the Countess Nordston,' said Oblonsky, looking at the Countess Nordston.
[TGT] He heard a voice yesterday,' said the Countess, with a look at Oblonsky.
--------------------------------------------------------------------------------
[SRC] ¬ªAch, eine Stimme!¬´ sprach Oblonski ihr nach; er sagte sich, da√ü er in dieser Gesellschaft m√∂glichst vorsichtig sein m√ºsse, wo etwas vorgehe oder vorgehen solle, wozu er noch keinen Schl√ºssel habe.
[PRED] 'Oh, please don't want anything,'

Epoch 10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [00:47<00:00, 19.94it/s, loss=4.52]



Epoch 10 summary:
  Train loss: 4.5933
  Val loss:   5.4669
  BLEU (50 val examples): 5.79

Sample translations:
--------------------------------------------------------------------------------
[SRC] ¬ªJa, er f√§hrt nach Paris.
[PRED] "Yes, he is mad.
[TGT] 'Yes, he is going to Paris.
--------------------------------------------------------------------------------
[SRC] Er hat gestern eine Stimme geh√∂rt¬´, sagte die Gr√§fin Lydia Iwanowna und blickte dabei Stepan Arkadjewitsch an.
[PRED] He looked at the Countess,' said Oblonsky.
[TGT] He heard a voice yesterday,' said the Countess, with a look at Oblonsky.
--------------------------------------------------------------------------------
[SRC] ¬ªAch, eine Stimme!¬´ sprach Oblonski ihr nach; er sagte sich, da√ü er in dieser Gesellschaft m√∂glichst vorsichtig sein m√ºsse, wo etwas vorgehe oder vorgehen solle, wozu er noch keinen Schl√ºssel habe.
[PRED] 'Oh, I must be a key,' said Oblonsky, in a low voice, and he was not a key to her.
[T

Epoch 11: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [00:46<00:00, 19.96it/s, loss=4.76]



Epoch 11 summary:
  Train loss: 4.5130
  Val loss:   5.4658
  BLEU (50 val examples): 5.12

Sample translations:
--------------------------------------------------------------------------------
[SRC] ¬ªJa, er f√§hrt nach Paris.
[PRED] "Yes, he is very pretty, he.
[TGT] 'Yes, he is going to Paris.
--------------------------------------------------------------------------------
[SRC] Er hat gestern eine Stimme geh√∂rt¬´, sagte die Gr√§fin Lydia Iwanowna und blickte dabei Stepan Arkadjewitsch an.
[PRED] He has heard a Countess,' said Oblonsky, looking at the Countess, and looked at the Countess.
[TGT] He heard a voice yesterday,' said the Countess, with a look at Oblonsky.
--------------------------------------------------------------------------------
[SRC] ¬ªAch, eine Stimme!¬´ sprach Oblonski ihr nach; er sagte sich, da√ü er in dieser Gesellschaft m√∂glichst vorsichtig sein m√ºsse, wo etwas vorgehe oder vorgehen solle, wozu er noch keinen Schl√ºssel habe.
[PRED] 'Oh, I must be glad he

Epoch 12: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [00:46<00:00, 19.97it/s, loss=4.59]



Epoch 12 summary:
  Train loss: 4.4377
  Val loss:   5.4706
  BLEU (50 val examples): 6.71

Sample translations:
--------------------------------------------------------------------------------
[SRC] ¬ªJa, er f√§hrt nach Paris.
[PRED] "Yes, he is sure.
[TGT] 'Yes, he is going to Paris.
--------------------------------------------------------------------------------
[SRC] Er hat gestern eine Stimme geh√∂rt¬´, sagte die Gr√§fin Lydia Iwanowna und blickte dabei Stepan Arkadjewitsch an.
[PRED] He looked at Oblonsky, and was looking at him.
[TGT] He heard a voice yesterday,' said the Countess, with a look at Oblonsky.
--------------------------------------------------------------------------------
[SRC] ¬ªAch, eine Stimme!¬´ sprach Oblonski ihr nach; er sagte sich, da√ü er in dieser Gesellschaft m√∂glichst vorsichtig sein m√ºsse, wo etwas vorgehe oder vorgehen solle, wozu er noch keinen Schl√ºssel habe.
[PRED] 'Oh, I must be a moment,' said Oblonsky, in a few minutes, in a few minutes, in 

Epoch 13: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [00:46<00:00, 20.00it/s, loss=4.67]



Epoch 13 summary:
  Train loss: 4.3690
  Val loss:   5.4864
  BLEU (50 val examples): 6.18

Sample translations:
--------------------------------------------------------------------------------
[SRC] ¬ªJa, er f√§hrt nach Paris.
[PRED] "Yes, he is mad.
[TGT] 'Yes, he is going to Paris.
--------------------------------------------------------------------------------
[SRC] Er hat gestern eine Stimme geh√∂rt¬´, sagte die Gr√§fin Lydia Iwanowna und blickte dabei Stepan Arkadjewitsch an.
[PRED] He looked at the Countess Lydia Ivanovna and spoke to him.
[TGT] He heard a voice yesterday,' said the Countess, with a look at Oblonsky.
--------------------------------------------------------------------------------
[SRC] ¬ªAch, eine Stimme!¬´ sprach Oblonski ihr nach; er sagte sich, da√ü er in dieser Gesellschaft m√∂glichst vorsichtig sein m√ºsse, wo etwas vorgehe oder vorgehen solle, wozu er noch keinen Schl√ºssel habe.
[PRED] 'Oh, it is not possible to be a bit of the lock, or something!' said 

Epoch 14: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [00:46<00:00, 20.00it/s, loss=4.79]



Epoch 14 summary:
  Train loss: 4.3084
  Val loss:   5.4931
  BLEU (50 val examples): 6.36

Sample translations:
--------------------------------------------------------------------------------
[SRC] ¬ªJa, er f√§hrt nach Paris.
[PRED] "Yes, he is sure, he is.
[TGT] 'Yes, he is going to Paris.
--------------------------------------------------------------------------------
[SRC] Er hat gestern eine Stimme geh√∂rt¬´, sagte die Gr√§fin Lydia Iwanowna und blickte dabei Stepan Arkadjewitsch an.
[PRED] He looked at the Countess and was looking at it.
[TGT] He heard a voice yesterday,' said the Countess, with a look at Oblonsky.
--------------------------------------------------------------------------------
[SRC] ¬ªAch, eine Stimme!¬´ sprach Oblonski ihr nach; er sagte sich, da√ü er in dieser Gesellschaft m√∂glichst vorsichtig sein m√ºsse, wo etwas vorgehe oder vorgehen solle, wozu er noch keinen Schl√ºssel habe.
[PRED] 'Oh, I should be glad to see him,' said Oblonsky, in a low tone, and he

Epoch 15: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [00:46<00:00, 20.00it/s, loss=4.62]



Epoch 15 summary:
  Train loss: 4.2526
  Val loss:   5.4997
  BLEU (50 val examples): 5.64

Sample translations:
--------------------------------------------------------------------------------
[SRC] ¬ªJa, er f√§hrt nach Paris.
[PRED] "Yes, he is in Paris.
[TGT] 'Yes, he is going to Paris.
--------------------------------------------------------------------------------
[SRC] Er hat gestern eine Stimme geh√∂rt¬´, sagte die Gr√§fin Lydia Iwanowna und blickte dabei Stepan Arkadjewitsch an.
[PRED] 'I know,' said Oblonsky, and looked at the Countess Nordston.
[TGT] He heard a voice yesterday,' said the Countess, with a look at Oblonsky.
--------------------------------------------------------------------------------
[SRC] ¬ªAch, eine Stimme!¬´ sprach Oblonski ihr nach; er sagte sich, da√ü er in dieser Gesellschaft m√∂glichst vorsichtig sein m√ºsse, wo etwas vorgehe oder vorgehen solle, wozu er noch keinen Schl√ºssel habe.
[PRED] 'Oh, I must be a little bit of the house,' said Oblonsky, in 

Epoch 16: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [00:46<00:00, 20.00it/s, loss=4.09]



Epoch 16 summary:
  Train loss: 4.1986
  Val loss:   5.5135
  BLEU (50 val examples): 6.07

Sample translations:
--------------------------------------------------------------------------------
[SRC] ¬ªJa, er f√§hrt nach Paris.
[PRED] "Yes, he is gone.
[TGT] 'Yes, he is going to Paris.
--------------------------------------------------------------------------------
[SRC] Er hat gestern eine Stimme geh√∂rt¬´, sagte die Gr√§fin Lydia Iwanowna und blickte dabei Stepan Arkadjewitsch an.
[PRED] He has heard the Countess,' said the Countess, and looked at the Countess.
[TGT] He heard a voice yesterday,' said the Countess, with a look at Oblonsky.
--------------------------------------------------------------------------------
[SRC] ¬ªAch, eine Stimme!¬´ sprach Oblonski ihr nach; er sagte sich, da√ü er in dieser Gesellschaft m√∂glichst vorsichtig sein m√ºsse, wo etwas vorgehe oder vorgehen solle, wozu er noch keinen Schl√ºssel habe.
[PRED] 'Oh, I must be a little more than a key to her, or t

Epoch 17: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [00:47<00:00, 19.94it/s, loss=4.21]



Epoch 17 summary:
  Train loss: 4.1517
  Val loss:   5.5338
  BLEU (50 val examples): 6.30

Sample translations:
--------------------------------------------------------------------------------
[SRC] ¬ªJa, er f√§hrt nach Paris.
[PRED] "Yes, he is alive.
[TGT] 'Yes, he is going to Paris.
--------------------------------------------------------------------------------
[SRC] Er hat gestern eine Stimme geh√∂rt¬´, sagte die Gr√§fin Lydia Iwanowna und blickte dabei Stepan Arkadjewitsch an.
[PRED] He was looking at the Countess Lydia Ivanovna and his voice was looking at the Countess.
[TGT] He heard a voice yesterday,' said the Countess, with a look at Oblonsky.
--------------------------------------------------------------------------------
[SRC] ¬ªAch, eine Stimme!¬´ sprach Oblonski ihr nach; er sagte sich, da√ü er in dieser Gesellschaft m√∂glichst vorsichtig sein m√ºsse, wo etwas vorgehe oder vorgehen solle, wozu er noch keinen Schl√ºssel habe.
[PRED] 'Oh, I have a key to her,' he said in

Epoch 18: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [00:47<00:00, 19.88it/s, loss=4.16]



Epoch 18 summary:
  Train loss: 4.1092
  Val loss:   5.5356
  BLEU (50 val examples): 6.79

Sample translations:
--------------------------------------------------------------------------------
[SRC] ¬ªJa, er f√§hrt nach Paris.
[PRED] "Yes, Paris," he went on.
[TGT] 'Yes, he is going to Paris.
--------------------------------------------------------------------------------
[SRC] Er hat gestern eine Stimme geh√∂rt¬´, sagte die Gr√§fin Lydia Iwanowna und blickte dabei Stepan Arkadjewitsch an.
[PRED] He has heard one of the Countess Lydia Ivanovna and heard Oblonsky's voice.
[TGT] He heard a voice yesterday,' said the Countess, with a look at Oblonsky.
--------------------------------------------------------------------------------
[SRC] ¬ªAch, eine Stimme!¬´ sprach Oblonski ihr nach; er sagte sich, da√ü er in dieser Gesellschaft m√∂glichst vorsichtig sein m√ºsse, wo etwas vorgehe oder vorgehen solle, wozu er noch keinen Schl√ºssel habe.
[PRED] 'Oh, I should think,' said Oblonsky, in a f

Epoch 19: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [00:47<00:00, 19.94it/s, loss=4.09]



Epoch 19 summary:
  Train loss: 4.0662
  Val loss:   5.5518
  BLEU (50 val examples): 6.42

Sample translations:
--------------------------------------------------------------------------------
[SRC] ¬ªJa, er f√§hrt nach Paris.
[PRED] "Yes, he is alive.
[TGT] 'Yes, he is going to Paris.
--------------------------------------------------------------------------------
[SRC] Er hat gestern eine Stimme geh√∂rt¬´, sagte die Gr√§fin Lydia Iwanowna und blickte dabei Stepan Arkadjewitsch an.
[PRED] He was looking at the Countess Lydia Ivanovna and of the Countess,' said Oblonsky, a voice.
[TGT] He heard a voice yesterday,' said the Countess, with a look at Oblonsky.
--------------------------------------------------------------------------------
[SRC] ¬ªAch, eine Stimme!¬´ sprach Oblonski ihr nach; er sagte sich, da√ü er in dieser Gesellschaft m√∂glichst vorsichtig sein m√ºsse, wo etwas vorgehe oder vorgehen solle, wozu er noch keinen Schl√ºssel habe.
[PRED] 'Oh, please do not come to her,' s

Epoch 20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [00:47<00:00, 19.65it/s, loss=4.14]



Epoch 20 summary:
  Train loss: 4.0267
  Val loss:   5.5706
  BLEU (50 val examples): 6.29

Sample translations:
--------------------------------------------------------------------------------
[SRC] ¬ªJa, er f√§hrt nach Paris.
[PRED] "Yes, Paris," he said, "he's.
[TGT] 'Yes, he is going to Paris.
--------------------------------------------------------------------------------
[SRC] Er hat gestern eine Stimme geh√∂rt¬´, sagte die Gr√§fin Lydia Iwanowna und blickte dabei Stepan Arkadjewitsch an.
[PRED] He has heard the Countess,' said Oblonsky, looking at the Countess.
[TGT] He heard a voice yesterday,' said the Countess, with a look at Oblonsky.
--------------------------------------------------------------------------------
[SRC] ¬ªAch, eine Stimme!¬´ sprach Oblonski ihr nach; er sagte sich, da√ü er in dieser Gesellschaft m√∂glichst vorsichtig sein m√ºsse, wo etwas vorgehe oder vorgehen solle, wozu er noch keinen Schl√ºssel habe.
[PRED] 'Oh, please do!' said Oblonsky in a voice, in a

Save the model

In [None]:
from google.colab import drive

# Mount Google Drive in the Colab filesystem so the checkpoint is written to Drive.
drive.mount("/content/drive")
# NOTE(review): the sample translations logged above go German -> English,
# while the file name says "en_de" — confirm the intended naming.
save_path = "/content/drive/MyDrive/transformer_en_de.pt"

# Only the model's state_dict is saved (no optimizer or scheduler state), so
# reloading requires building the same architecture and calling load_state_dict.
torch.save(model.state_dict(), save_path)
print(f"Model saved to: {save_path}")

Mounted at /content/drive
Model saved to: /content/drive/MyDrive/transformer_en_de.pt


In [None]:
# Smoke test: translate a single German sentence with the trained model.
# max_len is not passed, so greedy_decode's own default applies.
# NOTE(review): if cells ran in order, the model is still in eval mode from the
# last show_sample_translations call — confirm greedy_decode does not rely on that.
translation = greedy_decode(model, "Ich liebe Katzen", sp, device=device)
print("Translated:", translation)

Translated: I like cats
