In [1]:
# Import Libraries
import random
import os
import re
import unicodedata
import zipfile
 
import matplotlib.pyplot as plt
import numpy as np
import requests
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tokenizers
import tqdm

In [3]:
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version (torch): {torch.version.cuda}")
print(f"CUDA runtime version (driver): {torch.version.cuda}")
print(f"Device name: {torch.cuda.get_device_name(0)}" if torch.cuda.is_available() else "No CUDA device")

PyTorch version: 2.8.0+cu129
CUDA available: True
CUDA version (torch): 12.9
CUDA runtime version (driver): 12.9
Device name: NVIDIA GeForce RTX 2050


In [4]:
def normalize(line):
    line = unicodedata.normalize("NFKC", line)
    parts = line.split('\t')
    en, de = parts[0], parts[1]
    return  en.lower().strip(), de.lower().strip()

text_pairs = []
with open("de.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        en , de = normalize(line)
        text_pairs.append((en,de))

print(f"Pairs: {len(text_pairs)}")

Pairs: 320340


In [5]:
if os.path.exists("en_tokenizer.json") and os.path.exists("de_tokenizer.json"):
    en_tokenizer = tokenizers.Tokenizer.from_file("en_tokenizer.json")
    de_tokenizer = tokenizers.Tokenizer.from_file("de_tokenizer.json")
else:
    en_tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE())
    de_tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE())

    # Configure pre-tokenizer to split on whitespace and punctuation, add space at beginning of the sentence
    en_tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=True)
    de_tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=True)

    # Configure decoder: So that word boundary symbol "Ġ" will be removed
    en_tokenizer.decoder = tokenizers.decoders.ByteLevel()
    de_tokenizer.decoder = tokenizers.decoders.ByteLevel()

    # Train BPE for English and French using the same trainer
    VOCAB_SIZE = 20000
    trainer = tokenizers.trainers.BpeTrainer(
        vocab_size=VOCAB_SIZE,
        special_tokens=["[start]", "[end]", "[pad]"],
        show_progress=True
    )
    en_tokenizer.train_from_iterator([x[0] for x in text_pairs], trainer=trainer)
    de_tokenizer.train_from_iterator([x[1] for x in text_pairs], trainer=trainer)

    en_tokenizer.enable_padding(pad_id=en_tokenizer.token_to_id("[pad]"), pad_token="[pad]")
    de_tokenizer.enable_padding(pad_id=de_tokenizer.token_to_id("[pad]"), pad_token="[pad]")

    # Save the trained tokenizers
    en_tokenizer.save("en_tokenizer.json", pretty=True)
    de_tokenizer.save("de_tokenizer.json", pretty=True)

In [6]:
print("Sample tokenization:")
en_sample, de_sample = random.choice(text_pairs)
encoded = en_tokenizer.encode(en_sample)
print(f"Original: {en_sample}")
print(f"Tokens: {encoded.tokens}")
print(f"IDs: {encoded.ids}")
print(f"Decoded: {en_tokenizer.decode(encoded.ids)}")
print()

encoded = de_tokenizer.encode("[start] " + de_sample + " [end]")
print(f"Original: {de_sample}")
print(f"Tokens: {encoded.tokens}")
print(f"IDs: {encoded.ids}")
print(f"Decoded: {de_tokenizer.decode(encoded.ids)}")
print()

print("Vocab Size:")
print(f"English: {len(en_tokenizer.get_vocab())}")
print(f"Deutsch: {len(de_tokenizer.get_vocab())}")

Sample tokenization:
Original: tom plays the flute.
Tokens: ['Ġtom', 'Ġplays', 'Ġthe', 'Ġflute', '.']
IDs: [101, 2292, 100, 5291, 13]
Decoded:  tom plays the flute.

Original: tom spielt flöte.
Tokens: ['[start]', 'Ġtom', 'Ġspielt', 'ĠflÃ¶te', '.', 'Ġ', '[end]']
IDs: [0, 118, 1467, 7982, 13, 76, 1]
Decoded:  tom spielt flöte. 

Vocab Size:
English: 20000
Deutsch: 20000


In [7]:
#
# Create PyTorch dataset for the BPE-encoded translation pairs
#

class Dataset(torch.utils.data.Dataset):
    def __init__(self, text_pairs):
        self.text_pairs = text_pairs

    def __len__(self):
        return len(self.text_pairs)

    def __getitem__(self, idx):
        en, de = self.text_pairs[idx]
        return en, "[start] " + de + " [end]"

In [8]:
def collate_fn(batch):
    en, de = zip(*batch)
    en_enc = en_tokenizer.encode_batch(en, add_special_tokens=True)
    de_enc = en_tokenizer.encode_batch(de, add_special_tokens=True)

    en_ids = [enc.ids for enc in en_enc]
    de_ids = [enc.ids for enc in de_enc]

    return torch.tensor(en_ids), torch.tensor(de_ids)

In [9]:
BATCH_SIZE = 32
dataset = Dataset(text_pairs)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

In [10]:
class EncoderRNN(nn.Module):
    """A RNN encoder with an embedding layer"""
    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout=0.1, num_layers=1):
        """
        Args:
            vocab_size: The size of the input vocabulary
            embedding_dim: The dimension of the embedding vector
            hidden_dim: The dimension of the hidden state
            dropout: The dropout rate
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True, num_layers=num_layers)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_seq):
        # input seq = [batch_size, seq_len] -> embedded = [batch_size, seq_len, embedding_dim]
        embedded = self.dropout(self.embedding(input_seq))
        # outputs = [batch_size, seq_len, hidden_dim]
        # hidden = [num_layers, batch_size, hidden_dim]
        outputs, hidden = self.rnn(embedded)
        return outputs, hidden

In [11]:
class Attention(nn.Module):
    """
     The forward function takes query and keys only, and they should be the same shape (B,S,H)
    """
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        """Bahdanau Attention

        Args:
            query: [B, 1, H]
            keys: [B, S, H]

        Returns:
            context: [B, 1, H]
            weights: [B, 1, S]
        """
        B, S, H = keys.shape
        assert query.shape == (B, 1, H)
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        scores = scores.transpose(1,2)  # scores = [B, 1, S]

        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)
        return context, weights

In [12]:

class DecoderRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout=0.1):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(dropout)
        self.attention = Attention(hidden_dim)
        self.gru = nn.GRU(embedding_dim + hidden_dim, hidden_dim, batch_first=True)
        self.out_proj = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_seq, hidden, enc_out):
        """Single token input, single token output
            
        Args:
            input_seq: [B, 1] — input token for this timestep
            hidden: [num_layers, B, H] — decoder hidden state
            enc_out: [B, S, H] — encoder outputs (for attention)

        Returns:
            output: [B, 1, vocab_size] — logits over vocab
            hidden: [num_layers, B, H] — updated hidden state
        """
        # input seq = [batch_size, 1] -> embedded = [batch_size, 1, embedding_dim]
        embedded = self.dropout(self.embedding(input_seq))
        # hidden = [num_layers, batch_size, hidden_dim]
        # query = [batch_size, 1, hidden_dim]
        # context = [batch_size, 1, hidden_dim]
        query = hidden[-1].unsqueeze(1)  # [B, 1, H]
        context, attn_weights = self.attention(query, enc_out)
        # rnn_input = [batch_size, 1, embedding_dim + hidden_dim]
        rnn_input = torch.cat([embedded, context], dim=-1)
        # rnn_output = [batch_size, 1, hidden_dim]
        rnn_output, hidden = self.gru(rnn_input, hidden)
        output = self.out_proj(rnn_output)
        return output, hidden

In [13]:
class Seq2SeqRNN(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, target_seq):
        """Given the partial target sequence, predict the next token"""
        # input seq = [batch_size, seq_len]
        # target seq = [batch_size, seq_len]
        batch_size, target_len = target_seq.shape
        device = target_seq.device
        # list for storing the output logits
        outputs = []
        # encoder forward pass
        enc_out, hidden = self.encoder(input_seq)
        dec_hidden = hidden
        # decoder forward pass
        for t in range(target_len-1):
            # during training, use the ground truth token as the input (teacher forcing)
            dec_in = target_seq[:, t].unsqueeze(1)
            # last target token and hidden states -> next token
            dec_out, dec_hidden = self.decoder(dec_in, dec_hidden, enc_out)
            # store the prediction
            outputs.append(dec_out)
        outputs = torch.cat(outputs, dim=1)
        return outputs


In [14]:
#  Model parameters

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
enc_vocab = len(en_tokenizer.get_vocab())
dec_vocab = len(de_tokenizer.get_vocab())
emb_dim = 512
hidden_dim = 512
dropout = 0.1


In [15]:
# Create model
encoder = EncoderRNN(enc_vocab, emb_dim, hidden_dim, dropout).to(device)
decoder = DecoderRNN(dec_vocab, emb_dim, hidden_dim, dropout).to(device)
model = Seq2SeqRNN(encoder, decoder).to(device)
print(model)

print("Model created with:")
print(f"  Input vocabulary size: {enc_vocab}")
print(f"  Output vocabulary size: {dec_vocab}")
print(f"  Embedding dimension: {emb_dim}")
print(f"  Hidden dimension: {hidden_dim}")
print(f"  Dropout: {dropout}")
print(f"  Total parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

Seq2SeqRNN(
  (encoder): EncoderRNN(
    (embedding): Embedding(20000, 512)
    (rnn): GRU(512, 512, batch_first=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (decoder): DecoderRNN(
    (embedding): Embedding(20000, 512)
    (dropout): Dropout(p=0.1, inplace=False)
    (attention): Attention(
      (Wa): Linear(in_features=512, out_features=512, bias=True)
      (Ua): Linear(in_features=512, out_features=512, bias=True)
      (Va): Linear(in_features=512, out_features=1, bias=True)
    )
    (gru): GRU(1024, 512, batch_first=True)
    (out_proj): Linear(in_features=512, out_features=20000, bias=True)
  )
)
Model created with:
  Input vocabulary size: 20000
  Output vocabulary size: 20000
  Embedding dimension: 512
  Hidden dimension: 512
  Dropout: 0.1
  Total parameters: 35204129


In [16]:
# Train unless model.pth exists
if os.path.exists("en2de_tlate_attn.pth"):
    model.load_state_dict(torch.load("seq2seq_attn.pth"))
else:
    optimizer = optim.Adam(model.parameters(), lr=0.0005)
    loss_fn = nn.CrossEntropyLoss() #ignore_index=de_tokenizer.token_to_id("[pad]"))
    N_EPOCHS = 5

    for epoch in range(N_EPOCHS):
        model.train()
        epoch_loss = 0
        for en_ids, de_ids in tqdm.tqdm(dataloader, desc="Training"):
            # Move the "sentences" to device
            en_ids = en_ids.to(device)
            de_ids = de_ids.to(device)
            # zero the grad, then forward pass
            optimizer.zero_grad()
            outputs = model(en_ids, de_ids)
            # compute the loss: compare 3D logits to 2D targets
            loss = loss_fn(outputs.reshape(-1, dec_vocab), de_ids[:, 1:].reshape(-1))
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}/{N_EPOCHS}; Avg loss {epoch_loss/len(dataloader)}; Latest loss {loss.item()}")
        torch.save(model.state_dict(), f"en2de_tlate_attn-{epoch+1}.pth")
        # Test
        if (epoch+1) % 2 != 0:
            continue
        model.eval()
        epoch_loss = 0
        with torch.no_grad():
            for en_ids, de_ids in tqdm.tqdm(dataloader, desc="Evaluating"):
                en_ids = en_ids.to(device)
                de_ids = de_ids.to(device)
                outputs = model(en_ids, de_ids)
                loss = loss_fn(outputs.reshape(-1, dec_vocab), de_ids[:, 1:].reshape(-1))
                epoch_loss += loss.item()
        print(f"Eval loss: {epoch_loss/len(dataloader)}")
    torch.save(model.state_dict(), "en2de_tlate_attn.pth")
    

Training: 100%|██████████| 10011/10011 [53:14<00:00,  3.13it/s] 


Epoch 1/5; Avg loss 0.7702970968600503; Latest loss 0.8347820043563843


Training: 100%|██████████| 10011/10011 [59:37<00:00,  2.80it/s] 


Epoch 2/5; Avg loss 0.4771934005183162; Latest loss 0.5925331115722656


Evaluating: 100%|██████████| 10011/10011 [11:44<00:00, 14.21it/s]


Eval loss: 0.38059103819513257


Training: 100%|██████████| 10011/10011 [59:42<00:00,  2.79it/s] 


Epoch 3/5; Avg loss 0.4155243230981196; Latest loss 0.4284461736679077


Training: 100%|██████████| 10011/10011 [1:00:13<00:00,  2.77it/s]


Epoch 4/5; Avg loss 0.38332118771969526; Latest loss 0.33460500836372375


Evaluating: 100%|██████████| 10011/10011 [11:33<00:00, 14.44it/s]


Eval loss: 0.31368289969027213


Training: 100%|██████████| 10011/10011 [1:00:15<00:00,  2.77it/s]


Epoch 5/5; Avg loss 0.3635944887785844; Latest loss 0.4502602219581604


In [17]:
# Test for a few samples
model.eval()
N_SAMPLES = 5
MAX_LEN = 60
with torch.no_grad():
    start_token = torch.tensor([de_tokenizer.token_to_id("[start]")]).to(device)
    for en, true_fr in random.sample(text_pairs, N_SAMPLES):
        en_ids = torch.tensor(en_tokenizer.encode(en).ids).unsqueeze(0).to(device)
        enc_out, hidden = model.encoder(en_ids)
        pred_ids = []
        prev_token = start_token.unsqueeze(0)
        for _ in range(MAX_LEN):
            output, hidden = model.decoder(prev_token, hidden, enc_out)
            output = output.argmax(dim=2)
            pred_ids.append(output.item())
            prev_token = output
            # early stop if the predicted token is the end token
            if pred_ids[-1] == de_tokenizer.token_to_id("[end]"):
                break
        # Decode the predicted IDs
        pred_fr = de_tokenizer.decode(pred_ids)
        print(f"English: {en}")
        print(f"Deutsch: {true_fr}")
        print(f"Predicted: {pred_fr}")
        print()

English: he can only criticize people behind their backs.
Deutsch: er kann leute nur hinter deren rücken kritisieren.
Predicted: watte es mngeschlagenonn hinein�� unsymp�h� äraunre� gro hört�hlein.�

English: i remember when i was your age.
Deutsch: ich weiß noch, als ich so alt war wie du.
Predicted: �auwatteenn kann mittlerweileau,urs�au lieen lustige weißt.�

English: i've invited my friends.
Deutsch: ich habe meine freunde eingeladen.
Predicted: �auirriöige trägt sate ist i dein wieein.�

English: tom is distracted.
Deutsch: tom ist abgelenkt.
Predicted:  santomm könleinkt.�

English: i like dark chocolate.
Deutsch: ich mag dunkle schokolade.
Predicted: �auvers herausbekommt sie wel wünschte dar oder.�

