In [11]:
import torch
print(torch.__version__)

2.5.0+cpu


In [161]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import datasets
import tqdm

import evaluate

In [162]:
seed = 1234

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

Dataset

In [163]:
dataset = datasets.load_dataset("bentrevett/multi30k")

In [165]:
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 1000
    })
})

In [166]:
train_data, valid_data, test_data = (
    dataset["train"],
    dataset["validation"],
    dataset["test"],
)

In [167]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}

Tokenizers

In [168]:
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --- ------------------------------------ 1.0/12.8 MB 10.0 MB/s eta 0:00:02
     --------- ------------------------------ 2.9/12.8 MB 8.8 MB/s eta 0:00:02
     ----------- ---------------------------- 3.7/12.8 MB 7.5 MB/s eta 0:00:02
     ----------------- ---------------------- 5.5/12.8 MB 7.3 MB/s eta 0:00:02
     -------------------- ------------------- 6.6/12.8 MB 6.9 MB/s eta 0:00:01
     ---------------------- ----------------- 7.3/12.8 MB 6.5 MB/s eta 0:00:01
     -------------------------- ------------- 8.4/12.8 MB 6.0 MB/s eta 0:00:01
     ---------------------------- ----------- 9.2/12.8 MB 5.8 MB/s eta 0:00:01
     ------------------------------- -------- 10.2/12.8 MB 5.6 MB/s eta 0:00:01
     ----------------------------------

In [169]:
en_nlp = spacy.load("en_core_web_sm")
de_nlp = spacy.load("de_core_news_sm")

In [170]:
string  = "I am working on sequence to sequence models from ilya sutsekevar."
[token.text for token in en_nlp.tokenizer(string)]

['I',
 'am',
 'working',
 'on',
 'sequence',
 'to',
 'sequence',
 'models',
 'from',
 'ilya',
 'sutsekevar',
 '.']

In [171]:
def tokenize_example(example, en_nlp, de_nlp, max_length, lower, sos_token, eos_token):
    en_tokens = [token.text for token in en_nlp.tokenizer(example["en"])][:max_length]
    de_tokens = [token.text for token in de_nlp.tokenizer(example["de"])][:max_length]
    if lower:
        en_tokens = [token.lower() for token in en_tokens]
        de_tokens = [token.lower() for token in de_tokens]
    en_tokens = [sos_token] + en_tokens + [eos_token]
    de_tokens = [sos_token] + de_tokens + [eos_token]
    return {"en_tokens" : en_tokens, "de_tokens": de_tokens}

In [172]:
max_length = 1_000
lower = True
sos_token = "<sos>"
eos_token = "<eos>"

fn_kwargs = {
    "en_nlp":  en_nlp,
    "de_nlp" : de_nlp,
    "max_length" :  max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token,
}

train_data = train_data.map(tokenize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(tokenize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(tokenize_example, fn_kwargs=fn_kwargs)

Map: 100%|██████████| 29000/29000 [00:03<00:00, 9245.16 examples/s] 
Map: 100%|██████████| 1014/1014 [00:00<00:00, 8875.43 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 10751.98 examples/s]


In [173]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>']}

Vocabulary

Torchtext is not working on my OS. it;s idiotic. so i'll implement my own class

In [174]:
class Vocab:
    def __init__(self, tokens=None, default_index=None):
        self.token_to_index = {}
        self.index_to_token = {}

        # Handle list or dict input for tokens
        if isinstance(tokens, list):
            self.token_to_index = {token: index for index, token in enumerate(tokens)}
        elif isinstance(tokens, dict):
            self.token_to_index = tokens

        # Create reverse mapping: index to token
        self.index_to_token = {index: token for token, index in self.token_to_index.items()}
        self.default_index = default_index

    def __len__(self):
        return len(self.token_to_index)

    def __contains__(self, token):
        return token in self.token_to_index

    def __getitem__(self, token):
        return self.token_to_index.get(token, self.default_index)

    def set_default_index(self, index):
        self.default_index = index

    def get_default_index(self):
        return self.default_index

    def insert_token(self, token, index):
        if token in self.token_to_index:
            raise RuntimeError("Token already exists in vocab")
        if index < 0 or index > len(self.token_to_index):
            raise RuntimeError("Index out of range")
        self.token_to_index[token] = index
        self.index_to_token[index] = token

    def append_token(self, token):
        if token in self.token_to_index:
            raise RuntimeError("Token already exists")
        index = len(self.token_to_index)
        self.token_to_index[token] = index
        self.index_to_token[index] = token

    def lookup_token(self, index):
        if index not in self.index_to_token:
            raise RuntimeError("Index out of range")
        return self.index_to_token[index]

    def lookup_tokens(self, indices):
        return [self.index_to_token.get(index, None) for index in indices]

    def lookup_indices(self, tokens):
        return [self.token_to_index.get(token, self.default_index) for token in tokens]

    def get_stoi(self):
        return self.token_to_index

    def get_itos(self):
        return [self.index_to_token[index] for index in sorted(self.index_to_token.keys())]   

In [175]:
from collections import Counter, OrderedDict
from typing import List, Optional

def vocab(ordered_dict: Dict, min_freq: int = 1, specials: Optional[List[str]] = None) -> Vocab:
    specials = specials or []

    # Remove special tokens from ordered dict
    for token in specials:
        ordered_dict.pop(token, None)

    tokens = []
    for token, freq in ordered_dict.items():
        if freq >= min_freq:
            tokens.append(token)

    return Vocab(tokens)


In [176]:
from collections import Counter, OrderedDict
from typing import Iterable, List, Optional

def build_vocab_from_iterator(
    iterator: Iterable, 
    min_freq: int = 1,
    max_tokens: Optional[int] = None,
    specials: Optional[List[str]] = None
) -> Vocab:
    
    counter = Counter()
    
    # Update the frequency count from the iterator
    for tokens in iterator:
        counter.update(tokens)
    
    specials = specials or []

    # Sort tokens by frequency (descending) and lexicographically (if frequencies are equal)
    sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    
    if max_tokens is None:
        ordered_dict = OrderedDict(sorted_by_freq_tuples)
    else:
        assert len(specials) < max_tokens, "len(specials) >= max_tokens, so the vocab will be entirely special tokens"
        ordered_dict = OrderedDict(sorted_by_freq_tuples[:max_tokens - len(specials)])

    # Build the vocab object
    word_vocab = vocab(ordered_dict, min_freq=min_freq, specials=specials)
    
    return word_vocab


In [177]:
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

special_tokens = [
    unk_token,
    pad_token,
    sos_token,
    eos_token
]



In [178]:
en_vocab = build_vocab_from_iterator(train_data["en_tokens"], min_freq, specials=special_tokens)

In [179]:
en_vocab.get_itos()[:10]

['a', '.', 'in', 'the', 'on', 'man', 'is', 'and', 'of', 'with']

In [180]:
en_vocab.get_stoi()["the"]

3

In [181]:
#for german vocabulary
de_vocab = build_vocab_from_iterator(train_data["de_tokens"], specials=special_tokens)

In [182]:
de_vocab

<__main__.Vocab at 0x21835e5bd70>

In [183]:
len(en_vocab), len(de_vocab)

(5889, 18665)

In [184]:
tokens = ["i", "love", "watching", "crime", "shows"]

In [185]:
en_vocab.lookup_indices(tokens)

[952, 2165, 169, None, 817]

In [186]:
en_vocab.lookup_tokens(en_vocab.lookup_indices(tokens))

['i', 'love', 'watching', None, 'shows']

In [187]:
assert en_vocab[unk_token] == de_vocab[unk_token]
assert en_vocab[pad_token] == de_vocab[pad_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

In [188]:
def numericalize_example(example, en_vocab, de_vocab):
    en_ids = en_vocab.lookup_indices(example["en_tokens"])
    de_ids = de_vocab.lookup_indices(example["de_tokens"])
    return {"en_ids": en_ids, "de_ids": de_ids}

In [189]:
fn_kwargs = {"en_vocab": en_vocab, "de_vocab": de_vocab}

train_data = train_data.map(numericalize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(numericalize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(numericalize_example, fn_kwargs=fn_kwargs)

Map: 100%|██████████| 29000/29000 [00:02<00:00, 13757.00 examples/s]
Map: 100%|██████████| 1014/1014 [00:00<00:00, 12580.53 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 13971.65 examples/s]


In [190]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>'],
 'en_ids': [None, 12, 20, 11, 21, 774, 13, 53, 76, 198, 1308, 1, None],
 'de_ids': [None,
  14,
  22,
  249,
  26,
  80,
  16,
  84,
  3,
  11,
  106,
  7643,
  3167,
  0,
  None]}

In [191]:
en_vocab.lookup_tokens(train_data[0]["en_ids"])

[None,
 'two',
 'young',
 ',',
 'white',
 'males',
 'are',
 'outside',
 'near',
 'many',
 'bushes',
 '.',
 None]

In [192]:
data_type = "torch"
format_columns = ["en_ids", "de_ids"]

train_data = train_data.with_format(
    type = data_type,
    columns=format_columns,
    output_all_columns=True,
)
valid_data = valid_data.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

test_data = test_data.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

In [193]:
train_data[0]

{'en_ids': tensor([       nan, 1.2000e+01, 2.0000e+01, 1.1000e+01, 2.1000e+01, 7.7400e+02,
         1.3000e+01, 5.3000e+01, 7.6000e+01, 1.9800e+02, 1.3080e+03, 1.0000e+00,
                nan]),
 'de_ids': tensor([       nan, 1.4000e+01, 2.2000e+01, 2.4900e+02, 2.6000e+01, 8.0000e+01,
         1.6000e+01, 8.4000e+01, 3.0000e+00, 1.1000e+01, 1.0600e+02, 7.6430e+03,
         3.1670e+03, 0.0000e+00,        nan]),
 'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>']}

Data Loaders

In [194]:
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_en_ids = [example["en_ids"] for example in batch]
        batch_de_ids = [example["de_ids"] for example in batch]
        batch_en_ids = nn.utils.rnn.pad_sequence(batch_en_ids, padding_value=pad_index)
        batch_de_ids = nn.utils.rnn.pad_sequence(batch_de_ids, padding_value=pad_index)
        batch = {
            "en_ids": batch_en_ids,
            "de_ids": batch_de_ids,
        }
        return batch

    return collate_fn

In [195]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle
    )
    return data_loader

In [196]:
batch_size = 64
train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
test_data_loader = get_data_loader(test_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index, shuffle=True)


Building the Model

In [197]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        
    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        
        return hidden, cell
    

In [198]:
class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        input = input.unsequeeze(0)
        embedded = self.dropout(self.embedding(input))
        output , (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

In [199]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions, of encoder and decoder must be equal"
        assert (
            encoder.n_layers == decoder.n_layers
        ), "Encoder and Decoder must have equal number of layers!"
        
    def forward(self, src, trg, teacher_forcing_ratio):
        batch_size = trg.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(self.device)
        hidden, cell = self.encoder(src)
        input = trg[0,:]
        for t in range(1, trg_length):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax()
            input = trg[t] if teacher_force else top1 
        
        return outputs

Training the Model

In [200]:
input_dim = len(de_vocab)
output_dim = len(en_vocab)
encoder_embedding_dim = 256
decoder_embedding_dim = 256
hidden_dim = 512
n_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    hidden_dim,
    n_layers,
    encoder_dropout,
)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    hidden_dim,
    n_layers,
    decoder_dropout,
)

model = Seq2Seq(encoder, decoder, device).to(device)

In [201]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(18665, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
  )
  (decoder): Decoder(
    (embedding): Embedding(5889, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5889, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [202]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"the model has {count_parameters(model):,} trainable parameters")

the model has 16,663,297 trainable parameters


In [203]:
optimizer = optim.Adam(model.parameters())

In [204]:
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

In [205]:
def train_fn(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        src = batch["de_ids"].to(device)
        trg = batch["en_ids"].to(device)
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        optimizer.zero_grad()
        output = model(src, trg, teacher_forcing_ratio)
        # output = [trg length, batch size, trg vocab size]
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        # output = [(trg length - 1) * batch size, trg vocab size]
        trg = trg[1:].view(-1)
        # trg = [(trg length - 1) * batch size]
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [206]:
def evaluate_fn(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            src = batch["de_ids"].to(device)
            trg = batch["en_ids"].to(device)
            # src = [src length, batch size]
            # trg = [trg length, batch size]
            output = model(src, trg, 0)  # turn off teacher forcing
            # output = [trg length, batch size, trg vocab size]
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            # output = [(trg length - 1) * batch size, trg vocab size]
            trg = trg[1:].view(-1)
            # trg = [(trg length - 1) * batch size]
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [208]:
n_epochs = 10
clip = 1.0
teacher_forcing_ratio = 0.5

best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(
        model,
        train_data_loader,
        optimizer,
        criterion,
        clip,
        teacher_forcing_ratio,
        device,
    )
    valid_loss = evaluate_fn(
        model,
        valid_data_loader,
        criterion,
        device,
    )
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "tut1-model.pt")
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

  0%|          | 0/10 [00:00<?, ?it/s]


TypeError: pad_sequence(): argument 'padding_value' (position 3) must be float, not NoneType