In [6]:
!pip install datasets
!pip install tokenizers

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting tokenizers
  Using cached tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Using cached tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl (2.9 MB)
Installing collected packages: tokenizers
Successfully installed tokenizers-0.22.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pi

In [18]:
import torch
import torch.nn as nn
from datasets import load_dataset
from torch.utils.data import DataLoader


device = 'cpu'
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print("Device: ", device)


valid_set_, test_set_ = load_dataset(
    path="ageron/tatoeba_mt_train", name="eng-spa",
    split=["validation", "test"])


split = valid_set_.train_test_split(train_size=0.8, seed=42)
train_set, valid_set = split["train"], split["test"]


train_set[0]


Device:  mps


{'source_text': 'Tom tried to break up the fight.',
 'target_text': 'Tom trat√≥ de disolver la pelea.',
 'source_lang': 'eng',
 'target_lang': 'spa'}

In [7]:
import tokenizers
def train_eng_spa():
    for pair in train_set:
        yield pair["source_text"]
        yield pair["target_text"]


max_length = 256
vocab_size = 10000
tokenizer_model = tokenizers.models.BPE(unk_token="[UNK]") # tokenizer lib is from huggingface
tokenizer = tokenizers.Tokenizer(tokenizer_model)
tokenizer.enable_padding(pad_id=0, pad_token="<pad>")
tokenizer.enable_truncation(max_length=max_length)
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Whitespace()
tokenizer_trainer = tokenizers.trainers.BpeTrainer(
    vocab_size=vocab_size,
    special_tokens=["<unk>", "<pad>", "<s>", "</s>"]
)
tokenizer.train_from_iterator(train_eng_spa(), trainer=tokenizer_trainer)








In [10]:
tokenizer.encode("i like soccer").ids

[72, 401, 4381]

In [11]:
tokenizer.encode("<s> Me gusta el futbol").ids

[2, 396, 582, 219, 376, 3075]

In [13]:
from collections import namedtuple

fields = ["src_token_ids", "src_mask", "tgt_token_ids", "tgt_mask"]

class NmtPair(namedtuple("NmtPair", fields)):
    def to(self, device):
        return NmtPair(self.src_token_ids.to(device), self.src_mask.to(device), self.tgt_token_ids.to(device), self.tgt_mask.to(device))


In [17]:
def data_loader_fn(batch):
    src_txt = [pair["source_text"] for pair in batch]
    tgt_txt = [f"<s> {pair['target_text']}</s>" for pair in batch]
    src_encodings = tokenizer.encode_batch(src_txt)
    tgt_encodings = tokenizer.encode_batch(tgt_txt)
    src_token_ids = torch.tensor([enc.ids for enc in src_encodings])
    src_mask = torch.tensor([enc.attention_mask for enc in src_encodings])
    tgt_token_ids = torch.tensor([enc.ids for enc in tgt_encodings])
    tgt_mask = torch.tensor([enc.attention_mask for enc in tgt_encodings])
    inputs = NmtPair(src_token_ids, src_mask, tgt_token_ids, tgt_mask)
    labels = tgt_token_ids[:, 1:]
    return inputs, labels

batch_size = 32
train_loader = DataLoader(train_set, batch_size=batch_size, collate_fn=data_loader_fn)
valid_loader = DataLoader(valid_set, batch_size=batch_size, collate_fn=data_loader_fn)
test_loader = DataLoader(test_set_, batch_size=batch_size, collate_fn=data_loader_fn)


In [37]:
from torch.nn.utils.rnn import pack_padded_sequence
class Model(nn.Module):
    def __init__(self, vocab_size, embed_dim=512, pad_id=0, hidden_dim=512, n_layers=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_id)
        self.encoder = nn.GRU(embed_dim, hidden_dim, n_layers, batch_first=True)
        self.decoder = nn.GRU(embed_dim, hidden_dim, n_layers, batch_first=True)
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, pair):
        src_embed = self.embed(pair.src_token_ids)
        tgt_embed = self.embed(pair.tgt_token_ids)
        src_lengths = pair.src_mask.sum(dim=1)
        src_packed = pack_padded_sequence(
            src_embed, lengths=src_lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, hidden_states = self.encoder(src_packed)
        outputs, _ = self.decoder(tgt_embed, hidden_states)
        return self.out(outputs).permute(0, 2, 1)


torch.manual_seed(42)
vocab_size = tokenizer.get_vocab_size()
model = Model(vocab_size).to(device)

In [38]:
x = nn.CrossEntropyLoss(ignore_index=0)

In [40]:
def translate(model, src_txt, max_len=100, pad_id=0, sos_id=2, eos_id=3):
    tgt_txt = ""
    token_ids = []
    for index in range(max_length):
        batch, _ = data_loader_fn([{"source_text": src_txt,
                                    "target_text": tgt_txt}])
        with torch.no_grad():
            Y_logits = model(batch.to(device))
            Y_token_ids = Y_logits.argmax(dim=1)  # find the best token IDs
            next_token_id = Y_token_ids[0, index]  # take the last token ID

        next_token = tokenizer.id_to_token(next_token_id)
        tgt_txt += " " + next_token
        if next_token_id == eos_id:
            break
        return tgt_txt

model.eval()
translate(model, "i like soccer")

' Frank'