In [4]:
!pip install datasets
!pip install tokenizers


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m


In [8]:
import torch
device = 'cpu'
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print("Device: ", device)

Device:  mps


In [9]:
from datasets import load_dataset

nmt_valid, nmt_test = load_dataset(
    path = "ageron/tatoeba_mt_train", name="eng-spa", split=["validation", "test"]
)

split  = nmt_valid.train_test_split(train_size=0.8, seed=42)
train_set, valid_set = split["train"], split["test"]

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
train_set[0]

{'source_text': 'Tom tried to break up the fight.',
 'target_text': 'Tom trató de disolver la pelea.',
 'source_lang': 'eng',
 'target_lang': 'spa'}

In [11]:
import tokenizers
#lets train a bpe tokenizer
def train_eng_spa():
    for pair in train_set:
        yield pair["source_text"]
        yield pair["target_text"]


max_length = 256
vocab_size =10000

tokenizer_model = tokenizers.models.BPE(unk_token="<unk>")

tokenizer = tokenizers.Tokenizer(tokenizer_model)
tokenizer.enable_padding(pad_id=0, pad_token="<pad>")
tokenizer.enable_truncation(max_length=max_length)
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Whitespace()

tokenizer_trainer = tokenizers.trainers.BpeTrainer(
    vocab_size=vocab_size,
    special_tokens=["<pad>", "<unk>", "<s>", "</s>"]
)

tokenizer.train_from_iterator(train_eng_spa(), trainer=tokenizer_trainer)







In [12]:
#Now let’s create a small utility class that will hold tokenized English texts (i.e., the source token ID sequences), along with the corresponding tokenized Spanish targets (i.e., the target token ID sequences), plus the corresponding attention masks. For this, we can create a namedtuple base class (i.e., a tuple with named fields), and extend it to add a to() method, which will make it easy to move all these tensors to the GPU

from collections import namedtuple
from torch.utils.data import DataLoader

fields = ["source_ids", "source_mask", "target_ids", "target_mask"]

class NmtPair(namedtuple("NmtPairBase", fields)):
    def to(self, device):
        return NmtPair(self.source_ids.to(device), self.source_mask.to(device), self.target_ids.to(device), self.target_mask.to(device))


def nmt_collate_fn(batch):
    src_texts = [pair['source_text'] for pair in batch]
    tgt_texts = [f"<s> {pair['target_text']} </s>" for pair in batch]
    src_encodings = tokenizer.encode_batch(src_texts)
    tgt_encodings = tokenizer.encode_batch(tgt_texts)
    src_ids = torch.tensor([enc.ids for enc in src_encodings])
    tgt_token_ids = torch.tensor([enc.ids for enc in tgt_encodings])
    src_mask = torch.tensor([enc.attention_mask for enc in src_encodings])
    tgt_mask = torch.tensor([enc.attention_mask for enc in tgt_encodings])

    inputs = NmtPair(src_ids, src_mask, tgt_token_ids[:,:-1], tgt_mask[:,:-1])
    labels = tgt_token_ids[:, 1:]
    return inputs, labels


batch_size = 32

train_loader = DataLoader(train_set, batch_size=batch_size, collate_fn=nmt_collate_fn, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=batch_size, collate_fn=nmt_collate_fn)
test_loader = DataLoader(nmt_test, batch_size=batch_size, collate_fn=nmt_collate_fn)


In [13]:
#now lets build our model
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence


class NmtModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=512, pad_id=0, hidden_dim=512, n_layers=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_id)
        self.encoder = nn.GRU(embed_dim, hidden_dim, n_layers, batch_first=True)
        self.decoder = nn.GRU(embed_dim, hidden_dim, n_layers, batch_first=True)
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, pair):
        src_embed = self.embed(pair.source_ids)
        tgt_embed = self.embed(pair.target_ids)
        src_lengths = pair.source_mask.sum(dim=1)
        src_packed = pack_padded_sequence(src_embed, src_lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, hidden_states = self.encoder(src_packed)
        outputs, _ = self.decoder(tgt_embed, hidden_states)
        return self.out(outputs).permute(0, 2, 1)
        

In [None]:
# NMT Model with Attention Mechanism (Bahdanau-style)
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class Attention(nn.Module):
    """Bahdanau (additive) attention mechanism"""
    def __init__(self, hidden_dim):
        super().__init__()
        self.W_enc = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.W_dec = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.v = nn.Linear(hidden_dim, 1, bias=False)
    
    def forward(self, encoder_outputs, decoder_hidden, source_mask):
        # encoder_outputs: (batch, src_len, hidden_dim)
        # decoder_hidden: (batch, hidden_dim)
        # source_mask: (batch, src_len)
        
        src_len = encoder_outputs.size(1)
        
        # Project encoder outputs and decoder hidden state
        enc_proj = self.W_enc(encoder_outputs)  # (batch, src_len, hidden_dim)
        dec_proj = self.W_dec(decoder_hidden).unsqueeze(1)  # (batch, 1, hidden_dim)
        
        # Compute attention scores
        energy = torch.tanh(enc_proj + dec_proj)  # (batch, src_len, hidden_dim)
        attention_scores = self.v(energy).squeeze(-1)  # (batch, src_len)
        
        # Mask out padding positions
        attention_scores = attention_scores.masked_fill(source_mask == 0, float('-inf'))
        
        # Softmax to get attention weights
        attention_weights = F.softmax(attention_scores, dim=1)  # (batch, src_len)
        
        # Compute context vector as weighted sum of encoder outputs
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs)  # (batch, 1, hidden_dim)
        context = context.squeeze(1)  # (batch, hidden_dim)
        
        return context, attention_weights


class NmtModelWithAttention(nn.Module):
    def __init__(self, vocab_size, embed_dim=512, pad_id=0, hidden_dim=512, n_layers=2):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_id)
        self.encoder = nn.GRU(embed_dim, hidden_dim, n_layers, batch_first=True)
        self.attention = Attention(hidden_dim)
        # Decoder input now includes context vector concatenated with embedding
        self.decoder = nn.GRU(embed_dim + hidden_dim, hidden_dim, n_layers, batch_first=True)
        self.out = nn.Linear(hidden_dim * 2, vocab_size)  # Takes decoder output + context
    
    def forward(self, pair):
        batch_size = pair.source_ids.size(0)
        tgt_len = pair.target_ids.size(1)
        
        # Encode source sequence
        src_embed = self.embed(pair.source_ids)
        src_lengths = pair.source_mask.sum(dim=1)
        src_packed = pack_padded_sequence(src_embed, src_lengths.cpu(), batch_first=True, enforce_sorted=False)
        encoder_outputs, hidden_states = self.encoder(src_packed)
        encoder_outputs, _ = pad_packed_sequence(encoder_outputs, batch_first=True)  # (batch, src_len, hidden_dim)
        
        # Prepare target embeddings
        tgt_embed = self.embed(pair.target_ids)  # (batch, tgt_len, embed_dim)
        
        # Decode with attention step by step
        outputs = []
        decoder_hidden = hidden_states
        
        for t in range(tgt_len):
            # Get current target embedding
            tgt_t = tgt_embed[:, t:t+1, :]  # (batch, 1, embed_dim)
            
            # Compute attention using top layer of decoder hidden state
            context, attn_weights = self.attention(
                encoder_outputs, 
                decoder_hidden[-1],  # Use top layer hidden state
                pair.source_mask
            )
            
            # Concatenate target embedding with context vector
            decoder_input = torch.cat([tgt_t, context.unsqueeze(1)], dim=2)  # (batch, 1, embed_dim + hidden_dim)
            
            # Decoder step
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
            
            # Combine decoder output with context for final prediction
            combined = torch.cat([decoder_output.squeeze(1), context], dim=1)  # (batch, hidden_dim * 2)
            output = self.out(combined)  # (batch, vocab_size)
            outputs.append(output)
        
        # Stack outputs: (batch, vocab_size, tgt_len) for CrossEntropyLoss
        outputs = torch.stack(outputs, dim=2)
        return outputs


In [14]:
import torch

torch.manual_seed(42)
vocab_size = tokenizer.get_vocab_size()
model = NmtModel(vocab_size)
model = model.to(device)

loss = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

def train(model, loss, optimizer, train_loader, epochs):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            y_pred = model(x)
            l = loss(y_pred, y)
            total_loss += l.item()
            l.backward()
            optimizer.step()
            optimizer.zero_grad()
        
        mean_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss:.4f}")


train(model, loss, optimizer, train_loader, epochs=10)

KeyboardInterrupt: 

In [15]:
def translate(model, src_text, max_length=20, pad_id=0, eos_id=3):
    tgt_text = ""
    token_ids = []
    for index in range(max_length):
        batch, _ = nmt_collate_fn([{"source_text": src_text,
                                    "target_text": tgt_text}])
        with torch.no_grad():
            Y_logits = model(batch.to(device))
            Y_token_ids = Y_logits.argmax(dim=1)  # find the best token IDs
            next_token_id = Y_token_ids[0, index]  # take the last token ID

        next_token = tokenizer.id_to_token(next_token_id)
        tgt_text += next_token
        if next_token_id == eos_id:
            break
        return tgt_text

In [16]:
model.eval()
print(translate(model, "I was eating a pizza"))

<pad>
