In [41]:
import pandas as pd
import numpy as np
#%pip install tokenizers
import torch
import torch.nn as nn
import torch.nn.functional as F
%matplotlib inline
from torch.utils.data import Dataset, DataLoader
from functools import partial
from sklearn.model_selection import train_test_split
!pip install bpemb
from bpemb import BPEmb



In [42]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 32
block_size = 100
random_seed = 42

## Tokenizer Initialization

In [68]:
en_tokenizer = BPEmb(lang = 'en' , vs = 10000, dim = 50 , add_pad_emb=True)
es_tokenizer = BPEmb(lang = 'es' , vs = 10000, dim = 50 , add_pad_emb=True)

In [143]:
en_tokenizer.encode("Hello world, this is a really long sentence")

['‚ñÅhel',
 'lo',
 '‚ñÅworld',
 ',',
 '‚ñÅthis',
 '‚ñÅis',
 '‚ñÅa',
 '‚ñÅreally',
 '‚ñÅlong',
 '‚ñÅsentence']

In [146]:
en_tokenizer.decode_ids('hel')

['h', 'e', 'l']

In [142]:
en_tokenizer.decode([1, 908, 1418, 501, 9934, 215, 80, 4, 5920, 769, 8018, 2])

['',
 'hel',
 'lo',
 'world',
 ',',
 'this',
 'is',
 'a',
 'really',
 'long',
 'sentence',
 '']

In [70]:
BOS_IDX = en_tokenizer.BOS
EOS_IDX = en_tokenizer.EOS
PAD_IDX = en_tokenizer.vs
UNK_IDX = 0
VOCAB_SIZE = en_tokenizer.vs

## Dataset Split

In [46]:
output_file = "spa.txt"
with open(output_file, 'r' , encoding='utf-8') as f:
    lines = f.readlines()

len(lines)
lines = [line.split('\t') for line in lines]
lines = ['\t'.join(line[:2]) for line in lines]

In [47]:
train_lines, val_test_lines = train_test_split(lines, test_size=0.2, random_state=random_seed, shuffle=True)
val_lines, test_lines = train_test_split(val_test_lines, test_size=0.5, random_state=random_seed, shuffle=True)

In [48]:
print(len(train_lines))
print(len(val_lines))
print(len(test_lines))

113234
14154
14155


In [49]:
train_lines[1]

"I don't think I have one of those yet.\tCreo que a√∫n no tengo uno de esos."

In [50]:
class SentencePairDataset(Dataset):
    def __init__(self, lines,src_tokenizer , tgt_tokenizer):
        super(SentencePairDataset, self).__init__()

        self.lines = lines
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, idx):
        line = self.lines[idx]

        src , tgt = line.split('\t')
        src_tokens = self.src_tokenizer.encode_ids_with_bos_eos(src)
        tgt_tokens = self.tgt_tokenizer.encode_ids_with_bos_eos(tgt)

        return src_tokens, tgt_tokens

In [51]:
train_ds = SentencePairDataset(train_lines,en_tokenizer,es_tokenizer)
val_ds = SentencePairDataset(val_lines,en_tokenizer,es_tokenizer)
test_ds = SentencePairDataset(test_lines,en_tokenizer,es_tokenizer)

In [14]:

train_lengths = [len(x[0]) for x in train_ds]
val_lengths = [len(x[0]) for x in val_ds]
test_lengths = [len(x[0]) for x in test_ds]


def compute_length_stats(lengths, name):
    if not lengths:
        print(f"{name}: Dataset is empty!")
        return

    lengths = np.array(lengths)
    print(f"üìä {name} Token Length Stats:")
    print(f"   Max Length: {lengths.max()}")
    print(f"   99th Percentile: {np.percentile(lengths, 99):.2f}")
    print(f"   95th Percentile: {np.percentile(lengths, 95):.2f}")
    print(f"   Median Length: {np.median(lengths)}")
    print(f"   Mean Length: {lengths.mean():.2f}\n")

# Compute stats for each dataset
compute_length_stats(train_lengths, "Train")
compute_length_stats(val_lengths, "Validation")
compute_length_stats(test_lengths, "Test")


üìä Train Token Length Stats:
   Max Length: 90
   99th Percentile: 21.00
   95th Percentile: 17.00
   Median Length: 10.0
   Mean Length: 10.68

üìä Validation Token Length Stats:
   Max Length: 53
   99th Percentile: 21.00
   95th Percentile: 17.00
   Median Length: 10.0
   Mean Length: 10.69

üìä Test Token Length Stats:
   Max Length: 39
   99th Percentile: 21.00
   95th Percentile: 16.00
   Median Length: 10.0
   Mean Length: 10.67



## Creating Dataset Batches

In [52]:
max_seq_len = 100

def collate_fn(batch):
    batch_size = len(batch)
    srcs, tgts = zip(*batch)
    src_vectors = torch.zeros((batch_size, max_seq_len), dtype=torch.long, device=device)
    tgt_vectors = torch.zeros((batch_size, max_seq_len), dtype=torch.long, device=device)

    for i in range(batch_size):
        src_vectors[i] = torch.tensor((srcs[i] + [PAD_IDX] * (max_seq_len - len(srcs[i])))[:max_seq_len], dtype=torch.long, device=device)
        tgt_vectors[i] = torch.tensor((tgts[i] + [PAD_IDX] * (max_seq_len - len(tgts[i])))[:max_seq_len], dtype=torch.long, device=device)

    return src_vectors, tgt_vectors

In [53]:
train_dataloader = DataLoader(train_ds, batch_size=64, shuffle=True, collate_fn=partial(collate_fn))
val_dataloader = DataLoader(val_ds, batch_size=64, shuffle=True, collate_fn=partial(collate_fn))
test_dataloader = DataLoader(test_ds, batch_size=64, shuffle=True, collate_fn=partial(collate_fn))

In [54]:
for src , tgt in train_dataloader:
    print(src)
    print(tgt)
    break

tensor([[    1,  2088,    80,  ..., 10000, 10000, 10000],
        [    1,  1220,   597,  ..., 10000, 10000, 10000],
        [    1,   386,  4599,  ..., 10000, 10000, 10000],
        ...,
        [    1,   326,  9937,  ..., 10000, 10000, 10000],
        [    1,    83,  3048,  ..., 10000, 10000, 10000],
        [    1,    83,  9937,  ..., 10000, 10000, 10000]], device='cuda:0')
tensor([[    1,   772,   224,  ..., 10000, 10000, 10000],
        [    1,   163,  3542,  ..., 10000, 10000, 10000],
        [    1,   185,  4423,  ..., 10000, 10000, 10000],
        ...,
        [    1,  1292,   327,  ..., 10000, 10000, 10000],
        [    1,   185,  4833,  ..., 10000, 10000, 10000],
        [    1,   923,   520,  ..., 10000, 10000, 10000]], device='cuda:0')


## Model Definition

In [18]:
max_iters = 5000
eval_interval = 100
learning_rate = 3e-4
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
eval_iters = 200
n_embd = 512
n_head = 4
n_layer = 4
dropout = 0.1
import math

In [19]:
class MultiHeadAttention(nn.Module):
    def __init__(self, n_embd, n_head):
        super(MultiHeadAttention, self).__init__()
        assert n_embd % n_head == 0, "n_embd must be divisible by n_head"
        self.n_embd = n_embd
        self.n_head = n_head
        self.d_k = n_embd // n_head

        self.W_q = nn.Linear(n_embd, n_embd)
        self.W_k = nn.Linear(n_embd, n_embd)
        self.W_v = nn.Linear(n_embd, n_embd)
        self.W_o = nn.Linear(n_embd, n_embd)

    def scaled_attention(self, Q, K, V, mask=None):
        wei = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            wei = wei.masked_fill(mask == 0, -1e9)

        wei = torch.softmax(wei, dim=-1)
        output = torch.matmul(wei, V)
        return output

    def split_heads(self, x):
        batch_size, seq_len, n_embd = x.size()
        return x.reshape(batch_size, seq_len, self.n_head, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        batch_size, _, seq_len, d_k = x.size()
        return x.transpose(1, 2).reshape(batch_size, seq_len, self.n_embd)

    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        attn_output = self.scaled_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))
        return output

In [20]:
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [21]:
class PositionalEncoding(nn.Module):
    def __init__(self, n_embd, block_size):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(block_size, n_embd, device=device)
        position = torch.arange(0, block_size, dtype=torch.float, device=device).unsqueeze(1)
        div_term = torch.pow(10_000, (-torch.arange(0, n_embd, 2, device=device).float() / n_embd))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        return self.register_buffer('pe', pe.unsqueeze(0))


    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

In [22]:
class EncoderLayer(nn.Module):
    def __init__(self, n_embd, n_head, dropout):
        super(EncoderLayer, self).__init__()

        self.self_attn = MultiHeadAttention(n_embd, n_head)
        self.feed_forward = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        x = self.ln1(x + self.dropout(self.self_attn(x, x, x, mask)))
        x = self.ln2(x + self.feed_forward(x))
        return x

In [23]:
class DecoderLayer(nn.Module):
    def __init__(self, n_embd, n_head, dropout):
        super(DecoderLayer, self).__init__()

        self.self_attn = MultiHeadAttention(n_embd, n_head)
        self.cross_attn = MultiHeadAttention(n_embd, n_head)
        self.feed_forward = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        self.ln3 = nn.LayerNorm(n_embd)
        self.dropout = nn.Dropout(dropout)


    def forward(self, x, enc_output, src_mask, tgt_mask):
        x = self.ln1(x + self.dropout(self.self_attn(x, x, x, tgt_mask)))
        x = self.ln2(x + self.dropout(self.cross_attn(x, enc_output, enc_output, src_mask)))
        x = self.ln3(x + self.feed_forward(x))
        return x

In [24]:

class Transformer(nn.Module):
    def __init__(self):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(VOCAB_SIZE + 1, n_embd , padding_idx= VOCAB_SIZE)
        self.decoder_embedding = nn.Embedding(VOCAB_SIZE + 1, n_embd , padding_idx= VOCAB_SIZE)
        self.positional_encoding = PositionalEncoding(n_embd, block_size)

        self.encoder_layers = nn.ModuleList([EncoderLayer(n_embd, n_head, dropout) for _ in range(n_layer)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(n_embd, n_head, dropout) for _ in range(n_layer)])

        self.fc = nn.Linear(n_embd, VOCAB_SIZE + 1)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length, device=device), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)

        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

In [25]:
model = Transformer().to(device)
print(sum(p.numel() for p in model.parameters()) / 1e6, 'M parameters')

59.510033 M parameters


In [27]:
import time

# Loss function
criterion = nn.CrossEntropyLoss(ignore_index=VOCAB_SIZE)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-9)

# Learning Rate Scheduler
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)

num_epochs = 3
print_freq = 200

for epoch in range(num_epochs):
    start_time = time.time()
    model.train()

    total_train_loss = 0
    num_train_batches = len(train_dataloader)

    print(f"\nEpoch {epoch + 1}/{num_epochs}\n" + "-------")

    for idx, (src_data, tgt_data) in enumerate(train_dataloader):
        optimizer.zero_grad()

        output = model(src_data, tgt_data[:, :-1])
        loss = criterion(output.contiguous().view(-1, VOCAB_SIZE + 1), tgt_data[:, 1:].contiguous().view(-1))

        total_train_loss += loss.item()

        # Backpropagation
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        if idx % print_freq == 0:
            print(f"Batch {idx}/{num_train_batches} | Training Loss: {loss.item():.4f}")

    avg_train_loss = total_train_loss / num_train_batches
    print(f"Epoch {epoch+1} Training Loss: {avg_train_loss:.4f}")

    # Validation phase
    model.eval()
    total_val_loss = 0
    num_val_batches = len(val_dataloader)

    with torch.no_grad():
        for idx, (src_data, tgt_data) in enumerate(val_dataloader):
            output = model(src_data, tgt_data[:, :-1])
            loss = criterion(output.contiguous().view(-1, VOCAB_SIZE + 1), tgt_data[:, 1:].contiguous().view(-1))
            total_val_loss += loss.item()

            if idx % 50 == 0:
                print(f"Validation Batch {idx}/{num_val_batches} | Validation Loss: {loss.item():.4f}")

    avg_val_loss = total_val_loss / num_val_batches
    print(f"Epoch {epoch+1} Validation Loss: {avg_val_loss:.4f}")

    # Adjust learning rate
    scheduler.step()

    # Save model
    torch.save(model.state_dict(), f'/content/drive/MyDrive/model/model_epoch_{epoch+1}.pth')

    elapsed_time = time.time() - start_time
    print(f"Epoch {epoch+1} completed in {elapsed_time:.2f} seconds\n")


Epoch 1/3
-------
Batch 0/1770 | Training Loss: 9.4260
Batch 200/1770 | Training Loss: 4.6705
Batch 400/1770 | Training Loss: 4.2067
Batch 600/1770 | Training Loss: 3.6103
Batch 800/1770 | Training Loss: 3.5125
Batch 1000/1770 | Training Loss: 3.3236
Batch 1200/1770 | Training Loss: 3.0222
Batch 1400/1770 | Training Loss: 3.0409
Batch 1600/1770 | Training Loss: 2.5775
Epoch 1 Training Loss: 3.6201
Validation Batch 0/222 | Validation Loss: 2.2980
Validation Batch 50/222 | Validation Loss: 2.5430
Validation Batch 100/222 | Validation Loss: 2.3318
Validation Batch 150/222 | Validation Loss: 2.6572
Validation Batch 200/222 | Validation Loss: 2.2523
Epoch 1 Validation Loss: 2.4552
Epoch 1 completed in 1059.65 seconds


Epoch 2/3
-------
Batch 0/1770 | Training Loss: 2.3404
Batch 200/1770 | Training Loss: 2.5484
Batch 400/1770 | Training Loss: 2.1712
Batch 600/1770 | Training Loss: 2.5689
Batch 800/1770 | Training Loss: 2.3823
Batch 1000/1770 | Training Loss: 2.0719
Batch 1200/1770 | Traini

In [28]:
model.eval()
with torch.no_grad():
    for data in test_dataloader:
        src_data, tgt_data = data
        output = model(src_data, tgt_data[:, :-1])
        loss = criterion(output.contiguous().view(-1, VOCAB_SIZE + 1), tgt_data[:, 1:].contiguous().view(-1))
        print(f"Test Loss: {loss.item()}")

Test Loss: 1.3822786808013916
Test Loss: 1.4997037649154663
Test Loss: 1.413204550743103
Test Loss: 1.5136460065841675
Test Loss: 1.3749380111694336
Test Loss: 1.3223001956939697
Test Loss: 1.4119950532913208
Test Loss: 1.6161937713623047
Test Loss: 1.5821583271026611
Test Loss: 1.5933396816253662
Test Loss: 1.3498424291610718
Test Loss: 1.4229718446731567
Test Loss: 1.6453138589859009
Test Loss: 1.6780678033828735
Test Loss: 1.5500739812850952
Test Loss: 1.3732645511627197
Test Loss: 1.3493176698684692
Test Loss: 1.803459644317627
Test Loss: 1.560603380203247
Test Loss: 1.315798044204712
Test Loss: 1.5363688468933105
Test Loss: 1.453052282333374
Test Loss: 1.8032400608062744
Test Loss: 1.4793950319290161
Test Loss: 1.4715509414672852
Test Loss: 1.682931900024414
Test Loss: 1.370986819267273
Test Loss: 1.5008587837219238
Test Loss: 1.4300965070724487
Test Loss: 1.5177602767944336
Test Loss: 1.4353270530700684
Test Loss: 1.8620750904083252
Test Loss: 1.3934544324874878
Test Loss: 1.5801

In [29]:
model_path = "/content/drive/MyDrive/model/model_epoch_3.pth"
state_dict = torch.load(model_path)

transformer_loaded = Transformer().to(device)
transformer_loaded.load_state_dict(state_dict)

  state_dict = torch.load(model_path)


<All keys matched successfully>

In [201]:
def restore_word_boundaries(tokens):
    """Restores word boundaries from tokenized IDs and replaces `_` with spaces."""
    decoded_tokens = [es_tokenizer.emb.index_to_key[i] for i in tokens if i < es_tokenizer.vs]

    # Replace "_" with space and properly join words
    print(decoded_tokens)

def translate(src):
    src_tokens = en_tokenizer.encode_ids_with_bos_eos(src)
    tgt_tokens = [BOS_IDX]

    src_vectors = torch.tensor((src_tokens + [PAD_IDX] * (max_seq_len - len(src_tokens)))[:max_seq_len], dtype=torch.long, device=device).unsqueeze(0)

    for i in range(max_seq_len):
        tgt_vectors = torch.tensor((tgt_tokens + [PAD_IDX] * (max_seq_len - len(tgt_tokens)))[:max_seq_len], dtype=torch.long, device=device).unsqueeze(0)
        output = transformer_loaded(src_vectors, tgt_vectors)
        idx = torch.argmax(nn.functional.softmax(output, dim=2)[0][i]).item()
        tgt_tokens.append(idx)

        if idx == EOS_IDX:
            break

    return restore_word_boundaries(tgt_tokens[1 : -1])

In [202]:
translate("I am a teacher")

['‚ñÅso', 'y', '‚ñÅprofesor', '.']


In [190]:
es_tokenizer.decode_ids(195)

'so'

In [170]:
en_tokenizer.encode_ids('_')

[9912, 9976]

In [179]:
es_tokenizer.decode(es_tokenizer.encode("Hola Amigo"))

'hola amigo'

In [192]:
es_tokenizer.emb.index_to_key[195]

'‚ñÅso'