In [1]:
import pandas as pd
import numpy as np
import re
import random
from collections import Counter
import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.lang.ar import Arabic
from datasets import load_dataset

if torch.cuda.is_available():
    dev = "cuda:0"
else:
    dev = "cpu"
device = torch.device(dev)

In [None]:
# Load the Helsinki-NLP Arabic-English dataset
dataset = load_dataset('Helsinki-NLP/tatoeba_mt', 'ara-eng')

# Use "validation" as training and "test" as validation (since there is no training split)
df_train = pd.DataFrame(dataset["validation"])
df_valid = pd.DataFrame(dataset["test"])

# Rename columns to match our expected fields: "ar" for Arabic and "eng" for English
df_train = df_train.rename(columns={"sourceString": "ar", "targetString": "eng"})
df_valid = df_valid.rename(columns={"sourceString": "ar", "targetString": "eng"})

# Initialize spacy tokenizers
enNLP = English()
arNLP = Arabic()
enTokenizer = Tokenizer(enNLP.vocab)
arTokenizer = Tokenizer(arNLP.vocab)

def myTokenizerEN(x):
    # Clean and tokenize English text.
    text = re.sub(r"[\.\'\`\"\r\n+]", " ", x.lower())
    text = re.sub(r"\s+", " ", text).strip()
    return [word.text for word in enTokenizer(text)]

def myTokenizerAR(x):
    # Clean and tokenize Arabic text.
    text = re.sub(r"[\.\'\`\"\r\n+]", " ", x.lower())
    text = re.sub(r"\s+", " ", text).strip()
    return [word.text for word in arTokenizer(text)]

# Special tokens
SRC_SPECIALS = ["<pad>", "<unk>", "ببدأ", "نهها"]     # For Arabic, init and eos tokens are given.
TGT_SPECIALS = ["<pad>", "<unk>", "<sos>", "<eos>"]

def build_vocab(texts, tokenizer, min_freq=2, specials=None):
    counter = Counter()
    for text in texts:
        tokens = tokenizer(text)
        counter.update(tokens)
    # Start with the special tokens
    specials = specials if specials is not None else []
    vocab_tokens = specials.copy()
    # Add tokens with frequency >= min_freq
    for tok, freq in counter.items():
        if freq >= min_freq and tok not in vocab_tokens:
            vocab_tokens.append(tok)
    # Create mapping dictionaries
    stoi = {tok: i for i, tok in enumerate(vocab_tokens)}
    itos = {i: tok for tok, i in stoi.items()}
    return type("Vocab", (), {"stoi": stoi, "itos": itos, "size": len(stoi)})

# Build vocabulary on the training set texts
src_vocab = build_vocab(df_train["ar"], myTokenizerAR, min_freq=2, specials=SRC_SPECIALS)
tgt_vocab = build_vocab(df_train["eng"], myTokenizerEN, min_freq=2, specials=TGT_SPECIALS)

print("Size of Arabic (src) vocabulary:", src_vocab.size)
print("Size of English (tgt) vocabulary:", tgt_vocab.size)

#%% [code]
class TranslationDataset(Dataset):
    def __init__(self, df, src_tokenizer, tgt_tokenizer,
                 src_vocab, tgt_vocab,
                 src_init_token="ببدأ", src_eos_token="نهها",
                 tgt_init_token="<sos>", tgt_eos_token="<eos>"):
        self.df = df
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.src_init_token = src_init_token
        self.src_eos_token = src_eos_token
        self.tgt_init_token = tgt_init_token
        self.tgt_eos_token = tgt_eos_token
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        src_text, tgt_text = row["ar"], row["eng"]
        src_tokens = [self.src_init_token] + self.src_tokenizer(src_text) + [self.src_eos_token]
        tgt_tokens = [self.tgt_init_token] + self.tgt_tokenizer(tgt_text) + [self.tgt_eos_token]
        src_indices = [self.src_vocab.stoi.get(tok, self.src_vocab.stoi["<unk>"]) for tok in src_tokens]
        tgt_indices = [self.tgt_vocab.stoi.get(tok, self.tgt_vocab.stoi["<unk>"]) for tok in tgt_tokens]
        return torch.tensor(src_indices), torch.tensor(tgt_indices)

def collate_fn(batch):
    # batch is a list of (src_tensor, tgt_tensor)
    src_batch, tgt_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, padding_value=src_vocab.stoi["<pad>"])
    tgt_batch = pad_sequence(tgt_batch, padding_value=tgt_vocab.stoi["<pad>"])
    return src_batch, tgt_batch

# Create Dataset objects for training and validation
train_dataset = TranslationDataset(df_train, myTokenizerAR, myTokenizerEN, src_vocab, tgt_vocab)
valid_dataset = TranslationDataset(df_valid, myTokenizerAR, myTokenizerEN, src_vocab, tgt_vocab)

BATCH_SIZE = 150
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

#%% [code]
class TranslateTransformer(nn.Module):
    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        max_len,
    ):
        super(TranslateTransformer, self).__init__()
        self.srcEmbeddings = nn.Embedding(src_vocab_size, embedding_size)
        self.trgEmbeddings = nn.Embedding(trg_vocab_size, embedding_size)
        self.srcPositionalEmbeddings = nn.Embedding(max_len, embedding_size)
        self.trgPositionalEmbeddings = nn.Embedding(max_len, embedding_size)
        self.transformer = nn.Transformer(
            embedding_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(0.1)
        self.src_pad_idx = src_pad_idx
        self.max_len = max_len
    
    def make_src_mask(self, src):
        src_mask = src.transpose(0, 1) == self.src_pad_idx
        return src_mask.to(device)
    
    def forward(self, x, trg):
        src_seq_length = x.shape[0]
        batch_size = x.shape[1]
        trg_seq_length = trg.shape[0]
        
        src_positions = torch.arange(0, src_seq_length).unsqueeze(1).expand(src_seq_length, batch_size).to(device)
        trg_positions = torch.arange(0, trg_seq_length).unsqueeze(1).expand(trg_seq_length, batch_size).to(device)
        
        src_embedded = self.srcEmbeddings(x.long()) + self.srcPositionalEmbeddings(src_positions.long())
        trg_embedded = self.trgEmbeddings(trg.long()) + self.trgPositionalEmbeddings(trg_positions.long())
        
        src_embedded = self.dropout(src_embedded)
        trg_embedded = self.dropout(trg_embedded)
        
        src_padding_mask = self.make_src_mask(x)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(device)
        
        out = self.transformer(
            src_embedded,
            trg_embedded,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask
        )
        out = self.fc_out(out)
        return out

Size of Arabic (src) vocabulary: 8520
Size of English (tgt) vocabulary: 4902


In [3]:
num_heads = 8
num_encoder_layers = 6
num_decoder_layers = 6
max_len = 256
embedding_size = 256
src_pad_idx = src_vocab.stoi["<pad>"]

model = TranslateTransformer(
    embedding_size,
    src_vocab.size,
    tgt_vocab.size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    max_len
).to(device)



In [None]:
from tqdm import tqdm
import numpy as np
import torch.optim as optim
import torch.nn as nn

EPOCHS = 30  
optimizer = optim.Adam(model.parameters(), lr=0.0003) 
criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab.stoi["<pad>"])

loss_track = []
loss_validation_track = []

for epoch in range(EPOCHS):
    model.train()
    stepLoss = []
    
    train_loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} Training", leave=True, dynamic_ncols=True, mininterval=1.0)
    for i, (src_batch, tgt_batch) in enumerate(train_loop):
        input_sentence = src_batch.to(device)
        trg = tgt_batch.to(device)
        
        optimizer.zero_grad()
        output = model(input_sentence, trg[:-1])
        output = output.reshape(-1, tgt_vocab.size)
        trg_target = trg[1:].reshape(-1)
        loss = criterion(output, trg_target)
        loss.backward()
        optimizer.step()
        stepLoss.append(loss.item())

        if i % 10 == 0:  
            train_loop.set_postfix(loss=f"{np.mean(stepLoss[-10:]):.4f}")
        
    epoch_train_loss = np.mean(stepLoss)
    loss_track.append(epoch_train_loss)
    print(f"Epoch {epoch+1} Train Loss: {epoch_train_loss:.4f}")

    model.eval()
    stepValidLoss = []
    with torch.no_grad():
        for src_batch, tgt_batch in valid_loader: 
            input_sentence = src_batch.to(device)
            trg = tgt_batch.to(device)
            output = model(input_sentence, trg[:-1])
            output = output.reshape(-1, tgt_vocab.size)
            trg_target = trg[1:].reshape(-1)
            loss = criterion(output, trg_target)
            stepValidLoss.append(loss.item())

    epoch_valid_loss = np.mean(stepValidLoss)
    loss_validation_track.append(epoch_valid_loss)
    print(f"Epoch {epoch+1} Validation Loss: {epoch_valid_loss:.4f}")

In [33]:
checkpoint = {
    "epoch": EPOCHS,
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "loss_track": loss_track,
    "loss_validation_track": loss_validation_track,
    "src_vocab_tokens": [src_vocab.itos[i] for i in range(src_vocab.size)],
    "tgt_vocab_tokens": [tgt_vocab.itos[i] for i in range(tgt_vocab.size)],
    "config": {
        "embedding_size": embedding_size,
        "num_heads": num_heads,
        "num_encoder_layers": num_encoder_layers,
        "num_decoder_layers": num_decoder_layers,
        "max_len": max_len,
        "src_pad_idx": src_vocab.stoi["<pad>"],
    }
}

save_path = "model_checkpoint.pt"
torch.save(checkpoint, save_path)
print(f"Checkpoint saved successfully at {save_path}!")

Checkpoint saved successfully at model_checkpoint.pt!


Epoch 1/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.70it/s, loss=5.3249]


Epoch 1 Train Loss: 5.8562
Epoch 1 Validation Loss: 5.0934


Epoch 2/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.77it/s, loss=4.8504]


Epoch 2 Train Loss: 5.0339
Epoch 2 Validation Loss: 4.5430


Epoch 3/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.87it/s, loss=4.4770]


Epoch 3 Train Loss: 4.5809
Epoch 3 Validation Loss: 4.3002


Epoch 4/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.83it/s, loss=4.2122]


Epoch 4 Train Loss: 4.2997
Epoch 4 Validation Loss: 4.0506


Epoch 5/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.76it/s, loss=3.9830]


Epoch 5 Train Loss: 4.0796
Epoch 5 Validation Loss: 3.8936


Epoch 6/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.83it/s, loss=3.8574]


Epoch 6 Train Loss: 3.8901
Epoch 6 Validation Loss: 3.7783


Epoch 7/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.87it/s, loss=3.7403]


Epoch 7 Train Loss: 3.7164
Epoch 7 Validation Loss: 3.6732


Epoch 8/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.73it/s, loss=3.5273]


Epoch 8 Train Loss: 3.5525
Epoch 8 Validation Loss: 3.5701


Epoch 9/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.76it/s, loss=3.3905]


Epoch 9 Train Loss: 3.3952
Epoch 9 Validation Loss: 3.5016


Epoch 10/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.84it/s, loss=3.3209]


Epoch 10 Train Loss: 3.2490
Epoch 10 Validation Loss: 3.4336


Epoch 11/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.83it/s, loss=3.0832]


Epoch 11 Train Loss: 3.1056
Epoch 11 Validation Loss: 3.3672


Epoch 12/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.84it/s, loss=3.0463]


Epoch 12 Train Loss: 2.9606
Epoch 12 Validation Loss: 3.3430


Epoch 13/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.79it/s, loss=2.7853]


Epoch 13 Train Loss: 2.8190
Epoch 13 Validation Loss: 3.2722


Epoch 14/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.91it/s, loss=2.6664]


Epoch 14 Train Loss: 2.6815
Epoch 14 Validation Loss: 3.2731


Epoch 15/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.88it/s, loss=2.5231]


Epoch 15 Train Loss: 2.5430
Epoch 15 Validation Loss: 3.2204


Epoch 16/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.87it/s, loss=2.3901]


Epoch 16 Train Loss: 2.4057
Epoch 16 Validation Loss: 3.1889


Epoch 17/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.79it/s, loss=2.3449]


Epoch 17 Train Loss: 2.2729
Epoch 17 Validation Loss: 3.1707


Epoch 18/30 Training: 100%|██████████| 131/131 [00:21<00:00,  5.95it/s, loss=2.1678]


Epoch 18 Train Loss: 2.1385
Epoch 18 Validation Loss: 3.1608


Epoch 19/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.83it/s, loss=2.0596]


Epoch 19 Train Loss: 2.0074
Epoch 19 Validation Loss: 3.1316


Epoch 20/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.83it/s, loss=1.9194]


Epoch 20 Train Loss: 1.8754
Epoch 20 Validation Loss: 3.1481


Epoch 21/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.77it/s, loss=1.7860]


Epoch 21 Train Loss: 1.7476
Epoch 21 Validation Loss: 3.1241


Epoch 22/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.80it/s, loss=1.6566]


Epoch 22 Train Loss: 1.6317
Epoch 22 Validation Loss: 3.1262


Epoch 23/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.71it/s, loss=1.5498]


Epoch 23 Train Loss: 1.5108
Epoch 23 Validation Loss: 3.1538


Epoch 24/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.86it/s, loss=1.4060]


Epoch 24 Train Loss: 1.3907
Epoch 24 Validation Loss: 3.1793


Epoch 25/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.74it/s, loss=1.3772]


Epoch 25 Train Loss: 1.2932
Epoch 25 Validation Loss: 3.1688


Epoch 26/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.78it/s, loss=1.2470]


Epoch 26 Train Loss: 1.1872
Epoch 26 Validation Loss: 3.2189


Epoch 27/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.76it/s, loss=1.1523]


Epoch 27 Train Loss: 1.0870
Epoch 27 Validation Loss: 3.2490


Epoch 28/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.73it/s, loss=1.0697]


Epoch 28 Train Loss: 0.9948
Epoch 28 Validation Loss: 3.2686


Epoch 29/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.73it/s, loss=0.9497]


Epoch 29 Train Loss: 0.9197
Epoch 29 Validation Loss: 3.3104


Epoch 30/30 Training: 100%|██████████| 131/131 [00:22<00:00,  5.86it/s, loss=0.8942]


Epoch 30 Train Loss: 0.8442
Epoch 30 Validation Loss: 3.3411
Checkpoint saved successfully at model_checkpoint.pt!


In [None]:
# Function to rebuild the Vocab object from an ordered token list.
def load_vocab_from_tokens(token_list):
    Vocab = type("Vocab", (), {})  # recreating the simple vocab type
    vocab = Vocab()
    vocab.stoi = {token: i for i, token in enumerate(token_list)}
    vocab.itos = {i: token for i, token in enumerate(token_list)}
    vocab.size = len(token_list)
    return vocab

# Load the checkpoint
checkpoint = torch.load("model_checkpoint.pt", map_location=device)
config = checkpoint["config"]

# Reassemble vocabulary objects using the stored token lists.
src_vocab = load_vocab_from_tokens(checkpoint["src_vocab_tokens"])
tgt_vocab = load_vocab_from_tokens(checkpoint["tgt_vocab_tokens"])

# Re-create the model architecture using the configuration from the checkpoint.
model = TranslateTransformer(
    config["embedding_size"],
    src_vocab.size,
    tgt_vocab.size,
    config["src_pad_idx"],
    config["num_heads"],
    config["num_encoder_layers"],
    config["num_decoder_layers"],
    config["max_len"]
).to(device)

# Load model and optimizer states.
model.load_state_dict(checkpoint["model_state_dict"])
optimizer = optim.Adam(model.parameters())  # Recreate optimizer if needed
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

loss_track = checkpoint["loss_track"]
loss_validation_track = checkpoint["loss_validation_track"]

model.eval()
print("Checkpoint loaded successfully!")

  checkpoint = torch.load("model_checkpoint.pt", map_location=device)


Checkpoint loaded successfully!




In [None]:
def translate(model, sentence, src_tokenizer, src_vocab, tgt_vocab, max_len=200):
    model.eval()
    # Prepare source sentence by adding init and eos tokens
    tokens = ["ببدأ"] + src_tokenizer(sentence) + ["نهها"]
    src_indices = [src_vocab.stoi.get(tok, src_vocab.stoi["<unk>"]) for tok in tokens]
    src_tensor = torch.tensor(src_indices).unsqueeze(1).to(device)  # [src_len, 1]

    trg_tokens = ["<sos>"]
    for _ in range(max_len):
        trg_indices = [tgt_vocab.stoi.get(tok, tgt_vocab.stoi["<unk>"]) for tok in trg_tokens]
        trg_tensor = torch.tensor(trg_indices).unsqueeze(1).to(device)  # [trg_len, 1]
        output = model(src_tensor, trg_tensor)
        next_token_idx = output.argmax(dim=2)[-1].item()
        next_token = tgt_vocab.itos[next_token_idx]
        if next_token == "<eos>":
            break
        trg_tokens.append(next_token)
    # Return the prediction without the <sos> token
    return " ".join(trg_tokens[1:])

# Helper function to convert tensor indices back to text (dropping special tokens)
def indices_to_sentence(indices, vocab, remove_tokens):
    tokens = [vocab.itos[idx] for idx in indices if idx not in remove_tokens]
    return " ".join(tokens)

import random
# Pick a random sample from the validation dataset
sample_idx = random.randint(0, len(valid_dataset) - 1)
src_sample, tgt_sample = valid_dataset[sample_idx]

# Convert the tensor indices into sentences
src_sentence = indices_to_sentence(
    src_sample.tolist(), 
    src_vocab, 
    remove_tokens=[src_vocab.stoi["ببدأ"], src_vocab.stoi["نهها"], src_vocab.stoi["<pad>"]]
)
ground_truth = indices_to_sentence(
    tgt_sample.tolist(),
    tgt_vocab, 
    remove_tokens=[tgt_vocab.stoi["<sos>"], tgt_vocab.stoi["<eos>"], tgt_vocab.stoi["<pad>"]]
)

predicted_translation = translate(model, src_sentence, myTokenizerAR, src_vocab, tgt_vocab)

print("Source (Arabic):", src_sentence)
print("Predicted Translation (English):", predicted_translation)
print("Ground Truth Translation (English):", ground_truth)

def translate_sample_or_custom(model, src_vocab, tgt_vocab, src_tokenizer, valid_dataset, custom_sentence=None):
    if custom_sentence:
        # Translate the user-provided sentence
        predicted_translation = translate(model, custom_sentence, src_tokenizer, src_vocab, tgt_vocab)
        print("\nCustom Input Translation:")
        print("Source (Arabic):", custom_sentence)
        print("Predicted Translation (English):", predicted_translation)
    else:
        # Pick a random sample from the validation dataset
        sample_idx = random.randint(0, len(valid_dataset) - 1)
        src_sample, tgt_sample = valid_dataset[sample_idx]

        # Convert tensor indices to sentence
        src_sentence = indices_to_sentence(
            src_sample.tolist(), 
            src_vocab, 
            remove_tokens=[src_vocab.stoi["ببدأ"], src_vocab.stoi["نهها"], src_vocab.stoi["<pad>"]]
        )
        ground_truth = indices_to_sentence(
            tgt_sample.tolist(),
            tgt_vocab, 
            remove_tokens=[tgt_vocab.stoi["<sos>"], tgt_vocab.stoi["<eos>"], tgt_vocab.stoi["<pad>"]]
        )

        predicted_translation = translate(model, src_sentence, src_tokenizer, src_vocab, tgt_vocab)

        print("\nRandom Sample Translation:")
        print("Source (Arabic):", src_sentence)
        print("Predicted Translation (English):", predicted_translation)
        print("Ground Truth Translation (English):", ground_truth)

# Usage
custom_input = "من أنت"

translate_sample_or_custom(model, src_vocab, tgt_vocab, myTokenizerAR, valid_dataset, custom_sentence=custom_input)

Source (Arabic): أعد الكتاب إلى مكانه
Predicted Translation (English): give the book back to the owner
Ground Truth Translation (English): put the book back where it was

Custom Input Translation:
Source (Arabic): من أنت
Predicted Translation (English): understood
