In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from Transformer import Transformer


In [14]:
from transformers import AutoTokenizer

# Pretrained Hugging Face tokenizers: PhoBERT for the Vietnamese target side,
# cased BERT for the English source side.
tokenizer_trg = AutoTokenizer.from_pretrained("vinai/phobert-base")
tokenizer_src = AutoTokenizer.from_pretrained("bert-base-cased")

# The translation code below relies on three special tokens being defined on
# every tokenizer: padding, start-of-sequence and end-of-sequence.
# Register a default string for each one that the tokenizer does not define yet.
for t in (tokenizer_src, tokenizer_trg):
    special_tokens = {
        name: default
        for name, default in (
            ('pad_token', '<pad>'),
            ('bos_token', '<sos>'),
            ('eos_token', '<eos>'),
        )
        if getattr(t, name) is None
    }
    if special_tokens:
        t.add_special_tokens(special_tokens)

# Now “wrap” the tokenizers into an object that provides the expected attributes.
class TokenizerWrapper:
    """Adapter exposing a tokenizer through ``vocab.stoi`` / ``vocab.itos`` lookups.

    Attributes:
        tokenizer: the wrapped tokenizer object (must provide ``get_vocab()``
            and ``tokenize(text)``).
        vocab: holder with two dictionaries:
            ``stoi`` (token string -> index) and ``itos`` (index -> token string).
    """

    class _Vocab:
        """Plain attribute holder for the ``stoi`` / ``itos`` mappings.

        A named class is used instead of an anonymous ``type('', (), {})``
        so that instances have a meaningful repr and can be pickled.
        """

    def __init__(self, tokenizer):
        """Store the tokenizer and build both vocabulary mappings from ``get_vocab()``."""
        self.tokenizer = tokenizer
        self.vocab = self._Vocab()
        # token string -> index, as reported by the tokenizer.
        self.vocab.stoi = tokenizer.get_vocab()
        # Reverse mapping: index -> token string.
        self.vocab.itos = {idx: tok for tok, idx in self.vocab.stoi.items()}

    def preprocess(self, text):
        """Split ``text`` into token strings using the wrapped tokenizer's ``tokenize``."""
        return self.tokenizer.tokenize(text)

# Wrap the source (English) and target (Vietnamese) tokenizers so the rest of
# the notebook can use the vocab.stoi / vocab.itos interface.
SRC, TRG = (TokenizerWrapper(tok) for tok in (tokenizer_src, tokenizer_trg))


In [15]:
import torch
from torch.utils.data import Dataset, DataLoader

class ParallelENVIDataset(Dataset):
    """Line-aligned parallel corpus yielding (source_ids, target_ids) tensor pairs.

    Line ``i`` of ``src_file`` is paired with line ``i`` of ``trg_file``.
    Sentences are tokenized lazily in ``__getitem__``.
    """

    def __init__(self, src_file, trg_file, src_wrapper, trg_wrapper, max_src_length=160, max_trg_length=160):
        """
        Args:
            src_file: path to the UTF-8 source text file, one sentence per line.
            trg_file: path to the UTF-8 target text file, one sentence per line.
            src_wrapper: wrapper for the source side; must provide
                ``preprocess(text)``, ``vocab.stoi`` and ``tokenizer``.
            trg_wrapper: same as ``src_wrapper`` for the target side; its tokenizer
                must define ``bos_token`` and ``eos_token`` present in ``vocab.stoi``.
            max_src_length and max_trg_length: Maximum allowed number of tokens for
                source and target sequences (a falsy value disables truncation).
                The target limit is applied before the start/end tokens are added.

        Raises:
            AssertionError: if the two files do not have the same number of lines.
            KeyError: if a required special token is missing from the vocabulary.
        """
        with open(src_file, 'r', encoding='utf-8') as f:
            self.src_sentences = [line.strip() for line in f]
        with open(trg_file, 'r', encoding='utf-8') as f:
            self.trg_sentences = [line.strip() for line in f]

        assert len(self.src_sentences) == len(self.trg_sentences), \
            "Source and target files must have the same number of lines!"

        self.src_wrapper = src_wrapper
        self.trg_wrapper = trg_wrapper
        self.max_src_length = max_src_length
        self.max_trg_length = max_trg_length

        # Resolve special-token ids once instead of on every __getitem__ call.
        self._src_oov_id = self._oov_id(src_wrapper)
        self._trg_oov_id = self._oov_id(trg_wrapper)
        trg_stoi = trg_wrapper.vocab.stoi
        self._sos_id = trg_stoi[trg_wrapper.tokenizer.bos_token]
        self._eos_id = trg_stoi[trg_wrapper.tokenizer.eos_token]

    @staticmethod
    def _oov_id(wrapper):
        """Return the id used for tokens missing from ``wrapper.vocab.stoi``.

        Prefers the tokenizer's unknown token. Mapping unknown words to the pad
        id would make them indistinguishable from padding, so the pad id is only
        used as a last resort when no unknown token is available.
        """
        stoi = wrapper.vocab.stoi
        unk_token = getattr(wrapper.tokenizer, 'unk_token', None)
        if unk_token is not None and unk_token in stoi:
            return stoi[unk_token]
        return stoi[wrapper.tokenizer.pad_token]

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.src_sentences)

    def __getitem__(self, idx):
        """Returns a tuple (source_idx_tensor, target_idx_tensor) of LongTensors.

        The target sequence is wrapped as ``[bos] + tokens + [eos]``; the source
        sequence is returned without extra special tokens.
        """
        # Tokenize the text
        src_tokens = self.src_wrapper.preprocess(self.src_sentences[idx])
        trg_tokens = self.trg_wrapper.preprocess(self.trg_sentences[idx])

        # Truncate tokens if length exceeds maximum allowed
        if self.max_src_length:
            src_tokens = src_tokens[:self.max_src_length]
        if self.max_trg_length:
            trg_tokens = trg_tokens[:self.max_trg_length]

        # Convert tokens to indices; tokens missing from the vocabulary map to
        # the unknown-token id (see _oov_id).
        src_stoi = self.src_wrapper.vocab.stoi
        trg_stoi = self.trg_wrapper.vocab.stoi
        src_indices = [src_stoi.get(tok, self._src_oov_id) for tok in src_tokens]
        trg_indices = [trg_stoi.get(tok, self._trg_oov_id) for tok in trg_tokens]

        # Surround the target with start/end tokens for teacher forcing.
        trg_indices = [self._sos_id] + trg_indices + [self._eos_id]

        return torch.LongTensor(src_indices), torch.LongTensor(trg_indices)


In [16]:
def _pad_to_longest(sequences, pad_id):
    """Right-pad 1-D tensors with ``pad_id`` and return them as one 2-D tensor."""
    longest = max(len(seq) for seq in sequences)
    # new_full keeps the dtype and device of the input sequences.
    padded = sequences[0].new_full((len(sequences), longest), pad_id)
    for row, seq in enumerate(sequences):
        padded[row, :len(seq)] = seq
    return padded


def collate_fn(batch, src_pad_id=None, trg_pad_id=None):
    """
    Collate (src_indices, trg_indices) pairs into two padded batch tensors.

    batch: list of (src_indices, trg_indices) from ParallelENVIDataset.
    src_pad_id / trg_pad_id: padding ids for each side. When omitted they are
        looked up from the notebook-level SRC / TRG wrappers, which keeps
        ``DataLoader(..., collate_fn=collate_fn)`` working unchanged; pass them
        explicitly (e.g. via functools.partial) to use other vocabularies.

    Returns (padded_src, padded_trg), each of shape [batch_size, longest_in_batch].
    Raises ValueError if ``batch`` is empty.
    """
    if not batch:
        raise ValueError("collate_fn received an empty batch")

    if src_pad_id is None:
        src_pad_id = SRC.vocab.stoi[SRC.tokenizer.pad_token]
    if trg_pad_id is None:
        trg_pad_id = TRG.vocab.stoi[TRG.tokenizer.pad_token]

    src_batch, trg_batch = zip(*batch)
    return _pad_to_longest(src_batch, src_pad_id), _pad_to_longest(trg_batch, trg_pad_id)


In [17]:

# Paths to the line-aligned English (source) and Vietnamese (target) training files.
train_en, train_vi = "train.en", "train.vi"

# Dataset arguments in order: source file, target file, source wrapper, target wrapper.
train_dataset = ParallelENVIDataset(train_en, train_vi, SRC, TRG)

# Batches are reshuffled on every pass and padded per batch by collate_fn.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=30,
    shuffle=True,
    collate_fn=collate_fn,
)


In [18]:
import torch
import torch.nn as nn
import torch.optim as optim

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Vocabulary sizes and padding ids come from the wrapped tokenizers.
src_vocab_size, trg_vocab_size = len(SRC.vocab.stoi), len(TRG.vocab.stoi)
src_pad_idx, trg_pad_idx = SRC.tokenizer.pad_token_id, TRG.tokenizer.pad_token_id

# Positional arguments follow the keyword order used by the Transformer constructor:
# vocab sizes and pad ids first, then the architecture hyper-parameters.
model = Transformer(
    src_vocab_size=src_vocab_size,
    trg_vocab_size=trg_vocab_size,
    src_pad_idx=src_pad_idx,
    trg_pad_idx=trg_pad_idx,
    embed_size=512,
    num_layers=6,
    forward_expansion=4,
    heads=8,
    dropout_probability=0.1,
    device=device,
    position_max_length=200,
)
model = model.to(device)



In [19]:
# Using the Scheduled Optimizer instead of the default Adam
class ScheduledOptim():
    '''Wraps an optimizer and sets its learning rate before every step.

    After the step counter is incremented, the rate applied to every
    parameter group is
    init_lr * d_model ** -0.5 * min(n_steps ** -0.5, n_steps * n_warmup_steps ** -1.5).
    '''

    # Scalar fields saved by state_dict() and restored by load_state_dict(), in order.
    _SCALAR_FIELDS = ('init_lr', 'd_model', 'n_warmup_steps', 'n_steps')

    def __init__(self, optimizer, init_lr, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.init_lr = init_lr
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        # Number of update steps taken so far.
        self.n_steps = 0

    def step_and_update_lr(self):
        "Refresh the learning rate, then step the inner optimizer"
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        "Clear gradients through the inner optimizer"
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        "Scale factor for the current step (n_steps must be at least 1)"
        model_term = self.d_model ** -0.5
        decay_term = self.n_steps ** (-0.5)
        warmup_term = self.n_steps * self.n_warmup_steps ** (-1.5)
        return model_term * min(decay_term, warmup_term)

    def state_dict(self):
        "Snapshot of the schedule fields plus the inner optimizer state"
        snapshot = {}
        for field in self._SCALAR_FIELDS:
            snapshot[field] = getattr(self, field)
        snapshot['_optimizer'] = self._optimizer.state_dict()
        return snapshot

    def load_state_dict(self, state_dict):
        "Restore the schedule fields and the inner optimizer from a snapshot"
        for field in self._SCALAR_FIELDS:
            setattr(self, field, state_dict[field])
        self._optimizer.load_state_dict(state_dict['_optimizer'])

    def _update_learning_rate(self):
        ''' Advance the step counter and write the new rate to every param group '''
        self.n_steps += 1
        new_lr = self.init_lr * self._get_lr_scale()
        for group in self._optimizer.param_groups:
            group['lr'] = new_lr
            
            
# Inner Adam optimizer. The lr given here is only a placeholder: ScheduledOptim
# overwrites the rate of every parameter group before each step.
adam = optim.Adam(
    model.parameters(),
    lr=1e-4,
    betas=(0.9, 0.98),
    eps=1e-9,
)
optimizer = ScheduledOptim(
    adam,
    init_lr=0.2,
    d_model=512,
    n_warmup_steps=4000,
)
# Cross-entropy over target tokens; positions holding the padding id are ignored.
criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)

In [12]:
import torch

num_epochs = 10                     # number of passes over the training set
best_loss = float("inf")            # lowest average epoch loss seen so far
checkpoint_path = "best_model.pth"  # where the best weights are written

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_idx, (src_batch, trg_batch) in enumerate(train_loader):
        src_batch, trg_batch = src_batch.to(device), trg_batch.to(device)

        # Teacher forcing: the decoder is fed the target without its last token
        # and is trained to predict the target without its first token.
        decoder_input = trg_batch[:, :-1]
        ground_truth = trg_batch[:, 1:].reshape(-1)

        # Flatten the [batch, trg_len - 1, vocab] outputs to [batch * (trg_len - 1), vocab]
        # so they line up with the flattened ground truth.
        outputs = model(src_batch, decoder_input)
        outputs = outputs.reshape(-1, outputs.shape[2])

        loss = criterion(outputs, ground_truth)

        # Backward pass, then optimizer step with the scheduled learning rate.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step_and_update_lr()

        total_loss += loss.item()

        if batch_idx % 1000 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx}/{len(train_loader)}], Loss: {loss.item():.4f}")

    avg_loss = total_loss / len(train_loader)
    print(f"*** Epoch [{epoch+1}] Completed; Avg Loss: {avg_loss:.4f}")

    # Keep only the weights of the epoch with the lowest average training loss.
    if avg_loss < best_loss:
        best_loss = avg_loss
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Model saved with avg loss {avg_loss:.4f} at epoch {epoch+1}")


Epoch [1/10], Step [0/4444], Loss: 11.2357


Token indices sequence length is longer than the specified maximum sequence length for this model (757 > 512). Running this sequence through the model will result in indexing errors


Epoch [1/10], Step [1000/4444], Loss: 6.1666
Epoch [1/10], Step [2000/4444], Loss: 5.7099
Epoch [1/10], Step [3000/4444], Loss: 5.1315
Epoch [1/10], Step [4000/4444], Loss: 4.7252
*** Epoch [1] Completed; Avg Loss: 5.8618
Model saved with avg loss 5.8618 at epoch 1
Epoch [2/10], Step [0/4444], Loss: 4.5423
Epoch [2/10], Step [1000/4444], Loss: 4.6055
Epoch [2/10], Step [2000/4444], Loss: 4.2364
Epoch [2/10], Step [3000/4444], Loss: 3.9192
Epoch [2/10], Step [4000/4444], Loss: 4.3977
*** Epoch [2] Completed; Avg Loss: 4.2651
Model saved with avg loss 4.2651 at epoch 2
Epoch [3/10], Step [0/4444], Loss: 3.6234
Epoch [3/10], Step [1000/4444], Loss: 3.7069
Epoch [3/10], Step [2000/4444], Loss: 3.5824
Epoch [3/10], Step [3000/4444], Loss: 3.5833
Epoch [3/10], Step [4000/4444], Loss: 3.7638
*** Epoch [3] Completed; Avg Loss: 3.7958
Model saved with avg loss 3.7958 at epoch 3
Epoch [4/10], Step [0/4444], Loss: 3.7781
Epoch [4/10], Step [1000/4444], Loss: 3.5151
Epoch [4/10], Step [2000/4444],

In [13]:
from IPython.display import FileLink

# Display a link to the saved checkpoint file as this cell's output.
FileLink(r'best_model.pth')


In [None]:
# You can load my pretrained model from the link provided in the README.
# NOTE(review): torch.load unpickles the checkpoint file, so only load files from a
# source you trust; consider passing weights_only=True on PyTorch versions that support it.
model.load_state_dict(torch.load("best_model.pth", map_location=device))
model.eval()  # set the model to evaluation mode

In [None]:
import torch.nn.functional as F

# --------------------------------------------------------
# 5. Define or import your translation functions:
def nopeak_mask(size, device):
    """Build a boolean look-ahead mask of shape (1, size, size).

    Entry [0, i, j] is True when j <= i (position i may attend to position j)
    and False for every later position j > i.
    """
    # Strict upper triangle marks the "future" positions...
    future_positions = torch.triu(torch.ones((1, size, size), device=device), diagonal=1).bool()
    # ...and the mask is its complement.
    return ~future_positions

def greedy_decode(src, model, SRC, TRG, device, max_len):
    """
    Greedy decoding for generating a translation of a single source sentence.

    Args:
        src: LongTensor of source token ids with a leading batch dimension of 1.
        model: object exposing ``encoder(src, src_mask)`` and
            ``decoder(trg, encoder_output, src_mask, trg_mask)``; the decoder
            output is indexed as [batch, position, vocab].
        SRC, TRG: source / target wrappers providing ``tokenizer`` and ``vocab``.
        device: device on which the decoded sequence is built.
        max_len: maximum length of the decoded sequence, counting the start token
            (so at most ``max_len - 1`` tokens are generated).

    Returns:
        The generated target tokens joined by single spaces, without the start
        token and without the end token (if one was produced). Empty string when
        nothing is generated.

    Raises:
        ValueError: if no start or end token id can be found for the target side.
    """
    # Use the target tokenizer's bos/eos ids if available, otherwise fall back to the vocab.
    init_tok = TRG.tokenizer.bos_token_id
    if init_tok is None:
        init_tok = TRG.vocab.stoi.get('<sos>')
    if init_tok is None:
        raise ValueError("No valid start token found in target tokenizer/vocab.")

    # Resolved before the loop so it is always bound, even when max_len <= 1.
    eos_tok = TRG.tokenizer.eos_token_id
    if eos_tok is None:
        eos_tok = TRG.vocab.stoi.get('<eos>')
    if eos_tok is None:
        raise ValueError("No valid end token found in target tokenizer/vocab.")

    # Source positions holding the padding id are masked out.
    src_mask = (src != SRC.vocab.stoi[SRC.tokenizer.pad_token]).unsqueeze(-2)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        encoder_output = model.encoder(src, src_mask)
        outputs = torch.LongTensor([[init_tok]]).to(device)

        for _ in range(1, max_len):
            trg_mask = nopeak_mask(outputs.size(1), device)
            out = model.decoder(outputs, encoder_output, src_mask, trg_mask)

            # Highest-scoring token at the last time step. Softmax is monotonic,
            # so taking argmax on the raw scores selects the same token.
            next_token = torch.argmax(out[:, -1, :], dim=-1).unsqueeze(0)
            outputs = torch.cat([outputs, next_token], dim=1)

            if next_token.item() == eos_tok:
                break

    # Index the single batch row (instead of squeeze) so a length-1 sequence
    # still yields a list; then drop the start token and a trailing end token.
    token_ids = outputs[0].tolist()[1:]
    if token_ids and token_ids[-1] == eos_tok:
        token_ids = token_ids[:-1]

    # NOTE(review): tokens are joined verbatim; subword markers, if the target
    # tokenizer produces any, are not merged here.
    translation = ' '.join([TRG.vocab.itos[tok] for tok in token_ids])
    return translation
def translate_sentence(sentence, model, SRC, TRG, device, max_len):
    """Translate one raw source sentence with greedy decoding.

    Args:
        sentence: source text.
        model: translation model; switched to eval mode before decoding.
        SRC, TRG: source / target wrappers providing ``preprocess``,
            ``tokenizer`` and ``vocab``.
        device: device on which the source tensor is placed.
        max_len: maximum decoded length, passed through to ``greedy_decode``.

    Returns:
        The string returned by ``greedy_decode``.
    """
    model.eval()

    # Look everything up through the SRC argument rather than notebook globals.
    # Tokens missing from the vocabulary map to the unknown token when the
    # tokenizer has one; the pad id is only a last-resort fallback, since a
    # padded position would be masked out of the source.
    stoi = SRC.vocab.stoi
    unk_token = getattr(SRC.tokenizer, 'unk_token', None)
    if unk_token is not None and unk_token in stoi:
        fallback_id = stoi[unk_token]
    else:
        fallback_id = stoi[SRC.tokenizer.pad_token]

    tokens = SRC.preprocess(sentence)
    indexed = [stoi.get(tok, fallback_id) for tok in tokens]
    # Leading batch dimension of 1.
    src_tensor = torch.LongTensor([indexed]).to(device)
    translation = greedy_decode(src_tensor, model, SRC, TRG, device, max_len)
    return translation

  model.load_state_dict(torch.load("best_model.pth", map_location=device))


In [23]:
# Upper bound on the decoded sequence length.
max_len = 50
# Sample English sentence to translate.
source_sentence = (
    "I really liked the part about challenging yourself even after achieving success. "
    "It shifted something in me."
)

# Translate the sample and show the result.
translation = translate_sentence(
    sentence=source_sentence,
    model=model,
    SRC=SRC,
    TRG=TRG,
    device=device,
    max_len=max_len,
)
print("Translation:", translation)

Translation: Tôi thực sự thích phần của thử thách thậm chí sau khi thành công . Nó làm tôi cảm thấy gì đó trong tôi .
