<a href="https://colab.research.google.com/github/hanguyenai/sudo-code-nlp/blob/main/06_transformer_machine_translation/EVBCorpus_English_Vietnamese_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1 Importing dependencies

In [None]:
!pip install sacrebleu sentencepiece torchtext rarfile

Collecting sacrebleu
  Using cached sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting torchtext
  Using cached torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting rarfile
  Using cached rarfile-4.2-py3-none-any.whl.metadata (4.4 kB)
Collecting portalocker (from sacrebleu)
  Using cached portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Using cached colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Using cached sacrebleu-2.5.1-py3-none-any.whl (104 kB)
Using cached torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl (2.0 MB)
Using cached rarfile-4.2-py3-none-any.whl (29 kB)
Using cached colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Using cached portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: rarfile, portalocker, colorama, sacrebleu, torchtext
Successfully installed colorama-0.4.6 portalocker-3.2.0 rarfile-4.2 sacrebleu-2.5.1 torchtext-0.18.0


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import math
import random
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import re
from tqdm import tqdm
import sacrebleu
from pathlib import Path
import pickle
import os
import urllib.request
from pathlib import Path
import rarfile
import xml.etree.ElementTree as ET
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
# Special tokens
PAD = '<pad>'
UNK = '<unk>'
BOS = '<sos>'
EOS = '<eos>'

# Set random seeds for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
!git clone https://github.com/qhungngo/EVBCorpus.git

Cloning into 'EVBCorpus'...
remote: Enumerating objects: 35, done.[K
remote: Total 35 (delta 0), reused 0 (delta 0), pack-reused 35 (from 1)[K
Receiving objects: 100% (35/35), 35.37 MiB | 45.85 MiB/s, done.
Resolving deltas: 100% (10/10), done.


In [None]:
def parse_sgml_file(sgml_path):
    """Parse a single SGML file and extract English-Vietnamese sentence pairs"""
    try:
        # Read file content
        with open(sgml_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Parse with BeautifulSoup (handles SGML/XML-like formats)
        soup = BeautifulSoup(content, 'html.parser')

        pairs = []

        # Find all sentence pairs
        for spair in soup.find_all('spair'):
            en_text = ""
            vi_text = ""

            # Get all <s> tags in this spair
            s_tags = spair.find_all('s')

            for s in s_tags:
                s_id = s.get('id', '')
                text = s.get_text(strip=True)

                # Identify English vs Vietnamese by id prefix
                if s_id.startswith('en'):
                    en_text = text
                elif s_id.startswith('vn'):
                    vi_text = text

            # Only add if both sentences exist and are non-empty
            if en_text and vi_text:
                pairs.append({
                    'english': en_text,
                    'vietnamese': vi_text
                })

        return pairs

    except Exception as e:
        print(f"⚠️  Error parsing {os.path.basename(sgml_path)}: {e}")
        return []

In [None]:
def load_evbcorpus_to_dataframe(extract_dir):
    """Load all SGML files from EVBCorpus and create pandas DataFrame"""
    print("\n📖 Parsing SGML files...")

    # Find all SGML/XML files
    sgml_files = []
    for root, dirs, files in os.walk(extract_dir):
        for f in files:
            if f.endswith('.xml') or f.endswith('.sgml'):
                sgml_files.append(os.path.join(root, f))

    if not sgml_files:
        print("⚠️  No SGML/XML files found!")
        print(f"Searched in: {extract_dir}")
        print("\nTrying to find all files...")
        all_files = []
        for root, dirs, files in os.walk(extract_dir):
            all_files.extend([os.path.join(root, f) for f in files[:5]])
        print(f"Found files: {all_files[:10]}")
        raise FileNotFoundError("No SGML/XML files found in extracted directory")

    print(f"Found {len(sgml_files)} SGML/XML files")

    # Parse all SGML files
    all_pairs = []
    from tqdm import tqdm

    for sgml_file in tqdm(sgml_files, desc="Parsing SGML"):
        pairs = parse_sgml_file(sgml_file)
        all_pairs.extend(pairs)

    if not all_pairs:
        raise ValueError("No sentence pairs found! Check SGML file format.")

    # Create DataFrame
    df = pd.DataFrame(all_pairs)

    # Remove duplicates
    df = df.drop_duplicates()

    print(f"\n✓ Loaded {len(df):,} sentence pairs")
    print(f"  Unique pairs: {len(df):,}")
    print(f"\nDataFrame Info:")
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {df.columns.tolist()}")

    # Show statistics
    print(f"\nSentence Length Statistics:")
    df['en_words'] = df['english'].str.split().str.len()
    df['vi_words'] = df['vietnamese'].str.split().str.len()
    print(f"  English words: mean={df['en_words'].mean():.1f}, max={df['en_words'].max()}")
    print(f"  Vietnamese words: mean={df['vi_words'].mean():.1f}, max={df['vi_words'].max()}")

    # Drop temporary columns
    df = df.drop(['en_words', 'vi_words'], axis=1)

    return df

In [None]:
rar_path = "/content/EVBCorpus/EVBCorpus_EVBNews_v2.0.rar"
out_dir = "/content/evbcorpus_data/EVBCorpus_v2"
os.makedirs(out_dir, exist_ok=True)

with rarfile.RarFile(rar_path) as rf:
    # rf.printdir()  # xem list file
    rf.extractall(path=out_dir)

In [None]:
# Download and parse corpus
print("=" * 80)
print("EVBCorpus v2.0 - SGML Parser")
print("=" * 80)

extract_dir = out_dir
corpus_df = load_evbcorpus_to_dataframe(extract_dir)

# Display sample data
print("\n📊 Sample data from DataFrame:")
print("=" * 80)
display_df = corpus_df.head(5).copy()
display_df['english'] = display_df['english'].str[:60] + '...'
display_df['vietnamese'] = display_df['vietnamese'].str[:60] + '...'
print(display_df.to_string(index=True))
print("\n" + "=" * 80)

In [None]:
def preprocess_sentence(w, max_length=None):
    """Preprocess a sentence (TensorFlow/Keras style)"""
    w = w.lower().strip()

    # Add space around punctuation
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)

    # Replace multiple spaces with single space
    w = re.sub(r'\s+', ' ', w)
    w = w.strip()

    # Truncate to max_length if specified
    if max_length:
        w = " ".join(w.split()[:max_length])

    # Add start and end tokens
    w = '{} {} {}'.format(BOS, w, EOS)

    return w

In [None]:
def display_samples(inp_lines, targ_lines, num_of_pairs=5):
    """Display sample data pairs"""
    pairs = list(zip(inp_lines[:num_of_pairs], targ_lines[:num_of_pairs]))

    print('=' * 70)
    print('SAMPLE DATA')
    print('=' * 70)

    for i, (inp, targ) in enumerate(pairs):
        print(f'\n--> Sample {i + 1}:')
        print(f'    Input:  {inp}')
        print(f'    Target: {targ}')

    print('\n' + '=' * 70)

In [None]:
def load_data_from_dataframe(df, max_samples=None, max_length=100):
    """Load and preprocess data from pandas DataFrame"""
    print(f"\n📖 Loading data from DataFrame...")
    print(f"  Total pairs: {len(df):,}")

    # Get English and Vietnamese sentences
    en_sentences = df['english'].tolist()
    vi_sentences = df['vietnamese'].tolist()

    print(f"\n🧹 Preprocessing sentences...")
    en_preprocessed = []
    vi_preprocessed = []

    for en, vi in tqdm(zip(en_sentences, vi_sentences), total=len(en_sentences), desc="Processing"):
        en_prep = preprocess_sentence(en, max_length=max_length)
        vi_prep = preprocess_sentence(vi, max_length=max_length)

        # Filter out empty sentences
        if len(en_prep.split()) > 2 and len(vi_prep.split()) > 2:  # More than BOS+EOS
            en_preprocessed.append(en_prep)
            vi_preprocessed.append(vi_prep)

    print(f"✓ After preprocessing: {len(en_preprocessed):,} sentences")

    if max_samples:
        en_preprocessed = en_preprocessed[:max_samples]
        vi_preprocessed = vi_preprocessed[:max_samples]
        print(f"✓ Using {max_samples:,} samples")

    # Display samples
    display_samples(en_preprocessed, vi_preprocessed, num_of_pairs=3)

    return en_preprocessed, vi_preprocessed

In [None]:
class Tokenizer:
    """Tokenizer similar to TensorFlow/Keras Tokenizer"""
    def __init__(self, num_words=None, oov_token='<unk>'):
        self.num_words = num_words
        self.oov_token = oov_token

        self.word_index = {}  # word -> index mapping
        self.index_word = {}  # index -> word mapping
        self.word_counts = Counter()

        # Initialize with special tokens
        self.word_index = {
            PAD: 0,
            BOS: 1,
            EOS: 2,
            UNK: 3
        }
        self.index_word = {v: k for k, v in self.word_index.items()}

    def fit_on_texts(self, texts):
        """Build vocabulary from texts"""
        print(f"🔧 Building vocabulary...")

        # Count words
        for text in tqdm(texts, desc="Counting words"):
            words = text.split()
            self.word_counts.update(words)

        # Build word_index based on frequency
        sorted_words = sorted(self.word_counts.items(), key=lambda x: x[1], reverse=True)

        # Start from index 4 (after special tokens)
        idx = 4
        for word, count in sorted_words:
            if word not in self.word_index:
                if self.num_words is None or idx < self.num_words:
                    self.word_index[word] = idx
                    self.index_word[idx] = word
                    idx += 1

        vocab_size = len(self.word_index)
        print(f"✓ Vocabulary size: {vocab_size:,}")
        print(f"  Total unique words: {len(self.word_counts):,}")
        if self.num_words:
            print(f"  Kept top {self.num_words:,} words")

    def texts_to_sequences(self, texts):
        """Convert texts to sequences of integers"""
        sequences = []
        for text in texts:
            words = text.split()
            sequence = []
            for word in words:
                if word in self.word_index:
                    sequence.append(self.word_index[word])
                else:
                    sequence.append(self.word_index[UNK])
            sequences.append(sequence)
        return sequences

    def sequences_to_texts(self, sequences):
        """Convert sequences of integers back to texts"""
        texts = []
        for sequence in sequences:
            words = []
            for idx in sequence:
                if idx in self.index_word:
                    word = self.index_word[idx]
                    # Skip special tokens except for visualization
                    if word not in [PAD, BOS, EOS]:
                        words.append(word)
                else:
                    words.append(UNK)
            texts.append(' '.join(words))
        return texts

    def get_vocab_size(self):
        """Get vocabulary size"""
        return len(self.word_index)

    def save(self, filepath):
        """Save tokenizer"""
        with open(filepath, 'wb') as f:
            pickle.dump({
                'word_index': self.word_index,
                'index_word': self.index_word,
                'word_counts': self.word_counts,
                'num_words': self.num_words,
                'oov_token': self.oov_token
            }, f)

    def load(self, filepath):
        """Load tokenizer"""
        with open(filepath, 'rb') as f:
            data = pickle.load(f)
            self.word_index = data['word_index']
            self.index_word = data['index_word']
            self.word_counts = data['word_counts']
            self.num_words = data['num_words']
            self.oov_token = data['oov_token']

In [None]:
class TranslationDataset(Dataset):
    """Dataset for translation task"""
    def __init__(self, src_sequences, trg_sequences, max_len=None):
        self.src_sequences = src_sequences
        self.trg_sequences = trg_sequences
        self.max_len = max_len

    def __len__(self):
        return len(self.src_sequences)

    def __getitem__(self, idx):
        src = self.src_sequences[idx]
        trg = self.trg_sequences[idx]

        # Truncate if too long
        if self.max_len:
            src = src[:self.max_len]
            trg = trg[:self.max_len]

        src = torch.tensor(src, dtype=torch.long)
        trg = torch.tensor(trg, dtype=torch.long)
        return src, trg

In [None]:
def collate_fn(batch):
    """Collate function for DataLoader with padding"""
    src_batch, trg_batch = zip(*batch)

    # Pad sequences
    src_batch = nn.utils.rnn.pad_sequence(src_batch, batch_first=True, padding_value=0)
    trg_batch = nn.utils.rnn.pad_sequence(trg_batch, batch_first=True, padding_value=0)

    return src_batch, trg_batch

In [None]:
class PositionalEncoding(nn.Module):
    """Positional encoding for Transformer"""
    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # FIX: Use torch.arange instead of torch.range (deprecated)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Add positional encoding and apply dropout
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [None]:
class TransformerTranslator(nn.Module):
    """Transformer model for translation"""
    def __init__(self, src_vocab_size, trg_vocab_size, d_model=512, nhead=8,
                 num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048,
                 dropout=0.1, max_len=200):
        super(TransformerTranslator, self).__init__()

        self.d_model = d_model
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len=max_len, dropout=dropout)

        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )

        self.fc_out = nn.Linear(d_model, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

        self._init_weights()

    def _init_weights(self):
        """Initialize weights"""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def generate_square_subsequent_mask(self, sz):
        """Generate mask for target sequence"""
        mask = torch.triu(torch.ones(sz, sz), diagonal=1).bool()
        return mask

    def forward(self, src, trg):
        """Forward pass"""
        # Create masks
        trg_mask = self.generate_square_subsequent_mask(trg.size(1)).to(src.device)
        src_padding_mask = (src == 0)
        trg_padding_mask = (trg == 0)

        # Embeddings with scaling
        src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))
        trg_emb = self.pos_encoder(self.trg_embedding(trg) * math.sqrt(self.d_model))

        # Transformer
        output = self.transformer(
            src_emb, trg_emb,
            tgt_mask=trg_mask,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=trg_padding_mask
        )

        return self.fc_out(output)

In [None]:
def train_epoch(model, dataloader, optimizer, criterion, clip=1.0):
  """Train for one epoch"""
  model.train()
  epoch_loss = 0
  progress_bar = tqdm(dataloader, desc="Training")
  for src, trg in progress_bar:
    src, trg = src.to(device), trg.to(device)

    optimizer.zero_grad()

    # Forward pass
    output = model(src, trg[:, :-1])

    # Reshape for loss calculation
    output = output.reshape(-1, output.shape[-1])
    trg = trg[:, 1:].reshape(-1)

    loss = criterion(output, trg)
    loss.backward()

    # Gradient clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

    optimizer.step()
    epoch_loss += loss.item()
    progress_bar.set_postfix({'loss': loss.item()})

  return epoch_loss / len(dataloader)

In [None]:
def evaluate(model, dataloader, criterion):
    """Evaluate model"""
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for src, trg in tqdm(dataloader, desc="Evaluating"):
            src, trg = src.to(device), trg.to(device)

            output = model(src, trg[:, :-1])
            output = output.reshape(-1, output.shape[-1])
            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

In [None]:
def translate_sentence(model, sentence, src_tokenizer, trg_tokenizer, max_len=50):
    """Translate a single sentence"""
    model.eval()

    # Preprocess and tokenize source sentence
    preprocessed = preprocess_sentence(sentence, max_length=max_len)
    src_sequence = src_tokenizer.texts_to_sequences([preprocessed])[0]
    src_tensor = torch.tensor(src_sequence).unsqueeze(0).to(device)

    # Start with BOS token
    trg_indices = [trg_tokenizer.word_index[BOS]]

    with torch.no_grad():
        for _ in range(max_len):
            trg_tensor = torch.tensor(trg_indices).unsqueeze(0).to(device)

            output = model(src_tensor, trg_tensor)
            next_token = output.argmax(dim=-1)[:, -1].item()

            trg_indices.append(next_token)

            # Stop if EOS token
            if next_token == trg_tokenizer.word_index[EOS]:
                break

    # Decode to text
    translation = trg_tokenizer.sequences_to_texts([trg_indices])[0]
    return translation

In [None]:
def calculate_bleu(model, test_texts, src_tokenizer, trg_tokenizer, sample_size=None):
    """Calculate BLEU score on test data"""
    model.eval()

    src_texts, trg_texts = test_texts

    if sample_size:
        indices = random.sample(range(len(src_texts)), min(sample_size, len(src_texts)))
        src_texts = [src_texts[i] for i in indices]
        trg_texts = [trg_texts[i] for i in indices]

    predictions = []
    references = []

    print("Generating translations for BLEU calculation...")
    for src, trg in tqdm(zip(src_texts, trg_texts), total=len(src_texts)):
        # Translate (src is already preprocessed with BOS/EOS)
        # Remove BOS/EOS for input to translate_sentence
        src_clean = src.replace(BOS, '').replace(EOS, '').strip()
        pred = translate_sentence(model, src_clean, src_tokenizer, trg_tokenizer)

        # Clean reference (remove BOS/EOS)
        trg_clean = trg.replace(BOS, '').replace(EOS, '').strip()

        predictions.append(pred)
        references.append(trg_clean)

    # Calculate BLEU score
    bleu = sacrebleu.corpus_bleu(predictions, [references])

    return bleu.score, predictions, references

In [None]:
print("=" * 70)
print("TRANSFORMER MODEL FOR ENGLISH-VIETNAMESE TRANSLATION")
print("=" * 70)

# Model configuration
config = {
    'd_model': 256,
    'nhead': 8,
    'num_encoder_layers': 3,
    'num_decoder_layers': 3,
    'dim_feedforward': 512,
    'dropout': 0.1,
    'batch_size': 32,
    'num_epochs': 50,
    'learning_rate': 0.0001,
    'max_len': 150,
    'max_samples': None,  # None for full dataset, or number like 5000 for testing
    'max_sentence_length': 100  # Max words per sentence
}

print("\n📋 Configuration:")
for key, value in config.items():
    print(f"  {key}: {value}")

In [None]:
print("\n" + "=" * 70)
print("LOADING DATA")
print("=" * 70)

# Load data from DataFrame
en_data, vi_data = load_data_from_dataframe(
    corpus_df,
    max_samples=config['max_samples'],
    max_length=config['max_sentence_length']
)

# Split data: 80% train, 10% val, 10% test
train_size = int(0.8 * len(en_data))
val_size = int(0.1 * len(en_data))

train_en = en_data[:train_size]
train_vi = vi_data[:train_size]
val_en = en_data[train_size:train_size+val_size]
val_vi = vi_data[train_size:train_size+val_size]
test_en = en_data[train_size+val_size:]
test_vi = vi_data[train_size+val_size:]

print(f"\n📊 Data split:")
print(f"  Train: {len(train_en):,} sentences")
print(f"  Val:   {len(val_en):,} sentences")
print(f"  Test:  {len(test_en):,} sentences")

In [None]:
print("\n" + "=" * 70)
print("BUILDING VOCABULARIES (Keras/TensorFlow style)")
print("=" * 70)

# Initialize tokenizers
en_tokenizer = Tokenizer(num_words=50000)  # Keep top 50K words
vi_tokenizer = Tokenizer(num_words=50000)

# Fit on training data
print("\n🔤 Fitting English tokenizer...")
en_tokenizer.fit_on_texts(train_en)

print("🔤 Fitting Vietnamese tokenizer...")
vi_tokenizer.fit_on_texts(train_vi)

print(f"\n✓ English vocab size: {en_tokenizer.get_vocab_size():,}")
print(f"✓ Vietnamese vocab size: {vi_tokenizer.get_vocab_size():,}")

# Show tokenization examples
print("\n📝 Tokenization examples:")
sample_en = train_en[0]
sample_vi = train_vi[0]

en_seq = en_tokenizer.texts_to_sequences([sample_en])[0]
vi_seq = vi_tokenizer.texts_to_sequences([sample_vi])[0]

print(f"\nEnglish text: {sample_en}")
print(f"Sequence:     {en_seq[:20]}...")  # Show first 20 tokens
print(f"Decoded back: {en_tokenizer.sequences_to_texts([en_seq])[0]}")

print(f"\nVietnamese text: {sample_vi}")
print(f"Sequence:        {vi_seq[:20]}...")
print(f"Decoded back:    {vi_tokenizer.sequences_to_texts([vi_seq])[0]}")

# Convert all texts to sequences
print("\n🔄 Converting texts to sequences...")
train_en_seq = en_tokenizer.texts_to_sequences(train_en)
train_vi_seq = vi_tokenizer.texts_to_sequences(train_vi)
val_en_seq = en_tokenizer.texts_to_sequences(val_en)
val_vi_seq = vi_tokenizer.texts_to_sequences(val_vi)
test_en_seq = en_tokenizer.texts_to_sequences(test_en)
test_vi_seq = vi_tokenizer.texts_to_sequences(test_vi)

print(f"✓ Converted all texts to sequences")

In [None]:
print("\n" + "=" * 70)
print("CREATING DATASETS")
print("=" * 70)

# Create datasets from sequences with max_len truncation
train_dataset = TranslationDataset(train_en_seq, train_vi_seq, max_len=config['max_len'])
val_dataset = TranslationDataset(val_en_seq, val_vi_seq, max_len=config['max_len'])
test_dataset = TranslationDataset(test_en_seq, test_vi_seq, max_len=config['max_len'])

train_loader = DataLoader(train_dataset, batch_size=config['batch_size'],
                         shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=config['batch_size'],
                       collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'],
                        collate_fn=collate_fn)

print(f"✓ Train batches: {len(train_loader)}")
print(f"✓ Val batches:   {len(val_loader)}")
print(f"✓ Test batches:  {len(test_loader)}")
print(f"✓ Max sequence length: {config['max_len']}")

In [None]:
print("\n" + "=" * 70)
print("INITIALIZING MODEL")
print("=" * 70)

model = TransformerTranslator(
    src_vocab_size=en_tokenizer.get_vocab_size(),
    trg_vocab_size=vi_tokenizer.get_vocab_size(),
    d_model=config['d_model'],
    nhead=config['nhead'],
    num_encoder_layers=config['num_encoder_layers'],
    num_decoder_layers=config['num_decoder_layers'],
    dim_feedforward=config['dim_feedforward'],
    dropout=config['dropout'],
    max_len=config['max_len']
).to(device)

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"\n🤖 Model Architecture:")
print(f"  Total parameters:     {total_params:,}")
print(f"  Trainable parameters: {trainable_params:,}")
print(f"  Model size:           ~{total_params * 4 / 1e6:.2f} MB")

# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
print(f"✓ Loss function: CrossEntropyLoss")
print(f"✓ Optimizer: Adam (lr={config['learning_rate']})")
print(f"✓ Scheduler: ReduceLROnPlateau")

In [None]:
print("\n" + "=" * 70)
print("TRAINING MODEL")
print("=" * 70)

train_losses = []
val_losses = []
best_val_loss = float('inf')
start_time = time.time()

for epoch in range(config['num_epochs']):
    epoch_start = time.time()
    print(f"\n{'='*70}")
    print(f"Epoch {epoch+1}/{config['num_epochs']}")
    print(f"{'='*70}")

    # Train
    train_loss = train_epoch(model, train_loader, optimizer, criterion)

    # Validate
    val_loss = evaluate(model, val_loader, criterion)

    # Record losses
    train_losses.append(train_loss)
    val_losses.append(val_loss)

    epoch_time = time.time() - epoch_start

    # Print results
    print(f"\n📊 Results:")
    print(f"  Train Loss: {train_loss:.4f} | Train PPL: {math.exp(train_loss):.2f}")
    print(f"  Val Loss:   {val_loss:.4f} | Val PPL:   {math.exp(val_loss):.2f}")
    print(f"  Time:       {epoch_time:.2f}s")

    # Learning rate scheduling
    scheduler.step(val_loss)

    # Save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': train_loss,
            'val_loss': val_loss,
            'config': config
        }, 'best_model.pt')
        print("  ✓ Best model saved!")

total_time = time.time() - start_time
print(f"\n{'='*70}")
print(f"✓ Training completed in {total_time/60:.2f} minutes")
print(f"  Average time per epoch: {total_time/config['num_epochs']/60:.2f} minutes")

# ============================================================================
# Cell 19: Load Best Model
# ============================================================================
print("\n" + "=" * 70)
print("STEP 6: LOADING BEST MODEL")
print("=" * 70)

checkpoint = torch.load('best_model.pt')
model.load_state_dict(checkpoint['model_state_dict'])

print(f"✓ Loaded best model from epoch {checkpoint['epoch']+1}")
print(f"  Val Loss: {checkpoint['val_loss']:.4f}")
print(f"  Val PPL:  {math.exp(checkpoint['val_loss']):.2f}")

In [None]:
print("\n" + "=" * 70)
print("EVALUATING ON TEST SET")
print("=" * 70)

test_loss = evaluate(model, test_loader, criterion)
test_ppl = math.exp(test_loss)

print(f"\n📊 Test Results:")
print(f"  Test Loss:       {test_loss:.4f}")
print(f"  Test Perplexity: {test_ppl:.2f}")

# ============================================================================
# Cell 21: Calculate BLEU Score
# ============================================================================
print("\n" + "=" * 70)
print("STEP 8: CALCULATING BLEU SCORE")
print("=" * 70)

bleu_score, predictions, references = calculate_bleu(
    model, (test_en, test_vi), en_tokenizer, vi_tokenizer, sample_size=100
)

print(f"\n🎯 BLEU Score: {bleu_score:.2f}")

print("\n📝 Sample Translations:")
print("-" * 70)

for i in range(min(5, len(predictions))):
    # Clean test sentences (remove BOS/EOS for display)
    test_en_clean = test_en[i].replace(BOS, '').replace(EOS, '').strip()

    print(f"\n[Example {i+1}]")
    print(f"Source:     {test_en_clean}")
    print(f"Reference:  {references[i]}")
    print(f"Prediction: {predictions[i]}")

In [None]:
print("\n" + "=" * 70)
print("VISUALIZING RESULTS")
print("=" * 70)

plt.figure(figsize=(14, 5))

# Loss plot
plt.subplot(1, 2, 1)
epochs = range(1, len(train_losses) + 1)
plt.plot(epochs, train_losses, 'bo-', label='Train Loss', linewidth=2, markersize=6)
plt.plot(epochs, val_losses, 'rs-', label='Val Loss', linewidth=2, markersize=6)
plt.xlabel('Epoch', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.title('Training and Validation Loss', fontsize=14, fontweight='bold')
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)

# Perplexity plot
plt.subplot(1, 2, 2)
train_ppls = [math.exp(loss) for loss in train_losses]
val_ppls = [math.exp(loss) for loss in val_losses]
plt.plot(epochs, train_ppls, 'bo-', label='Train Perplexity', linewidth=2, markersize=6)
plt.plot(epochs, val_ppls, 'rs-', label='Val Perplexity', linewidth=2, markersize=6)
plt.xlabel('Epoch', fontsize=12)
plt.ylabel('Perplexity', fontsize=12)
plt.title('Training and Validation Perplexity', fontsize=14, fontweight='bold')
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('training_curves.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Training curves saved to 'training_curves.png'")

In [None]:
print("\n" + "=" * 70)
print("SAVING RESULTS")
print("=" * 70)

# Save results
results = {
    'test_loss': test_loss,
    'test_perplexity': test_ppl,
    'bleu_score': bleu_score,
    'config': config,
    'vocab_sizes': {
        'en': en_tokenizer.get_vocab_size(),
        'vi': vi_tokenizer.get_vocab_size()
    },
    'training_time': total_time,
    'train_losses': train_losses,
    'val_losses': val_losses
}

with open('results.pkl', 'wb') as f:
    pickle.dump(results, f)

# Save tokenizers
en_tokenizer.save('en_tokenizer.pkl')
vi_tokenizer.save('vi_tokenizer.pkl')

print("✓ Results saved to 'results.pkl'")
print("✓ Tokenizers saved to 'en_tokenizer.pkl' and 'vi_tokenizer.pkl'")
print("✓ Best model saved to 'best_model.pt'")

In [None]:
print("\n" + "=" * 70)
print("FINAL TRANSLATION QUALITY REPORT")
print("=" * 70)

print(f"\n📊 Model Performance:")
print(f"  {'Metric':<20} {'Value':<15}")
print(f"  {'-'*35}")
print(f"  {'Test Loss':<20} {test_loss:<15.4f}")
print(f"  {'Test Perplexity':<20} {test_ppl:<15.2f}")
print(f"  {'BLEU Score':<20} {bleu_score:<15.2f}")

print(f"\n📐 Model Size:")
print(f"  {'EN Vocabulary':<20} {en_tokenizer.get_vocab_size():<15,}")
print(f"  {'VI Vocabulary':<20} {vi_tokenizer.get_vocab_size():<15,}")
print(f"  {'Parameters':<20} {total_params:<15,}")

print(f"\n⏱️  Training Time:")
print(f"  {'Total Time':<20} {total_time/60:<15.2f} minutes")
print(f"  {'Avg per Epoch':<20} {total_time/config['num_epochs']/60:<15.2f} minutes")

print("\n✅ Training Complete!")

In [32]:
print("\n" + "=" * 70)
print("TRANSLATION TEST")
print("=" * 70)

# Danh sách các câu test
test_sentences = [
    "Hello, how are you?",
    "I love learning natural language processing.",
    "The weather is beautiful today.",
    "Machine learning is changing the world.",
    "What is your name?",
    "I am studying artificial intelligence.",
    "She goes to school every day.",
    "This book is very interesting.",
    "Can you help me with this problem?",
    "Thank you for your time."
]

results = []

print(f"\nTranslating {len(test_sentences)} sentences...\n")

for i, sentence in enumerate(test_sentences, 1):
    print(f"Processing [{i}/{len(test_sentences)}]...", end=" ")

    try:
        translation = translate_sentence(model, sentence, en_tokenizer, vi_tokenizer)
        results.append((sentence, translation, "✅"))
        print("Done")
    except Exception as e:
        results.append((sentence, f"Error: {e}", "❌"))
        print("Failed")

# In kết quả
print("\n" + "=" * 70)
print("RESULTS")
print("=" * 70 + "\n")

for i, (en, vi, status) in enumerate(results, 1):
    print(f"{status} Test {i}:")
    print(f"   🇬🇧 EN: {en}")
    print(f"   🇻🇳 VI: {vi}\n")

print("=" * 70)
print(f"✅ Completed: {sum(1 for r in results if r[2] == '✅')}/{len(results)}")
print("=" * 70)


TRANSLATION TEST

Translating 10 sentences...

Processing [1/10]... Done
Processing [2/10]... Done
Processing [3/10]... Done
Processing [4/10]... Done
Processing [5/10]... Done
Processing [6/10]... Done
Processing [7/10]... Done
Processing [8/10]... Done
Processing [9/10]... Done
Processing [10/10]... Done

RESULTS

✅ Test 1:
   🇬🇧 EN: Hello, how are you?
   🇻🇳 VI: xin chào , làm sao con là gì ?

✅ Test 2:
   🇬🇧 EN: I love learning natural language processing.
   🇻🇳 VI: tôi yêu thích sự tự nhiên bằng ngôn ngữ tự nhiên .

✅ Test 3:
   🇬🇧 EN: The weather is beautiful today.
   🇻🇳 VI: thời tiết thời là tuyệt vời ngày nay rất đẹp .

✅ Test 4:
   🇬🇧 EN: Machine learning is changing the world.
   🇻🇳 VI: máy học đang học đang thay đổi thế giới .

✅ Test 5:
   🇬🇧 EN: What is your name?
   🇻🇳 VI: cái tên của bạn là gì ? ? ? ? ? ? ?

✅ Test 6:
   🇬🇧 EN: I am studying artificial intelligence.
   🇻🇳 VI: tôi đang học đại học nhân tình báo nhân của tình báo .

✅ Test 7:
   🇬🇧 EN: She goes to school