In [1]:
!pip install torch transformers sentencepiece datasets sacrebleu accelerate pyvi

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [2]:
# Cell 2: Setup and imports

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
from datasets import Dataset, concatenate_datasets
from transformers import (
    M2M100ForConditionalGeneration,
    M2M100Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback,
    AutoTokenizer
)
import random

from pyvi import ViTokenizer

print(torch.cuda.get_device_name(0))
print("VRAM:", torch.cuda.get_device_properties(0).total_memory / 1e9, "GB")

NVIDIA H200
VRAM: 150.0217344 GB


In [3]:
# Cell 3: Load Khmer tokenizer
# Load Khmer tokenizer from Hugging Face
khmer_word_tokenizer = AutoTokenizer.from_pretrained(
    "khopilot/km-tokenizer-khmer", 
    use_fast=False
)
print("Khmer tokenizer loaded successfully!")

# Cell 3b: Define tokenization functions
def tokenize_vietnamese(text):
    """Tokenize Vietnamese text using PyVi"""
    try:
        return ViTokenizer.tokenize(text)
    except Exception as e:
        print(f"Error tokenizing Vietnamese: {e}")
        return text

def tokenize_khmer(text):
    """Tokenize Khmer text using khopilot/km-tokenizer-khmer"""
    try:
        tokens = khmer_word_tokenizer.tokenize(text)
        return " ".join(tokens)
    except Exception as e:
        print(f"Error tokenizing Khmer: {e}")
        return text

def tokenize_batch_vietnamese(texts):
    """Batch tokenize Vietnamese texts"""
    print(f"Tokenizing {len(texts)} Vietnamese texts...")
    results = []
    for i, text in enumerate(texts):
        results.append(tokenize_vietnamese(text))
        if (i + 1) % 10000 == 0:
            print(f"  Processed {i + 1}/{len(texts)} Vietnamese texts")
    return results

def tokenize_batch_khmer(texts):
    """Batch tokenize Khmer texts"""
    print(f"Tokenizing {len(texts)} Khmer texts...")
    results = []
    for i, text in enumerate(texts):
        results.append(tokenize_khmer(text))
        if (i + 1) % 10000 == 0:
            print(f"  Processed {i + 1}/{len(texts)} Khmer texts")
    return results

print("Vietnamese and Khmer tokenizers loaded successfully!")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Khmer tokenizer loaded successfully!
Vietnamese and Khmer tokenizers loaded successfully!


In [4]:

DATA_DIR = "dataset"

In [5]:
# ============================================================
# CELL 3: Load Phase 3 Model
# ============================================================
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


print("Loading Phase 3 model...")
model = AutoModelForSeq2SeqLM.from_pretrained("./vi_to_km/phase3/best").cuda()
tokenizer = AutoTokenizer.from_pretrained("./vi_to_km/phase3/best")

print(f"Model parameters: {sum(p.numel() for p in model.parameters())/1e6:.1f}M")
print("Phase 3 model loaded successfully!")

Loading Phase 3 model...
Model parameters: 483.9M
Phase 3 model loaded successfully!


In [6]:
# ============================================================
# CELL 4: Unfreeze ALL Parameters
# ============================================================
print("\nUnfreezing all parameters...")

for param in model.parameters():
    param.requires_grad = True

# Verify
total, trainable = 0, 0
encoder_trainable, decoder_trainable = 0, 0

for name, param in model.named_parameters():
    n = param.numel()
    total += n
    
    if param.requires_grad:
        trainable += n
        if "encoder" in name:
            encoder_trainable += n
        elif "decoder" in name:
            decoder_trainable += n

print("\n" + "="*70)
print("PARAMETER STATISTICS")
print("="*70)
print(f"Total params        : {total/1e6:.1f}M")
print(f"Trainable params    : {trainable/1e6:.1f}M")
print(f"  - Encoder         : {encoder_trainable/1e6:.1f}M")
print(f"  - Decoder         : {decoder_trainable/1e6:.1f}M")
print(f"Trainable %         : 100.0%")
print("="*70)



Unfreezing all parameters...

PARAMETER STATISTICS
Total params        : 483.9M
Trainable params    : 483.9M
  - Encoder         : 201.6M
  - Decoder         : 151.2M
Trainable %         : 100.0%


In [7]:
# Cell 6: Load and prepare data

def load_parallel(src_file, tgt_file):
    with open(src_file, encoding="utf-8") as f:
        src = [l.strip() for l in f]
    with open(tgt_file, encoding="utf-8") as f:
        tgt = [l.strip() for l in f]
    
    assert len(src) == len(tgt)

    print("Tokenizing Vietnamese texts...")
    src_tokenized = tokenize_batch_vietnamese(src)
    
    print("Tokenizing Khmer texts...")
    tgt_tokenized = tokenize_batch_khmer(tgt) 
    
    return Dataset.from_dict({
        "src_text": src_tokenized,
        "tgt_text": tgt_tokenized
    })



# Load data from train.khm and train.vi
full_dataset = load_parallel(
    f"{DATA_DIR}/train_khm.vi",
    f"{DATA_DIR}/train_khm.khm"
)

print(f"Total dataset size: {len(full_dataset)} examples")

# Split dataset
test_size = 1000
dev_size = 3000

test_start_idx = len(full_dataset) - test_size
dev_start_idx = test_start_idx - dev_size

test_dataset = full_dataset.select(range(test_start_idx, len(full_dataset)))
dev_dataset = full_dataset.select(range(dev_start_idx, test_start_idx))
train_dataset = full_dataset.select(range(0, dev_start_idx))

train_dataset = train_dataset.shuffle(seed=42)

print(f"Train dataset: {len(train_dataset)} examples (for training)")
print(f"Dev dataset  : {len(dev_dataset)} examples (for validation during training)")
print(f"Test dataset : {len(test_dataset)} examples (for final evaluation)")
print("\nData split and shuffle completed.")

Tokenizing Vietnamese texts...
Tokenizing 599999 Vietnamese texts...
  Processed 10000/599999 Vietnamese texts
  Processed 20000/599999 Vietnamese texts
  Processed 30000/599999 Vietnamese texts
  Processed 40000/599999 Vietnamese texts
  Processed 50000/599999 Vietnamese texts
  Processed 60000/599999 Vietnamese texts
  Processed 70000/599999 Vietnamese texts
  Processed 80000/599999 Vietnamese texts
  Processed 90000/599999 Vietnamese texts
  Processed 100000/599999 Vietnamese texts
  Processed 110000/599999 Vietnamese texts
  Processed 120000/599999 Vietnamese texts
  Processed 130000/599999 Vietnamese texts
  Processed 140000/599999 Vietnamese texts
  Processed 150000/599999 Vietnamese texts
  Processed 160000/599999 Vietnamese texts
  Processed 170000/599999 Vietnamese texts
  Processed 180000/599999 Vietnamese texts
  Processed 190000/599999 Vietnamese texts
  Processed 200000/599999 Vietnamese texts
  Processed 210000/599999 Vietnamese texts
  Processed 220000/599999 Vietnamese 

In [8]:
# Cell 7: Preprocessing function
MAX_LEN = 256

def preprocess(batch):
    tokenizer.src_lang = "vi"  # Changed from "lo" to "km" for Khmer
    tokenizer.tgt_lang = "km"

    inputs = tokenizer(
        batch["src_text"],
        truncation=True,
        max_length=MAX_LEN
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["tgt_text"],
            truncation=True,
            max_length=MAX_LEN
        )

    inputs["labels"] = labels["input_ids"]
    return inputs

In [9]:
# Cell 8: Apply preprocessing
train_dataset = train_dataset.map(
    preprocess,
    batched=True,
    remove_columns=train_dataset.column_names,
    num_proc=8
)

dev_dataset = dev_dataset.map(
    preprocess,
    batched=True,
    remove_columns=dev_dataset.column_names,
    num_proc=8
)

Map (num_proc=8):   0%|          | 0/595999 [00:00<?, ? examples/s]



Map (num_proc=8):   0%|          | 0/3000 [00:00<?, ? examples/s]



In [10]:
# ============================================================
# CELL 7: Data Collator
# ============================================================
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
)


In [11]:
# ============================================================
# CELL 8: Training Arguments
# ============================================================
training_args = TrainingArguments(
    output_dir="./vi_to_km/phase4",
    
    # Evaluation & Saving
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    
    # Logging
    logging_steps=100,    
    # Batch size
    per_device_train_batch_size=128,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    
    # Learning rate - VERY LOW for full fine-tuning
    learning_rate=5e-5 ,
    lr_scheduler_type="linear",
    warmup_ratio=0.08,
    
    # Regularization - STRONGER to prevent overfitting
    weight_decay=0.1,
    max_grad_norm=0.4,  # Stricter clipping
    
    # Training
    num_train_epochs=12,
    
    # FP16
    bf16=True,
    fp16=False,

    # Speed
    group_by_length=True,
    dataloader_num_workers=8,
    dataloader_pin_memory=True,
    
    
    # Best model
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    
    report_to="none",
)

print("\nTraining configuration:")
print(f"Effective batch size: {128 * 4}")
print(f"Learning rate: {5e-5 }")
print(f"Epochs: {12}")
print(f"Label smoothing: 0.15")


Training configuration:
Effective batch size: 512
Learning rate: 5e-05
Epochs: 12
Label smoothing: 0.15


In [13]:
# ============================================================
# CELL 9: Create Trainer
# ============================================================
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        callbacks=[
            EarlyStoppingCallback(early_stopping_patience=6)
        ]
    )

  trainer = Trainer(


In [14]:
# ============================================================
# CELL 10: Train
# ============================================================
print("\n" + "="*70)
print("STARTING PHASE 4: FULL FINE-TUNING")
print("="*70 + "\n")



trainer.train()

print("\n" + "="*70)
print("TRAINING COMPLETED")
print("="*70)


STARTING PHASE 4: FULL FINE-TUNING



Step,Training Loss,Validation Loss
500,0.6798,0.694644
1000,0.7029,0.698146
1500,0.6884,0.691658
2000,0.6847,0.68416
2500,0.6564,0.677242
3000,0.6576,0.671043
3500,0.6519,0.667021
4000,0.6294,0.666786
4500,0.6335,0.663253
5000,0.6079,0.661816


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



TRAINING COMPLETED


In [15]:
# ============================================================
# CELL 11: Save Model
# ============================================================
print("\nSaving best model...")
trainer.save_model("./vi_to_km/phase4/best")
tokenizer.save_pretrained("./vi_to_km/phase4/best")
print(f"Model saved!!! ")


Saving best model...
Model saved!!! 


In [16]:
# ============================================================
# CELL 12: Evaluation Function
# ============================================================
def translate_batch(texts, model, tokenizer, batch_size=32):
    """Batch translation for speed"""
    model.eval()
    tokenizer.src_lang = "vi"
    tokenizer.tgt_lang = "km"
    
    outputs = []
    
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=256
        ).to("cuda")
        
        with torch.no_grad():
            gen = model.generate(
                **inputs,
                forced_bos_token_id=tokenizer.get_lang_id("km"),
                num_beams=5,
                max_length=256
            )
        
        texts_out = tokenizer.batch_decode(gen, skip_special_tokens=True)
        outputs.extend(texts_out)
        
        if (i // batch_size + 1) % 10 == 0:
            print(f"Translated {i+len(batch)}/{len(texts)}")
    
    return outputs

In [18]:
# ============================================================
# CELL 13: Evaluate on Test Set
# ============================================================
# Load test data
# Lấy dữ liệu test từ test_dataset (đã chia từ train.vi/train.lo)
test_vi = test_dataset["src_text"]
test_khm = test_dataset["tgt_text"]

print(f"\nTest set size: {len(test_vi)} examples")
print("\nTranslating test set...")

# Translate
print("\nTranslating test set...")
predictions = translate_batch(test_vi, model, tokenizer)

from sacrebleu import corpus_bleu


# Calculate BLEU
bleu_score = corpus_bleu(predictions, [test_khm])



Test set size: 1000 examples

Translating test set...

Translating test set...
Translated 320/1000
Translated 640/1000
Translated 960/1000


In [21]:
# ============================================================
# CELL 14: Final Results
# ============================================================
print("\n" + "="*70)
print("PHASE 4 RESULTS (FINAL)")
print("="*70)
print(f"BLEU Score: {bleu_score.score:.2f}")
print("="*70)

# Save predictions
with open("./vi_to_km/phase4/test_predictions.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(predictions))

print(f"\nPredictions saved !!!")


PHASE 4 RESULTS (FINAL)
BLEU Score: 23.82

Predictions saved !!!


In [22]:
# ============================================================
# CELL 15: Sample Translations
# ============================================================
print("\n" + "="*70)
print("SAMPLE TRANSLATIONS")
print("="*70)

for i in range(10):
    print(f"\nExample {i+1}:")
    print(f"Source    : {test_vi[i]}")
    print(f"Reference : {test_khm[i]}")
    print(f"Prediction: {predictions[i]}")
    print("-" * 70)



SAMPLE TRANSLATIONS

Example 1:
Source    : Ủy_ban bầu_cử Myanmar đã bác_bỏ tuyên_bố này .
Reference : ▁ គណៈកម្មការ រៀបចំការបោះឆ្នោត របស់ប្រទេស មីយ៉ាន់ម៉ា បាន ច្រ ា ន ចោល ការ អះអាង នេះ ។
Prediction: ការ អះអាង នេះត្រូវបាន គណៈកម្មការ រៀបចំការបោះឆ្នោត មីយ៉ាន់ម៉ា ច្រ ា ន ចោល ។
----------------------------------------------------------------------

Example 2:
Source    : Bộ_trưởng nói : " Phụ_nữ cũng muốn được lên_tiếng , được nêu lên những vấn_đề của chính mình . Trong quá_khứ , phụ_nữ nhận thấy rằng những người_làm chính_trị không thực_sự nêu lên những vấn_đề của chính họ vì bản_thân họ chưa từng đối_mặt với những vấn_đề đó . "
Reference : ▁លោកស្រី រដ្ឋមន្ត្រី មានប្រសាសន៍ថា ៖ « ស្ត្រី ក៏ ចង់បាន ស្រ្តី ជា សំឡេង ខ្លួន ដែរ ▁ លើក បញ្ហា របស់ ខ្លួនឯង ▁ដែល កន្លងមក ស្ត្រី ឃើញថា ▁ អ្នកដែល ធ្វើ នយោបាយ ▁លោក មិនបាន លើក បញ្ហា ខ្លួន ឲ្យ ពិត ជាក់ស្តែង ▁ដោយសារ លោក មិនបាន ជួប បញ្ហា ហ្នឹង ដោយ ខ្លួន គាត់ ។
Prediction: លោកស្រី រដ្ឋមន្រ្តី មានប្រសាសន៍ថា ៖ « ស្ត្រី ក៏ ចង់ឲ្យ មានការ បញ្ចេញមតិ ការ លើកឡើង នូវ បញ

In [23]:
# ============================================================
# CELL 16: Load All Phase Results
# ============================================================
print("\n" + "="*70)
print("LOADING RESULTS FROM ALL PHASES")
print("="*70)

results = {
    "phase1": 18.54,
    "phase2": 19.67,
    "phase3": 22.47,
    "phase4": bleu_score.score
}

# Try to load previous results
import os

if os.path.exists("./vi_to_km/phase2/phase2_predictions.txt"):
    with open("./vi_to_km/phase2/phase2_predictions.txt", encoding="utf-8") as f:
        phase2_preds = [l.strip() for l in f]
    results["phase2"] = corpus_bleu(phase2_preds, [test_khm]).score

if os.path.exists("./vi_to_km/phase3/phase3_predictions.tx"):
    with open("./vi_to_km/phase3/phase3_predictions.tx", encoding="utf-8") as f:
        phase3_preds = [l.strip() for l in f]
    results["phase3"] = corpus_bleu(phase3_preds, [test_lo]).score


LOADING RESULTS FROM ALL PHASES


In [24]:
# ============================================================
# CELL 17: Final Comparison
# ============================================================

import json
print("\n" + "="*70)
print("FINAL RESULTS COMPARISON")
print("="*70)
print(f"Phase1 (no freeze)       : {results['phase1']:.2f} BLEU")
if results['phase2']:
    print(f"Phase 2 (decoder only)     : {results['phase2']:.2f} BLEU")
if results['phase3']:
    print(f"Phase 3 (progressive)      : {results['phase3']:.2f} BLEU")
print(f"Phase 4 (full fine-tuning) : {results['phase4']:.2f} BLEU")

if results['phase2']:
    improvement = results['phase4'] - results['phase1']
    print(f"\nTotal improvement: +{improvement:.2f} BLEU")
print("="*70)

# Save results
with open("./vi_to_km/phase4/all_results.json", "w") as f:
    json.dump(results, f, indent=2)

print(f"\nAll results saved!!")

print("\n✓ All 4 phases completed successfully!")
print("\n" + "="*70)
print("TRAINING PIPELINE FINISHED")
print("="*70)


FINAL RESULTS COMPARISON
Phase1 (no freeze)       : 18.54 BLEU
Phase 2 (decoder only)     : 19.67 BLEU
Phase 3 (progressive)      : 22.47 BLEU
Phase 4 (full fine-tuning) : 23.82 BLEU

Total improvement: +5.28 BLEU

All results saved!!

✓ All 4 phases completed successfully!

TRAINING PIPELINE FINISHED
