In [1]:
!pip install torch transformers sentencepiece datasets sacrebleu accelerate pyvi


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [2]:
# Cell 2: Setup and imports

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
from datasets import Dataset, concatenate_datasets
from transformers import (
    M2M100ForConditionalGeneration,
    M2M100Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback,
    AutoTokenizer
)
import random

from pyvi import ViTokenizer

print(torch.cuda.get_device_name(0))
print("VRAM:", torch.cuda.get_device_properties(0).total_memory / 1e9, "GB")

NVIDIA H200
VRAM: 150.0217344 GB


In [3]:
# Cell 3: Load Khmer tokenizer
# Load Khmer tokenizer from Hugging Face
khmer_word_tokenizer = AutoTokenizer.from_pretrained(
    "khopilot/km-tokenizer-khmer", 
    use_fast=False
)
print("Khmer tokenizer loaded successfully!")

# Cell 3b: Define tokenization functions
def tokenize_vietnamese(text):
    """Tokenize Vietnamese text using PyVi"""
    try:
        return ViTokenizer.tokenize(text)
    except Exception as e:
        print(f"Error tokenizing Vietnamese: {e}")
        return text

def tokenize_khmer(text):
    """Tokenize Khmer text using khopilot/km-tokenizer-khmer"""
    try:
        tokens = khmer_word_tokenizer.tokenize(text)
        return " ".join(tokens)
    except Exception as e:
        print(f"Error tokenizing Khmer: {e}")
        return text

def tokenize_batch_vietnamese(texts):
    """Batch tokenize Vietnamese texts"""
    print(f"Tokenizing {len(texts)} Vietnamese texts...")
    results = []
    for i, text in enumerate(texts):
        results.append(tokenize_vietnamese(text))
        if (i + 1) % 10000 == 0:
            print(f"  Processed {i + 1}/{len(texts)} Vietnamese texts")
    return results

def tokenize_batch_khmer(texts):
    """Batch tokenize Khmer texts"""
    print(f"Tokenizing {len(texts)} Khmer texts...")
    results = []
    for i, text in enumerate(texts):
        results.append(tokenize_khmer(text))
        if (i + 1) % 10000 == 0:
            print(f"  Processed {i + 1}/{len(texts)} Khmer texts")
    return results

print("Vietnamese and Khmer tokenizers loaded successfully!")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Khmer tokenizer loaded successfully!
Vietnamese and Khmer tokenizers loaded successfully!


In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained(
    "km_to_vi/phase2/best"
).cuda()

tokenizer = AutoTokenizer.from_pretrained(
    "km_to_vi/phase2/best"
)

print("Phase 2 model loaded!")

Phase 2 model loaded!


In [5]:
# ============================================================
# CELL 2: Unfreeze Top 6 Encoder Layers (layers 6-11)
# ============================================================
# 1. Freeze toàn bộ encoder trước
for param in model.model.encoder.parameters():
    param.requires_grad = False

# 2. Unfreeze top 6 layers
for i in range(6, 12):
    for param in model.model.encoder.layers[i].parameters():
        param.requires_grad = True

# Verify
total, frozen, trainable = 0, 0, 0
encoder_trainable = 0

for name, param in model.named_parameters():
    n = param.numel()
    total += n
    if not param.requires_grad:
        frozen += n
    else:
        trainable += n
        if "encoder" in name:
            encoder_trainable += n

print(f"Total params       : {total/1e6:.1f}M")
print(f"Frozen params      : {frozen/1e6:.1f}M")
print(f"Trainable params   : {trainable/1e6:.1f}M")
print(f"Encoder trainable  : {encoder_trainable/1e6:.1f}M")

Total params       : 483.9M
Frozen params      : 206.8M
Trainable params   : 277.1M
Encoder trainable  : 126.0M


In [6]:
# Cell 6: Load and prepare data
DATA_DIR = "dataset"

def load_parallel(src_file, tgt_file):
    with open(src_file, encoding="utf-8") as f:
        src = [l.strip() for l in f]
    with open(tgt_file, encoding="utf-8") as f:
        tgt = [l.strip() for l in f]
    
    assert len(src) == len(tgt)

    print("Tokenizing Vietnamese texts...")
    src_tokenized = tokenize_batch_khmer(src)
    
    print("Tokenizing Khmer texts...")
    tgt_tokenized = tokenize_batch_vietnamese(tgt) 
    
    return Dataset.from_dict({
        "src_text": src_tokenized,
        "tgt_text": tgt_tokenized
    })



# Load data from train.khm and train.vi
full_dataset = load_parallel(
    f"{DATA_DIR}/train_khmer_to_vi_shuf.khm",
    f"{DATA_DIR}/train_khmer_to_vi_shuf.vi"
)

print(f"Total dataset size: {len(full_dataset)} examples")

# Split dataset
test_size = 1000
dev_size = 3000

test_start_idx = len(full_dataset) - test_size
dev_start_idx = test_start_idx - dev_size

test_dataset = full_dataset.select(range(test_start_idx, len(full_dataset)))
dev_dataset = full_dataset.select(range(dev_start_idx, test_start_idx))
train_dataset = full_dataset.select(range(0, dev_start_idx))

train_dataset = train_dataset.shuffle(seed=42)

print(f"Train dataset: {len(train_dataset)} examples (for training)")
print(f"Dev dataset  : {len(dev_dataset)} examples (for validation during training)")
print(f"Test dataset : {len(test_dataset)} examples (for final evaluation)")
print("\nData split and shuffle completed.")

Tokenizing Vietnamese texts...
Tokenizing 700000 Khmer texts...
  Processed 10000/700000 Khmer texts
  Processed 20000/700000 Khmer texts
  Processed 30000/700000 Khmer texts
  Processed 40000/700000 Khmer texts
  Processed 50000/700000 Khmer texts
  Processed 60000/700000 Khmer texts
  Processed 70000/700000 Khmer texts
  Processed 80000/700000 Khmer texts
  Processed 90000/700000 Khmer texts
  Processed 100000/700000 Khmer texts
  Processed 110000/700000 Khmer texts
  Processed 120000/700000 Khmer texts
  Processed 130000/700000 Khmer texts
  Processed 140000/700000 Khmer texts
  Processed 150000/700000 Khmer texts
  Processed 160000/700000 Khmer texts
  Processed 170000/700000 Khmer texts
  Processed 180000/700000 Khmer texts
  Processed 190000/700000 Khmer texts
  Processed 200000/700000 Khmer texts
  Processed 210000/700000 Khmer texts
  Processed 220000/700000 Khmer texts
  Processed 230000/700000 Khmer texts
  Processed 240000/700000 Khmer texts
  Processed 250000/700000 Khmer t

In [7]:
# Cell 7: Preprocessing function
MAX_LEN = 256

def preprocess(batch):
    tokenizer.src_lang = "km"  # Changed from "lo" to "km" for Khmer
    tokenizer.tgt_lang = "vi"

    inputs = tokenizer(
        batch["src_text"],
        truncation=True,
        max_length=MAX_LEN
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["tgt_text"],
            truncation=True,
            max_length=MAX_LEN
        )

    inputs["labels"] = labels["input_ids"]
    return inputs

In [8]:
# Cell 8: Apply preprocessing
train_dataset = train_dataset.map(
    preprocess,
    batched=True,
    remove_columns=train_dataset.column_names,
    num_proc=8
)

dev_dataset = dev_dataset.map(
    preprocess,
    batched=True,
    remove_columns=dev_dataset.column_names,
    num_proc=8
)

Map (num_proc=8):   0%|          | 0/696000 [00:00<?, ? examples/s]



Map (num_proc=8):   0%|          | 0/3000 [00:00<?, ? examples/s]



In [9]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True
)


In [10]:
import torch._dynamo
torch._dynamo.config.suppress_errors = True

# ============================================================
# CELL 3: Training Arguments - Phase 3
# ============================================================
training_args_phase3 = TrainingArguments(
    output_dir="./km_to_vi/phase3",
    
    # Evaluation
    eval_strategy="steps",
    eval_steps=500,
    
    # Saving
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    
    # Logging
    logging_steps=100,
    
    # Batch size
    per_device_train_batch_size=128,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,
    
    # Learning rate - LOWER than phase 2
    learning_rate=3e-5,  # Lower for fine-tuning encoder
    lr_scheduler_type="cosine_with_restarts",
    warmup_ratio=0.05,
    
    # Regularization
    weight_decay=0.015,
    max_grad_norm=0.8,
    
    # Precision
    num_train_epochs=6,
    
    # FP16
    bf16=True,
    fp16=False,

    # Speed
    group_by_length=True,
    dataloader_num_workers=8,
    dataloader_pin_memory=True,
    
    # Best model
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    
    report_to="none",
)


In [11]:
# ============================================================
# CELL 4: Create Trainer - Phase 3
# ============================================================
trainer_phase3 = Trainer(
    model=model,
    args=training_args_phase3,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=4)
    ]
)


  trainer_phase3 = Trainer(


In [12]:
# ============================================================
# CELL 5: Train Phase 3
# ============================================================
print("\n" + "="*60)
print("PHASE 3: Progressive Unfreezing (Top 6 Encoder Layers)")
print("="*60 + "\n")

trainer_phase3.train()


PHASE 3: Progressive Unfreezing (Top 6 Encoder Layers)



Step,Training Loss,Validation Loss
500,0.8444,0.940557
1000,0.9691,0.92266
1500,0.9749,0.912176
2000,0.975,0.901583
2500,0.9778,0.890488
3000,0.923,0.879636
3500,0.9196,0.871153
4000,0.9066,0.863874
4500,0.9053,0.855147
5000,0.9035,0.849605


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=16314, training_loss=0.8554002293713817, metrics={'train_runtime': 4968.9164, 'train_samples_per_second': 840.425, 'train_steps_per_second': 3.283, 'total_flos': 7.269915377805558e+17, 'train_loss': 0.8554002293713817, 'epoch': 6.0})

In [14]:
# ============================================================
# CELL 6: Save Phase 3 Model
# ============================================================
trainer_phase3.save_model("./km_to_vi/phase3/best")
tokenizer.save_pretrained("./km_to_vi/phase3/best")

print("\nPhase 3 model saved!")



Phase 3 model saved!


In [15]:
# ============================================================
# CELL 7: Evaluate Phase 3
# ============================================================

def translate_batch(texts, model, tokenizer, batch_size=32):
    """Batch translation for speed"""
    model.eval()
    tokenizer.src_lang = "km"
    tokenizer.tgt_lang = "vi"
    
    outputs = []
    
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=256
        ).to("cuda")
        
        with torch.no_grad():
            gen = model.generate(
                **inputs,
                forced_bos_token_id=tokenizer.get_lang_id("vi"),
                num_beams=5,
                max_length=256
            )
        
        texts_out = tokenizer.batch_decode(gen, skip_special_tokens=True)
        outputs.extend(texts_out)
        
        if (i // batch_size + 1) % 10 == 0:
            print(f"Translated {i+len(batch)}/{len(texts)}")
    
    return outputs

# Lấy dữ liệu test từ test_dataset (đã chia từ train.vi/train.lo)
test_khm = test_dataset["src_text"]
test_vi = test_dataset["tgt_text"]

print(f"\nTest set size: {len(test_vi)} examples")

# Translate
print("\nTranslating test set...")
preds_phase3 = translate_batch(test_khm, model, tokenizer)




Test set size: 1000 examples

Translating test set...
Translated 320/1000
Translated 640/1000
Translated 960/1000


In [16]:
from sacrebleu import corpus_bleu

# Calculate BLEU
bleu_phase3 = corpus_bleu(preds_phase3, [test_vi])
print(f"\n{'='*60}")
print(f"PHASE 3 BLEU Score: {bleu_phase3.score:.2f}")
print(f"{'='*60}\n")

# Save predictions
with open("./km_to_vi/phase3/phase3_predictions.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(preds_phase3))

That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.



PHASE 3 BLEU Score: 50.48



In [17]:
# ============================================================
# CELL 15: Sample Translations
# ============================================================
print("\n" + "="*70)
print("SAMPLE TRANSLATIONS")
print("="*70)

for i in range(10):
    print(f"\nExample {i+1}:")
    print(f"Source    : {test_khm[i]}")
    print(f"Reference : {test_vi[i]}")
    print(f"Prediction: {preds_phase3[i]}")
    print("-" * 70)



SAMPLE TRANSLATIONS

Example 1:
Source    : ▁ទោះជាយ៉ាងណា ក៏ដោយ ▁ ក្នុងអំឡុងពេល ប្រតិបត្តិការ របស់ក្រុមហ៊ុន ▁T i C o ▁លោក ▁Th u an ▁គឺជា អ្នក ទទួលបន្ទុក ដោយផ្ទាល់ លើ ប្រតិបត្តិការ ទាំងអស់ ▁និង ទទួលខុសត្រូវ ទាំងស្រុង ។
Reference : Tuy_nhiên , trong quá_trình Công_ty TiCo hoạt_động , Thuận là người trực_tiếp điều_hành mọi công_việc và chịu trách_nhiệm .
Prediction: Tuy_nhiên , trong quá_trình hoạt_động của Tập_đoàn TiCo , ông Thuận là người trực_tiếp phụ_trách mọi hoạt_động và hoàn_toàn chịu trách_nhiệm .
----------------------------------------------------------------------

Example 2:
Source    : ▁ ថ្នាំ ▁M e th ad on e ▁ ជួយ បន្ថយ រោគសញ្ញា នៃការ ដក ថ្នាំ ▁ដោយ កាត់បន្ថយ ការ ឃ្លា ន ▁ការ ប្រកួតប្រជែង ▁និង ផលប៉ះពាល់ នៃ ថ្នាំ ហេ រ៉ូ អ៊ីន យ៉ាងច្រើន ។
Reference : Methadone có tác_dụng làm mất các biểu_hiện của hội_chứng_cai , giảm đáng_kể thèm nhớ , cạnh_tranh và khóa tác_động của heroin .
Prediction: Methadone giúp làm dịu các triệu_chứng của việc loại_bỏ thuốc bằng cách giảm đáng_kể cơn đó