In [1]:
!pip install torch transformers sentencepiece datasets sacrebleu accelerate pyvi laonlp

Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.2.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (10 kB)
Collecting datasets
  Downloading datasets-4.4.2-py3-none-any.whl.metadata (19 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting accelerate
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting laonlp
  Downloading LaoNLP-1.2.0-py3-none-any.whl.metadata (4.0 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.11.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from tra

In [2]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
from datasets import Dataset, concatenate_datasets
from transformers import (
    M2M100ForConditionalGeneration,
    M2M100Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
import random

from pyvi import ViTokenizer
from laonlp.tokenize import word_tokenize as lao_word_tokenize

print(torch.cuda.get_device_name(0))
print("VRAM:", torch.cuda.get_device_properties(0).total_memory / 1e9, "GB")

[2025-12-22 03:38:44,217] INFO numexpr.utils: Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
[2025-12-22 03:38:44,217] INFO numexpr.utils: Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[2025-12-22 03:38:44,217] INFO numexpr.utils: NumExpr defaulting to 8 threads.
NVIDIA H200
VRAM: 150.0217344 GB


In [3]:
def tokenize_vietnamese(text):
    """Tokenize Vietnamese text using PyVi"""
    try:
        return ViTokenizer.tokenize(text)
    except Exception as e:
        print(f"Error tokenizing Vietnamese: {e}")
        return text

def tokenize_lao(text):
    """Tokenize Lao text using LaoNLP"""
    try:
        # LaoNLP word_tokenize không có parameter engine
        tokens = lao_word_tokenize(text)
        return " ".join(tokens)
    except Exception as e:
        print(f"Error tokenizing Lao: {e}")
        return text

def tokenize_batch_vietnamese(texts):
    """Batch tokenize Vietnamese texts"""
    print(f"Tokenizing {len(texts)} Vietnamese texts...")
    results = []
    for i, text in enumerate(texts):
        results.append(tokenize_vietnamese(text))
        if (i + 1) % 10000 == 0:
            print(f"  Processed {i + 1}/{len(texts)} Vietnamese texts")
    return results

def tokenize_batch_lao(texts):
    """Batch tokenize Lao texts"""
    print(f"Tokenizing {len(texts)} Lao texts...")
    results = []
    for i, text in enumerate(texts):
        results.append(tokenize_lao(text))
        if (i + 1) % 10000 == 0:
            print(f"  Processed {i + 1}/{len(texts)} Lao texts")
    return results

print("Vietnamese and Lao tokenizers loaded successfully!")

Vietnamese and Lao tokenizers loaded successfully!


In [4]:
# ============================================================
# CELL 1: Load Best Model from Phase 2
# ============================================================
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained(
    "/work//m2m_vi_lo_phase2_decoder_only/best"
).cuda()

tokenizer = AutoTokenizer.from_pretrained(
    "/work/m2m_vi_lo_phase2_decoder_only/best"
)

print("Phase 2 model loaded!")

Phase 2 model loaded!


In [5]:
# ============================================================
# CELL 2: Unfreeze Top 6 Encoder Layers (layers 6-11)
# ============================================================
# 1. Freeze toàn bộ encoder trước
for param in model.model.encoder.parameters():
    param.requires_grad = False

# 2. Unfreeze top 6 layers
for i in range(6, 12):
    for param in model.model.encoder.layers[i].parameters():
        param.requires_grad = True

# Verify
total, frozen, trainable = 0, 0, 0
encoder_trainable = 0

for name, param in model.named_parameters():
    n = param.numel()
    total += n
    if not param.requires_grad:
        frozen += n
    else:
        trainable += n
        if "encoder" in name:
            encoder_trainable += n

print(f"Total params       : {total/1e6:.1f}M")
print(f"Frozen params      : {frozen/1e6:.1f}M")
print(f"Trainable params   : {trainable/1e6:.1f}M")
print(f"Encoder trainable  : {encoder_trainable/1e6:.1f}M")

Total params       : 483.9M
Frozen params      : 206.8M
Trainable params   : 277.1M
Encoder trainable  : 126.0M


In [6]:
DATA_DIR = "/work/data/dataset"
def load_parallel(src_file, tgt_file):
    with open(src_file, encoding="utf-8") as f:
        src = [l.strip() for l in f]
    with open(tgt_file, encoding="utf-8") as f:
        tgt = [l.strip() for l in f]
    
    assert len(src) == len(tgt)
    
    # Apply language-specific tokenization
    print("Tokenizing Vietnamese texts...")
    src_tokenized = tokenize_batch_vietnamese(src)
    
    print("Tokenizing Lao texts...")
    tgt_tokenized = tokenize_batch_lao(tgt)
    
    return Dataset.from_dict({
        "src_text": src_tokenized,
        "tgt_text": tgt_tokenized
    })


In [7]:
# Load toàn bộ dữ liệu từ train.vi và train.lo
full_dataset = load_parallel(
    f"{DATA_DIR}/train.vi",
    f"{DATA_DIR}/train.lo"
)

print(f"Total dataset size: {len(full_dataset)} examples")

# Chia dataset: 
# - Test: 1000 dòng cuối cùng
# - Dev: 3000 dòng trước test
# - Train: phần còn lại

test_size = 1000
dev_size = 3000

# Tính chỉ số
test_start_idx = len(full_dataset) - test_size
dev_start_idx = test_start_idx - dev_size

# Chia dataset
test_dataset = full_dataset.select(range(test_start_idx, len(full_dataset)))
dev_dataset = full_dataset.select(range(dev_start_idx, test_start_idx))
train_dataset = full_dataset.select(range(0, dev_start_idx))

# Shuffle training data để tránh bias thứ tự
train_dataset = train_dataset.shuffle(seed=42)

print(f"Train dataset: {len(train_dataset)} examples (for training)")
print(f"Dev dataset  : {len(dev_dataset)} examples (for validation during training)")
print(f"Test dataset : {len(test_dataset)} examples (for final evaluation)")
print("\nData split and shuffle completed.")

Tokenizing Vietnamese texts...
Tokenizing 695512 Vietnamese texts...
  Processed 10000/695512 Vietnamese texts
  Processed 20000/695512 Vietnamese texts
  Processed 30000/695512 Vietnamese texts
  Processed 40000/695512 Vietnamese texts
  Processed 50000/695512 Vietnamese texts
  Processed 60000/695512 Vietnamese texts
  Processed 70000/695512 Vietnamese texts
  Processed 80000/695512 Vietnamese texts
  Processed 90000/695512 Vietnamese texts
  Processed 100000/695512 Vietnamese texts
  Processed 110000/695512 Vietnamese texts
  Processed 120000/695512 Vietnamese texts
  Processed 130000/695512 Vietnamese texts
  Processed 140000/695512 Vietnamese texts
  Processed 150000/695512 Vietnamese texts
  Processed 160000/695512 Vietnamese texts
  Processed 170000/695512 Vietnamese texts
  Processed 180000/695512 Vietnamese texts
  Processed 190000/695512 Vietnamese texts
  Processed 200000/695512 Vietnamese texts
  Processed 210000/695512 Vietnamese texts
  Processed 220000/695512 Vietnamese 

In [8]:
MAX_LEN = 256

def preprocess(batch):
    tokenizer.src_lang = "vi"
    tokenizer.tgt_lang = "lo"

    inputs = tokenizer(
        batch["src_text"],
        truncation=True,
        max_length=MAX_LEN
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["tgt_text"],
            truncation=True,
            max_length=MAX_LEN
        )

    inputs["labels"] = labels["input_ids"]
    return inputs



In [9]:
train_dataset = train_dataset.map(
    preprocess,
    batched=True,
    remove_columns=train_dataset.column_names,
    num_proc=8
)

dev_dataset = dev_dataset.map(
    preprocess,
    batched=True,
    remove_columns=dev_dataset.column_names,
    num_proc=8
)


Map (num_proc=8):   0%|          | 0/691512 [00:00<?, ? examples/s]



Map (num_proc=8):   0%|          | 0/3000 [00:00<?, ? examples/s]



In [10]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True
)


In [11]:
import torch._dynamo
torch._dynamo.config.suppress_errors = True

In [12]:
# ============================================================
# CELL 3: Training Arguments - Phase 3
# ============================================================
training_args_phase3 = TrainingArguments(
    output_dir="/work/m2m_vi_lo_phase3_progressive",
    
    # Evaluation
    eval_strategy="steps",
    eval_steps=500,
    
    # Saving
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    
    # Logging
    logging_steps=100,
    
    # Batch size
    per_device_train_batch_size=128,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,
    
    # Learning rate - LOWER than phase 2
    learning_rate=3e-5,  # Lower for fine-tuning encoder
    lr_scheduler_type="cosine_with_restarts",
    warmup_ratio=0.05,
    
    # Regularization
    weight_decay=0.015,
    max_grad_norm=0.8,
    
    # Precision
    num_train_epochs=6,
    
    # FP16
    bf16=True,
    fp16=False,

    # Speed
    group_by_length=True,
    dataloader_num_workers=8,
    dataloader_pin_memory=True,
    
    # Best model
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    
    report_to="none",
)


In [13]:
# ============================================================
# CELL 4: Create Trainer - Phase 3
# ============================================================
trainer_phase3 = Trainer(
    model=model,
    args=training_args_phase3,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=4)
    ]
)


  trainer_phase3 = Trainer(


In [14]:
# ============================================================
# CELL 5: Train Phase 3
# ============================================================
print("\n" + "="*60)
print("PHASE 3: Progressive Unfreezing (Top 6 Encoder Layers)")
print("="*60 + "\n")

trainer_phase3.train()


PHASE 3: Progressive Unfreezing (Top 6 Encoder Layers)



Step,Training Loss,Validation Loss
500,0.9255,0.926515
1000,0.9641,0.932489
1500,0.9741,0.928035
2000,0.9643,0.919914
2500,0.9731,0.912851
3000,0.9292,0.908534
3500,0.9323,0.903722
4000,0.926,0.896435
4500,0.923,0.893608
5000,0.9204,0.887179


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=16212, training_loss=0.8766266609632378, metrics={'train_runtime': 3119.2004, 'train_samples_per_second': 1330.172, 'train_steps_per_second': 5.197, 'total_flos': 3.5294792901564826e+17, 'train_loss': 0.8766266609632378, 'epoch': 6.0})

In [15]:
# ============================================================
# CELL 6: Save Phase 3 Model
# ============================================================
trainer_phase3.save_model("/work/m2m_vi_lo_phase3_progressive/best")
tokenizer.save_pretrained("/work/m2m_vi_lo_phase3_progressive/best")

print("\nPhase 3 model saved!")



Phase 3 model saved!


In [16]:
# ============================================================
# CELL 7: Evaluate Phase 3
# ============================================================

def translate_batch(texts, model, tokenizer, batch_size=32):
    """Batch translation for speed"""
    model.eval()
    tokenizer.src_lang = "vi"
    tokenizer.tgt_lang = "lo"
    
    outputs = []
    
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=256
        ).to("cuda")
        
        with torch.no_grad():
            gen = model.generate(
                **inputs,
                forced_bos_token_id=tokenizer.get_lang_id("lo"),
                num_beams=5,
                max_length=256
            )
        
        texts_out = tokenizer.batch_decode(gen, skip_special_tokens=True)
        outputs.extend(texts_out)
        
        if (i // batch_size + 1) % 10 == 0:
            print(f"Translated {i+len(batch)}/{len(texts)}")
    
    return outputs

# Lấy dữ liệu test từ test_dataset (đã chia từ train.vi/train.lo)
test_vi = test_dataset["src_text"]
test_lo = test_dataset["tgt_text"]

print(f"\nTest set size: {len(test_vi)} examples")

# Translate
print("\nTranslating test set...")
preds_phase3 = translate_batch(test_vi, model, tokenizer)




Test set size: 1000 examples

Translating test set...
Translated 320/1000
Translated 640/1000
Translated 960/1000


In [17]:
from sacrebleu import corpus_bleu

# Calculate BLEU
bleu_phase3 = corpus_bleu(preds_phase3, [test_lo])
print(f"\n{'='*60}")
print(f"PHASE 3 BLEU Score: {bleu_phase3.score:.2f}")
print(f"{'='*60}\n")

# Save predictions
with open("/work/phase3_predictions.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(preds_phase3))


PHASE 3 BLEU Score: 27.31

