In [1]:
!pip install torch transformers sentencepiece datasets sacrebleu accelerate pyvi laonlp

Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.2.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (10 kB)
Collecting datasets
  Downloading datasets-4.4.2-py3-none-any.whl.metadata (19 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting accelerate
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting laonlp
  Downloading LaoNLP-1.2.0-py3-none-any.whl.metadata (4.0 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.11.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from tra

In [2]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
from datasets import Dataset, concatenate_datasets
from transformers import (
    M2M100ForConditionalGeneration,
    M2M100Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
import random

from pyvi import ViTokenizer
from laonlp.tokenize import word_tokenize as lao_word_tokenize

print(torch.cuda.get_device_name(0))
print("VRAM:", torch.cuda.get_device_properties(0).total_memory / 1e9, "GB")

[2025-12-22 04:41:50,904] INFO numexpr.utils: Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
[2025-12-22 04:41:50,904] INFO numexpr.utils: Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[2025-12-22 04:41:50,904] INFO numexpr.utils: NumExpr defaulting to 8 threads.
NVIDIA H200
VRAM: 150.0217344 GB


In [3]:
def tokenize_vietnamese(text):
    """Tokenize Vietnamese text using PyVi"""
    try:
        return ViTokenizer.tokenize(text)
    except Exception as e:
        print(f"Error tokenizing Vietnamese: {e}")
        return text

def tokenize_lao(text):
    """Tokenize Lao text using LaoNLP"""
    try:
        # LaoNLP word_tokenize không có parameter engine
        tokens = lao_word_tokenize(text)
        return " ".join(tokens)
    except Exception as e:
        print(f"Error tokenizing Lao: {e}")
        return text

def tokenize_batch_vietnamese(texts):
    """Batch tokenize Vietnamese texts"""
    print(f"Tokenizing {len(texts)} Vietnamese texts...")
    results = []
    for i, text in enumerate(texts):
        results.append(tokenize_vietnamese(text))
        if (i + 1) % 10000 == 0:
            print(f"  Processed {i + 1}/{len(texts)} Vietnamese texts")
    return results

def tokenize_batch_lao(texts):
    """Batch tokenize Lao texts"""
    print(f"Tokenizing {len(texts)} Lao texts...")
    results = []
    for i, text in enumerate(texts):
        results.append(tokenize_lao(text))
        if (i + 1) % 10000 == 0:
            print(f"  Processed {i + 1}/{len(texts)} Lao texts")
    return results

print("Vietnamese and Lao tokenizers loaded successfully!")

Vietnamese and Lao tokenizers loaded successfully!


In [None]:
# ============================================================
# CELL 2: Configuration
# ============================================================
PHASE3_MODEL = "/work/m2m_vi_lo_phase3_progressive/best"
DATA_DIR = "/work/data/dataset"
OUTPUT_DIR = "/work/phase4_full"

MAX_LEN = 256
BATCH_SIZE = 128  # Slightly reduced for full model
GRAD_ACCUM = 4
LEARNING_RATE = 5e-5  # Very low for full fine-tuning
NUM_EPOCHS = 12  # More epochs with early stopping

In [5]:
# ============================================================
# CELL 3: Load Phase 3 Model
# ============================================================
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


print("Loading Phase 3 model...")
model = AutoModelForSeq2SeqLM.from_pretrained(PHASE3_MODEL).cuda()
tokenizer = AutoTokenizer.from_pretrained(PHASE3_MODEL)

print(f"Model parameters: {sum(p.numel() for p in model.parameters())/1e6:.1f}M")
print("Phase 3 model loaded successfully!")

Loading Phase 3 model...
Model parameters: 483.9M
Phase 3 model loaded successfully!


In [6]:
# ============================================================
# CELL 4: Unfreeze ALL Parameters
# ============================================================
print("\nUnfreezing all parameters...")

for param in model.parameters():
    param.requires_grad = True

# Verify
total, trainable = 0, 0
encoder_trainable, decoder_trainable = 0, 0

for name, param in model.named_parameters():
    n = param.numel()
    total += n
    
    if param.requires_grad:
        trainable += n
        if "encoder" in name:
            encoder_trainable += n
        elif "decoder" in name:
            decoder_trainable += n

print("\n" + "="*70)
print("PARAMETER STATISTICS")
print("="*70)
print(f"Total params        : {total/1e6:.1f}M")
print(f"Trainable params    : {trainable/1e6:.1f}M")
print(f"  - Encoder         : {encoder_trainable/1e6:.1f}M")
print(f"  - Decoder         : {decoder_trainable/1e6:.1f}M")
print(f"Trainable %         : 100.0%")
print("="*70)



Unfreezing all parameters...

PARAMETER STATISTICS
Total params        : 483.9M
Trainable params    : 483.9M
  - Encoder         : 201.6M
  - Decoder         : 151.2M
Trainable %         : 100.0%


In [7]:
# ============================================================
# CELL 5: Load Data
# ============================================================
def load_parallel(src_file, tgt_file):
    with open(src_file, encoding="utf-8") as f:
        src = [l.strip() for l in f]
    with open(tgt_file, encoding="utf-8") as f:
        tgt = [l.strip() for l in f]
    
    assert len(src) == len(tgt)
    
    # Apply language-specific tokenization
    print("Tokenizing Vietnamese texts...")
    src_tokenized = tokenize_batch_vietnamese(src)
    
    print("Tokenizing Lao texts...")
    tgt_tokenized = tokenize_batch_lao(tgt)
    
    return Dataset.from_dict({
        "src_text": src_tokenized,
        "tgt_text": tgt_tokenized
    })


print("\nLoading datasets...")
# Load toàn bộ dữ liệu từ train.vi và train.lo
full_dataset = load_parallel(
    f"{DATA_DIR}/train.vi",
    f"{DATA_DIR}/train.lo"
)

print(f"Total dataset size: {len(full_dataset)} examples")

# Chia dataset: 
# - Test: 1000 dòng cuối cùng
# - Dev: 3000 dòng trước test
# - Train: phần còn lại

test_size = 1000
dev_size = 3000

# Tính chỉ số
test_start_idx = len(full_dataset) - test_size
dev_start_idx = test_start_idx - dev_size

# Chia dataset
test_dataset = full_dataset.select(range(test_start_idx, len(full_dataset)))
dev_dataset = full_dataset.select(range(dev_start_idx, test_start_idx))
train_dataset = full_dataset.select(range(0, dev_start_idx))

# Shuffle training data để tránh bias thứ tự
train_dataset = train_dataset.shuffle(seed=42)

print(f"Train dataset: {len(train_dataset)} examples (for training)")
print(f"Dev dataset  : {len(dev_dataset)} examples (for validation during training)")
print(f"Test dataset : {len(test_dataset)} examples (for final evaluation)")
print("\nData split and shuffle completed.")


Loading datasets...
Tokenizing Vietnamese texts...
Tokenizing 695512 Vietnamese texts...
  Processed 10000/695512 Vietnamese texts
  Processed 20000/695512 Vietnamese texts
  Processed 30000/695512 Vietnamese texts
  Processed 40000/695512 Vietnamese texts
  Processed 50000/695512 Vietnamese texts
  Processed 60000/695512 Vietnamese texts
  Processed 70000/695512 Vietnamese texts
  Processed 80000/695512 Vietnamese texts
  Processed 90000/695512 Vietnamese texts
  Processed 100000/695512 Vietnamese texts
  Processed 110000/695512 Vietnamese texts
  Processed 120000/695512 Vietnamese texts
  Processed 130000/695512 Vietnamese texts
  Processed 140000/695512 Vietnamese texts
  Processed 150000/695512 Vietnamese texts
  Processed 160000/695512 Vietnamese texts
  Processed 170000/695512 Vietnamese texts
  Processed 180000/695512 Vietnamese texts
  Processed 190000/695512 Vietnamese texts
  Processed 200000/695512 Vietnamese texts
  Processed 210000/695512 Vietnamese texts
  Processed 2200

In [8]:
# ============================================================
# CELL 6: Preprocessing
# ============================================================
def preprocess(batch):
    tokenizer.src_lang = "vi"
    tokenizer.tgt_lang = "lo"

    inputs = tokenizer(
        batch["src_text"],
        truncation=True,
        max_length=MAX_LEN
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["tgt_text"],
            truncation=True,
            max_length=MAX_LEN
        )

    inputs["labels"] = labels["input_ids"]
    return inputs

print("\nPreprocessing datasets...")
train_processed = train_dataset.map(
    preprocess,
    batched=True,
    remove_columns=train_dataset.column_names,
    num_proc=8,
    desc="Processing train"
)

dev_processed = dev_dataset.map(
    preprocess,
    batched=True,
    remove_columns=dev_dataset.column_names,
    num_proc=8,
    desc="Processing dev"
)

print("Preprocessing completed!")



Preprocessing datasets...


Processing train (num_proc=8):   0%|          | 0/691512 [00:00<?, ? examples/s]



Processing dev (num_proc=8):   0%|          | 0/3000 [00:00<?, ? examples/s]



Preprocessing completed!


In [9]:
# ============================================================
# CELL 7: Data Collator
# ============================================================
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
)


In [10]:
# ============================================================
# CELL 8: Training Arguments
# ============================================================
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    
    # Evaluation & Saving
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    
    # Logging
    logging_steps=100,
    logging_dir=f"{OUTPUT_DIR}/logs",
    
    # Batch size
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=GRAD_ACCUM,
    
    # Learning rate - VERY LOW for full fine-tuning
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="linear",
    warmup_ratio=0.08,
    
    # Regularization - STRONGER to prevent overfitting
    weight_decay=0.1,
    max_grad_norm=0.4,  # Stricter clipping
    
    # Training
    num_train_epochs=NUM_EPOCHS,
    
    # FP16
    bf16=True,
    fp16=False,

    # Speed
    group_by_length=True,
    dataloader_num_workers=8,
    dataloader_pin_memory=True,
    
    
    # Best model
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    
    report_to="none",
)

print("\nTraining configuration:")
print(f"Effective batch size: {BATCH_SIZE * GRAD_ACCUM}")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Epochs: {NUM_EPOCHS}")
print(f"Label smoothing: 0.15")


Training configuration:
Effective batch size: 512
Learning rate: 5e-05
Epochs: 12
Label smoothing: 0.15


In [11]:
# ============================================================
# CELL 9: Create Trainer
# ============================================================
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_processed,
        eval_dataset=dev_processed,
        tokenizer=tokenizer,
        data_collator=data_collator,
        callbacks=[
            EarlyStoppingCallback(early_stopping_patience=8)
        ]
    )

  trainer = Trainer(


In [12]:
# ============================================================
# CELL 10: Train
# ============================================================
print("\n" + "="*70)
print("STARTING PHASE 4: FULL FINE-TUNING")
print("="*70 + "\n")



trainer.train()

print("\n" + "="*70)
print("TRAINING COMPLETED")
print("="*70)


STARTING PHASE 4: FULL FINE-TUNING



Step,Training Loss,Validation Loss
500,0.8128,0.851697
1000,0.8308,0.861033
1500,0.8262,0.870433
2000,0.8323,0.861779
2500,0.8297,0.855557
3000,0.7877,0.850067
3500,0.7982,0.843864
4000,0.7892,0.834091
4500,0.7563,0.834912
5000,0.7571,0.82634


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



TRAINING COMPLETED


In [13]:
# ============================================================
# CELL 11: Save Model
# ============================================================
print("\nSaving best model...")
trainer.save_model(f"{OUTPUT_DIR}/best")
tokenizer.save_pretrained(f"{OUTPUT_DIR}/best")
print(f"Model saved to {OUTPUT_DIR}/best")


Saving best model...
Model saved to /work/phase4_full/best


In [14]:
# ============================================================
# CELL 12: Evaluation Function
# ============================================================
def translate_batch(texts, model, tokenizer, batch_size=32):
    """Batch translation for speed"""
    model.eval()
    tokenizer.src_lang = "vi"
    tokenizer.tgt_lang = "lo"
    
    outputs = []
    
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=256
        ).to("cuda")
        
        with torch.no_grad():
            gen = model.generate(
                **inputs,
                forced_bos_token_id=tokenizer.get_lang_id("lo"),
                num_beams=5,
                max_length=256
            )
        
        texts_out = tokenizer.batch_decode(gen, skip_special_tokens=True)
        outputs.extend(texts_out)
        
        if (i // batch_size + 1) % 10 == 0:
            print(f"Translated {i+len(batch)}/{len(texts)}")
    
    return outputs

In [15]:
# ============================================================
# CELL 13: Evaluate on Test Set
# ============================================================
# Load test data
# Lấy dữ liệu test từ test_dataset (đã chia từ train.vi/train.lo)
test_vi = test_dataset["src_text"]
test_lo = test_dataset["tgt_text"]

print(f"\nTest set size: {len(test_vi)} examples")
print("\nTranslating test set...")

# Translate
print("\nTranslating test set...")
predictions = translate_batch(test_vi, model, tokenizer)

from sacrebleu import corpus_bleu


# Calculate BLEU
bleu_score = corpus_bleu(predictions, [test_lo])



Test set size: 1000 examples

Translating test set...

Translating test set...
Translated 320/1000
Translated 640/1000
Translated 960/1000


In [16]:
# ============================================================
# CELL 14: Final Results
# ============================================================
print("\n" + "="*70)
print("PHASE 4 RESULTS (FINAL)")
print("="*70)
print(f"BLEU Score: {bleu_score.score:.2f}")
print("="*70)

# Save predictions
with open(f"{OUTPUT_DIR}/test_predictions.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(predictions))

print(f"\nPredictions saved to {OUTPUT_DIR}/test_predictions.txt")


PHASE 4 RESULTS (FINAL)
BLEU Score: 28.72

Predictions saved to /work/phase4_full/test_predictions.txt


In [17]:
# ============================================================
# CELL 15: Sample Translations
# ============================================================
print("\n" + "="*70)
print("SAMPLE TRANSLATIONS")
print("="*70)

for i in range(10):
    print(f"\nExample {i+1}:")
    print(f"Source    : {test_vi[i]}")
    print(f"Reference : {test_lo[i]}")
    print(f"Prediction: {predictions[i]}")
    print("-" * 70)



SAMPLE TRANSLATIONS

Example 1:
Source    : Chính_phủ Hoa_Kỳ sẽ tiếp_tục hợp_tác với Văn_phòng Tình_báo Quốc_gia ( NRA ) và UXO Lao để xác_định các phương_thức khác nhằm tăng_cường năng_lực của ngành rà phá bom mìn chưa nổ , phù_hợp với các ưu_tiên của chính_phủ Lào và tầm nhìn_chung về việc loại_bỏ bom mìn chưa nổ như một trở_ngại cho sự phát_triển vào năm 2030 .
Reference : ລັດຖະບານ   ສະຫະລັດ ອາເມຣິກາ ຈະ ສືບຕໍ່ ເຮັດວຽກ ກັບ   ຄຊກລ   ( NRA )   ແລະ   ຄກລ   ( UXO   Lao )   ເພື່ອ ກຳນົດ ວິທີການ ອື່ນ ໆ   ໃນ ການຍົກສູງ   ຄວາມ ອາດ ສາ   ມາດ ຂອງ ຂະແໜງ ເກັບ ກູ້ ລະເບີດ ບໍ່ທັນ ແຕກ   ໃຫ້ ສອດຄ່ອງ   ກັບ ບູລິມະສິດ ຂອງ ລັດຖະບານ ລາວ   ແລະ ມີ ວິໄສທັດ ຮ່ວມກັນ   ໃນ ການລົບລ້າງ ລະເບີດ ບໍ່ທັນ ແຕກ   ທີ ເປັນ ສິ່ງ ກີດຂວາງ ຕໍ່ ການພັດທະນາ   ພາຍໃນ ປີ   2030
Prediction: ລັດຖະບານ ສະຫະລັດ ອາເມຣິກາ ຈະ ສືບຕໍ່ ເຮັດວຽກ ຮ່ວມ ກັບ ຫ້ອງການ ສືບ ລັບ ແຫ່ງຊາດ ( NRA ) ແລະ UXO Lao ເພື່ອ ກໍານົດ ວິທີການ ອື່ນ ໆ ເພື່ອ ເສີມສ້າງ ຄວາມ ອາດ ສາມາດ ໃນ ການເກັບ ກູ້ ລະເບີດ ບໍ່ທັນ ແຕກ ໃຫ້ ສອດຄ່ອງ ກັບ ບູລິມະສິດ ຂອງ ລັດຖະບານ ລາວ ແລະ ວິໄສທັດ ຮ່ວມກັນ

In [18]:
# ============================================================
# CELL 16: Load All Phase Results
# ============================================================
print("\n" + "="*70)
print("LOADING RESULTS FROM ALL PHASES")
print("="*70)

results = {
    "phase1": 23.28,
    "phase2": 25.71,
    "phase3": 26.92,
    "phase4": bleu_score.score
}

# Try to load previous results
import os

if os.path.exists("/work/phase2_predictions.txt"):
    with open("/work/phase2_predictions.txt", encoding="utf-8") as f:
        phase2_preds = [l.strip() for l in f]
    results["phase2"] = corpus_bleu(phase2_preds, [test_lo]).score

if os.path.exists("/work/phase3_predictions.tx"):
    with open("/work/phase3_predictions.tx", encoding="utf-8") as f:
        phase3_preds = [l.strip() for l in f]
    results["phase3"] = corpus_bleu(phase3_preds, [test_lo]).score


LOADING RESULTS FROM ALL PHASES


In [19]:
# ============================================================
# CELL 17: Final Comparison
# ============================================================

import json
print("\n" + "="*70)
print("FINAL RESULTS COMPARISON")
print("="*70)
print(f"Phase1 (no freeze)       : {results['phase1']:.2f} BLEU")
if results['phase2']:
    print(f"Phase 2 (decoder only)     : {results['phase2']:.2f} BLEU")
if results['phase3']:
    print(f"Phase 3 (progressive)      : {results['phase3']:.2f} BLEU")
print(f"Phase 4 (full fine-tuning) : {results['phase4']:.2f} BLEU")

if results['phase2']:
    improvement = results['phase4'] - results['phase1']
    print(f"\nTotal improvement: +{improvement:.2f} BLEU")
print("="*70)

# Save results
with open(f"{OUTPUT_DIR}/all_results.json", "w") as f:
    json.dump(results, f, indent=2)

print(f"\nAll results saved to {OUTPUT_DIR}/all_results.json")

print("\n✓ All 4 phases completed successfully!")
print("\n" + "="*70)
print("TRAINING PIPELINE FINISHED")
print("="*70)


FINAL RESULTS COMPARISON
Phase1 (no freeze)       : 23.28 BLEU
Phase 2 (decoder only)     : 25.71 BLEU
Phase 3 (progressive)      : 26.92 BLEU
Phase 4 (full fine-tuning) : 28.72 BLEU

Total improvement: +5.44 BLEU

All results saved to /work/phase4_full/all_results.json

✓ All 4 phases completed successfully!

TRAINING PIPELINE FINISHED
