In [1]:
!pip install torch transformers sentencepiece datasets sacrebleu accelerate pyvi laonlp

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [2]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
from datasets import Dataset, concatenate_datasets
from transformers import (
    M2M100ForConditionalGeneration,
    M2M100Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
import random

from pyvi import ViTokenizer
from laonlp.tokenize import word_tokenize as lao_word_tokenize

print(torch.cuda.get_device_name(0))
print("VRAM:", torch.cuda.get_device_properties(0).total_memory / 1e9, "GB")

NVIDIA H200
VRAM: 150.0217344 GB


In [3]:
def tokenize_vietnamese(text):
    """Tokenize Vietnamese text using PyVi"""
    try:
        return ViTokenizer.tokenize(text)
    except Exception as e:
        print(f"Error tokenizing Vietnamese: {e}")
        return text

def tokenize_lao(text):
    """Tokenize Lao text using LaoNLP"""
    try:
        # LaoNLP word_tokenize không có parameter engine
        tokens = lao_word_tokenize(text)
        return " ".join(tokens)
    except Exception as e:
        print(f"Error tokenizing Lao: {e}")
        return text

def tokenize_batch_vietnamese(texts):
    """Batch tokenize Vietnamese texts"""
    print(f"Tokenizing {len(texts)} Vietnamese texts...")
    results = []
    for i, text in enumerate(texts):
        results.append(tokenize_vietnamese(text))
        if (i + 1) % 10000 == 0:
            print(f"  Processed {i + 1}/{len(texts)} Vietnamese texts")
    return results

def tokenize_batch_lao(texts):
    """Batch tokenize Lao texts"""
    print(f"Tokenizing {len(texts)} Lao texts...")
    results = []
    for i, text in enumerate(texts):
        results.append(tokenize_lao(text))
        if (i + 1) % 10000 == 0:
            print(f"  Processed {i + 1}/{len(texts)} Lao texts")
    return results

print("Vietnamese and Lao tokenizers loaded successfully!")

Vietnamese and Lao tokenizers loaded successfully!


In [4]:
# ============================================================
# CELL 2: Configuration
# ============================================================
PHASE3_MODEL = "./lo_to_vi/phase3/best"
DATA_DIR = "dataset"
OUTPUT_DIR = "./lo_to_vi/phase4"

MAX_LEN = 256
BATCH_SIZE = 128  # Slightly reduced for full model
GRAD_ACCUM = 4
LEARNING_RATE = 5e-5  # Very low for full fine-tuning
NUM_EPOCHS = 12  # More epochs with early stopping

In [5]:
# ============================================================
# CELL 3: Load Phase 3 Model
# ============================================================
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


print("Loading Phase 3 model...")
model = AutoModelForSeq2SeqLM.from_pretrained(PHASE3_MODEL).cuda()
tokenizer = AutoTokenizer.from_pretrained(PHASE3_MODEL)

print(f"Model parameters: {sum(p.numel() for p in model.parameters())/1e6:.1f}M")
print("Phase 3 model loaded successfully!")

Loading Phase 3 model...
Model parameters: 483.9M
Phase 3 model loaded successfully!


In [6]:
# ============================================================
# CELL 4: Unfreeze ALL Parameters
# ============================================================
print("\nUnfreezing all parameters...")

for param in model.parameters():
    param.requires_grad = True

# Verify
total, trainable = 0, 0
encoder_trainable, decoder_trainable = 0, 0

for name, param in model.named_parameters():
    n = param.numel()
    total += n
    
    if param.requires_grad:
        trainable += n
        if "encoder" in name:
            encoder_trainable += n
        elif "decoder" in name:
            decoder_trainable += n

print("\n" + "="*70)
print("PARAMETER STATISTICS")
print("="*70)
print(f"Total params        : {total/1e6:.1f}M")
print(f"Trainable params    : {trainable/1e6:.1f}M")
print(f"  - Encoder         : {encoder_trainable/1e6:.1f}M")
print(f"  - Decoder         : {decoder_trainable/1e6:.1f}M")
print(f"Trainable %         : 100.0%")
print("="*70)



Unfreezing all parameters...

PARAMETER STATISTICS
Total params        : 483.9M
Trainable params    : 483.9M
  - Encoder         : 201.6M
  - Decoder         : 151.2M
Trainable %         : 100.0%


In [7]:
# ============================================================
# CELL 5: Load Data
# ============================================================
def load_parallel(src_file, tgt_file):
    with open(src_file, encoding="utf-8") as f:
        src = [l.strip() for l in f]
    with open(tgt_file, encoding="utf-8") as f:
        tgt = [l.strip() for l in f]
    
    assert len(src) == len(tgt)
    
    # Apply language-specific tokenization
    print("Tokenizing Vietnamese texts...")
    src_tokenized = tokenize_batch_lao(src)
    
    print("Tokenizing Lao texts...")
    tgt_tokenized = tokenize_batch_vietnamese(tgt)
    
    return Dataset.from_dict({
        "src_text": src_tokenized,
        "tgt_text": tgt_tokenized
    })



# Load toàn bộ dữ liệu từ train.vi và train.lo
full_dataset = load_parallel(
    f"{DATA_DIR}/train_lo_to_vi_shuf.lo",
    f"{DATA_DIR}/train_lo_to_vi_shuf.vi"
)

print(f"Total dataset size: {len(full_dataset)} examples")

# Chia dataset: 
# - Test: 1000 dòng cuối cùng
# - Dev: 3000 dòng trước test
# - Train: phần còn lại

test_size = 1000
dev_size = 3000

# Tính chỉ số
test_start_idx = len(full_dataset) - test_size
dev_start_idx = test_start_idx - dev_size

# Chia dataset
test_dataset = full_dataset.select(range(test_start_idx, len(full_dataset)))
dev_dataset = full_dataset.select(range(dev_start_idx, test_start_idx))
train_dataset = full_dataset.select(range(0, dev_start_idx))

# Shuffle training data để tránh bias thứ tự
train_dataset = train_dataset.shuffle(seed=42)

print(f"Train dataset: {len(train_dataset)} examples (for training)")
print(f"Dev dataset  : {len(dev_dataset)} examples (for validation during training)")
print(f"Test dataset : {len(test_dataset)} examples (for final evaluation)")
print("\nData split and shuffle completed.")

Tokenizing Vietnamese texts...
Tokenizing 700000 Lao texts...
  Processed 10000/700000 Lao texts
  Processed 20000/700000 Lao texts
  Processed 30000/700000 Lao texts
  Processed 40000/700000 Lao texts
  Processed 50000/700000 Lao texts
  Processed 60000/700000 Lao texts
  Processed 70000/700000 Lao texts
  Processed 80000/700000 Lao texts
  Processed 90000/700000 Lao texts
  Processed 100000/700000 Lao texts
  Processed 110000/700000 Lao texts
  Processed 120000/700000 Lao texts
  Processed 130000/700000 Lao texts
  Processed 140000/700000 Lao texts
  Processed 150000/700000 Lao texts
  Processed 160000/700000 Lao texts
  Processed 170000/700000 Lao texts
  Processed 180000/700000 Lao texts
  Processed 190000/700000 Lao texts
  Processed 200000/700000 Lao texts
  Processed 210000/700000 Lao texts
  Processed 220000/700000 Lao texts
  Processed 230000/700000 Lao texts
  Processed 240000/700000 Lao texts
  Processed 250000/700000 Lao texts
  Processed 260000/700000 Lao texts
  Processed

In [20]:
MAX_LEN = 256

def preprocess(batch):
    tokenizer.src_lang = "lo"
    tokenizer.tgt_lang = "vi"

    inputs = tokenizer(
        batch["src_text"],
        truncation=True,
        max_length=MAX_LEN
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["tgt_text"],
            truncation=True,
            max_length=MAX_LEN
        )

    inputs["labels"] = labels["input_ids"]
    return inputs


In [21]:
# ============================================================
# CELL 7: Data Collator
# ============================================================
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
)


In [22]:
# Cell 8: Apply preprocessing
train_dataset = train_dataset.map(
    preprocess,
    batched=True,
    remove_columns=train_dataset.column_names,
    num_proc=8
)

dev_dataset = dev_dataset.map(
    preprocess,
    batched=True,
    remove_columns=dev_dataset.column_names,
    num_proc=8
)

Map (num_proc=8):   0%|          | 0/696000 [00:00<?, ? examples/s]



Map (num_proc=8):   0%|          | 0/3000 [00:00<?, ? examples/s]



In [23]:
# ============================================================
# CELL 8: Training Arguments
# ============================================================
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    
    # Evaluation & Saving
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    
    # Logging
    logging_steps=100,
    
    # Batch size
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=GRAD_ACCUM,
    
    # Learning rate - VERY LOW for full fine-tuning
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="linear",
    warmup_ratio=0.08,
    
    # Regularization - STRONGER to prevent overfitting
    weight_decay=0.1,
    max_grad_norm=0.4,  # Stricter clipping
    
    # Training
    num_train_epochs=NUM_EPOCHS,
    
    # FP16
    bf16=True,
    fp16=False,

    # Speed
    group_by_length=True,
    dataloader_num_workers=8,
    dataloader_pin_memory=True,
    
    remove_unused_columns=False,
    
    # Best model
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    
    report_to="none",
)

print("\nTraining configuration:")
print(f"Effective batch size: {BATCH_SIZE * GRAD_ACCUM}")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Epochs: {NUM_EPOCHS}")
print(f"Label smoothing: 0.15")


Training configuration:
Effective batch size: 512
Learning rate: 5e-05
Epochs: 12
Label smoothing: 0.15


In [24]:
# ============================================================
# CELL 9: Create Trainer
# ============================================================
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        callbacks=[
            EarlyStoppingCallback(early_stopping_patience=8)
        ]
    )

  trainer = Trainer(


In [25]:
print(type(trainer.train_dataset[0]))
print(trainer.train_dataset[0])

<class 'dict'>
{'input_ids': [128056, 5298, 42327, 221, 11864, 376, 61579, 45247, 39263, 19922, 7673, 42433, 9400, 5856, 6550, 17915, 16987, 221, 2816, 205, 61579, 45247, 4845, 58613, 41026, 25710, 19922, 7673, 42433, 13473, 7908, 12893, 7908, 221, 45450, 10621, 22, 13272, 9416, 15061, 221, 2816, 12893, 40951, 25946, 19966, 237, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [128097, 17829, 4338, 221, 745, 376, 13069, 666, 699, 3367, 31221, 666, 434, 76454, 221, 205, 13069, 666, 699, 6587, 3818, 22983, 21713, 2630, 121993, 106, 666, 402, 121993, 106, 221, 52187, 666, 17557, 2339, 714, 98, 77517, 666, 229, 26982, 237, 2]}


In [26]:
# ============================================================
# CELL 10: Train
# ============================================================
print("\n" + "="*70)
print("STARTING PHASE 4: FULL FINE-TUNING")
print("="*70 + "\n")



trainer.train()

print("\n" + "="*70)
print("TRAINING COMPLETED")
print("="*70)


STARTING PHASE 4: FULL FINE-TUNING



Step,Training Loss,Validation Loss
500,0.6836,0.745705
1000,0.6982,0.752885
1500,0.6861,0.753658
2000,0.6949,0.7476
2500,0.6961,0.741368
3000,0.6585,0.734965
3500,0.6597,0.732551
4000,0.657,0.725465
4500,0.6294,0.72465
5000,0.6363,0.719912


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



TRAINING COMPLETED


In [27]:
# ============================================================
# CELL 11: Save Model
# ============================================================
print("\nSaving best model...")
trainer.save_model(f"{OUTPUT_DIR}/best")
tokenizer.save_pretrained(f"{OUTPUT_DIR}/best")
print(f"Model saved to {OUTPUT_DIR}/best")


Saving best model...
Model saved to ./lo_to_vi/phase4/best


In [28]:
# ============================================================
# CELL 6: Evaluate Phase 2
# ============================================================
def translate_batch(texts, model, tokenizer, batch_size=32):
    """Batch translation for speed"""
    model.eval()
    tokenizer.src_lang = "lo"
    tokenizer.tgt_lang = "vi"
    
    outputs = []
    
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=256
        ).to("cuda")
        
        with torch.no_grad():
            gen = model.generate(
                **inputs,
                forced_bos_token_id=tokenizer.get_lang_id("vi"),
                num_beams=5,
                max_length=256
            )
        
        texts_out = tokenizer.batch_decode(gen, skip_special_tokens=True)
        outputs.extend(texts_out)
        
        if (i // batch_size + 1) % 10 == 0:
            print(f"Translated {i+len(batch)}/{len(texts)}")
    
    return outputs




In [29]:
# ============================================================
# CELL 13: Evaluate on Test Set
# ============================================================
# Load test data
# Lấy dữ liệu test từ test_dataset (đã chia từ train.vi/train.lo)
test_lo = test_dataset["src_text"]
test_vi = test_dataset["tgt_text"]

print(f"\nTest set size: {len(test_vi)} examples")
print("\nTranslating test set...")
predictions = translate_batch(test_lo, model, tokenizer)


from sacrebleu import corpus_bleu


# Calculate BLEU
bleu_score = corpus_bleu(predictions, [test_vi])



Test set size: 1000 examples

Translating test set...
Translated 320/1000
Translated 640/1000
Translated 960/1000


That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.


In [30]:
# ============================================================
# CELL 14: Final Results
# ============================================================
print("\n" + "="*70)
print("PHASE 4 RESULTS (FINAL)")
print("="*70)
print(f"BLEU Score: {bleu_score.score:.2f}")
print("="*70)

# Save predictions
with open(f"{OUTPUT_DIR}/test_predictions.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(predictions))

print(f"\nPredictions saved to {OUTPUT_DIR}/test_predictions.txt")


PHASE 4 RESULTS (FINAL)
BLEU Score: 52.01

Predictions saved to ./lo_to_vi/phase4/test_predictions.txt


In [31]:
# ============================================================
# CELL 15: Sample Translations
# ============================================================
print("\n" + "="*70)
print("SAMPLE TRANSLATIONS")
print("="*70)

for i in range(10):
    print(f"\nExample {i+1}:")
    print(f"Source    : {test_lo[i]}")
    print(f"Reference : {test_vi[i]}")
    print(f"Prediction: {predictions[i]}")
    print("-" * 70)



SAMPLE TRANSLATIONS

Example 1:
Source    : ພຽງ ແຕ່ ການຈັບ ຄູ່ ກະໂປງ ສັ້ນ ລາຍ ດອກ ກັບ ເສື້ອ ເປີດ ບ່າໄຫລ່ ກໍ ຈະ ເຮັດໃຫ້ ສາວ ໆ ມີ ຊຸດ ທີ່ເບິ່ງ ດີ ແລະ ເທ່ ທັນທີ .
Reference : Chỉ việc mix chân váy hoa dáng ngắn cùng áo trễ vai , các cô gái có ngay một set đồ " hack " dáng lại mát_rượi .
Prediction: Chỉ cần kết_hợp chân váy ngắn hoa với áo trễ vai là các cô nàng đã có ngay set đồ tôn dáng , cool ngầu .
----------------------------------------------------------------------

Example 2:
Source    : ຮູບເງົາ ບາງ ເລື່ອງ ທີ່   Phuong   Thanh   ໄດ້ ເຂົ້າຮ່ວມ ສະແດງ ລວມ ມີ :   Walking   and   Crying ,   Surrogate   Mother ,   When   Men   Get   Pregnant ,   The   Soul   of   Truong   Ba ,   the   Butcher ' s   Skin ,   The   Kiss   of   Death ,   Beautiful   Every   Centimeter ,   Rescuing   the   God   of   Death ,   ແລະ   Hot   Boy   Rebellion .
Reference : Các bộ phim Phương_Thanh đã tham_gia như : Vừa đi vừa khóc , Đẻ_mướn , Khi đàn_ông có bầu , Hồn Trương Ba , da hàng thịt , Nụ hôn thần_chết ,

In [34]:
# ============================================================
# CELL 16: Load All Phase Results
# ============================================================
print("\n" + "="*70)
print("LOADING RESULTS FROM ALL PHASES")
print("="*70)

results = {
    "phase1": 45.43,
    "phase2": 48.79,
    "phase3": 50.76,
    "phase4": bleu_score.score
}

# Try to load previous results
import os

if os.path.exists("./lo_to_vi/phase2/phase2_predictions.txt"):
    with open("./lo_to_vi/phase2/phase2_predictions.txt", encoding="utf-8") as f:
        phase2_preds = [l.strip() for l in f]
    results["phase2"] = corpus_bleu(phase2_preds, [test_lo]).score

if os.path.exists("./lo_to_vi/phase3/phase3_predictions.tx"):
    with open("./lo_to_vi/phase3/phase3_predictions.tx", encoding="utf-8") as f:
        phase3_preds = [l.strip() for l in f]
    results["phase3"] = corpus_bleu(phase3_preds, [test_lo]).score


LOADING RESULTS FROM ALL PHASES


In [35]:
# ============================================================
# CELL 17: Final Comparison
# ============================================================

import json
print("\n" + "="*70)
print("FINAL RESULTS COMPARISON")
print("="*70)
print(f"Phase1 (no freeze)       : {results['phase1']:.2f} BLEU")
if results['phase2']:
    print(f"Phase 2 (decoder only)     : {results['phase2']:.2f} BLEU")
if results['phase3']:
    print(f"Phase 3 (progressive)      : {results['phase3']:.2f} BLEU")
print(f"Phase 4 (full fine-tuning) : {results['phase4']:.2f} BLEU")

if results['phase2']:
    improvement = results['phase4'] - results['phase1']
    print(f"\nTotal improvement: +{improvement:.2f} BLEU")
print("="*70)

# Save results
with open(f"{OUTPUT_DIR}/all_results.json", "w") as f:
    json.dump(results, f, indent=2)

print(f"\nAll results saved to {OUTPUT_DIR}/all_results.json")

print("\n✓ All 4 phases completed successfully!")
print("\n" + "="*70)
print("TRAINING PIPELINE FINISHED")
print("="*70)


FINAL RESULTS COMPARISON
Phase1 (no freeze)       : 45.43 BLEU
Phase 2 (decoder only)     : 48.79 BLEU
Phase 3 (progressive)      : 50.76 BLEU
Phase 4 (full fine-tuning) : 52.01 BLEU

Total improvement: +6.58 BLEU

All results saved to ./lo_to_vi/phase4/all_results.json

✓ All 4 phases completed successfully!

TRAINING PIPELINE FINISHED
