In [1]:
!pip install torch transformers sentencepiece datasets sacrebleu accelerate pyvi laonlp

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [2]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
from datasets import Dataset, concatenate_datasets
from transformers import (
    M2M100ForConditionalGeneration,
    M2M100Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
import random

from pyvi import ViTokenizer
from laonlp.tokenize import word_tokenize as lao_word_tokenize

# Sanity check: name of CUDA device 0 and its total memory in decimal GB (bytes / 1e9).
print(torch.cuda.get_device_name(0))
print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9} GB")

NVIDIA H200
VRAM: 150.0217344 GB


In [3]:
def tokenize_vietnamese(text):
    """Word-segment one Vietnamese string with PyVi.

    Returns the result of ``ViTokenizer.tokenize(text)``. If that call raises
    any ``Exception``, the error is printed and ``text`` is returned unchanged.
    """
    try:
        segmented = ViTokenizer.tokenize(text)
    except Exception as err:
        print(f"Error tokenizing Vietnamese: {err}")
        return text
    return segmented

def tokenize_lao(text):
    """Word-segment one Lao string with LaoNLP and join the tokens with spaces.

    If tokenizing or joining raises any ``Exception``, the error is printed
    and ``text`` is returned unchanged.
    """
    try:
        # LaoNLP's word_tokenize does not take an `engine` parameter.
        return " ".join(lao_word_tokenize(text))
    except Exception as err:
        print(f"Error tokenizing Lao: {err}")
        return text

def tokenize_batch_vietnamese(texts):
    """Apply ``tokenize_vietnamese`` to every item of ``texts``, in order.

    Prints the batch size up front and a progress line after every 10000
    items. Returns a new list with one tokenized string per input.
    """
    total = len(texts)
    print(f"Tokenizing {total} Vietnamese texts...")
    segmented = []
    for done, sentence in enumerate(texts, start=1):
        segmented.append(tokenize_vietnamese(sentence))
        if done % 10000 == 0:
            print(f"  Processed {done}/{total} Vietnamese texts")
    return segmented

def tokenize_batch_lao(texts):
    """Apply ``tokenize_lao`` to every item of ``texts``, in order.

    Prints the batch size up front and a progress line after every 10000
    items. Returns a new list with one tokenized string per input.
    """
    total = len(texts)
    print(f"Tokenizing {total} Lao texts...")
    segmented = []
    for done, sentence in enumerate(texts, start=1):
        segmented.append(tokenize_lao(sentence))
        if done % 10000 == 0:
            print(f"  Processed {done}/{total} Lao texts")
    return segmented

# Reaching this line only confirms that the helper definitions in this cell executed;
# no text has been tokenized yet.
print("Vietnamese and Lao tokenizers loaded successfully!")

Vietnamese and Lao tokenizers loaded successfully!


In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Restore the tokenizer and the model weights from the same phase-1 checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained("./lo_to_vi/phase1/best")

model = AutoModelForSeq2SeqLM.from_pretrained("./lo_to_vi/phase1/best")
model = model.cuda()  # move the weights onto the current CUDA device

print("Phase 1 model loaded!")

Phase 1 model loaded!


In [5]:
# ============================================================
# CELL 1: Freeze Encoder
# ============================================================
# Disable gradients for every parameter reachable from the encoder.
# NOTE(review): parameters shared with the encoder (e.g. tied embeddings, if any)
# are frozen as well, because they are the same tensors.
for param in model.model.encoder.parameters():
    param.requires_grad = False

# Verify: tally parameter counts by trainability.
total = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
frozen = total - trainable

print(f"Total params    : {total/1e6:.1f}M")
print(f"Frozen params   : {frozen/1e6:.1f}M")
print(f"Trainable params: {trainable/1e6:.1f}M")

Total params    : 483.9M
Frozen params   : 282.3M
Trainable params: 201.6M


In [6]:
DATA_DIR = "dataset"


def _read_stripped_lines(path):
    """Return every line of the UTF-8 text file at ``path`` with surrounding whitespace stripped."""
    with open(path, encoding="utf-8") as handle:
        return [line.strip() for line in handle]


def load_parallel(src_file, tgt_file):
    """Load a line-aligned Lao -> Vietnamese corpus as a tokenized ``Dataset``.

    Args:
        src_file: Path to the Lao side, one sentence per line (UTF-8).
        tgt_file: Path to the Vietnamese side, aligned line-by-line with ``src_file``.

    Returns:
        A ``Dataset`` with two columns: ``"src_text"`` (output of
        ``tokenize_batch_lao``) and ``"tgt_text"`` (output of
        ``tokenize_batch_vietnamese``).

    Raises:
        AssertionError: If the two files do not have the same number of lines.
    """
    src = _read_stripped_lines(src_file)
    tgt = _read_stripped_lines(tgt_file)

    assert len(src) == len(tgt), (
        f"Parallel files are misaligned: {len(src)} source lines "
        f"vs {len(tgt)} target lines"
    )

    # Apply language-specific tokenization. The source side is Lao and the
    # target side is Vietnamese; the progress messages name the language that
    # is actually being processed (they used to be swapped).
    print("Tokenizing Lao texts...")
    src_tokenized = tokenize_batch_lao(src)

    print("Tokenizing Vietnamese texts...")
    tgt_tokenized = tokenize_batch_vietnamese(tgt)

    return Dataset.from_dict({
        "src_text": src_tokenized,
        "tgt_text": tgt_tokenized
    })


In [7]:
# Load the whole Lao -> Vietnamese parallel corpus (the file names suggest the
# lines were already shuffled on disk).
full_dataset = load_parallel(
    f"{DATA_DIR}/train_lo_to_vi_shuf.lo",
    f"{DATA_DIR}/train_lo_to_vi_shuf.vi"
)

print(f"Total dataset size: {len(full_dataset)} examples")

# Split layout, by row position:
# - Test : the final 1000 rows
# - Dev  : the 3000 rows immediately before the test rows
# - Train: everything before the dev rows
dev_size = 3000
test_size = 1000

# Boundary indices
test_start_idx = len(full_dataset) - test_size
dev_start_idx = test_start_idx - dev_size

# Carve out the three splits; the training rows are shuffled (fixed seed)
# to avoid any ordering bias.
train_dataset = full_dataset.select(range(dev_start_idx)).shuffle(seed=42)
dev_dataset = full_dataset.select(range(dev_start_idx, test_start_idx))
test_dataset = full_dataset.select(range(test_start_idx, len(full_dataset)))

print(
    f"Train dataset: {len(train_dataset)} examples (for training)",
    f"Dev dataset  : {len(dev_dataset)} examples (for validation during training)",
    f"Test dataset : {len(test_dataset)} examples (for final evaluation)",
    sep="\n",
)
print("\nData split and shuffle completed.")

Tokenizing Vietnamese texts...
Tokenizing 700000 Lao texts...
  Processed 10000/700000 Lao texts
  Processed 20000/700000 Lao texts
  Processed 30000/700000 Lao texts
  Processed 40000/700000 Lao texts
  Processed 50000/700000 Lao texts
  Processed 60000/700000 Lao texts
  Processed 70000/700000 Lao texts
  Processed 80000/700000 Lao texts
  Processed 90000/700000 Lao texts
  Processed 100000/700000 Lao texts
  Processed 110000/700000 Lao texts
  Processed 120000/700000 Lao texts
  Processed 130000/700000 Lao texts
  Processed 140000/700000 Lao texts
  Processed 150000/700000 Lao texts
  Processed 160000/700000 Lao texts
  Processed 170000/700000 Lao texts
  Processed 180000/700000 Lao texts
  Processed 190000/700000 Lao texts
  Processed 200000/700000 Lao texts
  Processed 210000/700000 Lao texts
  Processed 220000/700000 Lao texts
  Processed 230000/700000 Lao texts
  Processed 240000/700000 Lao texts
  Processed 250000/700000 Lao texts
  Processed 260000/700000 Lao texts
  Processed

In [8]:
MAX_LEN = 256


def preprocess(batch):
    """Tokenize a batch of Lao -> Vietnamese sentence pairs for seq2seq training.

    Args:
        batch: Mapping with ``"src_text"`` (list of source strings) and
            ``"tgt_text"`` (list of target strings), as produced by a batched
            ``Dataset.map`` call.

    Returns:
        The tokenizer's encoding of the sources, with the target token ids
        stored under ``"labels"``. Both sides are truncated to ``MAX_LEN``.
    """
    # Language codes must be set before encoding.
    # NOTE(review): presumably they select the language tokens the M2M100
    # tokenizer adds to each sequence - confirm.
    tokenizer.src_lang = "lo"
    tokenizer.tgt_lang = "vi"

    # `text_target=` encodes the targets in target mode and places their ids
    # under "labels" in one call; it replaces the deprecated
    # `with tokenizer.as_target_tokenizer():` context manager.
    return tokenizer(
        batch["src_text"],
        text_target=batch["tgt_text"],
        truncation=True,
        max_length=MAX_LEN,
    )


In [9]:
def _tokenize_split(split):
    """Run ``preprocess`` over ``split`` in batches on 8 processes, dropping its original columns."""
    return split.map(
        preprocess,
        batched=True,
        remove_columns=split.column_names,
        num_proc=8
    )


# Replace the raw-text splits with their tokenized versions. This cell is not
# re-runnable on its own: after it runs, the text columns are gone.
train_dataset = _tokenize_split(train_dataset)

dev_dataset = _tokenize_split(dev_dataset)


Map (num_proc=8):   0%|          | 0/696000 [00:00<?, ? examples/s]



Map (num_proc=8):   0%|          | 0/3000 [00:00<?, ? examples/s]



In [10]:
# Seq2seq collator with padding enabled, so sequences are padded when batches
# are assembled rather than at tokenization time (preprocess does not pad).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)


In [11]:
import torch._dynamo
# NOTE(review): presumably makes TorchDynamo fall back to eager execution instead
# of raising when compilation fails - confirm. The TrainingArguments in this
# notebook do not enable torch.compile, so this setting may have no effect here.
torch._dynamo.config.suppress_errors = True

In [12]:
training_args_phase2 = TrainingArguments(
    output_dir="./lo_to_vi/phase2",
    report_to="none",

    # Schedule: 3 epochs, training loss logged every 100 steps
    num_train_epochs=3,
    logging_steps=100,

    # Evaluate and checkpoint on the same 500-step cadence; cap kept checkpoints at 2
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,

    # Best-checkpoint selection: lower eval_loss is better
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Batch sizes (the encoder is frozen, so the train batch is pushed large):
    # 160 per device with 2 gradient-accumulation steps
    per_device_train_batch_size=160,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=64,

    # Optimizer and learning-rate schedule
    learning_rate=8e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.01,
    max_grad_norm=1.0,

    # Precision: bf16 on, fp16 explicitly off
    bf16=True,
    fp16=False,

    # Data loading / throughput
    group_by_length=True,
    dataloader_num_workers=8,
    dataloader_pin_memory=True,
)


In [13]:
# ============================================================
# CELL 3: Create Trainer - Phase 2
# ============================================================
# Phase-2 trainer: same model object (encoder already frozen), tokenized
# train/dev splits, and early stopping driven by the evaluation metric.
trainer_phase2 = Trainer(
    model=model,
    args=training_args_phase2,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    # `tokenizer=` is deprecated in Trainer and emits a warning on construction;
    # `processing_class=` is its replacement (requires a transformers release
    # that already issues that deprecation warning).
    processing_class=tokenizer,
    data_collator=data_collator,
    callbacks=[
        # Stop once the monitored metric has not improved for 3 evaluations in a row.
        EarlyStoppingCallback(early_stopping_patience=3)
    ]
)


  trainer_phase2 = Trainer(


In [14]:
# ============================================================
# CELL 4: Train Phase 2
# ============================================================
# Print a banner, then start training. The train() call stays the last
# expression of the cell so the notebook displays its return value.
print(
    "\n" + "=" * 60,
    "PHASE 2: Training Decoder Only (Encoder Frozen)",
    "=" * 60 + "\n",
    sep="\n",
)

trainer_phase2.train()



PHASE 2: Training Decoder Only (Encoder Frozen)



Step,Training Loss,Validation Loss
500,0.9991,0.948252
1000,0.9606,0.919356
1500,0.9525,0.894836
2000,0.9375,0.872824
2500,0.8798,0.855516
3000,0.8713,0.839672
3500,0.8595,0.828258
4000,0.8484,0.819867
4500,0.8139,0.817354
5000,0.812,0.810656


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=6525, training_loss=0.8815156236553557, metrics={'train_runtime': 1387.1865, 'train_samples_per_second': 1505.205, 'train_steps_per_second': 4.704, 'total_flos': 2.815068878143488e+17, 'train_loss': 0.8815156236553557, 'epoch': 3.0})

In [15]:
# Debug check: list the columns left on the training split after preprocessing.
print(train_dataset.column_names)


['input_ids', 'attention_mask', 'labels']


In [16]:
# ============================================================
# CELL 5: Save Phase 2 Model
# ============================================================
# Write the trainer's current model and the tokenizer into the same directory
# so the pair can be reloaded together with from_pretrained().
# NOTE(review): load_best_model_at_end=True is set in the training arguments, so
# the weights saved here are presumably those of the best checkpoint - confirm.
trainer_phase2.save_model("./lo_to_vi/phase2/best")
tokenizer.save_pretrained("./lo_to_vi/phase2/best")

print("\nPhase 2 model saved!")


Phase 2 model saved!


In [17]:
# ============================================================
# CELL 6: Evaluate Phase 2
# ============================================================
def translate_batch(texts, model, tokenizer, batch_size=32,
                    src_lang="lo", tgt_lang="vi", num_beams=5, max_length=256):
    """Translate a sequence of sentences in fixed-size batches using beam search.

    Args:
        texts: Sequence of source strings (must support ``len`` and slicing).
        model: Seq2seq model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer exposing ``src_lang``/``tgt_lang``,
            ``get_lang_id()`` and ``batch_decode()``.
        batch_size: Number of sentences per generation call.
        src_lang: Source language code set on the tokenizer (default ``"lo"``).
        tgt_lang: Target language code; its id is forced as the first
            generated token (default ``"vi"``).
        num_beams: Beam width passed to ``generate``.
        max_length: Truncation length for the inputs and generation length cap.

    Returns:
        List of decoded translations, one per input, in input order.
        An empty ``texts`` yields an empty list.

    Side effects:
        Puts ``model`` in eval mode, overwrites ``tokenizer.src_lang`` and
        ``tokenizer.tgt_lang``, and prints progress after every 10th batch.
    """
    model.eval()
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang

    # Follow the model's own device instead of hard-coding "cuda", so the
    # inputs always land where the weights are. Falls back to "cuda" for
    # objects that expose no `.device` attribute.
    device = getattr(model, "device", "cuda")
    forced_bos_token_id = tokenizer.get_lang_id(tgt_lang)

    outputs = []
    total = len(texts)

    for batch_no, start in enumerate(range(0, total, batch_size), start=1):
        # list(...) so column-like sequences are handed to the tokenizer as a plain list.
        batch = list(texts[start:start + batch_size])

        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length
        ).to(device)

        with torch.no_grad():
            gen = model.generate(
                **inputs,
                forced_bos_token_id=forced_bos_token_id,
                num_beams=num_beams,
                max_length=max_length
            )

        outputs.extend(tokenizer.batch_decode(gen, skip_special_tokens=True))

        if batch_no % 10 == 0:
            print(f"Translated {start + len(batch)}/{total}")

    return outputs

# Pull the held-out test pairs out of test_dataset (split off the training
# corpus earlier): Lao sources and Vietnamese references.
test_lo, test_vi = (test_dataset[column] for column in ("src_text", "tgt_text"))

print(
    f"\nTest set size: {len(test_vi)} examples",
    "\nTranslating test set...",
    sep="\n",
)
preds_phase2 = translate_batch(test_lo, model, tokenizer)




Test set size: 1000 examples

Translating test set...
Translated 320/1000
Translated 640/1000
Translated 960/1000


In [18]:

from sacrebleu import corpus_bleu

# Corpus-level BLEU of the phase-2 predictions against the single reference set.
# NOTE(review): the references were word-segmented with PyVi when the dataset was
# loaded and the predictions carry the same segmentation, so this score is
# computed on segmented text (sacreBLEU warns that the data looks tokenized).
# Consider undoing the segmentation on both sides before scoring.
bleu_phase2 = corpus_bleu(preds_phase2, [test_vi])
print(
    f"\n{'='*60}",
    f"PHASE 2 BLEU Score: {bleu_phase2.score:.2f}",
    f"{'='*60}\n",
    sep="\n",
)

# Save predictions: one per line, with no newline after the last one.
with open("./lo_to_vi/phase2/phase2_predictions.txt", "w", encoding="utf-8") as f:
    print(*preds_phase2, sep="\n", end="", file=f)

That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.



PHASE 2 BLEU Score: 48.79



In [19]:
# ============================================================
# CELL 15: Sample Translations
# ============================================================
# Show the first 10 test examples side by side: source, reference, model output.
print("\n" + "=" * 70, "SAMPLE TRANSLATIONS", "=" * 70, sep="\n")

for i in range(10):
    print(
        f"\nExample {i+1}:",
        f"Source    : {test_lo[i]}",
        f"Reference : {test_vi[i]}",
        f"Prediction: {preds_phase2[i]}",
        "-" * 70,
        sep="\n",
    )



SAMPLE TRANSLATIONS

Example 1:
Source    : ພຽງ ແຕ່ ການຈັບ ຄູ່ ກະໂປງ ສັ້ນ ລາຍ ດອກ ກັບ ເສື້ອ ເປີດ ບ່າໄຫລ່ ກໍ ຈະ ເຮັດໃຫ້ ສາວ ໆ ມີ ຊຸດ ທີ່ເບິ່ງ ດີ ແລະ ເທ່ ທັນທີ .
Reference : Chỉ việc mix chân váy hoa dáng ngắn cùng áo trễ vai , các cô gái có ngay một set đồ " hack " dáng lại mát_rượi .
Prediction: Chỉ cần kết_hợp chân váy ngắn hoa với áo thun vai là các cô nàng sẽ có ngay set đồ đẹp_mắt , thon gọn .
----------------------------------------------------------------------

Example 2:
Source    : ຮູບເງົາ ບາງ ເລື່ອງ ທີ່   Phuong   Thanh   ໄດ້ ເຂົ້າຮ່ວມ ສະແດງ ລວມ ມີ :   Walking   and   Crying ,   Surrogate   Mother ,   When   Men   Get   Pregnant ,   The   Soul   of   Truong   Ba ,   the   Butcher ' s   Skin ,   The   Kiss   of   Death ,   Beautiful   Every   Centimeter ,   Rescuing   the   God   of   Death ,   ແລະ   Hot   Boy   Rebellion .
Reference : Các bộ phim Phương_Thanh đã tham_gia như : Vừa đi vừa khóc , Đẻ_mướn , Khi đàn_ông có bầu , Hồn Trương Ba , da hàng thịt , Nụ hôn thần_chết , 