In [13]:
!pip install torch transformers datasets sentencepiece sacrebleu accelerate

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [14]:
import torch
import html
import re
from datasets import Dataset
from transformers import (
    M2M100ForConditionalGeneration,
    M2M100Tokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    AutoModelForSeq2SeqLM
)

In [15]:
# %%
print(f"Device: {torch.cuda.get_device_name(0)}")
print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

MODEL_NAME = "facebook/m2m100_418M"

tokenizer = M2M100Tokenizer.from_pretrained(MODEL_NAME)
model = M2M100ForConditionalGeneration.from_pretrained(MODEL_NAME).cuda()

# BẮT BUỘC để giảm VRAM
model.gradient_checkpointing_enable()
model.config.use_cache = False

for p in model.parameters():
    p.requires_grad = True

print("Model ready (gradient checkpointing ON)")


Device: NVIDIA H200
VRAM: 150.02 GB
Model ready (gradient checkpointing ON)


In [17]:
# %%
def clean_text(s: str) -> str:
    s = html.unescape(s)                  
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"\s+([.,!?;:])", r"\1", s)
    return s.strip()


In [18]:
# %%
def load_parallel_clean(src_file, tgt_file):
    with open(src_file, encoding="utf-8") as f:
        src_raw = [clean_text(l) for l in f]

    with open(tgt_file, encoding="utf-8") as f:
        tgt_raw = [clean_text(l) for l in f]

    pairs = [
        (s, t)
        for s, t in zip(src_raw, tgt_raw)
        if s and t
    ]

    src, tgt = zip(*pairs)

    print(f"Loaded {len(src)} aligned sentence pairs")

    return Dataset.from_dict({
        "src_text": list(src),
        "tgt_text": list(tgt)
    })


In [19]:
# %%
DATA_DIR = "/home/admin/dataset"

train_dataset = load_parallel_clean(
    f"{DATA_DIR}/train.vi.txt",
    f"{DATA_DIR}/train.en.txt"
)

dev_dataset = load_parallel_clean(
    f"{DATA_DIR}/dev2012.vi.txt",
    f"{DATA_DIR}/dev2012.en.txt"
)

print(f"Train: {len(train_dataset)} | Dev: {len(dev_dataset)}")

Loaded 133166 aligned sentence pairs
Loaded 1268 aligned sentence pairs
Train: 133166 | Dev: 1268


In [20]:
# %%
MAX_LEN = 192   

def preprocess(batch):
    tokenizer.src_lang = "vi"
    tokenizer.tgt_lang = "en"

    inputs = tokenizer(
        batch["src_text"],
        truncation=True,
        max_length=MAX_LEN
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["tgt_text"],
            truncation=True,
            max_length=MAX_LEN
        )

    inputs["labels"] = labels["input_ids"]
    return inputs


In [21]:
# %%
train_dataset = train_dataset.map(
    preprocess,
    batched=True,
    remove_columns=train_dataset.column_names,
    num_proc=8
)

dev_dataset = dev_dataset.map(
    preprocess,
    batched=True,
    remove_columns=dev_dataset.column_names,
    num_proc=8
)

Map (num_proc=8):   0%|          | 0/133166 [00:00<?, ? examples/s]



Map (num_proc=8):   0%|          | 0/1268 [00:00<?, ? examples/s]



In [22]:
# %%
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True
)

In [23]:
# %%
training_args = TrainingArguments(
    output_dir="/home/admin/checkpoint4",

    eval_strategy="steps", 
    save_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=100,

    per_device_train_batch_size=16,  
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=8,    

    bf16=True,
    fp16=False,

    learning_rate=5e-5,
    num_train_epochs=10,
    warmup_steps=1000,

    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    report_to="none"
)

In [24]:
# %%
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

  trainer = Trainer(


In [25]:
trainer.train()

Step,Training Loss,Validation Loss
500,1.6168,1.382349
1000,1.5615,1.376668
1500,1.455,1.334916
2000,1.4163,1.308038
2500,1.2728,1.291595
3000,1.2893,1.281267
3500,1.15,1.283141
4000,1.1636,1.267825
4500,1.0517,1.276792
5000,1.0679,1.266086


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=6500, training_loss=1.2445401083139274, metrics={'train_runtime': 3842.157, 'train_samples_per_second': 346.592, 'train_steps_per_second': 2.709, 'total_flos': 1.3059161107454362e+17, 'train_loss': 1.2445401083139274, 'epoch': 6.244142736993872})

In [26]:
trainer.save_model("/home/admin/checkpoint4/best_vi_en")
tokenizer.save_pretrained("/home/admin/checkpoint3/best_vi_en")

('/home/admin/checkpoint3/best_vi_en/tokenizer_config.json',
 '/home/admin/checkpoint3/best_vi_en/special_tokens_map.json',
 '/home/admin/checkpoint3/best_vi_en/vocab.json',
 '/home/admin/checkpoint3/best_vi_en/sentencepiece.bpe.model',
 '/home/admin/checkpoint3/best_vi_en/added_tokens.json')

In [27]:
# %%
def load_clean_lines(path):
    with open(path, encoding="utf-8") as f:
        return [clean_text(l) for l in f if l.strip()]

In [28]:
from sacrebleu import corpus_bleu

In [29]:
# %%
def translate_en_vi(lines):
    model.eval()
    outputs = []

    tokenizer.src_lang = "vi"
    tokenizer.tgt_lang = "en"

    for line in lines:
        inputs = tokenizer(
            line,
            return_tensors="pt",
            truncation=True,
            max_length=256
        ).to("cuda")

        with torch.no_grad():
            gen = model.generate(
                **inputs,
                forced_bos_token_id=tokenizer.get_lang_id("vi"),
                num_beams=5,
                max_length=256
            )

        outputs.append(tokenizer.decode(gen[0], skip_special_tokens=True))

    return outputs

In [30]:
# %%
src_test = load_clean_lines(f"{DATA_DIR}/tst2012.vi.txt")
ref_test = load_clean_lines(f"{DATA_DIR}/tst2012.en.txt")

preds = translate_en_vi(src_test)

bleu = corpus_bleu(preds, [ref_test], tokenize="intl")
print(f"SacreBLEU: {bleu.score:.2f}")

SacreBLEU: 29.65


In [31]:
import random

N = 5  # số câu muốn xem ngẫu nhiên

# Đọc source EN (CLEAN)
with open(f"{DATA_DIR}/tst2012.vi.txt", encoding="utf-8") as f:
    src_lines = [clean_text(l) for l in f if l.strip()]

# Đọc reference VI (CLEAN)
with open(f"{DATA_DIR}/tst2012.en.txt", encoding="utf-8") as f:
    ref_lines = [clean_text(l) for l in f if l.strip()]


assert len(src_lines) == len(ref_lines)

model.eval()
tokenizer.src_lang = "vi"
tokenizer.tgt_lang = "en"

idxs = random.sample(range(len(src_lines)), N)

for i, idx in enumerate(idxs):
    src = src_lines[idx]
    ref = ref_lines[idx]

    inputs = tokenizer(
        src,
        return_tensors="pt",
        truncation=True,
        max_length=256
    ).to("cuda")

    with torch.no_grad():
        gen = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.get_lang_id("en"),
            num_beams=5,
            max_length=256
        )

    pred = tokenizer.decode(gen[0], skip_special_tokens=True)

    print(f"\n--- CÂU NGẪU NHIÊN {i+1} (idx={idx}) ---")
    print("VI (SOURCE):")
    print(src)
    print("\nEN (MODEL):")
    print(pred)
    print("\nEN (REAL):")
    print(ref)
    print("=" * 60)


--- CÂU NGẪU NHIÊN 1 (idx=1309) ---
VI (SOURCE):
Nhưng những ứng dụng như cộng nghệ nhắc nhở rằng chúng ta không chỉ là người tiêu dùng, và chúng ta không chỉ là nhà tiêu dùng của chính phủ, đóng góp thuế và nhận lại dịch vụ.

EN (MODEL):
But applications like technology remind us that we 're not just consumers, and we 're not just consumers of government, paying taxes and receiving services.

EN (REAL):
But these apps are like little digital reminders that we 're not just consumers, and we 're not just consumers of government, putting in our taxes and getting back services.

--- CÂU NGẪU NHIÊN 2 (idx=228) ---
VI (SOURCE):
nhưng vẫn có cách nghĩ khác về chúng ta đang ở đâu trong nhân cách của mình.

EN (MODEL):
But there 's a different way of thinking about where we are in our personalities.

EN (REAL):
But there 's another way of thinking about where we are in our identity.

--- CÂU NGẪU NHIÊN 3 (idx=51) ---
VI (SOURCE):
Các trung tâm được thành lập.

EN (MODEL):
Centers were created

In [32]:
import shutil
import os

src_dir = "/home/admin/checkpoint4/best_vi_en"
zip_path = "/home/admin/checkpoint4/best_vi_en"

shutil.make_archive(zip_path, 'zip', src_dir)

print("Done:", zip_path + ".zip")

Done: /home/admin/checkpoint4/best_vi_en.zip
