In [1]:
!pip install torch transformers datasets sentencepiece sacrebleu accelerate

Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting datasets
  Downloading datasets-4.4.2-py3-none-any.whl.metadata (19 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.2.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (10 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting accelerate
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.11.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transf

In [2]:
import os

import torch
from datasets import Dataset
from transformers import (
    M2M100ForConditionalGeneration,
    M2M100Tokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    AutoModelForSeq2SeqLM
)
from transformers.trainer_utils import get_last_checkpoint

[2025-12-22 07:01:00,941] INFO numexpr.utils: Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
[2025-12-22 07:01:00,941] INFO numexpr.utils: Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[2025-12-22 07:01:00,941] INFO numexpr.utils: NumExpr defaulting to 8 threads.


In [3]:
# Pretrained checkpoint that is fine-tuned in this notebook.
MODEL_NAME = "facebook/m2m100_418M"

# Report the GPU at index 0 and its total memory (bytes / 1e9, i.e. decimal GB).
print(f"Device: {torch.cuda.get_device_name(0)}")
print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Load the tokenizer and the model from the same checkpoint, then move the
# model onto the GPU.
tokenizer = M2M100Tokenizer.from_pretrained(MODEL_NAME)
model = M2M100ForConditionalGeneration.from_pretrained(MODEL_NAME)
model = model.cuda()

Device: NVIDIA H200
VRAM: 150.02 GB


tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

In [4]:
# Full fine-tuning: mark every parameter of the model as trainable in one call
# instead of looping over the parameters by hand.
model.requires_grad_(True)

print("Đã mở băng toàn bộ Model. Sẵn sàng huấn luyện chuyên sâu.")

Đã mở băng toàn bộ Model. Sẵn sàng huấn luyện chuyên sâu.


In [5]:
# Directory that holds the parallel corpus files used throughout the notebook.
DATA_DIR = "/work/data/dataset"


def load_parallel(src_file, tgt_file):
    """Load two line-aligned UTF-8 text files as one `Dataset`.

    Line i of `src_file` is paired with line i of `tgt_file`; surrounding
    whitespace is stripped from every line.

    Args:
        src_file: Path of the source-language file.
        tgt_file: Path of the target-language file.

    Returns:
        A `Dataset` with the columns `src_text` and `tgt_text`.

    Raises:
        AssertionError: If the two files do not have the same number of lines.
    """
    columns = {}
    for column, path in (("src_text", src_file), ("tgt_text", tgt_file)):
        with open(path, encoding="utf-8") as handle:
            columns[column] = [raw.strip() for raw in handle]

    # A length mismatch means the sentence pairs are no longer aligned.
    assert len(columns["src_text"]) == len(columns["tgt_text"])
    return Dataset.from_dict(columns)

In [6]:
# Each split is a (source, target) file pair: the .vi file is passed as the
# source side and the .zh file as the target side.
train_dataset = load_parallel(f"{DATA_DIR}/train2022.vi", f"{DATA_DIR}/train2022.zh")
dev_dataset = load_parallel(f"{DATA_DIR}/dev2022.vi.txt", f"{DATA_DIR}/dev2022.zh.txt")

# Print the number of sentence pairs in each split as a quick sanity check.
print(f"Đã load xong: Train ({len(train_dataset)} câu), Dev ({len(dev_dataset)} câu)")

Đã load xong: Train (300348 câu), Dev (1000 câu)


In [7]:
# Maximum number of tokens kept per sentence (source and target alike).
MAX_LEN = 256


def preprocess(batch):
    """Tokenize a batch of sentence pairs for seq2seq training.

    Args:
        batch: Mapping with the keys `src_text` and `tgt_text`, each holding a
            list of sentences (the batched form passed by `Dataset.map`).

    Returns:
        The tokenizer output for the source sentences, with the token ids of
        the target sentences stored under `labels`. Both sides are truncated
        to `MAX_LEN` tokens.
    """
    # Set on every call so the function does not depend on tokenizer state
    # configured somewhere else.
    tokenizer.src_lang = "vi"
    tokenizer.tgt_lang = "zh"

    # `text_target=` tokenizes the targets in target mode and stores their ids
    # as `labels`; it replaces the deprecated `as_target_tokenizer()` context
    # manager and the manual `labels` assignment.
    return tokenizer(
        batch["src_text"],
        text_target=batch["tgt_text"],
        truncation=True,
        max_length=MAX_LEN,
    )

# Tokenize both splits in batches across 8 worker processes. The raw text
# columns are removed, so only the columns produced by `preprocess` remain.
train_dataset = train_dataset.map(
    preprocess,
    batched=True,
    remove_columns=train_dataset.column_names,
    num_proc=8,
)
dev_dataset = dev_dataset.map(
    preprocess,
    batched=True,
    remove_columns=dev_dataset.column_names,
    num_proc=8,
)

# Collator that assembles the tokenized examples into batches, with padding
# enabled.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
)

Map (num_proc=8):   0%|          | 0/300348 [00:00<?, ? examples/s]



Map (num_proc=8):   0%|          | 0/1000 [00:00<?, ? examples/s]



In [8]:
training_args = TrainingArguments(
    output_dir="/work/data/checkpoint",
    report_to="none",

    # Evaluate and checkpoint on the same 500-step cadence; log every 100 steps.
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,

    # Tuned for the H200: a large batch makes use of the 141 GB of VRAM.
    per_device_train_batch_size=128,
    per_device_eval_batch_size=64,

    # Precision: bfloat16 enabled, float16 disabled.
    bf16=True,
    fp16=False,

    # Optimisation schedule.
    learning_rate=5e-5,
    num_train_epochs=10,
    warmup_steps=1000,

    # Model selection: lower validation loss is better, and the best
    # checkpoint is reloaded when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)


In [9]:
# The tokenizer is passed as `processing_class`; the older `tokenizer=`
# keyword of `Trainer.__init__` is deprecated and triggers a warning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    # Early stopping with a patience of 3 evaluations on the metric selected
    # in `training_args`.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

  trainer = Trainer(


In [10]:
# Resume from the newest checkpoint in the output directory when there is one;
# otherwise train from the start. `resume_from_checkpoint=True` raises when
# no checkpoint exists, which breaks the first run on a clean machine.
checkpoint_dir = training_args.output_dir
last_checkpoint = get_last_checkpoint(checkpoint_dir) if os.path.isdir(checkpoint_dir) else None

trainer.train(resume_from_checkpoint=last_checkpoint)


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Step,Training Loss,Validation Loss
11000,1.5618,1.591041
11500,1.4474,1.586191
12000,1.5457,1.583696


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=12000, training_loss=0.19003698857625326, metrics={'train_runtime': 381.4865, 'train_samples_per_second': 7873.096, 'train_steps_per_second': 61.522, 'total_flos': 3.39632129392214e+17, 'train_loss': 0.19003698857625326, 'epoch': 5.112910097997443})

In [11]:
# Save the best model. The training arguments set load_best_model_at_end=True,
# so the trainer is expected to hold the best checkpoint at this point.
# The tokenizer is written to the same directory.
best_model_dir = "/work/data/checkpoint/best_vi_zh"
trainer.save_model(best_model_dir)
tokenizer.save_pretrained(best_model_dir)

('/work/data/checkpoint/best_vi_zh/tokenizer_config.json',
 '/work/data/checkpoint/best_vi_zh/special_tokens_map.json',
 '/work/data/checkpoint/best_vi_zh/vocab.json',
 '/work/data/checkpoint/best_vi_zh/sentencepiece.bpe.model',
 '/work/data/checkpoint/best_vi_zh/added_tokens.json')

In [12]:
from sacrebleu import corpus_bleu

def translate_vi_zh(src_file, batch_size=1, num_beams=5, max_length=256):
    """Translate every line of a Vietnamese text file into Chinese.

    The result has exactly one entry per line of `src_file`, in file order,
    so it can be compared line by line with a reference file. Blank source
    lines are not sent to the model; their entry is the empty string.

    Args:
        src_file: Path of a UTF-8 text file with one sentence per line.
        batch_size: Number of sentences translated per `generate` call. The
            default of 1 translates one sentence at a time.
        num_beams: Beam width used for generation.
        max_length: Token limit applied both to the tokenized input
            (truncation) and to the generated output.

    Returns:
        List of translated strings, one per input line.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    model.eval()
    tokenizer.src_lang = "vi"
    tokenizer.tgt_lang = "zh"

    with open(src_file, encoding="utf-8") as f:
        raw_lines = [raw.strip() for raw in f]

    # Remember where each non-blank sentence came from so that its translation
    # can be written back to the same position.
    slots = [idx for idx, text in enumerate(raw_lines) if text]
    lines = [raw_lines[idx] for idx in slots]

    print(f"Đang dịch {len(lines)} câu sang tiếng Trung...")

    outputs = [""] * len(raw_lines)
    zh_token_id = tokenizer.get_lang_id("zh")  # forces Chinese as the first generated token
    device = model.device  # follow the model instead of assuming "cuda"

    for start in range(0, len(lines), batch_size):
        chunk = lines[start:start + batch_size]
        inputs = tokenizer(
            chunk,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        ).to(device)
        with torch.no_grad():
            gen = model.generate(
                **inputs,
                forced_bos_token_id=zh_token_id,
                num_beams=num_beams,
                max_length=max_length,
            )
        decoded = tokenizer.batch_decode(gen, skip_special_tokens=True)
        for idx, text in zip(slots[start:start + batch_size], decoded):
            outputs[idx] = text
    return outputs


In [13]:
preds = translate_vi_zh(f"{DATA_DIR}/test.vi-zh.2022.vi")

# Read the matching reference file. Lines are taken by iterating over the
# file, the same way the source side is read, so both sides split lines on
# the same boundaries.
with open(f"{DATA_DIR}/test.vi-zh.2022.zh", encoding="utf-8") as f:
    ref_lines = [line.rstrip("\n") for line in f]

# BLEU pairs prediction i with reference i, so a count mismatch means the
# score would be computed on misaligned sentences. Fail early and say why.
if len(preds) != len(ref_lines):
    raise ValueError(
        f"Got {len(preds)} predictions but {len(ref_lines)} reference lines; "
        "the test source and reference files must be line-aligned."
    )

# sacrebleu expects a list of reference streams; there is a single one here.
refs = [ref_lines]

# Compute BLEU with the Chinese tokenizer.
bleu = corpus_bleu(preds, refs, tokenize='zh')
print("\n" + "="*30)
print(f"KẾT QUẢ VI-ZH TRÊN H200")
print(f"SacreBLEU Score: {bleu.score:.2f}")
print("="*30)

Đang dịch 1000 câu sang tiếng Trung...

KẾT QUẢ VI-ZH TRÊN H200
SacreBLEU Score: 36.97
