In [3]:
pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [4]:
# Import các thư viện cần thiết
import os
import torch
import torch.nn as nn
from datasets import load_dataset
from tokenizers import Tokenizer, models, pre_tokenizers, trainers
from transformers import PreTrainedTokenizerFast, PretrainedConfig, PreTrainedModel, Seq2SeqTrainingArguments, Seq2SeqTrainer
import sacrebleu
from rouge import Rouge

In [5]:
# 1. Chuẩn bị dữ liệu
## Tải bộ dữ liệu song ngữ
ds = load_dataset("ncduy/mt-en-vi")
ds = ds.remove_columns(["source"])  # Xóa cột không cần thiết

## Tạo và lưu tokenizer nếu chưa tồn tại
if not os.path.exists("tokenizer_en.json") or not os.path.exists("tokenizer_vi.json"):
    tokenizer_en = Tokenizer(models.BPE(unk_token="<unk>"))
    tokenizer_vi = Tokenizer(models.BPE(unk_token="<unk>"))
    tokenizer_en.pre_tokenizer = pre_tokenizers.Whitespace()
    tokenizer_vi.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.BpeTrainer(
        vocab_size=100_000,
        min_frequency=2,
        special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"],
    )
    tokenizer_en.train_from_iterator(ds["train"]["en"], trainer)
    tokenizer_vi.train_from_iterator(ds["train"]["vi"], trainer)
    tokenizer_en.save("tokenizer_en.json")
    tokenizer_vi.save("tokenizer_vi.json")

## Load tokenizer
MAX_LEN = 50
tokenizer_en = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer_en.json",
    unk_token="<unk>",
    pad_token="<pad>",
    bos_token="<bos>",
    eos_token="<eos>",
)
tokenizer_vi = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer_vi.json",
    unk_token="<unk>",
    pad_token="<pad>",
    bos_token="<bos>",
    eos_token="<eos>",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/3.43k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/597M [00:00<?, ?B/s]

valid.csv:   0%|          | 0.00/2.45M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/2.43M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2884451 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11316 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11225 [00:00<?, ? examples/s]

In [6]:
## Hàm tiền xử lý dữ liệu
def preprocess_function(examples):
    src_texts = examples["en"]
    tgt_texts = ["<bos>" + sent + "<eos>" for sent in examples["vi"]]
    src_encodings = tokenizer_en(
        src_texts, padding="max_length", truncation=True, max_length=MAX_LEN
    )
    tgt_encodings = tokenizer_vi(
        tgt_texts, padding="max_length", truncation=True, max_length=MAX_LEN
    )
    return {
        "input_ids": src_encodings["input_ids"],
        "labels": tgt_encodings["input_ids"],
    }

preprocessed_ds = ds.map(preprocess_function, batched=True)


Map:   0%|          | 0/2884451 [00:00<?, ? examples/s]

Map:   0%|          | 0/11316 [00:00<?, ? examples/s]

Map:   0%|          | 0/11225 [00:00<?, ? examples/s]

In [12]:
# 2. Xây dựng mô hình Transformer
class Seq2SeqTransformerConfig(PretrainedConfig):
    model_type = "seq2seq_transformer"
    def __init__(
        self,
        vocab_size_src=30000,
        vocab_size_tgt=30000,
        d_model=256,
        num_heads=4,
        num_layers=6,
        max_seq_len=50,
        drop_out=0.2,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size_src = vocab_size_src
        self.vocab_size_tgt = vocab_size_tgt
        self.d_model = d_model
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.max_seq_len = max_seq_len
        self.drop_out = drop_out

class Seq2SeqTransformerModel(PreTrainedModel):
    config_class = Seq2SeqTransformerConfig
    def __init__(self, config):
        super().__init__(config)
        self.embedding_src = nn.Embedding(config.vocab_size_src, config.d_model)
        self.embedding_tgt = nn.Embedding(config.vocab_size_tgt, config.d_model)
        self.position_embedding_src = nn.Embedding(config.max_seq_len, config.d_model)
        self.position_embedding_tgt = nn.Embedding(config.max_seq_len, config.d_model)
        self.transformer = nn.Transformer(
            d_model=config.d_model,
            nhead=config.num_heads,
            num_encoder_layers=config.num_layers,
            num_decoder_layers=config.num_layers,
            dropout=config.drop_out,
            batch_first=True,
        )
        self.generator = nn.Linear(config.d_model, config.vocab_size_tgt)
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=0)

    def forward(self, input_ids, labels):
        tgt_input = labels[:, :-1]
        tgt_output = labels[:, 1:]
        batch_size, seq_len_src = input_ids.shape
        _, seg_len_tgt = tgt_input.shape
        src_positions = torch.arange(seq_len_src, device=input_ids.device).unsqueeze(0)
        tgt_positions = torch.arange(seg_len_tgt, device=labels.device).unsqueeze(0)
        src_embedded = self.embedding_src(input_ids) + self.position_embedding_src(src_positions)
        tgt_embedded = self.embedding_tgt(tgt_input) + self.position_embedding_tgt(tgt_positions)
        src_mask, tgt_mask, src_key_padding_mask, tgt_key_padding_mask = self.create_mask(input_ids, tgt_input)
        output = self.transformer(
            src=src_embedded,
            tgt=tgt_embedded,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
        )
        logits = self.generator(output)
        loss = self.loss_fn(logits.permute(0, 2, 1), tgt_output)
        return {"loss": loss, "logits": logits}

    def create_mask(self, src, tgt):
        src_seq_len = src.shape[1]
        tgt_seq_len = tgt.shape[1]
        device = src.device
        tgt_mask = self.generate_square_subsequent_mask(tgt_seq_len, device).to(torch.bool)
        src_mask = torch.zeros((src_seq_len, src_seq_len), device=device).type(torch.bool)
        src_padding_mask = (src == 0)
        tgt_padding_mask = (tgt == 0)
        return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

    def generate_square_subsequent_mask(self, sz, device):
        mask = (torch.triu(torch.ones((sz, sz), device=device)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float("-inf")).masked_fill(mask == 1, float(0.0))
        return mask

config = Seq2SeqTransformerConfig(
    vocab_size_src=len(tokenizer_en),
    vocab_size_tgt=len(tokenizer_vi),
)
model_transformer = Seq2SeqTransformerModel(config)

In [15]:
# 3. Huấn luyện mô hình
import os
os.environ["WANDB_DISABLED"] = "true"
training_args = Seq2SeqTrainingArguments(
    output_dir="./transformer-en-vi",
    logging_dir="logs",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=3,
    learning_rate=3e-4,
    save_total_limit=1,
    load_best_model_at_end=True,
    bf16=True,
    weight_decay=0.01,
    gradient_accumulation_steps=2,
)

trainer = Seq2SeqTrainer(
    model=model_transformer,
    args=training_args,
    train_dataset=preprocessed_ds["train"],
    eval_dataset=preprocessed_ds["validation"],
)
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,3.7418,2.380807
2,2.2952,1.871821


TrainOutput(global_step=33801, training_loss=2.8653539281382208, metrics={'train_runtime': 7354.5413, 'train_samples_per_second': 1176.6, 'train_steps_per_second': 4.596, 'total_flos': 1.117864213800768e+17, 'train_loss': 2.8653539281382208, 'epoch': 2.9997781229199023})

In [16]:
# 4. Đánh giá mô hình
def batch_beam_search_decode(model, src_sentences, tokenizer_en, tokenizer_vi, beam_width=5, max_len=50, temperature=1, device="cuda"):
    model.to(device)
    model.eval()
    bos_id = tokenizer_vi.bos_token_id
    eos_id = tokenizer_vi.eos_token_id
    encoded = tokenizer_en.batch_encode_plus(
        src_sentences,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    src_tensor = encoded['input_ids'].to(device)
    B, src_seq_len = src_tensor.shape
    src_mask = torch.zeros((src_seq_len, src_seq_len), dtype=torch.bool, device=device)
    src_padding_mask = (src_tensor == 0)
    with torch.no_grad():
        encoder_output = model.transformer.encoder(
            model.embedding_src(src_tensor) + model.position_embedding_src(torch.arange(src_seq_len, device=device).unsqueeze(0)),
            src_mask,
            src_padding_mask
        )
    beams = torch.full((B, 1, 1), bos_id, dtype=torch.long, device=device)
    beam_scores = torch.zeros(B, 1, device=device)
    complete_beams = [[] for _ in range(B)]
    for step in range(max_len - 1):
        B_current, beam_num, seq_len = beams.shape
        flat_beams = beams.view(B_current * beam_num, seq_len)
        causal_mask = model.generate_square_subsequent_mask(seq_len, device)
        repeated_encoder_output = encoder_output.unsqueeze(1).repeat(1, beam_num, 1, 1).view(B_current * beam_num, src_seq_len, -1)
        with torch.no_grad():
            decoder_output = model.transformer.decoder(
                model.embedding_tgt(flat_beams) + model.position_embedding_tgt(torch.arange(seq_len, device=device).unsqueeze(0)),
                repeated_encoder_output,
                causal_mask
            )
            logits = model.generator(decoder_output[:, -1, :])
        log_probs = torch.nn.functional.log_softmax(logits / temperature, dim=-1)
        log_probs = log_probs.view(B, beam_num, -1)
        total_scores = beam_scores.unsqueeze(-1) + log_probs
        total_scores = total_scores.view(B, -1)
        topk_scores, topk_indices = total_scores.topk(beam_width, dim=-1)
        beam_indices = topk_indices // log_probs.size(-1)
        token_indices = topk_indices % log_probs.size(-1)
        new_beams = []
        new_beam_scores = []
        for i in range(B):
            beams_i = beams[i]
            new_beams_i = []
            new_scores_i = []
            for j in range(beam_width):
                prev_beam = beams_i[beam_indices[i, j]]
                new_token = token_indices[i, j].unsqueeze(0)
                new_seq = torch.cat([prev_beam, new_token])
                new_beams_i.append(new_seq.unsqueeze(0))
                new_scores_i.append(topk_scores[i, j].unsqueeze(0))
            new_beams.append(torch.cat(new_beams_i, dim=0).unsqueeze(0))
            new_beam_scores.append(torch.cat(new_scores_i, dim=0).unsqueeze(0))
        beams = torch.cat(new_beams, dim=0)
        beam_scores = torch.cat(new_beam_scores, dim=0)
        beams_list = []
        scores_list = []
        for i in range(B):
            beams_i = beams[i]
            scores_i = beam_scores[i]
            ongoing_beams = []
            ongoing_scores = []
            for j in range(beam_width):
                if beams_i[j, -1].item() == eos_id:
                    complete_beams[i].append((beams_i[j], scores_i[j]))
                else:
                    ongoing_beams.append(beams_i[j].unsqueeze(0))
                    ongoing_scores.append(scores_i[j].unsqueeze(0))
            if len(ongoing_beams) == 0:
                ongoing_beams = [beams_i[0].unsqueeze(0)]
                ongoing_scores = [scores_i[0].unsqueeze(0)]
            beams_list.append(torch.cat(ongoing_beams, dim=0))
            scores_list.append(torch.cat(ongoing_scores, dim=0))
        new_beams = []
        new_scores = []
        for i in range(B):
            cur_beams = beams_list[i]
            cur_scores = scores_list[i]
            cur_beam_num = cur_beams.shape[0]
            if cur_beam_num < beam_width:
                pad_num = beam_width - cur_beam_num
                pad_seq = cur_beams[0].unsqueeze(0).repeat(pad_num, 1)
                pad_scores = torch.full((pad_num,), -1e9, device=device)
                cur_beams = torch.cat([cur_beams, pad_seq], dim=0)
                cur_scores = torch.cat([cur_scores, pad_scores], dim=0)
            new_beams.append(cur_beams.unsqueeze(0))
            new_scores.append(cur_scores.unsqueeze(0))
        beams = torch.cat(new_beams, dim=0)
        beam_scores = torch.cat(new_scores, dim=0)
        if all(len(complete_beams[i]) >= beam_width for i in range(B)):
            break
    final_translations = []
    for i in range(B):
        if complete_beams[i]:
            best_beam = max(complete_beams[i], key=lambda x: x[1])[0]
        else:
            best_beam = beams[i][0]
        translation = tokenizer_vi.decode(best_beam.tolist(), skip_special_tokens=True)
        final_translations.append(translation)
    return final_translations

In [18]:
def compute_metrics(model, test_dataset, tokenizer_en, tokenizer_vi, beam_width=5, max_len=50, temperature=1, device="cuda", batch_size=128):
    src_sentences = test_dataset['en']
    target_sentences = test_dataset['vi']
    all_predictions = []
    n_samples = len(src_sentences)
    for i in range(0, n_samples, batch_size):
        batch_src = src_sentences[i:i+batch_size]
        batch_predictions = batch_beam_search_decode(
            model, batch_src, tokenizer_en, tokenizer_vi,
            beam_width=beam_width, max_len=max_len,
            temperature=temperature, device=device
        )
        all_predictions.extend(batch_predictions)
    bleu = sacrebleu.corpus_bleu(all_predictions, [target_sentences])
    rouge = Rouge()
    rouge_scores = rouge.get_scores(all_predictions, target_sentences, avg=True)
    return bleu.score, rouge_scores

bleu_score, rouge_scores = compute_metrics(
    model_transformer,
    preprocessed_ds['test'],
    tokenizer_en,
    tokenizer_vi,
    beam_width=5,
    max_len=50,
    temperature=1,
    device="cuda",
    batch_size=128
)
print(f"BLEU score: {bleu_score:.2f}")
print(f"ROUGE scores: {rouge_scores}")




BLEU score: 33.48
ROUGE scores: {'rouge-1': {'r': 0.5451279975760036, 'p': 0.5478875152864958, 'f': 0.5409670511571266}, 'rouge-2': {'r': 0.3224372861745963, 'p': 0.32037114630937175, 'f': 0.31764619240428904}, 'rouge-l': {'r': 0.5295980926316631, 'p': 0.5321720079701074, 'f': 0.5255090363147331}}


In [19]:

# 5. Inference
def translate(model, src_sentence, tokenizer_en, tokenizer_vi, beam_width=5, max_len=50, temperature=1, device="cuda"):
    translations = batch_beam_search_decode(
        model, [src_sentence], tokenizer_en, tokenizer_vi,
        beam_width=beam_width, max_len=max_len,
        temperature=temperature, device=device
    )
    return translations[0]

# Ví dụ sử dụng
src_sentences = [
    "Hello, how are you?",
    "What is your name?",
    "Don't be pressure about life",
    "When the storm gone, you will never remember how you lived through it",
    "The weather today is nice."
]

# 2) Dịch từng câu và in kết quả
for src in src_sentences:
    translation = translate(model_transformer, src, tokenizer_en, tokenizer_vi)
    print(f"Input:       {src}")
    print(f"Translation: {translation}")
    print("-" * 40)

Input:       Hello, how are you?
Translation: Hel lo , anh thế nào ?
----------------------------------------
Input:       What is your name?
Translation: Tên anh là gì ?
----------------------------------------
Input:       Don't be pressure about life
Translation: Đừng có áp lực về cuộc sống
----------------------------------------
Input:       When the storm gone, you will never remember how you lived through it
Translation: Khi cơn bão biến mất , bạn sẽ không bao giờ nhớ cách bạn sống qua nó
----------------------------------------
Input:       The weather today is nice.
Translation: Thời tiết hôm nay rất đẹp .
----------------------------------------
