In [2]:
# Install the metric dependencies into the running kernel's environment.
# Versions are pinned to the ones this notebook was originally executed with
# so that a fresh run resolves the same packages.
%pip install -q evaluate==0.4.5 sacrebleu==2.5.1

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
import evaluate
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm
import pandas as pd
import os

In [7]:
def load_split(base_path, split="train"):
    """Load one split of the parallel English/Vietnamese corpus.

    Reads ``{base_path}/{split}_en.csv`` and ``{base_path}/{split}_vi.csv``
    and joins them column-wise, so row ``i`` of the combined frame holds
    row ``i`` of both files.

    Args:
        base_path: Directory that contains the ``*_en.csv`` / ``*_vi.csv`` files.
        split: Prefix used in the file names (e.g. "train", "dev", "test").

    Returns:
        Tuple ``(en_df, vi_df, df)``: the English frame, the Vietnamese frame
        and their column-wise concatenation.

    Raises:
        ValueError: If the two files do not contain the same number of rows.
    """
    en_path = f"{base_path}/{split}_en.csv"
    vi_path = f"{base_path}/{split}_vi.csv"

    en_df = pd.read_csv(en_path)
    vi_df = pd.read_csv(vi_path)

    # concat(axis=1) aligns on the index and pads the shorter side with NaN,
    # so a row-count mismatch would silently produce broken sentence pairs.
    if len(en_df) != len(vi_df):
        raise ValueError(
            f"Row count mismatch for split '{split}': "
            f"{len(en_df)} rows in {en_path} vs {len(vi_df)} rows in {vi_path}"
        )

    df = pd.concat([en_df, vi_df], axis=1)
    return en_df, vi_df, df


In [8]:
# Root directory of the parallel corpus.
base_path = "/kaggle/input/translation-data"

# Each call yields (English frame, Vietnamese frame, combined frame).
train_df_en, train_df_vi, train_df = load_split(base_path, "train")
valid_df_en, valid_df_vi, valid_df = load_split(base_path, "dev")
test_df_en, test_df_vi, test_df = load_split(base_path, "test")

# Report the row counts of every split as "en / vi / combined".
for label, en_part, vi_part, combined in (
    ("size of train: ", train_df_en, train_df_vi, train_df),
    ("size of dev: ", valid_df_en, valid_df_vi, valid_df),
    ("size of test: ", test_df_en, test_df_vi, test_df),
):
    print(label, len(en_part), "/", len(vi_part), "/", len(combined))

size of train:  2977999 / 2977999 / 2977999
size of dev:  18719 / 18719 / 18719
size of test:  19151 / 19151 / 19151


In [9]:
# Preview the first rows of the training frames: Vietnamese, English, combined.
for preview in (train_df_vi, train_df_en, train_df):
    print(preview.head())

                                                  vi
0          Câu chuyện bắt đầu với buổi lễ đếm ngược.
1  Ngày 14, tháng 8, năm 1947, gần nửa đêm, ở Bom...
2  Cùng lúc, trên khắp đất Ấn, người ta nín thở c...
3  Khi đồng hồ điểm thời khắc nửa đêm, một đứa tr...
4  Những sự kiện này là nền móng tạo nên "Những đ...
                                                  en
0                        It begins with a countdown.
1  On August 14th, 1947, a woman in Bombay goes i...
2  Across India, people hold their breath for the...
3  And at the stroke of midnight, a squirming inf...
4  These events form the foundation of "Midnight'...
                                                  en  \
0                        It begins with a countdown.   
1  On August 14th, 1947, a woman in Bombay goes i...   
2  Across India, people hold their breath for the...   
3  And at the stroke of midnight, a squirming inf...   
4  These events form the foundation of "Midnight'...   

                           

In [10]:
# Preview the first rows of the validation frames: English, Vietnamese, combined.
for preview in (valid_df_en, valid_df_vi, valid_df):
    print(preview.head())

                                                  en
0  ﻿Hurricane Dorian, one of the most powerful st...
1  Dorian is especially dangerous due to its slow...
2  The storm passed by the Leeward Islands, Puert...
3  The United States branch office continues to g...
4  At this time, there have been no reported inju...
                                                  vi
0  Vào chủ nhật ngày 1-9-2019, cơn bão Dorian, mộ...
1  Bão Dorian đặc biệt nguy hiểm vì nó di chuyển ...
2  Khi đi qua quần đảo Leeward, Puerto Rico và qu...
3  Văn phòng chi nhánh Hoa Kỳ tiếp tục cập nhật t...
4  Theo báo cáo đến thời điểm hiện tại, trong 46 ...
                                                  en  \
0  ﻿Hurricane Dorian, one of the most powerful st...   
1  Dorian is especially dangerous due to its slow...   
2  The storm passed by the Leeward Islands, Puert...   
3  The United States branch office continues to g...   
4  At this time, there have been no reported inju...   

                           

In [11]:
# Preview the first rows of the test frames: English, Vietnamese, combined.
for preview in (test_df_en, test_df_vi, test_df):
    print(preview.head())

                                                  en
0  Brother Albert Barnett and his wife, Sister Su...
1  Severe storms ripped through parts of the sout...
2  Two days of heavy rain, high winds, and numero...
3  Sadly, Brother Albert Barnett and his wife, Si...
4  The United States branch also reports that at ...
                                                  vi
0  Anh Albert Barnett và chị Susan Barnett, thuộc...
1  Ngày 11 và 12-1-2020, những cơn bão lớn đã qué...
2  Những trận mưa to và gió lớn trong suốt hai ng...
3  Đáng buồn là anh Albert Barnett 85 tuổi, và vợ...
4  Chi nhánh Hoa Kỳ cũng cho biết có ít nhất bốn ...
                                                  en  \
0  Brother Albert Barnett and his wife, Sister Su...   
1  Severe storms ripped through parts of the sout...   
2  Two days of heavy rain, high winds, and numero...   
3  Sadly, Brother Albert Barnett and his wife, Si...   
4  The United States branch also reports that at ...   

                           

In [12]:
class TranslationDataset(Dataset):
    """Map-style dataset that turns sentence pairs into seq2seq training examples.

    Each item is built from one row of ``dataframe``: the source column is
    prefixed with the task prompt ``"translate English to Vietnamese: "`` and
    tokenized as the model input, and the target column is tokenized as the
    labels. Both sides are padded/truncated to ``max_length`` tokens.

    Args:
        dataframe: Frame holding one sentence pair per row.
        tokenizer: Callable tokenizer exposing ``pad_token_id`` and accepting
            the ``text_target`` keyword (Hugging Face tokenizer interface).
        source_lang: Name of the column holding the source sentences.
        target_lang: Name of the column holding the target sentences.
        max_length: Fixed token length used for both source and target.
    """

    def __init__(self, dataframe, tokenizer, source_lang="en", target_lang="vi", max_length=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.source_lang = source_lang
        self.target_lang = target_lang
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Tokenize the pair at positional index ``idx``.

        Returns:
            Dict with ``input_ids``, ``attention_mask`` (source side) and
            ``labels`` (target ids with padding replaced by -100).
        """
        source_text = f"translate English to Vietnamese: {self.dataframe[self.source_lang].iloc[idx]}"
        target_text = self.dataframe[self.target_lang].iloc[idx]

        source_encoding = self.tokenizer(
            source_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # `text_target=` is the supported replacement for the deprecated
        # `as_target_tokenizer()` context manager.
        target_encoding = self.tokenizer(
            text_target=target_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Drop only the leading batch axis; a bare squeeze() would also remove
        # the sequence axis when max_length == 1.
        labels = target_encoding["input_ids"].squeeze(0)
        # -100 marks padded positions so they are excluded from the loss.
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": source_encoding["input_ids"].squeeze(0),
            "attention_mask": source_encoding["attention_mask"].squeeze(0),
            "labels": labels
        }


In [23]:
def evaluate_bleu(model, dataloader, tokenizer, device, max_length=128, num_beams=4):
    """Generate translations for every batch and score them with sacreBLEU.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``; it is switched
            to eval mode and left in that mode.
        dataloader: Iterable of batches with ``input_ids``, ``attention_mask``
            and ``labels`` tensors, where ignored label positions hold -100.
        tokenizer: Tokenizer exposing ``batch_decode`` and ``pad_token_id``.
        device: Device the model inputs are moved to before generation.
        max_length: Maximum length passed to ``model.generate``.
        num_beams: Beam width passed to ``model.generate``.

    Returns:
        The result of the ``sacrebleu`` metric's ``compute`` call.
    """
    model.eval()
    predictions = []
    references = []
    bleu = evaluate.load("sacrebleu")  # metric loaded through the `evaluate` library

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True
            )
            pred_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

            # Replace -100 with pad_token_id before decoding. The labels are only
            # decoded, so they are not moved to `device`, and masked_fill returns
            # a new tensor instead of mutating the caller's batch.
            labels = batch["labels"]
            labels = labels.masked_fill(labels == -100, tokenizer.pad_token_id)
            ref_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

            predictions.extend(pred_texts)
            references.extend([ref] for ref in ref_texts)  # sacrebleu expects a list of lists

    bleu_score = bleu.compute(predictions=predictions, references=references)
    return bleu_score


In [17]:
# Setup: pretrained checkpoint, tokenizer and compute device
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Datasets built from the leading rows of each split
# (requires the TranslationDataset class to be defined already)
train_dataset = TranslationDataset(train_df.head(80000), tokenizer)
valid_dataset = TranslationDataset(valid_df.head(10000), tokenizer)
test_dataset = TranslationDataset(test_df.head(10000), tokenizer)

batch_size = 32

# Only the training loader is shuffled
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

optimizer = AdamW(model.parameters(), lr=5e-5)
# Learning rate is multiplied by 0.9 on every scheduler step (one step per epoch below)
scheduler = StepLR(optimizer, step_size=1, gamma=0.9)

# Directory where the best checkpoint is written on Kaggle
output_dir = "/kaggle/working/my_translation_model/best"
os.makedirs(output_dir, exist_ok=True)

num_epochs = 10
early_stop_patience = 2
best_val_loss = float("inf")
patience_counter = 0


def _batch_loss(batch):
    """Move one batch to `device`, run a forward pass and return its loss tensor."""
    outputs = model(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device),
        labels=batch["labels"].to(device),
    )
    return outputs.loss


for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_losses = []

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        loss = _batch_loss(batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())

    avg_train_loss = sum(train_losses) / len(train_dataloader)
    print(f"✅ Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}")

    # ---- validation pass (no gradients) ----
    model.eval()
    with torch.no_grad():
        val_losses = [_batch_loss(batch).item() for batch in valid_dataloader]

    avg_val_loss = sum(val_losses) / len(valid_dataloader)
    print(f"📉 Validation Loss: {avg_val_loss:.4f}")

    # ---- early stopping: keep the best checkpoint, stop after too many misses ----
    improved = avg_val_loss < best_val_loss
    if not improved:
        patience_counter += 1
        print(f"⚠️ No improvement. Patience: {patience_counter}/{early_stop_patience}")
        if patience_counter >= early_stop_patience:
            print("🛑 Early stopping triggered.")
            break
    else:
        best_val_loss = avg_val_loss
        patience_counter = 0
        print("✅ Validation improved, saving best model...")
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)

    scheduler.step()



Epoch 1/10: 100%|██████████| 2500/2500 [23:32<00:00,  1.77it/s]


✅ Epoch 1, Train Loss: 1.8740
📉 Validation Loss: 1.6092
✅ Validation improved, saving best model...


Epoch 2/10: 100%|██████████| 2500/2500 [23:30<00:00,  1.77it/s]


✅ Epoch 2, Train Loss: 1.5532
📉 Validation Loss: 1.4742
✅ Validation improved, saving best model...


Epoch 3/10: 100%|██████████| 2500/2500 [23:31<00:00,  1.77it/s]


✅ Epoch 3, Train Loss: 1.4259
📉 Validation Loss: 1.3842
✅ Validation improved, saving best model...


Epoch 4/10: 100%|██████████| 2500/2500 [23:32<00:00,  1.77it/s]


✅ Epoch 4, Train Loss: 1.3413
📉 Validation Loss: 1.3278
✅ Validation improved, saving best model...


Epoch 5/10: 100%|██████████| 2500/2500 [23:30<00:00,  1.77it/s]


✅ Epoch 5, Train Loss: 1.2815
📉 Validation Loss: 1.2879
✅ Validation improved, saving best model...


Epoch 6/10: 100%|██████████| 2500/2500 [23:31<00:00,  1.77it/s]


✅ Epoch 6, Train Loss: 1.2362
📉 Validation Loss: 1.2543
✅ Validation improved, saving best model...


Epoch 7/10: 100%|██████████| 2500/2500 [23:31<00:00,  1.77it/s]


✅ Epoch 7, Train Loss: 1.2012
📉 Validation Loss: 1.2311
✅ Validation improved, saving best model...


Epoch 8/10: 100%|██████████| 2500/2500 [23:29<00:00,  1.77it/s]


✅ Epoch 8, Train Loss: 1.1732
📉 Validation Loss: 1.2132
✅ Validation improved, saving best model...


Epoch 9/10: 100%|██████████| 2500/2500 [23:30<00:00,  1.77it/s]


✅ Epoch 9, Train Loss: 1.1500
📉 Validation Loss: 1.1987
✅ Validation improved, saving best model...


Epoch 10/10: 100%|██████████| 2500/2500 [23:29<00:00,  1.77it/s]


✅ Epoch 10, Train Loss: 1.1313
📉 Validation Loss: 1.1849
✅ Validation improved, saving best model...


In [25]:
# Reload the best checkpoint saved during training and score it on the test set.
model = T5ForConditionalGeneration.from_pretrained(output_dir)
tokenizer = T5Tokenizer.from_pretrained(output_dir)
model.to(device)

bleu_score = evaluate_bleu(model, test_dataloader, tokenizer, device)
print(f"🔵 Final BLEU Score: {bleu_score['score']:.4f}")

# Try translating a single sentence
model.eval()
test_text = "It begins with a countdown."
# Every training input carries this task prefix (see TranslationDataset.__getitem__),
# so the same prefix must be applied at inference time.
prompt = f"translate English to Vietnamese: {test_text}"
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
translated = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)
translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
print(f"🌍 Dịch: {translated_text}")


KeyboardInterrupt: 