In [2]:
! pip install evaluate sacrebleu

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m51.8/51.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚î

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
import evaluate
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm
import pandas as pd
import os

In [7]:
def load_split(base_path, split="train"):
    """Load the English/Vietnamese CSV pair of one data split.

    Reads ``{base_path}/{split}_en.csv`` and ``{base_path}/{split}_vi.csv`` and
    places their columns side by side.

    Args:
        base_path: Directory that contains the ``<split>_en.csv`` and
            ``<split>_vi.csv`` files.
        split: File-name prefix of the split to load (default ``"train"``).

    Returns:
        A tuple ``(en_df, vi_df, df)``: the English frame, the Vietnamese
        frame, and their column-wise concatenation.

    Raises:
        ValueError: If the two files do not have the same number of rows, in
            which case a column-wise concat would pair sentences with NaN.
    """

    def _read_text_csv(path):
        # Read every cell as text and disable NA parsing: sentences such as
        # "NA", "null" or "None" are legitimate text, not missing values.
        frame = pd.read_csv(path, dtype=str, keep_default_na=False)
        # Drop stray byte-order marks embedded in the data (one is visible at
        # the start of the first dev-set English sentence).
        return frame.apply(lambda column: column.str.replace("\ufeff", "", regex=False))

    en_df = _read_text_csv(f"{base_path}/{split}_en.csv")
    vi_df = _read_text_csv(f"{base_path}/{split}_vi.csv")

    if len(en_df) != len(vi_df):
        raise ValueError(
            f"Row count mismatch for split '{split}': "
            f"{len(en_df)} English rows vs {len(vi_df)} Vietnamese rows"
        )

    df = pd.concat([en_df, vi_df], axis=1)
    return en_df, vi_df, df


In [8]:
# Root folder of the Kaggle dataset holding the per-split CSV pairs.
base_path = "/kaggle/input/translation-data"

# Each call yields (English frame, Vietnamese frame, side-by-side frame).
train_df_en, train_df_vi, train_df = load_split(base_path, split="train")
valid_df_en, valid_df_vi, valid_df = load_split(base_path, split="dev")
test_df_en, test_df_vi, test_df = load_split(base_path, split="test")

# Report row counts as "english / vietnamese / combined" for every split.
for split_label, split_frames in (
    ("train", (train_df_en, train_df_vi, train_df)),
    ("dev", (valid_df_en, valid_df_vi, valid_df)),
    ("test", (test_df_en, test_df_vi, test_df)),
):
    row_counts = " / ".join(str(len(frame)) for frame in split_frames)
    print(f"size of {split_label}: ", row_counts)

size of train:  2977999 / 2977999 / 2977999
size of dev:  18719 / 18719 / 18719
size of test:  19151 / 19151 / 19151


In [9]:
# Preview the first rows of the training data: Vietnamese, English, then the combined frame.
for preview_frame in (train_df_vi, train_df_en, train_df):
    print(preview_frame.head())

                                                  vi
0          C√¢u chuy·ªán b·∫Øt ƒë·∫ßu v·ªõi bu·ªïi l·ªÖ ƒë·∫øm ng∆∞·ª£c.
1  Ng√†y 14, th√°ng 8, nƒÉm 1947, g·∫ßn n·ª≠a ƒë√™m, ·ªü Bom...
2  C√πng l√∫c, tr√™n kh·∫Øp ƒë·∫•t ·∫§n, ng∆∞·ªùi ta n√≠n th·ªü c...
3  Khi ƒë·ªìng h·ªì ƒëi·ªÉm th·ªùi kh·∫Øc n·ª≠a ƒë√™m, m·ªôt ƒë·ª©a tr...
4  Nh·ªØng s·ª± ki·ªán n√†y l√† n·ªÅn m√≥ng t·∫°o n√™n "Nh·ªØng ƒë...
                                                  en
0                        It begins with a countdown.
1  On August 14th, 1947, a woman in Bombay goes i...
2  Across India, people hold their breath for the...
3  And at the stroke of midnight, a squirming inf...
4  These events form the foundation of "Midnight'...
                                                  en  \
0                        It begins with a countdown.   
1  On August 14th, 1947, a woman in Bombay goes i...   
2  Across India, people hold their breath for the...   
3  And at the stroke of midnight, a squirming inf...   

In [10]:
# Preview the first rows of the dev data: English, Vietnamese, then the combined frame.
for preview_frame in (valid_df_en, valid_df_vi, valid_df):
    print(preview_frame.head())

                                                  en
0  ÔªøHurricane Dorian, one of the most powerful st...
1  Dorian is especially dangerous due to its slow...
2  The storm passed by the Leeward Islands, Puert...
3  The United States branch office continues to g...
4  At this time, there have been no reported inju...
                                                  vi
0  V√†o ch·ªß nh·∫≠t ng√†y 1-9-2019, c∆°n b√£o Dorian, m·ªô...
1  B√£o Dorian ƒë·∫∑c bi·ªát nguy hi·ªÉm v√¨ n√≥ di chuy·ªÉn ...
2  Khi ƒëi qua qu·∫ßn ƒë·∫£o Leeward, Puerto Rico v√† qu...
3  VƒÉn ph√≤ng chi nh√°nh Hoa K·ª≥ ti·∫øp t·ª•c c·∫≠p nh·∫≠t t...
4  Theo b√°o c√°o ƒë·∫øn th·ªùi ƒëi·ªÉm hi·ªán t·∫°i, trong 46 ...
                                                  en  \
0  ÔªøHurricane Dorian, one of the most powerful st...   
1  Dorian is especially dangerous due to its slow...   
2  The storm passed by the Leeward Islands, Puert...   
3  The United States branch office continues to g...   
4  At this time, there h

In [11]:
# Preview the first rows of the test data: English, Vietnamese, then the combined frame.
for preview_frame in (test_df_en, test_df_vi, test_df):
    print(preview_frame.head())

                                                  en
0  Brother Albert Barnett and his wife, Sister Su...
1  Severe storms ripped through parts of the sout...
2  Two days of heavy rain, high winds, and numero...
3  Sadly, Brother Albert Barnett and his wife, Si...
4  The United States branch also reports that at ...
                                                  vi
0  Anh Albert Barnett v√† ch·ªã Susan Barnett, thu·ªôc...
1  Ng√†y 11 v√† 12-1-2020, nh·ªØng c∆°n b√£o l·ªõn ƒë√£ qu√©...
2  Nh·ªØng tr·∫≠n m∆∞a to v√† gi√≥ l·ªõn trong su·ªët hai ng...
3  ƒê√°ng bu·ªìn l√† anh Albert Barnett 85 tu·ªïi, v√† v·ª£...
4  Chi nh√°nh Hoa K·ª≥ c≈©ng cho bi·∫øt c√≥ √≠t nh·∫•t b·ªën ...
                                                  en  \
0  Brother Albert Barnett and his wife, Sister Su...   
1  Severe storms ripped through parts of the sout...   
2  Two days of heavy rain, high winds, and numero...   
3  Sadly, Brother Albert Barnett and his wife, Si...   
4  The United States branch also re

In [12]:
class TranslationDataset(Dataset):
    """Parallel-sentence dataset that tokenizes one source/target pair per item.

    Args:
        dataframe: Table with one column per language; rows are sentence pairs.
        tokenizer: Callable tokenizer exposing ``pad_token_id``. It is called
            with ``max_length``, ``padding``, ``truncation`` and
            ``return_tensors="pt"`` and must accept ``text_target=`` for the
            target side (assumes a Hugging Face tokenizer - TODO confirm the
            installed transformers version supports ``text_target``).
        source_lang: Name of the column holding the source sentences.
        target_lang: Name of the column holding the target sentences.
        max_length: Length every encoded sequence is padded/truncated to.
        task_prefix: Text prepended to each source sentence. The default keeps
            the original English-to-Vietnamese prompt; change it together with
            ``source_lang``/``target_lang`` for other directions.
    """

    def __init__(self, dataframe, tokenizer, source_lang="en", target_lang="vi", max_length=128,
                 task_prefix="translate English to Vietnamese: "):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.source_lang = source_lang
        self.target_lang = target_lang
        self.max_length = max_length
        self.task_prefix = task_prefix

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th pair.

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` for the prefixed
            source sentence and ``labels`` for the target sentence, where
            padding positions are replaced by -100.
        """
        source_text = f"{self.task_prefix}{self.dataframe[self.source_lang].iloc[idx]}"
        target_text = self.dataframe[self.target_lang].iloc[idx]

        # Both sides share the same fixed-length encoding settings.
        encode_kwargs = dict(
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        source_encoding = self.tokenizer(source_text, **encode_kwargs)
        # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
        target_encoding = self.tokenizer(text_target=target_text, **encode_kwargs)

        # squeeze(0) drops only the leading batch axis; a bare squeeze() would
        # also collapse the sequence axis when max_length == 1.
        labels = target_encoding["input_ids"].squeeze(0)
        # Replace pad positions with -100 in the labels.
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        return {
            "input_ids": source_encoding["input_ids"].squeeze(0),
            "attention_mask": source_encoding["attention_mask"].squeeze(0),
            "labels": labels,
        }


In [23]:
def evaluate_bleu(model, dataloader, tokenizer, device, max_length=128, num_beams=4):
    """Generate translations for every batch and score them with sacreBLEU.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``.
        dataloader: Iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors (labels use -100 for
            ignored positions).
        tokenizer: Tokenizer exposing ``batch_decode`` and ``pad_token_id``.
        device: Device the inputs are moved to before generation.
        max_length: Maximum length passed to ``generate`` (default 128).
        num_beams: Beam width passed to ``generate`` (default 4).

    Returns:
        The result of the ``sacrebleu`` metric's ``compute`` call.

    Note:
        The model is left in eval mode. References are decoded back from the
        label ids, so anything lost during tokenization (truncation, tokens
        the vocabulary cannot represent) is also missing from the references.
        NOTE(review): consider scoring against the raw reference text.
    """
    model.eval()
    predictions = []
    references = []
    bleu = evaluate.load("sacrebleu")

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True
            )
            pred_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

            # Restore pad ids in place of -100 before decoding. masked_fill
            # returns a new tensor, so the caller's batch is never modified;
            # the labels are only decoded, so they stay off the device.
            label_ids = batch["labels"]
            label_ids = label_ids.masked_fill(label_ids == -100, tokenizer.pad_token_id)
            ref_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

            predictions.extend(pred_texts)
            # One list of references per prediction, as the metric call expects.
            references.extend([[ref] for ref in ref_texts])

    bleu_score = bleu.compute(predictions=predictions, references=references)
    return bleu_score


In [17]:
# Setup: seed PyTorch's RNGs so shuffling and dropout start from the same state on every run.
seed = 42
torch.manual_seed(seed)

model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Datasets: only the leading rows of each split are used
# (presumably to bound run time - adjust the sizes below to use more data).
train_subset_size = 80000
eval_subset_size = 10000

train_dataset = TranslationDataset(train_df.head(train_subset_size), tokenizer)
valid_dataset = TranslationDataset(valid_df.head(eval_subset_size), tokenizer)
test_dataset = TranslationDataset(test_df.head(eval_subset_size), tokenizer)

batch_size = 32

# Only the training data is shuffled; validation and test keep file order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

learning_rate = 5e-5
lr_decay_per_epoch = 0.9  # StepLR multiplies the learning rate by this after every epoch

optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = StepLR(optimizer, step_size=1, gamma=lr_decay_per_epoch)

# Directory where the best checkpoint is saved on Kaggle.
output_dir = "/kaggle/working/my_translation_model/best"
os.makedirs(output_dir, exist_ok=True)

num_epochs = 10
early_stop_patience = 2  # stop after this many consecutive epochs without improvement
best_val_loss = float("inf")
patience_counter = 0


def _batch_loss(batch):
    """Move one collated batch to ``device``, run the model on it and return the loss."""
    outputs = model(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device),
        labels=batch["labels"].to(device),
    )
    return outputs.loss


for epoch in range(num_epochs):
    # Training pass
    model.train()
    total_loss = 0

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        loss = _batch_loss(batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"‚úÖ Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}")

    # Validation pass (no gradients)
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in valid_dataloader:
            total_val_loss += _batch_loss(batch).item()

    avg_val_loss = total_val_loss / len(valid_dataloader)
    print(f"üìâ Validation Loss: {avg_val_loss:.4f}")

    # Early stopping: checkpoint on improvement, otherwise count towards the patience limit.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        print("‚úÖ Validation improved, saving best model...")
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
    else:
        patience_counter += 1
        print(f"‚ö†Ô∏è No improvement. Patience: {patience_counter}/{early_stop_patience}")
        if patience_counter >= early_stop_patience:
            print("üõë Early stopping triggered.")
            break

    scheduler.step()



Epoch 1/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2500/2500 [23:32<00:00,  1.77it/s]


‚úÖ Epoch 1, Train Loss: 1.8740
üìâ Validation Loss: 1.6092
‚úÖ Validation improved, saving best model...


Epoch 2/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2500/2500 [23:30<00:00,  1.77it/s]


‚úÖ Epoch 2, Train Loss: 1.5532
üìâ Validation Loss: 1.4742
‚úÖ Validation improved, saving best model...


Epoch 3/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2500/2500 [23:31<00:00,  1.77it/s]


‚úÖ Epoch 3, Train Loss: 1.4259
üìâ Validation Loss: 1.3842
‚úÖ Validation improved, saving best model...


Epoch 4/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2500/2500 [23:32<00:00,  1.77it/s]


‚úÖ Epoch 4, Train Loss: 1.3413
üìâ Validation Loss: 1.3278
‚úÖ Validation improved, saving best model...


Epoch 5/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2500/2500 [23:30<00:00,  1.77it/s]


‚úÖ Epoch 5, Train Loss: 1.2815
üìâ Validation Loss: 1.2879
‚úÖ Validation improved, saving best model...


Epoch 6/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2500/2500 [23:31<00:00,  1.77it/s]


‚úÖ Epoch 6, Train Loss: 1.2362
üìâ Validation Loss: 1.2543
‚úÖ Validation improved, saving best model...


Epoch 7/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2500/2500 [23:31<00:00,  1.77it/s]


‚úÖ Epoch 7, Train Loss: 1.2012
üìâ Validation Loss: 1.2311
‚úÖ Validation improved, saving best model...


Epoch 8/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2500/2500 [23:29<00:00,  1.77it/s]


‚úÖ Epoch 8, Train Loss: 1.1732
üìâ Validation Loss: 1.2132
‚úÖ Validation improved, saving best model...


Epoch 9/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2500/2500 [23:30<00:00,  1.77it/s]


‚úÖ Epoch 9, Train Loss: 1.1500
üìâ Validation Loss: 1.1987
‚úÖ Validation improved, saving best model...


Epoch 10/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2500/2500 [23:29<00:00,  1.77it/s]


‚úÖ Epoch 10, Train Loss: 1.1313
üìâ Validation Loss: 1.1849
‚úÖ Validation improved, saving best model...


In [25]:
# Reload the checkpoint that the training loop saved to output_dir.
model = T5ForConditionalGeneration.from_pretrained(output_dir)
tokenizer = T5Tokenizer.from_pretrained(output_dir)
model.to(device)

bleu_score = evaluate_bleu(model, test_dataloader, tokenizer, device)
print(f"üîµ Final BLEU Score: {bleu_score['score']:.4f}")

# Sample translation
model.eval()
test_text = "It begins with a countdown."
# The model was fine-tuned on inputs that start with this task prefix
# (TranslationDataset.__getitem__), so inference must use the same prompt format.
prompt_text = f"translate English to Vietnamese: {test_text}"
inputs = tokenizer(prompt_text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
translated = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)
translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
print(f"üåç D·ªãch: {translated_text}")


KeyboardInterrupt: 