In [2]:
! pip install evaluate sacrebleu --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.8.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 1

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
import tqdm as tqdm_
from torch.optim import AdamW
from sacrebleu import corpus_bleu


In [9]:
def load_dataset_kaggle(en_path, vi_path, sample_size=None):
    en_df = pd.read_csv(en_path, names=["en"], encoding="utf-8")
    vi_df = pd.read_csv(vi_path, names=["vi"], encoding="utf-8")
    df = pd.concat([en_df, vi_df], axis=1).dropna().reset_index(drop=True)
    print(len(en_df),"/",len(vi_df),"/",len(df))
    if sample_size:
        df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)
    return df

# 🔹 Đường dẫn Kaggle
base_path = "/kaggle/input/translation-data"

# 🔹 Load training data
df_training = load_dataset_kaggle(
    f"{base_path}/train_en.csv",
    f"{base_path}/train_vi.csv",
    sample_size=80000
)
print(df_training.head())

# 🔹 Load validation data
df_validation = load_dataset_kaggle(
    f"{base_path}/dev_en.csv",
    f"{base_path}/dev_vi.csv",
    sample_size=10000
)
print(df_validation.head())

# 🔹 Load test data
df_test = load_dataset_kaggle(
    f"{base_path}/test_en.csv",
    f"{base_path}/test_vi.csv",
    sample_size=10000
)
print(df_test.head())


2978000 / 2978000 / 2978000
                                                  en  \
0  I knew if I wanted to live, I would have to th...   
1  But once I lost my sight and was walking along...   
2  She is a member of FRELIMO and was elected to ...   
3                        What are we supposed to do?   
4  And this is really what I want to talk to you ...   

                                                  vi  
0  Tôi biết nếu tôi muốn sống, tôi phải biết nghĩ...  
1  Nhưng khi mất đi thị giác và đi dọc trên đường...  
2  Cô là thành viên của FRELIMO và được bầu vào H...  
3                           Gia đình tớ phải làm gì?  
4  Đây là điều mà tôi thực sự muốn nói hôm nay - ...  
18720 / 18720 / 18720
                                                  en  \
0  Tanjirou Kamado is a kindhearted, intelligent ...   
1  Incorporate gold and bronze into your look, es...   
2  You're better off just getting to work despite...   
3                           She's not Iris, I guess.   
4  

In [10]:
class TranslationData(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        src = "translate English to Vietnamese: " + self.dataframe.loc[index, 'en']
        tgt = self.dataframe.loc[index, 'vi']

        src_tokenizer = self.tokenizer(src, padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt")
        tgt_tokenizer = self.tokenizer(tgt, padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt")

        input_ids = src_tokenizer['input_ids'].squeeze()
        attention_mask = src_tokenizer['attention_mask'].squeeze()
        labels = tgt_tokenizer['input_ids'].squeeze()
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

In [None]:
#  Model setup
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

#  Dataloader setup
max_length = 128
batch_size = 32

train_loader = DataLoader(TranslationData(df_training, tokenizer, max_length), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(TranslationData(df_validation, tokenizer, max_length), batch_size=batch_size)
test_loader = DataLoader(TranslationData(df_test, tokenizer, max_length), batch_size=batch_size)

#  Optimizer & Training
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 10
patience = 2
best_val_loss = float("inf")
epochs_no_improve = 0
save_dir = "/kaggle/working/my_t5_translation_model"
os.makedirs(save_dir, exist_ok=True)

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    loop = tqdm_.tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} Training")

    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loop.set_postfix(loss=loss.item())

    avg_train_loss = train_loss / len(train_loader)

    #  Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)
        print(" Saved best model.")
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        print(f"⚠️ No improvement. Patience {epochs_no_improve}/{patience}")
        if epochs_no_improve >= patience:
            print(" Early stopping.")
            break


Epoch 1/10 Training:   0%|          | 0/2500 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Epoch 1/10 Training: 100%|██████████| 2500/2500 [23:03<00:00,  1.81it/s, loss=1.81]


Epoch 1 - Train Loss: 1.9839, Val Loss: 1.6297
✅ Saved best model.


Epoch 2/10 Training: 100%|██████████| 2500/2500 [23:09<00:00,  1.80it/s, loss=1.53]


Epoch 2 - Train Loss: 1.6843, Val Loss: 1.4659
✅ Saved best model.


Epoch 3/10 Training: 100%|██████████| 2500/2500 [23:08<00:00,  1.80it/s, loss=1.58]


Epoch 3 - Train Loss: 1.5564, Val Loss: 1.3583
✅ Saved best model.


Epoch 4/10 Training: 100%|██████████| 2500/2500 [23:08<00:00,  1.80it/s, loss=1.41]


Epoch 4 - Train Loss: 1.4641, Val Loss: 1.2764
✅ Saved best model.


Epoch 5/10 Training: 100%|██████████| 2500/2500 [23:08<00:00,  1.80it/s, loss=1.37]


Epoch 5 - Train Loss: 1.3929, Val Loss: 1.2150
✅ Saved best model.


Epoch 6/10 Training: 100%|██████████| 2500/2500 [23:08<00:00,  1.80it/s, loss=1.31]


Epoch 6 - Train Loss: 1.3351, Val Loss: 1.1638
✅ Saved best model.


Epoch 7/10 Training: 100%|██████████| 2500/2500 [23:08<00:00,  1.80it/s, loss=1.31]


Epoch 7 - Train Loss: 1.2863, Val Loss: 1.1201
✅ Saved best model.


Epoch 8/10 Training: 100%|██████████| 2500/2500 [23:09<00:00,  1.80it/s, loss=1.13]


Epoch 8 - Train Loss: 1.2441, Val Loss: 1.0815
✅ Saved best model.


Epoch 9/10 Training: 100%|██████████| 2500/2500 [23:08<00:00,  1.80it/s, loss=1.07] 


Epoch 9 - Train Loss: 1.2068, Val Loss: 1.0527
✅ Saved best model.


Epoch 10/10 Training: 100%|██████████| 2500/2500 [23:08<00:00,  1.80it/s, loss=1.13] 


Epoch 10 - Train Loss: 1.1739, Val Loss: 1.0240
✅ Saved best model.


In [None]:
#  Evaluation
model = T5ForConditionalGeneration.from_pretrained(save_dir).to(device)
tokenizer = T5Tokenizer.from_pretrained(save_dir)
model.eval()

predictions = []
references = []
with torch.no_grad():
    for batch in tqdm_.tqdm(test_loader, desc="Testing"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"]

        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128)
        decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        labels[labels == -100] = tokenizer.pad_token_id
        decoded_refs = tokenizer.batch_decode(labels, skip_special_tokens=True)

        predictions.extend(decoded_preds)
        references.extend([[ref] for ref in decoded_refs])

bleu = corpus_bleu(predictions, references)
print(f" Final BLEU score: {bleu.score:.2f}")

Testing: 100%|██████████| 313/313 [09:02<00:00,  1.73s/it]


🔵 Final BLEU score: 39.34
