In [1]:
import warnings
# Silence FutureWarning messages for the rest of the kernel session. This cell
# runs before the library imports in the next cell, so the filter is already
# installed when those libraries are loaded.
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

2025-06-27 09:35:22.207689: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751016922.396721      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751016922.451058      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Load the raw corpus: a JSON Lines file, i.e. one JSON record per line.
train = pd.read_json(
    "/kaggle/input/corpus-errors/Corpus_text_generated.jsonl",
    lines=True,
)

# Keep only the (noisy text, corrected text) pair and write it to the working
# directory as CSV, without the DataFrame index column.
train.loc[:, ['text', 'correct_text']].to_csv('train.csv', index=False)

In [4]:
class SpellingCorrectionDataset(Dataset):
    """Map-style dataset of (noisy text, corrected text) pairs for seq2seq training.

    Rows are tokenized lazily on access. Both the source and the target are
    truncated/padded to exactly ``max_len`` tokens.

    Args:
        tokenizer: Callable tokenizer exposing ``pad_token_id``. It is called
            with a single string and ``return_tensors="pt"`` and is expected to
            return ``input_ids`` / ``attention_mask`` tensors with a leading
            batch dimension of size one.
        dataframe: pandas DataFrame with ``'text'`` (input) and
            ``'correct_text'`` (target) columns; rows are read positionally.
        max_len: Fixed token length of every returned sequence.
    """

    def __init__(self, tokenizer, dataframe, max_len=128):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows (examples) in the underlying DataFrame."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize one string into fixed-length PyTorch tensors (batch of one)."""
        return self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

    def __getitem__(self, index):
        """Tokenize the row at position ``index``.

        Returns:
            dict with 1-D tensors of length ``max_len``:
            ``input_ids`` and ``attention_mask`` for the source text, and
            ``labels`` holding the target token ids with padding replaced
            by -100.
        """
        row = self.data.iloc[index]
        source = self._encode(str(row['text']))
        target = self._encode(str(row['correct_text']))

        # squeeze(0) removes only the batch axis. A bare squeeze() would also
        # collapse the sequence axis whenever max_len == 1, yielding 0-d
        # scalars that cannot be stacked into (batch, seq) tensors.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)

        # Clone so the in-place masking below never touches the tokenizer output.
        labels = target['input_ids'].squeeze(0).clone()
        # -100 is the default ignore index of PyTorch's cross-entropy loss, so
        # padded target positions do not contribute to the training loss.
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": source_ids,
            "attention_mask": source_mask,
            "labels": labels
        }

In [5]:
# --- Training configuration ---
MODEL_NAME = "vinai/bartpho-syllable"
MAX_LEN = 128     # token budget per sequence, forwarded to the dataset's tokenizer calls
N_EPOCHS = 1.5    # fractional value: training stops partway through the final epoch
BATCH_SIZE = 32   # per device; gradients are additionally accumulated over 2 steps below
# fp16 mixed precision requires a CUDA device; hard-coding fp16=True makes
# TrainingArguments raise on CPU-only machines, so enable it only when available.
USE_FP16 = torch.cuda.is_available()

# --- Load and clean the data ---
# Relative path: this is the same 'train.csv' that the earlier export cell
# writes into the working directory.
df = (
    pd.read_csv('train.csv')
    .dropna(subset=['text', 'correct_text'])
    .astype(str)
)

# Drop over-long sentence pairs.
# NOTE(review): .str.len() counts characters while MAX_LEN is also used as the
# token budget, so the two limits are in different units — confirm intended.
df = df[(df['text'].str.len() < MAX_LEN) & (df['correct_text'].str.len() < MAX_LEN)]
print(f"✅ Số lượng câu dùng để huấn luyện: {len(df)}")

# --- Train/validation split (5% held out, fixed seed for reproducibility) ---
train_df, val_df = train_test_split(df, test_size=0.05, random_state=42)
print(f"📘 Train: {len(train_df)} | 🔍 Val: {len(val_df)}")

# --- Load tokenizer and model ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# --- Datasets and data collator ---
# reset_index gives each split a clean 0..n-1 index after the shuffle/split.
train_dataset = SpellingCorrectionDataset(tokenizer, train_df.reset_index(drop=True), MAX_LEN)
val_dataset = SpellingCorrectionDataset(tokenizer, val_df.reset_index(drop=True), MAX_LEN)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# --- Trainer configuration ---
training_args = Seq2SeqTrainingArguments(
    output_dir="./bartpho-tuned",
    num_train_epochs=N_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=2,
    warmup_steps=300,
    weight_decay=0.01,
    save_total_limit=1,
    # Evaluate and checkpoint on the same (per-epoch) schedule, which
    # load_best_model_at_end requires. save_steps is not passed: it is
    # ignored under an epoch-based save strategy.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    fp16=USE_FP16,
    dataloader_num_workers=2,
    report_to="none",
    # After training, reload the checkpoint with the lowest validation loss.
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator
)

# --- Train ---
print("🚀 Bắt đầu fine-tuning...")
trainer.train()
print("🏁 Huấn luyện hoàn tất!")

# --- Save model and tokenizer side by side so the directory can be reloaded ---
trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")
print("✅ Đã lưu model và tokenizer vào ./final_model")

✅ Số lượng câu dùng để huấn luyện: 150181
📘 Train: 142671 | 🔍 Val: 7510


config.json:   0%|          | 0.00/897 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

dict.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/2.83M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

🚀 Bắt đầu fine-tuning...


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Epoch,Training Loss,Validation Loss
1,0.0475,0.038777


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


🏁 Huấn luyện hoàn tất!
✅ Đã lưu model và tokenizer vào ./final_model
