In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import unicodedata
import re
import time
from tqdm import tqdm

In [2]:
class Vocab:
    """Character-level vocabulary with three reserved ids.

    Ids 0, 1 and 2 are reserved for padding, start-of-sequence and
    end-of-sequence. The i-th character of ``chars`` is assigned id ``i + 3``.
    """

    def __init__(self, chars):
        """Build the character <-> id lookup tables.

        Args:
            chars: iterable of single characters (typically one string).
        """
        self.pad, self.go, self.eos = 0, 1, 2
        self.chars = chars
        self.c2i = {c: i + 3 for i, c in enumerate(chars)}
        self.i2c = {i + 3: c for i, c in enumerate(chars)}
        self.i2c.update({0: '<pad>', 1: '<sos>', 2: '<eos>'})

    def encode(self, text):
        """Return ``[go] + ids + [eos]`` for ``text``.

        Characters that are not in the vocabulary are mapped to the pad id.
        """
        return [self.go] + [self.c2i.get(c, self.pad) for c in text] + [self.eos]

    def decode(self, ids):
        """Turn a sequence of ids back into a string.

        A start id in the first position is skipped and decoding stops at the
        first end id (if any). Ids without a table entry are dropped.

        Args:
            ids: any iterable of integer-like values (list, 1-D array/tensor).
        """
        # Normalise to plain ints so that .index() and dict lookups work for
        # array/tensor inputs as well as for lists.
        ids = [int(i) for i in ids]
        # Only strip the start id when it is really the leading token; a start
        # id found elsewhere must not cause the first real token to be dropped.
        first = 1 if ids and ids[0] == self.go else 0
        last = ids.index(self.eos) if self.eos in ids else None
        return "".join(self.i2c.get(i, '') for i in ids[first:last])

    def __len__(self):
        """Number of ids in use, i.e. one more than the largest assigned id."""
        # i2c holds one entry per position in `chars` plus the three reserved
        # ids, so its size stays above every id encode() can emit even when
        # `chars` contains duplicates (c2i collapses duplicates and would
        # undercount).
        return len(self.i2c)

In [3]:
class SpellingDataset(Dataset):
    """Dataset yielding (source, target) id tensors of fixed length ``max_len``.

    Each item is built from the ``'text'`` and ``'correct_text'`` columns of one
    dataframe row, encoded with ``vocab.encode`` and right-padded with zeros.
    """

    def __init__(self, df, vocab, max_len=100):
        self.df = df
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def _to_padded_ids(self, sentence):
        """Truncate, encode and zero-pad one sentence to ``max_len`` ids."""
        # Trim the sentence so that it still fits into max_len once the
        # encoder has added its two special tokens.
        char_budget = self.max_len - 2
        token_ids = self.vocab.encode(sentence[:char_budget])
        padded = np.zeros(self.max_len, dtype=np.int64)
        padded[:len(token_ids)] = token_ids
        return torch.LongTensor(padded)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        noisy = str(record['text'])
        clean = str(record['correct_text'])
        return self._to_padded_ids(noisy), self._to_padded_ids(clean)

In [4]:
class Encoder(nn.Module):
    """Embedding + dropout + single-layer bidirectional GRU encoder.

    ``forward`` returns the GRU outputs for every position together with an
    initial decoder state obtained by concatenating the last two slices of the
    GRU's final hidden state (the two directions of the bidirectional layer)
    and passing them through a linear layer followed by tanh.
    """

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
        self.fc = nn.Linear(2 * enc_hid_dim, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        token_emb = self.embedding(src)
        outputs, final_states = self.rnn(self.dropout(token_emb))
        # Merge the two directional states into a single decoder-sized state.
        both_directions = torch.cat((final_states[-2], final_states[-1]), dim=1)
        decoder_init = self.fc(both_directions).tanh()
        return outputs, decoder_init
class Attention(nn.Module):
    """Additive attention over the encoder outputs.

    ``forward`` scores every source position against the current decoder
    state and returns the scores normalised with a softmax over dim 1.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear(dec_hid_dim + 2 * enc_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        src_len = encoder_outputs.shape[0]
        # Repeat the decoder state once per source position so it can be
        # concatenated with the (dim-0/dim-1 swapped) encoder outputs.
        tiled_hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        enc_swapped = encoder_outputs.permute(1, 0, 2)
        features = torch.cat((tiled_hidden, enc_swapped), dim=2)
        energy = self.attn(features).tanh()
        scores = self.v(energy).squeeze(2)
        return scores.softmax(dim=1)
class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    One call to ``forward`` consumes one token per batch element and returns
    the scores over the output vocabulary plus the updated hidden state.
    """

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim + 2 * enc_hid_dim, dec_hid_dim)
        self.fc_out = nn.Linear(emb_dim + dec_hid_dim + 2 * enc_hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        # Add a leading length-1 axis so the token batch looks like a
        # one-step sequence to the embedding/GRU.
        token_emb = self.dropout(self.embedding(input.unsqueeze(0)))

        # Attention-weighted sum of the encoder outputs (the context vector).
        attn_weights = self.attention(hidden, encoder_outputs).unsqueeze(1)
        context = torch.bmm(attn_weights, encoder_outputs.permute(1, 0, 2))
        context = context.permute(1, 0, 2)

        rnn_out, new_hidden = self.rnn(
            torch.cat((token_emb, context), dim=2),
            hidden.unsqueeze(0),
        )

        # The output layer sees the GRU output, the context and the embedding.
        features = torch.cat(
            (rnn_out.squeeze(0), context.squeeze(0), token_emb.squeeze(0)),
            dim=1,
        )
        return self.fc_out(features), new_hidden.squeeze(0)
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that unrolls the decoder one step at a time.

    ``forward`` returns a tensor indexed as ``[step, batch, vocab]``. Row 0 is
    never written and stays zero, because decoding starts from ``trg[0]`` and
    the first prediction is stored at step 1.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        trg_len, batch_size = trg.shape[0], src.shape[1]
        logits = torch.zeros(
            trg_len, batch_size, self.decoder.output_dim, device=self.device
        )

        encoder_outputs, hidden = self.encoder(src)
        next_input = trg[0, :]

        for step in range(1, trg_len):
            step_logits, hidden = self.decoder(next_input, hidden, encoder_outputs)
            logits[step] = step_logits
            # With probability teacher_forcing_ratio feed the reference token,
            # otherwise feed the decoder's own most likely token.
            if np.random.random() < teacher_forcing_ratio:
                next_input = trg[step]
            else:
                next_input = step_logits.argmax(1)

        return logits

In [5]:
if __name__ == '__main__':
    # --- Hyperparameter configuration ---
    SEED = 42
    MAX_LEN_LIMIT = 250
    TRAIN_SAMPLE_SIZE = 150000
    N_EPOCHS = 3

    # Gradients from ACCUMULATION_STEPS small batches are summed before each
    # optimizer step, emulating a batch of REAL_BATCH_SIZE samples.
    GPU_BATCH_SIZE = 16
    REAL_BATCH_SIZE = 64
    ACCUMULATION_STEPS = REAL_BATCH_SIZE // GPU_BATCH_SIZE

    ENC_EMB_DIM, DEC_EMB_DIM = 128, 128
    ENC_HID_DIM, DEC_HID_DIM = 256, 256

    # Seed both generators used by the run: torch drives weight init, dropout
    # and DataLoader shuffling; numpy drives the teacher-forcing decision.
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    # --- Load and filter the data ---
    train_path = '/kaggle/input/preprocessing/train.csv'
    df_train = (
        pd.read_csv(train_path)
        .dropna(subset=['text', 'correct_text'])
        .astype(str)
    )

    # The dataset reserves two positions for the start/end tokens, so only
    # keep sentences that fit without being truncated.
    max_text_chars = MAX_LEN_LIMIT - 2
    fits_limit = (
        (df_train['text'].str.len() <= max_text_chars)
        & (df_train['correct_text'].str.len() <= max_text_chars)
    )
    df_filtered = df_train[fits_limit].copy()
    df_sample = df_filtered.sample(n=min(len(df_filtered), TRAIN_SAMPLE_SIZE), random_state=SEED)
    print(f"Số lượng câu sẽ được dùng để huấn luyện: {len(df_sample)}")

    # --- Build the vocabulary ---
    # The backslash is written as '\\' to avoid the invalid '\]' escape; the
    # resulting string is unchanged.
    vietnamese_chars = 'aAàÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬbBcCdDđĐeEèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆfFgGhHiIìÌỉỈĩĨíÍịỊjJkKlLmMnNoOòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢpPqQrRsStTuUùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰvVwWxXyYỳỲỷỶỹỸýÝỵỴzZ0123456789!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ '
    vocab = Vocab(vietnamese_chars)

    # --- Build the DataLoader ---
    train_dataset = SpellingDataset(df_sample, vocab, max_len=MAX_LEN_LIMIT)
    train_loader = DataLoader(train_dataset, batch_size=GPU_BATCH_SIZE, shuffle=True)
    print(f"Đã tạo DataLoader với MAX_LEN = {MAX_LEN_LIMIT} và GPU_BATCH_SIZE = {GPU_BATCH_SIZE}")

    # --- Instantiate the model ---
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    INPUT_DIM, OUTPUT_DIM = len(vocab), len(vocab)
    ENC_DROPOUT, DEC_DROPOUT = 0.5, 0.5
    CLIP, LEARNING_RATE = 1, 0.001
    MODEL_SAVE_PATH = 'vietnamese_spelling_correction_final.pth'

    attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)
    model = Seq2Seq(enc, dec, device).to(device)

    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    # Padded target positions do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=vocab.pad)

    # --- Training loop ---
    print(f"--- Bắt đầu huấn luyện trên thiết bị: {device} ---")
    start_time_total = time.time()
    num_batches = len(train_loader)

    for epoch in range(N_EPOCHS):
        model.train()
        epoch_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{N_EPOCHS}")

        optimizer.zero_grad()

        for i, (src, tgt) in enumerate(progress_bar):
            # The model indexes tensors as [position, batch], so transpose
            # the batch-first tensors produced by the DataLoader.
            src, tgt = src.to(device).T, tgt.to(device).T

            output = model(src, tgt)
            output_dim = output.shape[-1]

            # Step 0 of the output is never predicted (it corresponds to the
            # start token), so drop it from both logits and targets.
            output = output[1:].reshape(-1, output_dim)
            tgt = tgt[1:].reshape(-1)

            loss = criterion(output, tgt)
            batch_loss = loss.item()
            # Scale so that the accumulated gradient is the mean over the window.
            (loss / ACCUMULATION_STEPS).backward()

            # Step at the end of every accumulation window and also after the
            # last batch of the epoch; otherwise the gradients of a trailing
            # partial window would be thrown away by the next zero_grad().
            # That final window may hold fewer batches but keeps the same
            # 1/ACCUMULATION_STEPS scaling.
            window_complete = (i + 1) % ACCUMULATION_STEPS == 0
            last_batch = (i + 1) == num_batches
            if window_complete or last_batch:
                torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
                optimizer.step()
                optimizer.zero_grad()

            epoch_loss += batch_loss
            progress_bar.set_postfix({'loss': f'{batch_loss:.3f}'})

        # Mean of the per-batch losses; dividing by the real batch count stays
        # correct when the dataset size is not a multiple of the batch size.
        avg_loss = epoch_loss / max(num_batches, 1)
        print(f"\nEpoch: {epoch+1:02} | Train Loss: {avg_loss:.3f}")

    end_time_total = time.time()
    training_mins, training_secs = divmod(end_time_total - start_time_total, 60)

    # --- Save the model ---
    torch.save(model.state_dict(), MODEL_SAVE_PATH)
    print(f"\n--- HUẤN LUYỆN HOÀN TẤT TRONG {int(training_mins)}m {int(training_secs)}s ---")
    print(f"Model đã được lưu tại: /kaggle/working/{MODEL_SAVE_PATH}")

Số lượng câu sẽ được dùng để huấn luyện: 150000
Đã tạo DataLoader với MAX_LEN = 250 và GPU_BATCH_SIZE = 16
--- Bắt đầu huấn luyện trên thiết bị: cuda ---


Epoch 1/3: 100%|██████████| 9375/9375 [1:24:41<00:00,  1.84it/s, loss=0.226]



Epoch: 01 | Train Loss: 1.089


Epoch 2/3: 100%|██████████| 9375/9375 [1:26:51<00:00,  1.80it/s, loss=0.159]



Epoch: 02 | Train Loss: 0.226


Epoch 3/3: 100%|██████████| 9375/9375 [1:26:52<00:00,  1.80it/s, loss=0.133]


Epoch: 03 | Train Loss: 0.198

--- HUẤN LUYỆN HOÀN TẤT TRONG 258m 25s ---
Model đã được lưu tại: /kaggle/working/vietnamese_spelling_correction_final.pth



