In [None]:
import gdown
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Create the local directory that will hold the downloaded dataset files.
data_dir = '/content/my_data'
os.makedirs(data_dir, exist_ok=True)

# Shared Google Drive folder that contains the dataset JSON files.
folder_url = 'https://drive.google.com/drive/folders/186OAOuSEYEDVcry7WP5UBdqECXo26QAb?usp=drive_link'

# Download the whole folder through gdown's Python API instead of a shell
# command: the URL is defined once (no duplicated literal) and is never
# exposed to shell globbing on the '?' character.
gdown.download_folder(url=folder_url, output=data_dir, quiet=False)

Retrieving folder contents
Processing file 1hoTd2hFwjSeFThlPm6YpN0NW5ePXS3Jc small-dev.json
Processing file 1_3L25SH1_jaEfOjpmpgnfMik4N3MxSyn small-test.json
Processing file 1-eG6FeF-v__rsf77iWurddahXbyjTYh5 small-train.json
Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1hoTd2hFwjSeFThlPm6YpN0NW5ePXS3Jc
To: /content/my_data/small-dev.json
100% 594k/594k [00:00<00:00, 119MB/s]
Downloading...
From: https://drive.google.com/uc?id=1_3L25SH1_jaEfOjpmpgnfMik4N3MxSyn
To: /content/my_data/small-test.json
100% 669k/669k [00:00<00:00, 114MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-eG6FeF-v__rsf77iWurddahXbyjTYh5
To: /content/my_data/small-train.json
100% 5.68M/5.68M [00:00<00:00, 234MB/s]
Download completed


In [None]:
!pip install rouge-score




In [None]:
import json

train_path = "/content/my_data/small-train.json"
dev_path   = "/content/my_data/small-dev.json"
test_path  = "/content/my_data/small-test.json"


def _read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


# Load the three dataset splits, in the same order as they are reported below.
train_data = _read_json(train_path)
dev_data = _read_json(dev_path)
test_data = _read_json(test_path)

# Report how many examples each split contains.
for label, split in (
    ("Train size:", train_data),
    ("Dev size:", dev_data),
    ("Test size:", test_data),
):
    print(label, len(split))


Train size: 20000
Dev size: 2000
Test size: 2000


In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random
import json
from typing import List, Tuple, Dict
from collections import Counter


# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Sử dụng thiết bị: {device}")



Sử dụng thiết bị: cuda


In [None]:

# Device: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Reproducibility: seed the RNGs this notebook draws from. `random` drives the
# teacher-forcing coin flips; torch drives weight initialisation and the
# DataLoader shuffling.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Special tokens
PAD_TOKEN = "<pad>"
SOS_TOKEN = "<sos>"
EOS_TOKEN = "<eos>"
UNK_TOKEN = "<unk>"

# Model and training hyper-parameters
EMBEDDING_DIM = 256          # size of the token embeddings
HIDDEN_SIZE = 256            # LSTM hidden size
NUM_LAYERS = 3               # stacked LSTM layers in encoder and decoder
BATCH_SIZE = 64
N_EPOCHS = 10
TEACHER_FORCING_RATIO = 0.5  # probability of feeding the gold token at a decoding step




Device: cuda


# Bài 1: Xây dựng kiến trúc Encoder-Decoder gồm 3 lớp LSTM cho module encoder và 3 lớp LSTM cho module decoder, với hidden size là 256, cho bài toán dịch máy từ tiếng Anh sang tiếng Việt. Huấn luyện mô hình này trên bộ dữ liệu PhoMT sử dụng Adam làm phương thức tối ưu tham số. Đánh giá độ hiệu quả của mô hình sử dụng độ đo ROUGE-L.

In [None]:
class Vocab:
    """Whitespace-token vocabulary with four reserved special tokens.

    Ids 0-3 are reserved for PAD, SOS, EOS and UNK (in that order). Words are
    first counted with ``add_sentence``; ``build`` then assigns ids to every
    word seen at least ``min_freq`` times.
    """

    def __init__(self, min_freq=2):
        specials = (PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN)
        self.word2idx = {token: index for index, token in enumerate(specials)}
        self.idx2word = dict(enumerate(specials))
        self.counter = Counter()
        self.n_words = len(specials)
        self.min_freq = min_freq

    def add_sentence(self, sentence):
        """Count every whitespace-separated token of ``sentence``."""
        self.counter.update(sentence.split())

    def build(self):
        """Give the next free id to each sufficiently frequent, not yet known word."""
        for token, count in self.counter.items():
            if count < self.min_freq or token in self.word2idx:
                continue
            index = self.n_words
            self.word2idx[token] = index
            self.idx2word[index] = token
            self.n_words = index + 1

    def encode(self, sentence):
        """Map ``sentence`` to a list of ids; unknown words become the UNK id."""
        unk_index = self.word2idx[UNK_TOKEN]
        return [self.word2idx.get(token, unk_index) for token in sentence.split()]

# One vocabulary per language: English is the source, Vietnamese the target.
src_vocab, tgt_vocab = Vocab(), Vocab()

# Count word frequencies over the training pairs only.
for pair in train_data:
    src_vocab.add_sentence(pair["english"])
    tgt_vocab.add_sentence(pair["vietnamese"])

# Assign ids to the words that pass each vocabulary's frequency threshold.
for vocab in (src_vocab, tgt_vocab):
    vocab.build()

# Padding id, read from the source vocabulary.
PAD_IDX = src_vocab.word2idx[PAD_TOKEN]

print("SRC vocab size:", src_vocab.n_words)
print("TGT vocab size:", tgt_vocab.n_words)


SRC vocab size: 10721
TGT vocab size: 5080


In [None]:
class PhoMTDataset(Dataset):
    """Dataset of English/Vietnamese sentence pairs encoded as id tensors.

    Args:
        data: indexable collection of dicts with "english" and "vietnamese" keys.
        src_vocab: vocabulary used to encode the English side.
        tgt_vocab: vocabulary used to encode the Vietnamese side.
    """

    def __init__(self, data, src_vocab, tgt_vocab):
        self.data = data
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(src_ids, tgt_ids)`` for pair ``idx``.

        The target ids are wrapped in the SOS/EOS ids; the source ids are not.
        """
        example = self.data[idx]
        src = self.src_vocab.encode(example["english"])
        tgt = self.tgt_vocab.encode(example["vietnamese"])
        tgt = [self.tgt_vocab.word2idx[SOS_TOKEN]] + tgt + [self.tgt_vocab.word2idx[EOS_TOKEN]]
        # Force an integer dtype: torch.tensor([]) would otherwise produce a
        # float32 tensor for an empty sentence, which cannot index an embedding.
        return (
            torch.tensor(src, dtype=torch.long),
            torch.tensor(tgt, dtype=torch.long),
        )




In [None]:
def collate_fn(batch):
    """Pad a list of ``(src, tgt)`` tensor pairs into two batch-first tensors.

    Both sides are right-padded with ``PAD_IDX`` to the longest sequence of
    the batch and moved to the global ``device``.
    """
    src_seqs, tgt_seqs = zip(*batch)
    padded = [
        nn.utils.rnn.pad_sequence(
            seqs, batch_first=True, padding_value=PAD_IDX
        ).to(device)
        for seqs in (src_seqs, tgt_seqs)
    ]
    return padded[0], padded[1]


In [None]:
# Training batches: reshuffled every epoch and padded per batch by collate_fn.
train_dataset = PhoMTDataset(train_data, src_vocab, tgt_vocab)
train_loader = DataLoader(
    train_dataset,
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=BATCH_SIZE,
)


In [None]:
class Encoder(nn.Module):
    """Multi-layer LSTM encoder that summarises a source batch into its final state.

    Args:
        vocab_size: number of rows of the source embedding table.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, EMBEDDING_DIM, padding_idx=PAD_IDX)
        self.lstm = nn.LSTM(
            EMBEDDING_DIM,
            HIDDEN_SIZE,
            num_layers=NUM_LAYERS,
            batch_first=True
        )

    def forward(self, src):
        """Encode ``src`` (batch, src_len) and return the final ``(hidden, cell)``.

        Both returned tensors have shape (NUM_LAYERS, batch, HIDDEN_SIZE).
        """
        embedded = self.embedding(src)

        # Real (non-pad) length of every sentence; assumes right padding.
        # Clamp to 1 because packing rejects zero-length sequences, and keep
        # the lengths on the CPU as pack_padded_sequence requires.
        lengths = (src != PAD_IDX).sum(dim=1).clamp(min=1).cpu()

        # Pack so the LSTM stops at each sentence's last real token. Without
        # this the final state of a short sentence would also absorb the
        # padded steps and thus depend on the other sentences in the batch.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (hidden, cell) = self.lstm(packed)
        return hidden, cell


In [None]:
class Decoder(nn.Module):
    """Multi-layer LSTM decoder that scores the next target token.

    Args:
        vocab_size: size of the target vocabulary (embedding rows and output logits).
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, EMBEDDING_DIM, padding_idx=PAD_IDX)
        self.lstm = nn.LSTM(
            input_size=EMBEDDING_DIM,
            hidden_size=HIDDEN_SIZE,
            num_layers=NUM_LAYERS,
            batch_first=True,
        )
        self.fc = nn.Linear(HIDDEN_SIZE, vocab_size)

    def forward(self, input_token, hidden, cell):
        """Run one decoding step.

        Embeds ``input_token``, advances the LSTM from ``(hidden, cell)`` and
        projects the LSTM output to vocabulary logits.

        Returns:
            ``(prediction, hidden, cell)`` with the updated LSTM state.
        """
        token_vectors = self.embedding(input_token)
        step_output, next_state = self.lstm(token_vectors, (hidden, cell))
        # Drop the length-1 time axis before the output projection.
        logits = self.fc(step_output.squeeze(1))
        return (logits, *next_state)


In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing.

    Args:
        encoder: module mapping ``src`` to ``(hidden, cell)``.
        decoder: module mapping ``(input_token, hidden, cell)`` to
            ``(logits, hidden, cell)``; must expose ``fc.out_features``.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt, teacher_forcing_ratio=None):
        """Return logits of shape (batch, tgt_len - 1, vocab_size).

        Args:
            src: source ids, batch first.
            tgt: target ids, batch first; column 0 is fed as the first decoder
                input and columns 1.. are the positions being predicted.
            teacher_forcing_ratio: probability, drawn once per step for the
                whole batch, of feeding the gold token instead of the model's
                own argmax. ``None`` (default) uses the global
                ``TEACHER_FORCING_RATIO``; pass ``0.0`` to decode without
                teacher forcing, e.g. for evaluation.
        """
        if teacher_forcing_ratio is None:
            teacher_forcing_ratio = TEACHER_FORCING_RATIO

        batch_size = src.size(0)
        tgt_len = tgt.size(1)
        vocab_size = self.decoder.fc.out_features

        # Allocate directly on the input's device instead of building the
        # buffer on the CPU and copying it through the global `device`.
        outputs = torch.zeros(batch_size, tgt_len - 1, vocab_size, device=src.device)

        hidden, cell = self.encoder(src)
        decoder_input = tgt[:, 0].unsqueeze(1)

        for t in range(1, tgt_len):
            output, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[:, t - 1] = output

            use_teacher = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            decoder_input = tgt[:, t].unsqueeze(1) if use_teacher else top1.unsqueeze(1)

        return outputs


In [None]:
# Assemble the attention-free encoder-decoder and move it to the target device.
model = Seq2Seq(
    Encoder(src_vocab.n_words),
    Decoder(tgt_vocab.n_words),
).to(device)
encoder, decoder = model.encoder, model.decoder

# Adam optimiser; the loss skips padded target positions.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=1e-3)


In [None]:
# Train for N_EPOCHS passes over the training set, reporting the mean batch loss.
for epoch in range(N_EPOCHS):
    model.train()
    batch_losses = []

    for src, tgt in train_loader:
        optimizer.zero_grad()
        output = model(src, tgt)

        # Flatten (batch, steps, vocab) logits against the targets shifted
        # past the leading SOS column.
        flat_logits = output.reshape(-1, output.size(-1))
        flat_targets = tgt[:, 1:].reshape(-1)
        loss = criterion(flat_logits, flat_targets)

        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    total_loss = sum(batch_losses)
    print(f"Epoch {epoch+1}/{N_EPOCHS} - Loss: {total_loss/len(train_loader):.4f}")


Epoch 1/5 - Loss: 6.2686
Epoch 2/5 - Loss: 6.0120
Epoch 3/5 - Loss: 5.8003
Epoch 4/5 - Loss: 5.6275
Epoch 5/5 - Loss: 5.4756


# Bài 2: Xây dựng kiến trúc Encoder-Decoder gồm 3 lớp LSTM cho module encoder và 3 lớp LSTM cho module decoder, với hidden size là 256, cho bài toán dịch máy từ tiếng Anh sang tiếng Việt. Module decoder được trang bị kỹ thuật attention theo mô tả của nghiên cứu "[Neural Machine Translation by Jointly Learning to Align and Translate](https://arxiv.org/abs/1409.0473)". Huấn luyện mô hình này trên bộ dữ liệu PhoMT sử dụng Adam làm phương thức tối ưu tham số. Đánh giá độ hiệu quả của mô hình sử dụng độ đo ROUGE-L.

In [None]:
class Encoder(nn.Module):
    """Multi-layer LSTM encoder returning per-token outputs and the final state.

    NOTE: this definition replaces the earlier two-output ``Encoder`` of the
    notebook; models built afterwards receive three values from ``forward``.

    Args:
        vocab_size: number of rows of the source embedding table.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, EMBEDDING_DIM, padding_idx=PAD_IDX)
        self.lstm = nn.LSTM(
            EMBEDDING_DIM,
            HIDDEN_SIZE,
            num_layers=NUM_LAYERS,
            batch_first=True
        )

    def forward(self, src):
        """Encode ``src`` (batch, src_len).

        Returns:
            ``(outputs, hidden, cell)``: ``outputs`` is
            (batch, src_len, HIDDEN_SIZE) with zero vectors at padded
            positions; ``hidden`` and ``cell`` are
            (NUM_LAYERS, batch, HIDDEN_SIZE).
        """
        embedded = self.embedding(src)

        # Real (non-pad) length of every sentence; assumes right padding.
        # Clamp to 1 because packing rejects zero-length sequences, and keep
        # the lengths on the CPU as pack_padded_sequence requires.
        lengths = (src != PAD_IDX).sum(dim=1).clamp(min=1).cpu()

        # Pack so the LSTM stops at each sentence's last real token; otherwise
        # the final state of a short sentence would also absorb padded steps.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        packed_outputs, (hidden, cell) = self.lstm(packed)

        # Unpack to the original padded length so the shape matches `src`.
        outputs, _ = nn.utils.rnn.pad_packed_sequence(
            packed_outputs, batch_first=True, total_length=src.size(1)
        )
        return outputs, hidden, cell


In [None]:
class BahdanauAttention(nn.Module):
    """Additive attention: ``score = v^T tanh(W1 * enc + W2 * dec)``.

    Args:
        hidden_size: feature size of both the encoder outputs and the decoder state.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.W1 = nn.Linear(hidden_size, hidden_size)
        self.W2 = nn.Linear(hidden_size, hidden_size)
        self.v = nn.Linear(hidden_size, 1)

    def forward(self, decoder_hidden, encoder_outputs, mask=None):
        """Compute the context vector for one decoding step.

        Args:
            decoder_hidden: (batch, hidden_size) decoder state used as the query.
            encoder_outputs: (batch, src_len, hidden_size) encoder states.
            mask: optional (batch, src_len) tensor; positions that are
                False/0 (e.g. padding) get zero attention weight. Every row
                must keep at least one position, otherwise the softmax is NaN.
                ``None`` (default) attends over all positions.

        Returns:
            ``(context, attention_weights)`` of shapes (batch, hidden_size)
            and (batch, src_len).
        """
        # Project the query once and let broadcasting expand it over src_len,
        # instead of materialising src_len copies with repeat().
        query = self.W2(decoder_hidden).unsqueeze(1)
        energy = torch.tanh(self.W1(encoder_outputs) + query)

        attention = self.v(energy).squeeze(2)
        if mask is not None:
            # -inf scores become exactly zero weight after the softmax.
            attention = attention.masked_fill(~mask.bool(), float("-inf"))
        attention_weights = torch.softmax(attention, dim=1)

        context = torch.bmm(
            attention_weights.unsqueeze(1),
            encoder_outputs
        ).squeeze(1)

        return context, attention_weights


In [None]:
class DecoderAttention(nn.Module):
    """LSTM decoder with additive (Bahdanau) attention over the encoder outputs.

    The context vector is computed from the previous top-layer hidden state,
    concatenated with the token embedding as LSTM input, and concatenated
    again with the LSTM output before the vocabulary projection.

    Args:
        vocab_size: size of the target vocabulary.
    """

    def __init__(self, vocab_size):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, EMBEDDING_DIM, padding_idx=PAD_IDX)
        self.lstm = nn.LSTM(
            input_size=EMBEDDING_DIM + HIDDEN_SIZE,
            hidden_size=HIDDEN_SIZE,
            num_layers=NUM_LAYERS,
            batch_first=True,
        )
        self.attention = BahdanauAttention(HIDDEN_SIZE)
        self.fc = nn.Linear(HIDDEN_SIZE * 2, vocab_size)

    def forward(self, input_token, hidden, cell, encoder_outputs):
        """Run one decoding step.

        Returns:
            ``(prediction, hidden, cell, attn_weights)``: vocabulary logits,
            the updated LSTM state and the attention weights of this step.
        """
        token_vectors = self.embedding(input_token)

        # The query is the top layer of the previous hidden state.
        context, attn_weights = self.attention(hidden[-1], encoder_outputs)

        # Give the context a length-1 time axis so it lines up with the embedding.
        lstm_input = torch.cat((token_vectors, context.unsqueeze(1)), dim=2)
        step_output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))

        features = torch.cat((step_output.squeeze(1), context), dim=1)
        prediction = self.fc(features)

        return prediction, hidden, cell, attn_weights


In [None]:
class Seq2SeqAttention(nn.Module):
    """Encoder-decoder wrapper for an attention decoder, with teacher forcing.

    Args:
        encoder: module mapping ``src`` to ``(encoder_outputs, hidden, cell)``.
        decoder: module mapping ``(input_token, hidden, cell, encoder_outputs)``
            to ``(logits, hidden, cell, attn_weights)``; must expose
            ``fc.out_features``.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt, teacher_forcing_ratio=None):
        """Return logits of shape (batch, tgt_len - 1, vocab_size).

        Args:
            src: source ids, batch first.
            tgt: target ids, batch first; column 0 is fed as the first decoder
                input and columns 1.. are the positions being predicted.
            teacher_forcing_ratio: probability, drawn once per step for the
                whole batch, of feeding the gold token instead of the model's
                own argmax. ``None`` (default) uses the global
                ``TEACHER_FORCING_RATIO``; pass ``0.0`` to decode without
                teacher forcing, e.g. for evaluation.
        """
        if teacher_forcing_ratio is None:
            teacher_forcing_ratio = TEACHER_FORCING_RATIO

        batch_size = src.size(0)
        tgt_len = tgt.size(1)
        vocab_size = self.decoder.fc.out_features

        # Allocate directly on the input's device instead of building the
        # buffer on the CPU and copying it through the global `device`.
        outputs = torch.zeros(batch_size, tgt_len - 1, vocab_size, device=src.device)

        encoder_outputs, hidden, cell = self.encoder(src)
        decoder_input = tgt[:, 0].unsqueeze(1)

        for t in range(1, tgt_len):
            output, hidden, cell, _ = self.decoder(
                decoder_input, hidden, cell, encoder_outputs
            )

            outputs[:, t - 1] = output
            top1 = output.argmax(1)

            decoder_input = (
                tgt[:, t].unsqueeze(1)
                if random.random() < teacher_forcing_ratio
                else top1.unsqueeze(1)
            )

        return outputs


In [None]:
# Assemble the Bahdanau-attention model and move it to the target device.
model = Seq2SeqAttention(
    Encoder(src_vocab.n_words),
    DecoderAttention(tgt_vocab.n_words),
).to(device)
encoder, decoder = model.encoder, model.decoder

# Adam optimiser; the loss skips padded target positions.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=1e-3)


In [None]:
# Train for N_EPOCHS passes over the training set, reporting the mean batch loss.
for epoch in range(N_EPOCHS):
    model.train()
    batch_losses = []

    for src, tgt in train_loader:
        optimizer.zero_grad()
        output = model(src, tgt)

        # Flatten (batch, steps, vocab) logits against the targets shifted
        # past the leading SOS column.
        flat_logits = output.reshape(-1, output.size(-1))
        flat_targets = tgt[:, 1:].reshape(-1)
        loss = criterion(flat_logits, flat_targets)

        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    total_loss = sum(batch_losses)
    print(f" Epoch {epoch+1}/{N_EPOCHS} - Loss: {total_loss/len(train_loader):.4f}")


 Epoch 1/10 - Loss: 5.8666
 Epoch 2/10 - Loss: 5.5696
 Epoch 3/10 - Loss: 5.2287
 Epoch 4/10 - Loss: 4.9235
 Epoch 5/10 - Loss: 4.6527
 Epoch 6/10 - Loss: 4.4172
 Epoch 7/10 - Loss: 4.1822
 Epoch 8/10 - Loss: 3.9768
 Epoch 9/10 - Loss: 3.7914
 Epoch 10/10 - Loss: 3.6238


# Bài 3: Xây dựng kiến trúc Encoder-Decoder gồm 3 lớp LSTM cho module encoder và 3 lớp LSTM cho module decoder, với hidden size là 256, cho bài toán dịch máy từ tiếng Anh sang tiếng Việt. Module decoder được trang bị kỹ thuật attention theo mô tả của nghiên cứu "[Effective Approaches to Attention-based Neural Machine Translation](https://arxiv.org/abs/1508.04025)". Huấn luyện mô hình này trên bộ dữ liệu PhoMT sử dụng Adam làm phương thức tối ưu tham số. Đánh giá độ hiệu quả của mô hình sử dụng độ đo ROUGE-L.

In [None]:
class LuongAttention(nn.Module):
    """Parameter-free dot-product attention: ``score = enc . dec``."""

    def __init__(self):
        super().__init__()

    def forward(self, decoder_hidden, encoder_outputs, mask=None):
        """Compute the context vector for one decoding step.

        Args:
            decoder_hidden: (batch, hidden) decoder state used as the query.
            encoder_outputs: (batch, src_len, hidden) encoder states.
            mask: optional (batch, src_len) tensor; positions that are
                False/0 (e.g. padding) get zero attention weight. Every row
                must keep at least one position, otherwise the softmax is NaN.
                ``None`` (default) attends over all positions.

        Returns:
            ``(context, attn_weights)`` of shapes (batch, hidden) and
            (batch, src_len).
        """
        # Dot product of every encoder state with the query.
        scores = torch.bmm(
            encoder_outputs,
            decoder_hidden.unsqueeze(2)
        ).squeeze(2)

        if mask is not None:
            # -inf scores become exactly zero weight after the softmax.
            scores = scores.masked_fill(~mask.bool(), float("-inf"))

        attn_weights = torch.softmax(scores, dim=1)

        context = torch.bmm(
            attn_weights.unsqueeze(1),
            encoder_outputs
        ).squeeze(1)

        return context, attn_weights


In [None]:
class DecoderLuong(nn.Module):
    """LSTM decoder with dot-product (Luong) attention over the encoder outputs.

    The LSTM runs first; its output for the current step is the attention
    query, and the vocabulary projection sees that output concatenated with
    the resulting context vector.

    Args:
        vocab_size: size of the target vocabulary.
    """

    def __init__(self, vocab_size):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, EMBEDDING_DIM, padding_idx=PAD_IDX)
        self.lstm = nn.LSTM(
            input_size=EMBEDDING_DIM,
            hidden_size=HIDDEN_SIZE,
            num_layers=NUM_LAYERS,
            batch_first=True,
        )
        self.attention = LuongAttention()
        self.fc = nn.Linear(HIDDEN_SIZE * 2, vocab_size)

    def forward(self, input_token, hidden, cell, encoder_outputs):
        """Run one decoding step.

        Returns:
            ``(prediction, hidden, cell, attn_weights)``: vocabulary logits,
            the updated LSTM state and the attention weights of this step.
        """
        token_vectors = self.embedding(input_token)
        step_output, (hidden, cell) = self.lstm(token_vectors, (hidden, cell))

        # The LSTM output of the current step (time axis removed) is the query.
        query = step_output.squeeze(1)
        context, attn_weights = self.attention(query, encoder_outputs)

        features = torch.cat((query, context), dim=1)
        prediction = self.fc(features)

        return prediction, hidden, cell, attn_weights


In [None]:
class Seq2SeqLuong(nn.Module):
    """Encoder-decoder wrapper for the Luong-attention decoder, with teacher forcing.

    Args:
        encoder: module mapping ``src`` to ``(encoder_outputs, hidden, cell)``.
        decoder: module mapping ``(input_token, hidden, cell, encoder_outputs)``
            to ``(logits, hidden, cell, attn_weights)``; must expose
            ``fc.out_features``.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt, teacher_forcing_ratio=None):
        """Return logits of shape (batch, tgt_len - 1, vocab_size).

        Args:
            src: source ids, batch first.
            tgt: target ids, batch first; column 0 is fed as the first decoder
                input and columns 1.. are the positions being predicted.
            teacher_forcing_ratio: probability, drawn once per step for the
                whole batch, of feeding the gold token instead of the model's
                own argmax. ``None`` (default) uses the global
                ``TEACHER_FORCING_RATIO``; pass ``0.0`` to decode without
                teacher forcing, e.g. for evaluation.
        """
        if teacher_forcing_ratio is None:
            teacher_forcing_ratio = TEACHER_FORCING_RATIO

        batch_size = src.size(0)
        tgt_len = tgt.size(1)
        vocab_size = self.decoder.fc.out_features

        # Allocate directly on the input's device instead of building the
        # buffer on the CPU and copying it through the global `device`.
        outputs = torch.zeros(batch_size, tgt_len - 1, vocab_size, device=src.device)

        encoder_outputs, hidden, cell = self.encoder(src)
        decoder_input = tgt[:, 0].unsqueeze(1)

        for t in range(1, tgt_len):
            output, hidden, cell, _ = self.decoder(
                decoder_input, hidden, cell, encoder_outputs
            )

            outputs[:, t - 1] = output
            top1 = output.argmax(1)

            decoder_input = (
                tgt[:, t].unsqueeze(1)
                if random.random() < teacher_forcing_ratio
                else top1.unsqueeze(1)
            )

        return outputs


In [None]:
# Assemble the Luong-attention model and move it to the target device.
model = Seq2SeqLuong(
    Encoder(src_vocab.n_words),
    DecoderLuong(tgt_vocab.n_words),
).to(device)
encoder, decoder = model.encoder, model.decoder

# Adam optimiser; the loss skips padded target positions.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=1e-3)


In [None]:
# Train for N_EPOCHS passes over the training set, reporting the mean batch loss.
for epoch in range(N_EPOCHS):
    model.train()
    batch_losses = []

    for src, tgt in train_loader:
        optimizer.zero_grad()
        output = model(src, tgt)

        # Flatten (batch, steps, vocab) logits against the targets shifted
        # past the leading SOS column.
        flat_logits = output.reshape(-1, output.size(-1))
        flat_targets = tgt[:, 1:].reshape(-1)
        loss = criterion(flat_logits, flat_targets)

        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    total_loss = sum(batch_losses)
    print(f"Epoch {epoch+1}/{N_EPOCHS} - Loss: {total_loss/len(train_loader):.4f}")


Epoch 1/10 - Loss: 6.2031
Epoch 2/10 - Loss: 5.8211
Epoch 3/10 - Loss: 5.4783
Epoch 4/10 - Loss: 5.1753
Epoch 5/10 - Loss: 4.9282
Epoch 6/10 - Loss: 4.6970
Epoch 7/10 - Loss: 4.4839
Epoch 8/10 - Loss: 4.2885
Epoch 9/10 - Loss: 4.1127
Epoch 10/10 - Loss: 3.9396
