In [2]:
from google.colab import files

uploaded = files.upload()

Saving dev_word.json to dev_word.json
Saving test_word.json to test_word.json
Saving train_word.json to train_word.json


 ### Bài 1: Xây dựng mô hình Transformer Encoder gồm 3 lớp theo mô tả trong nghiên cứu [Attention is all you need](https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf). Huấn luyện mô hình này cho bài toán phân loại domain câu bình luận trên bộ dữ liệu [UIT-ViOCD](https://drive.google.com/drive/folders/1Lu9axyLkw7dMx80uLRgvCnZsmNzhJWAa?usp=sharing).

In [None]:
import json
import torch
import torch.nn as nn
import numpy as np
import math
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import Counter

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

BATCH_SIZE = 32
MAX_LEN = 128
MIN_FREQ = 2
EPOCHS = 10
LEARNING_RATE = 1e-4

def load_data(json_file):
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    samples = list(data.values())
    return samples

def tokenize(text):
    return text.lower().split()

def build_vocab_and_labels(samples):
    counter = Counter()
    domains = set()

    for item in samples:
        text = item.get('review', '')
        domain = item.get('domain', '')

        tokens = tokenize(text)
        counter.update(tokens)
        if domain:
            domains.add(domain)

    vocab = {'<pad>': 0, '<unk>': 1}
    idx = 2
    for word, freq in counter.items():
        if freq >= MIN_FREQ:
            vocab[word] = idx
            idx += 1

    domain_map = {d: i for i, d in enumerate(sorted(list(domains)))}

    return vocab, domain_map

train_data = load_data('train.json')
dev_data = load_data('dev.json')
test_data = load_data('test.json')

vocab, domain_map = build_vocab_and_labels(train_data)
print(f"Vocab size: {len(vocab)}")
print(f"Danh sách Domain: {domain_map}")

class DomainClassificationDataset(Dataset):
    def __init__(self, samples, vocab, domain_map, max_len):
        self.samples = samples
        self.vocab = vocab
        self.domain_map = domain_map
        self.max_len = max_len

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        item = self.samples[idx]
        text = item.get('review', '')
        domain = item.get('domain', '')

        tokens = tokenize(text)
        indices = [self.vocab.get(t, self.vocab['<unk>']) for t in tokens]

        if len(indices) > self.max_len:
            indices = indices[:self.max_len]

        label_idx = self.domain_map.get(domain, -1)

        return torch.tensor(indices, dtype=torch.long), torch.tensor(label_idx, dtype=torch.long)

def collate_fn(batch):
    sentences, labels = zip(*batch)
    padded_sentences = pad_sequence(sentences, batch_first=True, padding_value=0)
    return padded_sentences, torch.stack(labels)

train_dataset = DomainClassificationDataset(train_data, vocab, domain_map, MAX_LEN)
dev_dataset = DomainClassificationDataset(dev_data, vocab, domain_map, MAX_LEN)
test_dataset = DomainClassificationDataset(test_data, vocab, domain_map, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

Using device: cuda
Vocab size: 2759
Danh sách Domain: {'app': 0, 'cosmetic': 1, 'fashion': 2, 'mobile': 3}


In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

class ScaledDotProductAttention(nn.Module):
    def __init__(self, d_k):
        super().__init__()
        self.d_k = d_k

    def forward(self, q, k, v, mask=None):
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        attn = torch.softmax(scores, dim=-1)
        output = torch.matmul(attn, v)
        return output

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_head):
        super().__init__()
        assert d_model % n_head == 0
        self.d_k = d_model // n_head
        self.n_head = n_head
        self.d_model = d_model

        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.fc = nn.Linear(d_model, d_model)
        self.attention = ScaledDotProductAttention(self.d_k)

    def forward(self, q, k, v, mask=None):
        batch_size = q.size(0)

        q = self.w_q(q).view(batch_size, -1, self.n_head, self.d_k).transpose(1, 2)
        k = self.w_k(k).view(batch_size, -1, self.n_head, self.d_k).transpose(1, 2)
        v = self.w_v(v).view(batch_size, -1, self.n_head, self.d_k).transpose(1, 2)

        if mask is not None:
            mask = mask.unsqueeze(1).unsqueeze(2)

        context = self.attention(q, k, v, mask)

        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        output = self.fc(context)
        return output

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.linear2(self.dropout(self.relu(self.linear1(x))))

class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, n_head, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_head)
        self.ffw = FeedForward(d_model, d_ff, dropout)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))

        ff_output = self.ffw(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

class DomainClassifierModel(nn.Module):
    def __init__(self, vocab_size, num_classes, d_model=256, n_head=4, num_layers=3, d_ff=512, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.pe = PositionalEncoding(d_model, dropout=dropout)

        self.layers = nn.ModuleList([
            TransformerEncoderLayer(d_model, n_head, d_ff, dropout)
            for _ in range(num_layers)
        ])

        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, src, mask=None):
        x = self.embedding(src)
        x = self.pe(x)

        for layer in self.layers:
            x = layer(x, mask)

        x = x.mean(dim=1)

        logits = self.classifier(x)
        return logits

In [None]:
model = DomainClassifierModel(
    vocab_size=len(vocab),
    num_classes=len(domain_map),
    d_model=256,
    n_head=4,
    num_layers=3,
    d_ff=512
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

def train(model, loader, optimizer, criterion):
    model.train()
    epoch_loss = 0
    correct = 0
    total = 0

    for inputs, labels in loader:
        inputs, labels = inputs.to(device), labels.to(device)

        mask = (inputs != 0).to(device)

        optimizer.zero_grad()
        outputs = model(inputs, mask)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    return epoch_loss / len(loader), correct / total

def evaluate(model, loader, criterion):
    model.eval()
    epoch_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            mask = (inputs != 0).to(device)

            outputs = model(inputs, mask)
            loss = criterion(outputs, labels)

            epoch_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return epoch_loss / len(loader), correct / total

print("Huấn luyện mô hình:")
for epoch in range(EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    val_loss, val_acc = evaluate(model, dev_loader, criterion)

    print(f"Epoch {epoch+1}/{EPOCHS}")
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val Loss:   {val_loss:.4f} | Val Acc:   {val_acc:.4f}")

test_loss, test_acc = evaluate(model, test_loader, criterion)
print(f"Kết quả trên tập Test:")
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Acc:  {test_acc:.4f}")

Huấn luyện mô hình:
Epoch 1/10
Train Loss: 0.8240 | Train Acc: 0.6813
Val Loss:   0.5728 | Val Acc:   0.7828
Epoch 2/10
Train Loss: 0.4302 | Train Acc: 0.8443
Val Loss:   0.4425 | Val Acc:   0.8358
Epoch 3/10
Train Loss: 0.3362 | Train Acc: 0.8767
Val Loss:   0.3306 | Val Acc:   0.8741
Epoch 4/10
Train Loss: 0.2672 | Train Acc: 0.8997
Val Loss:   0.2906 | Val Acc:   0.8887
Epoch 5/10
Train Loss: 0.2276 | Train Acc: 0.9134
Val Loss:   0.3101 | Val Acc:   0.8905
Epoch 6/10
Train Loss: 0.1896 | Train Acc: 0.9339
Val Loss:   0.3187 | Val Acc:   0.8832
Epoch 7/10
Train Loss: 0.1710 | Train Acc: 0.9378
Val Loss:   0.3157 | Val Acc:   0.8869
Epoch 8/10
Train Loss: 0.1520 | Train Acc: 0.9473
Val Loss:   0.3479 | Val Acc:   0.8942
Epoch 9/10
Train Loss: 0.1343 | Train Acc: 0.9494
Val Loss:   0.4016 | Val Acc:   0.8887
Epoch 10/10
Train Loss: 0.1220 | Train Acc: 0.9558
Val Loss:   0.3709 | Val Acc:   0.8960
Kết quả trên tập Test:
Test Loss: 0.2838
Test Acc:  0.9217


### Bài 2: Xây dựng mô hình Transformer Encoder gồm 3 lớp theo mô tả trong nghiên cứu [Attention is all you need](https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf). Huấn luyện mô hình này cho bài toán gán nhãn chuỗi trên bộ dữ liệu [PhoNERT](https://github.com/VinAIResearch/PhoNER_COVID19).

In [5]:

def load_data_jsonl(file_path):
    data = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    data.append(json.loads(line))
    except FileNotFoundError:
        print(f"Warning: File {file_path} not found.")
    return data

def build_vocab_and_tags(data_list):
    counter = Counter()
    tags = set()

    for item in data_list:
        words = item['words']
        item_tags = item['tags']

        words = [w.lower() for w in words]
        counter.update(words)
        tags.update(item_tags)

    vocab = {'<pad>': 0, '<unk>': 1}
    idx = 2
    for word, freq in counter.items():
        if freq >= MIN_FREQ:
            vocab[word] = idx
            idx += 1

    tag_list = sorted(list(tags))
    tag_map = {t: i+1 for i, t in enumerate(tag_list)}
    tag_map['<pad>'] = 0

    return vocab, tag_map

train_data = load_data_jsonl('train_word.json')
dev_data = load_data_jsonl('dev_word.json')
test_data = load_data_jsonl('test_word.json')

vocab, tag_map = build_vocab_and_tags(train_data)
print(f"Vocab size: {len(vocab)}")
print(f"Num tags: {len(tag_map)}")
print(f"Tags: {tag_map}")

class NERDataset(Dataset):
    def __init__(self, data, vocab, tag_map, max_len):
        self.data = data
        self.vocab = vocab
        self.tag_map = tag_map
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        words = item['words']
        tags = item['tags']

        word_indices = [self.vocab.get(w.lower(), self.vocab['<unk>']) for w in words]

        tag_indices = [self.tag_map.get(t, 0) for t in tags]

        if len(word_indices) > self.max_len:
            word_indices = word_indices[:self.max_len]
            tag_indices = tag_indices[:self.max_len]

        return torch.tensor(word_indices, dtype=torch.long), torch.tensor(tag_indices, dtype=torch.long)

def collate_fn(batch):
    words, tags = zip(*batch)
    padded_words = pad_sequence(words, batch_first=True, padding_value=0)
    padded_tags = pad_sequence(tags, batch_first=True, padding_value=0)

    return padded_words, padded_tags

train_dataset = NERDataset(train_data, vocab, tag_map, MAX_LEN)
dev_dataset = NERDataset(dev_data, vocab, tag_map, MAX_LEN)
test_dataset = NERDataset(test_data, vocab, tag_map, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

Using device: cuda
Vocab size: 3043
Num tags: 21
Tags: {'B-AGE': 1, 'B-DATE': 2, 'B-GENDER': 3, 'B-JOB': 4, 'B-LOCATION': 5, 'B-NAME': 6, 'B-ORGANIZATION': 7, 'B-PATIENT_ID': 8, 'B-SYMPTOM_AND_DISEASE': 9, 'B-TRANSPORTATION': 10, 'I-AGE': 11, 'I-DATE': 12, 'I-JOB': 13, 'I-LOCATION': 14, 'I-NAME': 15, 'I-ORGANIZATION': 16, 'I-PATIENT_ID': 17, 'I-SYMPTOM_AND_DISEASE': 18, 'I-TRANSPORTATION': 19, 'O': 20, '<pad>': 0}


In [6]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

class ScaledDotProductAttention(nn.Module):
    def __init__(self, d_k):
        super().__init__()
        self.d_k = d_k

    def forward(self, q, k, v, mask=None):
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            # Mask (batch, 1, 1, seq_len)
            scores = scores.masked_fill(mask == 0, -1e9)

        attn = torch.softmax(scores, dim=-1)
        output = torch.matmul(attn, v)
        return output

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_head):
        super().__init__()
        assert d_model % n_head == 0
        self.d_k = d_model // n_head
        self.n_head = n_head
        self.d_model = d_model

        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.fc = nn.Linear(d_model, d_model)
        self.attention = ScaledDotProductAttention(self.d_k)

    def forward(self, q, k, v, mask=None):
        batch_size = q.size(0)

        q = self.w_q(q).view(batch_size, -1, self.n_head, self.d_k).transpose(1, 2)
        k = self.w_k(k).view(batch_size, -1, self.n_head, self.d_k).transpose(1, 2)
        v = self.w_v(v).view(batch_size, -1, self.n_head, self.d_k).transpose(1, 2)

        if mask is not None:
            mask = mask.unsqueeze(1).unsqueeze(2)

        context = self.attention(q, k, v, mask)

        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        output = self.fc(context)
        return output

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.linear2(self.dropout(self.relu(self.linear1(x))))

class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, n_head, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_head)
        self.ffw = FeedForward(d_model, d_ff, dropout)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))

        ff_output = self.ffw(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

class TransformerForTokenClassification(nn.Module):
    def __init__(self, vocab_size, num_tags, d_model=256, n_head=4, num_layers=3, d_ff=512, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.pe = PositionalEncoding(d_model, dropout=dropout)

        self.layers = nn.ModuleList([
            TransformerEncoderLayer(d_model, n_head, d_ff, dropout)
            for _ in range(num_layers)
        ])

        self.classifier = nn.Linear(d_model, num_tags)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, mask=None):
        x = self.embedding(src)
        x = self.pe(x)

        for layer in self.layers:
            x = layer(x, mask)

        x = self.dropout(x)
        logits = self.classifier(x)
        return logits

In [7]:
model = TransformerForTokenClassification(
    vocab_size=len(vocab),
    num_tags=len(tag_map),
    d_model=256,
    n_head=4,
    num_layers=3,
    d_ff=512
).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

def train_ner(model, loader, optimizer, criterion):
    model.train()
    epoch_loss = 0
    correct_tokens = 0
    total_tokens = 0

    for inputs, tags in loader:
        inputs, tags = inputs.to(device), tags.to(device)

        mask = (inputs != 0).to(device)

        optimizer.zero_grad()
        outputs = model(inputs, mask)

        outputs_flat = outputs.view(-1, outputs.shape[-1])
        tags_flat = tags.view(-1)

        loss = criterion(outputs_flat, tags_flat)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        predictions = torch.argmax(outputs, dim=-1)
        active_mask = (tags != 0)

        correct_tokens += (predictions[active_mask] == tags[active_mask]).sum().item()
        total_tokens += active_mask.sum().item()

    return epoch_loss / len(loader), correct_tokens / total_tokens if total_tokens > 0 else 0

def evaluate_ner(model, loader, criterion):
    model.eval()
    epoch_loss = 0
    correct_tokens = 0
    total_tokens = 0

    with torch.no_grad():
        for inputs, tags in loader:
            inputs, tags = inputs.to(device), tags.to(device)
            mask = (inputs != 0).to(device)

            outputs = model(inputs, mask)

            outputs_flat = outputs.view(-1, outputs.shape[-1])
            tags_flat = tags.view(-1)
            loss = criterion(outputs_flat, tags_flat)
            epoch_loss += loss.item()

            predictions = torch.argmax(outputs, dim=-1)
            active_mask = (tags != 0)

            correct_tokens += (predictions[active_mask] == tags[active_mask]).sum().item()
            total_tokens += active_mask.sum().item()

    return epoch_loss / len(loader), correct_tokens / total_tokens if total_tokens > 0 else 0

print("Huấn luyện mô hình:")
for epoch in range(EPOCHS):
    train_loss, train_acc = train_ner(model, train_loader, optimizer, criterion)
    val_loss, val_acc = evaluate_ner(model, dev_loader, criterion)

    print(f"Epoch {epoch+1}/{EPOCHS}")
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val Loss:   {val_loss:.4f} | Val Acc:   {val_acc:.4f}")

test_loss, test_acc = evaluate_ner(model, test_loader, criterion)
print(f"Kết quả trên tập Test:")
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Acc: {test_acc:.4f}")

Huấn luyện mô hình:
Epoch 1/10
Train Loss: 0.9378 | Train Acc: 0.7875
Val Loss:   0.7466 | Val Acc:   0.7971
Epoch 2/10
Train Loss: 0.5539 | Train Acc: 0.8504
Val Loss:   0.5352 | Val Acc:   0.8408
Epoch 3/10
Train Loss: 0.4185 | Train Acc: 0.8781
Val Loss:   0.4376 | Val Acc:   0.8647
Epoch 4/10
Train Loss: 0.3400 | Train Acc: 0.8976
Val Loss:   0.3727 | Val Acc:   0.8831
Epoch 5/10
Train Loss: 0.2864 | Train Acc: 0.9122
Val Loss:   0.3292 | Val Acc:   0.8962
Epoch 6/10
Train Loss: 0.2450 | Train Acc: 0.9243
Val Loss:   0.2983 | Val Acc:   0.9046
Epoch 7/10
Train Loss: 0.2174 | Train Acc: 0.9318
Val Loss:   0.2932 | Val Acc:   0.9075
Epoch 8/10
Train Loss: 0.1926 | Train Acc: 0.9392
Val Loss:   0.2631 | Val Acc:   0.9164
Epoch 9/10
Train Loss: 0.1746 | Train Acc: 0.9444
Val Loss:   0.2587 | Val Acc:   0.9169
Epoch 10/10
Train Loss: 0.1594 | Train Acc: 0.9490
Val Loss:   0.2424 | Val Acc:   0.9240
Kết quả trên tập Test:
Test Loss: 0.2725
Test Acc: 0.9185
