### Task 1

1. Tải dữ liệu từ Hugging Face (cần downgrade `datasets` xuống version 3.2.0 để hỗ trợ remote code)

In [4]:
!pip install -q datasets==3.2.0

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.0 requires fsspec==2025.3.0, but you have fsspec 2024.9.0 which is incompatible.[0m[31m
[0m

In [5]:
from datasets import load_dataset

ds = load_dataset("conll2003", trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

conll2003.py: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

2. Trích xuất câu và nhãn

In [6]:
train_sentences = ds['train']['tokens']
train_tags = ds['train']['ner_tags']
val_sentences = ds['validation']['tokens']
val_tags = ds['validation']['ner_tags']
test_sentences = ds['test']['tokens']
test_tags = ds['test']['ner_tags']

In [7]:
tag_names = ds['train'].features['ner_tags'].feature.names
idx2tag = {idx: tag for idx, tag in enumerate(tag_names)}
train_tag_names = []
for tag_ids in train_tags:
    train_tag_names.append([idx2tag[tag_id] for tag_id in tag_ids])
val_tag_names = []
for tag_ids in val_tags:
    val_tag_names.append([idx2tag[tag_id] for tag_id in tag_ids])
test_tag_names = []
for tag_ids in test_tags:
    test_tag_names.append([idx2tag[tag_id] for tag_id in tag_ids])

# sanity check
train_tag_names[:2]

[['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'],
 ['B-PER', 'I-PER']]

3. Xây dựng vocab

In [8]:
# word2idx
pad_token = '<PAD>'
pad_token_id = 0
unk_token = '<UNK>'
unk_token_id = 1

word2idx = {pad_token: pad_token_id, unk_token: unk_token_id}

words_set = set()
for sent in train_sentences:
    for word in sent:
        words_set.add(word)
word2idx.update({word: idx+2 for idx, word in enumerate(list(words_set))})

# tag2idx
tag_names = ds['train'].features['ner_tags'].feature.names
tag2idx = {tag: idx for idx, tag in enumerate(tag_names)}

print(f"Vocab size: {len(word2idx)}")
print(f"Num tag: {len(tag2idx)}")

Vocab size: 23625
Num tag: 9


### Task 2

1. Tạo lớp NER dataset

In [19]:
import torch
from torch.utils.data import Dataset

class NERDataset(Dataset):
    def __init__(self, sentences, tags, word2idx, tag2idx, pad_token='<PAD>', unk_token='<UNK>'):
        super().__init__()
        self.sentences = sentences
        self.tags = tags
        self.word2idx = word2idx
        self.tag2idx = tag2idx
        self.pad_token = pad_token
        self.unk_token = unk_token

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        tag = self.tags[idx]
        input_ids = torch.tensor([self.word2idx[token] if token in self.word2idx.keys() else self.word2idx[self.unk_token] for token in sentence], dtype=torch.long)
        tag_ids = torch.tensor([self.tag2idx[tag_id] for tag_id in tag], dtype=torch.long)
        return input_ids, tag_ids

# sanity check
dataset = NERDataset(train_sentences, train_tag_names, word2idx, tag2idx)
dataset[0]

(tensor([ 3043, 11874, 16177,  4916, 23167,  6324, 12205, 22790, 22699]),
 tensor([3, 0, 7, 0, 0, 0, 7, 0, 0]))

2. Tạo data loader

In [20]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def collate_fn_with_padding(batch):
    sentences, tag_ids = zip(*batch)
    sentence_lengths = torch.tensor([len(sent) for sent in sentences], dtype=torch.long)

    # thực hiện padding
    # với nhãn, padding bằng giá trị -100, đây là giá trị mặc định được bỏ qua bởi cross entropy loss
    sentences_padded = pad_sequence(sentences, padding_value=word2idx[pad_token], batch_first=True)
    tag_ids_padded = pad_sequence(tag_ids, padding_value=-100, batch_first=True)

    return sentences_padded, tag_ids_padded, sentence_lengths

train_ds = NERDataset(train_sentences, train_tag_names, word2idx, tag2idx)
val_ds = NERDataset(val_sentences, val_tag_names, word2idx, tag2idx)
test_ds = NERDataset(test_sentences, test_tag_names, word2idx, tag2idx)

train_loader = DataLoader(train_ds, batch_size=8, collate_fn=collate_fn_with_padding, shuffle=True, num_workers=2)
val_loader = DataLoader(val_ds, batch_size=8, collate_fn=collate_fn_with_padding, shuffle=False, num_workers=2)
test_loader = DataLoader(test_ds, batch_size=8, collate_fn=collate_fn_with_padding, shuffle=False, num_workers=2)

# sanity check
for input_ids, tag_ids, lengths in train_loader:
    print(input_ids.shape)
    print(tag_ids.shape)
    print(lengths)
    break

torch.Size([8, 33])
torch.Size([8, 33])
tensor([ 7,  8,  4, 26, 17, 33,  7,  3])


### Task 3

In [1]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class SimpleRNNForTokenClassification(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes, dropout_p=0.3):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.dropout_p = dropout_p

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=word2idx[pad_token])
        self.rnn = nn.RNN(embedding_dim, hidden_size ,batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input_seqs, seq_lens):
        emb = self.dropout(self.embedding(input_seqs))
        packed_emb = pack_padded_sequence(emb, seq_lens, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.rnn(packed_emb) # (batch, seq len, hidden)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        logits = self.fc(output) # fc tự động áp dụng lên dim cuối cùng
        return logits # (batch, seq len, num_classes)


Khởi tạo mô hình:

In [None]:
import torch.optim as optim
import torch.nn as nn
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

vocab_size = len(word2idx)
embedding_dim = 128
hidden_size = 256
num_classes = len(tag2idx)
model = SimpleRNNForTokenClassification(vocab_size, embedding_dim, hidden_size, num_classes).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=-100)

print(model)

SimpleRNNForTokenClassification(
  (embedding): Embedding(23625, 128, padding_idx=0)
  (rnn): RNN(128, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=9, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)


### Task 4 + 5

In [None]:
import torch
from tqdm import tqdm

def train_one_epoch(model, data_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0.0
    pbar = tqdm(data_loader, desc='Training', total=len(data_loader))

    for input_ids, tag_ids, seq_lens in pbar:
        input_ids = input_ids.to(device)
        tag_ids = tag_ids.to(device)

        optimizer.zero_grad()

        logits = model(input_ids, seq_lens).view(-1, model.num_classes)
        targets = tag_ids.view(-1)
        loss = criterion(logits, targets)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    epoch_loss = total_loss / len(data_loader)
    return epoch_loss

@torch.no_grad()
def evaluate(model, data_loader, criterion, device):
    model.eval()
    total_loss = 0.0
    pbar = tqdm(data_loader, desc='Evaluating', total=len(data_loader))
    total_correct = 0
    total_token = 0
    for input_ids, tag_ids, seq_lens in pbar:
        input_ids = input_ids.to(device)
        tag_ids = tag_ids.to(device)
        logits = model(input_ids, seq_lens).view(-1, model.num_classes)
        targets = tag_ids.view(-1)
        loss = criterion(logits, targets)
        total_loss += loss.item()
        # Tính accuracy
        preds = torch.argmax(logits, dim=-1)
        mask = targets != -100
        total_correct += (preds[mask] == targets[mask]).sum().item()
        total_token += mask.sum().item()

    validation_loss = total_loss / len(data_loader)
    accuracy = total_correct / total_token

    return validation_loss, accuracy

In [None]:
epochs = 50
best_accuracy = 0.0
for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}:")
    train_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, accuracy = evaluate(model, val_loader, criterion, device)
    print(f"Train loss: {train_loss:.5f} | Val loss: {val_loss:.5f} | Accuracy: {accuracy:.5f}")

    # lưu lại mô hình tốt nhất dựa trên validation accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), 'best_model.pth')
        print(f"Best model saved with accuracy: {best_accuracy:.5f}")

    print("==="*20)

print(f"Training end. Best validation accuracy: {best_accuracy}")


Epoch 1/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 121.41it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 206.22it/s]


Train loss: 0.75726 | Val loss: 0.69846 | Accuracy: 0.84045
Best model saved with accuracy: 0.84045

Epoch 2/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 123.49it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 255.76it/s]


Train loss: 0.59142 | Val loss: 0.60948 | Accuracy: 0.84821
Best model saved with accuracy: 0.84821

Epoch 3/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 126.16it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 255.41it/s]


Train loss: 0.53271 | Val loss: 0.55742 | Accuracy: 0.85698
Best model saved with accuracy: 0.85698

Epoch 4/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 126.38it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 261.74it/s]


Train loss: 0.49308 | Val loss: 0.51984 | Accuracy: 0.86434
Best model saved with accuracy: 0.86434

Epoch 5/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 125.22it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 232.96it/s]


Train loss: 0.45793 | Val loss: 0.48348 | Accuracy: 0.87197
Best model saved with accuracy: 0.87197

Epoch 6/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 126.87it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 259.17it/s]


Train loss: 0.43131 | Val loss: 0.48011 | Accuracy: 0.87358
Best model saved with accuracy: 0.87358

Epoch 7/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 127.68it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 257.44it/s]


Train loss: 0.40480 | Val loss: 0.43401 | Accuracy: 0.88501
Best model saved with accuracy: 0.88501

Epoch 8/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 125.93it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 230.20it/s]


Train loss: 0.38445 | Val loss: 0.42323 | Accuracy: 0.88741
Best model saved with accuracy: 0.88741

Epoch 9/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 124.30it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 261.66it/s]


Train loss: 0.36478 | Val loss: 0.40041 | Accuracy: 0.89282
Best model saved with accuracy: 0.89282

Epoch 10/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 126.78it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 257.00it/s]


Train loss: 0.34716 | Val loss: 0.39219 | Accuracy: 0.89617
Best model saved with accuracy: 0.89617

Epoch 11/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 126.38it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 254.50it/s]


Train loss: 0.33027 | Val loss: 0.37365 | Accuracy: 0.90041
Best model saved with accuracy: 0.90041

Epoch 12/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 124.37it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 217.94it/s]


Train loss: 0.31409 | Val loss: 0.38311 | Accuracy: 0.89934

Epoch 13/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 126.74it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 255.83it/s]


Train loss: 0.30023 | Val loss: 0.37438 | Accuracy: 0.90394
Best model saved with accuracy: 0.90394

Epoch 14/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 126.31it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 255.62it/s]


Train loss: 0.28603 | Val loss: 0.33850 | Accuracy: 0.90927
Best model saved with accuracy: 0.90927

Epoch 15/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 126.53it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 231.21it/s]


Train loss: 0.27545 | Val loss: 0.35527 | Accuracy: 0.90855

Epoch 16/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 122.29it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 257.73it/s]


Train loss: 0.26415 | Val loss: 0.33474 | Accuracy: 0.91389
Best model saved with accuracy: 0.91389

Epoch 17/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 126.56it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 261.61it/s]


Train loss: 0.25294 | Val loss: 0.33408 | Accuracy: 0.91457
Best model saved with accuracy: 0.91457

Epoch 18/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 125.33it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 256.99it/s]


Train loss: 0.24098 | Val loss: 0.33258 | Accuracy: 0.91579
Best model saved with accuracy: 0.91579

Epoch 19/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 124.10it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 213.38it/s]


Train loss: 0.23252 | Val loss: 0.31405 | Accuracy: 0.91942
Best model saved with accuracy: 0.91942

Epoch 20/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 125.93it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 252.51it/s]


Train loss: 0.22532 | Val loss: 0.32638 | Accuracy: 0.91815

Epoch 21/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 125.91it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 246.33it/s]


Train loss: 0.21551 | Val loss: 0.32192 | Accuracy: 0.92084
Best model saved with accuracy: 0.92084

Epoch 22/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 126.29it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 221.51it/s]


Train loss: 0.20647 | Val loss: 0.31108 | Accuracy: 0.92292
Best model saved with accuracy: 0.92292

Epoch 23/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 123.55it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 259.82it/s]


Train loss: 0.20057 | Val loss: 0.32265 | Accuracy: 0.92302
Best model saved with accuracy: 0.92302

Epoch 24/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 127.09it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 253.69it/s]


Train loss: 0.19179 | Val loss: 0.29482 | Accuracy: 0.92701
Best model saved with accuracy: 0.92701

Epoch 25/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 125.92it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 258.77it/s]


Train loss: 0.18745 | Val loss: 0.29794 | Accuracy: 0.92642

Epoch 26/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 124.72it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 217.47it/s]


Train loss: 0.17978 | Val loss: 0.29316 | Accuracy: 0.92866
Best model saved with accuracy: 0.92866

Epoch 27/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 124.81it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 260.82it/s]


Train loss: 0.17334 | Val loss: 0.30515 | Accuracy: 0.92777

Epoch 28/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 125.38it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 252.53it/s]


Train loss: 0.16708 | Val loss: 0.29025 | Accuracy: 0.93065
Best model saved with accuracy: 0.93065

Epoch 29/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 123.16it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 215.74it/s]


Train loss: 0.16032 | Val loss: 0.29073 | Accuracy: 0.93057

Epoch 30/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 123.12it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 256.02it/s]


Train loss: 0.15717 | Val loss: 0.29712 | Accuracy: 0.93092
Best model saved with accuracy: 0.93092

Epoch 31/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 125.61it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 257.09it/s]


Train loss: 0.15078 | Val loss: 0.27491 | Accuracy: 0.93295
Best model saved with accuracy: 0.93295

Epoch 32/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 125.16it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 256.69it/s]


Train loss: 0.14624 | Val loss: 0.29518 | Accuracy: 0.93188

Epoch 33/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 121.64it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 234.71it/s]


Train loss: 0.14165 | Val loss: 0.29048 | Accuracy: 0.93355
Best model saved with accuracy: 0.93355

Epoch 34/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 124.89it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 256.14it/s]


Train loss: 0.13796 | Val loss: 0.29686 | Accuracy: 0.93310

Epoch 35/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 124.76it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 256.37it/s]


Train loss: 0.13227 | Val loss: 0.28391 | Accuracy: 0.93491
Best model saved with accuracy: 0.93491

Epoch 36/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 126.63it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 212.05it/s]


Train loss: 0.13083 | Val loss: 0.29945 | Accuracy: 0.93448

Epoch 37/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 124.83it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 259.70it/s]


Train loss: 0.12369 | Val loss: 0.28217 | Accuracy: 0.93630
Best model saved with accuracy: 0.93630

Epoch 38/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 126.59it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 263.11it/s]


Train loss: 0.12310 | Val loss: 0.27268 | Accuracy: 0.93828
Best model saved with accuracy: 0.93828

Epoch 39/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 125.51it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 259.49it/s]


Train loss: 0.11913 | Val loss: 0.28134 | Accuracy: 0.93838
Best model saved with accuracy: 0.93838

Epoch 40/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 122.40it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 242.36it/s]


Train loss: 0.11449 | Val loss: 0.27207 | Accuracy: 0.93869
Best model saved with accuracy: 0.93869

Epoch 41/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 126.43it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 260.52it/s]


Train loss: 0.11140 | Val loss: 0.26616 | Accuracy: 0.93984
Best model saved with accuracy: 0.93984

Epoch 42/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 126.70it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 259.08it/s]


Train loss: 0.10942 | Val loss: 0.29512 | Accuracy: 0.93836

Epoch 43/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 126.17it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 206.48it/s]


Train loss: 0.10458 | Val loss: 0.28929 | Accuracy: 0.93816

Epoch 44/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 125.72it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 255.31it/s]


Train loss: 0.10117 | Val loss: 0.29151 | Accuracy: 0.93945

Epoch 45/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 125.30it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 257.99it/s]


Train loss: 0.09848 | Val loss: 0.27808 | Accuracy: 0.93959

Epoch 46/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 127.21it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 249.55it/s]


Train loss: 0.09627 | Val loss: 0.28380 | Accuracy: 0.94040
Best model saved with accuracy: 0.94040

Epoch 47/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 123.79it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 234.27it/s]


Train loss: 0.09329 | Val loss: 0.27470 | Accuracy: 0.94120
Best model saved with accuracy: 0.94120

Epoch 48/50:


Training: 100%|██████████| 1756/1756 [00:14<00:00, 125.28it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 253.08it/s]


Train loss: 0.09035 | Val loss: 0.28794 | Accuracy: 0.94107

Epoch 49/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 126.79it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 256.01it/s]


Train loss: 0.08825 | Val loss: 0.30253 | Accuracy: 0.94019

Epoch 50/50:


Training: 100%|██████████| 1756/1756 [00:13<00:00, 126.27it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 218.08it/s]

Train loss: 0.08731 | Val loss: 0.27863 | Accuracy: 0.94229
Best model saved with accuracy: 0.94229
Training end. Best validation accuracy: 0.9422919668237218





In [None]:
# lưu mô hình
torch.save(model.state_dict(), 'ner_part4.pth')

Test accuracy:

In [None]:
test_loss, test_accuracy = evaluate(model, test_loader, criterion, device)
print(f"\nTest loss: {test_loss}")
print(f"Test accuracy: {test_accuracy}")

Evaluating: 100%|██████████| 432/432 [00:01<00:00, 261.78it/s]


Test loss: 0.4565457258746998
Test accuracy: 0.9204264024981157





Đánh giá bằng `seqeval`:

In [18]:
!pip install -q seqeval

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [41]:
from seqeval.metrics import classification_report

y_true = []
y_pred = []

for input_ids, tag_ids, seq_lens in test_loader:
    mask_ids = []  # mask để loại bỏ các vị trí padding
    for label in tag_ids:
        y_true.append([idx2tag[tag_id] for tag_id in label.tolist() if tag_id != -100])
        mask_ids.append(label != -100)

    input_ids = input_ids.to(device)
    tag_ids = tag_ids.to(device)

    logits = model(input_ids, seq_lens)
    preds = torch.argmax(logits, dim=-1)
    for pred, mask in zip(preds, mask_ids):
        y_pred.append([idx2tag[tag_id] for tag_id in pred[mask].cpu().tolist()])

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         LOC       0.03      0.03      0.03      1668
        MISC       0.02      0.01      0.01       702
         ORG       0.07      0.04      0.05      1661
         PER       0.01      0.02      0.01      1617

   micro avg       0.02      0.03      0.02      5648
   macro avg       0.03      0.03      0.03      5648
weighted avg       0.03      0.03      0.03      5648



Predict:

In [2]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [16]:
from nltk.tokenize import word_tokenize
import torch

@torch.no_grad()
def predict(model, tokenizer, sentence, word2idx, idx2word, idx2tag, device):
    model.eval()
    tokens = tokenizer(sentence)
    token_ids = [word2idx[token] if token in word2idx.keys() else word2idx['<UNK>'] for token in tokens]

    input_ids = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
    seq_lens = torch.tensor([len(tokens)], dtype=torch.long)
    logits = model(input_ids, seq_lens)
    preds = torch.argmax(logits, dim=-1).squeeze(0)
    predicted_tags = [idx2tag[tag_id] for tag_id in preds.tolist()]
    return list(zip(tokens, predicted_tags))

sample_sentences = [
    "I love NLP",
    "This is Sparta!!!!!",
    "VNU University is located in Hanoi",
    "Paris is the capital city of France",
    "Linda is my pen pal",
    "The former director, Dr. Quill, flew from Kona International Airport in Hawaii to meet Senator P. O."
]

idx2word = {idx: word for word, idx in word2idx.items()}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

In [17]:
# load model từ checkpoint đã huấn luyện
import torch.optim as optim
import torch.nn as nn
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

vocab_size = len(word2idx)
embedding_dim = 128
hidden_size = 256
num_classes = len(tag2idx)
model = SimpleRNNForTokenClassification(vocab_size, embedding_dim, hidden_size, num_classes)
model.load_state_dict(torch.load('/content/ner_part4.pth'))
model = model.to(device)

for sent in sample_sentences:
    output = predict(model, word_tokenize, sent, word2idx, idx2word, idx2tag, device)
    print(output)

[('I', 'O'), ('love', 'B-ORG'), ('NLP', 'O')]
[('This', 'O'), ('is', 'I-PER'), ('Sparta', 'O'), ('!', 'O'), ('!', 'O'), ('!', 'O'), ('!', 'O'), ('!', 'O')]
[('VNU', 'B-PER'), ('University', 'I-ORG'), ('is', 'B-PER'), ('located', 'O'), ('in', 'O'), ('Hanoi', 'O')]
[('Paris', 'O'), ('is', 'I-PER'), ('the', 'I-PER'), ('capital', 'I-PER'), ('city', 'O'), ('of', 'O'), ('France', 'O')]
[('Linda', 'B-ORG'), ('is', 'I-PER'), ('my', 'O'), ('pen', 'O'), ('pal', 'O')]
[('The', 'O'), ('former', 'O'), ('director', 'O'), (',', 'O'), ('Dr.', 'O'), ('Quill', 'O'), (',', 'O'), ('flew', 'O'), ('from', 'O'), ('Kona', 'O'), ('International', 'O'), ('Airport', 'O'), ('in', 'O'), ('Hawaii', 'I-PER'), ('to', 'I-PER'), ('meet', 'I-PER'), ('Senator', 'O'), ('P.', 'O'), ('O', 'O'), ('.', 'I-PER')]


### Kết quả thực hiện (50 epochs):
- Độ chính xác trên tập validation: 0.9422 (94.22%)
- Độ chính xác trên tập test: 0.9204 (92.04%)

**Một điểm cần lưu ý:** Tuy accuracy trên tập validation và test rất cao nhưng đây không phải là một metrics phù hợp trong bài toán NER do đặc thù của bài toán là dữ liệu với nhãn O chiếm đa số, do đó, mô hình chỉ cần dự đoán đúng nhiều trên các nhãn này thì accuracy sẽ cao. Trong khi đó, các nhãn PER, ORG,... mới thực sự là các nhãn quan trọng.

Có thể thấy trong các ví dụ trên, mô hình dự đoán vẫn chưa được chính xác:
- Tên các địa danh như `Hanoi`, `Paris`, `France` bị gán nhãn O
- Tên tổ chức như `VNU` bị nhầm thành B-PER
- Tên người `Linda` bị nhầm thành tổ chức (B-ORG)  