### Task 1

1. Tải dữ liệu từ Hugging Face (cần downgrade `datasets` xuống version 3.2.0 để hỗ trợ remote code)

In [1]:
!pip install -q datasets==3.2.0

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m471.0/480.6 kB[0m [31m17.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.0 requires fsspec==2025.3.0, but you have fsspec 2024.9.0 which is incompatible.[0m[31m
[0m

In [2]:
from datasets import load_dataset

ds = load_dataset("conll2003", trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

conll2003.py: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

2. Trích xuất câu và nhãn

In [3]:
train_sentences = ds['train']['tokens']
train_tags = ds['train']['ner_tags']
val_sentences = ds['validation']['tokens']
val_tags = ds['validation']['ner_tags']
test_sentences = ds['test']['tokens']
test_tags = ds['test']['ner_tags']

In [4]:
tag_names = ds['train'].features['ner_tags'].feature.names
idx2tag = {idx: tag for idx, tag in enumerate(tag_names)}
train_tag_names = []
for tag_ids in train_tags:
    train_tag_names.append([idx2tag[tag_id] for tag_id in tag_ids])
val_tag_names = []
for tag_ids in val_tags:
    val_tag_names.append([idx2tag[tag_id] for tag_id in tag_ids])
test_tag_names = []
for tag_ids in test_tags:
    test_tag_names.append([idx2tag[tag_id] for tag_id in tag_ids])

# sanity check
train_tag_names[:2]

[['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'],
 ['B-PER', 'I-PER']]

3. Xây dựng vocab

In [5]:
# word2idx
pad_token = '<PAD>'
pad_token_id = 0
unk_token = '<UNK>'
unk_token_id = 1

word2idx = {pad_token: pad_token_id, unk_token: unk_token_id}

words_set = set()
for sent in train_sentences:
    for word in sent:
        words_set.add(word)
word2idx.update({word: idx+2 for idx, word in enumerate(list(words_set))})

# tag2idx
tag_names = ds['train'].features['ner_tags'].feature.names
tag2idx = {tag: idx for idx, tag in enumerate(tag_names)}

print(f"Vocab size: {len(word2idx)}")
print(f"Num tag: {len(tag2idx)}")

Vocab size: 23625
Num tag: 9


### Task 2

1. Tạo lớp NER dataset

In [6]:
import torch
from torch.utils.data import Dataset

class NERDataset(Dataset):
    def __init__(self, sentences, tags, word2idx, tag2idx, pad_token='<PAD>', unk_token='<UNK>'):
        super().__init__()
        self.sentences = sentences
        self.tags = tags
        self.word2idx = word2idx
        self.tag2idx = tag2idx
        self.pad_token = pad_token
        self.unk_token = unk_token

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        tag = self.tags[idx]
        input_ids = torch.tensor([self.word2idx[token] if token in self.word2idx.keys() else self.word2idx[self.unk_token] for token in sentence], dtype=torch.long)
        tag_ids = torch.tensor([self.tag2idx[tag_id] for tag_id in tag], dtype=torch.long)
        return input_ids, tag_ids

# sanity check
dataset = NERDataset(train_sentences, train_tag_names, word2idx, tag2idx)
dataset[0]

(tensor([14048, 12171, 11095, 15770, 22237, 19475, 12272, 18737,  7201]),
 tensor([3, 0, 7, 0, 0, 0, 7, 0, 0]))

2. Tạo data loader

In [None]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def collate_fn_with_padding(batch):
    sentences, tag_ids = zip(*batch)

    # thực hiện padding
    # với nhãn, padding bằng giá trị -100, đây là giá trị mặc định được bỏ qua bởi cross entropy loss
    sentences_padded = pad_sequence(sentences, padding_value=word2idx[pad_token], batch_first=True)
    tag_ids_padded = pad_sequence(tag_ids, padding_value=-100, batch_first=True)

    return sentences_padded, tag_ids_padded

train_ds = NERDataset(train_sentences, train_tag_names, word2idx, tag2idx)
val_ds = NERDataset(val_sentences, val_tag_names, word2idx, tag2idx)
test_ds = NERDataset(test_sentences, test_tag_names, word2idx, tag2idx)

train_loader = DataLoader(train_ds, batch_size=8, collate_fn=collate_fn_with_padding, shuffle=True, num_workers=2)
val_loader = DataLoader(val_ds, batch_size=8, collate_fn=collate_fn_with_padding, shuffle=False, num_workers=2)
test_loader = DataLoader(test_ds, batch_size=8, collate_fn=collate_fn_with_padding, shuffle=False, num_workers=2)

# sanity check
for input_ids, tag_ids in train_loader:
    print(input_ids.shape)
    print(tag_ids.shape)
    break

torch.Size([8, 34])
torch.Size([8, 34])


### Task 3

In [8]:
import torch.nn as nn

class SimpleRNNForTokenClassification(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes, dropout_p=0.3):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.dropout_p = dropout_p

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=word2idx[pad_token])
        self.rnn = nn.RNN(embedding_dim, hidden_size ,batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input_seqs):
        emb = self.dropout(self.embedding(input_seqs))
        output, _ = self.rnn(emb) # (batch, seq len, hidden)
        logits = self.fc(output) # fc tự động áp dụng lên dim cuối cùng
        return logits # (batch, seq len, num_classes)


Khởi tạo mô hình:

In [13]:
import torch.optim as optim
import torch.nn as nn
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

vocab_size = len(word2idx)
embedding_dim = 128
hidden_size = 256
num_classes = len(tag2idx)
model = SimpleRNNForTokenClassification(vocab_size, embedding_dim, hidden_size, num_classes).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=-100)

print(model)

SimpleRNNForTokenClassification(
  (embedding): Embedding(23625, 128, padding_idx=0)
  (rnn): RNN(128, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=9, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)


### Task 4 + 5

In [10]:
import torch
from tqdm import tqdm

def train_one_epoch(model, data_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0.0
    pbar = tqdm(data_loader, desc='Training', total=len(data_loader))

    for input_ids, tag_ids in pbar:
        input_ids = input_ids.to(device)
        tag_ids = tag_ids.to(device)

        optimizer.zero_grad()

        logits = model(input_ids).view(-1, model.num_classes)
        targets = tag_ids.view(-1)
        loss = criterion(logits, targets)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    epoch_loss = total_loss / len(data_loader)
    return epoch_loss

@torch.no_grad()
def evaluate(model, data_loader, criterion, device):
    model.eval()
    total_loss = 0.0
    pbar = tqdm(data_loader, desc='Evaluating', total=len(data_loader))
    total_correct = 0
    total_token = 0
    for input_ids, tag_ids in pbar:
        input_ids = input_ids.to(device)
        tag_ids = tag_ids.to(device)
        logits = model(input_ids).view(-1, model.num_classes)
        targets = tag_ids.view(-1)
        loss = criterion(logits, targets)
        total_loss += loss.item()
        # Tính accuracy
        preds = torch.argmax(logits, dim=-1)
        mask = targets != -100
        total_correct += (preds[mask] == targets[mask]).sum().item()
        total_token += mask.sum().item()

    validation_loss = total_loss / len(data_loader)
    accuracy = total_correct / total_token

    return validation_loss, accuracy

In [14]:
epochs = 50
best_accuracy = 0.0
for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}:")
    train_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, accuracy = evaluate(model, val_loader, criterion, device)
    print(f"Train loss: {train_loss:.5f} | Val loss: {val_loss:.5f} | Accuracy: {accuracy:.5f}")

    # lưu lại mô hình tốt nhất dựa trên validation accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), 'best_model.pth')
        print(f"Best model saved with accuracy: {best_accuracy:.5f}")

    print("==="*20)

print(f"Training end. Best validation accuracy: {best_accuracy}")


Epoch 1/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 209.52it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 369.23it/s]


Train loss: 0.76331 | Val loss: 0.72008 | Accuracy: 0.84062
Best model saved with accuracy: 0.84062

Epoch 2/50:


Training: 100%|██████████| 1756/1756 [00:09<00:00, 189.00it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 375.46it/s]


Train loss: 0.59714 | Val loss: 0.62357 | Accuracy: 0.84905
Best model saved with accuracy: 0.84905

Epoch 3/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 210.29it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 381.32it/s]


Train loss: 0.54193 | Val loss: 0.56035 | Accuracy: 0.85754
Best model saved with accuracy: 0.85754

Epoch 4/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 207.99it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 376.00it/s]


Train loss: 0.49765 | Val loss: 0.51836 | Accuracy: 0.86545
Best model saved with accuracy: 0.86545

Epoch 5/50:


Training: 100%|██████████| 1756/1756 [00:07<00:00, 224.18it/s]
Evaluating: 100%|██████████| 407/407 [00:02<00:00, 198.18it/s]


Train loss: 0.46591 | Val loss: 0.49820 | Accuracy: 0.86891
Best model saved with accuracy: 0.86891

Epoch 6/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 210.73it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 376.22it/s]


Train loss: 0.43608 | Val loss: 0.46006 | Accuracy: 0.87619
Best model saved with accuracy: 0.87619

Epoch 7/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 202.75it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 381.16it/s]


Train loss: 0.41026 | Val loss: 0.43536 | Accuracy: 0.88268
Best model saved with accuracy: 0.88268

Epoch 8/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 196.56it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 352.01it/s]


Train loss: 0.38926 | Val loss: 0.41636 | Accuracy: 0.88801
Best model saved with accuracy: 0.88801

Epoch 9/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 209.77it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 378.55it/s]


Train loss: 0.36576 | Val loss: 0.40622 | Accuracy: 0.89148
Best model saved with accuracy: 0.89148

Epoch 10/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 218.86it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 290.54it/s]


Train loss: 0.34890 | Val loss: 0.39175 | Accuracy: 0.89498
Best model saved with accuracy: 0.89498

Epoch 11/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 215.13it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 374.90it/s]


Train loss: 0.33312 | Val loss: 0.37110 | Accuracy: 0.90106
Best model saved with accuracy: 0.90106

Epoch 12/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 209.31it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 381.22it/s]


Train loss: 0.31693 | Val loss: 0.35323 | Accuracy: 0.90474
Best model saved with accuracy: 0.90474

Epoch 13/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 207.90it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 381.15it/s]


Train loss: 0.30287 | Val loss: 0.36386 | Accuracy: 0.90355

Epoch 14/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 216.97it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 307.38it/s]


Train loss: 0.29083 | Val loss: 0.34572 | Accuracy: 0.90986
Best model saved with accuracy: 0.90986

Epoch 15/50:


Training: 100%|██████████| 1756/1756 [00:07<00:00, 220.11it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 378.72it/s]


Train loss: 0.27860 | Val loss: 0.34422 | Accuracy: 0.90976

Epoch 16/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 210.36it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 377.90it/s]


Train loss: 0.26602 | Val loss: 0.32779 | Accuracy: 0.91254
Best model saved with accuracy: 0.91254

Epoch 17/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 209.31it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 384.69it/s]


Train loss: 0.25694 | Val loss: 0.32857 | Accuracy: 0.91422
Best model saved with accuracy: 0.91422

Epoch 18/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 209.68it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 376.33it/s]


Train loss: 0.24669 | Val loss: 0.33273 | Accuracy: 0.91550
Best model saved with accuracy: 0.91550

Epoch 19/50:


Training: 100%|██████████| 1756/1756 [00:07<00:00, 219.90it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 312.56it/s]


Train loss: 0.23636 | Val loss: 0.30390 | Accuracy: 0.92082
Best model saved with accuracy: 0.92082

Epoch 20/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 209.13it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 356.71it/s]


Train loss: 0.22885 | Val loss: 0.31104 | Accuracy: 0.91984

Epoch 21/50:


Training: 100%|██████████| 1756/1756 [00:10<00:00, 166.14it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 364.28it/s]


Train loss: 0.21911 | Val loss: 0.29483 | Accuracy: 0.92335
Best model saved with accuracy: 0.92335

Epoch 22/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 202.56it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 323.09it/s]


Train loss: 0.20980 | Val loss: 0.31300 | Accuracy: 0.92136

Epoch 23/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 206.67it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 374.42it/s]


Train loss: 0.20435 | Val loss: 0.29040 | Accuracy: 0.92582
Best model saved with accuracy: 0.92582

Epoch 24/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 213.77it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 271.47it/s]


Train loss: 0.19670 | Val loss: 0.28459 | Accuracy: 0.92627
Best model saved with accuracy: 0.92627

Epoch 25/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 211.25it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 378.94it/s]


Train loss: 0.18773 | Val loss: 0.27274 | Accuracy: 0.92866
Best model saved with accuracy: 0.92866

Epoch 26/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 203.67it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 365.00it/s]


Train loss: 0.18223 | Val loss: 0.27221 | Accuracy: 0.93008
Best model saved with accuracy: 0.93008

Epoch 27/50:


Training: 100%|██████████| 1756/1756 [00:11<00:00, 152.04it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 284.04it/s]


Train loss: 0.17830 | Val loss: 0.28416 | Accuracy: 0.93008

Epoch 28/50:


Training: 100%|██████████| 1756/1756 [00:09<00:00, 182.26it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 324.07it/s]


Train loss: 0.16997 | Val loss: 0.27613 | Accuracy: 0.93086
Best model saved with accuracy: 0.93086

Epoch 29/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 208.35it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 278.98it/s]


Train loss: 0.16496 | Val loss: 0.27567 | Accuracy: 0.93197
Best model saved with accuracy: 0.93197

Epoch 30/50:


Training: 100%|██████████| 1756/1756 [00:09<00:00, 189.40it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 360.85it/s]


Train loss: 0.16184 | Val loss: 0.26745 | Accuracy: 0.93178

Epoch 31/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 196.89it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 282.17it/s]


Train loss: 0.15341 | Val loss: 0.27509 | Accuracy: 0.93119

Epoch 32/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 218.76it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 376.32it/s]


Train loss: 0.14913 | Val loss: 0.26372 | Accuracy: 0.93361
Best model saved with accuracy: 0.93361

Epoch 33/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 207.48it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 372.80it/s]


Train loss: 0.14380 | Val loss: 0.29153 | Accuracy: 0.93160

Epoch 34/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 210.02it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 380.89it/s]


Train loss: 0.14183 | Val loss: 0.25159 | Accuracy: 0.93526
Best model saved with accuracy: 0.93526

Epoch 35/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 209.20it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 379.01it/s]


Train loss: 0.13694 | Val loss: 0.26113 | Accuracy: 0.93515

Epoch 36/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 208.25it/s]
Evaluating: 100%|██████████| 407/407 [00:02<00:00, 166.30it/s]


Train loss: 0.13128 | Val loss: 0.24472 | Accuracy: 0.93859
Best model saved with accuracy: 0.93859

Epoch 37/50:


Training: 100%|██████████| 1756/1756 [00:09<00:00, 188.25it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 304.00it/s]


Train loss: 0.12810 | Val loss: 0.26257 | Accuracy: 0.93696

Epoch 38/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 214.34it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 381.00it/s]


Train loss: 0.12610 | Val loss: 0.24229 | Accuracy: 0.93801

Epoch 39/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 208.92it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 376.82it/s]


Train loss: 0.12041 | Val loss: 0.24097 | Accuracy: 0.93933
Best model saved with accuracy: 0.93933

Epoch 40/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 211.00it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 379.20it/s]


Train loss: 0.11756 | Val loss: 0.26158 | Accuracy: 0.93803

Epoch 41/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 215.46it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 268.10it/s]


Train loss: 0.11461 | Val loss: 0.23998 | Accuracy: 0.94005
Best model saved with accuracy: 0.94005

Epoch 42/50:


Training: 100%|██████████| 1756/1756 [00:07<00:00, 220.12it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 365.90it/s]


Train loss: 0.11021 | Val loss: 0.24124 | Accuracy: 0.94052
Best model saved with accuracy: 0.94052

Epoch 43/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 209.68it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 382.79it/s]


Train loss: 0.10775 | Val loss: 0.26497 | Accuracy: 0.93865

Epoch 44/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 208.15it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 376.29it/s]


Train loss: 0.10588 | Val loss: 0.25511 | Accuracy: 0.94075
Best model saved with accuracy: 0.94075

Epoch 45/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 211.97it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 323.19it/s]


Train loss: 0.10280 | Val loss: 0.26103 | Accuracy: 0.94077
Best model saved with accuracy: 0.94077

Epoch 46/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 198.22it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 289.99it/s]


Train loss: 0.10015 | Val loss: 0.25148 | Accuracy: 0.94066

Epoch 47/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 213.65it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 379.66it/s]


Train loss: 0.09515 | Val loss: 0.23701 | Accuracy: 0.94291
Best model saved with accuracy: 0.94291

Epoch 48/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 207.02it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 386.42it/s]


Train loss: 0.09374 | Val loss: 0.26083 | Accuracy: 0.94163

Epoch 49/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 209.89it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 376.01it/s]


Train loss: 0.09210 | Val loss: 0.25347 | Accuracy: 0.94229

Epoch 50/50:


Training: 100%|██████████| 1756/1756 [00:08<00:00, 211.91it/s]
Evaluating: 100%|██████████| 407/407 [00:01<00:00, 289.76it/s]

Train loss: 0.08838 | Val loss: 0.24607 | Accuracy: 0.94274
Training end. Best validation accuracy: 0.9429149955219812





In [26]:
# lưu mô hình
torch.save(model.state_dict(), 'ner_part4.pth')

Test accuracy:

In [17]:
test_loss, test_accuracy = evaluate(model, test_loader, criterion, device)
print(f"\nTest loss: {test_loss}")
print(f"Test accuracy: {test_accuracy}")

Evaluating: 100%|██████████| 432/432 [00:01<00:00, 317.30it/s]


Test loss: 0.41029281684448643
Test accuracy: 0.9213093571659309





Predict:

In [19]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
from nltk.tokenize import word_tokenize
import torch

@torch.no_grad()
def predict(model, tokenizer, sentence, word2idx, idx2tag, device):
    model.eval()
    tokens = tokenizer(sentence)
    token_ids = [word2idx[token] if token in word2idx.keys() else word2idx['<UNK>'] for token in tokens]
    input_ids = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
    logits = model(input_ids)
    preds = torch.argmax(logits, dim=-1).squeeze(0)
    predicted_tags = [idx2tag[tag_id] for tag_id in preds.tolist()]
    return list(zip(tokens, predicted_tags))

sample_sentences = [
    "I love NLP",
    "This is Sparta!!!!!",
    "VNU University is located in Hanoi",
    "Paris is the capital city of France",
    "Linda is my pen pal",
    "The former director, Dr. Quill, flew from Kona International Airport in Hawaii to meet Senator P. O."
]

idx2word = {idx: word for word, idx in word2idx.items()}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

for sent in sample_sentences:
    output = predict(model, word_tokenize, sent, word2idx, idx2word, idx2tag, device)
    print(output)

[('I', 'O'), ('love', 'O'), ('NLP', 'O')]
[('This', 'O'), ('is', 'O'), ('Sparta', 'O'), ('!', 'O'), ('!', 'O'), ('!', 'O'), ('!', 'O'), ('!', 'O')]
[('VNU', 'B-PER'), ('University', 'I-ORG'), ('is', 'O'), ('located', 'O'), ('in', 'O'), ('Hanoi', 'O')]
[('Paris', 'B-ORG'), ('is', 'O'), ('the', 'O'), ('capital', 'O'), ('city', 'O'), ('of', 'O'), ('France', 'B-LOC')]
[('Linda', 'B-PER'), ('is', 'O'), ('my', 'O'), ('pen', 'O'), ('pal', 'O')]
[('The', 'O'), ('former', 'O'), ('director', 'O'), (',', 'O'), ('Dr.', 'O'), ('Quill', 'O'), (',', 'O'), ('flew', 'O'), ('from', 'O'), ('Kona', 'O'), ('International', 'B-MISC'), ('Airport', 'I-ORG'), ('in', 'O'), ('Hawaii', 'B-LOC'), ('to', 'O'), ('meet', 'O'), ('Senator', 'O'), ('P.', 'B-PER'), ('O', 'I-PER'), ('.', 'O')]


**Một điểm cần lưu ý:** Tuy accuracy trên tập validation và test rất cao nhưng đây không phải là một metrics phù hợp trong bài toán NER do đặc thù của bài toán là dữ liệu với nhãn O chiếm đa số, do đó, mô hình chỉ cần dự đoán đúng nhiều trên các nhãn này thì accuracy sẽ cao. Trong khi đó, các nhãn PER, ORG,... mới thực sự là các nhãn quan trọng.

Có thể thấy trong các ví dụ trên,