In [3]:
#Task 1
from datasets import load_dataset
from collections import Counter

# Fetch the CoNLL-2003 corpus; the result is indexed by split name below.
dataset = load_dataset("lhoestq/conll2003")

# Token sequences and their integer NER tag sequences for the two splits used here.
train_tokens, train_ner_tags = dataset["train"]["tokens"], dataset["train"]["ner_tags"]
val_tokens, val_ner_tags = dataset["validation"]["tokens"], dataset["validation"]["ner_tags"]

# Hand-written id -> label table for CoNLL-2003: a label's position in this
# list is the integer id used to index it further down.
ner_label_names = [
    "O",
    "B-PER",
    "I-PER",
    "B-ORG",
    "I-ORG",
    "B-LOC",
    "I-LOC",
    "B-MISC",
    "I-MISC",
]

# Word vocabulary: count every token that occurs in the training split.
word_counter = Counter(token for sent in train_tokens for token in sent)

# The two special symbols take the first two indices, followed by the
# training words in sorted order.
PAD = "<PAD>"
UNK = "<UNK>"
vocab = [PAD, UNK, *sorted(word_counter)]
word_to_ix = dict(zip(vocab, range(len(vocab))))

# Tag vocabulary: label string -> integer id (its position in ner_label_names).
tag_to_ix = dict(zip(ner_label_names, range(len(ner_label_names))))

# Human-readable version of the training tags, used for the sanity print below.
train_tags_str = [
    list(map(ner_label_names.__getitem__, seq))
    for seq in train_ner_tags
]

print("Vocab size:", len(word_to_ix))
print("NER tag size:", len(tag_to_ix))
print("Example sentence:", train_tokens[0])
print("NER ids:", train_ner_tags[0])
print("NER tags:", train_tags_str[0])


Vocab size: 23625
NER tag size: 9
Example sentence: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
NER ids: [3, 0, 7, 0, 0, 0, 7, 0, 0]
NER tags: ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


In [4]:
#Task 2
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class NERDataset(Dataset):
    """Map-style dataset pairing tokenised sentences with per-token NER labels.

    Indexing returns a ``(word_ids, label_ids)`` pair of 1-D ``torch.long``
    tensors with one entry per token of the selected sentence.

    Args:
        sentences: Indexable collection of token lists.
        tags: Indexable collection of label sequences, aligned with
            ``sentences``. A label may be an integer id (used as-is) or a tag
            string (looked up in ``tag_to_ix``).
        word_to_ix: Mapping from token to vocabulary index. Must contain the
            ``"<UNK>"`` key, which is used for tokens missing from the mapping.
        tag_to_ix: Mapping from tag string to integer label id.

    Raises:
        KeyError: At construction if ``word_to_ix`` has no ``"<UNK>"`` entry,
            or on item access if a string label is missing from ``tag_to_ix``.
    """

    def __init__(self, sentences, tags, word_to_ix, tag_to_ix):
        self.sentences = sentences
        self.tags = tags
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix
        # Cache the fallback index once instead of looking it up per token.
        self.UNK = word_to_ix["<UNK>"]

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(word_ids, label_ids)`` for the sentence at position ``idx``."""
        words = self.sentences[idx]
        labels = self.tags[idx]
        # dtype is pinned explicitly: without it an empty sentence would
        # produce a float32 tensor, which an embedding layer cannot index with.
        word_ids = torch.tensor(
            [self.word_to_ix.get(w, self.UNK) for w in words],
            dtype=torch.long,
        )
        label_ids = torch.tensor(
            [
                self.tag_to_ix[label] if isinstance(label, str) else label
                for label in labels
            ],
            dtype=torch.long,
        )
        return word_ids, label_ids
# Fill values for padding: word positions use the "<PAD>" vocabulary index,
# tag positions use -1.
PAD_IDX = word_to_ix["<PAD>"]
TAG_PAD = -1


def collate_fn(batch):
    """Merge dataset items into one padded batch.

    Args:
        batch: Sequence of items whose element 0 is a word-id tensor and whose
            element 1 is the matching label-id tensor.

    Returns:
        Tuple ``(sents_padded, tags_padded)`` of batch-first tensors. Word ids
        are padded with ``PAD_IDX`` and label ids with ``TAG_PAD``.
    """
    word_seqs = []
    tag_seqs = []
    for item in batch:
        word_seqs.append(item[0])
        tag_seqs.append(item[1])
    return (
        pad_sequence(word_seqs, batch_first=True, padding_value=PAD_IDX),
        pad_sequence(tag_seqs, batch_first=True, padding_value=TAG_PAD),
    )
# Wrap both splits so that indexing yields (word_ids, label_ids) tensor pairs.
train_dataset = NERDataset(
    sentences=train_tokens,
    tags=train_ner_tags,
    word_to_ix=word_to_ix,
    tag_to_ix=tag_to_ix,
)
val_dataset = NERDataset(
    sentences=val_tokens,
    tags=val_ner_tags,
    word_to_ix=word_to_ix,
    tag_to_ix=tag_to_ix,
)

# Training batches are reshuffled on every pass; validation keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# Sanity check: pull one training batch and show the padded tensor shapes.
batch_sents, batch_tags = next(iter(train_loader))
print(batch_sents.shape, batch_tags.shape, sep="\n")


torch.Size([32, 38])
torch.Size([32, 38])


In [5]:
#Task 3
import torch
import torch.nn as nn

class NERTagger(nn.Module):
    """Sequence tagger: embedding -> single LSTM -> per-token linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each word embedding (and LSTM input size).
        hidden_dim: LSTM hidden size (and input size of the classifier).
        output_size: Number of tag classes scored for every token.
        pad_idx: Vocabulary index passed to the embedding as ``padding_idx``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size, pad_idx):
        super().__init__()
        # Sub-modules are created in the order embedding -> lstm -> fc.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x):
        """Return per-token tag logits for a batch-first tensor of word ids."""
        # The LSTM returns (outputs, state); only the per-step outputs are used.
        per_token_states = self.lstm(self.embedding(x))[0]
        return self.fc(per_token_states)


# Seed PyTorch's global random number generator before the model is built, so
# that the randomly initialised weights are the same on every fresh
# "Restart Kernel -> Run All". Without a seed each run starts from different
# weights and the reported loss/accuracy cannot be reproduced.
SEED = 42
torch.manual_seed(SEED)

# Model hyperparameters. The vocabulary and tag-set sizes come from the
# mappings built earlier; the embedding and hidden sizes are chosen by hand.
vocab_size = len(word_to_ix)
embedding_dim = 100
hidden_dim = 128
output_size = len(tag_to_ix)
pad_idx = word_to_ix["<PAD>"]

model = NERTagger(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_size=output_size,
    pad_idx=pad_idx
)

# Show the layer structure as a quick check of the configured sizes.
print(model)


NERTagger(
  (embedding): Embedding(23625, 100, padding_idx=0)
  (lstm): LSTM(100, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=9, bias=True)
)


In [6]:
#Task 4
import torch
import torch.nn as nn
from torch.optim import Adam

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Positions whose target equals TAG_PAD are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=TAG_PAD)
optimizer = Adam(model.parameters(), lr=1e-3)

epochs = 5

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch_sents, batch_tags in train_loader:
        batch_sents, batch_tags = batch_sents.to(device), batch_tags.to(device)

        optimizer.zero_grad()
        logits = model(batch_sents)

        # Flatten every token position into one row so the loss sees a
        # 2-D score matrix against a 1-D target vector.
        flat_logits = logits.view(-1, logits.size(-1))
        flat_targets = batch_tags.view(-1)
        loss = criterion(flat_logits, flat_targets)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} | Loss: {avg_loss:.4f}")


Epoch 1/5 | Loss: 0.6817
Epoch 2/5 | Loss: 0.3596
Epoch 3/5 | Loss: 0.2310
Epoch 4/5 | Loss: 0.1565
Epoch 5/5 | Loss: 0.1064


In [13]:
#Task 5
import torch

def evaluate(model, dataloader, device=None, tag_pad=None):
    """Compute token-level accuracy of ``model`` over ``dataloader``.

    The model is switched to eval mode (and left in it) and run without
    gradient tracking. Positions whose label equals the padding tag are
    ignored in both the numerator and the denominator.

    Args:
        model: Module mapping a batch of word ids to per-token tag logits.
        dataloader: Iterable of ``(batch_sents, batch_tags)`` tensor pairs.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters).
        tag_pad: Label value marking padded positions. Defaults to the
            notebook-level ``TAG_PAD`` constant.

    Returns:
        Fraction of non-padding tokens whose arg-max prediction equals the
        label, or ``0.0`` when there are no non-padding tokens to score.
    """
    if device is None:
        # Follow the model's own placement rather than a notebook global.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")
    if tag_pad is None:
        tag_pad = TAG_PAD

    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch_sents, batch_tags in dataloader:
            batch_sents = batch_sents.to(device)
            batch_tags = batch_tags.to(device)

            logits = model(batch_sents)
            preds = torch.argmax(logits, dim=-1)

            # Score only real tokens; padded label positions are masked out.
            mask = batch_tags != tag_pad
            correct += (preds[mask] == batch_tags[mask]).sum().item()
            total += mask.sum().item()

    # An empty loader (or all-padding labels) would otherwise divide by zero.
    if total == 0:
        return 0.0
    return correct / total


# Token-level accuracy of the trained model on the validation loader.
val_acc = evaluate(model=model, dataloader=val_loader)
print("Validation accuracy:", val_acc)


def predict_sentence(sentence, tagger=None, vocab_index=None, label_names=None, run_device=None):
    """Tag a whitespace-tokenised sentence and print one ``token tag`` line per token.

    Args:
        sentence: Raw text; it is split on whitespace to obtain the tokens.
        tagger: Model mapping a batch of word ids to per-token tag logits.
            Defaults to the notebook-level ``model``.
        vocab_index: Token -> index mapping containing ``"<UNK>"``.
            Defaults to the notebook-level ``word_to_ix``.
        label_names: Sequence mapping a predicted class id to its tag string.
            Defaults to the notebook-level ``ner_label_names``.
        run_device: Device the input is moved to. Defaults to the device of
            the tagger's first parameter (CPU if it has no parameters).

    Returns:
        List of ``(token, tag)`` tuples in sentence order; an empty list when
        the sentence contains no tokens.
    """
    tagger = model if tagger is None else tagger
    vocab_index = word_to_ix if vocab_index is None else vocab_index
    label_names = ner_label_names if label_names is None else label_names
    if run_device is None:
        first_param = next(tagger.parameters(), None)
        run_device = first_param.device if first_param is not None else torch.device("cpu")

    tokens = sentence.split()
    if not tokens:
        # Nothing to tag; an empty id list would also build a float tensor
        # that the embedding layer rejects.
        return []

    unk_index = vocab_index["<UNK>"]
    ids = [vocab_index.get(w, unk_index) for w in tokens]
    x = torch.tensor(ids, dtype=torch.long).unsqueeze(0).to(run_device)

    tagger.eval()
    with torch.no_grad():
        logits = tagger(x)
        # Drop only the batch axis. A bare squeeze() would also collapse a
        # one-token sentence to a scalar and break the loop below.
        preds = torch.argmax(logits, dim=-1).squeeze(0).tolist()

    tagged = [(w, label_names[i]) for w, i in zip(tokens, preds)]
    for w, t in tagged:
        print(w, t)
    return tagged

# predict_sentence prints its own "token tag" lines, so it is called directly.
# Wrapping the call in print() only echoed its return value as an extra output
# line. The trailing semicolon stops the notebook from displaying a return
# value as cell output.
predict_sentence("VNU University is located in Hanoi");

Validation accuracy: 0.9364510727775398
VNU B-ORG
University B-ORG
is O
located O
in O
Hanoi O
None


• Độ chính xác trên tập validation: 0.9364510727775398
• Ví dụ dự đoán câu mới:

Câu: “VNU University is located in Hanoi”
Dự đoán: VNU B-ORG
University B-ORG
is O
located O
in O
Hanoi O