In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [10]:
# Đọc dữ liệu từ file
def read_data(file_path):
    sentences = []
    labels = []
    with open(file_path, "r") as f:
        sentence = []
        label = []
        for line in f:
            line = line.strip()
            if not line:
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence = []
                    label = []
                continue
            word, tag = line.split()
            sentence.append(word)
            label.append(tag)
        if sentence:
            sentences.append(sentence)
            labels.append(label)
    return sentences, labels

# Định nghĩa ánh xạ từ/ngữ nghĩa sang chỉ số
def build_vocab(data):
    vocab = {"<PAD>": 0, "<UNK>": 1}
    for sentence in data:
        for token in sentence:
            if token not in vocab:
                vocab[token] = len(vocab)
    return vocab

# Tạo ánh xạ từ và nhãn
file_path = "../data/named-entity/eng.train"  # Đường dẫn file
sentences, labels = read_data(file_path)
word2idx = build_vocab(sentences)
tag2idx = build_vocab(labels)
idx2tag = {v: k for k, v in tag2idx.items()}

In [11]:
class NERDataset(Dataset):
    def __init__(self, sentences, labels, word2idx, tag2idx):
        self.sentences = [[word2idx.get(word, word2idx["<UNK>"]) for word in sentence] for sentence in sentences]
        self.labels = [[tag2idx[tag] for tag in label] for label in labels]

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return torch.tensor(self.sentences[idx]), torch.tensor(self.labels[idx])

def collate_fn(batch):
    sentences, labels = zip(*batch)
    sentences = pad_sequence(sentences, batch_first=True, padding_value=word2idx["<PAD>"])
    labels = pad_sequence(labels, batch_first=True, padding_value=tag2idx["<PAD>"])
    return sentences, labels

dataset = NERDataset(sentences, labels, word2idx, tag2idx)
dataloader = DataLoader(dataset, batch_size=16, collate_fn=collate_fn, shuffle=True)


In [12]:
class NERModel(nn.Module):
    def __init__(self, vocab_size, tag_size, embedding_dim, hidden_dim):
        super(NERModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=word2idx["<PAD>"])
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, tag_size)
    
    def forward(self, x):
        embeddings = self.embedding(x)
        lstm_out, _ = self.lstm(embeddings)
        out = self.fc(lstm_out)
        return out

model = NERModel(vocab_size=len(word2idx), tag_size=len(tag2idx), embedding_dim=100, hidden_dim=128)


In [13]:
from torch.optim import Adam
import torch.nn.functional as F

optimizer = Adam(model.parameters(), lr=0.001)

for epoch in range(10):
    model.train()
    total_loss = 0
    for sentences, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(sentences)
        outputs = outputs.view(-1, len(tag2idx))
        labels = labels.view(-1)

        loss = F.cross_entropy(outputs, labels, ignore_index=tag2idx["<PAD>"])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 538.4523
Epoch 2, Loss: 221.9930
Epoch 3, Loss: 117.1243
Epoch 4, Loss: 60.4298
Epoch 5, Loss: 28.4835
Epoch 6, Loss: 12.5023
Epoch 7, Loss: 5.2217
Epoch 8, Loss: 2.6239
Epoch 9, Loss: 1.4631
Epoch 10, Loss: 1.1386
