## Import thư viện

In [51]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
from transformers import AutoTokenizer
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import classification_report

## Load dataset

In [52]:
dataset = load_dataset("conll2003")
dataset['train'][0]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [53]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [54]:
label_list = dataset["train"].features["ner_tags"].feature.names
label_to_id = {label: idx for idx, label in enumerate(label_list)}
id_to_label = {idx: label for label, idx in label_to_id.items()}
label_to_id

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'B-MISC': 7,
 'I-MISC': 8}

In [55]:
class NERDataset(Dataset):
    def __init__(self, data, tokenizer, label_to_id, max_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.label_to_id = label_to_id
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tokens = self.data[idx]["tokens"]
        labels = self.data[idx]["ner_tags"]

        encoding = self.tokenizer(tokens, truncation=True, is_split_into_words=True, 
                                  padding="max_length", max_length=self.max_len, return_tensors="pt")
        word_ids = encoding.word_ids()

        aligned_labels = [-100 if word_id is None else labels[word_id] for word_id in word_ids]

        return encoding["input_ids"].squeeze(0), encoding["attention_mask"].squeeze(0), torch.tensor(aligned_labels)

In [56]:
train_dataset = NERDataset(dataset["train"], tokenizer, label_to_id)
val_dataset = NERDataset(dataset["validation"], tokenizer, label_to_id)
test_dataset = NERDataset(dataset["test"], tokenizer, label_to_id)

## Model

In [57]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_labels):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_labels)

    def forward(self, input_ids, attention_mask):
        embeddings = self.embedding(input_ids)
        packed_output, _ = self.lstm(embeddings)
        logits = self.fc(packed_output)
        return logits

## Train, test

In [58]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [59]:
embed_dim = 128
hidden_dim = 256
num_labels = len(label_to_id)
vocab_size = len(tokenizer)
batch_size = 64
num_epochs = 50

In [60]:
model = LSTMModel(vocab_size, embed_dim, hidden_dim, num_labels)
criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [61]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
def train(model, train_loader, criterion, optimizer, num_epochs=100):
    model.train()
    for epoch in tqdm(range(num_epochs)):
        running_loss = 0
        for input_ids, attention_mask, labels in train_loader:
            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)
            loss = criterion(logits.view(-1, num_labels), labels.view(-1))
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/100:.4f}')

def evaluate(model, dataloader, label_to_id):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            logits = model(input_ids, attention_mask)
            predictions = torch.argmax(logits, dim=-1)
            all_preds.extend(predictions.view(-1).tolist())
            all_labels.extend(labels.view(-1).tolist())

    all_preds = [p for p, l in zip(all_preds, all_labels) if l != -100]
    all_labels = [l for l in all_labels if l != -100]
    print(classification_report(all_labels, all_preds, target_names=list(label_to_id.keys())))

In [63]:
train(model, train_loader, criterion, optimizer, num_epochs=num_epochs)

  2%|▏         | 1/50 [00:57<46:40, 57.15s/it]

Epoch [1/50], Loss: 1.5411


  4%|▍         | 2/50 [01:52<45:03, 56.33s/it]

Epoch [2/50], Loss: 0.7716


  6%|▌         | 3/50 [02:48<43:58, 56.14s/it]

Epoch [3/50], Loss: 0.4735


  8%|▊         | 4/50 [03:44<42:47, 55.81s/it]

Epoch [4/50], Loss: 0.2922


 10%|█         | 5/50 [04:41<42:20, 56.44s/it]

Epoch [5/50], Loss: 0.1723


 12%|█▏        | 6/50 [05:37<41:09, 56.13s/it]

Epoch [6/50], Loss: 0.0987


 14%|█▍        | 7/50 [06:34<40:25, 56.40s/it]

Epoch [7/50], Loss: 0.0604


 16%|█▌        | 8/50 [07:31<39:42, 56.73s/it]

Epoch [8/50], Loss: 0.0303


 18%|█▊        | 9/50 [08:29<39:03, 57.16s/it]

Epoch [9/50], Loss: 0.0165


 20%|██        | 10/50 [09:28<38:31, 57.80s/it]

Epoch [10/50], Loss: 0.0098


 22%|██▏       | 11/50 [10:28<37:54, 58.32s/it]

Epoch [11/50], Loss: 0.0067


 24%|██▍       | 12/50 [11:25<36:45, 58.04s/it]

Epoch [12/50], Loss: 0.0052


 26%|██▌       | 13/50 [12:24<35:57, 58.32s/it]

Epoch [13/50], Loss: 0.0044


 28%|██▊       | 14/50 [13:23<34:59, 58.31s/it]

Epoch [14/50], Loss: 0.0030


 30%|███       | 15/50 [14:21<33:58, 58.23s/it]

Epoch [15/50], Loss: 0.0020


 32%|███▏      | 16/50 [15:19<33:00, 58.26s/it]

Epoch [16/50], Loss: 0.0653


 34%|███▍      | 17/50 [16:15<31:44, 57.73s/it]

Epoch [17/50], Loss: 0.0210


 36%|███▌      | 18/50 [17:11<30:24, 57.02s/it]

Epoch [18/50], Loss: 0.0061


 38%|███▊      | 19/50 [18:07<29:17, 56.71s/it]

Epoch [19/50], Loss: 0.0026


 40%|████      | 20/50 [19:02<28:09, 56.30s/it]

Epoch [20/50], Loss: 0.0017


 42%|████▏     | 21/50 [19:57<26:59, 55.85s/it]

Epoch [21/50], Loss: 0.0013


 44%|████▍     | 22/50 [20:51<25:49, 55.35s/it]

Epoch [22/50], Loss: 0.0010


 46%|████▌     | 23/50 [21:46<24:51, 55.25s/it]

Epoch [23/50], Loss: 0.0011


 48%|████▊     | 24/50 [22:40<23:45, 54.84s/it]

Epoch [24/50], Loss: 0.0011


 50%|█████     | 25/50 [23:36<22:56, 55.07s/it]

Epoch [25/50], Loss: 0.0010


 52%|█████▏    | 26/50 [24:30<21:54, 54.76s/it]

Epoch [26/50], Loss: 0.0011


 54%|█████▍    | 27/50 [25:24<20:56, 54.63s/it]

Epoch [27/50], Loss: 0.0063


 56%|█████▌    | 28/50 [26:19<20:05, 54.81s/it]

Epoch [28/50], Loss: 0.0181


 58%|█████▊    | 29/50 [27:14<19:10, 54.80s/it]

Epoch [29/50], Loss: 0.0097


 60%|██████    | 30/50 [28:08<18:11, 54.58s/it]

Epoch [30/50], Loss: 0.0028


 62%|██████▏   | 31/50 [29:03<17:18, 54.68s/it]

Epoch [31/50], Loss: 0.0010


 64%|██████▍   | 32/50 [29:57<16:21, 54.52s/it]

Epoch [32/50], Loss: 0.0007


 66%|██████▌   | 33/50 [30:51<15:25, 54.44s/it]

Epoch [33/50], Loss: 0.0005


 68%|██████▊   | 34/50 [31:47<14:35, 54.71s/it]

Epoch [34/50], Loss: 0.0004


 70%|███████   | 35/50 [32:43<13:45, 55.04s/it]

Epoch [35/50], Loss: 0.0004


 72%|███████▏  | 36/50 [33:38<12:51, 55.10s/it]

Epoch [36/50], Loss: 0.0005


 74%|███████▍  | 37/50 [34:33<11:57, 55.16s/it]

Epoch [37/50], Loss: 0.0004


 76%|███████▌  | 38/50 [35:28<11:02, 55.19s/it]

Epoch [38/50], Loss: 0.0004


 78%|███████▊  | 39/50 [36:25<10:11, 55.58s/it]

Epoch [39/50], Loss: 0.0004


 80%|████████  | 40/50 [37:20<09:14, 55.45s/it]

Epoch [40/50], Loss: 0.0003


 82%|████████▏ | 41/50 [38:15<08:17, 55.30s/it]

Epoch [41/50], Loss: 0.0003


 84%|████████▍ | 42/50 [39:11<07:23, 55.45s/it]

Epoch [42/50], Loss: 0.0004


 86%|████████▌ | 43/50 [40:06<06:28, 55.44s/it]

Epoch [43/50], Loss: 0.0002


 88%|████████▊ | 44/50 [41:01<05:32, 55.38s/it]

Epoch [44/50], Loss: 0.0050


 90%|█████████ | 45/50 [41:58<04:38, 55.76s/it]

Epoch [45/50], Loss: 0.0442


 92%|█████████▏| 46/50 [42:55<03:44, 56.19s/it]

Epoch [46/50], Loss: 0.0078


 94%|█████████▍| 47/50 [43:51<02:48, 56.16s/it]

Epoch [47/50], Loss: 0.0014


 96%|█████████▌| 48/50 [44:46<01:51, 55.86s/it]

Epoch [48/50], Loss: 0.0007


 98%|█████████▊| 49/50 [45:41<00:55, 55.56s/it]

Epoch [49/50], Loss: 0.0005


100%|██████████| 50/50 [46:37<00:00, 55.95s/it]

Epoch [50/50], Loss: 0.0004





In [64]:
evaluate(model, test_loader, label_to_id)

              precision    recall  f1-score   support

           O       0.96      0.98      0.97     47914
       B-PER       0.73      0.74      0.74      2986
       I-PER       0.86      0.81      0.84      2704
       B-ORG       0.77      0.71      0.74      3524
       I-ORG       0.73      0.68      0.71      1309
       B-LOC       0.85      0.82      0.83      2998
       I-LOC       0.77      0.64      0.70       415
      B-MISC       0.63      0.60      0.62      1266
      I-MISC       0.46      0.54      0.50       322

    accuracy                           0.92     63438
   macro avg       0.75      0.72      0.74     63438
weighted avg       0.92      0.92      0.92     63438

