# Implement the pipeline for Named Entity Recognition with NLP

In [5]:
# Import libraries
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"This notebook was last compiled at: {datetime.datetime.now():%Y-%m-%d %H:%M:%S}")

This notebook was last compiled at: 2025-09-06 07:24:43


## Import dataset

In [6]:
def load_data(data_dir, mode=None):
    df = pd.read_csv(data_dir, encoding="ISO-8859-1")
    return df

In [7]:
source_dir = "/mnt/e/Development/Python/NLP/NaturalLanguageProcessing/"
data_dir = source_dir + "data/archive/ner_dataset.csv"
print(os.path.exists(data_dir))
df = load_data(data_dir)

print(df.head(10))

True
    Sentence #           Word  POS    Tag
0  Sentence: 1      Thousands  NNS      O
1          NaN             of   IN      O
2          NaN  demonstrators  NNS      O
3          NaN           have  VBP      O
4          NaN        marched  VBN      O
5          NaN        through   IN      O
6          NaN         London  NNP  B-geo
7          NaN             to   TO      O
8          NaN        protest   VB      O
9          NaN            the   DT      O


## Preprocessing data

In [8]:
print(df.isna().sum())

Sentence #    1000616
Word               10
POS                 0
Tag                 0
dtype: int64


In [9]:
def preprocessing_data(data):
    # Fill NaN values in "Sentence #"
    data["Sentence #"] = data["Sentence #"].ffill()
    data["Sentence #"] = data["Sentence #"].astype(str)

    # Drop rows with missing Word
    data = data.dropna(subset=["Word"]).reset_index(drop=True)

    # Strip whitespace
    for col in ["Word", "POS", "Tag"]:
        data[col] = data[col].str.strip()
        
    return data


In [10]:
df_prep = preprocessing_data(df)

In [13]:
# Groupby follow sentence for training
grouped = df.groupby("Sentence #").apply(
    lambda s: list(zip(s["Word"], s["Tag"]))
).tolist()

print(grouped[:10])


[[('Thousands', 'O'), ('of', 'O'), ('demonstrators', 'O'), ('have', 'O'), ('marched', 'O'), ('through', 'O'), ('London', 'B-geo'), ('to', 'O'), ('protest', 'O'), ('the', 'O'), ('war', 'O'), ('in', 'O'), ('Iraq', 'B-geo'), ('and', 'O'), ('demand', 'O'), ('the', 'O'), ('withdrawal', 'O'), ('of', 'O'), ('British', 'B-gpe'), ('troops', 'O'), ('from', 'O'), ('that', 'O'), ('country', 'O'), ('.', 'O')], [('Iranian', 'B-gpe'), ('officials', 'O'), ('say', 'O'), ('they', 'O'), ('expect', 'O'), ('to', 'O'), ('get', 'O'), ('access', 'O'), ('to', 'O'), ('sealed', 'O'), ('sensitive', 'O'), ('parts', 'O'), ('of', 'O'), ('the', 'O'), ('plant', 'O'), ('Wednesday', 'B-tim'), (',', 'O'), ('after', 'O'), ('an', 'O'), ('IAEA', 'B-org'), ('surveillance', 'O'), ('system', 'O'), ('begins', 'O'), ('functioning', 'O'), ('.', 'O')], [('Helicopter', 'O'), ('gunships', 'O'), ('Saturday', 'B-tim'), ('pounded', 'O'), ('militant', 'O'), ('hideouts', 'O'), ('in', 'O'), ('the', 'O'), ('Orakzai', 'B-geo'), ('tribal', '

  grouped = df.groupby("Sentence #").apply(


## Encoding data

In [14]:
# Create vocabulary for word and tag
words = list(df["Word"].unique())
tags = list(df["Tag"].unique())

# Create dictionary word2idx, with UNK and PAD
word2idx = {w : i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1
word2idx["PAD"] = 0

# Create dictionary tag2idx, with UNK and PAD
tag2idx = {w : i + 1 for i, w in enumerate(tags)}
tag2idx["PAD"] = 0

idx2word = {i: w for w, i in word2idx.items()}
idx2tag = {i: w for w, i in tag2idx.items()}

In [15]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Mapping words to index
X = [torch.tensor([word2idx[w[0]] for w in s], dtype=torch.long) for s in grouped]

# Pad
X = pad_sequence(X, batch_first=True, padding_value=word2idx["PAD"])

# Mapping tags to index
y = [torch.tensor([tag2idx[w[1]] for w in s], dtype=torch.long) for s in grouped]

# Pad
y = pad_sequence(y, batch_first=True, padding_value=tag2idx["PAD"])

# Select num_tags
num_tags = len(tag2idx)

print(X.shape, y.shape)  # (num_sentences, max_len)


torch.Size([47959, 104]) torch.Size([47959, 104])


In [16]:
print(X[:10])

tensor([[    2,     3,     4,  ...,     0,     0,     0],
        [  126,   127,   128,  ...,     0,     0,     0],
        [  944,   945,   365,  ...,     0,     0,     0],
        ...,
        [  890, 16293,   326,  ...,     0,     0,     0],
        [  837,    80,  1230,  ...,     0,     0,     0],
        [ 4488,   304,   182,  ...,     0,     0,     0]])


In [18]:
import torch.nn.functional as F

num_tags = len(tag2idx)
y_onehot = F.one_hot(y, num_classes=num_tags + 1)  


In [19]:
print(y_onehot[:5])

tensor([[[0, 1, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         ...,
         [1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         ...,
         [1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0]],

        [[0, 1, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0]],

        [[0, 1, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         ...,
         [1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 1,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0

In [21]:
# Create mask
pad_idx = word2idx["PAD"]

mask = (X != pad_idx).to(torch.uint8) 

## Split into train and test sets

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test, mask_train, mask_test = train_test_split(
    X, y, mask, test_size=0.15, random_state=42
)

## Build model 0 - PIPELINE with CRF and BiLSTM

In [None]:
!which python
!python --version

/home/dikhang_hcmut/miniconda3/envs/pytorch_env/bin/python
Python 3.12.11


In [50]:
!python -c "import torchcrf; print(torchcrf.__version__)"


0.7.2


In [26]:
import torch
import torch.nn as nn
from torchcrf import CRF

class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, pad_idx):
        super(BiLSTM_CRF, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim, tagset_size)  # fully connected to tag space
        self.crf = CRF(tagset_size, batch_first=True)

    def forward(self, x, tags=None, mask=None):
        # x: [batch, seq_len]
        embeds = self.embedding(x)                  # [batch, seq_len, embedding_dim]
        lstm_out, _ = self.lstm(embeds)             # [batch, seq_len, hidden_dim]
        emissions = self.fc(lstm_out)               # [batch, seq_len, tagset_size]

        if tags is not None:  # Training -> trả loss
            loss = -self.crf(emissions, tags, mask=mask, reduction='mean')
            return loss
        else:  # Inference -> decode best path
            return self.crf.decode(emissions, mask=mask)


In [23]:
# Create Data Loader
from torch.utils.data import DataLoader, TensorDataset

# Convert numpy -> tensor
X_train_t = torch.tensor(X_train, dtype=torch.long)
y_train_t = torch.tensor(y_train, dtype=torch.long)
mask_train_t = torch.tensor(mask_train, dtype=torch.uint8)

X_test_t = torch.tensor(X_test, dtype=torch.long)
y_test_t = torch.tensor(y_test, dtype=torch.long)
mask_test_t = torch.tensor(mask_test, dtype=torch.uint8)

# Dataset
train_dataset = TensorDataset(X_train_t, y_train_t, mask_train_t)
test_dataset = TensorDataset(X_test_t, y_test_t, mask_test_t)

# Dataloader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

  X_train_t = torch.tensor(X_train, dtype=torch.long)
  y_train_t = torch.tensor(y_train, dtype=torch.long)
  mask_train_t = torch.tensor(mask_train, dtype=torch.uint8)
  X_test_t = torch.tensor(X_test, dtype=torch.long)
  y_test_t = torch.tensor(y_test, dtype=torch.long)
  mask_test_t = torch.tensor(mask_test, dtype=torch.uint8)


In [31]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

vocab_size = len(word2idx)
tagset_size = len(tag2idx)       # số nhãn
embedding_dim = 100
hidden_dim = 256

model = BiLSTM_CRF(vocab_size, tagset_size, embedding_dim, hidden_dim, pad_idx).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [None]:
EPOCHS = 5
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0

    for i, (X_batch, y_batch, mask_batch) in enumerate(train_loader):
        # Đưa dữ liệu lên GPU/CPU
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        mask_batch = mask_batch.to(device)
        # Reset gradient
        optimizer.zero_grad()
        # Forward
        loss = model(X_batch, tags=y_batch, mask=mask_batch)
        #Backward
        loss.backward()
        # Upgrade parameters
        optimizer.step()

        total_loss += loss.item()

        if i % 200 == 0:  # báo progress mỗi 200 batch
            print(f"Epoch {epoch+1}, Step {i}/{len(train_loader)}, Loss: {loss.item():.4f}")
            
    avg_loss = total_loss / len(train_loader)
    print(f"✅ Epoch {epoch+1}/{EPOCHS} finished, Avg Loss: {avg_loss:.4f}")


Epoch 1, Step 0/1274, Loss: 64.3719
Epoch 1, Step 200/1274, Loss: 11.2397


In [None]:
# Evaluation
from sklearn.metrics import classification_report

model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for X_batch, y_batch, mask_batch in test_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        mask_batch = mask_batch.to(device)

        preds = model(X_batch, mask=mask_batch)
        labels = y_batch.cpu().numpy().tolist()
        masks = mask_batch.cpu().numpy().tolist()

        # Bỏ PAD
        for p, l, m in zip(preds, labels, masks):
            true_l = [ll for ll, mm in zip(l, m) if mm == 1]
            all_labels.extend(true_l)
            all_preds.extend(p)

print(classification_report(all_labels, all_preds, target_names=list(tag2idx.keys())))
