# Implement the pipeline for Named Entity Recognition with NLP

In [None]:
# Import libraries
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import os

# Prefer the GPU when PyTorch reports CUDA as available; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Print the time at which this cell was executed.
print(f"This notebook was last compiled at: {datetime.datetime.now():%Y-%m-%d %H:%M:%S}")

This notebook was last compiled at: 2025-09-06 08:36:35


## Import dataset

In [None]:
def load_data(data_dir, mode=None):
    """Read the CSV file at ``data_dir`` into a pandas DataFrame.

    The file is decoded as ISO-8859-1. ``mode`` is accepted for callers that
    pass it, but it is not used.
    """
    return pd.read_csv(data_dir, encoding="ISO-8859-1")

In [None]:
# Machine-specific absolute path to the project root; adjust it when running elsewhere.
source_dir = "/mnt/e/Development/Python/NLP/NaturalLanguageProcessing/"
data_dir = source_dir + "data/archive/ner_dataset.csv"
# Print whether the CSV exists before attempting to read it.
print(os.path.exists(data_dir))
df = load_data(data_dir)

# Preview the first 10 rows.
print(df.head(10))

True
    Sentence #           Word  POS    Tag
0  Sentence: 1      Thousands  NNS      O
1          NaN             of   IN      O
2          NaN  demonstrators  NNS      O
3          NaN           have  VBP      O
4          NaN        marched  VBN      O
5          NaN        through   IN      O
6          NaN         London  NNP  B-geo
7          NaN             to   TO      O
8          NaN        protest   VB      O
9          NaN            the   DT      O


## Preprocessing data

In [None]:
# Count the missing values in each column.
print(df.isna().sum())

Sentence #    1000616
Word               10
POS                 0
Tag                 0
dtype: int64


In [None]:
def preprocessing_data(data):
    """Clean the token-level NER table and return the cleaned frame.

    Steps:
      1. Forward-fill "Sentence #" and cast it to ``str``.
      2. Drop the rows whose "Word" is missing and renumber the index.
      3. Strip surrounding whitespace from "Word", "POS" and "Tag".

    Side effect: step 1 is written back to ``data`` itself, so the caller's
    frame also ends up with the filled "Sentence #" column. Steps 2 and 3
    only affect the returned frame.
    """
    # Step 1 is assigned on the argument, which is what updates the caller's frame.
    data["Sentence #"] = data["Sentence #"].ffill().astype(str)

    # dropna/reset_index build a new frame; from here on `data` is untouched.
    cleaned = data.dropna(subset=["Word"]).reset_index(drop=True)

    for column in ("Word", "POS", "Tag"):
        cleaned[column] = cleaned[column].str.strip()

    return cleaned


In [None]:
# NOTE: preprocessing_data also forward-fills "Sentence #" on `df` itself;
# `df_prep` is the returned frame with missing-Word rows dropped and text columns stripped.
df_prep = preprocessing_data(df)

In [None]:
# Group the tokens by sentence into lists of (word, tag) pairs for training.
# The cleaned frame `df_prep` is used because the raw `df` still contains the
# rows whose Word is NaN. Iterating over the groups (instead of calling
# `groupby(...).apply(...)`) also avoids the pandas warning about `apply`
# operating on the grouping column. Groups are visited in sorted key order,
# which is the same order the previous `apply(...).tolist()` produced.
grouped = [
    list(zip(sentence["Word"], sentence["Tag"]))
    for _, sentence in df_prep.groupby("Sentence #")
]

print(grouped[:10])


[[('Thousands', 'O'), ('of', 'O'), ('demonstrators', 'O'), ('have', 'O'), ('marched', 'O'), ('through', 'O'), ('London', 'B-geo'), ('to', 'O'), ('protest', 'O'), ('the', 'O'), ('war', 'O'), ('in', 'O'), ('Iraq', 'B-geo'), ('and', 'O'), ('demand', 'O'), ('the', 'O'), ('withdrawal', 'O'), ('of', 'O'), ('British', 'B-gpe'), ('troops', 'O'), ('from', 'O'), ('that', 'O'), ('country', 'O'), ('.', 'O')], [('Iranian', 'B-gpe'), ('officials', 'O'), ('say', 'O'), ('they', 'O'), ('expect', 'O'), ('to', 'O'), ('get', 'O'), ('access', 'O'), ('to', 'O'), ('sealed', 'O'), ('sensitive', 'O'), ('parts', 'O'), ('of', 'O'), ('the', 'O'), ('plant', 'O'), ('Wednesday', 'B-tim'), (',', 'O'), ('after', 'O'), ('an', 'O'), ('IAEA', 'B-org'), ('surveillance', 'O'), ('system', 'O'), ('begins', 'O'), ('functioning', 'O'), ('.', 'O')], [('Helicopter', 'O'), ('gunships', 'O'), ('Saturday', 'B-tim'), ('pounded', 'O'), ('militant', 'O'), ('hideouts', 'O'), ('in', 'O'), ('the', 'O'), ('Orakzai', 'B-geo'), ('tribal', '

  grouped = df.groupby("Sentence #").apply(


## Encoding data

In [None]:
# Create vocabulary for word and tag from the cleaned data
# (the raw `df` still holds NaN in "Word", which would become a vocabulary key).
words = list(df_prep["Word"].unique())
tags = list(df_prep["Tag"].unique())

# Create dictionary word2idx with PAD = 0 and UNK = 1 reserved up front.
# Every new word receives the next free index, so the indices always form the
# contiguous range 0 .. len(word2idx) - 1 and len(word2idx) is a valid
# nn.Embedding size. The previous `i + 2` numbering left a gap whenever the
# corpus contained a token spelled "PAD" or "UNK" (its entry was overwritten by
# the special token), which pushed the largest index up to len(word2idx) and
# caused an out-of-range embedding lookup.
word2idx = {"PAD": 0, "UNK": 1}
for w in words:
    # A corpus token spelled like a special token shares that token's index.
    if w not in word2idx:
        word2idx[w] = len(word2idx)

# Create dictionary tag2idx with PAD = 0 (tags have no UNK entry)
tag2idx = {"PAD": 0}
for t in tags:
    if t not in tag2idx:
        tag2idx[t] = len(tag2idx)

idx2word = {i: w for w, i in word2idx.items()}
idx2tag = {i: w for w, i in tag2idx.items()}

print(f"\nWord2idx size: {len(word2idx)}")
print(f"Tag2idx size: {len(tag2idx)}")
print(f"Tag2idx: {tag2idx}")


Word2idx size: 35179
Tag2idx size: 18
Tag2idx: {'O': 1, 'B-geo': 2, 'B-gpe': 3, 'B-per': 4, 'I-geo': 5, 'B-org': 6, 'I-org': 7, 'B-tim': 8, 'B-art': 9, 'I-art': 10, 'I-per': 11, 'I-gpe': 12, 'I-tim': 13, 'B-nat': 14, 'B-eve': 15, 'I-eve': 16, 'I-nat': 17, 'PAD': 0}


In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Map each sentence's words to vocabulary indices; words missing from word2idx fall back to UNK
X = [torch.tensor([word2idx.get(w[0], word2idx["UNK"]) for w in s], dtype=torch.long) for s in grouped]

# Pad every sentence with the PAD index up to the longest sentence
X = pad_sequence(X, batch_first=True, padding_value=word2idx["PAD"])

# Map tags to indices (a tag missing from tag2idx would silently become PAD)
y = [torch.tensor([tag2idx.get(w[1], tag2idx["PAD"]) for w in s], dtype=torch.long) for s in grouped]

# Pad the tag sequences the same way
y = pad_sequence(y, batch_first=True, padding_value=tag2idx["PAD"])

# Sizes of the two dictionaries, including their special entries
num_tags = len(tag2idx)
vocab_size = len(word2idx)
print(X.shape, y.shape)  # (num_sentences, max_len)


torch.Size([47959, 104]) torch.Size([47959, 104])


In [None]:
# Inspect the first 10 encoded, padded sentences.
print(X[:10])

tensor([[    2,     3,     4,  ...,     0,     0,     0],
        [  126,   127,   128,  ...,     0,     0,     0],
        [  944,   945,   365,  ...,     0,     0,     0],
        ...,
        [  890, 16293,   326,  ...,     0,     0,     0],
        [  837,    80,  1230,  ...,     0,     0,     0],
        [ 4488,   304,   182,  ...,     0,     0,     0]])


In [None]:
import torch.nn.functional as F

# One-hot encode the padded tag indices. `num_tags` is len(tag2idx), which
# already counts the PAD entry, so it is exactly the number of classes; the
# previous `num_tags + 1` added a column that could never be set.
y_onehot = F.one_hot(y, num_classes=num_tags)


In [None]:
# Inspect the one-hot encoding of the first 5 sentences.
print(y_onehot[:5])

tensor([[[0, 1, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         ...,
         [1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         ...,
         [1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0]],

        [[0, 1, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0]],

        [[0, 1, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         ...,
         [1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 1,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0

In [None]:
# Create mask
pad_idx = word2idx["PAD"]

# Derive the padding mask from the true sentence lengths instead of testing
# `X != pad_idx`: a real token that happens to be encoded as `pad_idx`
# (e.g. a corpus word spelled "PAD") would otherwise be masked out in the
# middle of a sentence, and torchcrf requires the first timestep of every
# sequence to be unmasked.
lengths = torch.tensor([len(s) for s in grouped], dtype=torch.long)
mask_bool = torch.arange(X.size(1)).unsqueeze(0) < lengths.unsqueeze(1)
mask = mask_bool.to(torch.uint8)  # byte copy kept for existing users of `mask`

## Split into train and test sets

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the sentences for testing. Inputs, tags and masks are passed
# together so they are split with the same shuffle (fixed by random_state).
X_train, X_test, y_train, y_test, mask_train, mask_test = train_test_split(
    X, y, mask_bool, test_size=0.15, random_state=42
)

## Build model 0 - PIPELINE with CRF and BiLSTM

In [None]:
!which python
!python --version

/home/dikhang_hcmut/miniconda3/envs/pytorch_env/bin/python
Python 3.12.11


In [None]:
!python -c "import torchcrf; print(torchcrf.__version__)"


0.7.2


In [None]:
import torch
import torch.nn as nn
from torchcrf import CRF

class BiLSTM_CRF(nn.Module):
    """Embedding -> bidirectional LSTM -> linear emissions -> CRF tagger.

    Args:
        vocab_size: number of rows in the embedding table; every token index
            passed to ``forward`` must be smaller than this.
        tagset_size: number of tags scored by the linear layer and the CRF.
        embedding_dim: size of each word embedding.
        hidden_dim: nominal width of the concatenated BiLSTM output; each
            direction gets ``hidden_dim // 2`` units.
        pad_idx: embedding index reserved for padding.

    Raises:
        ValueError: if ``hidden_dim`` is smaller than 2.
    """

    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, pad_idx):
        super().__init__()
        per_direction = hidden_dim // 2
        if per_direction < 1:
            raise ValueError("hidden_dim must be at least 2")
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, per_direction,
                            num_layers=1, bidirectional=True, batch_first=True)
        # Size the projection from the real BiLSTM output width so that an odd
        # hidden_dim no longer produces a shape mismatch in forward().
        self.fc = nn.Linear(2 * per_direction, tagset_size)  # fully connected to tag space
        self.crf = CRF(tagset_size, batch_first=True)

    def forward(self, x, tags=None, mask=None):
        """Return the training loss when ``tags`` is given, else decoded tags.

        Args:
            x: token indices, [batch, seq_len].
            tags: gold tag indices with the same layout as ``x``, or None.
            mask: optional mask marking the real (non-padding) positions;
                boolean or uint8.

        Returns:
            With ``tags``: the negated CRF log-likelihood (``reduction='mean'``).
            Without ``tags``: the result of ``CRF.decode``.
        """
        if mask is not None:
            # Normalise to bool: a uint8 mask makes torch.where inside
            # torchcrf emit a deprecation warning.
            mask = mask.bool()

        # x: [batch, seq_len]
        embeds = self.embedding(x)                  # [batch, seq_len, embedding_dim]
        lstm_out, _ = self.lstm(embeds)             # [batch, seq_len, 2 * (hidden_dim // 2)]
        emissions = self.fc(lstm_out)               # [batch, seq_len, tagset_size]

        if tags is not None:  # Training -> return the loss
            loss = -self.crf(emissions, tags, mask=mask, reduction='mean')
            return loss
        else:  # Inference -> decode best path
            return self.crf.decode(emissions, mask=mask)


In [None]:
# Debug the vocabulary issue directly.
# NOTE: this cell reads `train_loader`, which is created in a later cell, so it
# only works after that cell has been run. The loop variables X_batch/y_batch
# stay bound afterwards and are read by a later cell.
print("=== DEBUG VOCABULARY ISSUE ===")

# 1. Inspect the current vocab mapping
print(f"Current vocab_size: {vocab_size}")
print(f"Current word2idx length: {len(word2idx)}")
print(f"Max index in word2idx: {max(word2idx.values())}")

# Find the words whose index is >= vocab_size (out of range for the embedding)
problem_words = [word for word, idx in word2idx.items() if idx >= vocab_size]
print(f"Words with index >= vocab_size: {problem_words}")

# 2. Find the first problematic batch
print(f"\n=== FINDING PROBLEMATIC BATCH ===")
for batch_idx, (X_batch, y_batch, mask_batch) in enumerate(train_loader):
    max_idx = X_batch.max().item()
    if max_idx >= vocab_size:
        print(f"Found problematic batch {batch_idx}")
        print(f"Max index: {max_idx}")
        
        # Locate the positions that hold an out-of-range index
        problem_positions = (X_batch >= vocab_size).nonzero()
        print(f"Problem positions: {problem_positions[:5]}")  # Show first 5
        
        # Collect the out-of-range values
        problem_values = X_batch[X_batch >= vocab_size]
        print(f"Problem values: {problem_values.unique()}")
        break

=== DEBUG VOCABULARY ISSUE ===
Current vocab_size: 35179
Current word2idx length: 35179
Max index in word2idx: 35179
Words with index >= vocab_size: ['Bermel']

=== FINDING PROBLEMATIC BATCH ===
Found problematic batch 161
Max index: 35179
Problem positions: tensor([[26, 18]])
Problem values: tensor([35179])


In [None]:
# Create Data Loader
from torch.utils.data import DataLoader, TensorDataset

# Make sure every split is a tensor of the expected dtype.
# train_test_split was given torch tensors, and the warnings this cell used to
# emit indicate that the splits are tensors already; torch.tensor(tensor) warns
# about copy-construction. torch.as_tensor accepts tensors and NumPy arrays
# alike and only converts when the type or dtype differs (so it may share
# memory with its input, which is fine because the data is never modified).
X_train_t = torch.as_tensor(X_train, dtype=torch.long)
y_train_t = torch.as_tensor(y_train, dtype=torch.long)
mask_train_t = torch.as_tensor(mask_train, dtype=torch.bool)

X_test_t = torch.as_tensor(X_test, dtype=torch.long)
y_test_t = torch.as_tensor(y_test, dtype=torch.long)
mask_test_t = torch.as_tensor(mask_test, dtype=torch.bool)

# Dataset
train_dataset = TensorDataset(X_train_t, y_train_t, mask_train_t)
test_dataset = TensorDataset(X_test_t, y_test_t, mask_test_t)

# Dataloader (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

  X_train_t = torch.tensor(X_train, dtype=torch.long)
  y_train_t = torch.tensor(y_train, dtype=torch.long)
  mask_train_t = torch.tensor(mask_train, dtype=torch.bool)
  X_test_t = torch.tensor(X_test, dtype=torch.long)
  y_test_t = torch.tensor(y_test, dtype=torch.long)
  mask_test_t = torch.tensor(mask_test, dtype=torch.bool)


In [None]:
# Largest index in the batch left bound by the most recently executed loop,
# printed next to the embedding size for comparison.
print(max(X_batch.max().item(), y_batch.max().item()))
print(vocab_size)


: 

In [None]:
# 1. CLEAR CUDA CACHE
import gc
import torch

print("🧹 Clearing CUDA cache...")
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.synchronize()  # Wait for all queued CUDA operations to finish

# 2. FORCE GARBAGE COLLECTION
print("🗑️ Running garbage collection...")
gc.collect()

vocab_size = len(word2idx)
tagset_size = len(tag2idx)
embedding_dim = 100
hidden_dim = 256
pad_idx = word2idx["PAD"]

print(f"Creating model with:")
print(f"vocab_size: {vocab_size}")
print(f"tagset_size: {tagset_size}")
print(f"pad_idx: {pad_idx}")

# Build the model on the CPU first
model = BiLSTM_CRF(vocab_size, tagset_size, embedding_dim, hidden_dim, pad_idx)
print("Model created on CPU successfully")

# Sanity-check one forward pass on the CPU, where a bad index raises a readable
# Python exception instead of an asynchronous CUDA device-side assert.
try:
    sample_x = X[:2]  # CPU
    sample_y = y[:2]  # CPU
    # Boolean mask: the uint8 `mask` made torch.where inside torchcrf warn.
    sample_mask = mask_bool[:2]  # CPU

    loss = model(sample_x, tags=sample_y, mask=sample_mask)
except Exception as e:
    print(f"Error: {str(e)}")
    print("Problem is in BiLSTM_CRF class or data, not CUDA")
    # Re-raise: swallowing the error left `optimizer` undefined, so the
    # training cell failed later with an unrelated NameError.
    raise

print(f"CPU test successful, loss: {loss.item()}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    model = model.to(device)
    print("Model moved to GPU successfully")

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
print("Setup complete!")

🧹 Clearing CUDA cache...
🗑️ Running garbage collection...
Creating model with:
vocab_size: 35179
tagset_size: 18
pad_idx: 0
Model created on CPU successfully
CPU test successful, loss: 71.67552185058594
Model moved to GPU successfully


  score = torch.where(mask[i].unsqueeze(1), next_score, score)


Setup complete!


: 

In [None]:
# Train for a fixed number of epochs, logging the batch loss every 200 steps
# and the mean batch loss at the end of each epoch.
EPOCHS = 5
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0

    for i, (X_batch, y_batch, mask_batch) in enumerate(train_loader):
        # Move the batch to the selected device
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        mask_batch = mask_batch.to(device)
        # Reset gradient
        optimizer.zero_grad()
        # Forward: the model returns the loss when tags are passed
        loss = model(X_batch, tags=y_batch, mask=mask_batch)
        # Backward
        loss.backward()
        # Update parameters
        optimizer.step()

        total_loss += loss.item()

        if i % 200 == 0:
            print(f"Epoch {epoch+1}, Step {i}/{len(train_loader)}, Loss: {loss.item():.4f}")
            
    avg_loss = total_loss / len(train_loader)
    print(f"✅ Epoch {epoch+1}/{EPOCHS} finished, Avg Loss: {avg_loss:.4f}")


Epoch 1, Step 0/1274, Loss: 62.5754
Epoch 1, Step 200/1274, Loss: 7.7107
Epoch 1, Step 400/1274, Loss: 6.9028
Epoch 1, Step 600/1274, Loss: 8.3703
Epoch 1, Step 800/1274, Loss: 3.7171


/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [126,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [126,0,0], thread: [1,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [126,0,0], thread: [2,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [126,0,0], thread: [3,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [126,0,0], thread: [4,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [126,0,0], thread: [5,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553:

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


: 

In [None]:
import torch
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

try:
    import seaborn as sns
except ImportError:
    # seaborn only draws the heatmap. Without it this cell used to abort with
    # ModuleNotFoundError before evaluate_model was even defined, so fall back
    # to plain matplotlib instead.
    sns = None


def evaluate_model(model, dataloader, idx2tag, device):
    """Run ``model`` over ``dataloader`` and report token-level tagging metrics.

    Prints a classification report, shows a confusion-matrix heatmap, and
    returns the flattened predictions and gold labels as tag strings.

    Args:
        model: called as ``model(X_batch, mask=mask_batch)``; expected to
            return one sequence of tag indices per sentence.
        dataloader: yields ``(X_batch, y_batch, mask_batch)`` tensor triples.
        idx2tag: mapping from tag index to tag string; the "PAD" entry is
            excluded from the report and the matrix.
        device: device the inputs and masks are moved to before the forward pass.

    Returns:
        ``(all_preds_str, all_labels_str)``: two flat lists of tag strings
        restricted to the unmasked positions.
    """
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for X_batch, y_batch, mask_batch in dataloader:
            X_batch = X_batch.to(device)
            mask_batch = mask_batch.to(device)

            # preds: list of list of tag indices
            preds = model(X_batch, mask=mask_batch)

            # The gold labels are only read as Python lists, so they do not
            # need a round trip through the device.
            labels = y_batch.tolist()
            masks = mask_batch.tolist()

            # Drop the padded positions
            for p, l, m in zip(preds, labels, masks):
                all_labels.extend(ll for ll, mm in zip(l, m) if mm)
                all_preds.extend(pp for pp, mm in zip(p, m) if mm)

    # Convert indices to tag strings
    all_labels_str = [idx2tag[i] for i in all_labels]
    all_preds_str = [idx2tag[i] for i in all_preds]

    # Leave PAD out of the reported classes
    target_names_no_pad = [tag for tag in idx2tag.values() if tag != "PAD"]

    # Classification report; zero_division=0 keeps the scores of classes that
    # were never predicted at 0 without emitting a warning for each of them.
    report = classification_report(all_labels_str, all_preds_str,
                                   labels=target_names_no_pad,
                                   zero_division=0)
    print("=== Classification Report ===")
    print(report)

    # Confusion matrix
    cm = confusion_matrix(all_labels_str, all_preds_str,
                          labels=target_names_no_pad)
    plt.figure(figsize=(12, 10))
    if sns is not None:
        sns.heatmap(cm, annot=True, fmt='d',
                    xticklabels=target_names_no_pad,
                    yticklabels=target_names_no_pad,
                    cmap="Blues")
    else:
        # Plain-matplotlib equivalent of the annotated heatmap.
        ax = plt.gca()
        image = ax.imshow(cm, cmap="Blues")
        plt.colorbar(image, ax=ax)
        ticks = list(range(len(target_names_no_pad)))
        ax.set_xticks(ticks)
        ax.set_xticklabels(target_names_no_pad, rotation=90)
        ax.set_yticks(ticks)
        ax.set_yticklabels(target_names_no_pad)
        for row in ticks:
            for col in ticks:
                ax.text(col, row, format(cm[row, col], "d"),
                        ha="center", va="center")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")
    plt.show()

    return all_preds_str, all_labels_str


ModuleNotFoundError: No module named 'seaborn'

In [None]:
# Evaluate the trained model on the held-out test split.
all_preds_str, all_labels_str = evaluate_model(model, test_loader, idx2tag, device)
