# Named Entity Recognition Experiment

In [3]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizerFast, BertForTokenClassification

In [25]:
import modin.pandas as pd_modin
import time

## Load dataset

Load text dataset with entity tags

In [30]:
import csv
import random
from faker import Faker
from datetime import datetime

l=Faker('en_GB') 
# f=open("test.csv","r")
# k=csv.reader(f)

with open("big.csv","a") as g:
    w=csv.writer(g)
    w.writerow(('id','name','address','college','company','dob','age'))
    for i in range(1000000):

        w.writerow((i+1,l.name(),l.address(),random.choice(['psg','sona','amirta','anna university']),random.choice(['CTS','INFY','HTC']),(random.randrange(1950,1995,1),random.randrange(1,13,1),random.randrange(1,32,1)),random.choice(range(0,100))))

In [None]:
start = time.perf_counter()
pd.read_csv("large.csv")
print(time.perf_counter() -start)

start = time.perf_counter()
pd_modin.read_csv("large.csv")
print(time.perf_counter() -start)


In [None]:
start = time.time()
pd.read_csv("large.cs")
print(time.time() -start)

start = time.time()
pd_modin.read_csv("large.csv")
print(time.time() -start)

In [None]:
%%timeit
pd.read_csv("large.csv")

In [None]:
%%timeit
pd_modin.read_csv("large.csv")

In [26]:
start = time.time()
dataset = pd.read_csv("ner.csv")
time.time()- 

0.4190807342529297

In [10]:
start = perf_counter()
dataset = pd.read_csv("ner.csv")
perf_counter() - start

0.31632183901092503

In [21]:
import os
os.environ["MODIN_ENGINE"] = "ray"

In [27]:
start = time.time()
modin_dataset = pd_modin.read_csv("ner.csv")
time.time() - start

0.6143596172332764

In [2]:
dataset["Sentence #"] = dataset["Sentence #"].fillna(method='ffill')
sentences, targets = [], []
for sent_i, x in dataset.groupby("Sentence #"):
    words = x["Word"].tolist()
    tags = x["Tag"].tolist()
    sentences.append(words)
    targets.append(tags)

Number of sentences in dataset

In [3]:
len(sentences)

22862

### Text encoding

Convert each word into subwords and their respective subword ids such that Bert can work with the words

In [4]:
# tokenize words
PRETRAINED = "prajjwal1/bert-tiny"
tokenizer = BertTokenizerFast.from_pretrained(PRETRAINED)
sentences_encoded = tokenizer(
    sentences, is_split_into_words=True, return_tensors="pt", padding=True, truncation=True, max_length=150, add_special_tokens=False
);

Ignored unknown kwarg option direction


In [5]:
sentences_encoded["input_ids"].shape

torch.Size([22862, 150])

### Target encoding

Convert the NER tags into tensors such that Bert can work with them

In [6]:
# mapping from ner tag to number
tag2idx = {tag: i for i, tag in enumerate(set(t for ts in targets for t in ts))}
tag2idx

{'I-tim': 0,
 'B-tim': 1,
 'B-org': 2,
 'B-gpe': 3,
 'I-geo': 4,
 'B-per': 5,
 'I-eve': 6,
 'B-art': 7,
 'I-art': 8,
 'I-gpe': 9,
 'O': 10,
 'I-org': 11,
 'I-per': 12,
 'B-eve': 13,
 'B-nat': 14,
 'B-geo': 15,
 'I-nat': 16}

Pad the target tensors because sentences have different length

In [7]:
max_len = sentences_encoded["input_ids"].shape[1]
targets_encoded = torch.empty((0, max_len), dtype=torch.long)

for sent_idx, target in enumerate(targets):
    enc = torch.full(size=(max_len,), fill_value=tag2idx['O'], dtype=torch.long)
    # repeat ner tag for each subword
    for word_idx, tag in enumerate(target):
        span = sentences_encoded.word_to_tokens(sent_idx, word_idx)
        # ignore words that tokenizer did not understand e.g. special characters
        if span is not None:
            start, end = span
            enc[start:end] = tag2idx[tag]
    targets_encoded = torch.vstack((targets_encoded, enc))

targets_encoded.shape

torch.Size([22862, 150])

Show the first sample and its target tensor

In [8]:
tokenizer.decode(sentences_encoded["input_ids"][0])

'thousands of demonstrators have marched through london to protest the war in iraq and demand the withdrawal of british troops from that country. [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [9]:
targets_encoded[:1]

tensor([[10, 10, 10, 10, 10, 10, 15, 10, 10, 10, 10, 10, 15, 10, 10, 10, 10, 10,
          3, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
         10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
         10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
         10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
         10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
         10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
         10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
         10, 10, 10, 10, 10, 10]])

### Train/Test split

Split dataset into training and test set

In [10]:
train_size = int(0.8 * len(sentences_encoded.input_ids))
test_size = int(0.2 * len(sentences_encoded.input_ids))
train_sentences = sentences_encoded[:train_size]
train_targets = targets_encoded[:train_size]
test_sentences = sentences_encoded[train_size:train_size+test_size]
test_targets = targets_encoded[train_size:train_size+test_size]
(f"Train sentences: {len(train_targets)}", f"Test sentences: {len(test_targets)}")

('Train sentences: 18289', 'Test sentences: 4572')

In [11]:
class NERDataset(Dataset):
    def __init__(self, sentences, labels):
        self.sentences = sentences
        self.labels = labels

    def __getitem__(self, index):
        ids = torch.tensor(self.sentences[index].ids)
        mask = torch.tensor(self.sentences[index].attention_mask)
        labels = self.labels[index].clone()

        return {
            'ids': ids,
            'mask': mask,
            'tags': labels
        }

    def __len__(self):
        return len(self.labels)

training_set = NERDataset(train_sentences, train_targets)
testing_set = NERDataset(test_sentences, test_targets)

training_loader = DataLoader(training_set, batch_size=16, shuffle=True)
testing_loader = DataLoader(testing_set, batch_size=16, shuffle=True)

## Train model

Load a pretrained Bert model to fine-tune for multi-class classification of NER tags

In [12]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = BertForTokenClassification.from_pretrained(PRETRAINED, num_labels=len(tag2idx))
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=2e-05)

Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from t

In [13]:
# training loop
for epoch in range(1, 3):
    print("epoch:", epoch)
    model.train()
    for i, data in enumerate(training_loader, 0):
        data = {k: v.to(device) for k, v in data.items()}
        output = model(data["ids"], attention_mask=data["mask"], labels=data["tags"])
        loss = output[0]
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if i % 100 == 0:
            print("loss:", round(loss.detach().cpu().item(), 5))

epoch: 1
loss: 2.88024
loss: 1.80421
loss: 1.15632
loss: 1.01469
loss: 0.89944
loss: 0.98938
loss: 0.92672
loss: 0.7426
loss: 0.62529
loss: 0.72075
loss: 0.69993
loss: 0.59267
epoch: 2
loss: 0.71214
loss: 0.74181
loss: 0.65504
loss: 0.50718
loss: 0.36026
loss: 0.32124
loss: 0.38675
loss: 0.39118
loss: 0.43116
loss: 0.44601
loss: 0.53579
loss: 0.42692


## Test model

Compute classification metric of Bert model on test set

In [14]:
from sklearn.metrics import accuracy_score

model.eval()
all_preds, all_trues = [], []

for data in testing_loader:
    data = {k: v.to(device) for k, v in data.items()}
    with torch.no_grad():
        output = model(data["ids"], attention_mask=data["mask"], labels=data["tags"])
    loss = output[0]
    logits = output[1].detach().cpu()
    mask = data["mask"].cpu()

    label_ids = data["tags"].cpu()
    pred_ids = torch.argmax(logits, dim=-1)

    for i in range(pred_ids.shape[0]):
        # remove pad predictions
        pred_ids_non_pad = pred_ids[i, mask[i]]
        label_ids_non_pad = label_ids[i, mask[i]]
        all_preds.append(pred_ids_non_pad)
        all_trues.append(label_ids_non_pad)

all_preds = torch.cat(all_preds)
all_trues = torch.cat(all_trues)
accuracy = accuracy_score(all_trues, all_preds)
print("Test Accuracy:", round(accuracy, 3))

Test Accuracy: 0.897
