# Named entities recognition with transformers

<a target="_blank" href="https://colab.research.google.com/github/jaspock/me/blob/main/docs/materials/transformers/assets/notebooks/nerbert.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

Code written by Juan Antonio Pérez in 2024.

This notebook presents 


In [1]:
%pip install torch

Note: you may need to restart the kernel to use updated packages.


## Mini-batch preparation

In [2]:
import torch
import itertools

def make_batch(input_sentences, output_tags, word_index, tag_index, max_len, batch_size, device):
    input_batch = []
    output_batch = []
    data_cycle = itertools.cycle(zip(input_sentences, output_tags))

    # to-do: adjust T to be minimum of the actual max length of the batch or max_len

    while True:
        for s,t in data_cycle:
            words = s.split()
            tags = t.split()
            assert len(words) == len(tags)
            inputs = [word_index[n] for n in words]
            inputs = inputs + [0] * (max_len - len(inputs))  # padded inputs
            tags = [tag_index[n] for n in tags]
            tags = tags + [0] * (max_len - len(tags))  # padded outputs
            input_batch.append(inputs)
            output_batch.append(tags)

            if len(input_batch) == batch_size:
                yield torch.LongTensor(input_batch, device=device), torch.LongTensor(output_batch, device=device)
                input_batch = []
                output_batch = []

## Import our transformer code

In [3]:
%%capture

import os
colab = bool(os.getenv("COLAB_RELEASE_TAG"))
if not os.path.isfile('transformer.ipynb') and colab:
    %pip install wget
    %wget https://raw.githubusercontent.com/jaspock/minGPT/master/transformer.ipynb

%pip install nbformat
%run './transformer.ipynb'

set_seed(42)

## Corpus preprocessing

In [4]:
input_sentences = [
    "The cat sat on the mat .",
    "I love eating pizza .",
    "John is running in the park .",
    "She gave him a beautiful gift .",
    "They are playing soccer together .",
    "The cat is eating pizza in the park ."
]

output_tags = [
    "DET NOUN VERB ADP DET NOUN PUNCT",
    "PRON VERB VERB NOUN PUNCT",
    "PROPN AUX VERB ADP DET NOUN PUNCT",
    "PRON VERB PRON DET ADJ NOUN PUNCT",
    "PRON AUX VERB NOUN ADV PUNCT",
    "DET NOUN AUX VERB NOUN ADP DET NOUN PUNCT"
]

word_list = list(set(" ".join(input_sentences).split()))
word_index = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}
special_tokens = len(word_index) 
for i, w in enumerate(word_list):
    word_index[w] = i + special_tokens
index_word = {i: w for i, w in enumerate(word_index)}
input_vocab_size = len(word_index)
tag_list = list(set(" ".join(output_tags).split()))
tag_index = {'[PAD]': 0}  # padding index must be 0
for i, t in enumerate(tag_list):
    tag_index[t] = i + 1
index_tag = {i:t for i, t in enumerate(tag_index)}
output_vocab_size = len(tag_index)
print("input vocab size: %d" % input_vocab_size)
print("output vocab size: %d" % output_vocab_size)

input vocab size: 31
output vocab size: 11


## Model training

In [7]:
n_layer = 2
n_head = 2
n_embd =  64
embd_pdrop = 0.1
resid_pdrop = 0.1
attn_pdrop = 0.1
batch_size = 3
max_len = 12
lr = 0.001
training_steps = 1000
eval_steps = 100

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = EncoderTransformer(n_embd=n_embd, n_head=n_head, n_layer=n_layer, input_vocab_size=input_vocab_size, output_vocab_size=output_vocab_size, 
                max_len=max_len, embd_pdrop=embd_pdrop, attn_pdrop=attn_pdrop, resid_pdrop=resid_pdrop)
print(input_vocab_size)
model.to(device)

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=lr, steps_per_epoch=training_steps, epochs=1, anneal_strategy='cos')

model.train()
step = 0
for inputs, outputs in make_batch(input_sentences=input_sentences, output_tags=output_tags, word_index=word_index, 
                                    tag_index=tag_index, max_len=max_len, batch_size=batch_size, device=device):
    print(inputs)
    padding_mask = inputs == 0
    optimizer.zero_grad()
    logits = model(inputs)
    loss = criterion(logits.view(-1,logits.size(-1)), outputs.view(-1)) 
    if i % eval_steps == 0:
        print(f'Step [{i}/{training_steps}], loss: {loss.item():.4f}')
    loss.backward()
    optimizer.step()
    scheduler.step()
    step = step + 1
    if (step==training_steps):
        break

number of parameters: 0.10M
31
tensor([[19, 18, 28, 11,  4, 29, 15,  0,  0,  0,  0,  0],
        [23, 25,  7, 12, 15,  0,  0,  0,  0,  0,  0,  0],
        [ 8,  5, 16, 17,  4, 27, 15,  0,  0,  0,  0,  0]])


IndexError: index out of range in self

## Model evaluation

In [None]:
# predict tags
model.eval()
inputs, outputs = make_batch(input_sentences=input_sentences, output_tags=output_tags, word_index=word_index, tag_index=tag_index, max_len=max_len, batch_size=batch_size, device=device).__next__()
print(inputs,outputs)
logits = model(inputs)
_, indices = torch.max(logits, dim=-1)
predict_tags, true_tags, input_words = [], [], []  # 3 lists are required, not one
for i in range(batch_size):
    predict_tags.append(" ".join([index_tag[each.item()] for each in indices[i]]))
    true_tags.append(" ".join([index_tag[each.item()] for each in outputs[i]]))
    input_words.append(" ".join([index_word[each.item()] for each in inputs[i]]))
print("Input:\n", "\n".join(input_words))
print("Prediction: \n", "\n".join(predict_tags))
print("Target: \n", "\n".join(true_tags))