# MultiNERD NER model evaluation
Evaluating a Transformer-based NER model on the MultiNERD dataset downloaded from the 🤗 Hub.

In [None]:
# !pip install huggingface_hub==0.19.4
# !pip install datasets==2.15.0
# !pip install seqeval==1.2.2
# !pip install evaluate==0.4.1
# !pip install torch==2.0.0
# !pip install transformers[torch]==4.35.2
# !pip install tqdm==4.66.1

import json
import os

from tqdm import tqdm
from datasets import load_dataset
import evaluate
from transformers import RobertaTokenizerFast
from transformers import AutoModelForTokenClassification
import torch
os.environ["TOKENIZERS_PARALLELISM"]="true"

In [None]:
# Fetch the MultiNERD test split and keep only the rows whose 'lang' field is 'en'.
test_dataset = load_dataset("Babelscape/multinerd",split='test')
test_dataset = test_dataset.filter(lambda x: x['lang'] == 'en')
print(f"Total Sequences in EN test dataset: {len(test_dataset)}")

# Read the run configuration. The encoding is given explicitly so that decoding
# the JSON file does not depend on the platform's default locale encoding.
with open('./config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

In [None]:
# Switch between the two evaluated systems: False -> system A, True -> system B.
is_systemB = True

model_checkpoint = (
    "./roberta-base-finetuned-ner-B/checkpoint-4000"
    if is_systemB
    else "./roberta-base-finetuned-ner-A/checkpoint-4000"
)

# The tokenizer is loaded from the selected checkpoint with the same options
# for both systems. The inputs used later are already split into words, which
# is why add_prefix_space=True is passed.
tokenizer = RobertaTokenizerFast.from_pretrained(model_checkpoint, add_prefix_space=True)


## Utility functions

In [None]:
def system_labels(is_systemB, examples, allowed_ids=None):
    """Return the per-sequence NER tag ids to evaluate against.

    For system A the tags stored in ``examples['ner_tags']`` are returned
    unchanged. For system B every tag id that is not in the allowed set is
    replaced by 0 (presumably the "outside" tag of this label scheme). The
    result is a fresh list of lists; ``examples`` itself is not modified.

    Args:
        is_systemB: True to restrict the tags to the system-B label set,
            False to keep every tag.
        examples: Mapping with a ``'ner_tags'`` entry holding one list of
            integer tag ids per sequence.
        allowed_ids: Optional iterable of tag ids to keep for system B. When
            omitted, the ids are the keys of ``config['systemBid']``
            converted to ``int``.

    Returns:
        A list containing one list of tag ids per sequence.
    """
    if not is_systemB:
        return examples['ner_tags']

    if allowed_ids is None:
        allowed_ids = (int(key) for key in config['systemBid'])
    # A frozenset gives constant-time membership tests for every label.
    keep = frozenset(allowed_ids)

    return [
        [tag if tag in keep else 0 for tag in tags]
        for tags in examples['ner_tags']
    ]


def tokenize_and_align_labels(examples, label_all_tokens=False):
    """Tokenize pre-split words and spread word-level tags over the sub-word tokens.

    Args:
        examples: Batch with a ``'tokens'`` entry (one list of words per
            sequence) and the ``'ner_tags'`` entry read by ``system_labels``.
        label_all_tokens: When True, every sub-word token of a word receives
            that word's tag. When False, only the first sub-word token does
            and the remaining ones are set to -100.

    Returns:
        The tokenizer output with an added ``'labels'`` entry holding one
        list of label ids per sequence.
    """
    encoded = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)
    word_level_tags = system_labels(is_systemB, examples)

    aligned = []
    for batch_pos, tags in enumerate(word_level_tags):
        row = []
        last_word = None
        for current_word in encoded.word_ids(batch_index=batch_pos):
            if current_word is None:
                # Special tokens carry no word id; -100 marks them as ignored.
                row.append(-100)
            elif current_word == last_word and not label_all_tokens:
                # Continuation piece of a word that has already been labelled.
                row.append(-100)
            else:
                # First piece of a word, or any piece when label_all_tokens is set.
                row.append(tags[current_word])
            last_word = current_word
        aligned.append(row)

    encoded['labels'] = aligned
    return encoded


## Predictions

In [None]:

# Load the fine-tuned token-classification model; device placement is left to
# the library through device_map="auto".
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, device_map="auto")

# Gold labels aligned to the sub-word tokens (-100 marks positions to ignore).
tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

metric = evaluate.load("seqeval")
predictions = []
references = []

# Send the inputs to the device the model was actually placed on, rather than
# guessing it separately from CUDA availability, so the two cannot disagree.
device = model.device

# Sequences are run one at a time, so no padding is needed.
for tokens, labels in tqdm(zip(test_dataset['tokens'], tokenized_test_dataset['labels']), total=len(test_dataset)):

    tokenized_inputs = tokenizer(tokens, truncation=True, is_split_into_words=True, return_tensors="pt")
    tokenized_inputs = tokenized_inputs.to(device)
    with torch.no_grad():
        logits = model(**tokenized_inputs).logits

    # Highest-scoring class per token; index 0 selects the single sequence in the batch.
    predicted_ids = torch.argmax(logits, dim=2)[0].tolist()
    prediction = [model.config.id2label[class_id] for class_id in predicted_ids]
    predictions.append(prediction)
    references.append(labels)
    

In [None]:
# Drop every position whose gold label is -100 (the ignored index), then map
# the remaining gold ids to their tag names so both sides are tag strings.
true_predictions = []
true_references = []
for predicted_tags, gold_ids in zip(predictions, references):
    kept = [(tag, gold) for tag, gold in zip(predicted_tags, gold_ids) if gold != -100]
    true_predictions.append([tag for tag, _ in kept])
    true_references.append([model.config.id2label[gold] for _, gold in kept])

# Entity-level scores from seqeval, using strict matching under the IOB2 scheme.
metric.add_batch(references=true_references, predictions=true_predictions)
metric.compute(mode='strict', scheme='IOB2',zero_division=0)