# MultiNERD NER model finetuning
Fine-tuning a Transformer-based NER model on the MultiNERD dataset, downloaded from the 🤗 Hub.

In [None]:
# !pip install huggingface_hub==0.19.4
# !pip install datasets==2.15.0
# !pip install transformers[torch]==4.35.2
# !pip install seqeval==1.2.2
# !pip install evaluate==0.4.1
# !pip install matplotlib==3.7.1
# (collections is part of the Python standard library; no pip install is needed)

import json
import numpy as np
import os
import  matplotlib.pyplot as plt
import collections
from datasets import load_dataset, load_metric
from transformers import RobertaTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
# Set the tokenizer parallelism switch explicitly for this process.
# NOTE(review): presumably read by the Hugging Face `tokenizers` backend; confirm
# that "true" is still wanted together with dataloader worker processes.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

## Read dataset

In [None]:
# Load the label mappings used by the rest of the notebook
# (config['label2id'], config['id2label'], config['systemBid']).
# The encoding is pinned because JSON text is UTF-8, whereas open() would
# otherwise fall back to the platform's locale encoding.
with open('./config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

model_checkpoint = "roberta-base"  # pretrained checkpoint that gets fine-tuned
batch_size = 32  # per-device batch size for both training and evaluation

# label_list is indexed by integer tag id further down (label_list[tag_id]),
# so order the names by their id instead of relying on key order in the file.
label_list = [
    name
    for name, _ in sorted(config['label2id'].items(), key=lambda item: int(item[1]))
]

In [None]:
# Only the first half of every split is loaded.
# dataset = load_dataset("Babelscape/multinerd") #uncomment for full dataset
train_dataset, eval_dataset, test_dataset = (
    load_dataset("Babelscape/multinerd", split=split_slice)
    for split_slice in ('train[:50%]', 'validation[:50%]', 'test[:50%]')
)

# Report the number of sequences in each half-split.
print(f"Total Sequences in 50% training dataset: {len(train_dataset)}")
print(f"Total Sequences in 50% validation dataset: {len(eval_dataset)}")
print(f"Total Sequences in 50% test dataset: {len(test_dataset)}")

In [None]:
# Keep only the examples whose 'lang' field is 'en' in each split.
train_dataset = train_dataset.filter(lambda x: x['lang'] == 'en')
eval_dataset = eval_dataset.filter(lambda x: x['lang'] == 'en')
test_dataset = test_dataset.filter(lambda x: x['lang'] == 'en')
print(f"Total Sequences in 50% EN training dataset: {len(train_dataset)}")
print(f"Total Sequences in 50% EN validation dataset: {len(eval_dataset)}")
print(f"Total Sequences in 50% EN test dataset: {len(test_dataset)}")

# Drop all unused columns, only keep "tokens", "ner_tags".
# After filtering, 'lang' carries no information; it is removed from every
# split so that all three datasets share the same schema.
train_dataset = train_dataset.remove_columns('lang')
eval_dataset = eval_dataset.remove_columns('lang')
test_dataset = test_dataset.remove_columns('lang')

## EDA

In [None]:
# Label distribution: count how often each integer tag id occurs in the
# training split.
label_counter = collections.Counter()
for tags in train_dataset['ner_tags']:
    label_counter.update(tags)

# Bar positions are the tag ids (in first-seen order); the tick names are
# looked up in label_list, which is indexed by tag id.
labels, counts = zip(*label_counter.items())
label_list_new = [label_list[tag_id] for tag_id in labels]

# Bar plot with a logarithmic count axis.
fig, ax = plt.subplots(figsize=(15, 5))
bar_container = ax.bar(labels, counts, color='maroon', width=0.8, log=True)

ax.set_xlabel('NER tags')
ax.set_ylabel('Count (log scale)')
ax.set_title('Tag distribution in training dataset')
ax.bar_label(bar_container, fmt='{:,.0f}', rotation=90, padding=3)
ax.set_xticks(labels, labels=label_list_new, rotation=30)
# Only raise the upper limit, leaving headroom for the rotated bar labels.
# A lower limit of 0 is not representable on a log axis, so none is set.
ax.set_ylim(top=100_000_000)
plt.show()

## Preprocess dataset


In [None]:
# The RoBERTa tokenizer is created with add_prefix_space=True because the
# inputs are tokenized as pre-split words (is_split_into_words=True below).
tokenizer = RobertaTokenizerFast.from_pretrained(
    model_checkpoint,
    add_prefix_space=True,
)

# System selector: False for system A, True for system B.
is_systemB = True

In [None]:
def system_labels(is_systemB, examples, allowed_ids=None):
    """Return the NER tag sequences to train on for the selected system.

    For system A (``is_systemB`` falsy) the tags of ``examples`` are returned
    as they are. For system B every tag id outside the allowed set is replaced
    by ``0`` (presumably the id of the "O" tag -- confirm against
    ``config['label2id']``).

    Args:
        is_systemB: Truthy to apply the system-B filtering, falsy to return
            the tags untouched.
        examples: Mapping with a ``'ner_tags'`` entry holding one sequence of
            integer tag ids per example.
        allowed_ids: Optional iterable of tag ids to keep for system B. When
            omitted, the keys of ``config['systemBid']`` converted to ``int``
            are used.

    Returns:
        For system B, a new list with one list of tag ids per example; for
        system A, ``examples['ner_tags']`` itself. ``examples`` is never
        modified.
    """
    if not is_systemB:
        return examples['ner_tags']

    if allowed_ids is None:
        allowed_ids = (int(k) for k in config['systemBid'])
    # A set keeps the per-token membership test O(1).
    keep = set(allowed_ids)

    return [
        [tag if tag in keep else 0 for tag in tags]
        for tags in examples['ner_tags']
    ]

In [None]:
def tokenize_and_align_labels(examples, label_all_tokens=False):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    Args:
        examples: Batch with a ``'tokens'`` entry (lists of words) and the
            ``'ner_tags'`` entry consumed by ``system_labels``.
        label_all_tokens: When truthy, every sub-token of a word receives the
            word's tag; otherwise only the first sub-token does and the
            remaining ones get ``-100``.

    Returns:
        The tokenizer output with an added ``'labels'`` entry containing one
        list of label ids per example.
    """
    encoded = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    aligned = []
    for batch_idx, word_tags in enumerate(system_labels(is_systemB, examples)):
        row = []
        last_word = None
        for word in encoded.word_ids(batch_index=batch_idx):
            if word is None:
                # Special tokens have no word id; -100 marks them as ignored.
                row.append(-100)
            elif word != last_word or label_all_tokens:
                # First sub-token of a word, or every sub-token when requested.
                row.append(word_tags[word])
            else:
                # Continuation sub-token that should not be labelled.
                row.append(-100)
            last_word = word
        aligned.append(row)

    encoded['labels'] = aligned
    return encoded

In [None]:
# Tokenize the training and validation splits batch-wise; every batch is
# passed through tokenize_and_align_labels.
tokenized_train_dataset, tokenized_eval_dataset = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, eval_dataset)
)

## Fine-tuning model

In [None]:
# Token-classification model built from the pretrained checkpoint, with one
# output per entry of label_list and the label mappings from the config file.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=config['id2label'],
    label2id=config['label2id'],
)

model_name = model_checkpoint.split("/")[-1]
# Name the output directory after the system that is actually being trained,
# so runs for system A and system B do not write into the same folder.
system_tag = "B" if is_systemB else "A"
args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-ner-{system_tag}",
    learning_rate=5e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    warmup_ratio=0.1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and log every 500 steps; checkpoints also follow a step schedule.
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=500,
    logging_steps=500,
    save_total_limit=1,
    dataloader_num_workers=2,
    push_to_hub=False,
)

# Collator that batches the tokenized examples for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# Metric object shared by compute_metrics and the final evaluation cell.
metric = load_metric("seqeval")


def compute_metrics(p):
    """Convert raw model outputs into overall precision, recall, F1 and accuracy.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is reduced with an
            argmax over axis 2; positions whose label is ``-100`` are skipped.

    Returns:
        Dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``,
        taken from the metric's ``overall_*`` results.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Drop ignored positions once, then map both sides to label names.
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
# Wire the model, training arguments, data, collator and metric callback together.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run the fine-tuning loop.
trainer.train()

## Evaluation

In [None]:
# Predict on the tokenized validation split and take the best-scoring label id
# per token.
# NOTE(review): test_dataset is loaded earlier but never tokenized or evaluated
# in the code shown here -- confirm the validation split is the intended target.
predictions, labels, _ = trainer.predict(tokenized_eval_dataset)
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens): keep only (prediction, label) pairs
# whose label is not -100, then map both sides to label names.
kept_pairs = [
    [(pred_id, gold_id) for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]
true_predictions = [[label_list[pred_id] for pred_id, _ in row] for row in kept_pairs]
true_labels = [[label_list[gold_id] for _, gold_id in row] for row in kept_pairs]

results = metric.compute(predictions=true_predictions, references=true_labels)
results