# Finetuning RoBERTa for NER: Use Model
 

***

## Imports

In [1]:
import gc
import os
import pickle
from pprint import pprint

import numpy as np
import torch
from datasets import load_dataset, load_metric, concatenate_datasets, DatasetDict
from transformers import (BertTokenizerFast,
                          RobertaTokenizerFast,
                          AutoTokenizer,
                          BertForTokenClassification,
                          RobertaForTokenClassification,
                          DataCollatorForTokenClassification,
                          AutoModelForTokenClassification,
                          TrainingArguments, Trainer)

## Load Dataset

In [2]:
# Location of the preprocessed dataset that was serialized with pickle.
data_path = "./data/dataset_processed.pkl"

# NOTE: unpickling can execute arbitrary code. Only load files that were
# produced by a trusted source (e.g. this project's own preprocessing step).
with open(data_path, mode="rb") as pickle_file:
    dataset = pickle.load(pickle_file)

## Load Model and Tokenizer

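Information about model variants can be found here: https://huggingface.co/docs/transformers/model_doc/roberta (for the multilingual XLM-RoBERTa variant used below, see https://huggingface.co/docs/transformers/model_doc/xlm-roberta)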

In [3]:
# Free memory before the model is loaded: first let Python collect
# unreferenced objects, then release the CUDA memory that PyTorch keeps cached.
# (`gc` is imported in the notebook's import cell at the top.)
gc.collect()
torch.cuda.empty_cache()

In [4]:
# Names of the NER tag classes, read from the training split's "ner_tags" feature.
label_list = dataset["train"].features["ner_tags"].feature.names

In [5]:
# Base checkpoint whose tokenizer is used for inference.
# Alternative that was tried: "bert-base-multilingual-cased".
model_name = "xlm-roberta-large"

# NOTE(review): `add_prefix_space` is forwarded to the tokenizer as-is;
# confirm that the XLM-RoBERTa tokenizer actually honours it.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

# The base model is intentionally not loaded here; the fine-tuned checkpoint
# is loaded further below instead.
# model = AutoModelForTokenClassification.from_pretrained(model_name)

## Use Fine-tuned Model:

Load checkpoint:

In [6]:
# Directory containing the final fine-tuned checkpoint.
checkpoint_dir = "./results/checkpoint-final/"

# Restore the fine-tuned token-classification model from that checkpoint.
model_tuned = AutoModelForTokenClassification.from_pretrained(checkpoint_dir)

In [7]:
# Display the configuration that was loaded together with the fine-tuned
# checkpoint (architecture, label mappings, model dimensions, ...).
model_tuned.config

XLMRobertaConfig {
  "_name_or_path": "./results/checkpoint-final/",
  "architectures": [
    "XLMRobertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "B-LOC": 5,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.21.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab

Set correct class labels:

In [None]:
# Disabled: manual override of the model's label mappings using the tag names
# from the dataset. Presumably unnecessary because the loaded checkpoint config
# already carries `id2label` / `label2id`; kept for reference only.

# label_names = dataset["train"].features[f"ner_tags"].feature.names

# id2label = {id : label for id, label in enumerate(label_names)}
# label2id = {label: id for id, label in enumerate(label_names)}

# model_tuned.config.id2label = id2label
# model_tuned.config.label2id = label2id

In [8]:
# Show the class-id -> label mapping used below to turn predicted class ids
# into readable NER tags.
model_tuned.config.id2label

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC'}

In [47]:
def printPrediction(inputs, predictions, tokenizer):
    """Print an aligned table with one row per token of the first sequence.

    Each row shows the token id, the text obtained by decoding that single id,
    and the predicted class, every column right-aligned to a width of 10.

    Args:
        inputs: Mapping with an "input_ids" entry; only its first element
            (the first sequence) is used. Each id must be convertible via int().
        predictions: Iterable of predicted classes, aligned with the token ids.
            Pairing stops at the shorter of the two sequences.
        tokenizer: Object exposing `decode(token_id)` for a single integer id.

    Returns:
        None. The table is written to standard output.
    """
    row_template = "{: >10} {: >10} {: >10}"
    first_sequence_ids = list(inputs["input_ids"][0])

    for raw_id, label in zip(first_sequence_ids, predictions):
        # Convert once (tensor element or int) and reuse for decoding and printing.
        vocab_id = int(raw_id)
        print(row_template.format(vocab_id, tokenizer.decode(vocab_id), label))

In [48]:
# German example sentence containing a person name and a location.
text = "Für Richard Phillips Feynman war es immer wichtig in New York, die unanschaulichen Gesetzmäßigkeiten der Quantenphysik Laien und Studenten nahezubringen und verständlich zu machen."

# Encode the sentence as PyTorch tensors, without adding special tokens.
inputs = tokenizer(text, add_special_tokens=False, return_tensors="pt")

# Inference only, so gradient tracking is switched off for the forward pass.
with torch.no_grad():
    logits = model_tuned(**inputs).logits

# Index of the highest-scoring class for every token.
predicted_token_class_ids = logits.argmax(dim=-1)

# Note that tokens are classified rather than input words, which means that
# there might be more predicted token classes than words.
# Multiple token classes might account for the same word.
predicted_tokens_classes = [
    model_tuned.config.id2label[class_id]
    for class_id in predicted_token_class_ids[0].tolist()
]

printPrediction(inputs, predicted_tokens_classes, tokenizer)

     10333        Für          O
     22758    Richard      B-PER
    165458    Phillip      I-PER
         7          s      I-PER
       563          F      I-PER
     28950        eyn      I-PER
       669        man      I-PER
      1631        war          O
       198         es          O
      3807      immer          O
     31097    wichtig          O
        23         in          O
      2356        New      B-LOC
      5753       York      I-LOC
         4          ,          O
        68        die          O
        51         un          O
        66         an          O
     32854      schau          O
     12512     lichen          O
     64086     Gesetz          O
     86756      mäßig          O
     21888       keit          O
        33         en          O
       122        der          O
     75344      Quant          O
        33         en          O
     34053        phy          O
     14383        sik          O
     12460        Lai          O
        33

In [49]:
# English example sentence containing persons, a location and organizations.
text = "In December 1903 in France the Royal Swedish Academy of Sciences awarded Pierre Curie, Marie Curie, and Henri Becquerel the Nobel Prize in Physics"

# Encode the sentence as PyTorch tensors, without adding special tokens.
inputs = tokenizer(text, add_special_tokens=False, return_tensors="pt")

# Inference only, so gradient tracking is switched off for the forward pass.
with torch.no_grad():
    logits = model_tuned(**inputs).logits

# Index of the highest-scoring class for every token.
predicted_token_class_ids = logits.argmax(dim=-1)

# Note that tokens are classified rather than input words, which means that
# there might be more predicted token classes than words.
# Multiple token classes might account for the same word.
predicted_tokens_classes = [
    model_tuned.config.id2label[class_id]
    for class_id in predicted_token_class_ids[0].tolist()
]

printPrediction(inputs, predicted_tokens_classes, tokenizer)

       360         In          O
     14487   December          O
    106355       1903          O
        23         in          O
      9942     France      B-LOC
        70        the          O
     25674      Royal      B-ORG
    187951    Swedish      I-ORG
     62903    Academy      I-ORG
       111         of      I-ORG
     28745    Science      I-ORG
         7          s      I-ORG
     70318      award          O
       297         ed          O
     58807     Pierre      B-PER
     17065        Cur      I-PER
       478         ie      I-PER
         4          ,          O
     24479      Marie      B-PER
     17065        Cur      I-PER
       478         ie      I-PER
         4          ,          O
       136        and          O
     80640      Henri      B-PER
       873         Be      I-PER
       238          c      I-PER
       944        que      I-PER
      7962        rel      I-PER
        70        the          O
     34676      Nobel      B-ORG
      2319