# Finetuning RoBERTa for NER: Preprocess Corpus
 

## Imports

In [1]:
from transformers import (BertTokenizerFast,
                          RobertaTokenizerFast,
                          AutoTokenizer,
                          BertForTokenClassification,
                          RobertaForTokenClassification,
                          DataCollatorForTokenClassification, 
                          AutoModelForTokenClassification, 
                          TrainingArguments, Trainer)
from datasets import load_dataset, concatenate_datasets, DatasetDict
import pickle
import torch
import os

## Load Tokenizer

**Load Tokenizer:**

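Information about model variants can be found here: https://huggingface.co/docs/transformers/model_doc/roberta (RoBERTa) and https://huggingface.co/docs/transformers/model_doc/xlm-roberta (XLM-RoBERTa, the checkpoint used below).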

In [2]:
# Checkpoint whose tokenizer is used for preprocessing.
# "bert-base-multilingual-cased" was the alternative considered here.
model_name = "xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_prefix_space=True,
)

## Load Dataset

In [3]:
# Read the pickled multilingual dataset from disk.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
data_path = "./data/dataset_multilingual.pkl"
with open(data_path, mode="rb") as fh:
    dataset = pickle.load(fh)

## Tokenize Dataset

### Tokenize a Single Sample:

In [9]:
# Tokenize one pre-split training sentence (without special tokens)
# and print the resulting sub-token strings.
example = dataset["train"][50]
tokenized_input = tokenizer(
    example["tokens"],
    add_special_tokens=False,
    is_split_into_words=True,
)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['▁List', '▁of', '▁Fly', 'w', 'heel', '▁', ',', '▁Sh', 'y', 'ster', '▁', ',', '▁and', '▁Fly', 'w', 'heel', '▁(', '▁1990', '▁radio', '▁series', '▁)', '▁episode', 's']


Sample after Tokenization:

In [10]:
# Display the raw tokenizer output (input_ids and attention_mask) for the sample.
tokenized_input

{'input_ids': [32036, 111, 25066, 434, 144009, 6, 4, 7525, 53, 1515, 6, 4, 136, 25066, 434, 144009, 15, 11704, 5977, 36549, 1388, 50094, 7], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

Word IDs:

In [11]:
# For every sub-token, show the index of the original word it was split from.
tokenized_input.word_ids()

[0, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 7, 7, 7, 8, 9, 10, 11, 12, 13, 13]

### Tokenize Whole Dataset

In [12]:
def tokenizeInputs(inputs):
    """Tokenize one pre-split example and give every sub-token its word's NER tag.

    Args:
        inputs: Mapping with a "tokens" entry (the words of one sentence) and
            a "ner_tags" entry (one tag per word, indexed by word position).

    Returns:
        The tokenizer output for the sentence (truncated to at most 512
        sub-tokens, no special tokens added) with an extra "labels" entry
        holding one tag per sub-token.
    """
    encoding = tokenizer(
        inputs["tokens"],
        is_split_into_words=True,
        add_special_tokens=False,
        truncation=True,
        max_length=512,
    )

    # word_ids() tells us which word each sub-token came from; use that index
    # to copy the word-level tag onto each of its sub-tokens.
    subword_to_word = encoding.word_ids()
    word_labels = inputs["ner_tags"]
    subword_labels = []
    for word_index in subword_to_word:
        subword_labels.append(word_labels[word_index])

    encoding["labels"] = subword_labels
    return encoding

In [13]:
# Sanity-check tokenizeInputs on a single training sample.
example = dataset["train"][100]
tokenizeInputs(example)

{'input_ids': [32036, 111, 5369, 23, 30089], 'attention_mask': [1, 1, 1, 1, 1], 'labels': [3, 4, 4, 4, 4]}

In [14]:
# Run tokenizeInputs over the dataset, one example per call.
tokenized_dataset = dataset.map(tokenizeInputs)

  0%|          | 0/40000 [00:00<?, ?ex/s]

  0%|          | 0/20000 [00:00<?, ?ex/s]

  0%|          | 0/20000 [00:00<?, ?ex/s]

**Shuffle Dataset:**

In [15]:
# Shuffle with a fixed seed so that a fresh "Restart & Run All" reproduces
# the same example order instead of a different permutation on every run.
SHUFFLE_SEED = 42
tokenized_dataset = tokenized_dataset.shuffle(seed=SHUFFLE_SEED)

**Count of Tokens in the Training Set:**

In [16]:
# Every sub-token carries exactly one label, so summing the label lengths
# yields the number of sub-tokens in the training split.
token_count = sum(len(sample["labels"]) for sample in tokenized_dataset["train"])

print("Tokens in Training Set:", token_count)

Tokens in Training Set: 556022


**Remove unnecessary columns:**

In [17]:
# Drop the raw, pre-tokenization columns; the tokenizer outputs and labels are kept.
tokenized_dataset = tokenized_dataset.remove_columns(["tokens", "ner_tags", "langs", "spans"])

**Save processed Dataset:**

In [18]:
# Persist the processed (tokenized) dataset.
# Fix: this used to dump `dataset`, i.e. the raw untokenized input loaded at the
# top of the notebook, so none of the preprocessing ever reached the output file.
data_path = "./data/dataset_processed.pkl"
with open(data_path, 'wb') as pickle_file:
    pickle.dump(obj=tokenized_dataset, file=pickle_file)