# Finetuning RoBERTa for NER: Compile Corpus

***

## Imports

In [1]:
from transformers import (BertTokenizerFast,
                          RobertaTokenizerFast,
                          AutoTokenizer,
                          BertForTokenClassification,
                          RobertaForTokenClassification,
                          DataCollatorForTokenClassification, 
                          AutoModelForTokenClassification, 
                          TrainingArguments, Trainer)
from datasets import load_dataset, concatenate_datasets, DatasetDict
import pickle
import torch
import os

## Load Tokenizer

**Load Tokenizer** (loading the model itself is currently commented out in the cell below):

Information about model variants can be found here: https://huggingface.co/docs/transformers/model_doc/roberta

In [2]:
# Checkpoint whose tokenizer is loaded here.
# Alternative tried before: "bert-base-multilingual-cased"
model_name = "xlm-roberta-large"

# NOTE(review): add_prefix_space is relevant for byte-level BPE tokenizers
# (e.g. RoBERTa) when feeding pre-split words -- confirm it is needed for
# this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

# Model loading is intentionally disabled in this notebook (corpus compilation only):
#model = AutoModelForTokenClassification.from_pretrained(model_name)

## Download Dataset for Finetuning

See:
* Dataset on Huggingface: https://huggingface.co/datasets/wikiann
* Load Datasets: https://huggingface.co/docs/datasets/v2.4.0/en/package_reference/loading_methods

In [3]:
# Languages (dataset configuration names) to merge into one corpus.
# Other selections used previously: ["en", "de", "fr", "es", "zh"] or ["en"].
languages = ["en", "de"]
dataset_name = "wikiann"

# Collect the per-language datasets for each split.
# The `languages` list is NOT mutated, so this cell can safely be re-run
# (the earlier `languages.pop(0)` dropped one language on every re-run).
splits_by_name = {"train": [], "validation": [], "test": []}

for language in languages:

    print(f"Download Dataset for Language {language}")

    # Load train / validation / test for this language
    for split_name, parts in splits_by_name.items():
        parts.append(load_dataset(dataset_name, language, split=split_name))

# Merge all languages into one dataset per split (row order follows `languages`)
dataset_train = concatenate_datasets(splits_by_name["train"])
dataset_valid = concatenate_datasets(splits_by_name["validation"])
dataset_test = concatenate_datasets(splits_by_name["test"])

Downloading builder script:   0%|          | 0.00/3.94k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/12.6k [00:00<?, ?B/s]

Downloading and preparing dataset wikiann/en (download: 223.17 MiB, generated: 8.88 MiB, post-processed: Unknown size, total: 232.05 MiB) to C:\Users\julia\.cache\huggingface\datasets\wikiann\en\1.1.0\4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e...


Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Dataset wikiann downloaded and prepared to C:\Users\julia\.cache\huggingface\datasets\wikiann\en\1.1.0\4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e. Subsequent calls will reuse this data.


Reusing dataset wikiann (C:\Users\julia\.cache\huggingface\datasets\wikiann\en\1.1.0\4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e)
Reusing dataset wikiann (C:\Users\julia\.cache\huggingface\datasets\wikiann\en\1.1.0\4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e)


Download Dataset for Language de
Downloading and preparing dataset wikiann/de (download: 223.17 MiB, generated: 10.51 MiB, post-processed: Unknown size, total: 233.67 MiB) to C:\Users\julia\.cache\huggingface\datasets\wikiann\de\1.1.0\4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e...


Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Dataset wikiann downloaded and prepared to C:\Users\julia\.cache\huggingface\datasets\wikiann\de\1.1.0\4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e. Subsequent calls will reuse this data.


Reusing dataset wikiann (C:\Users\julia\.cache\huggingface\datasets\wikiann\de\1.1.0\4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e)
Reusing dataset wikiann (C:\Users\julia\.cache\huggingface\datasets\wikiann\de\1.1.0\4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e)


In [4]:
# Bundle the merged splits into a single DatasetDict.
# Key order (train, test, validation) is kept as before.
merged_splits = {
    "train": dataset_train,
    "test": dataset_test,
    "validation": dataset_valid,
}
dataset = DatasetDict(merged_splits)

**Limit Dataset Size for Testing:**

In [5]:
# Optional: keep only the first `num_samples` rows of every split for quick
# test runs. Leave `num_samples` as None (default) to use the full dataset;
# in that case this cell does nothing.
num_samples = None  # e.g. 1000

if num_samples is not None:
    # Reduce the splits inside `dataset` itself, because `dataset` is what gets
    # saved below (shrinking only dataset_train/valid/test would have no effect
    # on the already-built DatasetDict).
    dataset = DatasetDict({
        split_name: split.select(range(min(num_samples, len(split))))
        for split_name, split in dataset.items()
    })

    # Keep the standalone split variables in sync with the reduced DatasetDict
    dataset_train = dataset["train"]
    dataset_valid = dataset["validation"]
    dataset_test = dataset["test"]

    print("Training Examples:", len(dataset["train"]))

**Save combined Dataset:**

In [6]:
# Persist the combined DatasetDict as a pickle file.
data_path = "./data/dataset_multilingual.pkl"

# Make sure the target folder exists; open(..., 'wb') raises
# FileNotFoundError if "./data" has not been created yet.
os.makedirs(os.path.dirname(data_path), exist_ok=True)

# NOTE: pickle files must only be loaded again from a trusted location.
with open(data_path, 'wb') as pickle_file:
    pickle.dump(obj=dataset, file=pickle_file)

### About the Dataset:

**Splits:**

In [7]:
# Display the combined DatasetDict (splits with their features and row counts)
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 40000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 20000
    })
})

**Training Examples:**

In [8]:
# Report the object type and the size of the training split
train_split = dataset["train"]
print("Dataset Object Type:", type(train_split))
print("Training Examples:", len(train_split))

Dataset Object Type: <class 'datasets.arrow_dataset.Dataset'>
Training Examples: 40000


**Sample Structure:**

In [9]:
# Inspect a single training record (index 95) to see the fields of one example
dataset["train"][95]

{'tokens': ['Bruce', 'Beresford', '(', 'Non-Jew', ')'],
 'ner_tags': [1, 2, 0, 0, 0],
 'langs': ['en', 'en', 'en', 'en', 'en'],
 'spans': ['PER: Bruce Beresford']}

**Class Labels:**

In [10]:
# Read the tag names from the feature metadata of the "ner_tags" column;
# the list position of each name corresponds to the integer id in `ner_tags`.
ner_tag_feature = dataset["train"].features["ner_tags"]
label_list = ner_tag_feature.feature.names
print(label_list)

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
