# Finetuning RoBERTa for NER: Compile Corpus

***

## Imports

In [20]:
from transformers import (BertTokenizerFast,
                          RobertaTokenizerFast,
                          AutoTokenizer,
                          BertForTokenClassification,
                          RobertaForTokenClassification,
                          DataCollatorForTokenClassification, 
                          AutoModelForTokenClassification, 
                          TrainingArguments, Trainer)
from datasets import load_dataset, concatenate_datasets, DatasetDict
import pickle
import torch
import os

## Load Tokenizer

**Load Tokenizer** (the model itself is not loaded in this notebook):

Information about model variants can be found here: https://huggingface.co/docs/transformers/model_doc/roberta

In [21]:
# Checkpoint whose tokenizer is loaded.
# Alternative noted by the author: "bert-base-multilingual-cased".
model_name = "xlm-roberta-large"

# NOTE(review): add_prefix_space=True is presumably set so that pre-split
# word lists can be tokenized - confirm it is needed for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

# Model loading is intentionally left disabled in this notebook:
#model = AutoModelForTokenClassification.from_pretrained(model_name)

## Download Dataset for Finetuning

See:
* Dataset on Huggingface: https://huggingface.co/datasets/wikiann
* Load Datasets: https://huggingface.co/docs/datasets/v2.4.0/en/package_reference/loading_methods

In [22]:
# WikiANN language configurations that are merged into the combined corpus.
# Alternative selections:
#   ["en", "de", "fr", "es", "zh", "ne"]
#   ["en"]
languages = [
    "en",
    "de",
]
#languages=["ace","af","als","am","an","ang","ar","arc","arz","as","ast","ay","az","ba","bar","bat-smg","be","be-x-old","bg","bh","bn","bo","br","bs","ca","cbk-zam","cdo","ce","ceb","ckb","co","crh","cs","csb","cv","cy","da","de","diq","dv","el","eml","en","eo","es","et","eu","ext","fa","fi","fiu-vro","fo","fr","frr","fur","fy","ga","gan","gd","gl","gn","gu","hak","he","hi","hr","hsb","hu","hy","ia","id","ig","ilo","io","is","it","ja","jbo","jv","ka","kk","km","kn","ko","ksh","ku","ky","la","lb","li","lij","lmo","ln","lt","lv","map-bms","mg","mhr","mi","min","mk","ml","mn","mr","ms","mt","mwl","my","mzn","nap","nds","ne","nl","nn","no","nov","oc","or","os","pa","pdc","pl","pms","pnb","ps","pt","qu","rm","ro","ru","rw","sa","sah","scn","sco","sd","sh","si","simple","sk","sl","so","sq","sr","su","sv","sw","szl","ta","te","tg","th","tk","tl","tr","tt","ug","uk","ur","uz","vec","vep","vi","vls","vo","wa","war","wuu","xmf","yi","yo","zea","zh","zh-classical","zh-min-nan","zh-yue"]

In [23]:
dataset_name = "wikiann"

# `languages` is only read here, never mutated (the previous version popped
# its first element), so re-running this cell always loads the same languages.
if not languages:
    raise ValueError("`languages` must contain at least one language code")

# Collect every language's split first, then concatenate once per split.
split_names = ("train", "validation", "test")
split_parts = {split_name: [] for split_name in split_names}

for language in languages:

    print(f"Download Dataset for Language {language}")

    for split_name in split_names:
        split_parts[split_name].append(
            load_dataset(dataset_name, language, split=split_name)
        )

# Combined splits, in the same language order as `languages`
dataset_train = concatenate_datasets(split_parts["train"])
dataset_valid = concatenate_datasets(split_parts["validation"])
dataset_test = concatenate_datasets(split_parts["test"])

Found cached dataset wikiann (/home/pop529700/.cache/huggingface/datasets/wikiann/en/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e)
Found cached dataset wikiann (/home/pop529700/.cache/huggingface/datasets/wikiann/en/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e)
Found cached dataset wikiann (/home/pop529700/.cache/huggingface/datasets/wikiann/en/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e)


Download Dataset for Language de


Found cached dataset wikiann (/home/pop529700/.cache/huggingface/datasets/wikiann/de/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e)
Found cached dataset wikiann (/home/pop529700/.cache/huggingface/datasets/wikiann/de/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e)
Found cached dataset wikiann (/home/pop529700/.cache/huggingface/datasets/wikiann/de/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e)


**Limit Dataset Size for Testing:**

In [24]:
# Optional cap on the number of examples kept per split, for quick test runs.
# Leave at None to use the full dataset; set e.g. 1000 for a small subset.
num_samples = None

if num_samples is not None:
    # Keep the first `num_samples` rows of each split, clamped to the split
    # size so that a cap larger than a split does not raise an index error.
    dataset_train = dataset_train.select(range(min(num_samples, len(dataset_train))))
    dataset_valid = dataset_valid.select(range(min(num_samples, len(dataset_valid))))
    dataset_test = dataset_test.select(range(min(num_samples, len(dataset_test))))

print("Training Examples:", len(dataset_train))

Training Examples: 40000


In [25]:
# Bundle the three merged splits under their conventional split names.
dataset = DatasetDict(
    train=dataset_train,
    test=dataset_test,
    validation=dataset_valid,
)

**Save combined Dataset:**

In [26]:
data_path = "./data/dataset_multilingual.pkl"

# Make sure the target directory exists; open() would otherwise fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(data_path), exist_ok=True)

# Serialize the combined DatasetDict to a single pickle file.
# NOTE(review): pickle files must only be loaded from trusted sources.
with open(data_path, "wb") as pickle_file:
    pickle.dump(obj=dataset, file=pickle_file)

### About the Dataset:

**Splits:**

In [27]:
# Display the combined DatasetDict (its splits, features and row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 40000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 20000
    })
})

**Training Examples:**

In [28]:
# Report the object type and the number of rows of the training split.
train_split = dataset["train"]
print(f"Dataset Object Type: {type(train_split)}")
print(f"Training Examples: {len(train_split)}")

Dataset Object Type: <class 'datasets.arrow_dataset.Dataset'>
Training Examples: 40000


**Sample Structure:**

In [29]:
# Show one training example (row 95) to illustrate the record structure.
dataset["train"][95]

{'tokens': ['Bruce', 'Beresford', '(', 'Non-Jew', ')'],
 'ner_tags': [1, 2, 0, 0, 0],
 'langs': ['en', 'en', 'en', 'en', 'en'],
 'spans': ['PER: Bruce Beresford']}

**Class Labels:**

In [30]:
# Read the class-label names declared on the `ner_tags` feature.
ner_tags_feature = dataset["train"].features["ner_tags"]
label_list = ner_tags_feature.feature.names
print(label_list)

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
