# The Dataset

In [6]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [1]:
from datasets import load_dataset, DatasetDict
from collections import defaultdict

langs = ["de","fr",'it','en']
fracs = [.629, .229,.084,.059]

panx_ch = defaultdict(DatasetDict)

In [2]:
for lang, frac in zip(langs, fracs):
    #Load monolingual corpus
    ds = load_dataset("xtreme", name = f"PAN-X.{lang}")
    # Shuffle and downsample each split according to spoken proportion
    for split in ds:
        panx_ch[lang][split] = (
            ds[split].shuffle(seed= 0).select(range(int(frac * ds[split].num_rows)))
        )
        

Found cached dataset xtreme (/home/leepark/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/leepark/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-e5ddf09f1ae095ec.arrow
Loading cached shuffled indices for dataset at /home/leepark/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-25e7e2dd003d0fa6.arrow
Loading cached shuffled indices for dataset at /home/leepark/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-73a95bc0accfea8b.arrow
Found cached dataset xtreme (/home/leepark/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/leepark/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-6ff29513007ec78b.arrow
Loading cached shuffled indices for dataset at /home/leepark/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-c5c9a4fc19dfd7d6.arrow
Loading cached shuffled indices for dataset at /home/leepark/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-9711ab25936b81b7.arrow
Found cached dataset xtreme (/home/leepark/.cache/huggingface/datasets/xtreme/PAN-X.it/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/leepark/.cache/huggingface/datasets/xtreme/PAN-X.it/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-daa9a1770078307c.arrow
Loading cached shuffled indices for dataset at /home/leepark/.cache/huggingface/datasets/xtreme/PAN-X.it/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-5e244c05031bab3c.arrow
Loading cached shuffled indices for dataset at /home/leepark/.cache/huggingface/datasets/xtreme/PAN-X.it/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-497ee15c12bff58d.arrow
Found cached dataset xtreme (/home/leepark/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/leepark/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-757845faa9fa6949.arrow
Loading cached shuffled indices for dataset at /home/leepark/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-305cefc7ffa49fd9.arrow
Loading cached shuffled indices for dataset at /home/leepark/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-e5ec5e6ba7c1237d.arrow


Checking if the data are successfully collected

In [3]:
import pandas as pd

pd.DataFrame({lang: [panx_ch[lang]['train'].num_rows] for lang in langs}, index = ["Number of training examples"])

Unnamed: 0,de,fr,it,en
Number of training examples,12580,4580,1680,1180


By design, we have more examples in German than all other languages combined, so we'll use it as a starting point from which to perform **zero-shot cross-lingual transfer** to French, Italian, and English. Let's inspect one of the examples in the German corpus:

In [4]:
element = panx_ch['de']['train'][0]
for key, value in element.items():
    print(f"{key}: {value}")


tokens: ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
ner_tags: [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]
langs: ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']


As with out previous encounters with **Dataset** objects, the keys of our example correspond to the column names of an `Arrow` table, while the values denote the entries in each column.<br>
In particular, we see that the **ner_tags** column corresponds to the mapping of each entity to a class ID. This is a bit cryptic to the human eye, so let's create a new column with the familiar **LOC**, **PER**, and **ORG** tags.<br>
To do this, the first thing to notice is that our **Dataset** object has a features attribute that specifies the underlying data types associated with each column

In [5]:
for key, value in panx_ch['de']['train'].features.items():
    print(f"{key}:{value}")

tokens:Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags:Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs:Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)


We can use **ClassLabel.int2str()** method that we encountered in Chapter 2 to create a new column in our training set with class names for each tag. We'll use the **map()** method to return a *dict* with the key corresponding to the new column name and the value as a *list* of class names:

In [6]:
tags = panx_ch['de']['train'].features['ner_tags'].feature
tags

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)

In [7]:
def create_tab_names(batch):
    return {"ner_tags_str":[tags.int2str(idx) for idx in batch['ner_tags']]}

panx_de = panx_ch['de'].map(create_tab_names)

Loading cached processed dataset at /home/leepark/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-441470a51f73641d.arrow
Loading cached processed dataset at /home/leepark/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-cbdba25c4df6cec4.arrow
Loading cached processed dataset at /home/leepark/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-5955cb68d6fdd056.arrow


In [8]:
de_example = panx_de['train'][0]
pd.DataFrame([de_example['tokens'], de_example['ner_tags_str']], index = ['Tokens','Tags'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Tokens,2.000,Einwohnern,an,der,Danziger,Bucht,in,der,polnischen,Woiwodschaft,Pommern,.
Tags,O,O,O,O,B-LOC,I-LOC,O,O,B-LOC,B-LOC,I-LOC,O


As a quick check that we don't have any unusual imbalance in the tags, let's calculate the frequencies of each entity across each split:

In [9]:
from collections import Counter

split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
    for row in dataset['ner_tags_str']:
        for tag in row:
            if tag.startswith("B"):
                tag_type = tag.split("-")[1]
                split2freqs[split][tag_type] += 1
pd.DataFrame.from_dict(split2freqs, orient = "index")

Unnamed: 0,LOC,ORG,PER
train,6186,5366,5810
validation,3172,2683,2893
test,3180,2573,3071


# Multilingual Transformers

Multilingual transformers involve similar architectures and training procedures as their monolingual counterparts, except that the corpus used for pretraining consists of documents in many languages. A remarkable feature of this approach is that despite receiving no explicit information to differentiate among the languages, the resulting linguistic representations are able to generalize well across languages, the resulting linguistic representations are able to generalize well across languages for a variety of downstream tasks. In some cases, this ability to perform cross-lingual transfer can produce results that are competitive with those of monolingual models, which circumvents the need to train one model per language!

To measure the progress of cross-lingual transfer for NER, the CoNLL-2002 and CoNLL-2003 datasets are often used as a benchmark for English, Dutch, Spanish and German. This benchmark consists of news articles annotated with the same **LOC**, **PER**, and **ORG** categories as PAN-X, but it contains an additional **MISC** label for miscellaneous entities that do not belong to the previous three groups. Multiligual transformer models are usually evaluated in three different ways:

* en: Fine-une on the English traiing data and then evaluate on each language's test set
* each: Fine-tune and evaluate on monolingual test data to measure per-language performance
* all: Fine-tune on all the training data to evaluate on all on each language's test set

We consider **XLM-RoBERTa** model in this exercise. The RoBERTa part of the model's name refers to the fact that the pretraining approach is the same as for the monolingual RoBERTa models. RoBERTa's developers improved on several aspects of BERT, in particular by **removing the next sentence prediction task altogether**. XLM-R also **drops the language embeddings used in XLM** and uses **SentencePiece** to tokenize the raw texts directly. Besides its multilingual nature, a notable difference between XLM-R and RoBERTa is the size of the respective vocabularies: **250,000** tokens versus **55,000**.

# A Closer Look at Tokenization

Instead of using a WordPiece tokenizer as in BERT, XLM-R uses a tokenizer called ***SentencePiece*** that is trained on the raw text of all one hundred languages. To get a feel for how SentencePiece compares to WordPiece, let's load the BERT and XLM-R tokenizers in the usual way with huggingface Transformers

In [10]:
from transformers import AutoTokenizer

bert_model_name = 'bert-base-cased'; xlmr_model_name = 'xlm-roberta-base'
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

In [11]:
text = "Jack Sparrow loves New York!"
bert_tokens = bert_tokenizer(text).tokens()
xlmr_tokens = xlmr_tokenizer(text).tokens()

In [12]:
pd.DataFrame([bert_tokens, xlmr_tokens], ['BERT','XLM-R'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
BERT,[CLS],Jack,Spa,##rrow,loves,New,York,!,[SEP],
XLM-R,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>


# Creating a Custom Model for Token Classification

In [13]:
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

In [14]:
class XMLRobertaForTokenClassification(RobertaPreTrainedModel):
    config_class = XLMRobertaConfig # it ensures that the standard XML-R settings are used when we initialize a new model
    
    def __init__(self, config):
        super().__init__(config) # loading of pretrained weights
        self.num_labels = config.num_labels
        # Load model body
        self.roberta = RobertaModel(config, add_pooling_layer = False)
        # Set up token classification head
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Load and initialize weights
        self.init_weights()
        
    def forward(self, input_ids = None, attention_mask = None, token_type_ids = None, labels = None, **kwargs):
        # Use model body to get encoder representation
        outputs = self.roberta(input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids, **kwargs)
#         Apply classifier to encoder representation
        sequence_output = self.dropout(outputs[0]) #<s>
        logits = self.classifier(sequence_output) # logits is the output before softmax function
#         Calculate Loss
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.veiw(-1))
#         Return model output object
        return TokenClassifierOutput(loss = loss, logits = logits, hidden_states = outputs.hidden_states, attentions = outputs.attentions)

### Loading a Custom Model

In [15]:
index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tags2index = {tag: idx for idx, tag in enumerate(tags.names)}

In [16]:
from transformers import AutoConfig

xmlr_config = AutoConfig.from_pretrained(xlmr_model_name,
                                        num_labels = tags.num_classes,
                                        id2label = index2tag,
                                        label2id = tags2index)

In [17]:
import torch

device = torch.device("cpu")
xlmr_model = XMLRobertaForTokenClassification.from_pretrained(xlmr_model_name, config = xmlr_config).to(device)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XMLRobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XMLRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XMLRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XMLRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['roberta.embeddings.position_

In [18]:
inputs = xlmr_tokenizer(text, return_tensors = 'pt')
pd.DataFrame([xlmr_tokens, inputs.input_ids[0].numpy()], index = ['Tokens','Input IDs'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
Input IDs,0,21763,37456,15555,5161,7,2356,5753,38,2


In [19]:
outputs = xlmr_model(inputs.input_ids.to(device)).logits
predictions = torch.argmax(outputs, dim = -1)

In [20]:
len(xlmr_tokens), outputs.size()

(10, torch.Size([1, 10, 7]))

In [21]:
preds = [tags.names[p] for p in predictions[0].numpy()]
pd.DataFrame([xlmr_tokens, preds], index = ['Tokens','Tags'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
Tags,B-ORG,B-ORG,B-ORG,B-ORG,B-ORG,B-ORG,B-ORG,B-ORG,B-ORG,B-ORG


In [22]:
def tag_text(text, tags, model, tokenizer):
    # Get tokens with special characters
    tokens = tokenizer(text).tokens()
    # Encode the sequence into IDs
    input_ids = xlmr_tokenizer(text, return_tensors = 'pt').input_ids.to(device)
    # Get predictions as distribution over 7 possible classes
    outputs = model(input_ids)[0]
    # Take argmax to get most likely class per token
    predictions = torch.argmax(outputs, dim = 2)
    # Convert to DataFrame
    preds = [tags.names[p] for p in predictions[0].numpy()]
    return pd.DataFrame([tokens, preds], index = ['Tokens','Tags'])

In [23]:
tag_text(text, tags, xlmr_model, xlmr_tokenizer)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
Tags,B-ORG,B-ORG,B-ORG,B-ORG,B-ORG,B-ORG,B-ORG,B-ORG,B-ORG,B-ORG


### Tokenizing the whole dataset

Note that HuggingFace Datasets provides a fast way to tokenize a Dataset object with the map() operation. To achieve this, recall that we first need to define a function with the minimal signature:
<code>
> function(examples: Dict[str, List]) -> Dict[str, List]
</code>

In [24]:
inputs.word_ids(0)

[None, 0, 1, 1, 2, 2, 3, 4, 4, None]

In [25]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = xlmr_tokenizer(examples['tokens'], truncation = True,
                                     is_split_into_words = True)
    labels = []
    for idx, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index = idx)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs['labels'] = labels
    return tokenized_inputs

In [26]:
def encode_panx_dataset(corpus):
    return corpus.map(tokenize_and_align_labels, batched = True, remove_columns = ['langs','ner_tags','tokens'])

In [27]:
panx_de_encoded = encode_panx_dataset(panx_ch['de'])

Loading cached processed dataset at /home/leepark/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-5144036351e4ce58.arrow


Map:   0%|          | 0/6290 [00:00<?, ? examples/s]

Loading cached processed dataset at /home/leepark/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-7698d8a93416f954.arrow


### Performance Measure

In [30]:
import numpy as np

def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis = 2)
    batch_size, seq_len = preds.shape
    labels_list, preds_list = [], []
    
    for batch_idx in range(batch_size):
        example_labels, example_preds = [], []
        for seq_idx  in range(seq_len):
            # Ignore label IDs = -100
            if label_ids[batch_idx, seq_idx] != -100:
                example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
                example_preds.append(index2tag[preds[batch_idx][seq_idx]])
        labels_list.append(example_labels)
        preds_list.append(example_preds)
    return preds_list, label_list

# Fine-Tuining XLM-RoBERTa

In [32]:
from transformers import TrainingArguments

num_epochs = 3
batch_size = 24
logging_steps = len(panx_de_encoded['train']) // batch_size
model_name = f"{xlmr_model_name}-finetuned-panx-de"

training_args = TrainingArguments(
    output_dir = model_name,
    log_level = "error",
    num_train_epochs = num_epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size  = batch_size,
    evaluation_strategy         = 'epoch',
    save_steps = 1e6,
    weight_decay = 0.01,
    disable_tqdm = False,
    logging_steps = logging_steps,
    push_to_hub = False
)

In [34]:
from seqeval.metrics import f1_score

def compute_metrics(eval_pred):
    y_pred, y_true = align_predictions(eval_pred.predictions, eval_pred.lable_ids)
    return {"f1": f1_score(y_true, y_pred)}

In [35]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

In [36]:
def model_init():
    return (XMLRobertaForTokenClassification.from_pretrained(xlmr_model_name, config = xlmr_config).to(device))

In [37]:
from transformers import Trainer

trainer = Trainer(model_init = model_init, args = training_args, 
                 data_collator = data_collator, compute_metrics = compute_metrics,
                 train_dataset = panx_de_encoded['train'])

NameError: name 'XLMRobertaForTokenClassification' is not defined