In [283]:
!pip install datasets
!pip install transformers
!pip install seqeval



In [267]:
from collections import defaultdict

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from seqeval.metrics import f1_score

from datasets import load_dataset
from datasets import DatasetDict

from transformers import (
    AutoTokenizer,
    AutoConfig,
    XLMRobertaConfig,
    RobertaModel,
    RobertaPreTrainedModel,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification,
)
from transformers.modeling_outputs import TokenClassifierOutput
from huggingface_hub import notebook_login

In [268]:
# Name of the pretrained checkpoint used for both the tokenizer and the model.
xlmr_model_name = "xlm-roberta-base"

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

### Get multilingual dataset.
- We are going to be using PAN-X, a multilingual dataset for named entity recognition. It contains 14 languages, including English, Spanish, French, German, and Italian. The dataset is available in the `datasets` library.

In [167]:
# Languages to load and the fraction of every split that is kept for each one.
langs = ["de", "fr", "it", "en"]
fracs = [0.629, 0.229, 0.084, 0.059]

# panx_ch[lang][split] -> down-sampled dataset for that language and split.
panx_ch = defaultdict(DatasetDict)
for lang, frac in zip(langs, fracs):
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
    for split_name, split_ds in ds.items():
        n_keep = int(frac * split_ds.num_rows)
        # Shuffle with a fixed seed so the retained subset is reproducible.
        panx_ch[lang][split_name] = split_ds.shuffle(seed=0).select(range(n_keep))

Found cached dataset xtreme (C:/Users/e10115582/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4)
100%|██████████| 3/3 [00:00<00:00, 28.42it/s]
Loading cached shuffled indices for dataset at C:\Users\e10115582\.cache\huggingface\datasets\xtreme\PAN-X.de\1.0.0\29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4\cache-e5ddf09f1ae095ec.arrow
Loading cached shuffled indices for dataset at C:\Users\e10115582\.cache\huggingface\datasets\xtreme\PAN-X.de\1.0.0\29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4\cache-25e7e2dd003d0fa6.arrow
Loading cached shuffled indices for dataset at C:\Users\e10115582\.cache\huggingface\datasets\xtreme\PAN-X.de\1.0.0\29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4\cache-73a95bc0accfea8b.arrow
Found cached dataset xtreme (C:/Users/e10115582/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56

In [168]:
# Quick look at the corpus: sizes per language, one example, the feature
# schema and the tag names.
print("Number of training examples per language:")
print({k: panx_ch[k]["train"].num_rows for k in panx_ch})

print("------------------------------------------------")
print("Example of a training example:")
for k, v in panx_ch["de"]["train"][0].items():
    print(f"{k}: {v}")

# The separator belongs after the loop, not inside it; otherwise it is
# printed once per field of the example.
print("------------------------------------------------")
print("Target labels:")
for k in panx_ch["de"]["train"].features:
    print(f"{k}: {panx_ch['de']['train'].features[k]}")

print("------------------------------------------------")
print("tags:")
# ClassLabel feature of the per-token NER tags; reused later for id <-> name mapping.
tags = panx_ch["de"]["train"].features["ner_tags"].feature
print(tags.names)

Number of training examples per language:
{'de': 12580, 'fr': 4580, 'it': 1680, 'en': 1180}
------------------------------------------------
Example of a training example:
tokens: ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
------------------------------------------------
ner_tags: [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]
------------------------------------------------
langs: ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']
------------------------------------------------
Target labels:
tokens: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags: Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
------------------------------------------------
tags:
['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']


In [169]:
def create_tag_names(batch):
    """Return a ``ner_tags_str`` field holding the names of the integer tags in ``batch["ner_tags"]``."""
    tag_names = []
    for tag_id in batch["ner_tags"]:
        tag_names.append(tags.int2str(tag_id))
    return {"ner_tags_str": tag_names}

# Add a column with the human-readable tag names to the German training split.
panx_de = panx_ch["de"]["train"].map(create_tag_names)

print("------------------------------------------------")
print("Example of a training example:")
first_example = panx_de[0]
for field in first_example:
    print(f"{field}: {first_example[field]}")

Loading cached processed dataset at C:\Users\e10115582\.cache\huggingface\datasets\xtreme\PAN-X.de\1.0.0\29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4\cache-aa23e79447eff40d.arrow


------------------------------------------------
Example of a training example:
tokens: ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
ner_tags: [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]
langs: ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']
ner_tags_str: ['O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'B-LOC', 'B-LOC', 'I-LOC', 'O']


### Define the Tokenizer.
- We are going to be working with XLM-Roberta, a multilingual transformer model. We will use the `AutoTokenizer` class to load the tokenizer for this model.

In [269]:
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

# Tokenize a sample sentence to inspect the sub-word tokens and their ids.
text = "Jack Sparrow loves New York."
tokenized_text = xlmr_tokenizer(text).tokens()
encoded_text = xlmr_tokenizer(text, return_tensors="pt")
input_ids = encoded_text.input_ids.to(device)

for caption, value in (
    ("Original text:", text),
    ("Tokenized text:", tokenized_text),
    ("Input IDs:", input_ids),
):
    print(caption, value)

Original text: Jack Sparrow loves New York.
Tokenized text: ['<s>', '▁Jack', '▁Spar', 'row', '▁love', 's', '▁New', '▁York', '.', '</s>']
Input IDs: tensor([[    0, 21763, 37456, 15555,  5161,     7,  2356,  5753,     5,     2]])


### Define the Model.
- We will define our custom model based on the Roberta model.
- We will initialize the weights with the `xlm-roberta-base` model.
- The output logits will have shape `(batch_size, sequence_length, num_classes)`.

In [270]:
class XLMRobertaForTokenClassifier(RobertaPreTrainedModel):
    """
    RoBERTa encoder followed by dropout and a linear layer that produces one
    score per label for every input token.

    Parameters
    ----------
    config : XLMRobertaConfig
        The configuration used to build the model. ``config.num_labels``,
        ``config.hidden_size`` and ``config.hidden_dropout_prob`` size the
        classification head.
    """

    config_class = XLMRobertaConfig

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        # No pooling layer: the head consumes the per-token hidden states in outputs[0].
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        """
        Run the encoder and the classification head.

        Parameters
        ----------
        input_ids : torch.Tensor, optional
            Token ids passed to the encoder.
        attention_mask : torch.Tensor, optional
            Mask passed to the encoder so padding positions are not attended to.
        token_type_ids : torch.Tensor, optional
            Segment ids passed to the encoder.
        labels : torch.Tensor, optional
            Target label id per token. When given, the cross-entropy loss is
            computed over all positions (``nn.CrossEntropyLoss`` skips targets
            equal to its default ``ignore_index`` of -100).
        **kwargs
            Forwarded unchanged to the encoder.

        Returns
        -------
        TokenClassifierOutput
            ``loss`` (``None`` without labels), ``logits`` and the encoder's
            ``hidden_states`` / ``attentions``.
        """
        # The parameter used to be misspelled "atention_mask". Callers passing
        # attention_mask= then ended up in **kwargs and the encoder received
        # the keyword twice. Keep accepting the old spelling for compatibility.
        legacy_mask = kwargs.pop("atention_mask", None)
        if attention_mask is None:
            attention_mask = legacy_mask

        outputs = self.roberta(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, **kwargs)
        sequence_outputs = self.dropout(outputs[0])
        logits = self.classifier(sequence_outputs)

        # Compute loss
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            # Flatten to (n_tokens, num_labels) vs (n_tokens,) as CrossEntropyLoss expects.
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return TokenClassifierOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )

In [279]:
# Mappings between label ids and tag names, stored in the model config.
index2tag = dict(enumerate(tags.names))
tag2index = {tag: idx for idx, tag in index2tag.items()}

xlmr_config = AutoConfig.from_pretrained(
    xlmr_model_name,
    num_labels=tags.num_classes,
    id2label=index2tag,
    label2id=tag2index,
)

# Load the pretrained encoder weights into the custom class and move it to the device.
xlmr_model = XLMRobertaForTokenClassifier.from_pretrained(xlmr_model_name, config=xlmr_config)
xlmr_model = xlmr_model.to(device)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassifier: ['lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassifier from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassifier from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassifier were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['roberta.embeddings.position_ids', 'classifie

### Get the model's predictions for our simple example.

In [218]:
outputs = xlmr_model(input_ids=input_ids).logits
# The last dimension of the logits holds one score per class for every input token.
print("Length of the input:", input_ids.shape[-1])
print("Number of predictions for each token:", outputs.shape[-1])
print("Shape of the output logits:", outputs.shape)

print("------------------------------------------------")
# Most likely class id per token.
predictions = outputs.argmax(dim=-1)
print("Predictions:", predictions)

# Map the class ids of the first sequence back to tag names and show them next to the tokens.
predictions_tags = [tags.names[class_id] for class_id in predictions[0].tolist()]
pd.DataFrame([tokenized_text, predictions_tags], index=["token", "prediction"])

Length of the input: 10
Number of predictions for each token: 7
Shape of the output logits: torch.Size([2, 10, 7])
------------------------------------------------
Predictions: tensor([[3, 4, 3, 4, 3, 3, 4, 4, 4, 3],
        [3, 4, 3, 4, 3, 3, 4, 4, 4, 3]])


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
token,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,.,</s>
prediction,B-ORG,I-ORG,B-ORG,I-ORG,B-ORG,B-ORG,I-ORG,I-ORG,I-ORG,B-ORG


### Tokenize one sentence of the dataset.
- We will tokenize the first sentence of the dataset.

In [222]:
# Take the first German training sentence: its words and their tag ids.
first_example = panx_de[0]
words = first_example["tokens"]
labels = first_example["ner_tags"]

# The input is already split into words; word_ids() maps each sub-token back to its word.
tokenize_input = xlmr_tokenizer(words, is_split_into_words=True)
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenize_input["input_ids"])
word_ids = tokenize_input.word_ids()

for caption, value in (
    ("Original text:", words),
    ("Tokenized text:", tokens),
    ("Word IDs:", word_ids),
):
    print(caption, value)

Original text: ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
Tokenized text: ['<s>', '▁2.000', '▁Einwohner', 'n', '▁an', '▁der', '▁Dan', 'zi', 'ger', '▁Buch', 't', '▁in', '▁der', '▁polni', 'schen', '▁Wo', 'i', 'wod', 'schaft', '▁Po', 'mmer', 'n', '▁', '.', '</s>']
Word IDs: [None, 0, 1, 1, 2, 3, 4, 4, 4, 5, 5, 6, 7, 8, 8, 9, 9, 9, 9, 10, 10, 10, 11, 11, None]


### Tokenize and encode the whole dataset.

In [234]:
# Tokenize and align labels.
def tokenize_and_align_labels(examples):
    """
    Tokenize a batch of pre-split sentences and align the word-level NER
    labels with the resulting sub-word tokens.

    Only the first sub-token of every word keeps the word's label. Special
    tokens (word id ``None``) and continuation sub-tokens get ``-100`` so they
    can be excluded from the loss and from evaluation.

    Parameters
    ----------
    examples: dict
        A batch with the keys "tokens" (list of word lists) and "ner_tags"
        (list of label-id lists, one label per word).

    Returns
    -------
    tokenized_inputs: dict-like
        The tokenizer output for the batch with an additional "labels" entry:
        one list of label ids per example, of the same length as that
        example's sub-token sequence.
    """
    tokenized_inputs = xlmr_tokenizer(
        examples["tokens"], is_split_into_words=True, truncation=True
    )
    labels = []
    for i, word_labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special token, or a continuation piece of the previous word.
                label_ids.append(-100)
            else:
                # First sub-token of a new word carries the word's label.
                label_ids.append(word_labels[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [238]:
# Encode every split of the German corpus; the raw columns are dropped so only
# the tokenizer outputs and the aligned labels remain.
panx_de_encoded = panx_ch["de"].map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["langs", "tokens", "ner_tags"],
)

Loading cached processed dataset at C:\Users\e10115582\.cache\huggingface\datasets\xtreme\PAN-X.de\1.0.0\29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4\cache-8b730678fd2e19c8.arrow
Loading cached processed dataset at C:\Users\e10115582\.cache\huggingface\datasets\xtreme\PAN-X.de\1.0.0\29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4\cache-585312b80652c464.arrow
Loading cached processed dataset at C:\Users\e10115582\.cache\huggingface\datasets\xtreme\PAN-X.de\1.0.0\29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4\cache-ef15f8c81f788b25.arrow


### Align predictions.

In [262]:
def align_predictions(predictions, labels_ids):
    """
    Convert model outputs and label ids into the list-of-lists of tag strings
    that seqeval expects, dropping every position whose label is -100.

    Parameters
    ----------
    predictions: array-like of shape (batch_size, seq_len, num_labels)
        Per-token class scores; anything ``np.asarray`` accepts.
    labels_ids: array-like of shape (batch_size, seq_len)
        Reference label ids, with -100 marking positions to ignore.

    Returns
    -------
    preds_list: list
        One list of predicted tag names per example.
    labels_list: list
        One list of reference tag names per example.
    """
    # NumPy's keyword is `axis`; `dim` is the torch spelling and raises TypeError here.
    preds = np.argmax(np.asarray(predictions), axis=2)
    labels_ids = np.asarray(labels_ids)

    labels_list, preds_list = [], []
    for example_preds, example_labels in zip(preds, labels_ids):
        keep = example_labels != -100
        # int() so the lookup uses plain Python ints as dictionary keys.
        preds_list.append([index2tag[int(p)] for p in example_preds[keep]])
        labels_list.append([index2tag[int(l)] for l in example_labels[keep]])

    return preds_list, labels_list

### Fine Tuning our NER model.
- We will use the same model as before, but this time we will fine-tune it on the German subset of PAN-X.

In [255]:
# One batch size for training and evaluation; also used to log roughly once per epoch.
batch_size = 24
steps_per_epoch = len(panx_de_encoded["train"]) // batch_size

training_args = TrainingArguments(
    output_dir=f"{xlmr_model_name}-finetuned-panx-de",
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_steps=steps_per_epoch,
    # NOTE(review): presumably chosen large enough that no intermediate checkpoint is written — confirm.
    save_steps=1e6,
    log_level="error",
    disable_tqdm=False,
    push_to_hub=True,
)

In [260]:
# Log in to the Hugging Face Hub; the training arguments set push_to_hub=True
# and the trainer pushes the model after training.
# notebook_login only works inside a notebook (Jupyter or Colab) and needs the
# `ipywidgets` module installed; otherwise it raises ImportError.
notebook_login()

ImportError: The `notebook_login` function can only be used in a notebook (Jupyter or Colab) and you need the `ipywidgets` module: `pip install ipywidgets`.

### Define Trainer.
- We will use the `Trainer` class to train our model.
- We need to pass the `compute_metrics` function to the `Trainer`. This function computes the F1 score of our model on the evaluation set.
- We will work with DataCollatorForTokenClassification, which will automatically pad our inputs and create the attention masks.

In [285]:
def compute_metrics(eval_pred):
    """
    Compute the F1 score using the seqeval library.

    Parameters
    ----------
    eval_pred:
        Object with ``predictions`` and ``label_ids`` attributes, as handed to
        ``compute_metrics`` by the Trainer.

    Returns
    -------
    dict
        ``{"f1": <score>}``.
    """
    y_pred, y_true = align_predictions(eval_pred.predictions, eval_pred.label_ids)
    # seqeval's signature is f1_score(y_true, y_pred): references first.
    return {"f1": f1_score(y_true, y_pred)}

# Collator that builds the batches for the Trainer from the tokenized examples.
data_collator = DataCollatorForTokenClassification(tokenizer=xlmr_tokenizer)

def model_init():
    """Load a fresh XLMRobertaForTokenClassifier from the pretrained checkpoint and move it to ``device``."""
    fresh_model = XLMRobertaForTokenClassifier.from_pretrained(xlmr_model_name, config=xlmr_config)
    return fresh_model.to(device)

In [286]:
# Trainer expects either `model` or `model_init`, not both. Only `model_init`
# is passed so that training starts from a freshly loaded model.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=panx_de_encoded["train"],
    eval_dataset=panx_de_encoded["validation"],
    tokenizer=xlmr_tokenizer,
)

trainer.train()
# Pushing requires being logged in to the Hugging Face Hub (a token must be available).
trainer.push_to_hub(commit_message="Add XLM-RoBERTa finetuned on PAN-X")



OSError: Token is required (`token=True`), but no token found. You need to provide a token or be logged in to Hugging Face with `huggingface-cli login` or `huggingface_hub.login`. See https://huggingface.co/settings/tokens.