<a href="https://colab.research.google.com/github/juanprida/nlp_with_transformers/blob/main/04_multilingual_named_entity_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets
!pip install transformers
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.10.0-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[

In [2]:
from collections import defaultdict

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from seqeval.metrics import f1_score

from datasets import load_dataset
from datasets import DatasetDict

from transformers import (
    AutoTokenizer,
    AutoConfig,
    XLMRobertaConfig,
    RobertaModel,
    RobertaPreTrainedModel,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification,
)
from transformers.modeling_outputs import TokenClassifierOutput
from huggingface_hub import notebook_login

In [3]:
# Checkpoint name shared by the tokenizer, the config and the model.
xlmr_model_name = "xlm-roberta-base"

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

### Get multilingual dataset.
- We will use PAN-X, a multilingual named entity recognition dataset that is part of the XTREME benchmark. It covers 40 languages, including English, Spanish, French, German, and Italian, and is available through the `datasets` library.

In [4]:
# Language codes to load and, position by position, the fraction of each
# language's data to keep.
langs = ["de", "fr", "it", "en"]
fracs = [0.629, 0.229, 0.084, 0.059]

# panx_ch maps language code -> DatasetDict of down-sampled splits.
panx_ch = defaultdict(DatasetDict)
for lang, frac in zip(langs, fracs):
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
    for split in ds:
        # Shuffle with a fixed seed, then keep the first `frac` share of the rows
        # of every split.
        panx_ch[lang][split] = (
            ds[split].shuffle(seed=0).select(range(int(frac * ds[split].num_rows)))
        )

Downloading builder script:   0%|          | 0.00/37.5k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/593k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/105k [00:00<?, ?B/s]

Downloading and preparing dataset xtreme/PAN-X.de to /root/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4...


Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset xtreme downloaded and prepared to /root/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset xtreme/PAN-X.fr to /root/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4...


Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset xtreme downloaded and prepared to /root/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset xtreme/PAN-X.it to /root/.cache/huggingface/datasets/xtreme/PAN-X.it/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4...


Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset xtreme downloaded and prepared to /root/.cache/huggingface/datasets/xtreme/PAN-X.it/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset xtreme/PAN-X.en to /root/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4...


Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset xtreme downloaded and prepared to /root/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Size of the sampled training split for every language.
print("Number of training examples per language:")
print({k: panx_ch[k]["train"].num_rows for k in panx_ch})

# Every field of the first German training example.
print("------------------------------------------------")
print("Example of a training example:")
for k, v in panx_ch["de"]["train"][0].items():
    print(f"{k}: {v}")

# The separator delimits sections, so it is printed once after the loop
# instead of after every field of the example.
print("------------------------------------------------")
print("Target labels:")
for k in panx_ch["de"]["train"].features:
    print(f"{k}: {panx_ch['de']['train'].features[k]}")

# `tags` is the label feature of the NER column; later cells use it to convert
# between integer ids and tag names.
print("------------------------------------------------")
print("tags:")
tags = panx_ch["de"]["train"].features["ner_tags"].feature
print(tags.names)

Number of training examples per language:
{'de': 12580, 'fr': 4580, 'it': 1680, 'en': 1180}
------------------------------------------------
Example of a training example:
tokens: ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
------------------------------------------------
ner_tags: [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]
------------------------------------------------
langs: ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']
------------------------------------------------
Target labels:
tokens: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags: Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
------------------------------------------------
tags:
['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']


In [6]:
def create_tag_names(batch):
    """Build the ``ner_tags_str`` column from the integer NER tags.

    Reads ``batch["ner_tags"]`` and converts every entry with ``tags.int2str``.

    Returns
    -------
    dict
        ``{"ner_tags_str": [...]}`` with one converted entry per input tag.
    """
    tag_names = []
    for idx in batch["ner_tags"]:
        tag_names.append(tags.int2str(idx))
    return {"ner_tags_str": tag_names}

# Create a new column with the tag names
# (panx_de is the German training split plus the extra "ner_tags_str" column).
panx_de = panx_ch["de"]["train"].map(create_tag_names)

# Show every field of the first example, including the new column.
print("------------------------------------------------")
print("Example of a training example:")
for k, v in panx_de[0].items():
    print(f"{k}: {v}")

Map:   0%|          | 0/12580 [00:00<?, ? examples/s]

------------------------------------------------
Example of a training example:
tokens: ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
ner_tags: [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]
langs: ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']
ner_tags_str: ['O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'B-LOC', 'B-LOC', 'I-LOC', 'O']


### Define the Tokenizer.
- We will work with XLM-RoBERTa, a multilingual transformer model, and use the `AutoTokenizer` class to load its tokenizer.

In [7]:
# Load the tokenizer that belongs to the checkpoint named in xlmr_model_name.
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

# Let's tokenize a text as an example.
text = "Jack Sparrow loves New York."
# Token strings, kept for display next to the model's predictions later on.
tokenized_text = xlmr_tokenizer(text).tokens()
# The same text as a tensor of ids, moved to `device` so it can be fed to the model.
input_ids = xlmr_tokenizer(text, return_tensors="pt").input_ids.to(device)

print("Original text:", text)
print("Tokenized text:", tokenized_text)
print("Input IDs:", input_ids)

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Original text: Jack Sparrow loves New York.
Tokenized text: ['<s>', '▁Jack', '▁Spar', 'row', '▁love', 's', '▁New', '▁York', '.', '</s>']
Input IDs: tensor([[    0, 21763, 37456, 15555,  5161,     7,  2356,  5753,     5,     2]],
       device='cuda:0')


### Define the Model.
- We will define our custom model based on the Roberta model.
- We will initialize the weights with the `xlm-roberta-base` model.
- The shape of the output will be (batch_size, sequence_length, num_classes).

In [8]:
class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    """RoBERTa encoder followed by dropout and a linear per-token classifier.

    The encoder is built without its pooling layer. The classifier maps every
    position's hidden state to ``config.num_labels`` logits.
    """

    config_class = XLMRobertaConfig

    def __init__(self, config):
        """Create the encoder body and the classification head from `config`."""
        super().__init__(config)
        self.num_labels = config.num_labels
        # Encoder body (no pooling layer).
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        # Classification head: dropout, then one linear layer per token.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Initialise the weights of the modules created above.
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                labels=None, **kwargs):
        """Run the encoder and the head; also compute the loss when `labels` is given.

        Returns
        -------
        TokenClassifierOutput
            ``logits`` from the classifier, ``loss`` (``None`` without labels),
            and the encoder's ``hidden_states`` and ``attentions``.
        """
        encoder_outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )
        # First element of the encoder output is the per-token representation.
        token_states = self.dropout(encoder_outputs[0])
        logits = self.classifier(token_states)

        if labels is None:
            loss = None
        else:
            # Flatten to (tokens, classes) vs. (tokens,) for cross-entropy.
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = labels.view(-1)
            loss = nn.functional.cross_entropy(flat_logits, flat_labels)

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

In [10]:
# Two-way lookup between integer class ids and tag names, in the order of tags.names.
index2tag = {i: tag for i, tag in enumerate(tags.names)}
tag2index = {tag: i for i, tag in enumerate(tags.names)}

# Start from the checkpoint's config and attach the NER label set to it.
xlmr_config = AutoConfig.from_pretrained(
    xlmr_model_name, num_labels=tags.num_classes, id2label=index2tag, label2id=tag2index
)

# Build the custom token classifier from the checkpoint and move it to `device`.
xlmr_model = XLMRobertaForTokenClassification.from_pretrained(xlmr_model_name, config=xlmr_config).to(device)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.weight', 'classif

### Get the model's predictions for our simple example.

In [11]:
# `outputs` holds the logits of the forward pass on the example sentence.
outputs = xlmr_model(input_ids=input_ids).logits
# We should have n_predictions for each token in the input where n_predictions is the number of classes.
print("Length of the input:", input_ids.shape[-1])
print("Number of predictions for each token:", outputs.shape[-1])
print("Shape of the output logits:", outputs.shape)

print("------------------------------------------------")
# Highest-scoring class id for every token.
predictions = torch.argmax(outputs, dim=-1)
print("Predictions:", predictions)

# Convert the ids of the single example to tag names and show them under their tokens.
predictions_tags = [tags.names[p] for p in predictions[0].cpu().numpy()]
pd.DataFrame([tokenized_text, predictions_tags], index=["token", "prediction"])

Length of the input: 10
Number of predictions for each token: 7
Shape of the output logits: torch.Size([1, 10, 7])
------------------------------------------------
Predictions: tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 0]], device='cuda:0')


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
token,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,.,</s>
prediction,O,B-PER,B-PER,B-PER,B-PER,B-PER,B-PER,B-PER,B-PER,O


### Tokenize one sentence of the dataset.
- We will tokenize the first sentence of the dataset.

In [12]:
# Words and integer tags of the first German training example.
words, labels = panx_de[0]["tokens"], panx_de[0]["ner_tags"]
# The input is already split into words, so the tokenizer is told not to split it again.
tokenize_input = xlmr_tokenizer(words, is_split_into_words=True)
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenize_input["input_ids"])
# word_ids() gives, for every token, the index of the word it came from
# (None for the special tokens, as the printed output shows).
word_ids = tokenize_input.word_ids()
print("Original text:", words)
print("Tokenized text:", tokens)
print("Word IDs:", word_ids)

Original text: ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
Tokenized text: ['<s>', '▁2.000', '▁Einwohner', 'n', '▁an', '▁der', '▁Dan', 'zi', 'ger', '▁Buch', 't', '▁in', '▁der', '▁polni', 'schen', '▁Wo', 'i', 'wod', 'schaft', '▁Po', 'mmer', 'n', '▁', '.', '</s>']
Word IDs: [None, 0, 1, 1, 2, 3, 4, 4, 4, 5, 5, 6, 7, 8, 8, 9, 9, 9, 9, 10, 10, 10, 11, 11, None]


### Tokenize and encode the whole dataset.

In [13]:
# Tokenize and align labels.
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """
    Tokenize pre-split words and align the word-level NER tags with the tokens.

    NOTE: a later cell defines another function with this same name. Whichever
    cell ran last is the one ``encode_panx_dataset`` picks up.

    Parameters
    ----------
    examples: dict
        A batch with the keys "tokens" (one list of words per example) and
        "ner_tags" (one list of integer tags per example).
    label_all_tokens: bool, default True
        If True, every token of a word receives that word's tag. If False, only
        the first token of each word is labelled and the word's remaining tokens
        get -100.

    Returns
    -------
    tokenized_inputs:
        The tokenizer output with an added "labels" entry: one list of label ids
        per example, aligned with that example's tokens. Tokens that belong to
        no word (word id ``None``) are always labelled -100, the value that
        ``align_predictions`` filters out.
    """
    tokenized_inputs = xlmr_tokenizer(
        examples["tokens"], is_split_into_words=True, truncation=True
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Token without a source word (e.g. a special token).
                label_ids.append(-100)
            elif label_all_tokens or word_idx != previous_word_idx:
                # First token of a word, or any token when labelling all of them.
                label_ids.append(label[word_idx])
            else:
                # Continuation token of the previous word: masked out.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and label only the first token of every word.

    Reads ``examples["tokens"]`` and ``examples["ner_tags"]``. Tokens without a
    source word and tokens that continue the previous word are labelled -100;
    the first token of each word gets that word's tag.

    Returns the tokenizer output with the aligned ids stored under "labels".
    """
    tokenized_inputs = xlmr_tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    aligned_batch = []
    for batch_pos, word_tags in enumerate(examples["ner_tags"]):
        aligned = []
        last_word = None
        for current_word in tokenized_inputs.word_ids(batch_index=batch_pos):
            starts_new_word = current_word is not None and current_word != last_word
            aligned.append(word_tags[current_word] if starts_new_word else -100)
            last_word = current_word
        aligned_batch.append(aligned)
    tokenized_inputs["labels"] = aligned_batch
    return tokenized_inputs

In [14]:
def encode_panx_dataset(corpus):
    """Tokenize `corpus` and attach aligned labels, dropping the raw columns.

    Applies ``tokenize_and_align_labels`` through ``corpus.map`` in batched mode
    and removes the 'langs', 'ner_tags' and 'tokens' columns from the result.
    """
    raw_columns = ['langs', 'ner_tags', 'tokens']
    encoded = corpus.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=raw_columns,
    )
    return encoded

In [15]:
# hide_output
# Encode the German corpus; panx_ch["de"] holds one dataset per split, so every
# split is tokenized and labelled.
panx_de_encoded = encode_panx_dataset(panx_ch["de"])

Map:   0%|          | 0/12580 [00:00<?, ? examples/s]

Map:   0%|          | 0/6290 [00:00<?, ? examples/s]

Map:   0%|          | 0/6290 [00:00<?, ? examples/s]

### Align predictions.

In [25]:
def align_predictions(predictions, labels_ids):
    """
    Turn model scores and label ids into the list-of-lists-of-strings form seqeval takes.

    Positions whose label id is -100 are dropped from both outputs.

    Parameters
    ----------
    predictions: array
        Scores indexed as [example, token, class]; the arg-max over the last
        axis is taken as the predicted class id.
    labels_ids: array
        Label ids indexed as [example, token].

    Returns
    -------
    preds_list: list
        One list of predicted tag names per example.
    labels_list: list
        One list of reference tag names per example.
    """
    predicted_ids = np.argmax(predictions, axis=2)
    n_examples, n_tokens = predicted_ids.shape
    preds_list, labels_list = [], []
    for row in range(n_examples):
        # Token positions of this example that carry a real label.
        kept = [col for col in range(n_tokens) if labels_ids[row, col] != -100]
        preds_list.append([index2tag[predicted_ids[row][col]] for col in kept])
        labels_list.append([index2tag[labels_ids[row][col]] for col in kept])

    return preds_list, labels_list

### Fine Tuning our NER model.
- We will use the same model as before, but this time we will fine-tune it on our own data.

In [17]:
# One batch size for training, evaluation and the logging interval, so the three
# values cannot drift apart.
batch_size = 24
# Number of training batches in one pass over the training split.
logging_steps = len(panx_de_encoded["train"]) // batch_size

training_args = TrainingArguments(
    output_dir=f"{xlmr_model_name}-finetuned-panx-de",
    log_level="error",
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    # Step counts are integers; this interval is far longer than the whole run.
    save_steps=int(1e6),
    weight_decay=0.01,
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=True,
)

In [18]:
# Log in to the Hugging Face Hub
# (training_args sets push_to_hub=True, and the training cell pushes the model).
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


### Define Trainer.
- We will use the `Trainer` class to train our model.
- We need to pass a `compute_metrics` function to the `Trainer`. This function computes the F1 score of our model.
- We will work with DataCollatorForTokenClassification, which will automatically pad our inputs and create the attention masks.

In [22]:
def compute_metrics(eval_pred):
    """Compute the F1 score using the seqeval library.

    Parameters
    ----------
    eval_pred:
        Object exposing ``predictions`` and ``label_ids``; both are handed to
        ``align_predictions``.

    Returns
    -------
    dict
        ``{"f1": <score>}``.
    """
    y_pred, y_true = align_predictions(eval_pred.predictions, eval_pred.label_ids)
    # seqeval's signature is f1_score(y_true, y_pred): reference labels go first.
    return {"f1": f1_score(y_true, y_pred)}

# Collator handed to the Trainer to assemble batches from the encoded examples.
data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

def model_init():
    """Load a new XLMRobertaForTokenClassification from the checkpoint and place it on `device`.

    Uses the checkpoint named by ``xlmr_model_name`` together with ``xlmr_config``.
    """
    fresh_model = XLMRobertaForTokenClassification.from_pretrained(
        xlmr_model_name, config=xlmr_config
    )
    return fresh_model.to(device)

In [23]:
#hide
# Set TOKENIZERS_PARALLELISM=false in the kernel's environment before the Trainer
# is created (presumably to avoid the tokenizers parallelism warning; confirm).
%env TOKENIZERS_PARALLELISM=false

env: TOKENIZERS_PARALLELISM=false


In [None]:
# Wire everything together: model factory, training arguments, batch collator,
# metric function, and the encoded German train/validation splits.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=panx_de_encoded["train"],
    eval_dataset=panx_de_encoded["validation"],
    tokenizer=xlmr_tokenizer)

# Fine-tune, then upload the result to the Hub with the given commit message.
trainer.train()
trainer.push_to_hub(commit_message="Add XLM-RoBERTa finetuned on PAN-X")

/content/xlm-roberta-base-finetuned-panx-de is already a clone of https://huggingface.co/pridaj/xlm-roberta-base-finetuned-panx-de. Make sure you pull the latest changes with `repo.git_pull()`.


Epoch,Training Loss,Validation Loss,F1
1,0.3375,0.221574,0.795244
2,0.1749,0.199633,0.820555
3,0.1094,0.192816,0.838753


Adding files tracked by Git LFS: ['tokenizer.json']. This may take a bit of time if the files are large.


Upload file pytorch_model.bin:   0%|          | 32.0k/1.03G [00:00<?, ?B/s]

Upload file runs/Feb25_17-17-50_b01a78359f55/1677345815.278529/events.out.tfevents.1677345815.b01a78359f55.168…

Upload file runs/Feb25_17-17-50_b01a78359f55/events.out.tfevents.1677345815.b01a78359f55.1687.2: 100%|########…

Upload file tokenizer.json:   0%|          | 32.0k/16.3M [00:00<?, ?B/s]

Upload file runs/Feb25_17-17-50_b01a78359f55/events.out.tfevents.1677345569.b01a78359f55.1687.0: 100%|########…

Upload file training_args.bin: 100%|##########| 3.43k/3.43k [00:00<?, ?B/s]

Upload file runs/Feb25_17-17-50_b01a78359f55/1677345569.2712407/events.out.tfevents.1677345569.b01a78359f55.16…