# Datasets

In [1]:
from datasets import get_dataset_config_names

# Collect the names of every configuration available under the 'xtreme' dataset.
xtreme_subsets = get_dataset_config_names('xtreme')

In [2]:
print(f'{len(xtreme_subsets)} configurations available')

183 configurations available


In [3]:
# Keep only the configurations whose name starts with 'PAN' (the PAN-X sub-datasets).
panx_subsets = [s for s in xtreme_subsets if s.startswith('PAN')]
print(f'{len(panx_subsets)} sub-datasets available')
# Peek at the first five names.
print(panx_subsets[:5])

40 sub-datasets available
['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg', 'PAN-X.bn', 'PAN-X.de']


In [4]:
from datasets import load_dataset

# Load the 'PAN-X.de' configuration; the cell output lists its splits, features and row counts.
load_dataset('xtreme', 'PAN-X.de')

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

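Create a realistic Swiss corpus by sampling the German, French, Italian, and English PAN-X subsets with fractions 0.629, 0.229, 0.084, and 0.059 respectively, so that German and French dominate while Italian and English are under-represented.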

In [5]:
from collections import defaultdict
from datasets import DatasetDict

# Language codes and, in the same order, the fraction of each PAN-X split to keep.
langs = ['de', 'fr', 'it', 'en']
fracs = [0.629, 0.229, 0.084, 0.059]

# Maps language code -> DatasetDict; a new language gets an empty DatasetDict on first access.
panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    # Load the monolingual PAN-X corpus for this language
    ds = load_dataset('xtreme', name=f'PAN-X.{lang}')
    for split in ds:
        # Number of rows to keep from this split (truncated to an integer)
        n = int(frac * ds[split].num_rows)
        # Shuffle with a fixed seed, then keep the first n rows
        panx_ch[lang][split] = ds[split].shuffle(seed=0).select(range(n))

In [6]:
# Display the number of training samples kept for each language as a one-row table.
import pandas as pd

pd.DataFrame({lang: [panx_ch[lang]['train'].num_rows] for lang in langs}, index=['Number of samples'])

Unnamed: 0,de,fr,it,en
Number of samples,12580,4580,1680,1180


Zero-shot cross-lingual transfer.

In [7]:
# Inspect the first German training example; the resulting frame shows one row per token.
pd.DataFrame(panx_ch['de']['train'][0])

Unnamed: 0,tokens,ner_tags,langs
0,2.000,0,de
1,Einwohnern,0,de
2,an,0,de
3,der,0,de
4,Danziger,5,de
5,Bucht,6,de
6,in,0,de
7,der,0,de
8,polnischen,5,de
9,Woiwodschaft,5,de


In [8]:
# Feature object describing the NER tag set: the inner feature of the 'ner_tags' column.
# It is printed below as a ClassLabel with seven tag names.
tags = panx_ch['de']['train'].features['ner_tags'].feature
print(tags)

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


In [9]:
def create_tag_names(batch):
    """Return a new ``ner_tags_str`` column with the string name of every NER tag ID.

    ``batch['ner_tags']`` is iterated as a flat sequence of integer tag IDs; each ID
    is converted with ``int2str`` of the notebook-level ``tags`` object.
    """
    tag_names = list(map(tags.int2str, batch['ner_tags']))
    return {'ner_tags_str': tag_names}
# Add the human-readable tag names to the German corpus (map is called without batched=True).
panx_de = panx_ch['de'].map(create_tag_names)

In [10]:
# Show the first training example with its tag names as the index and its tokens as the only column.
de_example = panx_de['train'][0]
pd.DataFrame(de_example['tokens'], de_example['ner_tags_str'])

Unnamed: 0,0
O,2.000
O,Einwohnern
O,an
O,der
B-LOC,Danziger
I-LOC,Bucht
O,in
O,der
B-LOC,polnischen
B-LOC,Woiwodschaft


In [11]:
from collections import Counter
# Count tags per split and entity type; only tags starting with "B" are counted,
# so each "B-<TYPE>" occurrence contributes one to <TYPE>.
split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
    for row in dataset["ner_tags_str"]:
        for tag in row:
            if tag.startswith("B"):
                # "B-LOC" -> "LOC"
                tag_type = tag.split("-")[1]
                split2freqs[split][tag_type] += 1
# One row per split, one column per entity type.
pd.DataFrame.from_dict(split2freqs, orient="index")


Unnamed: 0,LOC,ORG,PER
train,6186,5366,5810
validation,3172,2683,2893
test,3180,2573,3071


# Tokenization

In [12]:
# Compare word-piece and sentence-piece tokenizers

from transformers import AutoTokenizer
# Load the tokenizers that belong to the two checkpoints being compared.
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
xlmr_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Tokenize the same sentence with both and keep the token strings.
text = 'Jack Sparrow loves New York!'
bert_tokens = bert_tokenizer(text).tokens()
xlmr_tokens = xlmr_tokenizer(text).tokens()

In [13]:
# Lay the two tokenizations side by side, one row per tokenizer.
pd.DataFrame([bert_tokens, xlmr_tokens], index=["BERT", "XLM-R"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
BERT,[CLS],Jack,Spa,##rrow,loves,New,York,!,[SEP],
XLM-R,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>


In [14]:
# Concatenate the XLM-R tokens and replace the U+2581 marker with a plain space to recover the text.
"".join(xlmr_tokens).replace(u"\u2581", " ")

'<s> Jack Sparrow loves New York!</s>'

# Transformers Anatomy

In [15]:
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    """Token classifier: a RoBERTa body (no pooling layer) followed by dropout and a linear head."""

    config_class = XLMRobertaConfig

    def __init__(self, config):
        """Build the encoder body and the classification head from ``config``."""
        super().__init__(config)

        self.num_labels = config.num_labels
        # Model body, created without the pooling layer
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        # Classification head: dropout followed by one linear layer with num_labels outputs
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        """Run the encoder and head; also compute the cross-entropy loss when ``labels`` is given.

        Returns a ``TokenClassifierOutput`` carrying the loss (``None`` without labels),
        the logits, and the encoder's ``hidden_states`` and ``attentions``.
        """
        encoder_outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # First element of the encoder output -> dropout -> linear head
        logits = self.classifier(self.dropout(encoder_outputs[0]))

        if labels is None:
            loss = None
        else:
            # Flatten so that every token position is one classification example
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = labels.view(-1)
            loss = nn.CrossEntropyLoss()(flat_logits, flat_labels)

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


In [16]:
from transformers import AutoConfig

# Two-way mappings between integer label IDs and tag names, in the order given by tags.names.
index2tag = { idx: tag for idx, tag in enumerate(tags.names)}
tag2index = { tag: idx for idx, tag in enumerate(tags.names)}

xlmr_model_name = 'xlm-roberta-base'
# Load the checkpoint's configuration, setting the number of labels and attaching both mappings.
xlmr_config = AutoConfig.from_pretrained(
    xlmr_model_name,
    num_labels = tags.num_classes,
    id2label=index2tag,
    label2id=tag2index
)

In [17]:
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load the pretrained checkpoint into the custom class and move the model to the chosen device.
xlmr_model = XLMRobertaForTokenClassification.from_pretrained(
    xlmr_model_name,
    config=xlmr_config
).to(device)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Encode the sample sentence as a PyTorch tensor of input IDs and show it next to the token strings.
input_ids = xlmr_tokenizer.encode(text, return_tensors='pt')
pd.DataFrame([xlmr_tokens, input_ids[0].numpy()], index=["Tokens", "Input IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
Input IDs,0,21763,37456,15555,5161,7,2356,5753,38,2


In [19]:
# Run a forward pass and keep only the logits (the name `outputs` holds the logits tensor).
outputs = xlmr_model(input_ids.to(device)).logits
print(outputs)
# For this sentence the printed shape is [1, 10, 7].
print(outputs.shape)

tensor([[[ 0.7144,  0.6577, -0.2817,  0.3147,  0.8232, -0.8751,  0.5841],
         [ 0.6824,  0.8137, -0.2686,  0.3035,  1.1198, -0.7924,  0.3815],
         [ 0.6361,  0.7677, -0.3205,  0.2813,  1.0549, -0.8959,  0.3879],
         [ 0.6709,  0.8359, -0.3279,  0.2787,  1.0872, -0.7483,  0.3807],
         [ 0.7196,  0.7173, -0.3254,  0.2570,  1.0550, -0.7852,  0.3468],
         [ 0.6775,  0.8243, -0.2941,  0.3196,  1.0873, -0.6922,  0.4649],
         [ 0.6555,  0.8379, -0.2358,  0.3278,  1.1532, -0.7413,  0.4121],
         [ 0.7254,  0.7859, -0.2388,  0.3090,  1.1012, -0.6920,  0.3436],
         [ 0.7365,  0.7293, -0.3000,  0.3587,  1.0453, -0.7539,  0.2059],
         [ 0.7560,  0.6070, -0.2456,  0.2276,  0.7926, -0.9179,  0.5847]]],
       grad_fn=<ViewBackward0>)
torch.Size([1, 10, 7])


In [20]:
# Pick the index of the highest logit along the last dimension for every token.
predictions = torch.argmax(outputs, -1)
print(predictions)
print(predictions.shape)

tensor([[4, 4, 4, 4, 4, 4, 4, 4, 4, 4]])
torch.Size([1, 10])


In [21]:
# Translate the predicted class indices of the first sequence into tag names and show them per token.
preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
pd.DataFrame([xlmr_tokens, preds], index=["Tokens", "Tags"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
Tags,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG


In [22]:
def tag_text(text, tags, model, tokenizer):
    """Tag ``text`` with ``model`` and return the tokens and predicted tags as a DataFrame.

    Args:
        text: the raw string to tag.
        tags: object whose ``names`` sequence maps a class index to its tag name.
        model: callable model; ``model(input_ids)[0]`` must be the logits tensor
            indexed as [batch, token, class].
        tokenizer: tokenizer used for BOTH the token strings and the input IDs. Its
            result must expose ``tokens()`` and ``input_ids``.

    Returns:
        A DataFrame with two rows, "Tokens" and "Tags", and one column per token.
    """
    # Tokenize once with the tokenizer that was passed in (not a notebook global),
    # so the displayed tokens and the IDs fed to the model always agree.
    encoding = tokenizer(text, return_tensors="pt")
    # Get tokens with special characters
    tokens = encoding.tokens()
    # Send the IDs to whichever device the model's parameters live on
    model_device = next(model.parameters()).device
    input_ids = encoding.input_ids.to(model_device)
    # Inference only: no autograd graph is needed
    with torch.no_grad():
        # Get one score per class for every token
        outputs = model(input_ids)[0]
    # Take argmax to get most likely class per token
    predictions = torch.argmax(outputs, dim=2)
    # Convert to DataFrame
    preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])

In [23]:
# Try the helper with the model loaded above; the load message reports newly initialized classifier weights,
# and the output below shows the same tag predicted for every token.
tag_text('Kamala Harris is the worst presidental candidate of all time!', tags, xlmr_model, xlmr_tokenizer)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
Tokens,<s>,▁Kamal,a,▁Harris,▁is,▁the,▁worst,▁president,al,▁candidat,e,▁of,▁all,▁time,!,</s>
Tags,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG


# Tokenize dataset

In [24]:
# Pull the words and their integer NER labels out of the German example and display them together.
words, labels = de_example['tokens'], de_example['ner_tags']

pd.DataFrame([words, labels], index=["Words", "Labels"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Words,2.0,Einwohnern,an,der,Danziger,Bucht,in,der,polnischen,Woiwodschaft,Pommern,.
Labels,0.0,0,0,0,5,6,0,0,5,5,6,0


In [25]:
# Tokenize the example, passing its list of words with is_split_into_words=True.
tokenized_input = xlmr_tokenizer(de_example["tokens"], is_split_into_words=True)
print(tokenized_input.input_ids)

[0, 70101, 176581, 19, 142, 122, 2290, 708, 1505, 18363, 18, 23, 122, 127474, 15439, 13787, 14, 15263, 18917, 663, 6947, 19, 6, 5, 2]


In [26]:
# Convert the input IDs back to token strings to see how the words were split into subwords.
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input.input_ids)
pd.DataFrame([tokens], index=["Tokens"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>


In [27]:
# word_ids() gives, per token, the index of the word it came from; in the output below the
# <s> and </s> positions have no word ID.
word_ids = tokenized_input.word_ids()
pd.DataFrame([tokens, word_ids], index=["Tokens", "Word IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word IDs,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,


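Assign the label ID -100 to special tokens and to every subword after the first one of each word. These positions are displayed as `IGN` below; -100 is the default `ignore_index` of PyTorch's `nn.CrossEntropyLoss`, so they do not contribute to the loss.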

In [28]:
# Build one label ID per token: only the first subword of each word keeps the word's
# label; special tokens (word_idx is None) and continuation subwords get -100.
previous_word_idx = None
label_ids = []
for word_idx in word_ids:
    if word_idx is None or word_idx == previous_word_idx:
        label_ids.append(-100)
    else:
        label_ids.append(labels[word_idx])
    previous_word_idx = word_idx
# Keep the tag names in their own variable instead of overwriting the integer `labels`
# list: re-running this cell would otherwise index `index2tag` with strings and fail.
label_names = [index2tag[l] if l != -100 else "IGN" for l in label_ids]
index = ["Tokens", "Word IDs", "Label IDs", "Labels"]
pd.DataFrame([tokens, word_ids, label_ids, label_names], index=index)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word IDs,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,
Label IDs,-100,0,0,-100,0,0,5,-100,-100,6,...,5,-100,-100,-100,6,-100,-100,0,-100,-100
Labels,IGN,O,O,IGN,O,O,B-LOC,IGN,IGN,I-LOC,...,B-LOC,IGN,IGN,IGN,I-LOC,IGN,IGN,O,IGN,IGN


In [29]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of word-split examples and attach token-level label IDs.

    Reads ``examples['tokens']`` and ``examples['ner_tags']``. For each example, the
    first token of every word receives that word's tag ID; tokens without a word ID
    and the remaining tokens of a word receive -100. The per-example lists are stored
    under ``"labels"`` in the tokenizer output, which is returned.
    """
    tokenized_inputs = xlmr_tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
    )
    aligned = []
    for batch_pos, word_labels in enumerate(examples['ner_tags']):
        token_labels = []
        last_word = None
        for current_word in tokenized_inputs.word_ids(batch_index=batch_pos):
            # A token carries a real label only when it opens a new word
            starts_new_word = current_word is not None and current_word != last_word
            token_labels.append(word_labels[current_word] if starts_new_word else -100)
            last_word = current_word
        aligned.append(token_labels)
    tokenized_inputs["labels"] = aligned
    return tokenized_inputs

In [30]:
def encode_panx_dataset(corpus):
    """Map ``tokenize_and_align_labels`` over ``corpus`` in batches.

    The 'langs', 'ner_tags' and 'tokens' columns are removed from the result.
    """
    raw_columns = ['langs', 'ner_tags', 'tokens']
    return corpus.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=raw_columns,
    )

In [31]:
# Tokenize and label-align the German corpus.
panx_de_encoded = encode_panx_dataset(panx_de)

Map:   0%|          | 0/6290 [00:00<?, ? examples/s]

In [32]:
# Convert the whole encoded training split to a DataFrame to inspect its columns.
pd.DataFrame(panx_de_encoded['train'])

Unnamed: 0,ner_tags_str,input_ids,attention_mask,labels
0,"[O, O, O, O, B-LOC, I-LOC, O, O, B-LOC, B-LOC,...","[0, 70101, 176581, 19, 142, 122, 2290, 708, 15...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, -100, 0, 0, 5, -100, -100, 6, -10..."
1,"[O, O, O, B-ORG, O, O, O, B-ORG, I-ORG, O, O]","[0, 727, 8644, 39119, 23706, 20101, 24, 9703, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 3, -100, 0, -100, -100, 0, 0, ..."
2,"[O, O, O, O, B-PER, I-PER, O, B-PER, I-PER, O,...","[0, 230978, 33, 542, 12389, 47323, 1225, 96513...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, -100, 0, 0, -100, 0, 1, 2, -100, -10..."
3,"[O, O, B-ORG, I-ORG, O, O]","[0, 242, 5106, 46231, 13, 20130, 4432, 5106, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[-100, 0, 0, 3, -100, 4, -100, 0, 0, -100]"
4,"[O, B-PER, I-PER, I-PER, I-PER]","[0, 1392, 6765, 821, 5, 436, 5, 3253, 1728, 2]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[-100, 0, 1, 2, -100, 2, -100, 2, -100, -100]"
...,...,...,...,...
12575,"[B-PER, I-PER, O, O, O, O, O, O]","[0, 94069, 160794, 6, 4, 93563, 2235, 122, 479...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[-100, 1, 2, 0, -100, 0, 0, 0, 0, -100, -100, ..."
12576,"[O, O, O, O, O, O, O, O, O, O, O, B-ORG, I-ORG...","[0, 2991, 7418, 98555, 72, 5445, 1329, 33415, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, -100, 0, 0, 0, 0, 0, 0, 0, ..."
12577,"[O, B-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O]","[0, 1310, 104998, 56, 491, 714, 219160, 19, 41...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 5, -100, 0, 0, 0, -100, 0, 0, -100, ..."
12578,"[O, B-LOC, O, B-LOC, O, O, O, O, O, O, O]","[0, 360, 7145, 11, 18339, 165, 128267, 402, 49...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 5, -100, -100, 0, 5, -100, 0, -100, ..."


# Metrics

In [33]:
from seqeval.metrics import classification_report

# Toy example: gold tag sequences for two sentences.
y_true = [
    ["O", "O", "O", "B-MISC", "I-MISC", "I-MISC", "O"],
    ["B-PER", "I-PER", "O"]
]
# Predictions: identical to y_true except that the MISC span starts one token earlier.
y_pred = [
    ["O", "O", "B-MISC", "I-MISC", "I-MISC", "I-MISC", "O"],
    ["B-PER", "I-PER", "O"]
]

# In the printed report MISC scores 0.00 and PER scores 1.00.
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

        MISC       0.00      0.00      0.00         1
         PER       1.00      1.00      1.00         1

   micro avg       0.50      0.50      0.50         2
   macro avg       0.50      0.50      0.50         2
weighted avg       0.50      0.50      0.50         2



Define a function that converts the model's predictions and the label IDs into lists of tag-name sequences that `seqeval` can process.

In [34]:
import numpy as np
def align_predictions(predictions, label_ids, id2tag=None):
    """Convert raw model scores and label IDs into per-example lists of tag names.

    Args:
        predictions: array of per-class scores indexed as [example, token, class].
        label_ids: array of gold label IDs indexed as [example, token]; positions
            whose value is -100 are skipped.
        id2tag: optional mapping from label ID to tag name. When omitted, the
            notebook-level ``index2tag`` mapping is used.

    Returns:
        ``(preds_list, labels_list)``: two lists holding one inner list of tag names
        per example. Both inner lists contain only the positions whose label ID is
        not -100, so they always have the same length.
    """
    mapping = index2tag if id2tag is None else id2tag
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    labels_list, preds_list = [], []
    for batch_idx in range(batch_size):
        example_labels, example_preds = [], []
        for seq_idx in range(seq_len):
            # Ignore label IDs = -100
            if label_ids[batch_idx, seq_idx] != -100:
                example_labels.append(mapping[label_ids[batch_idx][seq_idx]])
                # The prediction is kept only where the label is kept
                example_preds.append(mapping[preds[batch_idx][seq_idx]])
        # Collect once per example, inside the batch loop
        labels_list.append(example_labels)
        preds_list.append(example_preds)
    return preds_list, labels_list

# Fine-tune XLM-RoBERTa

In [36]:
from transformers import TrainingArguments

# Training hyperparameters for fine-tuning on the German corpus.
num_epochs = 3
batch_size = 24
# Number of full batches in the training split at this batch size.
logging_steps = len(panx_de_encoded["train"]) // batch_size
# Also used as the output directory below.
model_name = f'{xlmr_model_name}-finetuned-panx-de'

training_args = TrainingArguments(
    output_dir=model_name,
    learning_rate=2.5e-5,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    # NOTE(review): very large value, presumably to avoid intermediate checkpoints; confirm
    save_steps=1e6,
    disable_tqdm=False,
    logging_steps=logging_steps,
    # NOTE(review): presumably requires Hugging Face Hub credentials; confirm before running
    push_to_hub=True,
    log_level="error",
)

In [37]:
from seqeval.metrics import f1_score

def compute_metrics(eval_pred):
    """Return ``{"f1": score}`` computed by ``f1_score`` on the aligned tag sequences.

    ``eval_pred.predictions`` and ``eval_pred.label_ids`` are first converted with
    ``align_predictions``.
    """
    pred_tags, gold_tags = align_predictions(eval_pred.predictions, eval_pred.label_ids)
    score = f1_score(gold_tags, pred_tags)
    return {"f1": score}

In [38]:
from transformers import DataCollatorForTokenClassification

# Batch collator for token classification, built around the XLM-R tokenizer.
data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

In [39]:
def model_init():
    """Load a fresh XLM-R token classifier from the checkpoint and move it to ``device``."""
    fresh_model = XLMRobertaForTokenClassification.from_pretrained(
        xlmr_model_name, config=xlmr_config
    )
    return fresh_model.to(device)

In [41]:
from transformers import Trainer

# Wire the pieces together. The model is supplied through model_init rather than as an instance.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=panx_de_encoded["train"],
    eval_dataset=panx_de_encoded["validation"],
    tokenizer=xlmr_tokenizer
)
# NOTE(review): training is currently disabled; uncomment the next line to fine-tune.
# trainer.train()

In [None]:
# NOTE(review): trainer.train() is commented out in the previous cell, so the commit message
# below may not match what is actually pushed; confirm before running.
trainer.push_to_hub(commit_message="Training completed!")
# Tag a German sentence with the trainer's model.
text_de = 'Jeff Dean ist ein Informatiker bei Google in Kalifornien'
tag_text(text_de, tags, trainer.model, xlmr_tokenizer)