In [66]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras 
import transformers
from transformers import pipeline, AutoTokenizer, AutoModel, TFAutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments, TFAutoModelForSequenceClassification, AutoConfig
from datasets import load_dataset, get_dataset_config_names, DatasetDict
from huggingface_hub import list_datasets, notebook_login,login
import matplotlib.pyplot as plt
import torch
from torch import nn
from math import sqrt
import torch.nn.functional as F
from torch.nn.functional import cross_entropy
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score
from bertviz.transformers_neuron_view import BertModel
from bertviz.neuron_view import show
from bertviz import head_view
from collections import defaultdict, Counter

from transformers import XLMRobertaConfig, DataCollatorForTokenClassification
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel
from seqeval.metrics import classification_report

### Load the dataset ###
XTREME is a multilingual benchmark; the PAN-X subsets used here consist of sentences from Wikipedia articles in many languages, annotated with named-entity tags.

In [2]:
# 1. List the names of all configurations (subsets) available for the "xtreme" dataset
xtreme_subsets = get_dataset_config_names("xtreme")
print(f"There are {len(xtreme_subsets)} configurations.")

There are 183 configurations.


In [3]:
# 2. Keep only the subsets whose name starts with "PAN" (the PAN-X subsets).
# The last two letters of each name are the language code, e.g. "PAN-X.de" for German.
panx_subsets = [s for s in xtreme_subsets if s.startswith("PAN")]
print(panx_subsets[:3])

# Example: load the German subset; as the cell's last expression the DatasetDict is displayed
load_dataset("xtreme",name="PAN-X.de")

['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg']


DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

In [4]:
# 3. To simulate a real-life corpus, build a multilingual dataset in which the
# languages are deliberately imbalanced.
langs = ["de","fr","it","en"] # The 4 different languages we keep
fracs = [0.629,0.229,0.084,0.059] # Portion of each language's rows to keep (same order as langs)

# defaultdict creates an empty DatasetDict the first time a language key is accessed
panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs,fracs):
    ds = load_dataset("xtreme",name=f"PAN-X.{lang}")
    # Shuffle with a fixed seed, then keep the first int(frac * num_rows) rows of every split
    for split in ds:
        panx_ch[lang][split] = (ds[split].shuffle(seed=0).select(range(int(frac * ds[split].num_rows))))

pd.DataFrame({lang:[panx_ch[lang]["train"].num_rows] for lang in langs},
             index = ['Number of training examples'])
# As intended, there are far more German (de) examples than for the other 3 languages


Unnamed: 0,de,fr,it,en
Number of training examples,12580,4580,1680,1180


In [5]:
# Look at one German training example

element = panx_ch['de']['train'][0]
for key, value in element.items():
    print(f"{key}: {value}") # In the original dataset, NER tags are stored as integer indexes

for key, value in panx_ch['de']['train'].features.items(): # The string names of the classes are stored in the features of the dataset
    print(f"{key}: {value}")

# ClassLabel feature of the NER tags; later cells use it to convert tag ids to tag names
tags = panx_ch['de']['train'].features['ner_tags'].feature
print(tags)

tokens: ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
ner_tags: [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]
langs: ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']
tokens: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags: Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


In [6]:
# 4. Add a tag column for the text and string of the ner_tag

def create_tag_names(batch):
    """Return a new ``ner_tags_str`` column holding the string name of every tag id.

    ``batch["ner_tags"]`` is an iterable of integer tag ids; each id is converted
    with the notebook-level ``tags`` ClassLabel (``tags.int2str``).
    """
    tag_names = list(map(tags.int2str, batch['ner_tags']))
    return {"ner_tags_str": tag_names}

# Add the string tags as a new "ner_tags_str" column to every split of the German subset
panx_de = panx_ch['de'].map(create_tag_names)

# Show the tokens of the first training example next to their tag names
de_example = panx_de["train"][0]
pd.DataFrame([de_example["tokens"],de_example["ner_tags_str"]],["Tokens","Tags"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Tokens,2.000,Einwohnern,an,der,Danziger,Bucht,in,der,polnischen,Woiwodschaft,Pommern,.
Tags,O,O,O,O,B-LOC,I-LOC,O,O,B-LOC,B-LOC,I-LOC,O


In [7]:
# Quick check for class imbalance: count how many entities of each type
# (LOC / ORG / PER) every split contains

split2freqs = defaultdict(Counter)

for split, dataset in panx_de.items():
    for row in dataset["ner_tags_str"]:
        # Every entity starts with exactly one "B-<TYPE>" tag, so counting those counts entities
        begin_tags = [tag for tag in row if tag.startswith("B")]
        for tag in begin_tags:
            split2freqs[split][tag.split("-")[1]] += 1

pd.DataFrame.from_dict(split2freqs,orient="index")

Unnamed: 0,LOC,ORG,PER
train,6186,5366,5810
validation,3172,2683,2893
test,3180,2573,3071


### XLMR Model Tokenization ###

In [8]:
# XLM-R uses a SentencePiece tokenizer that is trained on 100 different languages;
# tokenize the same sentence with the BERT and the XLM-R tokenizer to compare them

bert_model_name = "bert-base-cased"
xlmr_model_name = "xlm-roberta-base"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

# Example sentence reused by the following cells
text = "Jack Sparrow loves New York!"
bert_tokens = bert_tokenizer(text).tokens()
xlmr_tokens = xlmr_tokenizer(text).tokens()
print(bert_tokens)
print(xlmr_tokens)

['[CLS]', 'Jack', 'Spa', '##rrow', 'loves', 'New', 'York', '!', '[SEP]']
['<s>', '▁Jack', '▁Spar', 'row', '▁love', 's', '▁New', '▁York', '!', '</s>']


### Method1: Create a Custom Model for Multi-languages NER Task ###
We will build a model ourselves in which the transformer ***body comes from the pre-trained XLM-RoBERTa model and the head (task layer) is built by us***. The Transformers library ***also provides many robust models whose body and head are already built together***. For instance, if you are not interested in building the head layer yourself, you can directly load a ready-made class such as ***BertForSequenceClassification*** (or the matching `AutoModelFor...` class for your task).

In [9]:
class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    """RoBERTa encoder body followed by a dropout + linear head that scores every token.

    The configuration class is ``XLMRobertaConfig``; ``config.num_labels`` fixes the
    size of the output layer.
    """
    config_class = XLMRobertaConfig

    def __init__(self, config):
        """Build the encoder body and the classification head from ``config``."""
        super().__init__(config)
        self.num_labels = config.num_labels

        # Body: the encoder, created without its pooling layer
        self.roberta = RobertaModel(config, add_pooling_layer=False)

        # Head: dropout followed by one linear layer producing num_labels logits per token
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialise the weights (pre-trained ones are loaded on top by from_pretrained)
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        """Run the encoder and the head; also compute the loss when ``labels`` is given.

        Returns a ``TokenClassifierOutput`` carrying ``loss`` (``None`` without labels),
        ``logits`` and the encoder's ``hidden_states`` / ``attentions``.
        """
        # Encoder pass; extra keyword arguments are forwarded unchanged
        encoder_outputs = self.roberta(input_ids,
                                       attention_mask=attention_mask,
                                       token_type_ids=token_type_ids,
                                       **kwargs)

        # First element of the encoder output -> dropout -> per-token logits
        token_states = self.dropout(encoder_outputs[0])
        logits = self.classifier(token_states)

        # Cross-entropy over all positions, flattened to (positions, num_labels) vs (positions,)
        if labels is None:
            loss = None
        else:
            criterion = nn.CrossEntropyLoss()
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = labels.view(-1)
            loss = criterion(flat_logits, flat_labels)

        return TokenClassifierOutput(loss=loss,
                                     logits=logits,
                                     hidden_states=encoder_outputs.hidden_states,
                                     attentions=encoder_outputs.attentions)


### Method2: Directly Loading Custom Model ###

In [13]:
# Two-way mapping between integer tag ids and tag names
index2tag = dict(enumerate(tags.names))
tag2index = {name: idx for idx, name in index2tag.items()}

# Load the pre-trained model's default configuration and attach the label information
xlmr_config = AutoConfig.from_pretrained(
    xlmr_model_name,
    num_labels=tags.num_classes,
    id2label=index2tag,
    label2id=tag2index,
)

In [14]:
# Create the custom model from the pre-trained checkpoint with our config,
# and move it to the GPU when one is available (CPU otherwise)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
xlmr_model = (XLMRobertaForTokenClassification.from_pretrained(xlmr_model_name,config=xlmr_config).to(device))

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Encode the example sentence into input IDs (as a PyTorch tensor) and show them next to the tokens
input_ids = xlmr_tokenizer.encode(text, return_tensors="pt")
pd.DataFrame([xlmr_tokens,input_ids[0].numpy()],index=["Tokens","Input IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
Input IDs,0,21763,37456,15555,5161,7,2356,5753,38,2


In [24]:
# Run inference with the model as loaded: pre-trained body, newly initialised classifier head
outputs = xlmr_model(input_ids.to(device)).logits
predictions = torch.argmax(outputs,dim=-1)
print(f"Number of tokens in sequence: {len(xlmr_tokens)}")
print(f"Shape of outputs: {outputs.shape}") # The shape of the output is [ batch_size, num_tokens, num_tags ]

# As the result shows, a model whose classification head is randomly initialised and that
# has not been fine-tuned on our data does not produce good predictions
preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
pd.DataFrame([xlmr_tokens,preds],index=['tokens','tags'])


Number of tokens in sequence: 10
Shape of outputs: torch.Size([1, 10, 7])


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
tags,B-ORG,B-ORG,B-ORG,B-ORG,B-ORG,B-LOC,B-LOC,B-LOC,B-ORG,B-ORG


In [36]:
# Create a function to do everything

def tag_text(text,tags,model,tokenizer):
    """Tag every token of ``text`` and return tokens and predicted tags side by side.

    Args:
        text: the sentence to tag.
        tags: object exposing ``names``, the list of tag names indexed by class id.
        model: token-classification model; calling it with input ids must return an
            object with a ``logits`` attribute.
        tokenizer: tokenizer used both to display the tokens and to encode ``text``.

    Returns:
        A DataFrame with one column per token and two rows, "Tokens" and "Tags".
    """
    # Get the tokens (for display)
    tokens = tokenizer(text).tokens()

    # Encode with the tokenizer that was passed in (not a notebook-level global) and
    # put the ids on the same device as the model's weights
    model_device = next(model.parameters()).device
    input_ids = tokenizer(text,return_tensors="pt").input_ids.to(model_device)

    # Inference: switch dropout off and skip gradient tracking, then restore the
    # model's previous train/eval mode so the caller's state is untouched
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids).logits
    finally:
        model.train(was_training)

    # Get the most likely class for every token based on the logits
    predictions = torch.argmax(outputs,dim=-1)

    # Convert the class ids of the (single) sequence to tag names
    preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens,preds],index=["Tokens","Tags"])

# Tag the example sentence with the model that has not been fine-tuned yet
tag_text(text,tags,xlmr_model,xlmr_tokenizer)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
Tags,B-ORG,B-ORG,B-ORG,B-ORG,B-ORG,B-LOC,B-LOC,B-LOC,B-ORG,B-ORG


### Tokenize the entire multilanguage dataset ###

In [44]:
words, labels = de_example["tokens"], de_example["ner_tags"]

# The dataset is already split into words, so we tell the tokenizer with is_split_into_words=True
tokenized_input = xlmr_tokenizer(de_example["tokens"],is_split_into_words=True)

# The XLM-R tokenizer may split a word into several sub-word tokens; convert the ids back to tokens to see them
# (the tokenizer also provides word_ids to tell which tokens belong to the same word)
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
pd.DataFrame([tokens],index=["Tokens"])


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>


In [46]:
# Show word_ids: for every token, the index of the word it came from
# (empty for the <s> and </s> tokens, as the table below shows)
word_ids = tokenized_input.word_ids()
pd.DataFrame([tokens,word_ids],index=["Tokens","Word IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word IDs,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,


In [56]:
# When training, each word should contribute one label only: the label goes on the
# word's first sub-word token, and the loss must skip the remaining sub-word tokens
# and the special tokens.
# Those positions therefore get the label id -100, which nn.CrossEntropyLoss() ignores.

# Read the tag ids straight from the example rather than from `labels`: this cell
# rebinds `labels` to strings below, so reading `labels` here would fail on a re-run
word_tag_ids = de_example["ner_tags"]
previous_word_idx = None
label_ids = []

for word_idx in word_ids:
    # Only the first token of a word (a new, non-None word index) carries the label
    starts_new_word = word_idx is not None and word_idx != previous_word_idx
    label_ids.append(word_tag_ids[word_idx] if starts_new_word else -100)
    previous_word_idx = word_idx

# Human-readable view of the aligned labels; "IGN" marks the ignored positions
labels = [ index2tag[l] if l != -100 else "IGN" for l in label_ids ]
pd.DataFrame([tokens,word_ids,label_ids,labels],index=['Tokens','Word Ids','Label IDs','Labels'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word Ids,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,
Label IDs,-100,0,0,-100,0,0,5,-100,-100,6,...,5,-100,-100,-100,6,-100,-100,0,-100,-100
Labels,IGN,O,O,IGN,O,O,B-LOC,IGN,IGN,I-LOC,...,B-LOC,IGN,IGN,IGN,I-LOC,IGN,IGN,O,IGN,IGN


In [58]:
# One function for the whole preprocessing step: tokenize, then mask sub-word labels
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align the NER labels to the tokens.

    ``examples["tokens"]`` holds one word list per sentence and ``examples["ner_tags"]``
    the matching tag ids. The first token of every word keeps the word's tag id; all
    other tokens (further sub-word pieces and positions without a word id) get -100.
    The aligned ids are stored under ``"labels"`` in the returned tokenizer output.
    """
    tokenized_input = xlmr_tokenizer(examples["tokens"],truncation=True,is_split_into_words=True)

    aligned_labels = []
    for sentence_idx, word_tags in enumerate(examples['ner_tags']):
        sentence_label_ids = []
        last_word_idx = None
        for word_idx in tokenized_input.word_ids(batch_index=sentence_idx):
            is_first_piece = word_idx is not None and word_idx != last_word_idx
            sentence_label_ids.append(word_tags[word_idx] if is_first_piece else -100)
            last_word_idx = word_idx
        aligned_labels.append(sentence_label_ids)

    tokenized_input['labels'] = aligned_labels
    return tokenized_input

# Apply the tokenization/alignment function to a whole corpus
def encode_panx_dataset(corpus):
    """Map ``tokenize_and_align_labels`` over ``corpus`` in batches.

    The 'langs', 'ner_tags' and 'tokens' columns are removed from the result.
    """
    raw_columns = ['langs','ner_tags','tokens']
    return corpus.map(tokenize_and_align_labels,
                      batched=True,
                      remove_columns=raw_columns)

# Encode every split of the German subset (one "Map" progress bar per split)
panx_de_encoded = encode_panx_dataset(panx_ch['de'])

Map:   0%|          | 0/12580 [00:00<?, ? examples/s]

Map:   0%|          | 0/6290 [00:00<?, ? examples/s]

Map:   0%|          | 0/6290 [00:00<?, ? examples/s]

### Performance Measure ###
The ***seqeval*** module is designed to measure classification results given as lists of lists (one list of tags per sentence).

In [61]:
# Example: in the first sentence the predicted MISC entity starts one token too early,
# so MISC scores 0.00 in the report below; the PER entity in the second sentence matches exactly

y_true = [["O","O","O","B-MISC","I-MISC","I-MISC","O"],
          ["B-PER","I-PER","O"]]
y_pred = [["O","O","B-MISC","I-MISC","I-MISC","I-MISC","O"],
          ["B-PER","I-PER","O"]]

print(classification_report(y_true,y_pred))

              precision    recall  f1-score   support

        MISC       0.00      0.00      0.00         1
         PER       1.00      1.00      1.00         1

   micro avg       0.50      0.50      0.50         2
   macro avg       0.50      0.50      0.50         2
weighted avg       0.50      0.50      0.50         2



In [62]:
# Build the lists of predicted and true tag names, excluding ignored (-100) positions

def align_predictions(predictions,label_ids):
    """Turn model outputs and label ids into per-sequence lists of tag names.

    ``predictions`` is indexed as [sequence, position, class] and ``label_ids`` as
    [sequence, position]. Positions whose label id is -100 are dropped from both lists.

    Returns:
        ``(preds_list, labels_list)`` - predicted tag names first, true tag names second.
    """
    preds = np.argmax(predictions,axis=2)
    batch_size, seq_len = preds.shape
    labels_list, preds_list = [], []

    for row in range(batch_size):
        # Positions of this sequence that carry a real label
        kept = [col for col in range(seq_len) if label_ids[row,col]!=-100]
        labels_list.append([index2tag[label_ids[row,col]] for col in kept])
        preds_list.append([index2tag[preds[row,col]] for col in kept])

    return preds_list,labels_list

### Fine tune model based on German subset only ###
We will first fine-tune the model on only one of the languages (German).

In [63]:
# Set the training arguments
num_epochs = 3
batch_size = 24
# Number of full batches in the training split: with this batch size on a single
# device, metrics are logged roughly once per epoch
logging_steps = len(panx_de_encoded['train'])//batch_size
# Name of the output directory (passed as output_dir below)
model_name = f"{xlmr_model_name}-finetuned-panx-de"

# NOTE(review): save_steps=1e6 presumably exceeds the total number of training steps,
# so no intermediate checkpoints would be written - confirm this is intended
training_args = TrainingArguments(output_dir=model_name,
                                  log_level="error",
                                  num_train_epochs=num_epochs,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  eval_strategy="epoch",
                                  save_steps=1e6,
                                  weight_decay=0.01,
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=True)

In [64]:
# Log in to the Hugging Face Hub.
# Never hard-code an access token in a notebook: anyone who can read the file can use it.
# The token that was previously written in this cell must be revoked on the Hub.
# It is read from the HF_TOKEN environment variable, or asked for interactively without echo.
import os
from getpass import getpass

hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

In [65]:
# Define the evaluation metric calculation
def compute_metrics(eval_pred):
    """Compute the entity-level F1 score for the Trainer.

    ``eval_pred`` provides ``predictions`` and ``label_ids``; both are converted to
    lists of tag-name sequences by ``align_predictions`` before scoring.

    Returns:
        ``{"f1": <score>}``.
    """
    # The notebook-level name `f1_score` comes from scikit-learn, which cannot score
    # lists of tag sequences; seqeval's f1_score is the one built for that input.
    from seqeval.metrics import f1_score as seqeval_f1_score

    y_pred, y_true = align_predictions(eval_pred.predictions,
                                       eval_pred.label_ids)
    return {"f1":seqeval_f1_score(y_true,y_pred)}

In [68]:
# When tokenizing input text we truncate and pad the sequences.
# For the NER task the labels are sequences of tags as well, so they need padding too.
# Transformers provides DataCollatorForTokenClassification to do this for token classification tasks.
# It is created from the tokenizer and handed to the Trainer later on.
data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

In [74]:
# Model factory for fine-tuning.
# Handing this function to the Trainer avoids creating a new model by hand for every trainer:
# it loads a not-yet-fine-tuned model and is called at the beginning of the train() call.
def model_init():
    """Load the custom token classifier from the pre-trained checkpoint and move it to ``device``."""
    fresh_model = XLMRobertaForTokenClassification.from_pretrained(xlmr_model_name,
                                                                   config=xlmr_config)
    return fresh_model.to(device)

In [75]:
# Set up the trainer.
# In the earlier text classification example we passed model=<pretrained_model_object>;
# here we pass model_init=<function> instead.
# model_init does NOT train from scratch: it is called at the start of train() and returns a
# fresh copy of the pre-trained checkpoint (with a newly initialised head), so every run
# starts from the same weights. model=<object> would fine-tune that one existing object.
trainer = Trainer(model_init=model_init,
                  args=training_args,
                  data_collator=data_collator,
                  compute_metrics=compute_metrics,
                  train_dataset=panx_de_encoded['train'],
                  eval_dataset=panx_de_encoded['validation'],
                  # `tokenizer=` is deprecated for Trainer; `processing_class=` replaces it
                  processing_class=xlmr_tokenizer)

trainer.train()
trainer.push_to_hub(commit_message="Training completed!")

  trainer = Trainer(model_init=model_init,


In [None]:
# Inference example: tag a German sentence with the trainer's model
text_de = "Jeff Dean ist ein Informatiker bei Google in Kalifornien"
tag_text(text_de,tags,trainer.model,xlmr_tokenizer)