In [12]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras 
import transformers
from transformers import pipeline, AutoTokenizer, AutoModel, TFAutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments, TFAutoModelForSequenceClassification, AutoConfig
from datasets import load_dataset, get_dataset_config_names, DatasetDict
from huggingface_hub import list_datasets, notebook_login,login
import matplotlib.pyplot as plt
import torch
from torch import nn
from math import sqrt
import torch.nn.functional as F
from torch.nn.functional import cross_entropy
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score
from bertviz.transformers_neuron_view import BertModel
from bertviz.neuron_view import show
from bertviz import head_view
from collections import defaultdict, Counter

from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel

### Load the dataset ###
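XTREME is a multilingual benchmark dataset. The PAN-X subsets used below consist of Wikipedia articles in many languages, with each token annotated with a named-entity (NER) tag.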

In [2]:
# 1. List the names of all configurations (subsets) available for the "xtreme" dataset
xtreme_subsets = get_dataset_config_names("xtreme")
print(f"There are {len(xtreme_subsets)} configurations.")

There are 183 configurations.


In [3]:
# 2. Keep only the configurations whose name starts with "PAN" (the PAN-X subsets).
#    The two letters after the dot are the language code, e.g. "PAN-X.de" for German.
panx_subsets = [
    config_name
    for config_name in xtreme_subsets
    if config_name.startswith("PAN")
]
print(panx_subsets[:3])

# Example: load the German subset and display its splits
load_dataset("xtreme", name="PAN-X.de")

['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg']


DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

In [4]:
# 3. To mimic a real-life corpus, build a dataset that mixes several languages
#    in deliberately imbalanced proportions.
langs = ["de", "fr", "it", "en"]  # the four languages we sample from
fracs = [0.629, 0.229, 0.084, 0.059]  # portion of each language's data to keep

# defaultdict: accessing a missing language key creates an empty DatasetDict
panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
    # Shuffle every split with a fixed seed, then keep the first `frac` share of its rows
    for split in ds:
        panx_ch[lang][split] = ds[split].shuffle(seed=0).select(
            range(int(frac * ds[split].num_rows))
        )

# One-row table: number of training examples kept per language
pd.DataFrame(
    [[panx_ch[lang]["train"].num_rows for lang in langs]],
    columns=langs,
    index=["Number of training examples"],
)
# German (de) has far more examples than the other three languages


Unnamed: 0,de,fr,it,en
Number of training examples,12580,4580,1680,1180


In [5]:
# Inspect the German training split: one example, the column schema, and the tag label set
de_train = panx_ch["de"]["train"]

# First example — in the raw dataset the NER tags are stored as integer indexes
element = de_train[0]
for key, value in element.items():
    print(f"{key}: {value}")

# The string names of the classes live in the dataset's `features`
for key, value in de_train.features.items():
    print(f"{key}: {value}")

# ClassLabel object describing the NER tags; reused later to map between ids and names
tags = de_train.features["ner_tags"].feature
print(tags)

tokens: ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
ner_tags: [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]
langs: ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']
tokens: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags: Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


In [6]:
# 4. Add a column that holds the string name of each token's NER tag

def create_tag_names(batch):
    """Return a new ``ner_tags_str`` column with the string name of every NER tag id.

    Each entry of ``batch['ner_tags']`` is converted with the module-level
    ``tags.int2str``.

    NOTE(review): despite the parameter name, this notebook passes the function to
    ``.map`` without ``batched=True``, so it presumably receives one example at a
    time — confirm before switching to batched mapping.
    """
    tag_names = list(map(tags.int2str, batch["ner_tags"]))
    return {"ner_tags_str": tag_names}

# Apply the tag-name mapping to every split of the German data
panx_de = panx_ch["de"].map(create_tag_names)

# Show the first training example as a two-row table: tokens on top, tag names below
de_example = panx_de["train"][0]
pd.DataFrame(
    data=[de_example["tokens"], de_example["ner_tags_str"]],
    index=["Tokens", "Tags"],
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Tokens,2.000,Einwohnern,an,der,Danziger,Bucht,in,der,polnischen,Woiwodschaft,Pommern,.
Tags,O,O,O,O,B-LOC,I-LOC,O,O,B-LOC,B-LOC,I-LOC,O


In [7]:
# Quick check for class imbalance: count entities per type in each split.
# Only "B-..." tags are counted, so every entity is counted once (at its first token).
split2freqs = defaultdict(Counter)

for split, dataset in panx_de.items():
    for row in dataset["ner_tags_str"]:
        for tag in row:
            if not tag.startswith("B"):
                continue
            # "B-LOC" -> "LOC"
            tag_type = tag.split("-")[1]
            split2freqs[split][tag_type] += 1

# Rows are splits, columns are entity types
pd.DataFrame.from_dict(split2freqs, orient="index")

Unnamed: 0,LOC,ORG,PER
train,6186,5366,5810
validation,3172,2683,2893
test,3180,2573,3071


### XLMR Model Tokenization ###

In [11]:
# XLM-R uses a SentencePiece tokenizer that was trained on 100 different languages.
# Load it next to the BERT tokenizer to compare how each one splits the same sentence.
bert_model_name = "bert-base-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

xlmr_model_name = "xlm-roberta-base"
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

text = "Jack Sparrow loves New York!"
bert_tokens = bert_tokenizer(text).tokens()
xlmr_tokens = xlmr_tokenizer(text).tokens()

# One token list per line: BERT first, XLM-R second
print(bert_tokens, xlmr_tokens, sep="\n")

['[CLS]', 'Jack', 'Spa', '##rrow', 'loves', 'New', 'York', '!', '[SEP]']
['<s>', '▁Jack', '▁Spar', 'row', '▁love', 's', '▁New', '▁York', '!', '</s>']


### Method 1: Create a Custom Model for the Multilingual NER Task ###
We will build the model ourselves: the transformer ***body will come from a pre-trained RoBERTa model, and the head (the task-specific layer) will be built by us***. The Transformers library ***also provides many robust models in which the body and the head are already built together***. For instance, if you are not interested in building the head layer yourself, you can directly load a ready-made model such as ***BertForSequenceClassification*** through the AutoModel classes.

In [21]:
class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    """RoBERTa encoder body with a per-token classification head on top.

    The head is a dropout layer followed by a single linear layer that produces
    ``config.num_labels`` logits for every position of the encoder output.
    """

    config_class = XLMRobertaConfig

    def __init__(self, config):
        """Create the encoder body and the classification head.

        Args:
            config: model configuration; ``num_labels``, ``hidden_dropout_prob``
                and ``hidden_size`` are read from it.
        """
        super().__init__(config)
        self.num_labels = config.num_labels

        # Model body: RoBERTa encoder built without its pooling layer
        self.roberta = RobertaModel(config, add_pooling_layer=False)

        # Classification head: dropout, then a linear projection to the label space
        self.dropout = nn.Dropout(p=config.hidden_dropout_prob)
        self.classifier = nn.Linear(
            in_features=config.hidden_size,
            out_features=config.num_labels,
        )

        # Initialize the weights of the modules created above
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        """Run the encoder and the head; compute a loss when ``labels`` is given.

        Args:
            input_ids: forwarded to the encoder as its first positional argument.
            attention_mask: forwarded to the encoder.
            token_type_ids: forwarded to the encoder.
            labels: optional targets; when provided, a cross-entropy loss is
                computed over the flattened logits and labels.
            **kwargs: any extra keyword arguments, forwarded to the encoder unchanged.

        Returns:
            TokenClassifierOutput carrying ``loss`` (``None`` without labels),
            ``logits``, and the encoder's ``hidden_states`` and ``attentions``.
        """
        encoder_out = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )

        # First element of the encoder output
        # (presumably the last hidden state, (batch, seq, hidden) — TODO confirm)
        token_logits = self.classifier(self.dropout(encoder_out[0]))

        if labels is None:
            token_loss = None
        else:
            # Flatten to (N, num_labels) logits and (N,) labels for the loss
            criterion = nn.CrossEntropyLoss()
            token_loss = criterion(
                token_logits.view(-1, self.num_labels),
                labels.view(-1),
            )

        return TokenClassifierOutput(
            loss=token_loss,
            logits=token_logits,
            hidden_states=encoder_out.hidden_states,
            attentions=encoder_out.attentions,
        )


### Method2: Directly Loading Custom Model ###

In [26]:
# Lookup tables between integer label ids and tag names, both following the order of `tags.names`
index2tag = dict(enumerate(tags.names))
tag2index = {tag: idx for idx, tag in index2tag.items()}

# Start from the pretrained XLM-R configuration and attach the label information of this task
xlmr_config = AutoConfig.from_pretrained(
    xlmr_model_name,
    num_labels=tags.num_classes,
    id2label=index2tag,
    label2id=tag2index,
)

In [27]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Instantiate the custom model from the pretrained checkpoint, then move it to the chosen device
xlmr_model = XLMRobertaForTokenClassification.from_pretrained(
    xlmr_model_name,
    config=xlmr_config,
)
xlmr_model = xlmr_model.to(device)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# Encode the sample sentence into a PyTorch tensor of token ids
input_ids = xlmr_tokenizer.encode(text, return_tensors="pt")

# Show the tokens (computed earlier from the same `text`) next to their ids
pd.DataFrame(
    data=[xlmr_tokens, input_ids[0].numpy()],
    index=["Tokens", "Input IDs"],
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
Input IDs,0,21763,37456,15555,5161,7,2356,5753,38,2
