In [None]:
#installing the required libraries

!pip install transformers datasets seqeval matplotlib torch pandas numpy -q

In [None]:
import numpy as np

from transformers import EvalPrediction
from transformers import TrainingArguments
from transformers import DataCollatorForTokenClassification
from transformers import Trainer
from transformers import AutoConfig
from transformers import AutoTokenizer

from collections import defaultdict
from collections import Counter

import torch.nn as nn
from matplotlib import pyplot as plt
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

import torch
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


In [None]:

from seqeval.metrics import f1_score
from seqeval.metrics import classification_report


from datasets import get_dataset_config_names
from datasets import load_dataset
from datasets import DatasetDict



# Importing the required Data and Preprocessing

XTREME is a benchmark for the evaluation of the cross-lingual generalization ability of pre-trained multilingual models that covers 40 typologically diverse languages and includes nine tasks.

In [None]:
xtreme_subsets = get_dataset_config_names("xtreme")
print(f"XTREME has {len(xtreme_subsets)} configurations.")

XTREME has 183 configurations.


In [None]:
#fetching the pan-x configurations from all the xtreme configurations
panx_subsets = {s for s in xtreme_subsets if s.startswith("PAN")}
panx_subsets = sorted(panx_subsets)
panx_subsets

['PAN-X.af',
 'PAN-X.ar',
 'PAN-X.bg',
 'PAN-X.bn',
 'PAN-X.de',
 'PAN-X.el',
 'PAN-X.en',
 'PAN-X.es',
 'PAN-X.et',
 'PAN-X.eu',
 'PAN-X.fa',
 'PAN-X.fi',
 'PAN-X.fr',
 'PAN-X.he',
 'PAN-X.hi',
 'PAN-X.hu',
 'PAN-X.id',
 'PAN-X.it',
 'PAN-X.ja',
 'PAN-X.jv',
 'PAN-X.ka',
 'PAN-X.kk',
 'PAN-X.ko',
 'PAN-X.ml',
 'PAN-X.mr',
 'PAN-X.ms',
 'PAN-X.my',
 'PAN-X.nl',
 'PAN-X.pt',
 'PAN-X.ru',
 'PAN-X.sw',
 'PAN-X.ta',
 'PAN-X.te',
 'PAN-X.th',
 'PAN-X.tl',
 'PAN-X.tr',
 'PAN-X.ur',
 'PAN-X.vi',
 'PAN-X.yo',
 'PAN-X.zh']

In [None]:
#according to wiki, english is most spoken language (non native speakers included) in europe w 260m speakers
load_dataset('xtreme', name='PAN-X.en')

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

We will fine tune our model on english, and then perform zero shot learning with the other 4 most widely spoken european language.

In [None]:
languages = ['en','fr','de','it','es']
speakers = [260,210,170,82,76] #in millions
def convert_to_percentage(arr):
  total = sum(arr)
  return [100 * count / total for count in arr]
percentages = convert_to_percentage(speakers)
print(percentages)

[32.581453634085214, 26.31578947368421, 21.303258145363408, 10.275689223057643, 9.523809523809524]


In [None]:
#loading datasets for the selected languages, shuffling and selecting a percentage of samples, then storing them in a combined dataset dictionary.
panx_ds_combined = defaultdict(DatasetDict)

for lang, per in zip(languages,percentages):
  ds = load_dataset('xtreme', name=f'PAN-X.{lang}')
  for train_test_val in ds:
    num_samples = min(int(per/100 * len(ds[train_test_val])), len(ds[train_test_val]))
    panx_ds_combined[lang][train_test_val] = (
        ds[train_test_val].shuffle(seed=42).select(range(num_samples)) #shuffle to ensure we dont accidentally bias our splits. it randomly shuffles the col values. #select returns rows acc to list of indices
    )

In [None]:
panx_ds_combined

defaultdict(datasets.dataset_dict.DatasetDict,
            {'en': DatasetDict({
                 train: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 6516
                 })
                 validation: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 3258
                 })
                 test: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 3258
                 })
             }),
             'fr': DatasetDict({
                 train: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 5263
                 })
                 validation: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 2631
                 })
                 test: Dataset({
                     features: ['tokens', 'ner_tags', 'lan

In [None]:
pd.DataFrame({lang: [panx_ds_combined[lang]['train'].num_rows] for lang in languages})

Unnamed: 0,en,fr,de,it,es
0,6516,5263,4260,2055,1904


In [None]:
#fine tuning xlm roberta on english. later we'll do the cross lingual zero shot learning on the other datasets

In [None]:
example = panx_ds_combined['en']['train'][0]
for key,val in example.items():
  print(f"{key}:{val}") #key column name of arrow table. val is entry in each column

tokens:["''", 'January', '21', "''", '–', 'Nanny', 'and', 'the', 'Professor']
ner_tags:[0, 0, 0, 0, 0, 1, 2, 2, 2]
langs:['en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en']


In [None]:
#checking underlying datatypes.
for key,val in panx_ds_combined['en']["train"].features.items():
  print(f"{key}:{val}")


tokens:Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags:Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs:Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)


In [None]:
#seq class specifies the field contains a list of features. so, for ner tags it is a list of class labels
tags = panx_ds_combined['en']['train'].features["ner_tags"].feature
tags

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)

In [None]:
#converting numerical NER tag indices to their string representations for the English dataset and mapping the function to the dataset
def create_tag_names(batch):
  return {"ner_tags_str":[tags.int2str(idx) for idx in batch['ner_tags']]}

panx_en = panx_ds_combined['en'].map(create_tag_names)
panx_en

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 6516
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 3258
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 3258
    })
})

In [None]:
panx_en['train'][0]

{'tokens': ["''",
  'January',
  '21',
  "''",
  '–',
  'Nanny',
  'and',
  'the',
  'Professor'],
 'ner_tags': [0, 0, 0, 0, 0, 1, 2, 2, 2],
 'langs': ['en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en'],
 'ner_tags_str': ['O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'I-PER']}

In [None]:
en_example = panx_en['train'][0]
pd.DataFrame([en_example['tokens'], en_example['ner_tags_str']], ['Tokens','Tags'])

Unnamed: 0,0,1,2,3,4,5,6,7,8
Tokens,'',January,21,'',–,Nanny,and,the,Professor
Tags,O,O,O,O,O,B-PER,I-PER,I-PER,I-PER


In [None]:
#counting ner tag splits for each 3

def get_ner_tag_counts(dataset):
  split2freqs = defaultdict(Counter)
  for split, ds in dataset.items():
    for row in ds['ner_tags_str']:
      for tag in row:
        if tag.startswith('B'):
          tag_type = tag.split('-')[1]
          split2freqs[split][tag_type] += 1
  return split2freqs

frequencies = get_ner_tag_counts(panx_en)

pd.DataFrame.from_dict(frequencies, orient="index")

Unnamed: 0,PER,LOC,ORG
train,2983,3180,3089
validation,1518,1511,1576
test,1542,1530,1574


# Creating a custom model for token classification

In [None]:
class XLMRobertaforTokenClassification(RobertaPreTrainedModel):
  config_class = XLMRobertaConfig

  def __init__(self, config):
    super().__init__(config) #initializing the parent class with the given configuration
    self.num_labels = config.num_labels

    self.roberta = RobertaModel(config, add_pooling_layer=False) #initialising the roberta model
    self.dropout = nn.Dropout(config.hidden_dropout_prob) #drop layer to prevent overfitting
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    self.init_weights()

  def forward(self,input_ids=None,attention_mask=None,token_type_ids=None,labels=None, **kwargs):
    outputs = self.roberta(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, **kwargs) #forward pass through the roberta model
    sequence_output = self.dropout(outputs[0]);
    logits = self.classifier(sequence_output) #computing logits for each token in the sequence

    #calculating loss if labels are provided
    loss = None
    if labels is not None:
      loss_fun = nn.CrossEntropyLoss()
      loss = loss_fun(logits.view(-1, self.num_labels), labels.view(-1))
    #returning the loss, logits, hidden states, and attention weights
    return TokenClassifierOutput(loss = loss, logits = logits, hidden_states = outputs.hidden_states, attentions = outputs.attentions)

In [None]:
#creating dictionaries mapping index to tag and vice versa
index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}
index2tag

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC'}

In [None]:
tag2index

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6}

In [None]:
#we'll store this in autoconfig class (with modified parameters)

xlmr_model_name = 'xlm-roberta-base'
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name) #loading the tokenizer for the specified XLM-Roberta model


In [None]:
#loading the configuration for the specified XLM-Roberta model
#with the number of labels, and mappings from id to label and label to id
xlmr_config = AutoConfig.from_pretrained(xlmr_model_name, num_labels = tags.num_classes, id2label = index2tag, label2id = tag2index)



In [None]:
#setting up a device for PyTorch and loading a pre-trained XLM-Roberta model for token classification. checks if a CUDA-compatible GPU is available on the system. If it is, it sets the device to GPU (cuda); otherwise, it sets it to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
xlmr_my_model = (XLMRobertaforTokenClassification.from_pretrained(xlmr_model_name, config = xlmr_config).to(device))

Some weights of XLMRobertaforTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
example_string = 'My legs are in pain from this weight'
xlmr_tokens = xlmr_tokenizer(example_string).tokens()
xlmr_tokens

['<s>',
 '▁My',
 '▁leg',
 's',
 '▁are',
 '▁in',
 '▁pain',
 '▁from',
 '▁this',
 '▁weight',
 '</s>']

In [None]:
#defining a function to tag text using pretrained model and tokenizer
def tag_text(text, tags, model, tokenizer):
  #tokenize the text, preserving special characters
  tokens = tokenizer(text).tokens()

  # Convert tokens to input IDs and move them to the appropriate device
  input_ids = tokenizer(text, return_tensors = 'pt').input_ids.to(device)
  print('input_ids:', input_ids)

  #get the model outputs for the given input ids
  outputs = model(input_ids)[0]
  print('Shape of Outputs:', outputs.shape)

  #argmax over the tag dimension to get the most likely class for each token
  predictions = torch.argmax(outputs, dim = 2)
  print('Predictions:', predictions)

  #mapping predictions to tag names
  preds = [tags.names[p] for p in predictions[0].cpu().numpy()]

  # Return tokens and their predicted tags as a DataFrame
  return pd.DataFrame([tokens, preds], ['Tokens', 'NER Tags'])

In [None]:
print(tags)
print(xlmr_tokenizer)

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)
XLMRobertaTokenizerFast(name_or_path='xlm-roberta-base', vocab_size=250002, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	250001: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}


In [None]:
tag_text(example_string,tags,xlmr_my_model,xlmr_tokenizer)

input_ids: tensor([[    0,  2646,  6049,     7,   621,    23, 24503,  1295,   903, 57888,
             2]])
Shape of Outputs: torch.Size([1, 11, 7])
Predictions: tensor([[4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]])


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
Tokens,<s>,▁My,▁leg,s,▁are,▁in,▁pain,▁from,▁this,▁weight,</s>
NER Tags,I-ORG,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC


Tokenizing texts for ner. we tokenize each word and use the is_split_into_words argument to indicate that our input sequence is already split into words:

In [None]:
en_example

{'tokens': ["''",
  'January',
  '21',
  "''",
  '–',
  'Nanny',
  'and',
  'the',
  'Professor'],
 'ner_tags': [0, 0, 0, 0, 0, 1, 2, 2, 2],
 'langs': ['en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en'],
 'ner_tags_str': ['O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'I-PER']}

In [None]:
words, labels = en_example['tokens'], en_example['ner_tags']

In [None]:
labels

[0, 0, 0, 0, 0, 1, 2, 2, 2]

In [None]:
#is_split_into_words DOES NOT MEAN that the text was already pre-tokenized. It just means that the string was split into words (not tokens), i.e., split on spaces.
tokenized_input = xlmr_tokenizer(en_example['tokens'],is_split_into_words=True)
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])
tokens

['<s>',
 "▁''",
 '▁January',
 '▁21',
 "▁''",
 '▁–',
 '▁Nan',
 'ny',
 '▁and',
 '▁the',
 '▁Professor',
 '</s>']

In [None]:
pd.DataFrame([tokens], index=["Tokens"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Tokens,<s>,▁'',▁January,▁21,▁'',▁–,▁Nan,ny,▁and,▁the,▁Professor,</s>


In [None]:

word_ids = tokenized_input.word_ids() # Get the word IDs from the tokenized input
print('word_ids ', word_ids)

pd.DataFrame([tokens, word_ids], index=["Tokens", "Word IDs"]) # Create a DataFrame to display tokens along with their corresponding word IDs

word_ids  [None, 0, 1, 2, 3, 4, 5, 5, 6, 7, 8, None]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Tokens,<s>,▁'',▁January,▁21,▁'',▁–,▁Nan,ny,▁and,▁the,▁Professor,</s>
Word IDs,,0,1,2,3,4,5,5,6,7,8,


In [None]:
previous_word_idx = None
label_ids = []

for word_idx in word_ids:
    if word_idx is None or word_idx == previous_word_idx:
        label_ids.append(-100) #usind -100 for tokens not associated with a word index
    elif word_idx != previous_word_idx:
        label_ids.append(labels[word_idx])
    # Update previous_word_idx to the current word index
    previous_word_idx = word_idx

print('label_ids ', label_ids)

#converting label IDs to actual labels or "IGN" for ignored tokens
labels = [index2tag[l] if l != -100 else "IGN" for l in label_ids]
index = ["Tokens", "Word IDs", "Label IDs", "Labels"]

pd.DataFrame([tokens, word_ids, label_ids, labels], index=index)

label_ids  [-100, 0, 0, 0, 0, 0, 1, -100, 2, 2, 2, -100]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Tokens,<s>,▁'',▁January,▁21,▁'',▁–,▁Nan,ny,▁and,▁the,▁Professor,</s>
Word IDs,,0,1,2,3,4,5,5,6,7,8,
Label IDs,-100,0,0,0,0,0,1,-100,2,2,2,-100
Labels,IGN,O,O,O,O,O,B-PER,IGN,I-PER,I-PER,I-PER,IGN


In [None]:
#function to tokenize the input text, mask certain tokens, and adjust the labels accordingly
def tokenize_mask_modify_labels(examples):
    tokenized_inputs = xlmr_tokenizer(examples["tokens"], truncation=True,
                                      is_split_into_words=True)
    labels = []
    for idx, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=idx)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Map
Some of the more powerful applications of huggingface Datasets come from using the map() function. The primary purpose of map() is to speed up processing functions. It allows you to apply a processing function to each example in a dataset, independently or in batches. This function can even create new rows and columns.

In [None]:
def encode_panx_dataset(corpus):
    """
    Encode a dataset in-place by tokenizing, masking, and modifying labels.

    Args:
    - corpus (datasets.Dataset): Dataset to be processed, containing columns 'langs', 'ner_tags', and 'tokens'.

    Returns:
    - datasets.Dataset: Processed dataset with tokenized inputs and modified labels, columns 'input_ids', 'attention_mask', and 'labels'.
    """
    # Use the map method to apply tokenize_mask_modify_labels function to each batch
    # of the dataset, removing columns 'langs', 'ner_tags', and 'tokens' in the process.
    return corpus.map(tokenize_mask_modify_labels, batched=True, remove_columns=['langs', 'ner_tags', 'tokens'])


In [None]:
panx_en_encoded = encode_panx_dataset(panx_ds_combined["en"])

Map: 100%|██████████| 3258/3258 [00:00<00:00, 17046.72 examples/s]


In [None]:
# Importing the classification_report function from seqeval.metrics
from seqeval.metrics import classification_report

# True labels (ground truth) for two sequences
y_true = [["O", "O", "O", "B-MISC", "I-MISC", "I-MISC", "O"],
          ["B-PER", "I-PER", "O"]]

# Predicted labels for the same sequences as y_true
y_pred = [["O", "O", "B-MISC", "I-MISC", "I-MISC", "I-MISC", "O"],
          ["B-PER", "I-PER", "O"]]

# Generating and printing the classification report comparing y_true and y_pred
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

        MISC       0.00      0.00      0.00         1
         PER       1.00      1.00      1.00         1

   micro avg       0.50      0.50      0.50         2
   macro avg       0.50      0.50      0.50         2
weighted avg       0.50      0.50      0.50         2



In [None]:
# eval prediction

In [None]:
def generate_list_compute_metrics(predictions, label_ids):
    # predictions: predicted probabilities or logits (output of the model)
    # label_ids: true label indices for each token in each example

    # Get the predicted labels by taking the argmax along the last axis (which represents classes)
  preds = np.argmax(predictions, axis=2)
  batch_size, seq_len = preds.shape
  predictions_list, true_list = [],[]

  for b_idx in range(batch_size): #iterating over each example in the batch
    example_labels, example_preds = [],[]
    for s_idx in range(seq_len): #iterating over each token in the sequence
      if label_ids[b_idx][s_idx] != -100: #ignoring masked tokens
        example_preds.append(index2tag[preds[b_idx][s_idx]])
        example_labels.append(index2tag[label_ids[b_idx][s_idx]])
    predictions_list.append(example_preds)
    true_list.append(example_labels)
  return predictions_list, true_list

In [None]:

!pip install accelerate -U



In [None]:
num_epochs = 3
batch_size = 24
logging_steps = len(panx_en_encoded["train"]) // batch_size
model_name = f"{xlmr_model_name}-finetuned on panx english"

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=model_name,                    # Directory where model checkpoints and logs will be saved
    log_level="error",                        # Set log level (error, warning, info, debug)
    num_train_epochs=num_epochs,              # Number of training epochs
    per_device_train_batch_size=batch_size,   # Batch size per GPU/TPU core for training
    per_device_eval_batch_size=batch_size,    # Batch size per GPU/TPU core for evaluation
    evaluation_strategy="epoch",              # Evaluate every epoch
    save_steps=1e6,                           # Number of steps before saving model checkpoint
    weight_decay=0.01,                        # Weight decay for regularization
    disable_tqdm=False,                       # Disable tqdm progress bars
    push_to_hub=False,                        # Whether to push to the Hub (if using Transformers Hub)
    logging_steps=logging_steps,              # Log metrics every `logging_steps` steps
)




In [None]:
def compute_metrics(p: EvalPrediction):
  print('eval prediction', p.predictions)
  y_pred,y_true = generate_list_compute_metrics(p.predictions, p.label_ids)
  return{"f1": f1_score(y_true,y_pred)}

In [None]:
data_collator = DataCollatorForTokenClassification(xlmr_tokenizer) #initializing DataCollatorForTokenClassification with XLM-RoBERTa tokenizer

def model_init(): #initialize the token classification model
  return (XLMRobertaforTokenClassification.from_pretrained(xlmr_model_name, config = xlmr_config).to(device))

# Fine Tuning XLMRoberta

In [None]:
#fine tuning

from transformers import Trainer

trainer = Trainer(
    model_init = model_init,
    args = training_args,
    data_collator = data_collator,
    compute_metrics = compute_metrics,
    train_dataset = panx_en_encoded["train"],
    eval_dataset = panx_en_encoded["test"],
    tokenizer = xlmr_tokenizer,
)

In [None]:
trainer.train()

 33%|███▎      | 271/816 [25:02<46:19,  5.10s/it]  

{'loss': 0.5416, 'grad_norm': 8.992460250854492, 'learning_rate': 3.339460784313725e-05, 'epoch': 1.0}


 33%|███▎      | 272/816 [25:05<40:15,  4.44s/it]

eval prediction [[[ 1.61168385e+00  1.93553358e-01 -8.20245504e-01 ...  2.82754228e-02
    3.04702103e-01 -1.13533211e+00]
  [ 1.94839060e+00  4.59912419e-01 -2.37556744e+00 ... -1.21153152e+00
    1.54391217e+00 -2.16635203e+00]
  [ 2.65918159e+00 -2.46978331e+00 -2.43305564e-02 ...  2.75960350e+00
   -2.23649883e+00  5.03501534e-01]
  ...
  [-1.00000000e+02 -1.00000000e+02 -1.00000000e+02 ... -1.00000000e+02
   -1.00000000e+02 -1.00000000e+02]
  [-1.00000000e+02 -1.00000000e+02 -1.00000000e+02 ... -1.00000000e+02
   -1.00000000e+02 -1.00000000e+02]
  [-1.00000000e+02 -1.00000000e+02 -1.00000000e+02 ... -1.00000000e+02
   -1.00000000e+02 -1.00000000e+02]]

 [[ 8.16402674e-01 -1.61698431e-01 -6.89865127e-02 ...  2.16685236e-03
   -2.33492002e-01 -2.14633495e-01]
  [-5.39389372e-01  5.62044525e+00 -1.83847201e+00 ... -1.93583179e+00
    1.98128626e-01 -3.60505939e+00]
  [-1.14877224e+00 -1.09169114e+00  5.25410271e+00 ...  2.47328043e+00
   -2.00852394e+00  3.03800404e-03]
  ...
  [-1.0

                                                 
 33%|███▎      | 272/816 [27:35<40:15,  4.44s/it]

{'eval_loss': 0.3296959400177002, 'eval_f1': 0.7362115265441024, 'eval_runtime': 149.718, 'eval_samples_per_second': 21.761, 'eval_steps_per_second': 0.908, 'epoch': 1.0}


 66%|██████▋   | 542/816 [51:54<39:55,  8.74s/it]  

{'loss': 0.2954, 'grad_norm': 4.983207702636719, 'learning_rate': 1.678921568627451e-05, 'epoch': 1.99}


 67%|██████▋   | 544/816 [52:03<28:52,  6.37s/it]

eval prediction [[[ 2.1051674e+00  1.6002744e-02 -4.5070237e-01 ...  4.1323621e-02
   -2.3694363e-01 -1.1207001e+00]
  [ 4.0705748e+00  7.1567631e-01 -2.6463609e+00 ... -1.7420857e+00
    1.2597938e+00 -3.5140505e+00]
  [ 4.7437778e+00 -2.6340892e+00  2.7744853e-01 ...  2.2443707e+00
   -2.7068174e+00 -6.0830522e-01]
  ...
  [-1.0000000e+02 -1.0000000e+02 -1.0000000e+02 ... -1.0000000e+02
   -1.0000000e+02 -1.0000000e+02]
  [-1.0000000e+02 -1.0000000e+02 -1.0000000e+02 ... -1.0000000e+02
   -1.0000000e+02 -1.0000000e+02]
  [-1.0000000e+02 -1.0000000e+02 -1.0000000e+02 ... -1.0000000e+02
   -1.0000000e+02 -1.0000000e+02]]

 [[ 2.0466490e+00 -5.2606082e-01 -2.5325751e-01 ...  4.6759915e-01
   -5.1273954e-01 -9.0093946e-01]
  [ 8.2598329e-03  5.6770787e+00 -1.6933768e+00 ... -1.7214979e+00
   -1.8391693e-01 -3.7484708e+00]
  [-3.5606807e-01 -7.6042694e-01  4.5297313e+00 ...  1.8277693e+00
   -1.9076734e+00 -5.1208615e-01]
  ...
  [-1.0000000e+02 -1.0000000e+02 -1.0000000e+02 ... -1.000000

                                                 
 67%|██████▋   | 544/816 [54:33<28:52,  6.37s/it]

{'eval_loss': 0.28713440895080566, 'eval_f1': 0.772178699978827, 'eval_runtime': 150.7495, 'eval_samples_per_second': 21.612, 'eval_steps_per_second': 0.902, 'epoch': 2.0}


100%|█████████▉| 813/816 [1:20:49<00:15,  5.15s/it]  

{'loss': 0.2097, 'grad_norm': 7.200422763824463, 'learning_rate': 1.8382352941176472e-07, 'epoch': 2.99}


100%|██████████| 816/816 [1:21:01<00:00,  4.40s/it]

eval prediction [[[ 1.46049130e+00 -3.71023789e-02 -5.66521764e-01 ... -6.71732128e-02
    9.17637199e-02 -7.39446223e-01]
  [ 3.83629084e+00  3.14959109e-01 -3.00060081e+00 ... -2.05165267e+00
    1.74679971e+00 -3.00509357e+00]
  [ 4.89869308e+00 -3.02554798e+00 -6.37388110e-01 ...  1.96551132e+00
   -2.31688643e+00 -1.62800461e-01]
  ...
  [-1.00000000e+02 -1.00000000e+02 -1.00000000e+02 ... -1.00000000e+02
   -1.00000000e+02 -1.00000000e+02]
  [-1.00000000e+02 -1.00000000e+02 -1.00000000e+02 ... -1.00000000e+02
   -1.00000000e+02 -1.00000000e+02]
  [-1.00000000e+02 -1.00000000e+02 -1.00000000e+02 ... -1.00000000e+02
   -1.00000000e+02 -1.00000000e+02]]

 [[ 1.82026315e+00 -3.67643476e-01 -2.34368905e-01 ...  1.25174522e-01
   -4.47949737e-01 -7.34403849e-01]
  [ 2.94063449e-01  6.40595722e+00 -1.79854214e+00 ... -2.13840580e+00
   -7.60737360e-01 -4.04679489e+00]
  [-1.01316355e-01 -9.64817047e-01  6.11404943e+00 ...  2.02791357e+00
   -2.63783097e+00 -1.15156198e+00]
  ...
  [-1.0

                                                   
100%|██████████| 816/816 [1:23:38<00:00,  6.15s/it]

{'eval_loss': 0.28108614683151245, 'eval_f1': 0.7790427999156653, 'eval_runtime': 157.1895, 'eval_samples_per_second': 20.727, 'eval_steps_per_second': 0.865, 'epoch': 3.0}
{'train_runtime': 5018.9215, 'train_samples_per_second': 3.895, 'train_steps_per_second': 0.163, 'train_loss': 0.3487163671091491, 'epoch': 3.0}





TrainOutput(global_step=816, training_loss=0.3487163671091491, metrics={'train_runtime': 5018.9215, 'train_samples_per_second': 3.895, 'train_steps_per_second': 0.163, 'total_flos': 394314629981184.0, 'train_loss': 0.3487163671091491, 'epoch': 3.0})

In [None]:
df = pd.DataFrame(trainer.state.log_history)[['epoch','loss','eval_loss','eval_f1']]
df = df.rename(columns={'epoch':'Epoch','loss':'Training Loss','eval_loss':'Validation Loss', 'eval_f1':'F1'})
df['Epoch'] = df['Epoch'].apply(lambda x: round(x))
df['Training Loss'] = df['Training Loss'].ffill()
df[['Validation Loss', 'F1']] = df[['Validation Loss', 'F1']].bfill().ffill()
df.drop_duplicates()

Unnamed: 0,Epoch,Training Loss,Validation Loss,F1
0,1,0.5416,0.329696,0.736212
2,2,0.2954,0.287134,0.772179
4,3,0.2097,0.281086,0.779043


In [None]:
text_en = "John is visiting Germany this summer"
tag_text(text_en, tags, trainer.model, xlmr_tokenizer)

input_ids: tensor([[     0,   4939,     83, 150080, 102126,    903,  51065,      2]])
Shape of Outputs: torch.Size([1, 8, 7])
Predictions: tensor([[0, 1, 0, 0, 5, 0, 0, 0]])


Unnamed: 0,0,1,2,3,4,5,6,7
Tokens,<s>,▁John,▁is,▁visiting,▁Germany,▁this,▁summer,</s>
NER Tags,O,B-PER,O,O,B-LOC,O,O,O


In [None]:
valid_set_batch = panx_en_encoded["validation"]
valid_set_batch

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3258
})

In [None]:
valid_set_batch.features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

In [None]:
valid_set_batch.features["input_ids"]

Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None)

In [None]:
valid_set_batch.features['attention_mask']

Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)

In [None]:
from torch.nn.functional import cross_entropy

def forward_loss_labels(batch):
    # Create features dictionary for batch
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]

    # Use data_collator to prepare batch
    batch = data_collator(features)

    # Move input_ids, attention_mask, and labels to device
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)

    # Disable gradient computation for inference
    with torch.no_grad():
        # Forward pass through the model
        output = trainer.model(input_ids, attention_mask)
        # Predicted labels (indices with maximum probability)
        predicted_label = torch.argmax(output.logits, axis=-1).cpu().numpy()

    # Calculate cross-entropy loss
    loss = cross_entropy(output.logits.view(-1, 7), labels.view(-1), reduction="none")
    # Reshape loss to match batch size
    loss = loss.view(len(input_ids), -1).cpu().numpy()

    # Return dictionary with loss and predicted labels
    return {"loss": loss, "predicted_label": predicted_label}


In [None]:
valid_set_with_loss = valid_set_batch.map(forward_loss_labels, batched=True, batch_size=32)
df = valid_set_with_loss.to_pandas()

Map: 100%|██████████| 3258/3258 [02:33<00:00, 21.23 examples/s]


In [None]:
df.shape

(3258, 5)

In [None]:
df.head

<bound method NDFrame.head of                                               input_ids  \
0     [0, 5106, 235474, 14, 15491, 15619, 152, 106, ...   
1     [0, 353, 3459, 26708, 13, 78833, 5106, 339, 5,...   
2     [0, 4687, 1902, 39395, 5470, 678, 40137, 2548,...   
3     [0, 38348, 59338, 15, 1735, 38662, 192859, 138...   
4     [0, 636, 330, 122807, 242, 7, 314, 60635, 31, ...   
...                                                 ...   
3253  [0, 242, 5106, 6, 22905, 724, 5106, 242, 20, 5...   
3254  [0, 360, 7270, 6, 4, 764, 435, 678, 8055, 13, ...   
3255  [0, 242, 5106, 13684, 48585, 15819, 41975, 510...   
3256            [0, 57035, 38026, 6, 4, 23213, 1760, 2]   
3257            [0, 26521, 19175, 159, 157, 685, 56, 2]   

                                         attention_mask  \
0                     [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]   
1               [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]   
2     [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
3                        

# Cross Lingual Transfer (Zero Shot Lerning)

In [None]:
#function to get f1 score of some dataset on some trainer
def get_f1(trainer,dataset):
    return trainer.predict(dataset).metrics["test_f1"]

f1_scores = defaultdict(dict)
f1_scores["en"]["en"] = get_f1(trainer, panx_en_encoded["test"])
print(f"F1 Score of [en] model on [en] dataset : {f1_scores['en']['en']:.3f}")

100%|██████████| 136/136 [02:31<00:00,  1.08it/s]

eval prediction [[[ 1.46049130e+00 -3.71023789e-02 -5.66521764e-01 ... -6.71732128e-02
    9.17637199e-02 -7.39446223e-01]
  [ 3.83629084e+00  3.14959109e-01 -3.00060081e+00 ... -2.05165267e+00
    1.74679971e+00 -3.00509357e+00]
  [ 4.89869308e+00 -3.02554798e+00 -6.37388110e-01 ...  1.96551132e+00
   -2.31688643e+00 -1.62800461e-01]
  ...
  [-1.00000000e+02 -1.00000000e+02 -1.00000000e+02 ... -1.00000000e+02
   -1.00000000e+02 -1.00000000e+02]
  [-1.00000000e+02 -1.00000000e+02 -1.00000000e+02 ... -1.00000000e+02
   -1.00000000e+02 -1.00000000e+02]
  [-1.00000000e+02 -1.00000000e+02 -1.00000000e+02 ... -1.00000000e+02
   -1.00000000e+02 -1.00000000e+02]]

 [[ 1.82026315e+00 -3.67643476e-01 -2.34368905e-01 ...  1.25174522e-01
   -4.47949737e-01 -7.34403849e-01]
  [ 2.94063449e-01  6.40595722e+00 -1.79854214e+00 ... -2.13840580e+00
   -7.60737360e-01 -4.04679489e+00]
  [-1.01316355e-01 -9.64817047e-01  6.11404943e+00 ...  2.02791357e+00
   -2.63783097e+00 -1.15156198e+00]
  ...
  [-1.0

100%|██████████| 136/136 [02:32<00:00,  1.12s/it]

F1 Score of [en] model on [en] dataset : 0.779





since we fine tuned the model on english only, the f1 score obtained here is same as the one we obtained earlier. so we can make an assumption that wit will be similar for the other languages, if the model is fine tuned on their dataset.

In [None]:
text_de = "Die Deutsche Bank hat ihren Hauptsitz in Frankfurt"
tag_text(text_de, tags, trainer.model, xlmr_tokenizer)


input_ids: tensor([[     0,    622,  35473,   4932,   1256,  22667,  47582, 105173,     23,
          37061,      2]])
Shape of Outputs: torch.Size([1, 11, 7])
Predictions: tensor([[0, 0, 3, 4, 0, 0, 0, 0, 0, 5, 0]])


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
Tokens,<s>,▁Die,▁Deutsche,▁Bank,▁hat,▁ihren,▁Haupt,sitz,▁in,▁Frankfurt,</s>
NER Tags,O,O,B-ORG,I-ORG,O,O,O,O,O,B-LOC,O


In [None]:
def evaluate_lang_performance(lang, trainer):
    # Encode the PAN-X dataset for the specified language
    panx_ds = encode_panx_dataset(panx_ds_combined[lang])

    # Evaluate the trained model using the test set of the encoded PAN-X dataset
    return get_f1(trainer, panx_ds["test"])

In [None]:
f1_scores["en"]["de"] = evaluate_lang_performance("de", trainer)
print(f"F1 Score of [en] model on [de] dataset : {f1_scores['en']['de']:.3f}")

Map: 100%|██████████| 4260/4260 [00:00<00:00, 9801.62 examples/s]
Map: 100%|██████████| 2130/2130 [00:00<00:00, 12083.20 examples/s]
Map: 100%|██████████| 2130/2130 [00:00<00:00, 12186.12 examples/s]
100%|██████████| 89/89 [01:46<00:00,  1.28s/it]

eval prediction [[[ 1.7654796e+00 -5.3948122e-01 -4.3298650e-01 ... -1.2113470e-01
   -7.7366360e-02 -1.5570924e-01]
  [ 7.9152718e+00 -1.4468144e+00 -1.5894629e+00 ... -1.1036086e+00
   -1.3259628e+00 -2.1514266e+00]
  [ 5.9021029e+00 -7.6581806e-01 -1.5934107e+00 ... -1.8940029e+00
   -2.1271126e-01 -3.0267739e+00]
  ...
  [-1.0000000e+02 -1.0000000e+02 -1.0000000e+02 ... -1.0000000e+02
   -1.0000000e+02 -1.0000000e+02]
  [-1.0000000e+02 -1.0000000e+02 -1.0000000e+02 ... -1.0000000e+02
   -1.0000000e+02 -1.0000000e+02]
  [-1.0000000e+02 -1.0000000e+02 -1.0000000e+02 ... -1.0000000e+02
   -1.0000000e+02 -1.0000000e+02]]

 [[ 1.6829864e+00 -1.2592244e-01  2.0880729e-02 ... -1.2303443e-01
   -4.6750200e-01 -5.4393089e-01]
  [ 7.3661137e+00 -1.6074311e+00 -1.6703248e+00 ... -1.1336257e+00
   -7.5777811e-01 -2.0468729e+00]
  [ 6.3137741e+00 -1.7128105e+00 -1.6979858e+00 ... -9.3677235e-01
   -6.0666120e-01 -1.6773344e+00]
  ...
  [-1.0000000e+02 -1.0000000e+02 -1.0000000e+02 ... -1.000000

100%|██████████| 89/89 [01:46<00:00,  1.20s/it]

F1 Score of [en] model on [de] dataset : 0.721





In [None]:
f1_scores["en"]["fr"] = evaluate_lang_performance("fr", trainer)
print(f"F1 Score of [en] model on [fr] dataset : {f1_scores['en']['fr']:.3f}")

Map: 100%|██████████| 5263/5263 [00:00<00:00, 15126.66 examples/s]
Map: 100%|██████████| 2631/2631 [00:00<00:00, 13920.36 examples/s]
Map: 100%|██████████| 2631/2631 [00:00<00:00, 15315.82 examples/s]
100%|██████████| 110/110 [01:52<00:00,  1.16it/s]

eval prediction [[[ 1.1729939e+00  2.7410325e-01  8.8141453e-01 ... -2.2013556e-02
   -3.1532711e-01 -9.0459645e-01]
  [-1.3996539e+00  6.6986141e+00 -1.1572733e+00 ... -1.7158077e+00
   -7.2285783e-01 -3.7401047e+00]
  [-1.2389549e+00 -8.7883365e-01  6.6923647e+00 ...  1.6943277e+00
   -2.3795376e+00 -1.0277795e+00]
  ...
  [-1.0000000e+02 -1.0000000e+02 -1.0000000e+02 ... -1.0000000e+02
   -1.0000000e+02 -1.0000000e+02]
  [-1.0000000e+02 -1.0000000e+02 -1.0000000e+02 ... -1.0000000e+02
   -1.0000000e+02 -1.0000000e+02]
  [-1.0000000e+02 -1.0000000e+02 -1.0000000e+02 ... -1.0000000e+02
   -1.0000000e+02 -1.0000000e+02]]

 [[ 2.9885125e-01 -6.8031669e-01 -1.3299687e-01 ...  1.2182331e+00
   -5.3166848e-01 -3.1925291e-01]
  [-6.8636402e-02  2.4938121e+00 -3.2208400e+00 ... -1.4235836e+00
    1.6480844e+00 -3.6147037e+00]
  [-1.2832096e+00 -2.1283283e+00  1.3356602e+00 ...  4.6180525e+00
   -2.3879809e+00  1.0397177e+00]
  ...
  [-1.0000000e+02 -1.0000000e+02 -1.0000000e+02 ... -1.000000

100%|██████████| 110/110 [01:52<00:00,  1.02s/it]

F1 Score of [en] model on [fr] dataset : 0.749





In [None]:
f1_scores["en"]["it"] = evaluate_lang_performance("it", trainer)
print(f"F1 Score of [en] model on [it] dataset : {f1_scores['en']['it']:.3f}")

Map: 100%|██████████| 2055/2055 [00:00<00:00, 11241.92 examples/s]
Map: 100%|██████████| 1027/1027 [00:00<00:00, 11424.95 examples/s]
Map: 100%|██████████| 1027/1027 [00:00<00:00, 11729.72 examples/s]
100%|██████████| 43/43 [00:43<00:00,  1.00s/it]

eval prediction [[[ 1.24791801e+00 -6.44773617e-03  2.00187206e-01 ... -5.00802398e-02
   -4.15002704e-01 -4.44445252e-01]
  [ 8.10542774e+00 -1.87346852e+00 -1.29669189e+00 ... -1.00024939e+00
   -1.37954640e+00 -1.57459366e+00]
  [-1.33458996e+00  6.41132069e+00 -8.97890806e-01 ... -1.62227225e+00
   -9.66181517e-01 -3.89935017e+00]
  ...
  [-1.00000000e+02 -1.00000000e+02 -1.00000000e+02 ... -1.00000000e+02
   -1.00000000e+02 -1.00000000e+02]
  [-1.00000000e+02 -1.00000000e+02 -1.00000000e+02 ... -1.00000000e+02
   -1.00000000e+02 -1.00000000e+02]
  [-1.00000000e+02 -1.00000000e+02 -1.00000000e+02 ... -1.00000000e+02
   -1.00000000e+02 -1.00000000e+02]]

 [[-3.29244375e-01 -3.91216576e-01  4.52060759e-01 ...  8.12653601e-01
   -1.13679074e-01  5.49406528e-01]
  [-1.51808143e+00  3.04120874e+00 -2.17342997e+00 ... -1.80968094e+00
    3.10376549e+00 -3.18472695e+00]
  [-1.95961642e+00 -1.75708246e+00  3.05272388e+00 ...  3.36874843e+00
   -1.77071524e+00  2.33589578e+00]
  ...
  [-1.0




In [None]:
f1_scores["en"]["es"] = evaluate_lang_performance("es", trainer)
print(f"F1 Score of [en] model on [es] dataset : {f1_scores['en']['es']:.3f}")

Map: 100%|██████████| 1904/1904 [00:00<00:00, 11255.92 examples/s]
Map: 100%|██████████| 952/952 [00:00<00:00, 13727.80 examples/s]
Map: 100%|██████████| 952/952 [00:00<00:00, 12933.43 examples/s]
100%|██████████| 40/40 [00:29<00:00,  1.35it/s]

eval prediction [[[ 7.02387333e-01  4.58234370e-01  2.37537190e-01 ...  5.70428610e-01
   -3.42528522e-01 -1.39344597e+00]
  [ 7.12630987e+00 -1.49028051e+00 -9.70057011e-01 ... -7.07740188e-01
   -1.28223944e+00 -2.15962958e+00]
  [-1.26061380e+00  6.23447180e+00 -1.33859730e+00 ... -1.30007482e+00
   -7.03920484e-01 -3.83252859e+00]
  ...
  [-1.00000000e+02 -1.00000000e+02 -1.00000000e+02 ... -1.00000000e+02
   -1.00000000e+02 -1.00000000e+02]
  [-1.00000000e+02 -1.00000000e+02 -1.00000000e+02 ... -1.00000000e+02
   -1.00000000e+02 -1.00000000e+02]
  [-1.00000000e+02 -1.00000000e+02 -1.00000000e+02 ... -1.00000000e+02
   -1.00000000e+02 -1.00000000e+02]]

 [[ 5.14903069e-01 -9.47425440e-02 -2.18583584e-01 ...  1.31150514e-01
    3.12980294e-01 -1.10850945e-01]
  [ 7.11149836e+00 -1.85019588e+00 -1.41997623e+00 ... -7.93622613e-01
   -9.06164944e-01 -1.65209866e+00]
  [-1.25735533e+00  3.01985312e+00 -2.80025005e+00 ... -2.25481963e+00
    4.52096796e+00 -2.73410296e+00]
  ...
  [-1.0




As we can see, the f1 scores obtained for german, french, italian, spanish are 0.72,0.75,0.77,0.68 respectively, which are quite similar to that of the english dataset. thus we infer that maybe fine tuning the model seperately on these datasets would be a much costlier process for a marginally better result. with better tuning and optimisation, we could get similar or even better results with zero shot learning