<a href="https://colab.research.google.com/github/iamatul1214/NLP/blob/main/Named_Entity_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Initial project setup

#### We are going to use XLM-R, a state-of-the-art pretrained multilingual model.
By pretraining on huge corpora across many languages, 
these multilingual transformers enable zero-shot cross-lingual transfer. 
This means that a model that is fine-tuned on one language can be applied to others without any further training!


In [None]:
ROOT="/content/drive/MyDrive/Transformer_practice"
import os
os.chdir(ROOT)
!pwd

/content/drive/MyDrive/Transformer_practice


In [None]:
## Clone the companion notebooks repo of "NLP with Transformers"; its helper modules are used below to install the required libraries.
!git clone https://github.com/nlp-with-transformers/notebooks.git

Cloning into 'notebooks'...
remote: Enumerating objects: 422, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 422 (delta 0), reused 5 (delta 0), pack-reused 416[K
Receiving objects: 100% (422/422), 24.97 MiB | 5.07 MiB/s, done.
Resolving deltas: 100% (190/190), done.
Checking out files: 100% (117/117), done.


In [None]:
## install some libraries from install folder of our cloned repo
ROOT="/content/drive/MyDrive/Transformer_practice/notebooks"
os.chdir(ROOT)
from install import *
install_requirements()

⏳ Installing base requirements ...
✅ Base requirements installed!
⏳ Installing Git LFS ...
✅ Git LFS installed!


In [None]:
## Device setup on Pytorch
from utils import *
setup_chapter()

import pandas as pd
import numpy as np
import torch

print(torch.__version__)

device = "cuda" if torch.cuda.is_available() else 'cpu'
print(device)

No GPU was detected! This notebook can be *very* slow without a GPU 🐢
Go to Runtime > Change runtime type and select a GPU hardware accelerator.
Using transformers v4.11.3
Using datasets v1.16.1
1.11.0+cu113
cpu


In [None]:
## Let's download the XTREME benchmark dataset: it works easily with this model and the data is already annotated.
## Data Ingestion phase
from datasets import get_dataset_config_names
from datasets import load_dataset
from datasets import load_from_disk

xtreme_subsets = get_dataset_config_names("xtreme")
print(f"XTREME has {len(xtreme_subsets)} configurations ")

Downloading:   0%|          | 0.00/9.04k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/23.1k [00:00<?, ?B/s]

XTREME has 183 configurations


In [None]:
## This dataset is available in many languages, now we will choose the english language here.

panx_subsets = [s for s in xtreme_subsets if s.startswith("PAN")]
print("All Panx languages \n")
print(panx_subsets)

All Panx languages

['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg', 'PAN-X.bn', 'PAN-X.de', 'PAN-X.el',
'PAN-X.en', 'PAN-X.es', 'PAN-X.et', 'PAN-X.eu', 'PAN-X.fa', 'PAN-X.fi',
'PAN-X.fr', 'PAN-X.he', 'PAN-X.hi', 'PAN-X.hu', 'PAN-X.id', 'PAN-X.it',
'PAN-X.ja', 'PAN-X.jv', 'PAN-X.ka', 'PAN-X.kk', 'PAN-X.ko', 'PAN-X.ml',
'PAN-X.mr', 'PAN-X.ms', 'PAN-X.my', 'PAN-X.nl', 'PAN-X.pt', 'PAN-X.ru',
'PAN-X.sw', 'PAN-X.ta', 'PAN-X.te', 'PAN-X.th', 'PAN-X.tl', 'PAN-X.tr',
'PAN-X.ur', 'PAN-X.vi', 'PAN-X.yo', 'PAN-X.zh']


In [None]:
en = load_dataset("xtreme", name='PAN-X.en')
print("English dataset \n")
print(en)

  0%|          | 0/3 [00:00<?, ?it/s]

English dataset

DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
})


In [None]:
## Data Exploration
### Train data exploration
for i in en['train']:
  print(pd.DataFrame(i))
  break   ## just checking the first row

      tokens  ner_tags langs
0       R.H.         3    en
1   Saunders         4    en
2          (         0    en
3        St.         3    en
4   Lawrence         4    en
5      River         4    en
6          )         0    en
7          (         0    en
8        968         0    en
9         MW         0    en
10         )         0    en


In [None]:
# A sample data from training
en['train'][101]['tokens']

['The', 'International', 'League', 'of', 'Dermatological', 'Societies']

In [None]:
## Let's look each part of it from the dataframe perspective
pd.DataFrame(en['train'][101]).T

Unnamed: 0,0,1,2,3,4,5
tokens,The,International,League,of,Dermatological,Societies
ner_tags,3,4,4,4,4,4
langs,en,en,en,en,en,en


In [None]:
## Now let's look into the NER tags
en['train'].features['ner_tags'].feature.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [None]:
tags = en["train"].features["ner_tags"].feature
tags

ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], names_file=None, id=None)

In [None]:
tags.int2str(0)

'O'

In [None]:
tags.int2str(1)

'B-PER'

### The imported dataset has 3 splits: training, testing and validation

In [None]:

def create_tag_names(batch):
    """Translate the integer NER tag IDs of ``batch`` into their string names.

    Args:
        batch: Mapping with a ``"ner_tags"`` entry holding integer tag IDs.

    Returns:
        dict with a single key ``"ner_tags_str"`` whose value is the list of
        tag names, in the same order as ``batch["ner_tags"]``.
    """
    tag_names = []
    for tag_id in batch["ner_tags"]:
        # `tags` is the notebook-level label feature; int2str maps ID -> name.
        tag_names.append(tags.int2str(tag_id))
    return {"ner_tags_str": tag_names}

new_en = en.map(create_tag_names)

  0%|          | 0/10000 [00:00<?, ?ex/s]

  0%|          | 0/10000 [00:00<?, ?ex/s]

  0%|          | 0/20000 [00:00<?, ?ex/s]

In [None]:
de_example = new_en["train"][100]
pd.DataFrame([de_example["tokens"],de_example["ner_tags"] ,de_example["ner_tags_str"]],
['Tokens',"ner_tags" ,'ner_tags_str'])

Unnamed: 0,0,1,2,3,4
Tokens,List,of,years,in,Brazil
ner_tags,3,4,4,4,4
ner_tags_str,B-ORG,I-ORG,I-ORG,I-ORG,I-ORG


In [None]:
new_en

DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 20000
    })
})

## XLM-R 
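### XLM-R vocabulary: ~250,000 tokens
Instead of using a WordPiece tokenizer, XLM-R uses a tokenizer called SentencePiece.
 - Vocabulary: after tokenization, each token is replaced with its index position in the vocabulary.
 - Whitespace is encoded as the `▁` character so that the original spacing is preserved.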

In [None]:
from transformers import AutoTokenizer
xlmr_model_name = "xlm-roberta-base"   ## We are using roberta-base model here
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

In [None]:
## An example of xlmr tokenizer
xlmr_tokenizer("Hello I'm atul and looking for positivity")

{'input_ids': [0, 35378, 87, 25, 39, 99, 202, 136, 16487, 100, 40523, 2481, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
xlmr_tokens = xlmr_tokenizer("Hello I'm atul and looking for positivity").tokens()
xlmr_tokens

['<s>',
 '▁Hello',
 '▁I',
 "'",
 'm',
 '▁at',
 'ul',
 '▁and',
 '▁looking',
 '▁for',
 '▁positiv',
 'ity',
 '</s>']

# Custom Model Building with Pytorch and hugging face
## Model Architecture file

![nlpt_0404](https://user-images.githubusercontent.com/40850370/176989445-bbf9d48d-3244-4176-b7e0-9440a4ff1b37.png)

In [None]:
# we will use RoBERTa as the base model but augmented with settings specific to XLM-R. 
# The config_class ensures that the standard XLM-R settings are used when we initialize a new model.
# Note that we set add_pooling_layer=False to ensure all hidden states are returned and not only the one associated with the [CLS] token.
# Finally, we initialize all the weights by calling init_weights().
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    """RoBERTa encoder body with a linear head that classifies every token.

    The encoder is built without its pooling layer, and a dropout + linear
    layer maps each token's hidden state to ``config.num_labels`` logits.
    """

    # Use the XLM-R configuration class when a model is initialised.
    config_class = XLMRobertaConfig

    def __init__(self, config):
        """Build the encoder body and the classification head from ``config``."""
        super().__init__(config)
        self.num_labels = config.num_labels
        # Encoder body (no pooling layer: per-token hidden states are needed).
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        # Token classification head.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Initialise the weights.
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                labels=None, **kwargs):
        """Compute per-token logits and, when ``labels`` is given, the loss.

        Args:
            input_ids: Token IDs passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            token_type_ids: Token type IDs passed to the encoder.
            labels: Optional target class IDs, one per token; when provided a
                cross-entropy loss over all flattened positions is returned.
            **kwargs: Forwarded unchanged to the encoder.

        Returns:
            TokenClassifierOutput carrying ``loss`` (or None), ``logits`` and
            the encoder's ``hidden_states`` and ``attentions``.
        """
        encoder_outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs
        )
        # First element of the encoder output is the per-token representation.
        token_states = self.dropout(encoder_outputs[0])
        token_logits = self.classifier(token_states)

        if labels is None:
            loss = None
        else:
            # Flatten (batch, tokens) so every position is one classification.
            flat_logits = token_logits.view(-1, self.num_labels)
            loss = nn.CrossEntropyLoss()(flat_logits, labels.view(-1))

        return TokenClassifierOutput(
            loss=loss,
            logits=token_logits,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )










# Auto Configuration 

### The AutoConfig class contains the blueprint of a model's architecture.
### Here we pass some additional information beyond the model name,
### including the tags that we will use to label each entity and the mapping of each tag to an ID and vice versa.

In [None]:
index2tag = {idx:tag for idx, tag in enumerate(tags.names)}
index2tag

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC'}

In [None]:
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}

In [None]:
tag2index

{'B-LOC': 5,
 'B-ORG': 3,
 'B-PER': 1,
 'I-LOC': 6,
 'I-ORG': 4,
 'I-PER': 2,
 'O': 0}

In [None]:
xlmr_model_name

'xlm-roberta-base'

In [None]:
tags.num_classes    # So we have actually only 7 labels

7

In [None]:
from transformers import AutoConfig

xlmr_config = AutoConfig.from_pretrained(xlmr_model_name, 
                                         num_labels=tags.num_classes,
                                         id2label=index2tag, label2id=tag2index)

In [None]:
xlmr_config

XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 5,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

## Quick prediction without training

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


print(device)


xlmr_model = (XLMRobertaForTokenClassification.from_pretrained(xlmr_model_name, config=xlmr_config).to(device))

cpu


In [None]:
new_en["train"][101]

{'langs': ['en', 'en', 'en', 'en', 'en', 'en'],
 'ner_tags': [3, 4, 4, 4, 4, 4],
 'ner_tags_str': ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'],
 'tokens': ['The',
  'International',
  'League',
  'of',
  'Dermatological',
  'Societies']}

In [None]:
" ".join(new_en["train"][100]["tokens"])

'The International League of Dermatological Societies'

In [None]:
text = " ".join(new_en["train"][100]["tokens"])
text

'List of years in Brazil'

In [None]:
input_ids = xlmr_tokenizer.encode(text, return_tensors='pt')

In [None]:
input_ids

tensor([[    0, 32036,   111,  5369,    23, 30089,     2]])

In [None]:
# Run the not-yet-fine-tuned model on the encoded example sentence and take
# the highest-scoring class for every token.
outputs = xlmr_model(input_ids.to(device)).logits

predictions = torch.argmax(outputs, dim = -1)

# Report the length of the sequence that was actually fed to the model
# (`input_ids`), not the token list of the earlier tokenizer demo sentence.
print(f"Number of tokens in sequence = {input_ids.shape[-1]}")
print(f"Shape of the output = {outputs.shape}")

Number of tokens in sequence = 13
Shape of the output = torch.Size([1, 7, 7])


In [None]:
data = [i.item() for i in predictions[0]]
data

[1, 1, 1, 1, 1, 1, 1]

In [None]:
[index2tag[idx] for idx in data][1:-1]

['B-PER', 'B-PER', 'B-PER', 'B-PER', 'B-PER']

In [None]:
preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
# Pair each prediction with the tokens of `text` (the sentence the model saw);
# using the token list of a different sentence misaligns the two rows.
text_tokens = xlmr_tokenizer(text).tokens()
pd.DataFrame([text_tokens, preds], index=["Tokens", "Tags"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
Tokens,<s>,▁Hello,▁I,',m,▁at,ul,▁and,▁looking,▁for,▁positiv,ity,</s>
Tags,B-PER,B-PER,B-PER,B-PER,B-PER,B-PER,B-PER,,,,,,


In [None]:
predictions

tensor([1, 1, 1, 1, 1, 1, 1])

In [None]:
def tag_text(text, tags, model, tokenizer):
    """Predict a NER tag for every token of ``text`` and tabulate the result.

    Args:
        text: Raw input string to tag.
        tags: Object exposing ``names``, a sequence mapping class index to
            tag string.
        model: Callable token-classification model; ``model(input_ids)[0]``
            must be the logits tensor indexed as (batch, token, class).
        tokenizer: Tokenizer used for both the displayed tokens and the input
            IDs fed to the model.

    Returns:
        pandas.DataFrame with two rows, "Tokens" and "Tags", and one column
        per token (special tokens included).
    """
    # Encode once with the tokenizer that was passed in, so the displayed
    # tokens and the model inputs always come from the same tokenizer.
    encoding = tokenizer(text, return_tensors="pt")
    tokens = encoding.tokens()
    input_ids = encoding.input_ids
    # Send the inputs to wherever the model lives instead of relying on a
    # notebook-level `device` variable.
    model_device = getattr(model, "device", None)
    if model_device is not None:
        input_ids = input_ids.to(model_device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(input_ids)[0]
    # Take argmax over the class axis to get the most likely class per token.
    predictions = torch.argmax(outputs, dim=2)
    preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])

In [None]:
de_example = new_en["train"][8]

In [None]:
words, labels = de_example["tokens"], de_example["ner_tags"]
words, labels

(['*Inducted',
  'into',
  'the',
  'United',
  'States',
  'Hockey',
  'Hall',
  'of',
  'Fame',
  'in',
  '2015'],
 [0, 0, 0, 3, 4, 4, 4, 4, 4, 0, 0])

In [None]:
tokenized_input = xlmr_tokenizer(de_example["tokens"], is_split_into_words=True)
tokenized_input

{'input_ids': [0, 661, 4153, 77193, 297, 3934, 70, 14098, 46684, 193171, 19449, 111, 52917, 13, 23, 918, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['<s>',
 '▁*',
 'In',
 'duct',
 'ed',
 '▁into',
 '▁the',
 '▁United',
 '▁States',
 '▁Hockey',
 '▁Hall',
 '▁of',
 '▁Fam',
 'e',
 '▁in',
 '▁2015',
 '</s>']

In [None]:
word_ids = tokenized_input.word_ids()
word_ids

[None, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, None]

Why do we choose –100 as the ID to mask subword representations? The reason is that in PyTorch the cross-entropy loss class torch.nn.CrossEntropyLoss has an attribute called ignore_index whose value is –100. This index is ignored during training, so we can use it to ignore the tokens associated with consecutive subwords

In [None]:
[index2tag[idx] for idx in [0, 0, 0, 3, 4, 4, 4, 4, 4, 0, 0]][1:-1]

['O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O']

In [None]:
new_en["train"][8]['tokens']

['*Inducted',
 'into',
 'the',
 'United',
 'States',
 'Hockey',
 'Hall',
 'of',
 'Fame',
 'in',
 '2015']

In [None]:
# Align the word-level labels with the sub-word tokens: special tokens
# (word ID None) and every continuation piece of a word get -100, so only the
# first piece of each word carries the word's label.
previous_word_idx = None
label_ids = []

for word_idx in word_ids:
    if word_idx is None or word_idx == previous_word_idx:
        label_ids.append(-100)
    else:
        label_ids.append(labels[word_idx])
    previous_word_idx = word_idx

# Keep the integer `labels` intact (use a new name for the readable strings)
# so this cell can be re-run without breaking the lookup above.
label_names = [index2tag[l] if l != -100 else "IGN" for l in label_ids]
index = ["Tokens", "Word IDs", "Label IDs", "Labels"]

pd.DataFrame([tokens, word_ids, label_ids, label_names], index=index)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
Tokens,<s>,▁*,In,duct,ed,▁into,▁the,▁United,▁States,▁Hockey,▁Hall,▁of,▁Fam,e,▁in,▁2015,</s>
Word IDs,,0,0,0,0,1,2,3,4,5,6,7,8,8,9,10,
Label IDs,-100,0,-100,-100,-100,0,0,3,4,4,4,4,4,-100,0,0,-100
Labels,IGN,O,IGN,IGN,IGN,O,O,B-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,IGN,O,O,IGN


In [None]:
new_en["train"][8]["ner_tags"]

[0, 0, 0, 3, 4, 4, 4, 4, 4, 0, 0]

In [None]:
# apply this to whole dataset
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align labels to sub-words.

    Args:
        examples: Batch mapping with ``"tokens"`` (list of word lists) and
            ``"ner_tags"`` (list of per-word label ID lists).

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list per
        example holding the word's label on the first sub-word of each word
        and -100 on special tokens and continuation sub-words.
    """
    tokenized_inputs = xlmr_tokenizer(examples["tokens"], truncation=True,
                                      is_split_into_words=True)

    def _align(batch_index, word_labels):
        # Label only the first sub-word of every word; mask everything else.
        aligned = []
        last_word = None
        for word in tokenized_inputs.word_ids(batch_index=batch_index):
            starts_new_word = word is not None and word != last_word
            aligned.append(word_labels[word] if starts_new_word else -100)
            last_word = word
        return aligned

    tokenized_inputs["labels"] = [
        _align(position, word_labels)
        for position, word_labels in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs

In [None]:
def encode_panx_dataset(corpus):
    """Tokenize ``corpus`` in batches and attach aligned labels.

    Args:
        corpus: Object with a ``map`` method accepting ``batched`` and
            ``remove_columns`` keyword arguments.

    Returns:
        Whatever ``corpus.map`` returns after applying
        ``tokenize_and_align_labels`` with the raw 'langs', 'ner_tags' and
        'tokens' columns removed.
    """
    raw_columns = ['langs', 'ner_tags', 'tokens']
    return corpus.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=raw_columns,
    )

In [None]:
["wordId","index_id","label id","labels","attention mask"]

['wordId', 'index_id', 'label id', 'labels', 'attention mask']

In [None]:
panx_en_encoded = encode_panx_dataset(new_en) 

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

In [None]:
panx_en_encoded

DatasetDict({
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'ner_tags_str'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'ner_tags_str'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'ner_tags_str'],
        num_rows: 20000
    })
})

# Prediction Metrics

#### Evaluating a NER model is similar to evaluating a text classification model, and it is common to report results for precision, recall, and F1-score. The only subtlety is that all words of an entity need to be predicted correctly in order for a prediction to be counted as correct.

In [None]:
from seqeval.metrics import classification_report
print(new_en["train"][100]["tokens"])
print(new_en["train"][100]["ner_tags_str"])

['List', 'of', 'years', 'in', 'Brazil']
['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG']


In [None]:
y_true = [["O", "O", "O", "B-MISC", "I-MISC", "I-MISC", "O"],
          ["B-PER", "I-PER", "O"]]

y_pred = [["O", "O", "O", "B-MISC", "I-MISC", "I-MISC", "O"],
          ["B-PER", "I-PER", "O"]]


print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

        MISC       1.00      1.00      1.00         1
         PER       1.00      1.00      1.00         1

   micro avg       1.00      1.00      1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



In [None]:
import numpy as np

def align_predictions(predictions, label_ids):
    """Convert raw logits and label IDs into per-example lists of tag names.

    Positions whose label ID is -100 are dropped from both outputs.

    Args:
        predictions: Array indexed as (example, position, class) of scores.
        label_ids: 2-D array indexed as (example, position) of label IDs.

    Returns:
        Tuple ``(preds_list, labels_list)``: for every example, the predicted
        tag names and the reference tag names at the kept positions.
    """
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape

    labels_list, preds_list = [], []
    for row in range(batch_size):
        # Positions that carry a real label (label ID != -100).
        kept = [col for col in range(seq_len) if label_ids[row, col] != -100]
        labels_list.append([index2tag[label_ids[row][col]] for col in kept])
        preds_list.append([index2tag[preds[row][col]] for col in kept])

    return preds_list, labels_list

In [None]:
from transformers import TrainingArguments

# Number of passes over the training data.
num_epochs = 10

# Used for both the per-device training and evaluation batch size below.
batch_size = 24

# Number of full batches in a 100-example slice of the training split.
logging_steps = len(panx_en_encoded["train"].select(range(100))) // batch_size

# Also used as the Trainer's output directory.
model_name = f"{xlmr_model_name}-finetuned-panx-en"

training_args = TrainingArguments(
    output_dir=model_name, 
    log_level="error", 
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size, 
    # Evaluate once at the end of every epoch.
    evaluation_strategy="epoch",
    # NOTE(review): very large value, presumably to avoid writing checkpoints
    # during this short run - confirm.
    save_steps=1e6, 
    weight_decay=0.01,
    disable_tqdm=False,
    logging_steps=logging_steps
    )

In [None]:
training_args

TrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_steps=None,
evaluation_strategy=IntervalStrategy.EPOCH,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=40,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=xlm-roberta-base-finetuned-panx-en/runs/Jul05_15-46-23_0eabd064ad57,
logging_first_step=False,
logging_nan_

In [None]:
from seqeval.metrics import f1_score

def compute_metrics(eval_pred):
    """Compute the F1 score for one evaluation pass.

    Args:
        eval_pred: Object with ``predictions`` and ``label_ids`` attributes.

    Returns:
        dict with the single key ``"f1"``.
    """
    # align_predictions returns (predicted tags, reference tags).
    predicted_tags, reference_tags = align_predictions(
        eval_pred.predictions, eval_pred.label_ids
    )
    score = f1_score(reference_tags, predicted_tags)
    return {"f1": score}

In [None]:
# The final step is to define a data collator so we can pad each input sequence
# to the largest sequence length in a batch. Hugging Face Transformers provides a
# dedicated data collator for token classification that will pad the labels along with the inputs:

from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

In [None]:
def model_init():
    """Create a new token-classification model from the pretrained checkpoint.

    Returns:
        The model loaded with the notebook's ``xlmr_config`` and moved to
        ``device``.
    """
    model = XLMRobertaForTokenClassification.from_pretrained(
        xlmr_model_name, config=xlmr_config
    )
    return model.to(device)

In [None]:
from transformers import Trainer

# Train on a small slice only: the first 100 training examples and the first
# 10 validation examples. The Trainer obtains its model by calling model_init.
trainer = Trainer(model_init=model_init, args=training_args,
                  data_collator=data_collator, compute_metrics=compute_metrics,
                  train_dataset=panx_en_encoded["train"].select(range(100)),
                  eval_dataset=panx_en_encoded["validation"].select(range(10)),
                  tokenizer=xlmr_tokenizer
                  )

In [None]:
trainer

<transformers.trainer.Trainer at 0x7f9808497690>

In [None]:
trainer.train() 

Epoch,Training Loss,Validation Loss,F1
1,1.6937,1.57917,0.0
2,1.4957,1.401296,0.0
3,1.3842,1.307139,0.095238
4,1.1286,1.221704,0.4
5,0.9129,1.207355,0.216216
6,0.8319,1.142066,0.258065
7,0.8762,1.105881,0.222222
8,0.6551,1.127335,0.285714
9,0.5689,1.062311,0.277778
10,0.4908,1.032184,0.216216


TrainOutput(global_step=50, training_loss=0.9596670389175415, metrics={'train_runtime': 658.8774, 'train_samples_per_second': 1.518, 'train_steps_per_second': 0.076, 'total_flos': 25040710173072.0, 'train_loss': 0.9596670389175415, 'epoch': 10.0})

# Model Prediction 

In [None]:
input_ids = xlmr_tokenizer(new_en["validation"][:10]["tokens"],truncation=True,
                                      is_split_into_words=True)

In [None]:
data = torch.tensor(panx_en_encoded["train"][100]["input_ids"])
data = data.reshape(1,-1)

In [None]:
# Predict with the fine-tuned weights held by the Trainer. `xlmr_model` is a
# separate instance created before training and is not updated by
# `trainer.train()`, which trains the model built by `model_init`.
finetuned_model = trainer.model
finetuned_model.eval()  # switch off dropout for inference
with torch.no_grad():
    outputs = finetuned_model(data.to(finetuned_model.device)).logits
predictions = torch.argmax(outputs, dim=-1)
print(f"Number of tokens in sequence: {len(data[0])}")
print(f"Shape of outputs: {outputs.shape}")

Number of tokens in sequence: 7
Shape of outputs: torch.Size([1, 7, 7])


In [None]:
" ".join(xlmr_tokenizer.convert_ids_to_tokens(panx_en_encoded["train"][100]["input_ids"]))

'<s> ▁List ▁of ▁years ▁in ▁Brazil </s>'

In [None]:
torch.tensor(panx_en_encoded["train"][100]["input_ids"])

tensor([    0, 32036,   111,  5369,    23, 30089,     2])

In [None]:
outputs

tensor([[[ 0.1758,  0.4810,  0.1972, -0.0777,  0.0460, -0.5583,  0.2570],
         [ 0.1539,  0.6895,  0.2202, -0.2407, -0.0116, -0.4932,  0.0732],
         [ 0.1953,  0.6670,  0.2503, -0.2064, -0.1150, -0.4126,  0.1219],
         [ 0.2117,  0.6354,  0.2805, -0.2283, -0.0319, -0.5018,  0.1179],
         [ 0.1910,  0.6258,  0.3202, -0.2357, -0.0151, -0.4012,  0.1353],
         [ 0.1901,  0.6030,  0.1763, -0.1960,  0.0135, -0.4810,  0.0706],
         [ 0.1555,  0.4572,  0.2192, -0.0744,  0.0383, -0.5259,  0.2606]]],
       grad_fn=<AddBackward0>)

In [None]:
data

tensor([[    0, 32036,   111,  5369,    23, 30089,     2]])

In [None]:
tokens = xlmr_tokenizer.convert_ids_to_tokens(data[0])
" ".join(tokens)

'<s> ▁List ▁of ▁years ▁in ▁Brazil </s>'

In [None]:
pred_tags = [index2tag[i.item()] for i in predictions[0]][1:-1]
pred_tags

['B-PER', 'B-PER', 'B-PER', 'B-PER', 'B-PER']

In [None]:
" ".join([index2tag[i.item()] for i in predictions[0]])

'B-PER B-PER B-PER B-PER B-PER B-PER B-PER'

In [None]:
panx_en_encoded["validation"][0]['labels']

[-100, 3, -100, -100, 4, 4, 0, 5, -100, -100, 6, 6, -100, 6, 6, -100, 0, -100]

In [None]:
xlmr_tokenizer.convert_ids_to_tokens(panx_en_encoded["validation"][0]["input_ids"])

['<s>',
 '▁Si',
 'o',
 'ux',
 '▁Falls',
 '▁Arena',
 '▁(',
 '▁Si',
 'o',
 'ux',
 '▁Falls',
 '▁',
 ',',
 '▁South',
 '▁Da',
 'kota',
 '▁)',
 '</s>']

In [None]:
panx_en_encoded["validation"][0]['ner_tags_str']

['B-ORG',
 'I-ORG',
 'I-ORG',
 'O',
 'B-LOC',
 'I-LOC',
 'I-LOC',
 'I-LOC',
 'I-LOC',
 'O']