# Training fine-tuned NER model

Based on the Hugging Face token-classification tutorial: https://github.com/huggingface/notebooks/blob/main/examples/token_classification.ipynb

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
%cd "/content/drive/MyDrive/Colab Notebooks/Fuji_Exercise/"

/content/drive/MyDrive/Colab Notebooks/Fuji_Exercise


In [5]:
from pathlib import Path

In [6]:
# Importing basic libraries
import pandas as pd
from ast import literal_eval
import numpy as np

In [7]:
import torch
print(torch. __version__)

1.12.1+cu113


In [8]:
!pip install datasets transformers seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [9]:
import transformers

print(transformers.__version__)

4.23.1


In [10]:
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification

In [11]:
# Configuration: base checkpoint to fine-tune, task name, and per-device batch size.
#
# Alternative checkpoints (commented out; swap one in by replacing the assignment below):

# model_checkpoint = "distilbert-base-uncased"

# model_checkpoint = "PlanTL-GOB-ES/bsc-bio-es"

model_checkpoint = 'MMG/xlm-roberta-large-ner-spanish'

# model_checkpoint = 'chizhikchi/Spanish_disease_finder'

# Used to build the label column name (f"{task}_tags") and the output directory name.
task = "ner"

# NOTE(review): the saved training/evaluation logs in this notebook report a batch size of 8,
# so they were presumably produced with a different value - confirm before comparing runs.
batch_size = 24                # Adjust if needed to avoid out-of-memory

## Load and pre-process datasets

In [12]:
# Read the tab-separated training split; the list-valued columns are decoded
# with `literal_eval` while the file is parsed.
train_df = pd.read_csv(
    'train.csv',
    sep='\t',
    converters={column: literal_eval for column in ('tokens', 'ner_tags')},
)
train_df

Unnamed: 0,tokens,ner_tags
0,"[Estoy, enferma, y, dejaron, de, pagarme, mi, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[¿, Qué, probabilidad, hay, de, no, pillar, la...","[O, O, O, O, O, O, O, O, B, O, O, O, O, O, O, ..."
2,"[Ok, chicas, .., hoy, vamos, a, hablar, de, ve...","[O, O, O, O, O, O, O, O, B, O, B, I, O, O, B, ..."
3,"[Aprendamos, un, poquito, de, ellos, ., Facili...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B, O, ..."
4,"[AUTOESTIMA, Y, ANSIEDAD, !, \n \n, La, ansied...","[O, O, O, O, O, O, B, I, O, B, I, O, O, B, I, ..."
...,...,...
1994,"[El, próximo, sábado, continuamos, con, el, CU...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1995,"[Estoy, muy, emocionado, porque, uno, de, mis,...","[O, O, O, O, O, O, O, O, B, O, O, B, O, O, O, ..."
1996,"[otra, vez, esq, tengo, un, trauma, así, lo, d...","[O, O, O, O, O, B, O, O, O, O, O]"
1997,"[Los, familiares, de, pacientes, de, esclerosi...","[O, O, O, O, O, B, I, O, O, B, I, O, O, O, O, ..."


In [13]:
# Read the tab-separated test split; the list-valued columns are decoded
# with `literal_eval` while the file is parsed.
test_df = pd.read_csv(
    'test.csv',
    sep='\t',
    converters={column: literal_eval for column in ('tokens', 'ner_tags')},
)
test_df

Unnamed: 0,tokens,ner_tags
0,"["", Las, niñas, con, autismo, se, acaban, conv...","[O, O, O, O, B, O, O, O, O, O, O, B, I, O, O, ..."
1,"[Nuestro, programa, de, diabetes, de, se, diri...","[O, O, O, B, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[Jornadas, de, Evaluación, Neurológica, pediát...","[O, O, O, O, O, O, O, O, O, O, O, B, I, O, O, ..."
3,"[¿, Sabiais, que, Alzheimer, bautizó, la, enfe...","[O, O, O, B, O, O, O, O, O, O, O, O, O, O, B, ..."
4,"[Cómo, se, puede, enseñar, empatía, ?, Cuando,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...
594,"[Con, el, COVID, hay, buenas, y, malas, notici...","[O, O, B, O, O, O, O, O, O, O, O, O, O, O, O, ..."
595,"[La, obesidad, es, una, enfermedad, crónica, q...","[O, B, O, O, B, I, O, O, O, O, O, O, O, O, O, ..."
596,"[👉, El, trasplante, es, una, alternativa, MUY,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
597,"[Saber, que, varios, de, mis, pacientes, que, ...","[O, O, O, O, O, O, O, O, O, B, I, B, I, O, O, ..."


In [14]:
# Tag vocabulary and the tag -> integer id lookup derived from it, so the two
# cannot drift apart.
label_list = ['O', 'B', 'I']
label_encoding_dict = {tag: idx for idx, tag in enumerate(label_list)}

In [15]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [16]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [17]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split words and align word-level tags to sub-word tokens.

    Relies on the notebook-level `tokenizer`, `task` and `label_encoding_dict`.

    Args:
        examples: Batch (as passed by `Dataset.map(..., batched=True)`) holding a
            "tokens" column and a f"{task}_tags" column of per-word string tags.
        label_all_tokens: If True (the previously hard-coded behaviour), every
            sub-word token of a word receives that word's tag id; if False,
            only the first sub-word does and the remaining ones get -100.

    Returns:
        The tokenizer output with an added "labels" entry: one list of label
        ids per example, with -100 at positions that have no source word.
    """
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    labels = []
    for i, word_tags in enumerate(examples[f"{task}_tags"]):
        previous_word_idx = None
        label_ids = []
        for word_idx in tokenized_inputs.word_ids(batch_index=i):
            if word_idx is None:
                # Tokens without a source word (e.g. special tokens) get the
                # ignored index, which compute_metrics filters out.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # The former `== '0'` (digit zero) special case never matched the
                # 'O'/'B'/'I' tags; 'O' already maps to 0 through the lookup.
                label_ids.append(label_encoding_dict[word_tags[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [19]:
train_tokenized_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [21]:
print(train_tokenized_dataset[7])

{'tokens': ['Férula', 'dinámica', 'personalizada', 'para', 'lesión', 'de', 'nervio', 'radial', ',', 'realizada', 'por', 'nuestra', 'terapeuta', 'ocupacional', 'Rocío', 'Fernández', '.', 'La', 'férula', 'dinámica', 'facilita', 'el', 'movimiento', 'de', 'extensión', 'de', 'dedos', 'manteniendo', 'la', 'muñeca', 'en', 'extensión', 'con', 'una', 'correcta', 'alineación', '.', '#', 'NEUROAL', '#', 'terapeutaocupacional', 'https://t.co/RXqk2VQBn8', '\n\n\n\n\n\n'], 'ner_tags': ['O', 'O', 'O', 'O', 'B', 'I', 'B', 'I', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'input_ids': [0, 55234, 882, 143, 182635, 183120, 121, 95, 23346, 8, 24030, 846, 4567, 289, 6, 4, 61777, 196, 13966, 89722, 11, 18327, 15736, 2777, 4471, 31, 66995, 6, 5, 239, 116826, 9723, 182635, 33493, 88, 80152, 8, 172989, 8, 8, 1140, 31101, 20736, 21, 842, 27488, 408, 22, 172989, 158, 220, 104390, 10,

## Fine-tune a pretrained model using the Hugging Face Trainer (PyTorch backend)

In [22]:
# Load the base checkpoint with a classification head sized for `label_list`.
# `ignore_mismatched_sizes=True` lets the head be re-initialised when the
# checkpoint's own head has a different number of labels.
# id2label / label2id are stored in the model config so that saved checkpoints
# carry the real tag names instead of the auto-generated LABEL_0/LABEL_1/LABEL_2.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id=label_encoding_dict,
    ignore_mismatched_sizes=True,
)
model

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at MMG/xlm-roberta-large-ner-spanish and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([3, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLMRobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
        

In [23]:
model_name = f'{model_checkpoint.split("/")[-1]}-finetuned-{task}'
model_name

'xlm-roberta-large-ner-spanish-finetuned-ner'

In [26]:
# Hyper-parameters for the Trainer: checkpoints are written under `model_name`
# and evaluation runs once per epoch.
args = TrainingArguments(
    output_dir=model_name,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [27]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [28]:
# seqeval metric object consumed by `compute_metrics`.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases (the truncated
# warning in this cell's recorded output is presumably that notice). Moving to the separate
# `evaluate` package would add a dependency - confirm before upgrading.
metric = load_metric("seqeval")

  """Entry point for launching an IPython kernel.


In [29]:
def compute_metrics(p):
    """Turn raw Trainer predictions into overall seqeval scores.

    Args:
        p: Pair ``(predictions, labels)``. Predictions are reduced with an
            argmax over axis 2; positions whose label is -100 are skipped.

    Returns:
        Dict with the overall 'precision', 'recall', 'f1' and 'accuracy'
        computed by the notebook-level `metric`.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label (-100 marks ignored tokens).
        kept_pairs = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([label_list[predicted] for predicted, _ in kept_pairs])
        true_labels.append([label_list[gold] for _, gold in kept_pairs])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        'precision': results["overall_precision"],
        'recall': results["overall_recall"],
        'f1': results["overall_f1"],
        'accuracy': results["overall_accuracy"],
    }

In [30]:
# Wire the model, hyper-parameters, tokenized splits, collator and metric
# function into a single Trainer instance.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=test_tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [31]:
# Evaluate initial model before training
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `XLMRobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 599
  Batch size = 8
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.6814725995063782,
 'eval_precision': 0.07251197445449707,
 'eval_recall': 0.12862874675477934,
 'eval_f1': 0.09274227856717436,
 'eval_accuracy': 0.6951891639420832,
 'eval_runtime': 16.4771,
 'eval_samples_per_second': 36.353,
 'eval_steps_per_second': 4.552}

In [32]:
# Model training on new dataset
trainer.train()

The following columns in the training set don't have a corresponding argument in `XLMRobertaForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `XLMRobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1999
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.109507,0.794657,0.807411,0.800983,0.963989
2,0.120000,0.110385,0.824994,0.844465,0.834616,0.967328
3,0.120000,0.108154,0.851374,0.855794,0.853578,0.970808


The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `XLMRobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 599
  Batch size = 8


Saving model checkpoint to xlm-roberta-large-ner-spanish-finetuned-ner/checkpoint-500
Configuration saved in xlm-roberta-large-ner-spanish-finetuned-ner/checkpoint-500/config.json
Model weights saved in xlm-roberta-large-ner-spanish-finetuned-ner/checkpoint-500/pytorch_model.bin
tokenizer config file saved in xlm-roberta-large-ner-spanish-finetuned-ner/checkpoint-500/tokenizer_config.json
Special tokens file saved in xlm-roberta-large-ner-spanish-finetuned-ner/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `XLMRobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 599
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForTokenClassification.forward` and have been

TrainOutput(global_step=750, training_loss=0.09752429326375325, metrics={'train_runtime': 610.4413, 'train_samples_per_second': 9.824, 'train_steps_per_second': 1.229, 'total_flos': 1077389016165198.0, 'train_loss': 0.09752429326375325, 'epoch': 3.0})

In [33]:
# Evaluate fine-tuned model after training
trainer.evaluate()


The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `XLMRobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 599
  Batch size = 8


{'eval_loss': 0.10815449059009552,
 'eval_precision': 0.8513735618689833,
 'eval_recall': 0.8557941940051923,
 'eval_f1': 0.853578154425612,
 'eval_accuracy': 0.9708080336291453,
 'eval_runtime': 13.5382,
 'eval_samples_per_second': 44.245,
 'eval_steps_per_second': 5.54,
 'epoch': 3.0}

In [34]:
trainer.save_model(f'{model_name}.model')

Saving model checkpoint to xlm-roberta-large-ner-spanish-finetuned-ner.model
Configuration saved in xlm-roberta-large-ner-spanish-finetuned-ner.model/config.json
Model weights saved in xlm-roberta-large-ner-spanish-finetuned-ner.model/pytorch_model.bin
tokenizer config file saved in xlm-roberta-large-ner-spanish-finetuned-ner.model/tokenizer_config.json
Special tokens file saved in xlm-roberta-large-ner-spanish-finetuned-ner.model/special_tokens_map.json


In [45]:
# Load previously collected evaluation results from a text file and put them in a DataFrame.
# NOTE(review): the recorded output of this cell is a SyntaxError, i.e. the file content was
# not a single valid Python literal when the cell was last run. No cell in this notebook writes
# 'NER_finetuning_evaluation.txt', so its expected format cannot be confirmed from here.
evaluation_results = literal_eval(Path('NER_finetuning_evaluation.txt').read_text(encoding='UTF-8'))
evaluation_results_df = pd.DataFrame(evaluation_results)

SyntaxError: ignored

In [44]:
!ls

1499829205252591620.txt
annotated.csv
bsc-bio-es-finetuned-ner
bsc-bio-es-finetuned-ner.model
Kuteykin_Fuji_Exercise_task1.ipynb
Kuteykin_Fuji_Exercise_task2.ipynb
mentions.tsv
NER_finetuning_evaluation.txt
NER_Finetuning_HuggingFace_model.ipynb
test.csv
train.csv
xlm-roberta-large-ner-spanish-finetuned-ner
xlm-roberta-large-ner-spanish-finetuned-ner.model


In [30]:
evaluation_results_df.sort_values(by='eval_f1', ascending=False)

Unnamed: 0,initial_model,fine-tuned_model,epoch,eval_accuracy,eval_f1,eval_loss,eval_precision,eval_recall,eval_runtime,eval_samples_per_second,eval_steps_per_second
1,chizhikchi/Spanish_disease_finder,Spanish_disease_finder-finetuned-ner,3.0,0.966981,0.694918,0.097974,0.716667,0.674451,5.0157,119.424,7.576
4,PlanTL-GOB-ES/bsc-bio-es,bsc-bio-es-finetuned-ner,3.0,0.964963,0.694551,0.103694,0.69167,0.697456,5.436,110.191,6.99
0,chizhikchi/Spanish_disease_finder,,3.0,0.928799,0.27222,0.653052,0.409377,0.203904,5.6993,105.101,6.667
2,MMG/xlm-roberta-large-ner-spanish,,,0.720785,0.088311,0.878867,0.074205,0.109039,16.9788,35.279,4.417
3,PlanTL-GOB-ES/bsc-bio-es,,,0.200976,0.058632,1.180117,0.033841,0.21924,7.4864,80.012,5.076


## Evaluation of fine-tuned NER model
### on text example (Spanish tweet)

In [31]:
del model

In [32]:
model_finetuned = f'{model_name}.model'

In [33]:
tokenizer_finetuned = AutoTokenizer.from_pretrained(model_finetuned)

loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


In [51]:
### Reading plain text from file

file = Path('1499829205252591620.txt')
text = file.read_text(encoding="UTF-8")
print(text)
tokens = tokenizer_finetuned(text)
tokens

Mira yo lo siento mucho pero esto de los diarios de una enfermedad y convertir tus redes en una especie de blog sobre tu depresión, ansiedad, un TCA o movidas así es como chica quédatelo pa ti, ve a terapia y reserva tus cosas privadas pa ti, no queremos saber tó lo q piensas





{'input_ids': [0, 17915, 1985, 418, 8258, 1374, 663, 1306, 262, 316, 10640, 262, 357, 791, 290, 5182, 3044, 4424, 288, 357, 4752, 262, 4272, 547, 842, 3149, 15, 3289, 15, 304, 29385, 318, 604, 25000, 931, 295, 405, 16252, 1526, 20836, 384, 542, 427, 15, 1229, 269, 2352, 290, 10179, 3044, 2380, 10704, 542, 427, 15, 373, 5719, 2303, 4000, 418, 3746, 29844, 202, 202, 202, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [37]:
# NOTE(review): this cell re-defines the function that is already defined in the
# pre-processing section of this notebook; it is kept in sync here, but a single
# definition would be preferable.
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split words and align word-level tags to sub-word tokens.

    Relies on the notebook-level `tokenizer`, `task` and `label_encoding_dict`.

    Args:
        examples: Batch (as passed by `Dataset.map(..., batched=True)`) holding a
            "tokens" column and a f"{task}_tags" column of per-word string tags.
        label_all_tokens: If True (the previously hard-coded behaviour), every
            sub-word token of a word receives that word's tag id; if False,
            only the first sub-word does and the remaining ones get -100.

    Returns:
        The tokenizer output with an added "labels" entry: one list of label
        ids per example, with -100 at positions that have no source word.
    """
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    labels = []
    for i, word_tags in enumerate(examples[f"{task}_tags"]):
        previous_word_idx = None
        label_ids = []
        for word_idx in tokenized_inputs.word_ids(batch_index=i):
            if word_idx is None:
                # Tokens without a source word (e.g. special tokens) get the ignored index.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # The former `== '0'` (digit zero) special case never matched the
                # 'O'/'B'/'I' tags; 'O' already maps to 0 through the lookup.
                label_ids.append(label_encoding_dict[word_tags[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [46]:
torch.tensor(tokens['input_ids']).unsqueeze(0).size()

ValueError: ignored

In [39]:
model = AutoModelForTokenClassification.from_pretrained(model_finetuned, num_labels=len(label_list))

loading configuration file bsc-bio-es-finetuned-ner.model/config.json
Model config RobertaConfig {
  "_name_or_path": "bsc-bio-es-finetuned-ner.model",
  "architectures": [
    "RobertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.23.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50262
}

loading weig

In [40]:
# Single forward pass over the tokenized tweet; `unsqueeze(0)` adds the batch axis.
# The module is called directly instead of through `.forward(...)` so that any
# registered hooks run, and gradients are disabled because this is inference only.
with torch.no_grad():
    predictions_tensor = model(
        input_ids=torch.tensor(tokens['input_ids']).unsqueeze(0),
        attention_mask=torch.tensor(tokens['attention_mask']).unsqueeze(0),
    )


In [41]:
# Highest-scoring label id per token: squeeze out size-1 axes, then argmax over axis 1.
predictions_argmax = predictions_tensor.logits.squeeze().argmax(dim=1)

In [42]:
predictions = [label_list[i] for i in predictions_argmax]

In [43]:
# Decode every input id on its own to get the sub-word strings. This uses
# `tokenizer_finetuned`, the tokenizer that produced `tokens`, so the ids are
# always looked up in the matching vocabulary (the training-time `tokenizer`
# may belong to a different checkpoint).
words = tokenizer_finetuned.batch_decode(tokens['input_ids'])


In [44]:
print(words, '\n', predictions)

['<s>', ' Mira', ' yo', ' lo', ' siento', ' mucho', ' pero', ' esto', ' de', ' los', ' diarios', ' de', ' una', ' enfermedad', ' y', ' convertir', ' tus', ' redes', ' en', ' una', ' especie', ' de', ' blog', ' sobre', ' tu', ' depresión', ',', ' ansiedad', ',', ' un', ' TCA', ' o', ' mo', 'vidas', ' así', ' es', ' como', ' chica', ' qué', 'date', 'lo', ' pa', ' ti', ',', ' ve', ' a', ' terapia', ' y', ' reserva', ' tus', ' cosas', ' privadas', ' pa', ' ti', ',', ' no', ' queremos', ' saber', ' tó', ' lo', ' q', ' piensas', '\n', '\n', '\n', '</s>'] 
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'B', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
