# HuggingFace Fine-Tuning NER model

In [2]:
from pathlib import Path
import os
import re
import sys
import random

import numpy as np
from sklearn.model_selection import train_test_split
import torch

from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification
from transformers import Trainer
from transformers import TrainingArguments

from datasets import load_metric

from spacy import displacy

print('* loaded all libs')

* loaded all libs


In [3]:
# specify which GPU to use
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"

## Pre-processing the dataset

In [4]:
dataset_fn = 'dataset.tsv'
test_size = 0.2
model_name = 'distilbert-base-cased'
fine_tuned_model_path = "./fine-tuned-model/covid19_symp_model"
output_dir = './results'

In [5]:
def read_dataset(file_path):
    file_path = Path(file_path)

    raw_text = file_path.read_text().strip()
    raw_docs = re.split(r'\n\t?\n', raw_text)
    token_docs = []
    tag_docs = []
    
    maxlen_tokens = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):
            token, tag = line.split('\t')
            tokens.append(token)
            tags.append(tag)
            
        if len(tokens) > len(maxlen_tokens):
            maxlen_tokens = tokens
            print("* maxlen of tokens: %d" % (len(maxlen_tokens)))
            
        token_docs.append(tokens)
        tag_docs.append(tags)

    return token_docs, tag_docs


def encode_tags(tags, encodings):
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
        arr_offset = np.array(doc_offset)

        # set labels whose first offset position is 0 and the second is not 0
        doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels


class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# parse the texts and tags
texts, tags = read_dataset(dataset_fn)

# split into train val
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=test_size)

# Next, let’s create encodings for our tokens and tags. 
# For the tags, we can start by just create a simple mapping which we’ll use in a moment:
unique_tags = set(tag for doc in tags for tag in doc)
label_list = list(unique_tags)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

print('* train texts: ', len(train_texts))
print('* unique tags: ', len(unique_tags))
print('* tags: ', label_list)

# get encodings
train_encodings = tokenizer(
    train_texts, 
    is_split_into_words=True, 
    return_offsets_mapping=True, 
    padding=True, 
    truncation=True
)
val_encodings = tokenizer(
    val_texts, 
    is_split_into_words=True, 
    return_offsets_mapping=True, 
    padding=True, 
    truncation=True
)

# get labels
train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)

# get datasets
# we don't want to pass this to the model
train_encodings.pop("offset_mapping") 
val_encodings.pop("offset_mapping")

train_dataset = MyDataset(train_encodings, train_labels)
val_dataset = MyDataset(val_encodings, val_labels)

print('* created dataset for training')

* maxlen of tokens: 58
* maxlen of tokens: 61
* maxlen of tokens: 99
* train texts:  129
* unique tags:  5
* tags:  ['I-TREATMENT', 'B-TREATMENT', 'O', 'I-SYMP', 'B-SYMP']
* created dataset for training


## Fine-tuning the model

In [6]:
training_args = TrainingArguments(
    output_dir=output_dir,           # output directory
    num_train_epochs=50,             # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

model = AutoModelForTokenClassification.from_pretrained(
    model_name, 
    num_labels=len(unique_tags), 
    id2label=id2tag, 
    label2id=tag2id
)

metric = load_metric("seqeval")
return_entity_level_metrics = True
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    if return_entity_level_metrics:
        # Unpack nested dictionaries
        final_results = {}
        for key, value in results.items():
            if isinstance(value, dict):
                for n, v in value.items():
                    final_results[f"{key}_{n}"] = v
            else:
                final_results[key] = value
        return final_results
    else:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    compute_metrics=compute_metrics
)

trainer.train()
print(trainer.evaluate())

# save the model
model.save_pretrained(fine_tuned_model_path)
print('* done fine-tuning and saved the fine-tuned model')

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this 

Step,Training Loss
10,1.394
20,1.353
30,1.2126
40,1.0466
50,0.8177
60,0.5861
70,0.5734
80,0.4186
90,0.331
100,0.2643




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 33
  Batch size = 256


Configuration saved in ./fine-tuned-model/covid19_symp_model/config.json


{'eval_loss': 0.2671107351779938, 'eval_SYMP_precision': 0.5079365079365079, 'eval_SYMP_recall': 0.64, 'eval_SYMP_f1': 0.5663716814159292, 'eval_SYMP_number': 50, 'eval_TREATMENT_precision': 0.7878787878787878, 'eval_TREATMENT_recall': 0.7878787878787878, 'eval_TREATMENT_f1': 0.7878787878787878, 'eval_TREATMENT_number': 33, 'eval_overall_precision': 0.6041666666666666, 'eval_overall_recall': 0.6987951807228916, 'eval_overall_f1': 0.64804469273743, 'eval_overall_accuracy': 0.9298642533936652, 'eval_runtime': 0.128, 'eval_samples_per_second': 257.756, 'eval_steps_per_second': 7.811, 'epoch': 50.0}


Model weights saved in ./fine-tuned-model/covid19_symp_model/pytorch_model.bin


* done fine-tuning and saved the fine-tuned model


## Predict

In [7]:
# if we have a pre-trained model we could just load it
model = AutoModelForTokenClassification.from_pretrained(fine_tuned_model_path)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print('* loaded the fine-turned model and tokenizer')

loading configuration file ./fine-tuned-model/covid19_symp_model/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "I-TREATMENT",
    "1": "B-TREATMENT",
    "2": "O",
    "3": "I-SYMP",
    "4": "B-SYMP"
  },
  "initializer_range": 0.02,
  "label2id": {
    "B-SYMP": 4,
    "B-TREATMENT": 1,
    "I-SYMP": 3,
    "I-TREATMENT": 0,
    "O": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.9.2",
  "vocab_size": 28996
}

loading weights file ./fine-tuned-model/covid19_symp_model/pytorch_model.bin
Al

* loaded the fine-turned model and tokenizer


In [11]:
# the default result doesn't group the entities
# so we need to specify the `aggregation_strategy` to group the entity for display
nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy='first')

texts = [
    "I was feeling bad all over about 7 pm aching and chills all over, and got nausea and extreme headache",
    "Moderna Covid-19 vaccine EUA Soreness at the injection site, body aches, chills, fatigue Treated effectively with ibuprofen",
    "Lightheaded immediately and almost passed out.  Felt a tingling in both hands.",
    "Metallic taste in mouth for about an hour.  8 hours later had the chills, body aches, headache, sinus pressure, and blocked ears.",
    "I feel head is ache and have some kind of breathing issue, but I don't take any aspirin",
    "He got cough last night"
]

# for showing the tags
get_color = lambda: "#"+''.join([random.choice('89ABCDEF') for j in range(6)])

# for spacy display
options = {"ents": [], "colors": {}}
for label in label_list:
    if label == 'O': continue
    ent = label[2:]
    if ent in options['ents']: continue
        
    options['ents'].append(ent)
    color = "linear-gradient(90deg, %s, %s)" % (get_color(), get_color())
    options['colors'][ent] = color

# predict each text
for i,text in enumerate(texts):
    result = nlp(text)
    doc = {
        "text": text,
        "ents": []
    }
    for ent in result:
        doc['ents'].append({
            'start': ent['start'], 
            'end': ent['end'], 
            'label': ent['entity_group']
        })
        
    displacy.render([doc], style='ent', manual=True, options=options)
    print('-'*60)

------------------------------------------------------------


------------------------------------------------------------


------------------------------------------------------------


------------------------------------------------------------


------------------------------------------------------------


------------------------------------------------------------
