**Training a RoBERTa model for token classification on the CHIA dataset**

In [1]:
import evaluate
import numpy as np
import torch
import wandb
from datasets import load_dataset, load_metric
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

In [None]:
# Authenticate this session with Weights & Biases.
wandb.login()

In [8]:
# Entity types (without any B-/I- prefix).
simple_ent = {
    "Condition", "Value", "Drug", "Procedure", "Measurement",
    "Temporal", "Observation", "Person", "Device",
}

# BIO tag names indexed by label id: "O" comes first, followed by a
# (B-ENT, I-ENT) pair for every entity type in a fixed order.
entities_list = ["O"] + [
    f"{prefix}-{entity}"
    for entity in (
        "Condition", "Value", "Drug", "Procedure", "Measurement",
        "Temporal", "Observation", "Person", "Device",
    )
    for prefix in ("B", "I")
]

# tag name -> label id, and the inverse lookup label id -> tag name
sel_ent = {tag: idx for idx, tag in enumerate(entities_list)}
sel_ent_inv = dict(enumerate(entities_list))

In [9]:
# Project root. The first value is overridden right away by the second
# assignment, so the second one is the root that is actually used.
root = '..'
root = './drive/MyDrive/TER-LISN'

# Sub-folders derived from the root.
data_path = root + '/data'
models_path = root + '/models'

In [3]:
# Name of the pretrained checkpoint used for both the tokenizer and the model.
model_name = "roberta-base"

In [4]:
# The dataset is tokenized from pre-split words (is_split_into_words=True in
# tokenize_and_align_labels). RoBERTa's fast tokenizer refuses pre-tokenized
# input unless it was instantiated with add_prefix_space=True.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

In [5]:
# tokenize and align the labels in the dataset
def tokenize_and_align_labels(sentence, tokenizer, flag='I', label_names=None):
    """
    Tokenize a batch of pre-split sentences and align the word-level labels
    with the resulting sub-word tokens.

    The tokenizer output is indexed with ``batch_index`` and
    ``sentence['ner_tags']`` is iterated one sentence at a time, so the input
    must be a batch (e.g. ``Dataset.map(..., batched=True)``).

    inputs:
        sentence: dict, a batch from the dataset with
            - 'tokens': list of sentences, each a list of words
            - 'ner_tags': list of sentences, each a list of integer label ids
        tokenizer: callable tokenizer whose output exposes
            ``word_ids(batch_index=...)`` and supports item assignment
        flag: str or None, how to label the non-first sub-words of a word
            - 'I': use the intermediate tag of the word's entity (a B-ENT word
                   label becomes I-ENT; I-ENT and non-entity labels such as
                   "O" are kept unchanged)
            - 'B': use the same label as the word
            - None: use -100 for subwords
        label_names: optional list mapping label id -> tag name, only used
            when flag is 'I'; defaults to the module-level ``entities_list``
    outputs:
        tokenized_sentence: dict, the tokenized batch now with a 'labels' field
    raises:
        ValueError: if flag is not one of 'I', 'B' or None
    """
    # Reject unknown flags up front: silently skipping the sub-word would
    # leave the label list shorter than the token list.
    if flag not in ('I', 'B', None):
        raise ValueError(f"flag must be 'I', 'B' or None, got {flag!r}")

    # For flag 'I', map every B-ENT id to the id of its I-ENT counterpart.
    # Labels without an entry ("O", I-ENT) are propagated unchanged.
    continuation_id = {}
    if flag == 'I':
        if label_names is None:
            label_names = entities_list
        name_to_id = {name: idx for idx, name in enumerate(label_names)}
        continuation_id = {
            idx: name_to_id['I-' + name[2:]]
            for idx, name in enumerate(label_names)
            if name.startswith('B-') and 'I-' + name[2:] in name_to_id
        }

    tokenized_sentence = tokenizer(sentence['tokens'], is_split_into_words=True, truncation=True)

    labels = []
    for i, labels_s in enumerate(sentence['ner_tags']):
        word_ids = tokenized_sentence.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # token that does not belong to any word: assign -100
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # first sub-word of a new word: keep the word's label
                label_ids.append(labels_s[word_idx])
            elif flag == 'I':
                word_label = labels_s[word_idx]
                label_ids.append(continuation_id.get(word_label, word_label))
            elif flag == 'B':
                label_ids.append(labels_s[word_idx])
            else:
                # flag is None
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_sentence['labels'] = labels
    return tokenized_sentence

In [7]:
# Load the 'JavierLopetegui/chia_v1' dataset; its splits are indexed by name below.
dataset = load_dataset('JavierLopetegui/chia_v1')

In [None]:
# Pull the train / validation / test splits out of the loaded dataset.
train_dataset, val_dataset, test_dataset = (
    dataset[split] for split in ('train', 'validation', 'test')
)

In [None]:
# tokenize and align the labels in the dataset
# batched=True is required: tokenize_and_align_labels iterates over the
# sentences of a batch and indexes the tokenizer output with batch_index.
train_dataset = train_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer, 'I'), batched=True)
val_dataset = val_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer, 'I'), batched=True)
test_dataset = test_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer, 'I'), batched=True)

In [None]:
# load the model
# The pretrained checkpoint named by model_name is loaded for token
# classification with one output label per tag in entities_list.
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(entities_list))

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Move the model to the selected device.
model.to(device)

In [None]:
# define the training arguments
args = TrainingArguments(
    # output location and checkpointing
    output_dir='chia_ner_with_roberta',
    overwrite_output_dir=True,
    save_steps=1000,
    # optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    # evaluation and logging both happen every 50 steps
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=50,
    # experiment tracking
    report_to='wandb',
    run_name='chia_ner_with_roberta',
)

In [None]:
# Batch collator for token classification, built from the tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# seqeval metric used by compute_metrics_tr; loaded once here so that every
# evaluation step reuses the same object.
metric = evaluate.load("seqeval")


def compute_metrics_tr(p):
    """
    Compute the metrics for the model
    inputs:
        p: tuple (predictions, labels); predictions hold per-token scores
           whose argmax over axis 2 is the predicted label id, labels hold
           the reference label ids with -100 on positions to ignore
    outputs:
        dict: overall precision, recall, f1 and accuracy reported by the metric
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Drop positions labelled -100 and convert the remaining ids to tag
    # names, keeping predictions and references aligned sentence by sentence.
    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(predictions, labels):
        kept = [
            (pred_id, label_id)
            for pred_id, label_id in zip(pred_row, label_row)
            if label_id != -100
        ]
        true_predictions.append([entities_list[pred_id] for pred_id, _ in kept])
        true_labels.append([entities_list[label_id] for _, label_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
# define the trainer
# Every argument is passed by keyword so the wiring is explicit.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics_tr,
    tokenizer=tokenizer,
)

In [None]:
# Start a Weights & Biases run in the "Chia_NER" project.
wandb.init(project = "Chia_NER")

In [None]:
# Run the training loop with the trainer configured above.
trainer.train()

In [None]:
# Move the model back to the CPU (done here before it is written to disk).
model.to('cpu')

In [None]:
# Create the target folder if needed so the trained model is not lost to a
# missing-directory error, then serialize the whole model object to disk.
import os

os.makedirs(models_path, exist_ok=True)
torch.save(model, f"{models_path}/roberta-ner-chia.pt")

In [None]:
# Close the Weights & Biases run.
wandb.finish()