# Finetuning RoBERTa for NER: Train Model
 

***

## Imports

In [2]:
from transformers import (BertTokenizerFast,
                          RobertaTokenizerFast,
                          AutoTokenizer,
                          BertForTokenClassification,
                          RobertaForTokenClassification,
                          DataCollatorForTokenClassification, 
                          AutoModelForTokenClassification, 
                          TrainingArguments, Trainer)
from datasets import load_dataset, load_metric, concatenate_datasets, DatasetDict
from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
import dill as pickle
import torch
import math
import os

## Load Dataset

In [3]:
data_path = "./data/dataset_processed.pkl"
with open(data_path, 'rb') as pickle_file:
    dataset = pickle.load(file=pickle_file)

In [4]:
dataset["train"][0]

{'tokens': ['Das',
  'Beispiel',
  'der',
  'Wirtschaft',
  'Japans',
  'zeigt',
  ',',
  'dass',
  'dies',
  'Jahre',
  'oder',
  'Jahrzehnte',
  'dauern',
  'kann',
  '.'],
 'ner_tags': [0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'langs': ['de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de'],
 'spans': ['ORG: Wirtschaft Japans'],
 'input_ids': [1858,
  24212,
  122,
  96913,
  15758,
  7,
  35615,
  6,
  4,
  1421,
  14792,
  12241,
  1367,
  182205,
  13,
  201560,
  1876,
  6,
  5],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [0, 0, 0, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

## Load Model and Tokenizer

Information about model variants can be found here: https://huggingface.co/docs/transformers/model_doc/roberta

Load Model which can be finetuned:

In [5]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [6]:
label_list = dataset["train"].features[f"ner_tags"].feature.names

In [7]:
model_name = "xlm-roberta-large" #"bert-base-multilingual-cased" #xlm-roberta-large
tokenizer = AutoTokenizer.from_pretrained(f"{model_name}", add_prefix_space=True) #AutoTokenizer(use_fast = True)
model = AutoModelForTokenClassification.from_pretrained(f"{model_name}", num_labels=len(label_list))

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForTokenClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-

## Define Data Collator

In [8]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

## Define Trainer

About the Model:

see https://github.com/huggingface/transformers/blob/v4.21.1/src/transformers/modeling_utils.py#L829

In [9]:
print("Parameters:", model.num_parameters())
print("Expected Input Dict:", model.main_input_name )

# Estimate FLOPS needed for one training example
sample = dataset["train"][0]
sample["input_ids"] = torch.Tensor(sample["input_ids"])
flops_est = model.floating_point_ops(input_dict = sample, exclude_embeddings = False)

print("FLOPS needed per Training Sample:", flops_est )

Parameters: 558848007
Expected Input Dict: input_ids
FLOPS needed per Training Sample: 63708672798


In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 40000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 20000
    })
})

**Define Optimizer:**

See https://huggingface.co/docs/transformers/main_classes/optimizer_schedules#transformers.Adafactor

In [11]:
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

num_epochs = 1
batch_size = 16
num_reports = 10

# A training step is one gradient update. In one step batch_size examples are processed.
# An epoch consists of one full cycle through the training data. 
# This is usually many steps. As an example, if you have 2,000 images and use
# a batch size of 10 an epoch consists of:
gpu_count = torch.cuda.device_count()
num_steps = (len(dataset["train"]) / batch_size / gpu_count) * num_epochs

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.01, no_deprecation_warning= True)

scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0, 
    num_training_steps= num_steps 
)

print("Steps:", num_steps)

Steps: 1250.0


**Define Log and Eval Interval:**

In [12]:
report_steps = math.floor(num_steps / num_reports)
print("Eval interval:", report_steps)

Eval interval: 125


**Define Metrics:**

See https://huggingface.co/course/chapter7/2#metrics

In [13]:
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [14]:
example = dataset["train"][150]
labels = [label_list[i] for i in example[f"labels"]]
metric.compute(predictions=[labels], references=[labels])

{'PER': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

Set correct class labels:

In [15]:
label_names = dataset["train"].features[f"ner_tags"].feature.names

id2label = {id : label for id, label in enumerate(label_names)}
label2id = {label: id for id, label in enumerate(label_names)}

model.config.id2label = id2label
model.config.label2id = label2id

Define callback function to evaluate the model:

In [16]:
label_names = model.config.id2label

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    true_labels = [[label_names[l] for l in label  if l != -100] for label in labels]
    #true_predictions = [model.config.id2label[t.item()] for t in predictions]
    
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label)  if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

**Remove unnecessary columns:**

In [17]:
dataset = dataset.remove_columns(["tokens", "ner_tags", "langs", "spans"])

**Set further Training Arguments:**

See https://huggingface.co/docs/transformers/v4.21.2/en/main_classes/trainer#transformers.TrainingArguments

In [18]:
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy = "steps",
    save_steps = report_steps,
    remove_unused_columns = True,
    evaluation_strategy="steps",
    eval_steps = report_steps,
    #load_best_model_at_end=True,
    logging_strategy = "steps",
    logging_steps = report_steps,
    #learning_rate= 2e-5,
    #auto_find_batch_size = True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    #gradient_accumulation_steps=4,
    #optim="adamw_torch",
    num_train_epochs=num_epochs,
    #weight_decay=0.01,
    report_to="none",
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optimizer, scheduler),
    compute_metrics=compute_metrics
)

Using cuda_amp half precision backend


## Train Model

GPU used by Kaggle: https://www.nvidia.com/de-de/data-center/tesla-p100/

In [19]:
!nvidia-smi

Sat Jan  7 16:47:36 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A40          Off  | 00000000:01:00.0 Off |                    0 |
|  0%   43C    P0    78W / 300W |   2945MiB / 46068MiB |     42%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A40          Off  | 00000000:43:00.0 Off |                    0 |
|  0%   43C    P8    35W / 300W |     26MiB / 46068MiB |      0%      Default |
|       

In [20]:
trainer.train()

***** Running training *****
  Num examples = 40000
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1250
  Number of trainable parameters = 558848007
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
125,1.1422,0.593166,0.469493,0.517918,0.492518,0.812434
250,0.5423,0.384173,0.667477,0.698618,0.682692,0.881813
375,0.4234,0.327806,0.745336,0.768184,0.756587,0.906123
500,0.3613,0.302047,0.758416,0.771904,0.7651,0.911014
625,0.3239,0.284788,0.781934,0.781281,0.781607,0.916402
750,0.3273,0.282079,0.779709,0.80142,0.790415,0.916028
875,0.2909,0.271596,0.793539,0.80104,0.797272,0.921077
1000,0.2875,0.24813,0.808497,0.814498,0.811486,0.926583
1125,0.2697,0.246678,0.813195,0.808101,0.81064,0.925816
1250,0.2641,0.23919,0.818208,0.817307,0.817757,0.928426


***** Running Evaluation *****
  Num examples = 20000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-125
Configuration saved in ./results/checkpoint-125/config.json
Model weights saved in ./results/checkpoint-125/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-125/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-125/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 20000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-250
Configuration saved in ./results/checkpoint-250/config.json
Model weights saved in ./results/checkpoint-250/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-250/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-250/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 20000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-375
Configuration saved in ./results/checkpoint-375/config.json
Mode

TrainOutput(global_step=1250, training_loss=0.42325708923339844, metrics={'train_runtime': 1379.2866, 'train_samples_per_second': 29.0, 'train_steps_per_second': 0.906, 'total_flos': 3160037154846528.0, 'train_loss': 0.42325708923339844, 'epoch': 1.0})

In [21]:
eval_results = trainer.evaluate()
print(f"Eval Loss: {eval_results['eval_loss']}")

***** Running Evaluation *****
  Num examples = 20000
  Batch size = 32


Eval Loss: 0.23919013142585754


**Saving the fine tuned model & tokenizer:**

In [22]:
trainer.save_model(f'./results/checkpoint-final/')

Saving model checkpoint to ./results/checkpoint-final/
Configuration saved in ./results/checkpoint-final/config.json
Model weights saved in ./results/checkpoint-final/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-final/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-final/special_tokens_map.json


**Save Training History:**

In [23]:
data_path = "./results/checkpoint-final/training_args.pkl"
with open(data_path, 'wb') as pickle_file:
    pickle.dump(obj = trainer.args, file=pickle_file)

In [24]:
data_path = "./results/checkpoint-final/training_history.pkl"
with open(data_path, 'wb') as pickle_file:
    pickle.dump(obj = trainer.state, file=pickle_file)

**Calculate Accuracy:**

In [25]:
predictions, labels, _ = trainer.predict(dataset["test"])
predictions = np.argmax(predictions, axis=-1)

***** Running Prediction *****
  Num examples = 20000
  Batch size = 32


In [26]:
true_labels = [
    [label_names[l] for l in label  if l != -100] 
    for label in labels
]

true_predictions = [
    [label_names[p] for (p, l) in zip(prediction, label)  if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
pprint(results)

{'LOC': {'f1': 0.8512989302927002,
         'number': 21063,
         'precision': 0.843191132637854,
         'recall': 0.8595641646489104},
 'ORG': {'f1': 0.732703281027104,
         'number': 16972,
         'precision': 0.7392060446150156,
         'recall': 0.7263139288239453},
 'PER': {'f1': 0.8670305901740653,
         'number': 14649,
         'precision': 0.872150849564857,
         'recall': 0.8619701003481466},
 'overall_accuracy': 0.9284256743382577,
 'overall_f1': 0.8177570980913494,
 'overall_precision': 0.8182077300193821,
 'overall_recall': 0.8173069622655835}
