# Finetuning RoBERTa for NER: Train Model
 

***

## Imports

In [129]:
from transformers import (BertTokenizerFast,
                          RobertaTokenizerFast,
                          AutoTokenizer,
                          BertForTokenClassification,
                          RobertaForTokenClassification,
                          DataCollatorForTokenClassification, 
                          AutoModelForTokenClassification, 
                          TrainingArguments, Trainer)
from datasets import load_dataset, load_metric, concatenate_datasets, DatasetDict
from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
import dill as pickle
import torch
import os

## Load Dataset

In [99]:
data_path = "./data/dataset_processed.pkl"
with open(data_path, 'rb') as pickle_file:
    dataset = pickle.load(file=pickle_file)

In [100]:
dataset["train"][0]

{'tokens': ["''", 'Super', 'World', 'of', 'Sports', "''", "'"],
 'ner_tags': [0, 3, 4, 4, 4, 0, 0],
 'langs': ['en', 'en', 'en', 'en', 'en', 'en', 'en'],
 'spans': ['ORG: Super World of Sports'],
 'input_ids': [5106, 4265, 6661, 111, 39170, 5106, 242],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1],
 'labels': [0, 3, 4, 4, 4, 0, 0]}

## Load Model and Tokenizer

Information about model variants can be found here: https://huggingface.co/docs/transformers/model_doc/roberta

Load Model which can be finetuned:

In [101]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [102]:
label_list = dataset["train"].features[f"ner_tags"].feature.names

In [103]:
model_name = "xlm-roberta-large" #"bert-base-multilingual-cased" #xlm-roberta-large
tokenizer = AutoTokenizer.from_pretrained(f"{model_name}", add_prefix_space=True) #AutoTokenizer(use_fast = True)
model = AutoModelForTokenClassification.from_pretrained(f"{model_name}", num_labels=len(label_list))

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/pop529700/.cache/huggingface/hub/models--xlm-roberta-large/snapshots/b2a6150f8be56457baf80c74342cc424080260f0/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-large",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

loading file s

## Define Data Collator

In [104]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

## Define Trainer

About the Model:

see https://github.com/huggingface/transformers/blob/v4.21.1/src/transformers/modeling_utils.py#L829

In [105]:
print("Parameters:", model.num_parameters())
print("Expected Input Dict:", model.main_input_name )

# Estimate FLOPS needed for one training example
sample = dataset["train"][0]
sample["input_ids"] = torch.Tensor(sample["input_ids"])
flops_est = model.floating_point_ops(input_dict = sample, exclude_embeddings = False)

print("FLOPS needed per Training Sample:", flops_est )

Parameters: 558848007
Expected Input Dict: input_ids
FLOPS needed per Training Sample: 23471616294


In [106]:
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
})

**Define Optimizer:**

See https://huggingface.co/docs/transformers/main_classes/optimizer_schedules#transformers.Adafactor

In [107]:
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

num_epochs = 5
batch_size = 16

# A training step is one gradient update. In one step batch_size examples are processed.
# An epoch consists of one full cycle through the training data. 
# This is usually many steps. As an example, if you have 2,000 images and use
# a batch size of 10 an epoch consists of:
num_steps = (len(dataset["train"]) / batch_size) * num_epochs

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.01, no_deprecation_warning= True)

scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0, 
    num_training_steps= num_steps 
)

print("Steps:", num_steps)

Steps: 312.5


**Define Metrics:**

See https://huggingface.co/course/chapter7/2#metrics

In [108]:
metric = load_metric("seqeval")

In [109]:
example = dataset["train"][150]
labels = [label_list[i] for i in example[f"labels"]]
metric.compute(predictions=[labels], references=[labels])

{'LOC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 3},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

Set correct class labels:

In [110]:
label_names = dataset["train"].features[f"ner_tags"].feature.names

id2label = {id : label for id, label in enumerate(label_names)}
label2id = {label: id for id, label in enumerate(label_names)}

model.config.id2label = id2label
model.config.label2id = label2id

Define callback function to evaluate the model:

In [111]:
label_names = model.config.id2label

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    true_labels = [[label_names[l] for l in label  if l != -100] for label in labels]
    #true_predictions = [model.config.id2label[t.item()] for t in predictions]
    
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label)  if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

**Remove unnecessary columns:**

In [112]:
dataset = dataset.remove_columns(["tokens", "ner_tags", "langs", "spans"])

**Set further Training Arguments:**

See https://huggingface.co/docs/transformers/v4.21.2/en/main_classes/trainer#transformers.TrainingArguments

In [113]:
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy= "no",# "epoch",
    #save_steps = 100000,
    remove_unused_columns = True,
    evaluation_strategy="steps",
    eval_steps = 20,
    #load_best_model_at_end=True,
    logging_strategy = "steps",
    logging_steps = 20,
    #learning_rate= 2e-5,
    #auto_find_batch_size = True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    #gradient_accumulation_steps=4,
    #optim="adamw_torch",
    num_train_epochs=num_epochs,
    #weight_decay=0.01,
    report_to="none",
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optimizer, scheduler),
    compute_metrics=compute_metrics
)

PyTorch: setting up devices
Using cuda_amp half precision backend


## Train Model

GPU used by Kaggle: https://www.nvidia.com/de-de/data-center/tesla-p100/

In [114]:
!nvidia-smi

Sat Jan  7 12:07:22 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A40          Off  | 00000000:01:00.0 Off |                    0 |
|  0%   47C    P0    80W / 300W |  12091MiB / 46068MiB |     45%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A40          Off  | 00000000:43:00.0 Off |                    0 |
|  0%   49C    P0    82W / 300W |   1223MiB / 46068MiB |      0%      Default |
|       

In [115]:
trainer.train()

***** Running training *****
  Num examples = 1000
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 160
  Number of trainable parameters = 558848007
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,1.7921,1.52925,0.0,0.0,0.0,0.455559
40,1.5263,1.306749,0.068003,0.032231,0.043734,0.508527
60,1.2856,1.086003,0.162504,0.190909,0.175565,0.634634
80,1.0713,0.873305,0.290445,0.347934,0.316601,0.709249
100,0.8797,0.782177,0.39812,0.437603,0.416929,0.759101
120,0.7325,0.66368,0.46148,0.504959,0.482242,0.790751
140,0.6137,0.647533,0.486697,0.566942,0.523764,0.798786
160,0.5734,0.580293,0.526376,0.57314,0.548764,0.820679


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=160, training_loss=1.059322726726532, metrics={'train_runtime': 98.4306, 'train_samples_per_second': 50.797, 'train_steps_per_second': 1.626, 'total_flos': 387727231040688.0, 'train_loss': 1.059322726726532, 'epoch': 5.0})

In [122]:
eval_results = trainer.evaluate()
print(f"Eval Loss: {eval_results['eval_loss']}")

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32


Eval Loss: 0.5802927017211914


**Saving the fine tuned model & tokenizer:**

In [123]:
trainer.save_model(f'./results/checkpoint-final/')

Saving model checkpoint to ./results/checkpoint-final/
Configuration saved in ./results/checkpoint-final/config.json
Model weights saved in ./results/checkpoint-final/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-final/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-final/special_tokens_map.json


**Save Training History:**

In [131]:
data_path = "./results/checkpoint-final/trainer.pkl"
with open(data_path, 'wb') as pickle_file:
    pickle.dump(obj = trainer, file=pickle_file)

TypeError: cannot pickle 'torch._C.Generator' object

In [124]:
data_path = "./results/checkpoint-final/training_args.pkl"
with open(data_path, 'wb') as pickle_file:
    pickle.dump(obj = trainer.args, file=pickle_file)

In [125]:
data_path = "./results/checkpoint-final/training_history.pkl"
with open(data_path, 'wb') as pickle_file:
    pickle.dump(obj = trainer.state, file=pickle_file)

**Calculate Accuracy:**

In [126]:
predictions, labels, _ = trainer.predict(dataset["test"])
predictions = np.argmax(predictions, axis=-1)

***** Running Prediction *****
  Num examples = 1000
  Batch size = 32


In [127]:
true_labels = [
    [label_names[l] for l in label  if l != -100] 
    for label in labels
]

true_predictions = [
    [label_names[p] for (p, l) in zip(prediction, label)  if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
pprint(results)

{'LOC': {'f1': 0.5492468134414833,
         'number': 880,
         'precision': 0.5602836879432624,
         'recall': 0.5386363636363637},
 'ORG': {'f1': 0.4060339409176618,
         'number': 758,
         'precision': 0.3877551020408163,
         'recall': 0.4261213720316623},
 'PER': {'f1': 0.6789413118527042,
         'number': 782,
         'precision': 0.6171548117154811,
         'recall': 0.7544757033248082},
 'overall_accuracy': 0.8206789111183995,
 'overall_f1': 0.5487636003956479,
 'overall_precision': 0.5263757115749526,
 'overall_recall': 0.5731404958677686}
