# Package Imports

In [2]:
import numpy as np


from datasets import load_dataset, load_metric
import evaluate
from transformers import (
     AutoTokenizer,
     DataCollatorWithPadding,
     TrainingArguments,
     AutoModelForSequenceClassification,
     Trainer,
     logging,
     AdamW,
     get_scheduler,

)
import torch
from ray import tune, train

# turn off warnings
#logging.set_verbosity_error()

# Meta Variables
- base model
- loss function
- evaluation metrics
- best model metric
- number of trials


In [3]:
"""
Directory Paths:
"""
_path_initial_training = "./initial_training"
_path_secondary_training = "./secondary_training"

# Base BERT model to be used during fine-tuning.
# This has to be picked from the pre-trained models on HuggingFace
# in order to be compatible with the Trainer API.
_base_model = "bert-base-uncased"

# Loss function used during training. Four options are implemented:
#   f1:  soft-F1 macro score
#   mcc: soft-MCC
#   wce: weighted cross entropy
#   ce:  standard cross entropy
_loss_fct = "mcc"

# Metrics listed during evaluation.
# Note: adjust with desired metrics.
_eval_metrics = ["accuracy", "precision", "recall", "f1", "matthews_correlation"]

# Metric that serves as the objective of the hyperparameter search.
# Options:
#   - eval_matthews_correlation
#   - eval_f1
#   - eval_loss
#   - any other metric returned by the compute_metrics function
# Note: eval_loss has to be minimized rather than maximized, so make sure the
# `direction` handed to the hyperparameter search matches the metric chosen here.
_metric_best_model = "eval_matthews_correlation"

# Number of trials to run during hyperparameter search.
_no_trials = 4

# Freezing of layers, options:
#   "unfrozen": all layers unfrozen
#   "frozen":   all transformer layers frozen
_frozen = "unfrozen"

# Fail fast on typos in the two string switches above; otherwise a bad value only
# shows up much later as a KeyError when the loss / model-init function is looked up.
_valid_loss_fcts = ("f1", "mcc", "wce", "ce")
_valid_frozen_modes = ("unfrozen", "frozen")
if _loss_fct not in _valid_loss_fcts:
    raise ValueError(f"_loss_fct must be one of {_valid_loss_fcts}, got {_loss_fct!r}")
if _frozen not in _valid_frozen_modes:
    raise ValueError(f"_frozen must be one of {_valid_frozen_modes}, got {_frozen!r}")

# Setup

This part has to be adjusted to whatever dataset and format is used.

Note: DataCollatorWithPadding allows for dynamic padding for individual batches. Only use with GPUs. For TPUs, use max_length padding attribute with Tokenizer instance.

## Load Data

Note: We use GLUE's Microsoft Research Paraphrase Corpus (MRPC) to test the functionality of this template.

https://huggingface.co/datasets/viewer/?dataset=glue&config=mrpc

Binary Classification Task:
MRPC is a corpus of human annotated sentence pairs used to train a model to determine whether sentence pairs are semantically equivalent.

In [4]:
# Load the "mrpc" configuration of the "glue" dataset.
raw_datasets = load_dataset("glue", "mrpc")

## Load Tokenizer

In [5]:
# Load the tokenizer that belongs to the base model chosen in the config cell.
tokenizer = AutoTokenizer.from_pretrained(_base_model)

## Function that applies the Tokenizer to a batch of examples so that it can be mapped over the dataset.

Note: Adjust this to desired task.

In [6]:
def tokenize_function(example):
    """
    Tokenize the sentence pair(s) held in `example`.

    Reads the "sentence1" and "sentence2" entries and passes them to the
    module-level `tokenizer` as first and second sequence, with truncation
    enabled. Returns whatever the tokenizer returns.
    """
    first_sentence = example["sentence1"]
    second_sentence = example["sentence2"]
    return tokenizer(first_sentence, second_sentence, truncation=True)

## Map Dataset with Tokenizer

In [7]:
# Apply tokenize_function to the raw datasets; batched=True hands the function
# batches of examples instead of one example at a time.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

## Instantiate DataCollator

In [8]:
# Collator that pads each batch dynamically (see the note in the Setup section).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training Arguments

Adjust to desired behaviour. Most arguments can be learned during hyperparameter-search.

In [10]:
"""
Create instance of class TrainingArguments. Adjust to desired behaviour.
"""
# Arguments for the initial training run / hyperparameter search.
training_args = TrainingArguments(
    output_dir = _path_initial_training,
    # This was set for testing. When using the template, consider adding some
    # sort of datetime suffix to the meta path variables above, so as not to
    # lose previous results.
    overwrite_output_dir = True,
    # Save, evaluate and log once per epoch.
    save_strategy = "epoch",
    evaluation_strategy = "epoch",
    logging_strategy = "epoch",
    # NOTE(review): metric_for_best_model is documented as working together with
    # load_best_model_at_end, which is not set here - confirm this is intended.
    metric_for_best_model = _metric_best_model,
    )

# Model Initialization

In [11]:
"""
Here we supply two model init functions, one that freezes all encoder layers and
one that does not.

Pass desired init function to Trainer below.

Gradual unfreezing helps to strike a balance between leveraging pre-trained
knowledge and adapting to task-specific data. By unfreezing layers gradually
during training, the model learns to prioritize retaining general linguistic
knowledge in the early layers while fine-tuning the higher layers to adapt to
task-specific nuances. This mitigates overfitting by allowing the model to
gradually specialize on the new task without abruptly forgetting the
linguistic representations learned during pre-training, resulting in more
effective adaptation and improved generalization to the target task.

Note: When utilizing gradual unfreezing you will have to train the model in
multiple steps. Gradually unfreezing ever more layers during training.
You will observe slower convergence, as such this will take more time.

Note: Depending on the choice of a base model and the desired number of layers
to freeze the model_init_frozen function might have to be adjusted.
To see which layers are available run:

  for name, param in model.named_parameters():
    print(name, param)

Observe entire model architecture and note layers you wish to freeze. Adjust
*conditional statement accordingly.

# https://towardsdatascience.com/transfer-learning-from-pre-trained-models-f2393f124751
"""
def model_init_frozen():
  """
  Build a fresh two-label sequence classifier from `_base_model` and freeze
  every parameter whose name contains ".layer.".

  Returns the model with `requires_grad` set to False on the matching
  parameters; all other parameters are left as loaded.
  """
  classifier = AutoModelForSequenceClassification.from_pretrained(
      _base_model, num_labels=2, return_dict=True
  )
  # *conditional statement: currently all encoder layers are frozen
  encoder_weights = (
      weights
      for param_name, weights in classifier.named_parameters()
      if ".layer." in param_name
  )
  for weights in encoder_weights:
    weights.requires_grad = False
  return classifier

def model_init():
  """
  Build a fresh two-label sequence classifier from `_base_model` with all
  parameters left trainable.
  """
  fresh_model = AutoModelForSequenceClassification.from_pretrained(
      _base_model,
      num_labels=2,
      return_dict=True,
  )
  return fresh_model

# Lookup table from the `_frozen` config value to the matching model-init function.
model_inits = {"unfrozen": model_init, "frozen": model_init_frozen}

# Evaluation metrics

In [12]:
"""
Below we specify which performance measures we wish to observe during training
at the end of each step/epoch.
"""

# Bundle all metrics named in `_eval_metrics` into a single object so they can
# be computed with one call.
clf_metrics = evaluate.combine(_eval_metrics)

def compute_metrics(eval_preds):
  """
  Turn the (logits, labels) pair produced during evaluation into metric scores.

  The predicted class is the arg-max over the last axis of the logits; the
  predictions and the reference labels are then handed to `clf_metrics`.
  """
  raw_scores, gold_labels = eval_preds
  hard_predictions = np.argmax(raw_scores, axis=-1)
  scores = clf_metrics.compute(predictions=hard_predictions, references=gold_labels)
  return scores

# CustomTrainer

Note: When using one of the soft cost functions (F1 or MCC) we observe slower
convergence during training. This might require longer training times.

Note: This template in its current form only works with a binary classification task, but can easily be amended to work with multi-class classification tasks.

In [13]:
class CustomTrainer(Trainer):
  """
  Trainer subclass whose loss function is selected by name at construction.

  Supported `type_loss` values:
    "ce":  the parent Trainer's own compute_loss (standard cross entropy)
    "wce": class-weighted cross entropy
    "f1":  macro soft-F1 cost
    "mcc": soft Matthews-correlation cost

  The "f1" and "mcc" losses read softmax columns 0 and 1 and treat the labels
  as 0/1 indicators, so they are only meaningful for binary classification.
  """

  # Names accepted for `type_loss`; must match the keys of `self.loss_fcts`.
  LOSS_TYPES = ("wce", "f1", "mcc", "ce")

  def __init__(self, type_loss, *, class_weights=(1.0, 2.0), **kwargs):
    """
    Args:
      type_loss: one of LOSS_TYPES, selects the loss used by compute_loss.
      class_weights: per-class weights used by the "wce" loss, one entry per
        label. Defaults to (1.0, 2.0).
      **kwargs: forwarded unchanged to Trainer.__init__.

    Raises:
      ValueError: if `type_loss` is not one of LOSS_TYPES.
    """
    # Validate before the parent constructor runs, so a typo is reported
    # immediately instead of as a KeyError at the first training step.
    if type_loss not in self.LOSS_TYPES:
      raise ValueError(
          f"type_loss must be one of {self.LOSS_TYPES}, got {type_loss!r}"
      )
    # Instantiate parent class
    super().__init__(**kwargs)
    # Assign child-class attributes
    self.type_loss = type_loss
    self.class_weights = tuple(class_weights)
    self.loss_fcts = {
        "wce": self.weighted_cross_entropy,
        "f1": self.macro_double_soft_f1,
        "mcc": self.mcc_exp,
        "ce": super().compute_loss,
    }

  def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
    """
    Overrides the parent's compute_loss and dispatches to the loss function
    selected at construction.

    Extra keyword arguments are accepted and passed on because newer
    transformers releases call compute_loss with `num_items_in_batch`; the
    parent implementation consumes it, the custom losses ignore it.
    """
    return self.loss_fcts[self.type_loss](model, inputs, return_outputs, **kwargs)

  @staticmethod
  def _soft_confusion(model, inputs):
    """
    Run the forward pass and build a "soft" confusion matrix in which class
    probabilities take the place of hard predictions.

    Removes "labels" from `inputs` before calling the model.
    Returns (outputs, tp, fn, fp, tn), the last four being scalar tensors.
    """
    # Flatten so that labels of shape (batch, 1) do not broadcast against the
    # (batch,) probability columns.
    labels = inputs.pop("labels").reshape(-1)
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.get("logits"), dim=1)
    is_pos = labels.to(probs.dtype)
    is_neg = 1 - is_pos
    tp = (probs[:, 1] * is_pos).sum(dim=0)
    fn = (probs[:, 0] * is_pos).sum(dim=0)
    fp = (probs[:, 1] * is_neg).sum(dim=0)
    tn = (probs[:, 0] * is_neg).sum(dim=0)
    return outputs, tp, fn, fp, tn

  def macro_double_soft_f1(self, model, inputs, return_outputs=False, **kwargs):
    """
    Compute the macro soft F1-score as a cost (average 1 - soft-F1 across
    both labels).
    Uses probability values instead of binary predictions.

    https://towardsdatascience.com/the-unknown-benefits-of-using-a-soft-f1-loss-in-classification-systems-753902c0105d
    https://arxiv.org/abs/2108.10566
    https://www.kaggle.com/code/rejpalcz/best-loss-function-for-f1-score-metric/notebook
    """
    outputs, tp, fn, fp, tn = self._soft_confusion(model, inputs)
    # 1e-16 guards against a zero denominator
    soft_f1_class1 = 2 * tp / (2 * tp + fn + fp + 1e-16)
    soft_f1_class0 = 2 * tn / (2 * tn + fn + fp + 1e-16)
    # reduce 1 - f1 to maximize f1; average over class 1 and class 0
    loss = 0.5 * ((1 - soft_f1_class1) + (1 - soft_f1_class0))
    return (loss, outputs) if return_outputs else loss

  def weighted_cross_entropy(self, model, inputs, return_outputs=False, **kwargs):
    """
    Standard cross entropy with a different weight per class
    (`self.class_weights`).
    Should one class be of more importance we can overweigh its impact on the
    loss, thereby indirectly penalizing the other class.
    """
    labels = inputs.pop("labels")
    # forward pass
    outputs = model(**inputs)
    logits = outputs.get("logits")
    # Place the weights on the logits' device: `model` may be a
    # DataParallel / DistributedDataParallel wrapper without a `.device`.
    weights = torch.tensor(self.class_weights, dtype=torch.float, device=logits.device)
    loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
    loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
    return (loss, outputs) if return_outputs else loss

  def mcc_exp(self, model, inputs, return_outputs=False, **kwargs):
    """
    Computes a soft MCC score, similarly to the soft-F1, by using probability
    measures instead of binary predictions, and returns 1 - MCC as the cost.

    https://www.kaggle.com/code/rejpalcz/best-loss-function-for-f1-score-metric/notebook
    https://github.com/vlainic/matthews-correlation-coefficient/tree/master
    https://arxiv.org/abs/2010.13454
    """
    outputs, tp, fn, fp, tn = self._soft_confusion(model, inputs)
    # 1e-16 inside the square root guards against a zero denominator
    denominator = torch.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) + 1e-16)
    mcc = (tn * tp - fn * fp) / denominator
    loss_mcc = 1 - mcc
    return (loss_mcc, outputs) if return_outputs else loss_mcc

# Initialize CustomTrainer

In [14]:
# Build the trainer from the config choices: `_loss_fct` selects the loss and
# `_frozen` selects which model-init function is used.
trainer = CustomTrainer(
    type_loss = _loss_fct,
    model_init = model_inits[_frozen],
    args = training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics = compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# (Optional) Create and assign an Optimizer and Scheduler

When using the HuggingFace Trainer API for hyperparameter search, we can no longer use the "optimizer" argument directly. Instead we customize the optimizer and scheduler

Note: This is rather optional, as we could skip the following step and use the defaults. Inclusion in case some custom behaviour is desired.

In [15]:

"""
When using the HuggingFace Trainer API for hyperparameter search, we can no longer use
the "optimizer" argument directly. Instead we customize the optimizer and scheduler.
"""
# Use the configured learning rate; torch's AdamW default (1e-3) is far larger
# than the rates normally used for fine-tuning.
# NOTE(review): the optimizer is bound to the parameters of the model that exists
# right now; if the trainer re-creates the model via model_init (e.g. per trial
# during hyperparameter search), confirm the optimizer still matches it.
optimizer = torch.optim.AdamW(
    trainer.model.parameters(),
    lr = training_args.learning_rate,
)

# The scheduler advances once per optimizer update, not once per sample, so the
# number of training steps is epochs * updates-per-epoch.
# (Single-process estimate; an explicit max_steps setting is not taken into account.)
_batches_per_epoch = -(-tokenized_datasets["train"].num_rows // training_args.train_batch_size)  # ceil division
_updates_per_epoch = max(_batches_per_epoch // training_args.gradient_accumulation_steps, 1)
_num_training_steps = int(np.ceil(training_args.num_train_epochs * _updates_per_epoch))

lr_scheduler = get_scheduler(
    "linear",
    optimizer = optimizer,
    num_warmup_steps = 0,
    num_training_steps = _num_training_steps,
)

# Pass instances to Trainer:
# uncomment the line below if you wish to pass the objects to the Trainer.
#trainer.optimizers = (optimizer, lr_scheduler)

'\nPass instances to Trainer\n'

# Hyperparameter Search via Optuna

Adjust hyperparameters and their ranges as desired


Note: warmup_ratio fulfills a somewhat similar role to freezing. It is also often used to stabilize training at the beginning and avoid large weight updates.

https://towardsdatascience.com/state-of-the-art-machine-learning-hyperparameter-optimization-with-optuna-a315d8564de1

https://huggingface.co/docs/transformers/hpo_train

https://github.com/bayesian-optimization/BayesianOptimization



In [16]:
# Define objective function that later selects best model based upon specific metric
def compute_objective(metrics):
  """
  Pick the value the hyperparameter search optimizes: the entry of `metrics`
  stored under the key named by `_metric_best_model`.
  """
  objective_key = _metric_best_model
  return metrics[objective_key]

# Define search space for hyperparameter tuning
def optuna_hp_space(trial):
  """
  Describe the hyperparameter search space by sampling from `trial`.

  Returns a dict with one sampled value each for learning_rate (log scale),
  per_device_train_batch_size, num_train_epochs, weight_decay and warmup_ratio.
  """
  space = {}
  space["learning_rate"] = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
  space["per_device_train_batch_size"] = trial.suggest_categorical(
      "per_device_train_batch_size", [16, 32, 64]
  )
  space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 3, 8)
  space["weight_decay"] = trial.suggest_float("weight_decay", 1e-5, 1e-1)
  space["warmup_ratio"] = trial.suggest_float("warmup_ratio", 0, 1e-1)
  return space

# Run Hyperparameter Search

In [None]:
# Run hyperparameter search.
# A loss has to be minimized while the score-type metrics (MCC, F1, ...) have to
# be maximized, so the search direction is derived from the chosen objective
# instead of being hard-coded.
_search_direction = "minimize" if _metric_best_model.endswith("loss") else "maximize"

best_run = trainer.hyperparameter_search(
    direction = _search_direction,
    backend = "optuna",
    hp_space = optuna_hp_space,
    n_trials = _no_trials,
    compute_objective = compute_objective,
    )

[I 2023-12-22 18:04:56,681] A new study created in memory with name: no-name-fad3d733-5584-4ee0-adef-dac87629ea35
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Matthews Correlation
1,0.9537,0.861584,0.659314,0.843137,0.616487,0.712215,0.342623
2,0.7804,0.732474,0.737745,0.835938,0.767025,0.8,0.424552
3,0.6384,0.642787,0.759804,0.849421,0.78853,0.817844,0.469541
4,0.5066,0.594873,0.803922,0.81388,0.924731,0.865772,0.522041
5,0.442,0.544282,0.811275,0.836667,0.899642,0.867012,0.547845
6,0.4027,0.539498,0.811275,0.834437,0.903226,0.86747,0.546736


[I 2023-12-22 18:11:46,742] Trial 0 finished with value: 0.5467361742360426 and parameters: {'learning_rate': 4.042625645902345e-06, 'per_device_train_batch_size': 16, 'num_train_epochs': 6, 'weight_decay': 0.046663145203696514, 'warmup_ratio': 0.05677302733685228}. Best is trial 0 with value: 0.5467361742360426.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Matthews Correlation
1,0.9861,0.971853,0.627451,0.782222,0.630824,0.698413,0.234649
2,0.9351,0.922842,0.671569,0.825112,0.659498,0.733068,0.333608


In [None]:
# Display the result of the hyperparameter search, i.e. the run whose
# hyperparameters gave the best value of the objective function.
best_run

We can now reinitialize the above classes with the training arguments contained in `best_run` and train the model on both the training and validation datasets to measure its performance on the test dataset.

(Optional) If during initial training you froze some layers you can now continue training with a partially/fully unfrozen model.

In [None]:
# Training arguments for the secondary run, built from the hyperparameters of
# the best trial found by the search (`best_run`, as returned by
# trainer.hyperparameter_search).
best_training_args = TrainingArguments(
    output_dir = _path_secondary_training,
    **best_run.hyperparameters,
    )

In [None]:
# adjust as described above
def model_init_secondary_run():
  """
  Build a fresh two-label sequence classifier from `_base_model` for the
  secondary training run, with all parameters left trainable.
  """
  secondary_model = AutoModelForSequenceClassification.from_pretrained(
      _base_model,
      num_labels=2,
      return_dict=True,
  )
  return secondary_model
