# Package Imports

In [2]:
import numpy as np
import pandas as pd
import os
import re


from datasets import (
     load_from_disk, 
     load_metric, 
     DatasetDict, 
     load_dataset
)
import evaluate
from transformers import (
     AutoTokenizer,
     DataCollatorWithPadding,
     TrainingArguments,
     AutoModelForSequenceClassification,
     Trainer,
     logging,
     AdamW,
     get_scheduler,

)
import torch
from ray import tune, train
import pickle
import optuna
from datetime import datetime
import utility.utility as util
import utility.CustomTrainer as ct
import utility.ModelConfig as mc
import utility.CustomCallback as cb

# turn off warnings
#logging.set_verbosity_error()

# automatically reload imported modules whenever their source files change
%load_ext autoreload
%autoreload 2

# CWD Path

In [None]:
"""
path to local
"""
# Current working directory; the project-relative paths used in this notebook are joined onto it.
path_cwd = os.getcwd()

# Disable TQDM, for output

In [None]:
"""
disable tqdm
"""
# Passed to TrainingArguments(disable_tqdm=...); False leaves the progress bars switched on.
_disable_tqdm = False

# Checkpoint Save Strategy

In [None]:
"""
save checkpoints during training, requires a lot of disk space
"""
# Passed to TrainingArguments(save_strategy=...); "no" means no checkpoints are saved during training.
_save_strategy = "no"

# First Run or Restart of HPS

In [4]:
# flag whether first run or continued study
# True: every setting is taken from the defaults in this notebook.
# False: settings are restored from the pickled ModelConfig named below.
_flag_first_run = True

# modelconfig file name
# Only needed for a continued study; file name inside the "modelconfigs" folder.
_name_config_file = ""

# path to model_config
# Relative to the working directory; this variable is reassigned further down
# once the run-specific file name has been built.
path_file_modelconfig = os.path.join("modelconfigs", _name_config_file)

# Load ModelConfig, if not first run

In [5]:
# ModelConfig of a previous run; stays None on a first run.
model_config = None
if not _flag_first_run:
    # NOTE(security): pickle.load can execute arbitrary code contained in the
    # file, so only load ModelConfig files that were written by this project.
    with open(os.path.join(path_cwd, path_file_modelconfig), "rb") as f:
        model_config = pickle.load(f)


# Configuration

In [None]:
"""
description of downstream task you want to train you model on
"""
# First run: the literal defaults below are used.
# Continued study: every setting is restored from the loaded ModelConfig
# (model_config is only accessed when _flag_first_run is False).
_task = (
    "Binary Classification _ with study object and hps log history"
    if _flag_first_run
    else model_config.task
)

"""
Base BERT model to be used during finetuning.
This has to be picked from the pre-trained models on HuggingFace
in order to be compatible with the Trainer API
"""
_base_model = "roberta-base" if _flag_first_run else model_config.base_model

"""
Three custom loss functions have been implemented:
  f1: soft-f1 macro score
  mcc: soft-mcc
  wce: weighted cross entropy
  ce: standard cross entropy
"""
# NOTE: four keys are listed above; "ce" is the standard (non-custom) loss.
_loss_fct = "ce" if _flag_first_run else model_config.loss_fct

"""
weighting scheme, only relevant when weighted-cross-entropy or other weighted losses are used
"""
_weight_scheme = "rev_prop" if _flag_first_run else model_config.weight_scheme

"""
Metrics listed during evaluation:

Note: adjust with desired metrics.
"""
_eval_metrics = (
    ["accuracy", "precision", "recall", "f1", "matthews_correlation"]
    if _flag_first_run
    else model_config.eval_metrics
)

"""
Specify which metric should be maximized/minimized during hyperparameter-search
Options:
- eval_matthews_correlation
- eval_f1
- eval_loss
- any other metric passed to the compute_metrics function

also specify direction: "maximize"/"minimize"
"""
_metric_best_model = (
    "eval_matthews_correlation"
    if _flag_first_run
    else model_config.metric_best_model
)

_metric_direction = "maximize" if _flag_first_run else model_config.metric_direction

"""
Number of trials to run during this run of hyperparameter search.
"""
# Deliberately not restored from the ModelConfig: this counts the trials of
# the current run only.
_no_trials = 2

"""
Employ freezing of layers, options:
"unfrozen": all layers unfrozen
"frozen": all transformer layers frozen
"""
_frozen = "unfrozen" if _flag_first_run else model_config.frozen

"""
location of dataset
"hub": HuggingFace Hub
"local": Local directory
"""
# Boolean switch: True selects the Hub dataset name, False the local one.
_from_hub = True if _flag_first_run else model_config.from_hub

"""
name of dataset on Hf-Hub
"""
_dataset_name_hub = (
    "HalaJada/FinStmts_ConsUncons_Sliding_English_SeqClass"
    if _flag_first_run
    else model_config.dataset_name_hub
)

"""
name of local dataset
"""
_dataset_name_local = "" if _flag_first_run else model_config.dataset_name_local

"""
flag majority voting, multi-segment approach
"""
_flag_mv = False if _flag_first_run else model_config.flag_mv

"""
hps study name
"""
_study_name = "test" if _flag_first_run else model_config.study_name

# Set Global/Meta Variables

In [8]:
"""
timestamp
"""
# A continued study reuses the timestamp stored with its ModelConfig, so the
# names derived from it below match those of the first run.
timestamp = (
    datetime.now().strftime("%d_%m_%y_%H_%M")
    if _flag_first_run
    else model_config.timestamp_initial
)

"""
some model contain '/' characters which create issues with file and directory pathing, we replace them with '__' only for naming purposes
"""
# for saving name in model config we need to make sure that there is no '/' in _base_model
# NOTE: the replacement actually used is '___' (three underscores).
base_model_altered = _base_model.replace("/", "___")

"""
name of dataset to name model_config
"""
# Hub names have '/' replaced by a single '_'; local names are used unchanged.
dataset_name = _dataset_name_hub.replace("/", "_") if _from_hub else _dataset_name_local

"""
Directory Paths:
"""
path_initial_training = os.path.join(
    "training_data",
    base_model_altered,
    "_".join(("initial_training", timestamp)),
)

"""
Select weighting method when using weighted cost functions.
"""
# Lookup table: weighting-scheme key -> function that computes the class weights.
class_weighting_schemes = {"rev_prop": util.get_reverse_prop_class_weights}

"""
path to folder with local datasets
"""
path_dataset_local = os.path.join("datasets", _dataset_name_local)

"""
name of file with ModelConfig object
path to folder with modelconfig
"""
file_modelconfig = "_".join(("ModelConfig", base_model_altered, dataset_name, timestamp)) + ".pkl"
path_file_modelconfig = os.path.join("modelconfigs", file_modelconfig)

"""
path to sqlite database with the optuna study parameters
"""
path_study_db = os.path.join(
    "study_dbs",
    "_".join((_study_name, base_model_altered, dataset_name, timestamp)) + ".db",
)

# Setup

This part has to be adjusted to whatever dataset and format used.

Note: DataCollatorWithPadding allows for dynamic padding for individual batches. Only use with GPUs. For TPUs, use max_length padding attribute with Tokenizer instance.

## Load Data

Either load from a local directory or from the HuggingFace Hub

In [9]:
# Load the raw dataset; util.load_data receives the hub flag, the Hub dataset name and
# the absolute path of the local dataset folder (presumably it picks the source via _from_hub).
raw_datasets = util.load_data(_from_hub, _dataset_name_hub, os.path.join(path_cwd, path_dataset_local))

# Determine number of labels/classes

In [5]:
# Number of labels/classes as reported by util.get_no_labels for the raw dataset;
# used as num_labels when the classification model is instantiated.
num_labels = util.get_no_labels(raw_datasets)

# Determine Class Weights

In [4]:
# Look up the weighting function for the configured scheme and apply it to the raw dataset.
class_weights = class_weighting_schemes[_weight_scheme](raw_datasets)
if not _flag_first_run:
    # A continued study reuses the class weights stored in the ModelConfig.
    class_weights = model_config.class_weights

## Load Tokenizer

In [5]:
# Tokenizer that belongs to the configured base model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(_base_model)

## Function that applies the tokenizer to the examples so that it can be mapped over the dataset.

Note: Adjust this to desired task.

In [6]:
def tokenize_function(example):
    """Tokenize the "text" entry of *example* with the notebook-level tokenizer.

    Truncation is switched on; no padding is requested here.
    """
    texts = example["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

## Map Dataset with Tokenizer

In [7]:
# Apply tokenize_function to the dataset; batched=True hands it batches of examples instead of single ones.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

## Instantiate DataCollator

In [8]:
# Collator that pads each batch dynamically with the tokenizer (see the GPU/TPU note in the Setup section).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training Arguments

Adjust to desired behaviour. Most arguments can be learned during hyperparameter-search.

In [9]:
"""
Create instance of class TrainingArguments. Adjust to desired behaviour.
"""
training_args = TrainingArguments(
    # training outputs are written below the run-specific "initial_training_<timestamp>" folder
    output_dir = os.path.join(path_cwd, path_initial_training),
    # checkpoint saving as configured at the top of the notebook
    save_strategy = _save_strategy,
    # evaluate and log once per epoch
    evaluation_strategy = "epoch",
    logging_strategy = "epoch",
    # same metric key that compute_objective reads during hyperparameter search
    metric_for_best_model = _metric_best_model,
    disable_tqdm = _disable_tqdm,
    )

# Model Initialization

In [10]:
"""
Model Initilization

Here we supply two model init functions, one that freezes all encoder layers and
one that does not.

Pass desired init function to Trainer below.

Gradual unfreezing helps to strike a balance between leveraging pre-trained
knowledge and adapting to task-specific data. By unfreezing layers gradually
during training, the model learns to prioritize retaining general linguistic
knowledge in the early layers while fine-tuning the higher layers to adapt to
task-specific nuances. This mitigates overfitting by allowing the model to
gradually specialize on the new task without abruptly forgetting the
linguistic representations learned during pre-training, resulting in more
effective adaptation and improved generalization to the target task.

Note: When utilizing gradual unfreezing you will have to train the model in
multiple steps. Gradually unfreezing ever more layers during training.
You will observe slower convergence, as such this will take more time.

Note: Depending on the choice of a base model and the desired number of layers
to freeze the model_init_frozen function might have to be adjusted.
To see which layers are available run:

  for name, param in model.named_parameters():
    print(name, param)

Observe entire model architecture and note layers you wish to freeze. Adjust
*conditional statement accordingly.

# https://towardsdatascience.com/transfer-learning-from-pre-trained-models-f2393f124751
"""


def model_init_frozen(freeze_layers):
  """Load the base model and freeze its transformer encoder layers.

  The Trainer calls a one-argument ``model_init`` with the current trial
  object (or ``None``), so ``freeze_layers`` is only honoured when it is an
  explicit list/tuple of name fragments. Any other value selects the default
  fragments built below.

  Args:
    freeze_layers: optional list/tuple of substrings; every parameter whose
      name contains one of them gets ``requires_grad = False``.

  Returns:
    The sequence-classification model with the matching parameters frozen.
  """
  model = AutoModelForSequenceClassification.from_pretrained(_base_model, num_labels=num_labels, return_dict=True)
  if not isinstance(freeze_layers, (list, tuple)):
    # *conditional statement: currently all encoder layers are frozen.
    # Matching is by substring, so "layer.1" also matches "layer.10",
    # "layer.11", ...; together the fragments cover every layer index.
    # NOTE(review): assumes parameter names contain "layer.<index>" - confirm
    # for base models other than BERT/RoBERTa-style encoders.
    freeze_layers = ["layer." + str(i) for i in range(11)]
  for name, param in model.named_parameters():
    if any(fl in name for fl in freeze_layers):
      param.requires_grad = False
  return model

def model_init():
  """Load the base model for sequence classification; nothing is frozen here."""
  model = AutoModelForSequenceClassification.from_pretrained(
    _base_model,
    num_labels=num_labels,
    return_dict=True,
  )
  return model


In [11]:
# Maps the value of the _frozen setting to the matching model-initialisation function.
model_inits = {"unfrozen": model_init, "frozen": model_init_frozen}


# Evaluation Metrics

Below we specify which performance measures are reported at the end of each evaluation step/epoch during training,
and provide the metric function that computes them from the model predictions.


In [12]:
# Combine the configured evaluation metrics into one object; compute_metrics below calls its compute().
clf_metrics = evaluate.combine(_eval_metrics)

def compute_metrics(eval_preds):
    """Turn logits into class predictions and score them with ``clf_metrics``.

    Args:
        eval_preds: pair of (logits, labels).

    Returns:
        Whatever ``clf_metrics.compute`` returns for the predictions/labels.
    """
    raw_scores, gold_labels = eval_preds
    # The predicted class is the index of the largest logit on the last axis.
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return clf_metrics.compute(
        predictions=predicted_classes,
        references=gold_labels,
    )

# Initialize CustomTrainer

In [13]:
# Project-specific Trainer (utility.CustomTrainer) configured with the selected loss and class weights.
trainer = ct.CustomTrainer(
    # key of the loss function selected in the configuration cell
    type_loss = _loss_fct,
    # a model_init function is passed instead of a model instance (frozen or unfrozen variant)
    model_init = model_inits[_frozen],
    class_weights = class_weights,
    args = training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics = compute_metrics,
)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Create and Add Callback to save training logs after each hyperparameter trial

In [None]:
# Project-specific callback (utility.CustomCallback), constructed with the trainer and registered on it.
callback = cb.CustomCallback(trainer)
trainer.add_callback(callback)

# (Optional) Create and assign an Optimizer and Scheduler

When using the HuggingFace Trainer API for hyperparameter search, we can no longer use the "optimizer" argument directly. Instead, we create the optimizer and the scheduler ourselves and assign them to the Trainer.

Note: This is rather optional, as we could skip the following step and use the defaults. Inclusion in case some custom behaviour is desired.

In [14]:

"""
When using the HugginFace Trainer API for hyperparameter search, we can no longer use
the "optimizer" argument directly. Instead we customize the optimizer and scheduler
"""
import math

# NOTE(review): the optimizer is bound to the parameters of the model instance
# that exists right now; with model_init a new model is presumably created per
# trial, so confirm this is intended before assigning it to the Trainer below.
optimizer = torch.optim.AdamW(trainer.model.parameters())

# The scheduler advances once per optimizer update, not once per training
# example, so the horizon is epochs * updates-per-epoch:
#   batches per epoch = ceil(rows / effective batch size)
#   updates per epoch = batches per epoch // gradient accumulation steps
batches_per_epoch = math.ceil(
    tokenized_datasets["train"].num_rows / training_args.train_batch_size
)
updates_per_epoch = max(
    batches_per_epoch // training_args.gradient_accumulation_steps, 1
)
lr_scheduler = get_scheduler(
    "linear",
    optimizer = optimizer,
    num_warmup_steps = 0,
    # num_train_epochs may be a float, hence the final ceil to get an integer
    num_training_steps = math.ceil(training_args.num_train_epochs * updates_per_epoch),
)

# Uncomment line below if you wish to pass objects to Trainer
"""
Pass instances to Trainer
"""
#trainer.optimizers = (optimizer, lr_scheduler)

'\nPass instances to Trainer\n'

# Hyperparameter Search via Optuna

Adjust hyperparameters and their ranges as desired


Note: warmup_ratio fulfills a somewhat similar role to freezing. It is also often used to stabilize training at the beginning and avoid large weight updates.

https://towardsdatascience.com/state-of-the-art-machine-learning-hyperparameter-optimization-with-optuna-a315d8564de1

https://huggingface.co/docs/transformers/hpo_train

https://github.com/bayesian-optimization/BayesianOptimization



In [15]:
# Objective for the hyperparameter search: the value of the metric chosen for model selection
def compute_objective(metrics):
  """Return the entry of *metrics* stored under the key ``_metric_best_model``."""
  objective_value = metrics[_metric_best_model]
  return objective_value

# Define search space for hyperparameter tuning
def optuna_hp_space(trial):
  """Ask *trial* for one value per tuned training argument.

  Args:
    trial: object offering ``suggest_float``, ``suggest_categorical`` and
      ``suggest_int``.

  Returns:
    dict mapping the training-argument name to the suggested value.
  """
  # Suggestions are requested in the same order as the keys of the result.
  learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
  batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16])
  epochs = trial.suggest_int("num_train_epochs", 2, 3)
  weight_decay = trial.suggest_float("weight_decay", 1e-5, 1e-1)
  warmup_ratio = trial.suggest_float("warmup_ratio", 0, 1e-1)
  return {
        "learning_rate": learning_rate,
        "per_device_train_batch_size": batch_size,
        "num_train_epochs": epochs,
        "weight_decay": weight_decay,
        "warmup_ratio": warmup_ratio,
    }

# Run Hyperparameter Search

In [18]:
# Run hyperparameter search
# SQLite does not create missing parent directories, so make sure the folder
# for the study database exists before the storage URL is opened.
study_db_file = os.path.join(path_cwd, path_study_db)
os.makedirs(os.path.dirname(study_db_file), exist_ok=True)

best_run = trainer.hyperparameter_search(
    direction=_metric_direction,
    backend="optuna",
    hp_space = optuna_hp_space,
    n_trials = _no_trials,
    compute_objective = compute_objective,
    # study_name / storage / load_if_exists configure the persisted Optuna
    # study, so a continued run can pick up the existing study.
    study_name=_study_name,
    storage= "sqlite:///" + study_db_file,
    load_if_exists=True,
    )

[I 2024-01-26 00:54:33,832] A new study created in memory with name: no-name-0c47a83c-86ee-4ffe-a94e-1690a621e8e6
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Matthews Correlation
1,0.4859,0.08175,0.98893,0.98893,1.0,0.994434,0.0
2,0.4637,0.080865,0.98893,0.98893,1.0,0.994434,0.0
3,0.5117,0.078562,0.98893,0.98893,1.0,0.994434,0.0
4,0.45,0.057659,0.99139,0.992583,0.998756,0.99566,0.496593
5,0.26,0.058162,0.99016,0.995025,0.995025,0.995025,0.55058
6,0.0913,0.057192,0.99385,0.99382,1.0,0.9969,0.664603
7,0.0621,0.065918,0.99139,0.995031,0.996269,0.995649,0.584934
8,0.0381,0.062628,0.99385,0.995043,0.998756,0.996896,0.677627


[I 2024-01-26 01:51:08,718] Trial 0 finished with value: 0.6776274497306741 and parameters: {'learning_rate': 2.5807768691598085e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 8, 'weight_decay': 0.08270255272065924, 'warmup_ratio': 0.054434248236845895}. Best is trial 0 with value: 0.6776274497306741.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Matthews Correlation
1,0.4391,0.163711,0.98893,0.98893,1.0,0.994434,0.0
2,0.2472,0.11397,0.98893,0.98893,1.0,0.994434,0.0
3,0.249,0.100875,0.98893,0.98893,1.0,0.994434,0.0
4,0.2515,0.097597,0.98893,0.98893,1.0,0.994434,0.0


[I 2024-01-26 02:19:54,556] Trial 1 finished with value: 0.0 and parameters: {'learning_rate': 1.4505137694442436e-06, 'per_device_train_batch_size': 8, 'num_train_epochs': 4, 'weight_decay': 0.0861989407754196, 'warmup_ratio': 0.049334454425836775}. Best is trial 0 with value: 0.6776274497306741.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Matthews Correlation
1,0.3576,0.101555,0.98893,0.98893,1.0,0.994434,0.0
2,0.2414,0.083353,0.98893,0.98893,1.0,0.994434,0.0
3,0.2779,0.081529,0.98893,0.98893,1.0,0.994434,0.0
4,0.2938,0.081625,0.98893,0.98893,1.0,0.994434,0.0
5,0.2936,0.0818,0.98893,0.98893,1.0,0.994434,0.0


[I 2024-01-26 02:55:55,949] Trial 2 finished with value: 0.0 and parameters: {'learning_rate': 3.171986659227783e-06, 'per_device_train_batch_size': 8, 'num_train_epochs': 5, 'weight_decay': 0.03382298794320742, 'warmup_ratio': 0.03144655413905456}. Best is trial 0 with value: 0.6776274497306741.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Matthews Correlation
1,0.3815,0.11415,0.98893,0.98893,1.0,0.994434,0.0
2,0.2371,0.087455,0.98893,0.98893,1.0,0.994434,0.0
3,0.2654,0.082703,0.98893,0.98893,1.0,0.994434,0.0
4,0.2796,0.081736,0.98893,0.98893,1.0,0.994434,0.0
5,0.2796,0.081591,0.98893,0.98893,1.0,0.994434,0.0


[I 2024-01-26 03:31:56,860] Trial 3 finished with value: 0.0 and parameters: {'learning_rate': 2.531985663156513e-06, 'per_device_train_batch_size': 8, 'num_train_epochs': 5, 'weight_decay': 0.020163056253844593, 'warmup_ratio': 0.039555475521001394}. Best is trial 0 with value: 0.6776274497306741.


In [19]:
# Outputs best hyperparameters that lead to maximizing the objective function
# (a bare expression at the end of a cell is displayed as the cell output)
best_run

BestRun(run_id='0', objective=0.6776274497306741, hyperparameters={'learning_rate': 2.5807768691598085e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 8, 'weight_decay': 0.08270255272065924, 'warmup_ratio': 0.054434248236845895}, run_summary=None)

# Process hps log history

In [None]:
# The callback instance registered on the trainer is named `callback`
# (singular); it is expected to have collected the log history of all trials.
hps_log_df = util.process_hps_log_history(callback.all_log_history)

# Create ModelConfig File

In [78]:
if not _flag_first_run:
    # Continued study: extend the stored configuration with this run's results.
    model_config.no_trials += _no_trials
    model_config.best_run = best_run
    model_config.hps_log_df = util.merge_hps_log_histories(model_config.hps_log_df, hps_log_df)
else:
    # First run: record every setting and result in a new ModelConfig.
    model_config = mc.ModelConfig(
        # run identification
        timestamp=timestamp,
        base_model=_base_model,
        task=_task,
        loss_fct=_loss_fct,
        # dataset
        from_hub=_from_hub,
        dataset_name_hub=_dataset_name_hub,
        dataset_name_local=_dataset_name_local,
        path_dataset_local=path_dataset_local,
        # labels, weighting and metrics
        num_labels=num_labels,
        weight_scheme=_weight_scheme,
        class_weights=class_weights,
        eval_metrics=_eval_metrics,
        metric_best_model=_metric_best_model,
        metric_direction=_metric_direction,
        # hyperparameter search
        no_trials=_no_trials,
        frozen=_frozen,
        path_initial_training=path_initial_training,
        best_run=best_run,
        hps_log_df=hps_log_df,
        flag_mv=_flag_mv,
        study_name=_study_name,
        path_study_db=path_study_db,
    )

# Save ModelConfig

In [75]:
# open(..., 'wb') does not create missing directories, so make sure the
# target folder exists before the ModelConfig is written.
modelconfig_file = os.path.join(path_cwd, path_file_modelconfig)
os.makedirs(os.path.dirname(modelconfig_file), exist_ok=True)
with open(modelconfig_file, 'wb') as f:
    pickle.dump(model_config, f)