# **SemEval 2024 Task 2: Safe Biomedical Natural Language Inference**
### Topic:  Fine-Tuning Pre-trained Transformer Models for Detecting Annotation Artifacts for Natural Language Inference Tasks
\
Model_1: `BERT-base-cased` \
Model_2: `medicalai/ClinicalBERT` \
Model_3: `pritamdeka/BioBert-PubMed200kRCT` \
Model_4: `allenai/biomed_roberta_base` \
(Model_5: `yikuan8/Clinical-Longformer`)

# **Prerequisites**

## **Installing the required packages**
This is especially important when:
- the required packages/libraries are not previously installed on a local system or
- Google Colab is used

In [None]:
!pip install transformers
!pip install datasets
!pip install accelerate -U
!pip install pyarrow
!pip install -q wandb
!pip install -U ray
!pip install optuna

## **Import libraries**
Import required libraries and check if GPU is available for training.


In [None]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import datasets
import json

import transformers
from transformers import BertForSequenceClassification, AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, RobertaForSequenceClassification, AutoConfig
import torch
import pyarrow as pa

import wandb
import optuna

# tell the user whether PyTorch can see a CUDA device
print(
    "GPU available. Training on GPU."
    if torch.cuda.is_available()
    else "No GPU available. Training on CPU."
)

# **1. Preprocessing**

## **1.1 Mapping Trial IDs to CTR instances**


In [None]:
# lookup tables between class names and integer class IDs (and back)
label2id = {"Contradiction": 0, "Entailment": 1}
id2label = {class_id: class_name for class_name, class_id in label2id.items()}

# Mapping Trial IDs to CTR instances
def id_to_CTR(raw_texts):
  """Index clinical trial records (CTRs) by their trial ID.

  Args:
    raw_texts: iterable of CTR instances; every instance must support
      ``instance['clinical_trial_id']``.

  Returns:
    dict mapping each ``clinical_trial_id`` to its complete instance. When an
    ID occurs more than once, the last instance seen is kept.
  """
  # a comprehension avoids the former local named `id`, which shadowed the builtin
  return {instance['clinical_trial_id']: instance for instance in raw_texts}

## **1.2 Load NLI4CT dataset from Huggingface**
see https://huggingface.co/datasets/bigbio/sem_eval_2024_task_2 for dataset card

In [None]:
# import dataset from Huggingface
# "source" configuration: the annotated statements (hypotheses), all splits
annotations = datasets.load_dataset(
    "bigbio/sem_eval_2024_task_2",
    name="sem_eval_2024_task_2_source",
)
# "ct" configuration: the CTR instances (premises); only its 'train' split is used
raw_texts = datasets.load_dataset(
    "bigbio/sem_eval_2024_task_2",
    name="sem_eval_2024_task_2_ct",
)['train']

## **1.3 Data Transformation**
Transform the loaded dataset into a Huggingface `Dataset` that can be processed by an object of the `Tokenizer` class.

In [None]:
# transform data to required format to be processed from Tokenizer

# every value accepted for the `setting` argument of transform_data;
# all settings except "statement_only" are deprecated
_KNOWN_SETTINGS = (
  "statement_only",
  "primary_sec_statement",
  "secondary_sec_statement",
  "primary_secondary_sec_statement",
)

# splits for which a placeholder label is used instead of instance['label']
_UNLABELED_SPLITS = ('practice_test', 'test')

# marker returned by _build_text for instances that must be left out
_SKIP_INSTANCE = object()


def _section_of(instance, trial_id, id_to_clinical_trial_record):
  """Return the section named by instance['section_id'] from the CTR of `trial_id`."""
  record = id_to_clinical_trial_record[trial_id]
  return record[instance['section_id'].lower().replace(" ", "_")]


def _build_text(instance, id_to_clinical_trial_record, setting):
  """Build the model input of one instance, or _SKIP_INSTANCE if it has no secondary trial."""
  statement = instance['statement']

  if setting == "statement_only":  # annotation artifacts
    return statement

  if setting == "primary_sec_statement":
    primary_section = _section_of(instance, instance['primary_id'], id_to_clinical_trial_record)
    return (".".join(primary_section), statement)

  if setting == "secondary_sec_statement":
    secondary_id = instance['secondary_id']
    if not secondary_id:
      return _SKIP_INSTANCE
    secondary_section = _section_of(instance, secondary_id, id_to_clinical_trial_record)
    return (".".join(secondary_section), statement)

  # remaining case: "primary_secondary_sec_statement"
  primary_section = _section_of(instance, instance['primary_id'], id_to_clinical_trial_record)
  secondary_id = instance['secondary_id']
  if not secondary_id:
    return _SKIP_INSTANCE
  secondary_section = _section_of(instance, secondary_id, id_to_clinical_trial_record)
  return (".".join(primary_section), ".".join(secondary_section), statement)


def transform_data(annotations, id_to_clinical_trial_record, label2ids, setting):
  """Convert the annotated statements into a DatasetDict with `label` and `text` columns.

  Args:
    annotations: mapping from split name to an iterable of instances. The keys
      'statement', 'label', 'section_id', 'primary_id' and 'secondary_id' are
      read from an instance, depending on `setting` and split.
    id_to_clinical_trial_record: mapping from trial ID to CTR instance, as
      returned by `id_to_CTR`.
    label2ids: mapping from label name to integer class ID.
    setting: one of "statement_only", "primary_sec_statement",
      "secondary_sec_statement", "primary_secondary_sec_statement".

  Returns:
    `datasets.dataset_dict.DatasetDict` with one dataset per split. `text` is
    the statement itself for "statement_only"; for the other settings it is a
    tuple of the "."-joined section(s) followed by the statement. Settings
    that need a secondary section leave out instances without `secondary_id`.

  Raises:
    ValueError: if `setting` is not one of the supported values.
    KeyError: if a label, trial ID or section cannot be looked up.
  """
  if setting not in _KNOWN_SETTINGS:
    raise ValueError(f"Unknown setting {setting!r}; expected one of {_KNOWN_SETTINGS}")

  data_raw = {"train": [], "validation": [], "practice_test": [], "test": []}

  for data_type in annotations.keys():
    # setdefault also accepts splits that are not pre-listed above
    rows = data_raw.setdefault(data_type, [])

    for instance in annotations[data_type]:
      # get the label of the training instance -> e.g. 0 for "contradiction"
      if data_type in _UNLABELED_SPLITS:
        label = 0  # int label is needed for making predictions but does not influence these
      else:
        label = label2ids[instance['label']]

      text = _build_text(instance, id_to_clinical_trial_record, setting)
      if text is _SKIP_INSTANCE:
        continue

      rows.append({"label": label, "text": text})

  for data_type, data in data_raw.items():
    # fixed column names (instead of peeking at data[0]) so that a split
    # without any instance becomes an empty dataset rather than an IndexError
    table = pa.Table.from_pydict({key: [item[key] for item in data] for key in ("label", "text")})
    data_raw[data_type] = datasets.arrow_dataset.Dataset(table)

  return datasets.dataset_dict.DatasetDict(data_raw)

# **2. Training**

## **2.1 Setting *path*-variable to store results**
- If run locally, enter local path
- If Google Colab is used, please mount your Google Drive and enter the required path.

In [None]:
# directory for saving models; "./" is kept when the user enters nothing
path = "./"

# `get_ipython` only exists inside IPython-based environments; a plain Python
# interpreter is treated as "not Colab" instead of failing with NameError
try:
  in_colab = 'google.colab' in str(get_ipython())
except NameError:
  in_colab = False

if in_colab:
  # imported only here: an unconditional import fails on machines without the
  # google.colab package, before the local-path prompt can be reached
  from google.colab import drive
  drive.mount('/content/drive')

user_input = str(input("Enter path: "))  # set path to directory for saving models
path = user_input if user_input else path
print(f"Path: {path}")

## **2.2 Weights & Biases**
Provide login credentials to track experimental results during training. Please enter key after running this cell.

In [None]:
# Log in to Weights & Biases for experiment tracking: copy the API key from
# your wandb account and enter it after running this cell.
wandb.login()

## **2.3 Implementation of Evaluation Metrics**
We report
- Accuracy
- Precision
- Recall
- Macro F1-Score

on the development set using the `sklearn.metrics` module. Precision, recall and F1 are macro-averaged over both classes (contradiction and entailment).

In [None]:
# accuracy plus macro-averaged precision / recall / F1
def compute_metrics(eval_pred):
    """Score a set of predictions against the reference labels.

    Args:
        eval_pred: object with a ``predictions`` array, whose argmax over the
            last axis is taken as the predicted class, and ``label_ids``
            holding the reference classes.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1";
        precision, recall and F1 use ``average="macro"``.
    """
    predicted_classes = eval_pred.predictions.argmax(-1)
    gold_classes = eval_pred.label_ids
    macro = {"average": "macro"}

    return {
        "accuracy": accuracy_score(gold_classes, predicted_classes),
        "precision": precision_score(gold_classes, predicted_classes, **macro),
        "recall": recall_score(gold_classes, predicted_classes, **macro),
        "f1": f1_score(gold_classes, predicted_classes, **macro),
    }

## **2.4 Input Parameters**
Setting the model input configurations:
- For NLI task: combinations of hypothesis (*statement*) and premise(s) (*section(s)*)
- For detecting Annotation Artifacts: hypothesis (*statement only*)

Please enter the desired digit.

In [None]:
# input configurations offered to the user, keyed by the digit to type
settings = {
    1: "statement_only",
    2: "primary_sec_statement",
    3: "secondary_sec_statement",
    4: "primary_secondary_sec_statement",
}

# prompt: one "[digit] name" line per entry of `settings`
input_setting = int(input(
    "Choose a setting: \n"
    + "".join(f"[{digit}] {name} \n" for digit, name in settings.items())
))

setting = settings[input_setting]  # the configuration used by the cells below
print(f"Selected setting: {setting}")

## **2.5 Model Selection**
Setting the model for the task:

- `pritamdeka/BioBert-PubMed200kRCT`
- `medicalai/ClinicalBERT`
- `BERT-base-cased`
- `allenai/biomed_roberta_base`

**Future Work:**
- (`yikuan8/Clinical-Longformer`)

Please enter the desired digit.

In [None]:
# model checkpoints offered to the user, keyed by the digit to type
models = {
    1: "bert-base-cased",
    2: "medicalai/ClinicalBERT",
    3: "pritamdeka/BioBert-PubMed200kRCT",
    4: "allenai/biomed_roberta_base",
    5: "yikuan8/Clinical-Longformer",
}

# prompt: one "[digit] checkpoint" line per entry of `models`
input_model = int(input(
    "Choose a model: \n"
    + "".join(f"[{digit}] {checkpoint} \n" for digit, checkpoint in models.items())
))

model_name = models[input_model]  # the checkpoint used by the cells below
print(f"Selected model: {model_name}")

## **2.6 Fine-Tuned Hyperparameters**
Stored fine-tuned hyperparameters for each model

- `pritamdeka/BioBert-PubMed200kRCT`
- `medicalai/ClinicalBERT`
- `BERT-base-cased`
- `allenai/biomed_roberta_base`

**Future Work:**
- (`yikuan8/Clinical-Longformer`)

---



In [None]:
# set hyperparameters, looked up as train_args[<model name>][<setting>]
# TODO: new added parameters also need to be added in training_arguments below
def _hparams(epochs, batch_size, lr, warmup, chosen_seed, weight_dec):
    """Bundle one hyperparameter set; padding, truncation and max length are the same for every entry."""
    return {
        "epochs": epochs,  # epochs to train
        "batch_size": batch_size,  # batch size
        "lr": lr,  # learning rate
        "padd": "longest",  # padding
        "trunc": "only_first",  # truncation
        "max_len": 512,  # max input length
        "warmup": warmup,  # warm up steps
        "chosen_seed": chosen_seed,  # random seed
        "weight_dec": weight_dec,  # weight decay
    }


train_args = {
    models[1]: {  # BERT-base-cased
        settings[1]: _hparams(  # statement_only
            epochs=5, batch_size=25, lr=7.908000560027878e-05,
            warmup=0, chosen_seed=49, weight_dec=0.37535936878564713,
        ),
        settings[2]: _hparams(  # primary_sec_statement
            epochs=3, batch_size=4, lr=0.00000917445256433797,
            warmup=0, chosen_seed=40, weight_dec=0.01,
        ),
    },
    models[2]: {  # medicalai/ClinicalBERT
        settings[1]: _hparams(  # statement_only
            epochs=3, batch_size=16, lr=0.00000334675755041166,
            warmup=50, chosen_seed=22, weight_dec=0.01,
        ),
    },
    models[3]: {  # pritamdeka/BioBert-PubMed200kRCT
        settings[1]: _hparams(  # statement_only
            epochs=4, batch_size=20, lr=0.00010274905958215669,
            warmup=204, chosen_seed=1, weight_dec=0.29127004443098675,
        ),
    },
    models[4]: {  # allenai/biomed_roberta_base
        settings[1]: _hparams(  # statement_only
            epochs=4, batch_size=20, lr=0.0001006811343796169,
            warmup=124, chosen_seed=6, weight_dec=0.24031521060055702,
        ),
    },
    models[5]: {  # yikuan8/Clinical-Longformer
        settings[1]: _hparams(  # statement_only
            epochs=5, batch_size=32, lr=2e-5,
            warmup=0, chosen_seed=22, weight_dec=0.01,
        ),
    },
}

## **2.7 Loading selected Model and Tokenizer**
- selected model is loaded via `AutoModelForSequenceClassification`
- as tokenizer we use `AutoTokenizer` for the selected model
- `tokenize_function` is defined for mapping


In [None]:
# index the CTRs by trial ID, then assemble the splits for the selected setting
id_to_clinical_trial_record = id_to_CTR(raw_texts)
data = transform_data(annotations, id_to_clinical_trial_record, label2id, setting)


# model and tokenizer initialisation
def model_init():
    """Load the selected checkpoint as a sequence classifier with two labels."""
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        label2id=label2id,
        id2label=id2label,
        ignore_mismatched_sizes=True,
        return_dict=True,
    )


model = model_init()
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize the ``text`` column with the padding, truncation and max length stored in `train_args`."""
    run_cfg = train_args[model_name][setting]
    return tokenizer(
        examples["text"],
        padding=run_cfg["padd"],
        truncation=run_cfg["trunc"],
        max_length=run_cfg["max_len"],
    )


tokenized_datasets = data.map(tokenize_function, batched=True)

## **2.8 Setting Training Arguments**
`TrainingArguments` takes parameters from the `train_args` defined in 2.6


In [None]:
# hyperparameters stored in `train_args` for the selected model and setting
run_hparams = train_args[model_name][setting]

# NOTE(review): newer transformers releases appear to rename `evaluation_strategy`
# to `eval_strategy` - confirm against the installed version
training_args = TrainingArguments(
  # where results are written
  output_dir=f"{path}/{model_name}_{setting}",
  # optimisation schedule
  num_train_epochs=run_hparams["epochs"],
  per_device_train_batch_size=run_hparams["batch_size"],
  gradient_accumulation_steps=4,  # same for all models & settings
  learning_rate=run_hparams["lr"],
  weight_decay=run_hparams["weight_dec"],
  warmup_steps=run_hparams["warmup"],
  seed=run_hparams["chosen_seed"],
  use_cpu=False,
  # evaluate and checkpoint once per epoch, reload the checkpoint with the best "f1" at the end
  evaluation_strategy="epoch",
  save_strategy="epoch",
  load_best_model_at_end=True,
  metric_for_best_model="f1",
  logging_steps=10,  # default 500
)

## **2.9 Experiment Tracking with Weights and Biases**
Initialize `wandb` with the parameters from the `train_args` defined in 2.6


In [None]:
# hyperparameters stored in `train_args` for the selected model and setting
run_hparams = train_args[model_name][setting]

# run metadata and hyperparameters recorded alongside the wandb run
wandb_config = {
    "model": model_name,
    "learning_rate": run_hparams["lr"],
    "logging_steps": 10,
    "num_epochs": run_hparams["epochs"],
    "train_samples": setting,
    "architecture": "Pre-Trained-Transfomer",
    "dataset": "bigbio/sem_eval_2024_task_2",
    "batch_size": run_hparams["batch_size"],
    "padding": run_hparams["padd"],
    'truncation': run_hparams["trunc"],
    'max_length': run_hparams["max_len"],
    'warmup_steps': run_hparams["warmup"],
    'seed': run_hparams["chosen_seed"],
    'weight_decay': run_hparams["weight_dec"],
}

# start a new wandb run; "/" in the model name is replaced because it causes
# problems in project names
wandb.init(project=model_name.replace("/", "_"), config=wandb_config)
wandb.run.name = f"{model_name}_{setting}"  # wandb run name

## **2.10 Initialize Trainer**
for evaluation we use `compute_metrics` defined in 2.3


In [None]:
# Trainer for the regular training run; the model is created through `model_init`
trainer = Trainer(
    model_init=model_init,
    args=training_args,  # settings for training (TrainingArguments)
    tokenizer=tokenizer,  # Tokenizer
    train_dataset=tokenized_datasets["train"],  # train set
    eval_dataset=tokenized_datasets["validation"],  # dev set
    compute_metrics=compute_metrics,  # evaluation metrics
)

## **2.11 Hyperparameter Tuning with `optuna`**
- creating `optuna` study for 10 trials
- `objective` is set to minimize loss and maximize F1


In [None]:
def objective(trial):
    """Optuna objective: train once with sampled hyperparameters and score the result on the dev set.

    Args:
        trial: optuna trial from which learning rate, number of epochs, batch
            size, weight decay, warm-up steps and seed are sampled.

    Returns:
        tuple ``(eval_loss, f1)`` measured on ``tokenized_datasets["validation"]``;
        ``f1`` is the value produced by `compute_metrics`.
    """
    # defining ranges for hyperparameter search
    learning_rate = trial.suggest_float('learning_rate', 5e-5, 1e-2, log=True)
    num_train_epochs = trial.suggest_int('num_train_epochs', 2, 6) # <- change max for larger models
    per_device_train_batch_size = trial.suggest_int('per_device_train_batch_size', 2, 20)
    wd = trial.suggest_float('weight_decay', 0.1, 0.4)
    wus = trial.suggest_int('warm_up_steps', 5, 400)
    random_seed = trial.suggest_int('seed', 0, 40)

    # use these parameters for training
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        evaluation_strategy="epoch",
        use_cpu=False,
        logging_steps=10,
        weight_decay=wd,
        gradient_accumulation_steps=4,
        warmup_steps=wus,
        seed=random_seed,
    )

    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"], # dev set
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,  # lets evaluate() report F1 together with the loss
    )

    trainer.train()

    # one pass over the dev set yields both objectives (previously predict()
    # and evaluate() each ran the model over the whole dev set);
    # the Trainer reports metric names with an "eval_" prefix
    eval_results = trainer.evaluate()

    return eval_results['eval_loss'], eval_results['eval_f1']

## **2.12 Train (or Tune)!**
Tuning:
-  set `enable_fine_tuning` to `True` if you want to search for new hyperparameters, default `False`

Training:
- set `enable_fine_tuning` to `False`
- after training, save runs with `wandb`





In [None]:
# False: regular training run; True: hyperparameter search with optuna
enable_fine_tuning = False

if not enable_fine_tuning:
  # start training, then close the wandb run
  trainer.train()
  wandb.finish()
else:
  # two objectives returned by `objective`: first is minimized, second is maximized
  study = optuna.create_study(directions=['minimize', 'maximize'])
  study.optimize(objective, n_trials=10)

## **2.13 Save Model**
Save the model to `path` defined in 2.1 if you like


In [None]:
# only save model if results are better than logged results in wandb
is_saving = int(input("Do you want to save the model? \n[0] no \n[1] yes \n"))

print("Saving: yes" if is_saving else "Saving: no")

if is_saving:
  # same directory as the Trainer's output_dir
  trainer.save_model(f"{path}/{model_name}_{setting}")


---



---



# **3. Predicting**

## **3.1 Setting *path*-variable to load fine-tuned model**
- If saved locally, enter local path to load the model
- If Google Colab is used (and saved in Drive), please mount your Google Drive and enter the required path.

In [None]:
# directory the fine-tuned model is loaded from; "./" is kept when the user enters nothing
model_path = "./"

# `get_ipython` only exists inside IPython-based environments; a plain Python
# interpreter is treated as "not Colab" instead of failing with NameError
try:
  in_colab = 'google.colab' in str(get_ipython())
except NameError:
  in_colab = False

if in_colab:
  # imported only here: an unconditional import fails on machines without the
  # google.colab package, before the local-path prompt can be reached
  from google.colab import drive
  drive.mount('/content/drive')

user_input = str(input("Enter path to load the model: "))  # set path to directory holding the saved model
model_path = user_input if user_input else model_path
print(f"Path: {model_path}")

## **3.2 Mapping - Statement to CTR**


In [None]:
# maps (position in the test split, statement text) to the "id" field of that
# instance; the position keeps identical statements apart
statement_to_id_test = {
  (position, instance["statement"]): instance["id"]
  for position, instance in enumerate(annotations["test"])
}

## **3.3 Loading saved Model and Tokenizer**
- saved model is loaded via `AutoModelForSequenceClassification`
- as tokenizer we use `AutoTokenizer` for the saved model
- `tokenize_function` is defined for mapping


In [None]:
import os

# build ID to CTR mapping and transform data in data structure used by Tokenizer and Model
id_to_clinical_trial_record = id_to_CTR(raw_texts)
data = transform_data(annotations, id_to_clinical_trial_record, label2id, "statement_only")

# Load the model configuration
config = AutoConfig.from_pretrained(model_path, num_labels=2, label2id=label2id, id2label=id2label, ignore_mismatched_sizes=True)

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_path, config=config)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# os.path.join inserts the path separator when `model_path` was entered
# without a trailing one; plain concatenation produced a wrong file name then.
# NOTE: this rebinds `train_args`, replacing the hyperparameter dictionary of section 2.6.
# NOTE(review): torch.load unpickles the file - only load files from a trusted source.
train_args = torch.load(os.path.join(model_path, "training_args.bin"))


def tokenize_function(examples):
    """Tokenize the ``text`` column, padded to the longest entry and truncated to 512 tokens."""
    return tokenizer(
        examples["text"],
        padding="longest",
        truncation="only_first",
        max_length=512)


tokenized_datasets = data.map(tokenize_function, batched=True)

## **3.4 Predict Labels for Test Set**



In [None]:
# Put the model in evaluation mode
model.eval()

trainer = Trainer(
    model=model,
    # The inputs were padded with padding="longest" inside a batched map, i.e.
    # only to the longest entry of each map batch, so entries from different
    # map batches can differ in length. This collator pads every prediction
    # batch to a common length; without a padding collator such a batch
    # cannot be stacked into a tensor.
    data_collator=transformers.DataCollatorWithPadding(tokenizer=tokenizer),
)

# class with the highest score per test instance
results = trainer.predict(tokenized_datasets["test"])
predicted_labels = results.predictions.argmax(-1)

## **3.5 Inspect Predictions**
- map labels to CTR-IDs of the statements


In [None]:
# statement ID -> predicted class name
predicted_results = {}

# The position in the test split is part of the lookup key, so identical
# statements are told apart (they differ only in their ID). enumerate replaces
# the manual counter, and the former global named `id`, which clobbered the
# builtin for the rest of the session, is renamed.
for position, (sen, label) in enumerate(zip(tokenized_datasets["test"]["text"], predicted_labels)):
  mapped_label = id2label[int(label)]
  statement_id = statement_to_id_test[(position, sen)]
  print(mapped_label, "\t", statement_id)

  predicted_results[statement_id] = mapped_label

## **3.6 Save Predictions**
You can save the dictionary `predicted_results` in a `json`-file if you like:

In [None]:
import os

path_save = str(input("Enter path to save the predictions: "))

# os.path.join inserts the path separator when the directory was entered
# without a trailing one; plain concatenation wrote e.g. "outpredictions.json"
# for the input "out". An empty input still writes to the current directory.
with open(os.path.join(path_save, "predictions.json"), 'w') as json_file:
  json.dump(predicted_results, json_file)

# Print the path to the saved file
print(f"Results saved to: {path_save}")