In [24]:
import logging
from typing import Optional

from datasets import load_dataset
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer
import typer

In [22]:
import re
import shutil
from pathlib import Path

import sklearn.metrics
import torch
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

def export_labels_to_model(model_name: str, model) -> None:
    """
    Reads from a model configuration to export the labels of the class target to a file in the model's assets folder.
    
    Args:
      model_name (str): The name of the model. This is used to create a directory for the model.
      model: The model to export.
    """
    labels = model.config.label2id
    labels = sorted(labels, key=labels.get)

    model_assets_path = f'models/{model_name}/saved_model/1/assets'

    with open(f'{model_assets_path}/labels.txt', 'w') as f:
        f.write('\n'.join(labels))

def save_model_from_hub(model_name: str) -> None:
    """
    We load the model and tokenizer from the HuggingFace hub, save them to the `models` directory, and then export
    the labels of the model to the directory that contains all the assets.
    
    Args:
      model_name (str): The name of the model you want to save.
    """

    model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model.save_pretrained(f'models/{model_name}', from_tf=True, save_format='tf', saved_model=True)
    tokenizer.save_pretrained(f'models/{model_name}_tokenizer', from_tf=True, save_format='tf')
    export_labels_to_model(model_name, model)

    print(f"Model {model_name} saved.")

def copy_tokenizer_vocab_to_model(model_name):
    """
    We copy the tokenizer's vocabulary to the model's directory, so that we can use the model for
    predictions.

    Args:
        model_name (str): The name of the model you want to use.
    """

    tokenizer_vocab_path = f'models/{model_name}_tokenizer/vocab.txt'
    model_assets_path = f'models/{model_name}/saved_model/1/assets'

    shutil.copyfile(tokenizer_vocab_path, f'{model_assets_path}/vocab.txt')
    

def prepare_model_from_hub(model_name: str, model_dir:str) -> None:
    """
    If the model directory doesn't exist, download the model from the HuggingFace Hub, and copy the tokenizer
    vocab to the model directory so that the format can be digested by Spark NLP.
    
    Args:
      model_name (str): The name of the model you want to use.
      model_dir (str): The directory where the model will be saved.
    """

    model_path = f'{model_dir}/{model_name}'

    if not Path(model_path).is_dir():
        save_model_from_hub(model_name)
        copy_tokenizer_vocab_to_model(model_name)

def get_label_metadata(dataset):
  """
  It takes a dataset and returns a list of labels, a dictionary mapping label ids to labels, and a
  dictionary mapping labels to label ids
  
  Args:
    dataset: the dataset object
  """
  labels = list(set(dataset['train']['label']).union(set(dataset['test']['label'])))
  id2label = dict(enumerate(labels))
  label2id = {label:idx for idx, label in enumerate(labels)}
  return labels, id2label, label2id

def compute_metrics(eval_pred):
  """
  It takes in the predictions and labels from the model, and returns a dictionary of metrics.
  Logits are converted into probabilities following a sigmoid function; then, the predictions are
  converted into binary values by comparing the probabilities to a threshold.
  
  Args:
    eval_pred: a tuple of (predictions, labels)
  
  Returns:
    A dictionary with the accuracy, f1_micro and f1_macro
  """
  #sigmoid_threshold = 0.3
  #print(eval_pred)  
  predictions, labels = eval_pred
  #print( predictions, labels)  
  predicted_labels_index = np.argmax(predictions, axis=-1)
  num_classes = predictions.shape[1]  # Assuming predictions is a 2D array where axis=1 is the number of classes
  predicted_labels = np.eye(num_classes)[predicted_labels_index]  
  print(predicted_labels_index,predicted_labels,labels)  
   # Compute accuracy
  accuracy = (predicted_labels == labels).mean()
   # Compute F1 scores
  f1_micro = sklearn.metrics.f1_score(labels, predicted_labels, average="micro")
  f1_macro = sklearn.metrics.f1_score(labels, predicted_labels, average="macro")
    
  return {
        "accuracy": accuracy,
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
        "eval_f1": f1_micro
    }

def accuracy_thresh(y_pred, y_true, thresh): 
    """
    It takes in a predicted probability and a true label, and returns the accuracy of the prediction
    
    Args:
      y_pred: the predicted values
      y_true: the ground truth labels
      thresh: the threshold for the prediction to be considered a positive prediction.
    
    Returns:
      The mean of the accuracy of the predictions.
    """
    y_pred = torch.from_numpy(y_pred).sigmoid()
    #print(y_pred)
    y_true = torch.from_numpy(y_true)
    #print(y_true)
    return ((y_pred>thresh)==y_true.bool()).float().mean().item()


def prepare_splits_for_training(dataset, subset_data):
  """Splits and shuffles the dataset into train and test splits.

  Args:
      dataset (DatasetDict): The dataset to split. 
      subset_data (bool, optional): Flag to use a subset of the data.

  Returns:
      Tuple[Dataset]: One dataset object per train, test split.
  """
  fraction = 0.05 if subset_data else 1
  splits = [dataset["train"], dataset["test"]]

  return [
    split.shuffle(seed=42).select(range(int(len(split) * fraction)))
    for split in splits
  ]

def convert_to_tf_dataset(dataset, data_collator, shuffle_flag, batch_size):
  """
  We convert the dataset to a tf.data.Dataset object, which is a TensorFlow object that can be used
  to train a model
  
  Args:
    dataset: The dataset to convert to a tf.data.Dataset.
    data_collator: This is a function that takes in a list of tensors and returns a single tensor.
    shuffle_flag: Whether to shuffle the dataset or not.
    batch_size: The number of samples per batch.
  
  Returns:
    A tf.data.Dataset object
  """
  return (
      dataset.to_tf_dataset(
          columns=["attention_mask", "input_ids", "token_type_ids"],
          label_cols=["labels"],
          shuffle=shuffle_flag,
          collate_fn=data_collator,
          batch_size=batch_size
      )
  )

def preprocess_text(text: str):
    """Cleans and removes special characters from the text."""

    replacements = [
        (r"what's", "what is "),
        (r"won't", "will not "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r"\'scuse", " excuse "),
        (r"\'\n", " "),
        (r"-", " "),
        (r"\'\xa0", " "),
        (r"(@.*?)[\s]", " "),
        (r"&amp;", "&"),
    ]
    
    text = text.lower()
    for pattern, replacement in replacements:
        text = re.sub(pattern, replacement, text)

    text = re.sub(r"\s+", " ", text).strip()
    return text


In [23]:
dataset = load_dataset("Kira-Asimov/gender_clinical_trial", split='train')
print(dataset)

Repo card metadata block was not found. Setting CardData to empty.


Dataset({
    features: ['id', 'input', 'label', '__index_level_0__'],
    num_rows: 401406
})


In [25]:
def tokenize(batch):
    """Tokenises the text and creates a numpy array with its assigned labels."""
    text = [preprocess_text(text) for text in batch["input"]]
    encoding = tokenizer(text, max_length=256, padding="max_length", truncation=True)
    
    labels_batch = batch['label']
    #print(labels_batch)
    labels_matrix = np.zeros((len(text), len(labels)))
    for idx, temp in enumerate(labels_batch):
        #print(label)
        label_num= label2id[labels_batch[idx]]
        #print(label_num)
        labels_matrix[idx, label_num] = 1
    #print(labels_matrix)
    encoding["labels"] = labels_matrix.tolist()
    #print(encoding["labels"][0:100])
    return encoding

In [4]:
class MultilabelTrainer(Trainer):

    def compute_loss(self, model, inputs, return_outputs=False):
            """
            Custom loss function calculation using BCEWithLogitsLoss, it returns the loss and the outputs if the
            return_outputs flag is set to True
            This function is used during training, evaluation, and prediction; specifically every time a batch is processed.
            The default loss function is here https://github.com/huggingface/transformers/blob/820c46a707ddd033975bc3b0549eea200e64c7da/src/transformers/trainer.py#L2561
            
            Args:
              model: the model we're training
              inputs: a dictionary of input tensors
              return_outputs: if True, the loss and the model outputs are returned. If False, only the loss is
            returned. Defaults to False
            
            Returns:
              The loss and the outputs of the model.
            """
            labels = inputs.pop("labels")
            # forward pass
            outputs = model(**inputs)
            logits = outputs.logits
            # compute custom loss
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), 
                            labels.float().view(-1, self.model.config.num_labels))
            return (loss, outputs) if return_outputs else loss


# Testing the original model

In [26]:
def instantiate_classifier(model_name,labels, id2label, label2id):
    """
    We're instantiating a BERT model, and then replacing the classification layer with a custom one for our task.
    
    Args:
      labels: a list of all the labels in the dataset
      id2label: a dictionary mapping from label ids to label names
      label2id: a dictionary mapping labels to integers
    
    Returns:
      A model with a classifier that has 3 layers.
    """

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        problem_type="single_label_classification",
        num_labels=len(labels),
        id2label=id2label,
        label2id=label2id,
    )
    model.classifier = nn.Sequential(
        nn.Linear(768, 50),
        nn.ReLU(),
        nn.Linear(50, len(labels))
    )
    return model

In [28]:
from transformers import EarlyStoppingCallback, IntervalStrategy
def training(
    epochs= 3 ,
    output_model_name= 'gender_',
    subset_data: bool = False,
    push_to_hub: bool = False,
    personal_token: Optional[str] = None,
    model_name='domenicrosati/ClinicalTrialBioBert'
):
    """
    Main logic of the fine-tuning process: this function loads the dataset, tokenizes it,
    splits it into train and validation sets, loads the model, trains it, and saves it
    
    Args:
      epochs (int): number of epochs to train for
      output_model_name (str): filename and path to the directory where the model will be saved.
      subset_data (bool): flag to indicate whether to use a subset of the data for testing purposes
      push_to_hub (bool): flag to indicate whether to push the model to the hub
      personal_token (str | None): your personal Hugging Face Hub token
    """
    
    logging.basicConfig(level=logging.INFO)
    dataset_whole=load_dataset("Kira-Asimov/gender_clinical_trial", split='train')
    dataset =  dataset_whole.select(range(361265)).train_test_split(test_size=0.1, seed=42)
    test_dataset=dataset_whole.select(range(40141))
    #print(dataset)
    global labels, id2label, label2id
    labels, id2label, label2id = get_label_metadata(dataset)
    print(labels,id2label, label2id)
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)
    dataset_cols = [col for col in dataset["train"].column_names if col not in ["text", "input_ids", "attention_mask", "labels"]]
    tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=dataset_cols)
    #print(tokenized_dataset)
    train_dataset, test_dataset = prepare_splits_for_training(tokenized_dataset, subset_data)
    #print(train_dataset)
    logging.info(f"Train dataset length: {len(train_dataset)}")
    logging.info(f"Test dataset length: {len(test_dataset)}")
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    args = TrainingArguments(
        output_dir=output_model_name,
        evaluation_strategy="steps",
        learning_rate=5e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        weight_decay=0.01,
        data_seed=42,
        num_train_epochs=epochs,
        metric_for_best_model="f1",
        save_total_limit=1,
        load_best_model_at_end=True,
        report_to=["tensorboard"],
        eval_steps=1000,
        save_steps=1000
        
       
    )
    trainer = MultilabelTrainer(
        model=instantiate_classifier(model_name,labels, id2label, label2id),
        args=args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks = [EarlyStoppingCallback(early_stopping_patience=5)]
    )
    
    trainer.train()
    metrics = trainer.evaluate()
    print(metrics)
    predictions = trainer.predict(test_dataset)
    print(predictions)
    trainer.save_model(output_model_name)
    if push_to_hub:
        trainer.push_to_hub()

    return trainer,test_dataset

In [8]:
trainer,test_data=training(model_name='domenicrosati/ClinicalTrialBioBert')

Repo card metadata block was not found. Setting CardData to empty.


['All', 'None', 'Male', 'Female'] {0: 'All', 1: 'None', 2: 'Male', 3: 'Female'} {'All': 0, 'None': 1, 'Male': 2, 'Female': 3}


Map:   0%|          | 0/325138 [00:00<?, ? examples/s]

Map:   0%|          | 0/36127 [00:00<?, ? examples/s]

INFO:root:Train dataset length: 325138
INFO:root:Test dataset length: 36127
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at domenicrosati/ClinicalTrialBioBert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,F1,Accuracy,F1 Micro,F1 Macro
1000,0.1541,0.146124,0.961442,0.980721,0.961442,0.668738
2000,0.1386,0.136292,0.963628,0.981814,0.963628,0.674584
3000,0.1431,0.127135,0.965012,0.982506,0.965012,0.677971
4000,0.1342,0.125617,0.96587,0.982935,0.96587,0.679024
5000,0.1279,0.125586,0.967005,0.983503,0.967005,0.68106
6000,0.1288,0.124586,0.967614,0.983807,0.967614,0.681432
7000,0.1159,0.121664,0.967254,0.983627,0.967254,0.680519
8000,0.1199,0.123526,0.967697,0.983849,0.967697,0.683198
9000,0.1164,0.114208,0.968611,0.984305,0.968611,0.685141
10000,0.1265,0.115291,0.969054,0.984527,0.969054,0.685929


[0 0 3 ... 0 0 0] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
[0 0 3 ... 0 0 0] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
[0 0 3 ... 0 0 0] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
[0 0 3 ... 0 0 0] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
[0 0 3 ... 0 0 0] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 

Checkpoint destination directory gender/checkpoint-11000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


[0 0 3 ... 0 0 0] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
[0 0 3 ... 0 0 0] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
[0 0 3 ... 0 0 0] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
[0 0 3 ... 0 0 0] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
[0 0 3 ... 0 0 0] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 

[0 0 3 ... 0 0 0] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
{'eval_f1': 0.9728457940044842, 'eval_loss': 0.11968476325273514, 'eval_accuracy': 0.9864228970022421, 'eval_f1_micro': 0.9728457940044842, 'eval_f1_macro': 0.694442020880927, 'eval_runtime': 108.9114, 'eval_samples_per_second': 331.71, 'eval_steps_per_second': 10.366, 'epoch': 3.0}
[0 0 3 ... 0 0 0] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
PredictionOutput(predictions=array([[ 4.716741  , -2.342588  , -2.5688286 , -0.02610279],
       [ 5.1555758 , -1.8308469 , -1.9624968 , -1.0014923 ],
       [-1.3350621 , -3.259153  , -3.926163  ,  4.3154197 ],
       ...,
       [ 5.8448224 , -1.4336758 , -1.792121  , -1.7757436 ],
       [ 5.

In [19]:
dataset_whole=load_dataset("Kira-Asimov/gender_clinical_trial", split='train')
val_data= dataset_whole.select(range(361265)).train_test_split(test_size=0.1, seed=42)['test']
test_data=dataset_whole.select(range(40141))

Repo card metadata block was not found. Setting CardData to empty.


In [9]:
metrics = trainer.evaluate()
print(metrics)

[0 0 3 ... 0 0 0] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
{'eval_f1': 0.9728457940044842, 'eval_loss': 0.11968476325273514, 'eval_accuracy': 0.9864228970022421, 'eval_f1_micro': 0.9728457940044842, 'eval_f1_macro': 0.694442020880927, 'eval_runtime': 106.9055, 'eval_samples_per_second': 337.934, 'eval_steps_per_second': 10.561, 'epoch': 3.0}


In [30]:
test_dataset = test_data
#labels, id2label, label2id = get_label_metadata(test_dataset)
#tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)
dataset_cols = [col for col in test_dataset.column_names if col not in ["text", "input_ids", "attention_mask", "labels"]]
tokenized_test_dataset = test_dataset.map(tokenize, batched=True, remove_columns=dataset_cols)
predictions = trainer.predict(tokenized_test_dataset )
print(predictions)

Map:   0%|          | 0/40141 [00:00<?, ? examples/s]

[0 3 0 ... 0 3 0] [[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 ...
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 ...
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]]
PredictionOutput(predictions=array([[ 4.9490566, -1.2921993, -0.8475742, -1.8257195],
       [ 0.6916529, -3.1520386, -4.12158  ,  3.8424046],
       [ 5.7010155, -1.6232574, -2.0388768, -1.4439045],
       ...,
       [ 5.7971687, -1.3787054, -1.6560345, -1.8532151],
       [-1.0265679, -3.062165 , -4.358201 ,  4.1740217],
       [ 4.6868644, -1.4692303, -0.2606884, -1.7192413]], dtype=float32), label_ids=array([[1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.]], dtype=float32), metrics={'test_loss': 0.059766173362731934, 'test_accuracy': 0.9932737101716449, 'test_f1_micro': 0.9865474203432899, 'test_f1_macro': 0.7243323118107851, 'test_eval_f1': 0.9865474203432899, 'test_runt

In [31]:
from sklearn.metrics import classification_report

# Tokenize test dataset
predictions = trainer.predict(tokenized_test_dataset).predictions

predicted_labels_index = np.argmax(predictions, axis=-1)
num_classes = predictions.shape[1]  # Assuming predictions is a 2D array where axis=1 is the number of classes
predicted_labels = np.eye(num_classes)[predicted_labels_index]  
true_labels = tokenized_test_dataset["labels"]
# Get predictions


# Convert probabilities to binary predictions using threshold
#predicted_labels = (predictions > sigmoid_threshold).astype(int)
report = classification_report(true_labels, predicted_labels, target_names=labels, digits=4)
print(report)

[0 3 0 ... 0 3 0] [[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 ...
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 ...
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]]
              precision    recall  f1-score   support

         All     0.9899    0.9952    0.9925     34451
        None     0.0000    0.0000    0.0000        58
        Male     0.9688    0.9259    0.9469      1646
      Female     0.9644    0.9516    0.9579      3986

   micro avg     0.9865    0.9865    0.9865     40141
   macro avg     0.7308    0.7182    0.7243     40141
weighted avg     0.9850    0.9865    0.9858     40141
 samples avg     0.9865    0.9865    0.9865     40141



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Testing Fine-Tuned model

In [43]:
from peft import PeftModel,PeftConfig
def instantiate_classifier_finetuned(model_name,labels, id2label, label2id):
    """
    We're instantiating a BERT model, and then replacing the classification layer with a custom one for our task.
    
    Args:
      labels: a list of all the labels in the dataset
      id2label: a dictionary mapping from label ids to label names
      label2id: a dictionary mapping labels to integers
    
    Returns:
      A model with a classifier that has 3 layers.
    """

    model = AutoModelForSequenceClassification.from_pretrained(
       'domenicrosati/ClinicalTrialBioBert',
        problem_type="single_label_classification",
        num_labels=len(labels),
        id2label=id2label,
        label2id=label2id,
    )
    model = PeftModel.from_pretrained(model,model_name,is_trainable=True)
    #merged_model=model.merge_and_unload()
    #for param in merged_model.parameters():
     #    param.requires_grad = True
    model.classifier = nn.Sequential(
        nn.Linear(768, 50),
        nn.ReLU(),
        nn.Linear(50, len(labels))
    )
    return model

def training(
    epochs= 3 ,
    output_model_name= 'gender_fine-tuned',
    subset_data: bool = False,
    push_to_hub: bool = False,
    personal_token: Optional[str] = None,
    model_name='domenicrosati/ClinicalTrialBioBert'
):
    """
    Main logic of the fine-tuning process: this function loads the dataset, tokenizes it,
    splits it into train and validation sets, loads the model, trains it, and saves it
    
    Args:
      epochs (int): number of epochs to train for
      output_model_name (str): filename and path to the directory where the model will be saved.
      subset_data (bool): flag to indicate whether to use a subset of the data for testing purposes
      push_to_hub (bool): flag to indicate whether to push the model to the hub
      personal_token (str | None): your personal Hugging Face Hub token
    """
    
    logging.basicConfig(level=logging.INFO)

    dataset_whole=load_dataset("Kira-Asimov/gender_clinical_trial", split='train')
    dataset =  dataset_whole.select(range(361265)).train_test_split(test_size=0.1, seed=42)
    test_data=dataset_whole.select(range(40141))
    #print(dataset)
    global labels, id2label, label2id
    labels, id2label, label2id = get_label_metadata(dataset)
    print(labels, id2label, label2id)
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained('domenicrosati/ClinicalTrialBioBert', do_lower_case=True)
    dataset_cols = [col for col in dataset["train"].column_names if col not in ["text", "input_ids", "attention_mask", "labels"]]
    tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=dataset_cols)
    #print(tokenized_dataset)
    train_dataset, test_dataset = prepare_splits_for_training(tokenized_dataset, subset_data)
    #print(train_dataset)
    logging.info(f"Train dataset length: {len(train_dataset)}")
    logging.info(f"Test dataset length: {len(test_dataset)}")
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    args = TrainingArguments(
        output_dir=output_model_name,
        evaluation_strategy="steps",
        learning_rate=5e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        weight_decay=0.01,
        data_seed=42,
        num_train_epochs=epochs,
        metric_for_best_model="f1",
        save_total_limit=1,
        load_best_model_at_end=False,
        report_to=["tensorboard"],
        save_strategy='no',
        eval_steps=1000
    )
    trainer = MultilabelTrainer(
        model= instantiate_classifier_finetuned(model_name,labels, id2label, label2id),
        args=args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    
    trainer.train()
    metrics = trainer.evaluate()
    print(metrics)
    predictions = trainer.predict(test_dataset)
    print(predictions)
    # trainer.save_model(output_model_name)
    # if push_to_hub:
    #     trainer.push_to_hub()

    return trainer,test_data

In [45]:
trainer_new,test_data_new=training(model_name='checkpoint-205835')

Repo card metadata block was not found. Setting CardData to empty.


['All', 'None', 'Male', 'Female'] {0: 'All', 1: 'None', 2: 'Male', 3: 'Female'} {'All': 0, 'None': 1, 'Male': 2, 'Female': 3}


INFO:root:Train dataset length: 325138
INFO:root:Test dataset length: 36127
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at domenicrosati/ClinicalTrialBioBert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,F1,Accuracy,F1 Micro,F1 Macro
1000,0.1448,0.13939,0.961414,0.980707,0.961414,0.669151
2000,0.1334,0.128615,0.965649,0.982824,0.965649,0.678501
3000,0.1415,0.124494,0.966618,0.983309,0.966618,0.680979
4000,0.1308,0.122136,0.965704,0.982852,0.965704,0.67948
5000,0.1241,0.129894,0.967476,0.983738,0.967476,0.682833
6000,0.1281,0.131053,0.967393,0.983696,0.967393,0.680887
7000,0.1131,0.119495,0.967504,0.983752,0.967504,0.680975
8000,0.1197,0.117317,0.967808,0.983904,0.967808,0.683909
9000,0.1151,0.113977,0.969773,0.984887,0.969773,0.68794
10000,0.1265,0.113547,0.969109,0.984554,0.969109,0.685487


[0 0 3 ... 0 0 0] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
[0 0 3 ... 0 0 0] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
[0 0 3 ... 0 0 0] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
[0 0 3 ... 0 3 0] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
[0 0 3 ... 0 0 0] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 

[0 0 3 ... 0 0 0] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
{'eval_f1': 0.9729011542613558, 'eval_loss': 0.119154192507267, 'eval_accuracy': 0.9864505771306779, 'eval_f1_micro': 0.9729011542613558, 'eval_f1_macro': 0.6947800023735957, 'eval_runtime': 110.4846, 'eval_samples_per_second': 326.987, 'eval_steps_per_second': 10.219, 'epoch': 3.0}
[0 0 3 ... 0 0 0] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
PredictionOutput(predictions=array([[ 4.771084  , -2.0946858 , -2.7640128 , -0.06948467],
       [ 4.0393047 , -2.8971076 , -0.9918452 , -0.6479542 ],
       [-0.8695626 , -1.7980453 , -3.0349772 ,  5.308833  ],
       ...,
       [ 5.78096   , -1.4707508 , -2.7121763 , -1.7542156 ],
       [ 5.

In [46]:
from sklearn.metrics import classification_report
test_dataset = test_data_new

# Tokenize test dataset

dataset_cols = [col for col in test_dataset.column_names if col not in ["text", "input_ids", "attention_mask", "labels"]]
tokenized_test_dataset = test_dataset.map(tokenize, batched=True, remove_columns=dataset_cols)
predictions = trainer_new.predict(tokenized_test_dataset )
print(predictions)

Map:   0%|          | 0/40141 [00:00<?, ? examples/s]

[0 3 0 ... 0 3 0] [[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 ...
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 ...
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]]
PredictionOutput(predictions=array([[ 4.282083 , -2.168684 , -0.6400821, -1.9821142],
       [ 1.6935655, -2.6631646, -3.765895 ,  4.2597303],
       [ 5.553697 , -1.994703 , -2.4438477, -1.3117491],
       ...,
       [ 5.7561235, -1.431405 , -2.3210175, -2.203385 ],
       [-0.6470486, -1.8412061, -3.2235136,  5.3283615],
       [ 5.581014 , -1.4580578, -2.3946776, -1.894033 ]], dtype=float32), label_ids=array([[1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.]], dtype=float32), metrics={'test_loss': 0.05281538888812065, 'test_accuracy': 0.994245285369074, 'test_f1_micro': 0.9884905707381481, 'test_f1_macro': 0.728208965271378, 'test_eval_f1': 0.9884905707381481, 'test_runtime

In [47]:
from sklearn.metrics import classification_report

# Tokenize test dataset
predictions = trainer_new.predict(tokenized_test_dataset).predictions

predicted_labels_index = np.argmax(predictions, axis=-1)
num_classes = predictions.shape[1]  # Assuming predictions is a 2D array where axis=1 is the number of classes
predicted_labels = np.eye(num_classes)[predicted_labels_index]  
true_labels = tokenized_test_dataset["labels"]
# Get predictions


# Convert probabilities to binary predictions using threshold
#predicted_labels = (predictions > sigmoid_threshold).astype(int)
report = classification_report(true_labels, predicted_labels, target_names=labels, digits=4)
print(report)

[0 3 0 ... 0 3 0] [[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 ...
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]] [[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 ...
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]]
              precision    recall  f1-score   support

         All     0.9911    0.9960    0.9935     34451
        None     0.0000    0.0000    0.0000        58
        Male     0.9722    0.9350    0.9532      1646
      Female     0.9718    0.9604    0.9661      3986

   micro avg     0.9885    0.9885    0.9885     40141
   macro avg     0.7338    0.7228    0.7282     40141
weighted avg     0.9870    0.9885    0.9877     40141
 samples avg     0.9885    0.9885    0.9885     40141



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
