In [3]:
import logging
from typing import Optional

from datasets import load_dataset
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer
import typer

In [2]:
import re
import shutil
from pathlib import Path

import sklearn.metrics
import torch
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

def export_labels_to_model(model_name: str, model) -> None:
    """
    Reads from a model configuration to export the labels of the class target to a file in the model's assets folder.
    
    Args:
      model_name (str): The name of the model. This is used to create a directory for the model.
      model: The model to export.
    """
    labels = model.config.label2id
    labels = sorted(labels, key=labels.get)

    model_assets_path = f'models/{model_name}/saved_model/1/assets'

    with open(f'{model_assets_path}/labels.txt', 'w') as f:
        f.write('\n'.join(labels))

def save_model_from_hub(model_name: str) -> None:
    """
    We load the model and tokenizer from the HuggingFace hub, save them to the `models` directory, and then export
    the labels of the model to the directory that contains all the assets.
    
    Args:
      model_name (str): The name of the model you want to save.
    """

    model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model.save_pretrained(f'models/{model_name}', from_tf=True, save_format='tf', saved_model=True)
    tokenizer.save_pretrained(f'models/{model_name}_tokenizer', from_tf=True, save_format='tf')
    export_labels_to_model(model_name, model)

    print(f"Model {model_name} saved.")

def copy_tokenizer_vocab_to_model(model_name):
    """
    We copy the tokenizer's vocabulary to the model's directory, so that we can use the model for
    predictions.

    Args:
        model_name (str): The name of the model you want to use.
    """

    tokenizer_vocab_path = f'models/{model_name}_tokenizer/vocab.txt'
    model_assets_path = f'models/{model_name}/saved_model/1/assets'

    shutil.copyfile(tokenizer_vocab_path, f'{model_assets_path}/vocab.txt')
    

def prepare_model_from_hub(model_name: str, model_dir:str) -> None:
    """
    If the model directory doesn't exist, download the model from the HuggingFace Hub, and copy the tokenizer
    vocab to the model directory so that the format can be digested by Spark NLP.
    
    Args:
      model_name (str): The name of the model you want to use.
      model_dir (str): The directory where the model will be saved.
    """

    model_path = f'{model_dir}/{model_name}'

    if not Path(model_path).is_dir():
        save_model_from_hub(model_name)
        copy_tokenizer_vocab_to_model(model_name)

def get_label_metadata(dataset):
  """
  It takes a dataset and returns a list of labels, a dictionary mapping label ids to labels, and a
  dictionary mapping labels to label ids
  
  Args:
    dataset: the dataset object
  """
  labels = [label for label in dataset['train'].features.keys() if label not in ['text', 'label_descriptions']]
  id2label = dict(enumerate(labels))
  label2id = {label:idx for idx, label in enumerate(labels)}
  return labels, id2label, label2id

def compute_metrics(eval_pred):
  """
  It takes in the predictions and labels from the model, and returns a dictionary of metrics.
  Logits are converted into probabilities following a sigmoid function; then, the predictions are
  converted into binary values by comparing the probabilities to a threshold.
  
  Args:
    eval_pred: a tuple of (predictions, labels)
  
  Returns:
    A dictionary with the accuracy, f1_micro and f1_macro
  """
  sigmoid_threshold = 0.3
  #print(eval_pred)  
  predictions, labels = eval_pred
  #print( predictions, labels)  
  accuracy = accuracy_thresh(predictions, labels, sigmoid_threshold)
  f1_micro = sklearn.metrics.f1_score(labels, (predictions > sigmoid_threshold), average="micro")
  f1_macro = sklearn.metrics.f1_score(labels, (predictions > sigmoid_threshold), average="macro")
  #confusion_matrix = sklearn.metrics.confusion_matrix(labels.flatten(), (predictions > 0.5).flatten().astype(int))
  #print(confusion_matrix)  
  return {
      "accuracy_thresh": accuracy,
      "f1_micro": f1_micro,
      "f1_macro": f1_macro,
      "eval_f1": f1_micro
  }

def accuracy_thresh(y_pred, y_true, thresh): 
    """
    It takes in a predicted probability and a true label, and returns the accuracy of the prediction
    
    Args:
      y_pred: the predicted values
      y_true: the ground truth labels
      thresh: the threshold for the prediction to be considered a positive prediction.
    
    Returns:
      The mean of the accuracy of the predictions.
    """
    y_pred = torch.from_numpy(y_pred).sigmoid()
    #print(y_pred)
    y_true = torch.from_numpy(y_true)
    #print(y_true)
    return ((y_pred>thresh)==y_true.bool()).float().mean().item()


def prepare_splits_for_training(dataset, subset_data):
  """Splits and shuffles the dataset into train and test splits.

  Args:
      dataset (DatasetDict): The dataset to split. 
      subset_data (bool, optional): Flag to use a subset of the data.

  Returns:
      Tuple[Dataset]: One dataset object per train, test split.
  """
  fraction = 0.05 if subset_data else 1
  splits = [dataset["train"], dataset["test"]]

  return [
    split.shuffle(seed=42).select(range(int(len(split) * fraction)))
    for split in splits
  ]

def convert_to_tf_dataset(dataset, data_collator, shuffle_flag, batch_size):
  """
  We convert the dataset to a tf.data.Dataset object, which is a TensorFlow object that can be used
  to train a model
  
  Args:
    dataset: The dataset to convert to a tf.data.Dataset.
    data_collator: This is a function that takes in a list of tensors and returns a single tensor.
    shuffle_flag: Whether to shuffle the dataset or not.
    batch_size: The number of samples per batch.
  
  Returns:
    A tf.data.Dataset object
  """
  return (
      dataset.to_tf_dataset(
          columns=["attention_mask", "input_ids", "token_type_ids"],
          label_cols=["labels"],
          shuffle=shuffle_flag,
          collate_fn=data_collator,
          batch_size=batch_size
      )
  )

def preprocess_text(text: str):
    """Cleans and removes special characters from the text."""

    replacements = [
        (r"what's", "what is "),
        (r"won't", "will not "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r"\'scuse", " excuse "),
        (r"\'\n", " "),
        (r"-", " "),
        (r"\'\xa0", " "),
        (r"(@.*?)[\s]", " "),
        (r"&amp;", "&"),
    ]
    
    text = text.lower()
    for pattern, replacement in replacements:
        text = re.sub(pattern, replacement, text)

    text = re.sub(r"\s+", " ", text).strip()
    return text


In [4]:
def tokenize(batch):
    """Tokenises the text and creates a numpy array with its assigned labels."""
    text = [preprocess_text(text) for text in batch["text"]]
    encoding = tokenizer(text, max_length=177, padding="max_length", truncation=True)

    labels_batch = {k: batch[k] for k in batch.keys() if k in labels}
    #print(labels_batch)
    labels_matrix = np.zeros((len(text), len(labels)))
    for idx, label in enumerate(labels):
        #print(label)
        labels_matrix[:, idx] = labels_batch[label]
    #print(labels_matrix)
    encoding["labels"] = labels_matrix.tolist()
    #print(encoding["labels"])
    return encoding

In [5]:
class MultilabelTrainer(Trainer):

    def compute_loss(self, model, inputs, return_outputs=False):
            """
            Custom loss function calculation using BCEWithLogitsLoss, it returns the loss and the outputs if the
            return_outputs flag is set to True
            This function is used during training, evaluation, and prediction; specifically every time a batch is processed.
            The default loss function is here https://github.com/huggingface/transformers/blob/820c46a707ddd033975bc3b0549eea200e64c7da/src/transformers/trainer.py#L2561
            
            Args:
              model: the model we're training
              inputs: a dictionary of input tensors
              return_outputs: if True, the loss and the model outputs are returned. If False, only the loss is
            returned. Defaults to False
            
            Returns:
              The loss and the outputs of the model.
            """
            labels = inputs.pop("labels")
            # forward pass
            outputs = model(**inputs)
            logits = outputs.logits
            # compute custom loss
            loss_fct = torch.nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), 
                            labels.float().view(-1, self.model.config.num_labels))
            return (loss, outputs) if return_outputs else loss


# Testing the original model

In [6]:
def instantiate_classifier(model_name,labels, id2label, label2id):
    """
    We're instantiating a BERT model, and then replacing the classification layer with a custom one for our task.
    
    Args:
      labels: a list of all the labels in the dataset
      id2label: a dictionary mapping from label ids to label names
      label2id: a dictionary mapping labels to integers
    
    Returns:
      A model with a classifier that has 3 layers.
    """

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        problem_type="multi_label_classification",
        num_labels=len(labels),
        id2label=id2label,
        label2id=label2id,
    )
    model.classifier = nn.Sequential(
        nn.Linear(768, 50),
        nn.ReLU(),
        nn.Linear(50, len(labels))
    )
    return model

In [7]:

def training(
    epochs= 20 ,
    output_model_name= 'stop_reasons',
    subset_data: bool = False,
    push_to_hub: bool = False,
    personal_token: Optional[str] = None,
    model_name='domenicrosati/ClinicalTrialBioBert'
):
    """
    Main logic of the fine-tuning process: this function loads the dataset, tokenizes it,
    splits it into train and validation sets, loads the model, trains it, and saves it
    
    Args:
      epochs (int): number of epochs to train for
      output_model_name (str): filename and path to the directory where the model will be saved.
      subset_data (bool): flag to indicate whether to use a subset of the data for testing purposes
      push_to_hub (bool): flag to indicate whether to push the model to the hub
      personal_token (str | None): your personal Hugging Face Hub token
    """
    
    logging.basicConfig(level=logging.INFO)

    dataset = load_dataset("opentargets/clinical_trial_reason_to_stop", split='train').train_test_split(test_size=0.1, seed=42)
    #print(dataset)
    global labels
    labels, id2label, label2id = get_label_metadata(dataset)

    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)
    dataset_cols = [col for col in dataset["train"].column_names if col not in ["text", "input_ids", "attention_mask", "labels"]]
    tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=dataset_cols)
    #print(tokenized_dataset)
    train_dataset, test_dataset = prepare_splits_for_training(tokenized_dataset, subset_data)
    #print(train_dataset)
    logging.info(f"Train dataset length: {len(train_dataset)}")
    logging.info(f"Test dataset length: {len(test_dataset)}")
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    args = TrainingArguments(
        output_dir=output_model_name,
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        weight_decay=0.01,
        data_seed=42,
        num_train_epochs=epochs,
        metric_for_best_model="f1",
        save_total_limit=1,
        load_best_model_at_end=True,
        report_to=["tensorboard"],
        save_strategy='epoch'
    )
    trainer = MultilabelTrainer(
        model=instantiate_classifier(model_name,labels, id2label, label2id),
        args=args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    
    trainer.train()
    metrics = trainer.evaluate()
    print(metrics)
    predictions = trainer.predict(test_dataset)
    print(predictions)
    trainer.save_model(output_model_name)
    if push_to_hub:
        trainer.push_to_hub()

    return trainer

In [70]:
trainer=training(model_name='domenicrosati/ClinicalTrialBioBert')

INFO:root:Train dataset length: 3372
INFO:root:Test dataset length: 375
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at domenicrosati/ClinicalTrialBioBert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy Thresh,F1 Micro,F1 Macro
1,No log,0.222329,0.0,0.934118,0.0,0.0
2,No log,0.212383,0.0,0.940863,0.0,0.0
3,No log,0.198061,0.347979,0.936941,0.347979,0.045496
4,No log,0.182911,0.319846,0.942902,0.319846,0.047401
5,0.222600,0.166113,0.449123,0.945726,0.449123,0.090466
6,0.222600,0.148894,0.572785,0.949647,0.572785,0.183926
7,0.222600,0.138052,0.552846,0.95749,0.552846,0.15613
8,0.222600,0.123628,0.62037,0.961255,0.62037,0.207971
9,0.222600,0.115981,0.667647,0.966431,0.667647,0.291283
10,0.123200,0.110649,0.69341,0.96549,0.69341,0.306426


Checkpoint destination directory stop_reasons/checkpoint-2120 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_f1': 0.7553191489361702, 'eval_loss': 0.10010654479265213, 'eval_accuracy_thresh': 0.9679999947547913, 'eval_f1_micro': 0.7553191489361702, 'eval_f1_macro': 0.44535013143424784, 'eval_runtime': 1.1242, 'eval_samples_per_second': 333.572, 'eval_steps_per_second': 10.674, 'epoch': 20.0}
PredictionOutput(predictions=array([[-6.1578274 , -4.349581  , -5.2267456 , ..., -2.925995  ,
        -7.59398   , -7.6312084 ],
       [-6.1778355 , -4.336791  , -5.2155905 , ..., -2.9656754 ,
        -7.557175  , -7.6767106 ],
       [-5.901342  , -3.7539372 , -5.0187736 , ..., -2.6876483 ,
        -7.1717057 , -7.239819  ],
       ...,
       [ 1.1950762 , -2.0148554 , -5.8048916 , ..., -1.8729451 ,
        -2.824988  , -4.97338   ],
       [-2.3707304 , -4.3253994 , -6.287864  , ...,  1.0312095 ,
        -5.6261773 , -4.512992  ],
       [-1.2205863 , -4.4121037 , -5.8266373 , ...,  0.17906903,
        -4.6545606 , -4.9390655 ]], dtype=float32), label_ids=array([[0., 0., 0., ..., 0., 0., 0.],
 

In [24]:
metrics = trainer.evaluate()
print(metrics)

{'eval_f1': 0.7542706964520368, 'eval_loss': 0.11011417955160141, 'eval_accuracy_thresh': 0.966901957988739, 'eval_f1_micro': 0.7542706964520368, 'eval_f1_macro': 0.46486736559453723, 'eval_runtime': 0.7703, 'eval_samples_per_second': 486.802, 'eval_steps_per_second': 15.578, 'epoch': 20.0}


In [30]:
test_dataset = load_dataset("opentargets/clinical_trial_reason_to_stop", split='all')
labels, id2label, label2id = get_label_metadata(test_dataset)
#tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)
dataset_cols = [col for col in test_dataset.column_names if col not in ["text", "input_ids", "attention_mask", "labels"]]
tokenized_test_dataset = test_dataset.map(tokenize, batched=True, remove_columns=dataset_cols)
predictions = trainer.predict(tokenized_test_dataset )
print(predictions)

Map:   0%|          | 0/3747 [00:00<?, ? examples/s]

PredictionOutput(predictions=array([[-3.9271808 , -3.1209285 , -4.3894625 , ..., -3.6100576 ,
        -4.401446  , -4.186394  ],
       [-2.9825072 , -3.3565352 , -5.2395205 , ...,  1.6936542 ,
        -3.4803228 , -3.8520677 ],
       [ 0.23840736, -3.3388488 , -2.9050484 , ..., -1.7254094 ,
        -3.3295286 , -1.2810366 ],
       ...,
       [-3.8575442 , -2.9342434 , -4.6032095 , ..., -3.4704201 ,
        -4.4367795 , -4.2635236 ],
       [-4.0914125 ,  2.8136544 , -6.530663  , ..., -3.807361  ,
        -4.734856  , -5.6108866 ],
       [-3.5844002 , -2.249797  , -2.9839425 , ..., -2.3815846 ,
         1.2088852 , -4.242043  ]], dtype=float32), label_ids=array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.]], dtype=float32), metrics={'test_loss': 0.0621742382645607, 'test_accuracy_thresh': 0.9867501

In [34]:
from sklearn.metrics import classification_report
test_dataset = load_dataset("opentargets/clinical_trial_reason_to_stop", split='all')

# Tokenize test dataset

dataset_cols = [col for col in test_dataset.column_names if col not in ["text", "input_ids", "attention_mask", "labels"]]
tokenized_test_dataset = test_dataset.map(tokenize, batched=True, remove_columns=dataset_cols)
predictions = trainer.predict(tokenized_test_dataset).predictions
true_labels = tokenized_test_dataset["labels"]
# Get predictions
sigmoid_threshold = 0.3

# Convert probabilities to binary predictions using threshold
predicted_labels = (predictions > sigmoid_threshold).astype(int)
report = classification_report(true_labels, predicted_labels, target_names=labels, digits=4)
print(report)

                         precision    recall  f1-score   support

          Another_Study     0.9613    0.7487    0.8418       199
Business_Administrative     0.9630    0.9520    0.9575       792
                Covid19     0.9892    1.0000    0.9946       183
           Endpoint_Met     0.0000    0.0000    0.0000        51
         Ethical_Reason     0.0000    0.0000    0.0000        17
      Insufficient_Data     0.0000    0.0000    0.0000        39
Insufficient_Enrollment     0.9792    0.9647    0.9719      1075
       Interim_Analysis     0.0000    0.0000    0.0000        28
         Invalid_Reason     0.8856    0.8360    0.8601       250
    Logistics_Resources     0.9521    0.5947    0.7321       301
               Negative     0.9603    0.9212    0.9404       368
             No_Context     0.0000    0.0000    0.0000        83
             Regulatory     0.8842    0.7500    0.8116       112
     Safety_Sideeffects     0.9634    0.8720    0.9154       211
           Study_Design 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Testing the fine-tuned model

In [14]:
from peft import PeftModel,PeftConfig
def instantiate_classifier_finetuned(model_name,labels, id2label, label2id):
    """
    We're instantiating a BERT model, and then replacing the classification layer with a custom one for our task.
    
    Args:
      labels: a list of all the labels in the dataset
      id2label: a dictionary mapping from label ids to label names
      label2id: a dictionary mapping labels to integers
    
    Returns:
      A model with a classifier that has 3 layers.
    """

    model = AutoModelForSequenceClassification.from_pretrained(
       'domenicrosati/ClinicalTrialBioBert',
        problem_type="multi_label_classification",
        num_labels=len(labels),
        id2label=id2label,
        label2id=label2id,
    )
    model = PeftModel.from_pretrained(model,model_name,is_trainable=True)
    #merged_model=model.merge_and_unload()
    #for param in merged_model.parameters():
    #     param.requires_grad = True
    model.classifier = nn.Sequential(
        nn.Linear(768, 50),
        nn.ReLU(),
        nn.Linear(50, len(labels))
    )
    return model

def training(
    epochs= 20 ,
    output_model_name= 'stop_reasons',
    subset_data: bool = False,
    push_to_hub: bool = False,
    personal_token: Optional[str] = None,
    model_name='domenicrosati/ClinicalTrialBioBert'
):
    """
    Main logic of the fine-tuning process: this function loads the dataset, tokenizes it,
    splits it into train and validation sets, loads the model, trains it, and saves it
    
    Args:
      epochs (int): number of epochs to train for
      output_model_name (str): filename and path to the directory where the model will be saved.
      subset_data (bool): flag to indicate whether to use a subset of the data for testing purposes
      push_to_hub (bool): flag to indicate whether to push the model to the hub
      personal_token (str | None): your personal Hugging Face Hub token
    """
    
    logging.basicConfig(level=logging.INFO)

    dataset = load_dataset("opentargets/clinical_trial_reason_to_stop", split='train').train_test_split(test_size=0.1, seed=42)
    #print(dataset)
    global labels
    labels, id2label, label2id = get_label_metadata(dataset)

    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained('domenicrosati/ClinicalTrialBioBert', do_lower_case=True)
    dataset_cols = [col for col in dataset["train"].column_names if col not in ["text", "input_ids", "attention_mask", "labels"]]
    tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=dataset_cols)
    #print(tokenized_dataset)
    train_dataset, test_dataset = prepare_splits_for_training(tokenized_dataset, subset_data)
    #print(train_dataset)
    logging.info(f"Train dataset length: {len(train_dataset)}")
    logging.info(f"Test dataset length: {len(test_dataset)}")
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    args = TrainingArguments(
        output_dir=output_model_name,
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        weight_decay=0.01,
        data_seed=42,
        num_train_epochs=epochs,
        metric_for_best_model="f1",
        save_total_limit=1,
        load_best_model_at_end=False,
        report_to=["tensorboard"],
        save_strategy='no'
    )
    trainer = MultilabelTrainer(
        model= instantiate_classifier_finetuned(model_name,labels, id2label, label2id),
        args=args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    
    trainer.train()
    metrics = trainer.evaluate()
    print(metrics)
    predictions = trainer.predict(test_dataset)
    print(predictions)
    # trainer.save_model(output_model_name)
    # if push_to_hub:
    #     trainer.push_to_hub()

    return trainer

In [64]:
trainer_new=training(model_name='checkpoint-205835')

INFO:root:Train dataset length: 3372
INFO:root:Test dataset length: 375
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at domenicrosati/ClinicalTrialBioBert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy Thresh,F1 Micro,F1 Macro
1,No log,0.210394,0.0,0.921569,0.0,0.0
2,No log,0.155732,0.530478,0.952471,0.530478,0.152572
3,No log,0.130636,0.591195,0.958902,0.591195,0.202407
4,No log,0.114937,0.676301,0.963608,0.676301,0.332821
5,0.155400,0.104436,0.719665,0.967216,0.719665,0.420913
6,0.155400,0.101064,0.738622,0.965647,0.738622,0.467878
7,0.155400,0.097988,0.76178,0.969412,0.76178,0.493684
8,0.155400,0.095487,0.757458,0.968627,0.757458,0.504375
9,0.155400,0.094918,0.777778,0.970039,0.777778,0.524676
10,0.057800,0.09555,0.774112,0.970039,0.774112,0.511452


{'eval_f1': 0.7696139476961394, 'eval_loss': 0.10430705547332764, 'eval_accuracy_thresh': 0.9678431153297424, 'eval_f1_micro': 0.7696139476961394, 'eval_f1_macro': 0.5458725394836774, 'eval_runtime': 0.8003, 'eval_samples_per_second': 468.555, 'eval_steps_per_second': 14.994, 'epoch': 20.0}
PredictionOutput(predictions=array([[-6.327882 , -5.5061526, -6.6775985, ..., -2.8965   , -6.376354 ,
        -7.5506186],
       [-6.344967 , -6.221231 , -6.6962476, ..., -4.3206105, -6.6697307,
        -7.7399435],
       [-6.259497 , -5.7451634, -6.751837 , ..., -4.6739883, -6.5754023,
        -7.640315 ],
       ...,
       [ 3.2270386, -3.6547203, -5.0737753, ..., -5.015713 , -5.2820845,
        -5.5807095],
       [-4.67397  , -4.587115 , -6.628884 , ...,  3.382313 , -5.016668 ,
        -6.397219 ],
       [-4.1944165, -1.1102599, -5.3798323, ..., -3.7680554, -5.387648 ,
        -5.6805696]], dtype=float32), label_ids=array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],


In [65]:
from sklearn.metrics import classification_report
test_dataset = load_dataset("opentargets/clinical_trial_reason_to_stop", split='all')

# Tokenize test dataset

dataset_cols = [col for col in test_dataset.column_names if col not in ["text", "input_ids", "attention_mask", "labels"]]
tokenized_test_dataset = test_dataset.map(tokenize, batched=True, remove_columns=dataset_cols)
predictions = trainer_new.predict(tokenized_test_dataset).predictions
true_labels = tokenized_test_dataset["labels"]
# Get predictions
sigmoid_threshold = 0.3

# Convert probabilities to binary predictions using threshold
predicted_labels = (predictions > sigmoid_threshold).astype(int)
report = classification_report(true_labels, predicted_labels, target_names=labels, digits=4)
print(report)

                         precision    recall  f1-score   support

          Another_Study     0.9744    0.9548    0.9645       199
Business_Administrative     0.9712    0.9811    0.9761       792
                Covid19     0.9946    1.0000    0.9973       183
           Endpoint_Met     0.9474    0.7059    0.8090        51
         Ethical_Reason     1.0000    0.1176    0.2105        17
      Insufficient_Data     1.0000    0.1795    0.3043        39
Insufficient_Enrollment     0.9861    0.9898    0.9879      1075
       Interim_Analysis     1.0000    0.8214    0.9020        28
         Invalid_Reason     0.9790    0.9320    0.9549       250
    Logistics_Resources     0.9148    0.9269    0.9208       301
               Negative     0.9782    0.9755    0.9769       368
             No_Context     1.0000    0.7952    0.8859        83
             Regulatory     0.9717    0.9196    0.9450       112
     Safety_Sideeffects     0.9857    0.9810    0.9834       211
           Study_Design 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
