## BBBP Prediction Task Using LoRA Finetuned Molformer

Loading Dataset

In [1]:
import pandas as pd

# Directory that holds the BBBP CSV splits. Kept in a single constant so the
# location only has to be edited in one place when the notebook is moved.
BBBP_DATA_DIR = '/home/raghvendra2/Molformer_Finetuning/BBBP_Prediction_Molformer_Finetuned/bbbp'

# Training and validation splits; later cells read the `smiles` and `p_np`
# columns from these frames.
train_bbbp = pd.read_csv(f'{BBBP_DATA_DIR}/train.csv')
val_bbbp = pd.read_csv(f'{BBBP_DATA_DIR}/valid.csv')

In [3]:
# Preview the first rows of the training split.
train_bbbp.head()

Unnamed: 0,num,name,p_np,smiles
0,1,Propanolol,1,[Cl].CC(C)NCC(O)COc1cccc2ccccc12
1,2,Terbutylchlorambucil,1,C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl
2,3,40730,1,c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...
3,4,24,1,C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C
4,6,cefoperazone,1,CCN1CCN(C(=O)N[C@@H](C(=O)N[C@H]2[C@H]3SCC(=C(...


In [7]:
# Count the null entries in every column of the training split.
missing_values = train_bbbp.isnull().sum()

print("Missing values per label:\n", missing_values)

Missing values per label:
 num       0
name      0
p_np      0
smiles    0
dtype: int64


No missing values present in the dataset

In [6]:
# Share of each class label in the training split (normalize=True returns
# proportions instead of raw counts).
train_bbbp['p_np'].value_counts(normalize=True)


p_np
1    0.787675
0    0.212325
Name: proportion, dtype: float64

The classes are imbalanced: roughly 79% of the training molecules are labelled 1 and 21% are labelled 0.

Load Tokenizer and Classification Model

In [4]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Hub id of the pretrained MoLFormer checkpoint used for both objects below.
MOLFORMER_CHECKPOINT = "ibm/MoLFormer-XL-both-10pct"

# Tokenizer published with the checkpoint. trust_remote_code=True allows the
# custom code that ships with the checkpoint to be executed.
tokenizer = AutoTokenizer.from_pretrained(MOLFORMER_CHECKPOINT, trust_remote_code=True)

# Same checkpoint with a two-class sequence-classification head on top.
model = AutoModelForSequenceClassification.from_pretrained(
    MOLFORMER_CHECKPOINT,
    trust_remote_code=True,
    problem_type="single_label_classification",
    num_labels=2,
)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of MolformerForSequenceClassification were not initialized from the model checkpoint at ibm/MoLFormer-XL-both-10pct and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.dense2.bias', 'classifier.dense2.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing Training and Validation Dataset

In [10]:
# SMILES strings of each split as plain Python lists.
smiles_list= train_bbbp['smiles'].tolist()
smiles_val=val_bbbp['smiles'].tolist()
# Tokenize both splits; no padding or truncation arguments are passed here.
train_tokenized=tokenizer(smiles_list)
val_tokenized=tokenizer(smiles_val)

In [11]:
from datasets import Dataset
# Wrap the tokenizer outputs in `datasets.Dataset` objects.
train_dataset = Dataset.from_dict(train_tokenized)
val_dataset = Dataset.from_dict(val_tokenized)

In [12]:
# Class labels for each split, read from the `p_np` column.
train_labels = train_bbbp['p_np'].tolist()
val_labels = val_bbbp['p_np'].tolist()

In [13]:
# Attach the labels as a "labels" column next to the tokenizer outputs.
# NOTE(review): each dataset is reassigned from itself, so this cell is not
# idempotent; presumably a second run fails because the column already
# exists. Confirm before re-running without rebuilding the datasets.
train_dataset = train_dataset.add_column("labels", train_labels)
val_dataset = val_dataset.add_column("labels", val_labels)

LoRA Finetuning

In [13]:
import wandb
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from evaluate import load
from datasets import Dataset
import numpy as np
import pandas as pd
import os
from scipy.special import softmax
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score,matthews_corrcoef

In [14]:
def data_load(data_dir='/home/raghvendra2/Molformer_Finetuning/BBBP_Prediction_Molformer_Finetuned/bbbp'):
    """Read the BBBP training and validation splits from CSV.

    Parameters
    ----------
    data_dir : str or os.PathLike, optional
        Directory containing ``train.csv`` and ``valid.csv``. The default is
        the location that was previously hard-coded, so existing
        ``data_load()`` calls read exactly the same files as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_bbbp, val_bbbp)``, in that order.
    """
    train_bbbp = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    val_bbbp = pd.read_csv(os.path.join(data_dir, 'valid.csv'))

    return train_bbbp, val_bbbp

In [15]:
def data_prep(data_process,tokenizer):
    """Turn a BBBP dataframe into a tokenized `Dataset` with a ``labels`` column.

    Parameters
    ----------
    data_process : pandas.DataFrame
        Frame with a ``smiles`` column (model input) and a ``p_np`` column
        (class label).
    tokenizer : callable
        Tokenizer applied to the whole list of SMILES strings in one call; no
        padding or truncation arguments are passed.

    Returns
    -------
    datasets.Dataset
        The tokenizer's output fields plus a ``labels`` column.
    """
    encoded = tokenizer(data_process['smiles'].tolist())
    targets = data_process['p_np'].tolist()

    return Dataset.from_dict(encoded).add_column("labels", targets)

In [16]:
from peft import LoraConfig, get_peft_model, PeftModel

def lora_config(r,lora_alpha,dropout):
    """Build the LoRA configuration used for sequence classification.

    Parameters
    ----------
    r : int
        Rank of the LoRA update matrices.
    lora_alpha : int or float
        LoRA scaling factor.
    dropout : float
        Dropout rate applied inside the LoRA layers.

    Returns
    -------
    peft.LoraConfig
        Configuration with ``task_type="SEQ_CLS"`` that targets
        ``'all-linear'`` modules.
    """
    return LoraConfig(
        task_type="SEQ_CLS",
        target_modules='all-linear',
        r=r,
        lora_alpha=lora_alpha,
        lora_dropout=dropout,
    )

### Weighted Loss Trainer

In [None]:
import torch

# Per-class loss weights: each class is weighted by the fraction of training
# examples that do NOT belong to it, so the rarer class gets the larger weight.
# NOTE(review): this reads the notebook-level `train_dataset`, not the data
# loaded inside `run_training`; confirm both come from the same split.
_train_label_list = list(train_dataset['labels'])  # read the column once instead of four times
class_weights = [1 - (_train_label_list.count(0) / len(_train_label_list)),
                 1 - (_train_label_list.count(1) / len(_train_label_list))]

# Use the GPU when one is available, but do not fail on CPU-only machines.
class_weights = torch.from_numpy(np.array(class_weights)).float().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)


class WeightedLossTrainer(Trainer):
    """`Trainer` whose loss is cross-entropy weighted by the module-level `class_weights`."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute class-weighted cross-entropy for one batch.

        Parameters
        ----------
        model : the model being trained or evaluated.
        inputs : mapping holding the model inputs, including ``"labels"``.
        return_outputs : bool
            When true, return ``(loss, outputs)`` instead of the loss alone.
        num_items_in_batch : accepted for compatibility with the `Trainer`
            call signature; not used here.
        """
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Extract labels
        labels = inputs.get("labels")

        # The weights must live on the same device as the logits; moving them
        # here keeps the loss working whatever device the Trainer placed the
        # model on.
        loss_func = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))

        # Flatten to (N, num_labels) logits and (N,) labels before the loss.
        loss = loss_func(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

### Focal Loss Trainer

In [17]:
#focal loss computation
import torch.nn.functional as F
import torch

def focal_loss_multiclass(inputs, targets, alpha=1, gamma=2):
    """Mean multi-class focal loss computed from raw logits.

    For every example the loss is ``-alpha * (1 - p_t) ** gamma * log(p_t)``,
    where ``p_t`` is the softmax probability assigned to the true class.

    Parameters
    ----------
    inputs : torch.Tensor
        Unnormalised logits; the last dimension indexes the classes.
    targets : torch.Tensor
        Integer (int64) class indices, one per row of ``inputs``.
    alpha : float, optional
        Constant multiplier applied to every example's loss.
    gamma : float, optional
        Exponent on ``(1 - p_t)``; with ``gamma=0`` and ``alpha=1`` the result
        equals ordinary cross-entropy.

    Returns
    -------
    torch.Tensor
        Scalar tensor holding the loss averaged over all examples.
    """
    log_prob = F.log_softmax(inputs, dim=-1)

    # Select log p_t directly. Multiplying by a one-hot mask and summing would
    # evaluate 0 * -inf = NaN whenever a non-target class has a -inf logit.
    log_pt = log_prob.gather(-1, targets.unsqueeze(-1)).squeeze(-1)
    pt = torch.exp(log_pt)  # probability of the true class

    focal_loss = -alpha * (1 - pt) ** gamma * log_pt

    return focal_loss.mean()


class FocalLossTrainer(Trainer):
    """`Trainer` whose loss is `focal_loss_multiclass` applied to the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the focal loss for one batch.

        Parameters
        ----------
        model : the model being trained or evaluated.
        inputs : mapping holding the model inputs, including ``"labels"``.
        return_outputs : bool
            When true, return ``(loss, outputs)`` instead of the loss alone.
        **kwargs : extra keyword arguments passed by `Trainer`; ignored here.

        Raises
        ------
        KeyError
            If ``inputs`` has no ``"labels"`` entry.
        """
        labels = inputs["labels"]

        # The labels are not forwarded to the model; the loss is computed here.
        # A filtered copy is used so the caller's batch dict is left intact.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.logits

        loss = focal_loss_multiclass(logits, labels)

        return (loss, outputs) if return_outputs else loss

In [18]:
from evaluate import load
import numpy as np
from scipy.special import softmax
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score,matthews_corrcoef

accuracy_metric = load("accuracy")
# Loaded here but not used by compute_metrics below, which computes MCC with
# scikit-learn's matthews_corrcoef instead.
mcc_metric= load("matthews_correlation")

def compute_metrics(eval_pred):
    """Compute binary-classification metrics from a ``(logits, labels)`` pair.

    Predictions are the argmax over the class axis of the logits. AUC-ROC is
    scored on the softmax probability of class 1.

    Returns
    -------
    dict
        Keys ``eval_mcc_metric``, ``Accuracy``, ``AUC-ROC``, ``Precision``,
        ``Recall`` and ``F1-score``.
    """
    logits, labels = eval_pred

    predictions = np.argmax(logits, axis=1)
    positive_scores = softmax(logits, axis=1)[:, 1]
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]

    return {
        "eval_mcc_metric": matthews_corrcoef(labels, predictions),
        "Accuracy": accuracy,
        "AUC-ROC": roc_auc_score(labels, positive_scores),
        "Precision": precision_score(labels, predictions),
        "Recall": recall_score(labels, predictions),
        "F1-score": f1_score(labels, predictions),
    }



### Hyperparameter Sweep Training Function: Weighted Loss Trainer

In [11]:

# Initialize W&B with sweep
def run_training():
    """Run one W&B trial: LoRA fine-tuning of MoLFormer on BBBP with class-weighted loss.

    The hyperparameters ``r``, ``lora_alpha``, ``dropout`` and ``lr`` are read
    from the W&B run config. Checkpoints and the final model are written to
    ``./models_BBBP/<run id>``. Training is configured for up to 20 epochs
    with an `EarlyStoppingCallback` (patience 5), and `load_best_model_at_end`
    is set with ``eval_mcc_metric`` as the selection metric.

    NOTE(review): `WeightedLossTrainer` takes its class weights from
    notebook-level state, not from the data loaded in this function.
    NOTE(review): a later cell defines another function with this same name
    (the focal-loss variant); whichever cell ran last is the one in effect.
    """
    # Using the run as a context manager closes it even when training raises;
    # a trailing wandb.finish() call would be skipped in that case.
    with wandb.init(project="BBBP Hyperparameter Tuning") as run:
        config = run.config

        # Define unique save path for each W&B run
        save_dir = f"./models_BBBP/{run.id}"
        os.makedirs(save_dir, exist_ok=True)

        tokenizer = AutoTokenizer.from_pretrained(
            "ibm/MoLFormer-XL-both-10pct",
            trust_remote_code=True
        )

        # Load data
        train_data, val_data = data_load()
        training_data = data_prep(train_data, tokenizer)
        validation_data = data_prep(val_data, tokenizer)

        # Load base model
        model_clin = AutoModelForSequenceClassification.from_pretrained(
            "ibm/MoLFormer-XL-both-10pct",
            num_labels=2,
            problem_type="single_label_classification",
            trust_remote_code=True
        )

        # Apply LoRA
        peft_config = lora_config(config.r, config.lora_alpha, config.dropout)
        lora_model = get_peft_model(model_clin, peft_config)

        lora_model.print_trainable_parameters()

        # Training arguments
        training_args = TrainingArguments(
            output_dir=save_dir,
            evaluation_strategy="epoch",
            learning_rate=config.lr,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=32,
            num_train_epochs=20,
            weight_decay=0.01,
            save_strategy="epoch",  # Save model at each epoch
            logging_dir=f"./logs_BBBP/{run.id}",
            logging_strategy="steps",
            logging_steps=100,
            report_to="wandb",
            save_total_limit=1,
            load_best_model_at_end=True,
            metric_for_best_model="eval_mcc_metric"
        )

        accuracy_metric = load("accuracy")

        def compute_metrics(eval_pred):
            # Defined inside the function so the trial does not depend on a
            # notebook-level compute_metrics having been run first.
            logits, labels = eval_pred
            probabilities = softmax(logits, axis=1)[:, 1]
            predictions = np.argmax(logits, axis=1)
            mcc = matthews_corrcoef(labels, predictions)

            return {
                "eval_mcc_metric": mcc,
                "Accuracy": accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"],
                "AUC-ROC": roc_auc_score(labels, probabilities),
                "Precision": precision_score(labels, predictions),
                "Recall": recall_score(labels, predictions),
                "F1-score": f1_score(labels, predictions)
            }

        # Initialize trainer
        trainer = WeightedLossTrainer(
            model=lora_model,
            args=training_args,
            train_dataset=training_data,
            eval_dataset=validation_data,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
        )

        # Train model
        trainer.train()

        # Save model and tokenizer for this run
        trainer.save_model(save_dir)

        print(f"Model saved to {save_dir}")


### Hyperparameter Sweep Training Function: Focal Loss Trainer

In [None]:


# Initialize W&B with sweep
def run_training():
    """Run one W&B trial: LoRA fine-tuning of MoLFormer on BBBP with focal loss.

    The hyperparameters ``r``, ``lora_alpha``, ``dropout`` and ``lr`` are read
    from the W&B run config. Checkpoints and the final model are written to
    ``./models_BBBP_focal_loss/<run id>``. Training is configured for up to
    20 epochs with an `EarlyStoppingCallback` (patience 5), and
    `load_best_model_at_end` is set with ``eval_mcc_metric`` as the selection
    metric.

    NOTE(review): an earlier cell defines another function with this same
    name (the weighted-loss variant); whichever cell ran last is the one in
    effect.
    NOTE(review): logs go to ``./logs_BBBP/<run id>``, the same parent folder
    the weighted-loss variant uses; confirm that is intended.
    """
    # Using the run as a context manager closes it even when training raises;
    # a trailing wandb.finish() call would be skipped in that case.
    with wandb.init(project="BBBP focal loss Hyperparameter Tuning") as run:
        config = run.config

        # Define unique save path for each W&B run
        save_dir = f"./models_BBBP_focal_loss/{run.id}"
        os.makedirs(save_dir, exist_ok=True)

        tokenizer = AutoTokenizer.from_pretrained(
            "ibm/MoLFormer-XL-both-10pct",
            trust_remote_code=True
        )

        # Load data
        train_data, val_data = data_load()
        training_data = data_prep(train_data, tokenizer)
        validation_data = data_prep(val_data, tokenizer)

        # Load base model
        model_clin = AutoModelForSequenceClassification.from_pretrained(
            "ibm/MoLFormer-XL-both-10pct",
            num_labels=2,
            problem_type="single_label_classification",
            trust_remote_code=True
        )

        # Apply LoRA
        peft_config = lora_config(config.r, config.lora_alpha, config.dropout)
        lora_model = get_peft_model(model_clin, peft_config)

        lora_model.print_trainable_parameters()

        # Training arguments
        training_args = TrainingArguments(
            output_dir=save_dir,
            evaluation_strategy="epoch",
            learning_rate=config.lr,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=32,
            num_train_epochs=20,
            weight_decay=0.01,
            save_strategy="epoch",  # Save model at each epoch
            logging_dir=f"./logs_BBBP/{run.id}",
            logging_strategy="steps",
            logging_steps=100,
            report_to="wandb",
            save_total_limit=1,
            load_best_model_at_end=True,
            metric_for_best_model="eval_mcc_metric"
        )

        accuracy_metric = load("accuracy")

        def compute_metrics(eval_pred):
            # Defined inside the function so the trial does not depend on a
            # notebook-level compute_metrics having been run first.
            logits, labels = eval_pred
            probabilities = softmax(logits, axis=1)[:, 1]
            predictions = np.argmax(logits, axis=1)
            mcc = matthews_corrcoef(labels, predictions)

            return {
                "eval_mcc_metric": mcc,
                "Accuracy": accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"],
                "AUC-ROC": roc_auc_score(labels, probabilities),
                "Precision": precision_score(labels, predictions),
                "Recall": recall_score(labels, predictions),
                "F1-score": f1_score(labels, predictions)
            }

        # Initialize trainer
        trainer = FocalLossTrainer(
            model=lora_model,
            args=training_args,
            train_dataset=training_data,
            eval_dataset=validation_data,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
        )

        # Train model
        trainer.train()

        # Save model and tokenizer for this run
        trainer.save_model(save_dir)

        print(f"Model saved to {save_dir}")


In [None]:



# Stand-alone check of the LoRA parameter count for one fixed configuration.
tokenizer = AutoTokenizer.from_pretrained(
    "ibm/MoLFormer-XL-both-10pct",
    trust_remote_code=True,
)

# Base model with a two-class classification head.
model_clin = AutoModelForSequenceClassification.from_pretrained(
    "ibm/MoLFormer-XL-both-10pct",
    trust_remote_code=True,
    problem_type="single_label_classification",
    num_labels=2,
)

# Wrap the base model with LoRA and report how many parameters are trainable.
peft_config = lora_config(r=4, lora_alpha=128, dropout=0.2)
lora_model = get_peft_model(model_clin, peft_config)
lora_model.print_trainable_parameters()




Some weights of MolformerForSequenceClassification were not initialized from the model checkpoint at ibm/MoLFormer-XL-both-10pct and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.dense2.bias', 'classifier.dense2.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,640,458 || all params: 47,213,588 || trainable%: 3.4745


Evaluate on Test Dataset

In [21]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Base checkpoint that the saved LoRA adapters are attached to for testing.
# `deterministic_eval=True` is forwarded to `from_pretrained` as an extra
# keyword. NOTE(review): presumably a MoLFormer config option; confirm
# against the checkpoint's remote code.
base_model = AutoModelForSequenceClassification.from_pretrained(
    "ibm/MoLFormer-XL-both-10pct",
    trust_remote_code=True,
    deterministic_eval=True,
    problem_type="single_label_classification",
    num_labels=2,
)

# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained("ibm/MoLFormer-XL-both-10pct", trust_remote_code=True)

Some weights of MolformerForSequenceClassification were not initialized from the model checkpoint at ibm/MoLFormer-XL-both-10pct and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.dense2.bias', 'classifier.dense2.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
import pandas as pd

# Held-out test split, read from the same directory as train.csv and valid.csv.
test_data=pd.read_csv('/home/raghvendra2/Molformer_Finetuning/BBBP_Prediction_Molformer_Finetuned/bbbp/test.csv')

In [23]:
from datasets import Dataset

# SMILES strings of the test split as a plain Python list.
smiles_test = test_data['smiles'].tolist()

# Tokenize without any padding or truncation arguments.
test_tokenized =tokenizer(smiles_test)

# Wrap the tokenizer output in a `datasets.Dataset`.
test_dataset = Dataset.from_dict(test_tokenized)

In [24]:
# Class labels of the test split, read from the `p_np` column.
test_labels = test_data['p_np'].tolist() 


# Attach them as the "labels" column.
test_dataset = test_dataset.add_column("labels", test_labels)

In [25]:
from evaluate import load
import numpy as np
from scipy.special import softmax
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef

accuracy_metric = load("accuracy")

def compute_metrics(eval_pred):
    """Score test-set predictions given a ``(logits, labels)`` pair.

    The predicted class is the argmax of the logits along axis 1; AUC-ROC uses
    the softmax probability of class 1.

    Returns
    -------
    dict
        ``eval_mcc_metric``, ``Accuracy``, ``AUC-ROC``, ``Precision``,
        ``Recall`` and ``F1-score``.
    """
    logits, labels = eval_pred

    class1_probability = softmax(logits, axis=1)[:, 1]
    predicted_class = np.argmax(logits, axis=1)

    scores = {"eval_mcc_metric": matthews_corrcoef(labels, predicted_class)}
    scores["Accuracy"] = accuracy_metric.compute(
        predictions=predicted_class, references=labels
    )["accuracy"]
    scores["AUC-ROC"] = roc_auc_score(labels, class1_probability)
    scores["Precision"] = precision_score(labels, predicted_class)
    scores["Recall"] = recall_score(labels, predicted_class)
    scores["F1-score"] = f1_score(labels, predicted_class)
    return scores

In [26]:
from transformers import Trainer, TrainingArguments

# Evaluation-only arguments; no training hyperparameters are set here.
# NOTE(review): the output directory name mentions "clintox" although this
# notebook evaluates BBBP. Presumably carried over from another notebook;
# confirm.
eval_args = TrainingArguments(
    report_to="none",  # Disable logging to W&B for test
    per_device_eval_batch_size=16,
    output_dir="./test_results_clintox_wandb",
)

# Tokenizer handed to the evaluation Trainer.
tokenizer = AutoTokenizer.from_pretrained("ibm/MoLFormer-XL-both-10pct", trust_remote_code=True)

In [31]:
# Collect every sub-directory of the focal-loss sweep output folder; the sweep
# training function saves one run's model under ./models_BBBP_focal_loss/<run id>.
models_dir = "./models_BBBP_focal_loss"
model_checkpoints = [
    os.path.join(models_dir, ckpt)
    for ckpt in os.listdir(models_dir)
    if os.path.isdir(os.path.join(models_dir, ckpt))  # Ensure it's a directory
]
print(model_checkpoints)

# Evaluate each saved adapter on the test set.
for model_path in model_checkpoints:

    print(f"Evaluating model: {model_path}")

    # Load a fresh base model for every adapter. PeftModel.from_pretrained
    # injects the adapter layers into the model it is given, so reusing one
    # shared base model would attach each adapter to a model already modified
    # by the adapters loaded before it.
    fresh_base_model = AutoModelForSequenceClassification.from_pretrained(
        "ibm/MoLFormer-XL-both-10pct",
        num_labels=2,
        problem_type="single_label_classification",
        trust_remote_code=True,
        deterministic_eval=True
    )

    # Load the fine-tuned LoRA adapter model
    adapter_model = PeftModel.from_pretrained(fresh_base_model, model_path)
    adapter_model.eval()  # Set to evaluation mode

    # Initialize Trainer for evaluation; FocalLossTrainer is used so the
    # reported eval_loss is the focal loss.
    trainer = FocalLossTrainer(
        model=adapter_model,
        args=eval_args,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # Run evaluation
    test_results = trainer.evaluate()

    print(f"Test Results for BBBP focal loss {model_path}: {test_results}")


['./models_BBBP_focal_loss/qh5ahemf', './models_BBBP_focal_loss/6lsvst0q', './models_BBBP_focal_loss/7gw4703h', './models_BBBP_focal_loss/zvxvvpai', './models_BBBP_focal_loss/u6qyowv9', './models_BBBP_focal_loss/mbek35f7', './models_BBBP_focal_loss/bxga4pr1', './models_BBBP_focal_loss/mnh92oo9', './models_BBBP_focal_loss/zoq9c9ir', './models_BBBP_focal_loss/appu9sb4']
Evaluating model: ./models_BBBP_focal_loss/qh5ahemf


  trainer = FocalLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP focal loss ./models_BBBP_focal_loss/qh5ahemf: {'eval_mcc_metric': 0.8214238960405138, 'eval_loss': 0.12069400399923325, 'eval_model_preparation_time': 0.0131, 'eval_Accuracy': 0.9175257731958762, 'eval_AUC-ROC': 0.964273445551357, 'eval_Precision': 0.928, 'eval_Recall': 0.943089430894309, 'eval_F1-score': 0.9354838709677419, 'eval_runtime': 0.4934, 'eval_samples_per_second': 393.157, 'eval_steps_per_second': 26.346}
Evaluating model: ./models_BBBP_focal_loss/6lsvst0q


  trainer = FocalLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP focal loss ./models_BBBP_focal_loss/6lsvst0q: {'eval_mcc_metric': 0.7641952976557471, 'eval_loss': 0.07125464826822281, 'eval_model_preparation_time': 0.0137, 'eval_Accuracy': 0.8917525773195877, 'eval_AUC-ROC': 0.952708118630482, 'eval_Precision': 0.8923076923076924, 'eval_Recall': 0.943089430894309, 'eval_F1-score': 0.9169960474308301, 'eval_runtime': 0.4726, 'eval_samples_per_second': 410.471, 'eval_steps_per_second': 27.506}
Evaluating model: ./models_BBBP_focal_loss/7gw4703h


  trainer = FocalLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP focal loss ./models_BBBP_focal_loss/7gw4703h: {'eval_mcc_metric': 0.7330462738834338, 'eval_loss': 0.07228748500347137, 'eval_model_preparation_time': 0.013, 'eval_Accuracy': 0.8711340206185567, 'eval_AUC-ROC': 0.9480132829497309, 'eval_Precision': 0.9298245614035088, 'eval_Recall': 0.8617886178861789, 'eval_F1-score': 0.8945147679324894, 'eval_runtime': 0.481, 'eval_samples_per_second': 403.305, 'eval_steps_per_second': 27.026}
Evaluating model: ./models_BBBP_focal_loss/zvxvvpai


  trainer = FocalLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP focal loss ./models_BBBP_focal_loss/zvxvvpai: {'eval_mcc_metric': 0.7759615538661074, 'eval_loss': 0.07295171171426773, 'eval_model_preparation_time': 0.0131, 'eval_Accuracy': 0.8969072164948454, 'eval_AUC-ROC': 0.9653040192373755, 'eval_Precision': 0.905511811023622, 'eval_Recall': 0.9349593495934959, 'eval_F1-score': 0.92, 'eval_runtime': 0.4772, 'eval_samples_per_second': 406.572, 'eval_steps_per_second': 27.245}
Evaluating model: ./models_BBBP_focal_loss/u6qyowv9


  trainer = FocalLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP focal loss ./models_BBBP_focal_loss/u6qyowv9: {'eval_mcc_metric': 0.7875047469022934, 'eval_loss': 0.06442337483167648, 'eval_model_preparation_time': 0.0133, 'eval_Accuracy': 0.9020618556701031, 'eval_AUC-ROC': 0.9607236917439597, 'eval_Precision': 0.9126984126984127, 'eval_Recall': 0.9349593495934959, 'eval_F1-score': 0.9236947791164659, 'eval_runtime': 0.4789, 'eval_samples_per_second': 405.102, 'eval_steps_per_second': 27.146}
Evaluating model: ./models_BBBP_focal_loss/mbek35f7


  trainer = FocalLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


  trainer = FocalLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP focal loss ./models_BBBP_focal_loss/mbek35f7: {'eval_mcc_metric': 0.7534564646973192, 'eval_loss': 0.06791112571954727, 'eval_model_preparation_time': 0.0131, 'eval_Accuracy': 0.8865979381443299, 'eval_AUC-ROC': 0.9646169701133631, 'eval_Precision': 0.8976377952755905, 'eval_Recall': 0.926829268292683, 'eval_F1-score': 0.912, 'eval_runtime': 0.4778, 'eval_samples_per_second': 406.037, 'eval_steps_per_second': 27.209}
Evaluating model: ./models_BBBP_focal_loss/bxga4pr1


Test Results for BBBP focal loss ./models_BBBP_focal_loss/bxga4pr1: {'eval_mcc_metric': 0.7414360272005406, 'eval_loss': 0.07517533749341965, 'eval_model_preparation_time': 0.0156, 'eval_Accuracy': 0.8814432989690721, 'eval_AUC-ROC': 0.955570823313867, 'eval_Precision': 0.8846153846153846, 'eval_Recall': 0.9349593495934959, 'eval_F1-score': 0.9090909090909091, 'eval_runtime': 0.574, 'eval_samples_per_second': 337.978, 'eval_steps_per_second': 22.648}
Evaluating model: ./models_BBBP_focal_loss/mnh92oo9


  trainer = FocalLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP focal loss ./models_BBBP_focal_loss/mnh92oo9: {'eval_mcc_metric': 0.7641952976557471, 'eval_loss': 0.06537608057260513, 'eval_model_preparation_time': 0.0155, 'eval_Accuracy': 0.8917525773195877, 'eval_AUC-ROC': 0.9616397572426428, 'eval_Precision': 0.8923076923076924, 'eval_Recall': 0.943089430894309, 'eval_F1-score': 0.9169960474308301, 'eval_runtime': 0.5809, 'eval_samples_per_second': 333.95, 'eval_steps_per_second': 22.378}
Evaluating model: ./models_BBBP_focal_loss/zoq9c9ir


  trainer = FocalLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP focal loss ./models_BBBP_focal_loss/zoq9c9ir: {'eval_mcc_metric': 0.6959174862901277, 'eval_loss': 0.07610391825437546, 'eval_model_preparation_time': 0.0155, 'eval_Accuracy': 0.8608247422680413, 'eval_AUC-ROC': 0.9449215618916753, 'eval_Precision': 0.8692307692307693, 'eval_Recall': 0.9186991869918699, 'eval_F1-score': 0.8932806324110671, 'eval_runtime': 0.5813, 'eval_samples_per_second': 333.728, 'eval_steps_per_second': 22.363}
Evaluating model: ./models_BBBP_focal_loss/appu9sb4


  trainer = FocalLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP focal loss ./models_BBBP_focal_loss/appu9sb4: {'eval_mcc_metric': 0.7299512279923026, 'eval_loss': 0.07019228488206863, 'eval_model_preparation_time': 0.0156, 'eval_Accuracy': 0.8762886597938144, 'eval_AUC-ROC': 0.954425741440513, 'eval_Precision': 0.8778625954198473, 'eval_Recall': 0.9349593495934959, 'eval_F1-score': 0.905511811023622, 'eval_runtime': 0.5822, 'eval_samples_per_second': 333.216, 'eval_steps_per_second': 22.329}


### Best model for focal loss: models_BBBP_focal_loss/zvxvvpai

In [35]:
# Collect every sub-directory of the weighted-loss sweep output folder; the
# sweep training function saves one run's model under ./models_BBBP/<run id>.
models_dir = "./models_BBBP"
model_checkpoints = [
    os.path.join(models_dir, ckpt)
    for ckpt in os.listdir(models_dir)
    if os.path.isdir(os.path.join(models_dir, ckpt))  # Ensure it's a directory
]
print(model_checkpoints)

# Evaluate each saved adapter on the test set.
for model_path in model_checkpoints:

    print(f"Evaluating model: {model_path}")

    # Load a fresh base model for every adapter. PeftModel.from_pretrained
    # injects the adapter layers into the model it is given, so reusing one
    # shared base model (here also already used by the focal-loss evaluation
    # cell) would attach each adapter to a model modified by earlier adapters.
    fresh_base_model = AutoModelForSequenceClassification.from_pretrained(
        "ibm/MoLFormer-XL-both-10pct",
        num_labels=2,
        problem_type="single_label_classification",
        trust_remote_code=True,
        deterministic_eval=True
    )

    # Load the fine-tuned LoRA adapter model
    adapter_model = PeftModel.from_pretrained(fresh_base_model, model_path)
    adapter_model.eval()  # Set to evaluation mode

    # Initialize Trainer for evaluation; WeightedLossTrainer is used so the
    # reported eval_loss is the class-weighted cross-entropy.
    trainer = WeightedLossTrainer(
        model=adapter_model,
        args=eval_args,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # Run evaluation
    test_results = trainer.evaluate()

    print(f"Test Results for BBBP {model_path}: {test_results}")


['./models_BBBP/9vkzvef4', './models_BBBP/ngi394z7', './models_BBBP/mx5n8q53', './models_BBBP/5xmpcs4o', './models_BBBP/5tf2qbgz', './models_BBBP/8xuh5hwj', './models_BBBP/repizpfk', './models_BBBP/7dgp2e3w', './models_BBBP/sot1c3xx', './models_BBBP/ebn9ruz1', './models_BBBP/fk9mffo4', './models_BBBP/q4lnica8', './models_BBBP/q2xirung', './models_BBBP/50iwmcx3', './models_BBBP/gxheqjrf', './models_BBBP/2g93zcww', './models_BBBP/ste6wfed', './models_BBBP/4wgyglva', './models_BBBP/8fpgrske', './models_BBBP/kex36n5y']
Evaluating model: ./models_BBBP/9vkzvef4


  trainer = WeightedLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP ./models_BBBP/9vkzvef4: {'eval_mcc_metric': -0.3805373150193529, 'eval_loss': 0.957651674747467, 'eval_model_preparation_time': 0.0125, 'eval_Accuracy': 0.27319587628865977, 'eval_AUC-ROC': 0.15092179090804994, 'eval_Precision': 0.25, 'eval_Recall': 0.07317073170731707, 'eval_F1-score': 0.11320754716981132, 'eval_runtime': 0.4955, 'eval_samples_per_second': 391.521, 'eval_steps_per_second': 26.236}
Evaluating model: ./models_BBBP/ngi394z7


  trainer = WeightedLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP ./models_BBBP/ngi394z7: {'eval_mcc_metric': 0.723782267124832, 'eval_loss': 0.23008368909358978, 'eval_model_preparation_time': 0.0128, 'eval_Accuracy': 0.865979381443299, 'eval_AUC-ROC': 0.9596931180579411, 'eval_Precision': 0.9292035398230089, 'eval_Recall': 0.8536585365853658, 'eval_F1-score': 0.8898305084745762, 'eval_runtime': 0.4916, 'eval_samples_per_second': 394.667, 'eval_steps_per_second': 26.447}
Evaluating model: ./models_BBBP/mx5n8q53


  trainer = WeightedLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP ./models_BBBP/mx5n8q53: {'eval_mcc_metric': 0.7454811946201522, 'eval_loss': 0.2184453010559082, 'eval_model_preparation_time': 0.0128, 'eval_Accuracy': 0.8762886597938144, 'eval_AUC-ROC': 0.9607236917439597, 'eval_Precision': 0.9380530973451328, 'eval_Recall': 0.8617886178861789, 'eval_F1-score': 0.8983050847457628, 'eval_runtime': 0.4917, 'eval_samples_per_second': 394.541, 'eval_steps_per_second': 26.438}
Evaluating model: ./models_BBBP/5xmpcs4o


  trainer = WeightedLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


  trainer = WeightedLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP ./models_BBBP/5xmpcs4o: {'eval_mcc_metric': 0.7146317484209869, 'eval_loss': 0.23606035113334656, 'eval_model_preparation_time': 0.0127, 'eval_Accuracy': 0.8608247422680413, 'eval_AUC-ROC': 0.9514485285697927, 'eval_Precision': 0.9285714285714286, 'eval_Recall': 0.8455284552845529, 'eval_F1-score': 0.8851063829787233, 'eval_runtime': 0.4868, 'eval_samples_per_second': 398.484, 'eval_steps_per_second': 26.703}
Evaluating model: ./models_BBBP/5tf2qbgz


Test Results for BBBP ./models_BBBP/5tf2qbgz: {'eval_mcc_metric': 0.7330462738834338, 'eval_loss': 0.2229623943567276, 'eval_model_preparation_time': 0.0129, 'eval_Accuracy': 0.8711340206185567, 'eval_AUC-ROC': 0.9551127905645254, 'eval_Precision': 0.9298245614035088, 'eval_Recall': 0.8617886178861789, 'eval_F1-score': 0.8945147679324894, 'eval_runtime': 0.4869, 'eval_samples_per_second': 398.46, 'eval_steps_per_second': 26.701}
Evaluating model: ./models_BBBP/8xuh5hwj


  trainer = WeightedLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP ./models_BBBP/8xuh5hwj: {'eval_mcc_metric': 0.7146317484209869, 'eval_loss': 0.23071998357772827, 'eval_model_preparation_time': 0.0128, 'eval_Accuracy': 0.8608247422680413, 'eval_AUC-ROC': 0.9596931180579411, 'eval_Precision': 0.9285714285714286, 'eval_Recall': 0.8455284552845529, 'eval_F1-score': 0.8851063829787233, 'eval_runtime': 0.4877, 'eval_samples_per_second': 397.796, 'eval_steps_per_second': 26.656}
Evaluating model: ./models_BBBP/repizpfk


  trainer = WeightedLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP ./models_BBBP/repizpfk: {'eval_mcc_metric': 0.6285452326977308, 'eval_loss': 0.28119876980781555, 'eval_model_preparation_time': 0.0128, 'eval_Accuracy': 0.8092783505154639, 'eval_AUC-ROC': 0.9347303332188251, 'eval_Precision': 0.9215686274509803, 'eval_Recall': 0.7642276422764228, 'eval_F1-score': 0.8355555555555556, 'eval_runtime': 0.4805, 'eval_samples_per_second': 403.784, 'eval_steps_per_second': 27.058}
Evaluating model: ./models_BBBP/7dgp2e3w


  trainer = WeightedLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP ./models_BBBP/7dgp2e3w: {'eval_mcc_metric': 0.6966551358109493, 'eval_loss': 0.23464028537273407, 'eval_model_preparation_time': 0.0128, 'eval_Accuracy': 0.8505154639175257, 'eval_AUC-ROC': 0.9508759876331156, 'eval_Precision': 0.9272727272727272, 'eval_Recall': 0.8292682926829268, 'eval_F1-score': 0.8755364806866953, 'eval_runtime': 0.4866, 'eval_samples_per_second': 398.707, 'eval_steps_per_second': 26.717}
Evaluating model: ./models_BBBP/sot1c3xx


  trainer = WeightedLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


  trainer = WeightedLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP ./models_BBBP/sot1c3xx: {'eval_mcc_metric': 0.7704751344800757, 'eval_loss': 0.22046661376953125, 'eval_model_preparation_time': 0.0128, 'eval_Accuracy': 0.8865979381443299, 'eval_AUC-ROC': 0.9611817244933012, 'eval_Precision': 0.954954954954955, 'eval_Recall': 0.8617886178861789, 'eval_F1-score': 0.905982905982906, 'eval_runtime': 0.4867, 'eval_samples_per_second': 398.631, 'eval_steps_per_second': 26.712}
Evaluating model: ./models_BBBP/ebn9ruz1


Test Results for BBBP ./models_BBBP/ebn9ruz1: {'eval_mcc_metric': 0.6367615579291525, 'eval_loss': 0.2501973509788513, 'eval_model_preparation_time': 0.0127, 'eval_Accuracy': 0.8144329896907216, 'eval_AUC-ROC': 0.9506469712584449, 'eval_Precision': 0.9223300970873787, 'eval_Recall': 0.7723577235772358, 'eval_F1-score': 0.8407079646017699, 'eval_runtime': 0.4858, 'eval_samples_per_second': 399.338, 'eval_steps_per_second': 26.76}
Evaluating model: ./models_BBBP/fk9mffo4


  trainer = WeightedLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP ./models_BBBP/fk9mffo4: {'eval_mcc_metric': 0.7704751344800757, 'eval_loss': 0.21848148107528687, 'eval_model_preparation_time': 0.0127, 'eval_Accuracy': 0.8865979381443299, 'eval_AUC-ROC': 0.9628993473033323, 'eval_Precision': 0.954954954954955, 'eval_Recall': 0.8617886178861789, 'eval_F1-score': 0.905982905982906, 'eval_runtime': 0.4818, 'eval_samples_per_second': 402.663, 'eval_steps_per_second': 26.983}
Evaluating model: ./models_BBBP/q4lnica8


  trainer = WeightedLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP ./models_BBBP/q4lnica8: {'eval_mcc_metric': 0.7765225738324971, 'eval_loss': 0.21523205935955048, 'eval_model_preparation_time': 0.0129, 'eval_Accuracy': 0.8917525773195877, 'eval_AUC-ROC': 0.9641589373640215, 'eval_Precision': 0.9473684210526315, 'eval_Recall': 0.8780487804878049, 'eval_F1-score': 0.9113924050632911, 'eval_runtime': 0.4973, 'eval_samples_per_second': 390.106, 'eval_steps_per_second': 26.141}
Evaluating model: ./models_BBBP/q2xirung


  trainer = WeightedLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP ./models_BBBP/q2xirung: {'eval_mcc_metric': 0.7272188253817875, 'eval_loss': 0.22433921694755554, 'eval_model_preparation_time': 0.0128, 'eval_Accuracy': 0.865979381443299, 'eval_AUC-ROC': 0.9595786098706057, 'eval_Precision': 0.9369369369369369, 'eval_Recall': 0.8455284552845529, 'eval_F1-score': 0.8888888888888888, 'eval_runtime': 0.4826, 'eval_samples_per_second': 402.014, 'eval_steps_per_second': 26.939}
Evaluating model: ./models_BBBP/50iwmcx3


  trainer = WeightedLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP ./models_BBBP/50iwmcx3: {'eval_mcc_metric': 0.7834337703851408, 'eval_loss': 0.21752232313156128, 'eval_model_preparation_time': 0.0127, 'eval_Accuracy': 0.8969072164948454, 'eval_AUC-ROC': 0.960838199931295, 'eval_Precision': 0.9401709401709402, 'eval_Recall': 0.8943089430894309, 'eval_F1-score': 0.9166666666666666, 'eval_runtime': 0.4895, 'eval_samples_per_second': 396.307, 'eval_steps_per_second': 26.557}
Evaluating model: ./models_BBBP/gxheqjrf


  trainer = WeightedLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP ./models_BBBP/gxheqjrf: {'eval_mcc_metric': 0.6450566560194042, 'eval_loss': 0.2560390532016754, 'eval_model_preparation_time': 0.0128, 'eval_Accuracy': 0.8195876288659794, 'eval_AUC-ROC': 0.9462956601396999, 'eval_Precision': 0.9230769230769231, 'eval_Recall': 0.7804878048780488, 'eval_F1-score': 0.8458149779735683, 'eval_runtime': 0.4947, 'eval_samples_per_second': 392.165, 'eval_steps_per_second': 26.279}
Evaluating model: ./models_BBBP/2g93zcww


  trainer = WeightedLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP ./models_BBBP/2g93zcww: {'eval_mcc_metric': 0.7093887596065858, 'eval_loss': 0.2195080816745758, 'eval_model_preparation_time': 0.0129, 'eval_Accuracy': 0.8556701030927835, 'eval_AUC-ROC': 0.9569449215618917, 'eval_Precision': 0.9357798165137615, 'eval_Recall': 0.8292682926829268, 'eval_F1-score': 0.8793103448275862, 'eval_runtime': 0.4865, 'eval_samples_per_second': 398.806, 'eval_steps_per_second': 26.724}
Evaluating model: ./models_BBBP/ste6wfed


  trainer = WeightedLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP ./models_BBBP/ste6wfed: {'eval_mcc_metric': 0.7888790496107927, 'eval_loss': 0.2089870125055313, 'eval_model_preparation_time': 0.0129, 'eval_Accuracy': 0.8969072164948454, 'eval_AUC-ROC': 0.966334592923394, 'eval_Precision': 0.9557522123893806, 'eval_Recall': 0.8780487804878049, 'eval_F1-score': 0.9152542372881356, 'eval_runtime': 0.489, 'eval_samples_per_second': 396.751, 'eval_steps_per_second': 26.586}
Evaluating model: ./models_BBBP/4wgyglva


  trainer = WeightedLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP ./models_BBBP/4wgyglva: {'eval_mcc_metric': 0.7330462738834338, 'eval_loss': 0.23195064067840576, 'eval_model_preparation_time': 0.0128, 'eval_Accuracy': 0.8711340206185567, 'eval_AUC-ROC': 0.9590060689339287, 'eval_Precision': 0.9298245614035088, 'eval_Recall': 0.8617886178861789, 'eval_F1-score': 0.8945147679324894, 'eval_runtime': 0.4915, 'eval_samples_per_second': 394.681, 'eval_steps_per_second': 26.448}
Evaluating model: ./models_BBBP/8fpgrske


  trainer = WeightedLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP ./models_BBBP/8fpgrske: {'eval_mcc_metric': 0.764207955971172, 'eval_loss': 0.22854012250900269, 'eval_model_preparation_time': 0.0128, 'eval_Accuracy': 0.8865979381443299, 'eval_AUC-ROC': 0.9609527081186304, 'eval_Precision': 0.9391304347826087, 'eval_Recall': 0.8780487804878049, 'eval_F1-score': 0.907563025210084, 'eval_runtime': 0.4873, 'eval_samples_per_second': 398.113, 'eval_steps_per_second': 26.678}
Evaluating model: ./models_BBBP/kex36n5y


  trainer = WeightedLossTrainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Test Results for BBBP ./models_BBBP/kex36n5y: {'eval_mcc_metric': 0.6919611956034514, 'eval_loss': 0.23126566410064697, 'eval_model_preparation_time': 0.0129, 'eval_Accuracy': 0.845360824742268, 'eval_AUC-ROC': 0.9560288560632084, 'eval_Precision': 0.9345794392523364, 'eval_Recall': 0.8130081300813008, 'eval_F1-score': 0.8695652173913043, 'eval_runtime': 0.4946, 'eval_samples_per_second': 392.267, 'eval_steps_per_second': 26.286}


### Best model for Weighted Loss Trainer: models_BBBP/ste6wfed

## Focal Loss Evaluation