**METRICS**

In [1]:
import numpy as np
import pandas as pd

# Baseline metric dumps for the two embedding models (one .npy file each).
chemberta_metrics_path = "/home/raghvendra2/Molformer_Finetuning/classification_model_flavor/Results_baseline/metrics_ChemBERTa_Base_embeddings.npy"
molformer_metrics_path = "/home/raghvendra2/Molformer_Finetuning/classification_model_flavor/Results_baseline/metrics_MolFormer_Base_embeddings.npy"

# Each file holds a single pickled Python object; .item() unwraps it from the
# array numpy returns. allow_pickle=True unpickles arbitrary objects, so only
# load files from a trusted source.
molformer_metrics_dict = np.load(molformer_metrics_path, allow_pickle=True).item()
chemberta_metrics_dict = np.load(chemberta_metrics_path, allow_pickle=True).item()

# Transpose so that the outer keys become rows and the inner keys columns.
molformer_df = pd.DataFrame(molformer_metrics_dict).transpose()
chemberta_df = pd.DataFrame(chemberta_metrics_dict).transpose()

# Print both tables in full, MolFormer first.
for _banner, _table in (
    ("\n===== MolFormer Metrics =====", molformer_df),
    ("\n===== ChemBERTa Metrics =====", chemberta_df),
):
    print(_banner)
    print(_table.to_string())



===== MolFormer Metrics =====
          Accuracy     AUC  Precision  Recall  F1 Micro  F1 Macro
XGBoost     0.8793  0.9667     0.6663  0.6443    0.8793    0.6522
LightGBM    0.8784  0.9623     0.8470  0.7082    0.8784    0.7159
CatBoost    0.8385  0.9397     0.6231  0.6796    0.8385    0.6473

===== ChemBERTa Metrics =====
          Accuracy     AUC  Precision  Recall  F1 Micro  F1 Macro
XGBoost     0.8855  0.9656     0.8680  0.6924    0.8855    0.7194
LightGBM    0.8869  0.9623     0.8599  0.7186    0.8869    0.7278
CatBoost    0.8625  0.9560     0.8218  0.7391    0.8625    0.7436


In [2]:
import os
import numpy as np
import pandas as pd

# Directory scanned for metric dumps produced by the fine-tuned runs.
results_dir = "/home/raghvendra2/Molformer_Finetuning/classification_model_flavor/Results"

# Keep only files named metrics_*.npy, in alphabetical order so the printed
# tables come out in a stable sequence.
metric_files = sorted(
    entry
    for entry in os.listdir(results_dir)
    if entry.startswith("metrics_") and entry.endswith(".npy")
)

for file_name in metric_files:
    file_path = os.path.join(results_dir, file_name)

    # Best-effort: a file that cannot be read or tabulated is reported and
    # skipped so the remaining files are still shown.
    try:
        # The .npy wraps one pickled object; .item() extracts it.
        metrics_dict = np.load(file_path, allow_pickle=True).item()
        # Outer keys become rows after the transpose.
        metrics_df = pd.DataFrame(metrics_dict).transpose()

        print(f"\n===== Metrics from: {file_name} =====")
        print(metrics_df.to_string())

    except Exception as load_error:
        print(f"Error reading {file_name}: {load_error}")



===== Metrics from: metrics_ChemBERTa_10M_MLM_embeddings.npy =====
          Accuracy     AUC  Precision  Recall  F1 Micro  F1 Macro     MCC
XGBoost     0.8882  0.9637     0.8631  0.7270    0.8882    0.7609  0.7900
LightGBM    0.8838  0.9687     0.8510  0.7387    0.8838    0.7605  0.7879
CatBoost    0.8713  0.9623     0.7504  0.7811    0.8713    0.7615  0.7763

===== Metrics from: metrics_ChemBERTa_10M_MTR_embeddings.npy =====
          Accuracy     AUC  Precision  Recall  F1 Micro  F1 Macro     MCC
XGBoost     0.8909  0.9662     0.8679  0.7331    0.8909    0.7666  0.7950
LightGBM    0.8815  0.9660     0.7706  0.8538    0.8815    0.8070  0.7908
CatBoost    0.8975  0.9752     0.8332  0.8676    0.8975    0.8487  0.8170

===== Metrics from: metrics_ChemBERTa_5M_MTR_embeddings.npy =====
          Accuracy     AUC  Precision  Recall  F1 Micro  F1 Macro     MCC
XGBoost     0.8900  0.9782     0.8739  0.7030    0.8900    0.7281  0.7931
LightGBM    0.8855  0.9666     0.7496  0.7910    0.8855  

**PARAMS**

In [2]:
### ====== BEST PARAMS FILES (json) ====== ###
# Paths of the tuned-hyperparameter files for the baseline runs. Both are JSON
# files (see the .json extension), one per embedding model.
molformer_params_path = "/home/raghvendra2/Molformer_Finetuning/classification_model_flavor/Results_baseline/best_params_MolFormer_Base_embeddings.json"
chemberta_params_path = "/home/raghvendra2/Molformer_Finetuning/classification_model_flavor/Results_baseline/best_params_ChemBERTa_Base_embeddings.json"


In [4]:
import pandas as pd
import json

# Parse the two best-hyperparameter JSON files into plain Python objects.
with open(molformer_params_path, "r") as params_file:
    molformer_best_params = json.loads(params_file.read())

with open(chemberta_params_path, "r") as params_file:
    chemberta_best_params = json.loads(params_file.read())

# Wrapping each object in a one-element list gives a single-row frame with one
# column per top-level key; each cell holds that key's value as-is.
molformer_params_df = pd.DataFrame([molformer_best_params])
chemberta_params_df = pd.DataFrame([chemberta_best_params])

# Print both frames in full, MolFormer first.
for _banner, _table in (
    ("\n===== MolFormer Best Hyperparameters =====", molformer_params_df),
    ("\n===== ChemBERTa Best Hyperparameters =====", chemberta_params_df),
):
    print(_banner)
    print(_table.to_string())


===== MolFormer Best Hyperparameters =====
                                                                      LightGBM                                                                     CatBoost
0  {'max_depth': 4, 'learning_rate': 0.06276426483034911, 'n_estimators': 293}  {'max_depth': 8, 'learning_rate': 0.08569847429722578, 'n_estimators': 231}

===== ChemBERTa Best Hyperparameters =====
                                                                       LightGBM                                                                     CatBoost
0  {'max_depth': 5, 'learning_rate': 0.055931730180030954, 'n_estimators': 310}  {'max_depth': 6, 'learning_rate': 0.01976065986026186, 'n_estimators': 403}


**LIGHTGBM MODEL**

In [21]:
### ====== LIGHTGBM MODEL FILES ====== ###
# Paths of the saved LightGBM models for the baseline runs, one per embedding
# model. The files carry a .pkl extension.
# NOTE(review): the on-disk format is assumed to be a plain pickle — confirm
# against the code that wrote these files.
molformer_lgb_model_path = "/home/raghvendra2/Molformer_Finetuning/classification_model_flavor/Results_baseline/lgb_model_MolFormer_Base_embeddings.pkl"
chemberta_lgb_model_path = "/home/raghvendra2/Molformer_Finetuning/classification_model_flavor/Results_baseline/lgb_model_ChemBERTa_Base_embeddings.pkl"

In [22]:
import lightgbm as lgb
import catboost as cb
import pickle

In [23]:
def _load_pickled_model(model_path):
    """Return the object unpickled from ``model_path``.

    Args:
        model_path: Filesystem path of the pickle file to read.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.

    Raises:
        pickle.UnpicklingError: If the file is not a valid pickle stream. The
            message names the offending file and keeps the original error text.
    """
    # SECURITY: pickle.load can execute arbitrary code while deserializing;
    # only point this at model files you produced yourself.
    with open(model_path, "rb") as model_file:
        try:
            return pickle.load(model_file)
        except pickle.UnpicklingError as exc:
            # Re-raise the same exception type, but say which file failed and
            # what to check next instead of only reporting the bad load key.
            raise pickle.UnpicklingError(
                f"Could not unpickle {model_path!r}: {exc} "
                "The file may have been written by a different serializer than "
                "pickle.dump, or it may be corrupted; load it with the same "
                "library that saved it."
            ) from exc


# Load the two saved LightGBM models.
molformer_lgb_model = _load_pickled_model(molformer_lgb_model_path)
chemberta_lgb_model = _load_pickled_model(chemberta_lgb_model_path)

# get_params() is called on the loaded objects.
# NOTE(review): presumably these are scikit-learn-style LightGBM estimators — confirm.
molformer_params_dict = molformer_lgb_model.get_params()
chemberta_params_dict = chemberta_lgb_model.get_params()

# Two-column (Parameter, Value) tables, one row per parameter.
# NOTE(review): these names overwrite the *_params_df frames built from the
# best-params JSON files in the PARAMS section.
molformer_params_df = pd.DataFrame(list(molformer_params_dict.items()), columns=["Parameter", "Value"])
chemberta_params_df = pd.DataFrame(list(chemberta_params_dict.items()), columns=["Parameter", "Value"])

print("\n===== MolFormer LightGBM Model Parameters =====")
print(molformer_params_df.to_string(index=False))

print("\n===== ChemBERTa LightGBM Model Parameters =====")
print(chemberta_params_df.to_string(index=False))

UnpicklingError: invalid load key, '\x08'.

**CATBOOST MODEL**

In [12]:
### ====== CATBOOST MODEL FILES (cbm) ====== ###
# Paths of the saved CatBoost models (.cbm files) for the baseline runs, one
# per embedding model.
chemberta_cat_model_path = "/home/raghvendra2/Molformer_Finetuning/classification_model_flavor/Results_baseline/cat_model_ChemBERTa_Base_embeddings.cbm"
molformer_cat_model_path = "/home/raghvendra2/Molformer_Finetuning/classification_model_flavor/Results_baseline/cat_model_MolFormer_Base_embeddings.cbm"

In [13]:
import catboost as cb

# Create empty classifiers and populate them from the saved .cbm files.
molformer_cat_model = cb.CatBoostClassifier()
molformer_cat_model.load_model(molformer_cat_model_path)

chemberta_cat_model = cb.CatBoostClassifier()
chemberta_cat_model.load_model(chemberta_cat_model_path)

# Parameter dictionaries as reported by the loaded models.
molformer_cat_params_dict = molformer_cat_model.get_params()
chemberta_cat_params_dict = chemberta_cat_model.get_params()

# Two-column (Parameter, Value) tables, one row per parameter.
_param_columns = ["Parameter", "Value"]
molformer_cat_params_df = pd.DataFrame(list(molformer_cat_params_dict.items()), columns=_param_columns)
chemberta_cat_params_df = pd.DataFrame(list(chemberta_cat_params_dict.items()), columns=_param_columns)

# Print both tables without the integer index, MolFormer first.
for _banner, _table in (
    ("\n===== MolFormer CatBoost Model Parameters =====", molformer_cat_params_df),
    ("\n===== ChemBERTa CatBoost Model Parameters =====", chemberta_cat_params_df),
):
    print(_banner)
    print(_table.to_string(index=False))


===== MolFormer CatBoost Model Parameters =====
         Parameter      Value
             depth          8
auto_class_weights   Balanced
     learning_rate   0.085698
        iterations        231
           verbose          0
     loss_function MultiClass

===== ChemBERTa CatBoost Model Parameters =====
         Parameter      Value
             depth          6
auto_class_weights   Balanced
     learning_rate   0.019761
        iterations        403
           verbose          0
     loss_function MultiClass
