In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pycaret
import seaborn as sns
import sklearn

from pycaret.classification import setup, compare_models
from pycaret.classification import *
from sklearn.metrics import balanced_accuracy_score, matthews_corrcoef
from sklearn.metrics import confusion_matrix

In [None]:
#PyCaret model ids of the classifier types to build

model_list = [
    'et', 'ada', 'lr', 'ridge', 'gbc',
    'rf', 'dt', 'lightgbm', 'svm', 'lda',
    'knn', 'nb', 'qda', 'dummy', 'xgboost',
]



In [None]:
#Random seed(s); each value is used as a PyCaret session id

#Alternative multi-seed setting:
#session_ids=[3,42,121,198]
session_ids = [16]

In [None]:
#Define evaluation metrics: balanced accuracy, balanced MCC

#Balanced accuracy as a plain (y_true, y_pred) function
def balanced_accuracy(y_true, y_pred):
    """Return scikit-learn's ``balanced_accuracy_score`` for the given labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.
    """
    score = balanced_accuracy_score(y_true, y_pred)
    return score

#Function for balanced MCC
def balanced_mcc(y_true, y_pred):
    """Return the balanced Matthews correlation coefficient for binary labels.

    The score is derived from sensitivity, specificity and the prevalence of
    the positive class. Labels are ordered as ``confusion_matrix`` orders
    them, so the second (larger) label is treated as the positive class.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.

    Returns
    -------
    float
        The balanced MCC. 0.0 is returned whenever the score is undefined:
        only one class occurs in the inputs, ``y_true`` contains a single
        class, or the numerator/denominator of the formula is zero.

    Raises
    ------
    ValueError
        If the inputs do not describe a binary problem (empty input or more
        than two distinct labels).
    """
    cm = confusion_matrix(y_true, y_pred)

    # A single label shared by y_true and y_pred yields a 1x1 matrix; the
    # score is undefined there, so report "no skill" instead of failing on
    # the 4-way unpack below.
    if cm.shape == (1, 1):
        return 0.0
    if cm.shape != (2, 2):
        raise ValueError(
            f"balanced_mcc expects binary labels, got a confusion matrix of shape {cm.shape}"
        )

    # Plain Python ints: zero denominators are handled explicitly below
    # rather than silently turning into NaN/inf.
    TN, FP, FN, TP = (int(count) for count in cm.ravel())

    actual_positives = TP + FN
    actual_negatives = TN + FP
    # With only one class in y_true, sensitivity or specificity (and the
    # prevalence ratios) would divide by zero.
    if actual_positives == 0 or actual_negatives == 0:
        return 0.0

    # Calculate sensitivity, specificity, and prevalence
    sensitivity = TP / actual_positives
    specificity = TN / actual_negatives
    positive_prevalence = actual_positives / (actual_positives + actual_negatives)

    # Calculate Balanced MCC
    numerator = sensitivity + specificity - 1
    denominator = np.sqrt(
        (sensitivity + (1 - specificity) * ((1 - positive_prevalence) / positive_prevalence)) *
        (specificity + (1 - sensitivity) * (positive_prevalence / (1 - positive_prevalence)))
    )

    if denominator == 0 or numerator == 0:
        return 0.0
    return numerator / denominator

Change the variables below to select which dataset is used for training (compound set, target label, and feature type).

In [None]:
# Compound set -- options: kadar, combined
dataset = "kadar"
# Target label -- options: influx, efflux, pampa, bbb
transport = "influx"
# Feature type -- options: md or fp
feature = "md"
# Feature selection -- options: all or sub
feature_set = "all"

In [None]:
#Locate and read the training set. The path is built from the configuration
#variables (dataset, transport, feature, feature_set), which are also used
#later to name the saved files.
file_name = (
    f'../../data_preparation/variable_{feature}_generation/'
    f'{dataset}_{feature}/'
    f'{dataset}_train_{transport}_{feature}_{feature_set}.csv'
)
# The first CSV column is used as the DataFrame index
df = pd.read_csv(file_name, index_col=0)

In [None]:
#Preview the first five rows to see which columns need cleaning
df.head(n=5)

In [None]:
#Name the target column after the transport type and code its classes as 1/0

stat_col = f"status_{transport}"

df.rename(columns={"Classification": stat_col}, inplace=True)

# Class label -> integer code for each supported transport type
class_codes = {
    "influx": {'Substrate': 1, 'Non-substrate': 0},
    "efflux": {'Substrate': 1, 'Non-substrate': 0},
    "pampa": {'high': 1, 'low': 0},
    "bbb": {'BBB+': 1, 'BBB-': 0},
}

# Any other transport value leaves the target column as it was loaded
if transport in class_codes:
    df[stat_col] = df[stat_col].replace(class_codes[transport])

In [None]:
#Clean the dataset: drop the identifier columns, plus two extra columns that
#are only dropped for the pampa data
pampa_only = ['Phenotype', 'Permeability'] if transport == "pampa" else []
columns_to_drop = [
    'Unnamed: 0',
    *pampa_only,
    'SMILES_raw',
    'papyrus_SMILES',
    'papyrus_inchi_key',
    'inchi_connectivity',
]
df_train = df.drop(columns=columns_to_drop)
df_train.head()

In [None]:
#Build the classifiers once for every random seed in session_ids.
#Per seed this cell: sets up a PyCaret session, writes the train/test split
#to CSV, registers the custom metrics, compares all models, saves every
#returned model and writes the comparison metrics to CSV.

for s_id in session_ids:
    print(f"Setting up PyCaret session:  ID-{s_id}, Transport - {transport}")
    target_col = f"status_{transport}"

    # Setup the environment with the specific session ID
    grid = setup(
        data=df_train,
        target=target_col,
        session_id=s_id,
        html=True,
        verbose=True,
        fold=5,
        data_split_shuffle=True,
        remove_multicollinearity=True,
        multicollinearity_threshold=0.9,
        low_variance_threshold=0.05,
    )

    # The seed is part of the raw split file names so that runs with several
    # session ids do not overwrite each other's files.
    train_raw_file_name = f'{dataset}/experiments/{dataset}_{feature}_{feature_set}_{transport}_raw_train_{s_id}.csv'
    test_raw_file_name = f'{dataset}/experiments/{dataset}_{feature}_{feature_set}_{transport}_raw_test_{s_id}.csv'
    metrics_filename = f"metrics/{dataset}_{feature}_{feature_set}_{transport}_train_raw_metrics_session_{s_id}.csv"

    # Create the output folders up front so a missing directory cannot abort
    # the run after the expensive model comparison has finished.
    for out_path in (train_raw_file_name, test_raw_file_name, metrics_filename):
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

    #Get unprocessed training split and attach the compound identifier (joined on the index)
    train_raw = get_config('X_train')
    train_raw_inchi = pd.merge(train_raw, df[["inchi_connectivity"]], left_index=True, right_index=True, how='left')
    train_raw_inchi.to_csv(train_raw_file_name, index=False)

    #Get unprocessed test split and attach identifier, target and SMILES (joined on the index)
    test_raw = get_config('X_test')
    test_raw_inchi = pd.merge(test_raw, df[["inchi_connectivity", target_col, 'papyrus_SMILES']], left_index=True, right_index=True, how='left')
    test_raw_inchi.to_csv(test_raw_file_name, index=False)

    # Add the custom metrics to pycaret
    add_metric('balanced_acc', 'Balanced Accuracy', balanced_accuracy, target='pred')
    add_metric('balanced_mcc', 'Balanced MCC', balanced_mcc, greater_is_better=True, target='pred')

    # Comparing all models; `exclude` is passed as a list of model ids
    models_comparison = compare_models(sort="Balanced MCC", n_select=16, exclude=['catboost'])

    # Grab the comparison grid right away, before any later PyCaret call can
    # replace the most recent score grid returned by pull().
    metrics_df = pull()

    #Saving all models
    for model in models_comparison:
        if feature == "maccs":
            model_name = f"models_maccs_only/{transport}/{dataset}_{transport}_maccs_{model.__class__.__name__}_session_{s_id}"
        else:
            model_name = f"models_{feature_set}_{feature}/{transport}/{dataset}_{transport}__{feature}_{feature_set}_{model.__class__.__name__}_session_{s_id}"
        os.makedirs(os.path.dirname(model_name), exist_ok=True)
        save_model(model, model_name)

    # Save comparison metrics for each session as a CSV
    metrics_df.to_csv(metrics_filename)