In [1]:
import pycaret
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from pycaret.classification import setup, compare_models
from pycaret.classification import *
from sklearn.metrics import balanced_accuracy_score, matthews_corrcoef
from sklearn.metrics import confusion_matrix

import sklearn

In [2]:
#Define the type of classifiers to build

model_list = ['et','ada','lr','ridge','gbc','rf','dt','lightgbm','svm','lda','knn','nb','qda','dummy','xgboost']



In [3]:
#Define random seed(s)

#session_ids=[3,42,121,198]
session_ids=[16] 

In [4]:
#Define evaluation metrics: balanced accuracy, balanced MCC

#Function for balanced accuracy
def balanced_accuracy(y_true, y_pred):
    return balanced_accuracy_score(y_true, y_pred)

#Function for balanced MCC
def balanced_mcc(y_true, y_pred):
    # Get confusion matrix components
    TN, FP, FN, TP = confusion_matrix(y_true, y_pred).ravel()
    
    # Calculate sensitivity, specificity, and prevalence
    sensitivity = TP / (TP + FN)
    specificity = TN / (TN + FP)
    positive_prevalence = (TP + FN) / (TP + FP + TN + FN)
    
    # Calculate Balanced MCC
    numerator = sensitivity + specificity - 1
    denominator = np.sqrt(
        (sensitivity + (1-specificity) * ((1-positive_prevalence) / positive_prevalence )) * 
        (specificity + (1-sensitivity) * (positive_prevalence / (1-positive_prevalence)))
    )

    if denominator == 0:
        return 0
    elif numerator == 0:
        return 0
    else:
        return numerator / denominator

Change variables to define which dataset is used for training (compound set, target label, features)

In [8]:
dataset = "kadar"       #kadar, combined
transport = "influx"       #influx, efflux, pampa, bbb
feature = "md"          #md or fp
feature_set = "all"        #all or sub

In [9]:
#Define and load training set (here the variables are used to define the file path that are later used for defining the saved file names as well)
file_name = f'../../data_preparation/variable_{feature}_generation/{dataset}_{feature}/{dataset}_train_{transport}_{feature}_{feature_set}.csv'
df = pd.read_csv(file_name, index_col=0)

In [10]:
#Check the dataset for cleaning
df.head()

Unnamed: 0.1,Unnamed: 0,SMILES_raw,status_influx,papyrus_SMILES,papyrus_inchi_key,inchi_connectivity,bluedesc - NumberOfAtoms,bluedesc - NumberOfB,bluedesc - NumberOfBr,bluedesc - NumberOfC,...,padel - AMW,padel - WTPT-1,padel - WTPT-2,padel - WTPT-3,padel - WTPT-4,padel - WTPT-5,padel - WPATH,padel - WPOL,padel - XLogP,padel - Zagreb
0,0,CO[C@H]1/C=C/O[C@@]2(C)Oc3c(C)c(O)c4c(O)c(cc(O...,Non-substrate,CO[C@H]1/C=C/O[C@@]2(C)Oc3c(C)c(O)c4c(O)c(cc(O...,HJYYPODYNSCCOU-WXCWORQHSA-N,HJYYPODYNSCCOU,97,0,0,0,...,7.188761,99.445635,1.988913,35.188376,32.142163,3.046214,8863.0,99.0,3.29,266.0
1,1,CC(C)N(CCC(C(N)=O)(c1ccccc1)c1ccccn1)C(C)C,Non-substrate,CC(C)N(CCC(C(N)=O)(c1ccccc1)c1ccccn1)C(C)C,UVTNFZQICZKOEM-UHFFFAOYSA-N,UVTNFZQICZKOEM,54,0,0,0,...,6.282057,49.574275,1.982971,11.229089,2.443494,8.785594,1372.0,42.0,2.717,124.0
2,2,OCCN(CCO)c1nc(N2CCCCC2)c2nc(N(CCO)CCO)nc(N3CCC...,Non-substrate,OCCN(CCO)c1nc(N2CCCCC2)c2nc(N(CCO)CCO)nc(N3CCC...,IZEKFCXSFNUWAM-UHFFFAOYSA-N,IZEKFCXSFNUWAM,76,0,0,0,...,6.635753,73.777166,2.049366,36.581342,9.782608,26.798734,3652.0,60.0,-0.166,182.0
3,3,CN(C)CCCN1c2ccccc2CCc2ccccc21,Non-substrate,CN(C)CCCN1c2ccccc2CCc2ccccc21,BCGWQEUPMDMJNV-UHFFFAOYSA-N,BCGWQEUPMDMJNV,45,0,0,0,...,6.226532,43.271083,2.060528,6.563459,0.0,6.563459,882.0,35.0,2.6,108.0
4,4,Cc1ccccc1C(OCCN(C)C)c1ccccc1,Non-substrate,Cc1ccccc1C(OCCN(C)C)c1ccccc1,QVYRGXJJSLMXQH-UHFFFAOYSA-N,QVYRGXJJSLMXQH,43,0,0,0,...,6.259953,40.155977,2.007799,6.063001,3.080516,2.982485,825.0,27.0,2.314,96.0


In [28]:
#Define target column, and code classes

stat_col=f"status_{transport}"

df.rename(columns={"Classification":stat_col}, inplace=True)

if transport == "influx":
    df[stat_col] = df[stat_col].replace({'Substrate':1, 'Non-substrate':0})
elif transport == "efflux":
    df[stat_col] = df[stat_col].replace({'Substrate':1, 'Non-substrate':0})
elif transport == "pampa":
    df[stat_col] = df[stat_col].replace({'high':1, 'low':0})
elif transport == "bbb":
    df[stat_col] = df[stat_col].replace({'BBB+':1, 'BBB-':0})

In [29]:
#Clean the dataset
if transport == "pampa":
    df_train = df.drop(['Unnamed: 0','Phenotype','Permeability','SMILES_raw','papyrus_SMILES','papyrus_inchi_key','inchi_connectivity'], axis=1)
else:
    df_train = df.drop(['Unnamed: 0','SMILES_raw','papyrus_SMILES','papyrus_inchi_key','inchi_connectivity'], axis=1)
df_train.head()

Unnamed: 0,status_efflux,bluedesc - NumberOfAtoms,bluedesc - NumberOfB,bluedesc - NumberOfBr,bluedesc - NumberOfC,bluedesc - NumberOfCl,bluedesc - NumberOfF,bluedesc - NumberOfHal,bluedesc - NumberOfI,bluedesc - NumberOfN,...,padel - AMW,padel - WTPT-1,padel - WTPT-2,padel - WTPT-3,padel - WTPT-4,padel - WTPT-5,padel - WPATH,padel - WPOL,padel - XLogP,padel - Zagreb
0,0,70,0,0,0,0,0,0,0,0,...,6.946616,73.359952,2.037776,21.219289,8.199115,13.020174,4488.0,60.0,2.236,186.0
1,1,69,0,0,0,0,0,0,0,0,...,7.003655,74.394447,2.066512,18.503656,8.231232,10.272424,4228.0,64.0,3.338,194.0
2,1,68,0,0,0,0,0,0,0,0,...,7.121286,74.394447,2.066512,21.540021,8.231232,13.308789,4228.0,64.0,1.879,194.0
3,1,79,0,0,0,0,0,0,0,0,...,7.231386,86.012168,2.047909,30.331394,13.615352,16.716042,6867.0,75.0,1.72,222.0
4,1,68,0,0,0,0,0,0,0,0,...,7.121286,74.395428,2.06654,21.544872,8.231322,13.31355,4147.0,64.0,1.879,194.0


In [31]:
#Build 15 classifiers with all random seed

for s_id in session_ids:
    print(f"Setting up PyCaret session:  ID-{s_id}, Transport - {transport}")
    target_col =f"status_{transport}"

    # Setup the environment with the specific session ID
    grid = setup(data=df_train, 
             target=target_col, 
             session_id=s_id,
             html=True, 
             verbose=True, 
             fold=5, 
             data_split_shuffle=True,
             remove_multicollinearity=True,  
             multicollinearity_threshold=0.9, 
             low_variance_threshold=0.05,
    )
    


    #Get unprocessed training
    train_raw = get_config('X_train')
    train_raw_inchi = pd.merge(train_raw, df[["inchi_connectivity"]], left_index=True, right_index=True, how='left')
    train_raw_file_name = f'{dataset}/experiments/{dataset}_{feature}_{feature_set}_{transport}_raw_train_16.csv'
    train_raw_inchi.to_csv(train_raw_file_name, index=False)

     #Get unprocessed test
    test_raw = get_config('X_test')
    test_raw_inchi = pd.merge(test_raw, df[["inchi_connectivity",target_col,'papyrus_SMILES']], left_index=True, right_index=True, how='left')
    test_raw_file_name = f'{dataset}/experiments/{dataset}_{feature}_{feature_set}_{transport}_raw_test_16.csv'
    test_raw_inchi.to_csv(test_raw_file_name, index=False)



    # Add the custom metrics to pycaret
    add_metric('balanced_acc', 'Balanced Accuracy', balanced_accuracy, target='pred')
    add_metric('balanced_mcc', 'Balanced MCC', balanced_mcc, greater_is_better=True, target='pred')
    

    # Comparing all models 
    models_comparison = compare_models(sort="Balanced MCC", n_select=16, exclude='catboost')
    
    #Saving all models
    for model in models_comparison:
        if feature == "maccs":
            model_name = f"models_maccs_only/{transport}/{dataset}_{transport}_maccs_{model.__class__.__name__}_session_{s_id}"
        else:
            model_name = f"models_{feature_set}_{feature}/{transport}/{dataset}_{transport}__{feature}_{feature_set}_{model.__class__.__name__}_session_{s_id}"
        save_model(model, model_name)
        
       
    # Save comparison metrics for each session as a CSV
    metrics_df = pull()
    metrics_filename = f"metrics/{dataset}_{feature}_{feature_set}_{transport}_train_raw_metrics_session_{s_id}.csv"
    metrics_df.to_csv(metrics_filename)

Setting up PyCaret session:  ID-3, Transport - efflux


Unnamed: 0,Description,Value
0,Session id,3
1,Target,status_efflux
2,Target type,Binary
3,Original data shape,"(3632, 3195)"
4,Transformed data shape,"(3632, 391)"
5,Transformed train set shape,"(2542, 391)"
6,Transformed test set shape,"(1090, 391)"
7,Numeric features,3194
8,Rows with missing values,0.1%
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Balanced Accuracy,Balanced MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8179,0.8621,0.9269,0.8422,0.8823,0.4827,0.4958,0.7195,0.4958,4.046
xgboost,Extreme Gradient Boosting,0.8139,0.8499,0.9248,0.8394,0.8799,0.4709,0.4831,0.7139,0.4831,4.196
et,Extra Trees Classifier,0.8139,0.8647,0.9269,0.8381,0.8802,0.4684,0.4807,0.712,0.4807,3.63
rf,Random Forest Classifier,0.812,0.857,0.9354,0.8312,0.8801,0.4514,0.4701,0.7005,0.4701,3.962
lda,Linear Discriminant Analysis,0.797,0.8049,0.8938,0.8411,0.8664,0.4443,0.4495,0.7097,0.4495,3.572
ridge,Ridge Classifier,0.7982,0.0,0.9056,0.835,0.8686,0.4356,0.4443,0.7013,0.4443,3.724
gbc,Gradient Boosting Classifier,0.8005,0.8397,0.9285,0.8238,0.8728,0.4176,0.436,0.6851,0.436,7.27
ada,Ada Boost Classifier,0.7777,0.7891,0.8933,0.8213,0.8556,0.3763,0.384,0.6735,0.384,4.314
dt,Decision Tree Classifier,0.7278,0.6531,0.8106,0.8186,0.8144,0.3032,0.3037,0.6531,0.3037,4.064
knn,K Neighbors Classifier,0.7565,0.7238,0.92,0.7861,0.8478,0.2575,0.279,0.6089,0.279,4.788


Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Setting up PyCaret session:  ID-42, Transport - efflux


Unnamed: 0,Description,Value
0,Session id,42
1,Target,status_efflux
2,Target type,Binary
3,Original data shape,"(3632, 3195)"
4,Transformed data shape,"(3632, 393)"
5,Transformed train set shape,"(2542, 393)"
6,Transformed test set shape,"(1090, 393)"
7,Numeric features,3194
8,Rows with missing values,0.1%
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Balanced Accuracy,Balanced MCC,TT (Sec)
et,Extra Trees Classifier,0.8116,0.8636,0.9322,0.8324,0.8795,0.4541,0.4703,0.7026,0.4703,3.61
lightgbm,Light Gradient Boosting Machine,0.8061,0.8597,0.9226,0.8326,0.8752,0.4451,0.4587,0.7009,0.4587,4.044
rf,Random Forest Classifier,0.8061,0.851,0.9429,0.8207,0.8775,0.4219,0.4473,0.6826,0.4473,3.84
xgboost,Extreme Gradient Boosting,0.8013,0.8506,0.9162,0.8315,0.8717,0.4355,0.4466,0.6976,0.4466,4.208
gbc,Gradient Boosting Classifier,0.7966,0.8324,0.9274,0.8202,0.8704,0.4054,0.4237,0.6786,0.4237,6.892
ridge,Ridge Classifier,0.7868,0.0,0.8943,0.8298,0.8608,0.408,0.4134,0.6897,0.4134,3.556
lda,Linear Discriminant Analysis,0.7821,0.7938,0.8773,0.8355,0.8558,0.4107,0.413,0.6961,0.413,3.572
ada,Ada Boost Classifier,0.7671,0.7799,0.8864,0.8143,0.8487,0.3466,0.353,0.6595,0.353,4.178
dt,Decision Tree Classifier,0.7282,0.6519,0.8127,0.8176,0.8151,0.302,0.3023,0.6519,0.3023,3.904
knn,K Neighbors Classifier,0.7529,0.7224,0.912,0.7869,0.8447,0.2553,0.274,0.6094,0.274,4.32


Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Setting up PyCaret session:  ID-121, Transport - efflux


Unnamed: 0,Description,Value
0,Session id,121
1,Target,status_efflux
2,Target type,Binary
3,Original data shape,"(3632, 3195)"
4,Transformed data shape,"(3632, 386)"
5,Transformed train set shape,"(2542, 386)"
6,Transformed test set shape,"(1090, 386)"
7,Numeric features,3194
8,Rows with missing values,0.1%
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Balanced Accuracy,Balanced MCC,TT (Sec)
et,Extra Trees Classifier,0.8242,0.8769,0.9354,0.8435,0.887,0.4956,0.51,0.7238,0.51,3.642
xgboost,Extreme Gradient Boosting,0.8171,0.864,0.9232,0.8435,0.8815,0.484,0.4943,0.7213,0.4943,4.218
lightgbm,Light Gradient Boosting Machine,0.8179,0.8632,0.9285,0.8412,0.8826,0.4809,0.4938,0.718,0.4938,4.164
rf,Random Forest Classifier,0.8194,0.8655,0.9408,0.8354,0.8849,0.4731,0.4916,0.71,0.4916,3.838
lda,Linear Discriminant Analysis,0.7998,0.8023,0.8848,0.8501,0.8668,0.463,0.4659,0.723,0.4659,3.58
ridge,Ridge Classifier,0.8013,0.0,0.8981,0.8429,0.8695,0.455,0.4603,0.714,0.4603,3.542
gbc,Gradient Boosting Classifier,0.7998,0.828,0.9301,0.8221,0.8727,0.4121,0.43,0.6822,0.43,6.986
ada,Ada Boost Classifier,0.7762,0.8021,0.8858,0.8242,0.8537,0.3792,0.3846,0.6772,0.3846,4.196
dt,Decision Tree Classifier,0.7407,0.657,0.8335,0.8183,0.8256,0.3198,0.3207,0.657,0.3207,3.87
knn,K Neighbors Classifier,0.7526,0.7241,0.9088,0.7881,0.8441,0.2588,0.2754,0.6116,0.2754,4.364


Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Setting up PyCaret session:  ID-198, Transport - efflux


Unnamed: 0,Description,Value
0,Session id,198
1,Target,status_efflux
2,Target type,Binary
3,Original data shape,"(3632, 3195)"
4,Transformed data shape,"(3632, 393)"
5,Transformed train set shape,"(2542, 393)"
6,Transformed test set shape,"(1090, 393)"
7,Numeric features,3194
8,Rows with missing values,0.1%
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Balanced Accuracy,Balanced MCC,TT (Sec)
et,Extra Trees Classifier,0.8222,0.8803,0.9349,0.8416,0.8856,0.4904,0.5074,0.7204,0.5074,3.62
lightgbm,Light Gradient Boosting Machine,0.821,0.8665,0.9285,0.8443,0.8843,0.4924,0.5046,0.724,0.5046,4.032
xgboost,Extreme Gradient Boosting,0.8171,0.8624,0.9232,0.8435,0.8814,0.4847,0.4962,0.7213,0.4962,4.212
rf,Random Forest Classifier,0.8139,0.8686,0.9381,0.8313,0.8814,0.4567,0.4768,0.7019,0.4768,3.84
ridge,Ridge Classifier,0.7986,0.0,0.8997,0.8391,0.8681,0.4436,0.4501,0.7073,0.4501,3.532
lda,Linear Discriminant Analysis,0.7931,0.8072,0.8815,0.8449,0.8624,0.4443,0.4481,0.7132,0.4481,3.554
gbc,Gradient Boosting Classifier,0.7943,0.8381,0.9226,0.8208,0.8685,0.4019,0.4196,0.6784,0.4196,6.948
ada,Ada Boost Classifier,0.773,0.7893,0.8933,0.8165,0.853,0.3585,0.3666,0.6644,0.3666,4.194
dt,Decision Tree Classifier,0.7384,0.6694,0.8148,0.8276,0.8211,0.3346,0.3349,0.6694,0.3349,3.876
knn,K Neighbors Classifier,0.7569,0.73,0.9082,0.7924,0.8463,0.2769,0.2929,0.6202,0.2929,4.374


Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
