In [1]:
import pycaret
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from pycaret.classification import setup, compare_models
from pycaret.classification import *
from sklearn.metrics import balanced_accuracy_score, matthews_corrcoef
from sklearn.metrics import confusion_matrix

import sklearn

In [2]:
#Define the type of classifiers to build

model_list = ['et','ada','lr','ridge','gbc','rf','dt','lightgbm','svm','lda','knn','nb','qda','dummy','xgboost']



In [3]:
#Define random seed(s)

session_ids=[16] 

In [4]:
#Define evaluation metrics: balanced accuracy, balanced MCC

#Function for balanced accuracy
def balanced_accuracy(y_true, y_pred):
    return balanced_accuracy_score(y_true, y_pred)

#Function for balanced MCC
def balanced_mcc(y_true, y_pred):
    # Get confusion matrix components
    TN, FP, FN, TP = confusion_matrix(y_true, y_pred).ravel()
    
    # Calculate sensitivity, specificity, and prevalence
    sensitivity = TP / (TP + FN)
    specificity = TN / (TN + FP)
    positive_prevalence = (TP + FN) / (TP + FP + TN + FN)
    
    # Calculate Balanced MCC
    numerator = sensitivity + specificity - 1
    denominator = np.sqrt(
        (sensitivity + (1-specificity) * ((1-positive_prevalence) / positive_prevalence )) * 
        (specificity + (1-sensitivity) * (positive_prevalence / (1-positive_prevalence)))
    )

    if denominator == 0:
        return 0
    elif numerator == 0:
        return 0
    else:
        return numerator / denominator

Change variables to define which dataset is used for training (compound set, target label, features)

In [15]:
itarget = "bcl"         #bcl or mcl
feature = 'fp'         #md or fp

In [16]:
#Define and load training set (here the variables are used to define the file path that are later used for defining the saved file names as well)

file_name = f'../../data_preparation/variable_{feature}_generation/inhibitors_{feature}/inhibitors_{itarget}_{feature}_sub.csv'
df = pd.read_csv(file_name, index_col=0)

In [17]:
#Check the dataset for cleaning
df.head()

Unnamed: 0.1,Unnamed: 0,papyrus_SMILES,InChIKey,connectivity,pchembl_value_Mean,Class,chemopy - FP2_1,chemopy - FP2_2,chemopy - FP2_3,chemopy - FP2_4,...,chemopy - Avalon_503,chemopy - Avalon_504,chemopy - Avalon_505,chemopy - Avalon_506,chemopy - Avalon_507,chemopy - Avalon_508,chemopy - Avalon_509,chemopy - Avalon_510,chemopy - Avalon_511,chemopy - Avalon_512
0,0,Cc1cc(=O)c2c(o1)c(CC(C)C)c(O)c(O)c2O,ADNKLAAIRXDEOP-UHFFFAOYSA-N,ADNKLAAIRXDEOP,6.14,Non-inhibitor,0,0,0,0,...,1,0,0,1,1,0,0,0,0,0
1,1,N#CC1=C(O)c2cccc3c(Sc4ccc(Br)cc4)ccc(c23)C1=NC...,AEAIKBHAZFBIFT-UHFFFAOYSA-N,AEAIKBHAZFBIFT,5.99,Non-inhibitor,0,0,0,0,...,1,0,1,1,0,0,0,0,1,0
2,2,COC1(CC(C)C)CCN(c2ccc(C(=O)NS(=O)(=O)c3ccc(NC(...,AGGPTAKEWABCFB-UHFFFAOYSA-N,AGGPTAKEWABCFB,7.25,Inhibitor,0,0,0,0,...,0,0,1,1,0,1,0,0,0,0
3,3,O=C1c2c(cccc2)C(=O)c2c1cc(O)c(O)c2O,AHKDJQYHVWSRLT-UHFFFAOYSA-N,AHKDJQYHVWSRLT,5.88,Non-inhibitor,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
4,4,CC1=CC(C=C(Cl)c2ccc(C(=O)O)cc2)C2(C(=O)O)C=CC(...,AIPCKDBTGKGQEY-UHFFFAOYSA-N,AIPCKDBTGKGQEY,6.43,Non-inhibitor,0,0,0,0,...,0,0,1,1,0,0,1,0,1,0


In [18]:
#Code classes

df["Class"] = df["Class"].replace({'Inhibitor':1, 'Non-inhibitor':0})


In [19]:
#Clean the dataset
df_train = df.drop(['Unnamed: 0','papyrus_SMILES','InChIKey','connectivity','pchembl_value_Mean'], axis=1)

df_train.head()

Unnamed: 0,Class,chemopy - FP2_1,chemopy - FP2_2,chemopy - FP2_3,chemopy - FP2_4,chemopy - FP2_5,chemopy - FP2_6,chemopy - FP2_7,chemopy - FP2_8,chemopy - FP2_9,...,chemopy - Avalon_503,chemopy - Avalon_504,chemopy - Avalon_505,chemopy - Avalon_506,chemopy - Avalon_507,chemopy - Avalon_508,chemopy - Avalon_509,chemopy - Avalon_510,chemopy - Avalon_511,chemopy - Avalon_512
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,0,0,0
1,0,0,0,0,0,0,1,0,1,0,...,1,0,1,1,0,0,0,0,1,0
2,1,0,0,0,0,0,1,1,0,1,...,0,0,1,1,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,1,0,1,0


Random split

In [20]:
#Build 15 classifiers with all random seed

for s_id in session_ids:
    print(f"Setting up PyCaret session:  ID-{s_id}, Transport - {itarget}")
    target_col ="Class"

    # Setup the environment with the specific session ID
    grid = setup(data=df_train, 
             target=target_col, 
             session_id=s_id,
             html=True, 
             verbose=True, 
             fold=5, 
             data_split_shuffle=True,
             remove_multicollinearity=True,  
             multicollinearity_threshold=0.9, 
             low_variance_threshold=0.05,
    )
    

    #Get unprocessed training
    train_raw = get_config('X_train')
    train_raw_inchi = pd.merge(train_raw, df[["connectivity"]], left_index=True, right_index=True, how='left')
    train_raw_file_name = f'experiments/{itarget}_{feature}_raw_train_16.csv'
    train_raw_inchi.to_csv(train_raw_file_name, index=False)


     #Get unprocessed test
    test_raw = get_config('X_test')
    test_raw_inchi = pd.merge(test_raw, df[['connectivity',target_col,'papyrus_SMILES']], left_index=True, right_index=True, how='left')
    test_raw_file_name = f'experiments/{itarget}_{feature}_raw_test_16.csv'
    test_raw_inchi.to_csv(test_raw_file_name, index=False)


    # Add the custom metrics to pycaret
    add_metric('balanced_acc', 'Balanced Accuracy', balanced_accuracy, target='pred')
    add_metric('balanced_mcc', 'Balanced MCC', balanced_mcc, greater_is_better=True, target='pred')
    

    # Comparing all models 
    models_comparison = compare_models(sort="Balanced MCC", n_select=16, exclude='catboost')
    
    #Saving all models
    for model in models_comparison:
        model_name = f"models/{itarget}/{itarget}_random_{feature}_{model.__class__.__name__}_session_{s_id}"
        save_model(model, model_name)
        
       
    # Save comparison metrics for each session as a CSV
    metrics_df = pull()
    metrics_filename = f"metrics/{itarget}_random_{feature}_train_raw_metrics_session_{s_id}.csv"
    metrics_df.to_csv(metrics_filename)

Setting up PyCaret session:  ID-16, Transport - bcl


Unnamed: 0,Description,Value
0,Session id,16
1,Target,Class
2,Target type,Binary
3,Original data shape,"(428, 14432)"
4,Transformed data shape,"(428, 6109)"
5,Transformed train set shape,"(299, 6109)"
6,Transformed test set shape,"(129, 6109)"
7,Numeric features,14431
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Balanced Accuracy,Balanced MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8795,0.903,0.7229,0.8488,0.7714,0.6914,0.7026,0.8331,0.7026,4.818
et,Extra Trees Classifier,0.8763,0.9056,0.7124,0.8515,0.7673,0.6846,0.6957,0.8279,0.6957,4.554
rf,Random Forest Classifier,0.8729,0.8983,0.6784,0.8651,0.7524,0.6695,0.6841,0.8157,0.6841,4.578
gbc,Gradient Boosting Classifier,0.8697,0.9077,0.7131,0.8357,0.7626,0.6738,0.683,0.8234,0.683,5.144
xgboost,Extreme Gradient Boosting,0.8695,0.9104,0.7007,0.8408,0.7522,0.6659,0.68,0.8197,0.68,5.218
svm,SVM - Linear Kernel,0.8662,0.0,0.7229,0.8224,0.756,0.6657,0.6793,0.8236,0.6793,4.576
knn,K Neighbors Classifier,0.8695,0.8935,0.7118,0.8256,0.7562,0.6688,0.6787,0.8228,0.6787,4.828
ada,Ada Boost Classifier,0.8631,0.8995,0.7582,0.7937,0.7639,0.6683,0.6787,0.8319,0.6787,4.716
ridge,Ridge Classifier,0.8629,0.0,0.7235,0.806,0.7507,0.6575,0.6686,0.8216,0.6686,4.528
lr,Logistic Regression,0.8629,0.9172,0.7118,0.809,0.745,0.6531,0.6651,0.8181,0.6651,5.156


Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved


Temporal split

In [21]:
split = "temporal"

In [22]:
#Load the train and test data
file_name_train = f'splitted_data/inhibitors_{itarget}_{split}_{feature}_train.csv'
df_train = pd.read_csv(file_name_train, index_col=0)
df_train.reset_index(drop=True, inplace=True)

file_name_test = f'splitted_data/inhibitors_{itarget}_{split}_{feature}_test.csv'
df_test = pd.read_csv(file_name_test,index_col=0)
df_test.reset_index(drop=True, inplace=True)

In [23]:
#dataframe cleaning for train

columns_to_drop = ['papyrus_SMILES', 'InChIKey', 'inchi_connectivity','pchembl_value_Mean']
df = df_train.drop(columns=columns_to_drop)
testing = df_test.drop(columns=columns_to_drop)


In [24]:
#classification fix
df['Class'] = df['Class'].replace({'Inhibitor': 1, 'Non-inhibitor': 0})
df['Class'] = df['Class'].astype(int)

testing['Class'] = testing['Class'].replace({'Inhibitor': 1, 'Non-inhibitor': 0})
testing['Class'] = testing['Class'].astype(int)


In [25]:
for s_id in session_ids:
    print(f"Setting up PyCaret session:  ID-{s_id}, Feature -{feature}, Target - {itarget}, Split - {split}" )

    # Setup the environment with the specific session ID
    grid = setup(data=df, 
             target='Class', 
             session_id=16,
             html=True, 
             verbose=True, 
             fold=5, 
             remove_multicollinearity=True,  
             multicollinearity_threshold=0.9, 
             low_variance_threshold=0.05,
             test_data= testing,
             index=False
    )
    


    # Add the custom metrics to pycaret
    add_metric('balanced_acc', 'Balanced Accuracy', balanced_accuracy, target='pred')
    add_metric('balanced_mcc', 'Balanced MCC', balanced_mcc, greater_is_better=True, target='pred')
    

    # Comparing all models 
    models_comparison = compare_models(sort="Balanced MCC", n_select=16, exclude='catboost')
    
    #Saving all models
    for model in models_comparison:
        model_name = f"models/{itarget}/{itarget}_{split}_{feature}_{model.__class__.__name__}_session_{s_id}"
        save_model(model, model_name)
        
       
    # Save comparison metrics for each session as a CSV
    metrics_df = pull()
    metrics_filename = f"metrics/{itarget}_{split}_{feature}_train_raw_metrics_session_{s_id}.csv"
    metrics_df.to_csv(metrics_filename)

Setting up PyCaret session:  ID-16, Feature -fp, Target - bcl, Split - temporal


Unnamed: 0,Description,Value
0,Session id,16
1,Target,Class
2,Target type,Binary
3,Original data shape,"(428, 14432)"
4,Transformed data shape,"(428, 5808)"
5,Transformed train set shape,"(291, 5808)"
6,Transformed test set shape,"(137, 5808)"
7,Numeric features,14431
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Balanced Accuracy,Balanced MCC,TT (Sec)
rf,Random Forest Classifier,0.914,0.9463,0.8519,0.9023,0.873,0.8083,0.8125,0.8996,0.8125,4.382
gbc,Gradient Boosting Classifier,0.9072,0.9515,0.8619,0.8787,0.8667,0.7957,0.7997,0.8967,0.7997,4.956
et,Extra Trees Classifier,0.9003,0.9449,0.8229,0.8911,0.8532,0.7779,0.7818,0.8825,0.7818,4.344
lr,Logistic Regression,0.8968,0.9495,0.8324,0.8757,0.8493,0.7712,0.7761,0.882,0.7761,4.766
lda,Linear Discriminant Analysis,0.8935,0.9316,0.8624,0.8516,0.8512,0.7687,0.7752,0.8865,0.7752,4.3
lightgbm,Light Gradient Boosting Machine,0.8935,0.94,0.8429,0.8586,0.8472,0.7657,0.7695,0.882,0.7695,4.672
knn,K Neighbors Classifier,0.8933,0.9343,0.8424,0.8556,0.8463,0.7648,0.7676,0.8817,0.7676,4.73
svm,SVM - Linear Kernel,0.8866,0.0,0.8224,0.8607,0.8351,0.7491,0.7558,0.8717,0.7558,4.41
xgboost,Extreme Gradient Boosting,0.8866,0.9487,0.8124,0.862,0.8314,0.7465,0.7521,0.8693,0.7521,5.006
ridge,Ridge Classifier,0.8831,0.0,0.8029,0.8622,0.8273,0.7393,0.7446,0.8646,0.7446,4.414


Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved


Cluster split

In [26]:
split = "cluster"

In [27]:
#Load the train and test data
file_name_train = f'splitted_data/inhibitors_{itarget}_{split}_{feature}_train.csv'
df_train = pd.read_csv(file_name_train, index_col=0)
df_train.reset_index(drop=True, inplace=True)

file_name_test = f'splitted_data/inhibitors_{itarget}_{split}_{feature}_test.csv'
df_test = pd.read_csv(file_name_test,index_col=0)
df_test.reset_index(drop=True, inplace=True)

In [28]:
#dataframe cleaning for train

columns_to_drop = ['papyrus_SMILES', 'InChIKey', 'inchi_connectivity','pchembl_value_Mean']
df = df_train.drop(columns=columns_to_drop)
testing = df_test.drop(columns=columns_to_drop)


In [29]:
#classification fix
df['Class'] = df['Class'].replace({'Inhibitor': 1, 'Non-inhibitor': 0})
df['Class'] = df['Class'].astype(int)

testing['Class'] = testing['Class'].replace({'Inhibitor': 1, 'Non-inhibitor': 0})
testing['Class'] = testing['Class'].astype(int)


In [30]:
for s_id in session_ids:
    print(f"Setting up PyCaret session:  ID-{s_id}, Feature -{feature}, Target - {itarget}, Split - {split}" )

    # Setup the environment with the specific session ID
    grid = setup(data=df, 
             target='Class', 
             session_id=16,
             html=True, 
             verbose=True, 
             fold=5, 
             remove_multicollinearity=True,  
             multicollinearity_threshold=0.9, 
             low_variance_threshold=0.05,
             test_data= testing,
             index=False
    )
    


    # Add the custom metrics to pycaret
    add_metric('balanced_acc', 'Balanced Accuracy', balanced_accuracy, target='pred')
    add_metric('balanced_mcc', 'Balanced MCC', balanced_mcc, greater_is_better=True, target='pred')
    

    # Comparing all models 
    models_comparison = compare_models(sort="Balanced MCC", n_select=16, exclude='catboost')
    
    #Saving all models
    for model in models_comparison:
        model_name = f"models/{itarget}/{itarget}_{split}_{feature}_{model.__class__.__name__}_session_{s_id}"
        save_model(model, model_name)
        
       
    # Save comparison metrics for each session as a CSV
    metrics_df = pull()
    metrics_filename = f"metrics/{itarget}_{split}_{feature}_train_raw_metrics_session_{s_id}.csv"
    metrics_df.to_csv(metrics_filename)

Setting up PyCaret session:  ID-16, Feature -fp, Target - bcl, Split - cluster


Unnamed: 0,Description,Value
0,Session id,16
1,Target,Class
2,Target type,Binary
3,Original data shape,"(428, 14432)"
4,Transformed data shape,"(428, 5327)"
5,Transformed train set shape,"(301, 5327)"
6,Transformed test set shape,"(127, 5327)"
7,Numeric features,14431
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Balanced Accuracy,Balanced MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.9236,0.9542,0.8444,0.8963,0.868,0.8144,0.8165,0.9009,0.8165,4.644
rf,Random Forest Classifier,0.9236,0.9479,0.7889,0.9465,0.8602,0.8082,0.8149,0.885,0.8149,4.14
ridge,Ridge Classifier,0.9203,0.0,0.8333,0.9004,0.8616,0.806,0.8104,0.8954,0.8104,4.098
et,Extra Trees Classifier,0.9203,0.9422,0.8111,0.9146,0.8591,0.8038,0.8072,0.889,0.8072,4.152
lightgbm,Light Gradient Boosting Machine,0.9169,0.9461,0.8333,0.8906,0.8571,0.7989,0.8032,0.893,0.8032,4.39
lr,Logistic Regression,0.9137,0.9545,0.8222,0.8877,0.8518,0.791,0.7939,0.8875,0.7939,4.706
xgboost,Extreme Gradient Boosting,0.9137,0.949,0.8111,0.8955,0.8491,0.7889,0.7926,0.8843,0.7926,4.664
lda,Linear Discriminant Analysis,0.9104,0.9447,0.8444,0.8593,0.8497,0.786,0.7881,0.8915,0.7881,4.356
knn,K Neighbors Classifier,0.9036,0.9386,0.7778,0.8868,0.8258,0.7599,0.7653,0.8675,0.7653,4.43
svm,SVM - Linear Kernel,0.9003,0.0,0.8333,0.8366,0.8341,0.7629,0.7637,0.8811,0.7637,4.112


Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
