In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pycaret
import seaborn as sns
import sklearn
from pycaret.classification import setup, compare_models
from pycaret.classification import *
from sklearn.metrics import balanced_accuracy_score, matthews_corrcoef
from sklearn.metrics import confusion_matrix

In [None]:
# Short IDs of the 15 classifier types this notebook is meant to build.
# NOTE(review): the compare_models() calls visible in this notebook do not pass
# this list (they only exclude 'catboost') — confirm whether include=model_list
# was intended.

model_list = ['et','ada','lr','ridge','gbc','rf','dt','lightgbm','svm','lda','knn','nb','qda','dummy','xgboost']



In [None]:
# Random seed(s) for the experiments: every training loop in this notebook
# iterates over this list and tags its saved models/metrics with the seed.

session_ids=[16]

In [None]:
#Define evaluation metrics: balanced accuracy, balanced MCC

#Function for balanced accuracy
def balanced_accuracy(y_true, y_pred):
    """Return scikit-learn's balanced accuracy for the given labels.

    Thin named wrapper around ``balanced_accuracy_score`` so the score can be
    registered as a custom metric via ``add_metric``.
    """
    score = balanced_accuracy_score(y_true, y_pred)
    return score

#Function for balanced MCC
def balanced_mcc(y_true, y_pred, prevalence=None):
    """Matthews correlation coefficient expressed via sensitivity, specificity and prevalence.

    The score is computed as::

        (sens + spec - 1) / sqrt((sens + (1 - spec) * (1 - p) / p) *
                                 (spec + (1 - sens) * p / (1 - p)))

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted binary labels. The larger of the two label values
        is treated as the positive class (same ordering scikit-learn's
        ``confusion_matrix`` uses).
    prevalence : float, optional
        Positive-class prevalence ``p`` used in the formula. ``None``
        (default) uses the prevalence observed in ``y_true``, which is the
        original behaviour of this function.

    Returns
    -------
    float
        The score, or ``0.0`` when it is undefined (a class is missing from
        ``y_true``, only one label occurs at all, or the denominator is zero).

    Raises
    ------
    ValueError
        If more than two distinct labels occur, or ``prevalence`` is not
        strictly between 0 and 1.

    Notes
    -----
    With the default (empirical) prevalence the expression above is
    algebraically identical to the ordinary MCC. Passing ``prevalence=0.5``
    gives a value that does not depend on the class ratio of the data.
    NOTE(review): this function is registered as "Balanced MCC" — confirm
    whether ``prevalence=0.5`` was the intended definition.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    labels = np.unique(np.concatenate([y_true, y_pred]))
    if labels.size > 2:
        raise ValueError("balanced_mcc only supports binary labels")
    if labels.size < 2:
        # Zero or one distinct label overall: sensitivity or specificity is
        # undefined, so the score is reported as 0 instead of crashing.
        return 0.0
    negative, positive = labels

    # Count the confusion-matrix cells directly so the 2x2 layout is
    # guaranteed even when a fold or prediction contains a single class.
    TP = int(np.sum((y_true == positive) & (y_pred == positive)))
    TN = int(np.sum((y_true == negative) & (y_pred == negative)))
    FP = int(np.sum((y_true == negative) & (y_pred == positive)))
    FN = int(np.sum((y_true == positive) & (y_pred == negative)))

    n_positive = TP + FN
    n_negative = TN + FP
    if n_positive == 0 or n_negative == 0:
        # A class is absent from y_true: the rates below would be 0/0 (NaN).
        return 0.0

    # Calculate sensitivity, specificity, and prevalence
    sensitivity = TP / n_positive
    specificity = TN / n_negative
    if prevalence is None:
        positive_prevalence = n_positive / (n_positive + n_negative)
    else:
        if not 0 < prevalence < 1:
            raise ValueError("prevalence must be strictly between 0 and 1")
        positive_prevalence = prevalence

    # Calculate Balanced MCC
    numerator = sensitivity + specificity - 1
    denominator = np.sqrt(
        (sensitivity + (1 - specificity) * ((1 - positive_prevalence) / positive_prevalence)) *
        (specificity + (1 - sensitivity) * (positive_prevalence / (1 - positive_prevalence)))
    )

    if denominator == 0 or numerator == 0:
        return 0.0
    return numerator / denominator

Change variables to define which dataset is used for training (compound set, target label, features)

In [None]:
# Dataset selectors: both values are interpolated into the input file paths
# and into the names of the saved models and metric files.
itarget = "bcl"         #bcl or mcl
feature = 'fp'         #md or fp

In [None]:
# Build the path of the training CSV from `feature` and `itarget` and load it.
# The first CSV column is used as the DataFrame index; that index is later
# used to join identifier columns back onto the PyCaret train/test splits.

file_name = f'../../data_preparation/variable_{feature}_generation/inhibitors_{feature}/inhibitors_{itarget}_{feature}_sub.csv'
df = pd.read_csv(file_name, index_col=0)

In [None]:
# Preview the first rows of the raw table to see which columns need cleaning
df.head()

In [None]:
# Encode the class labels as integers (Inhibitor -> 1, Non-inhibitor -> 0).
# The explicit cast keeps the target an integer column regardless of how
# pandas infers the dtype after replace(), matching the temporal and cluster
# sections of this notebook.

df["Class"] = df["Class"].replace({'Inhibitor':1, 'Non-inhibitor':0}).astype(int)


In [None]:
# Remove the columns that must not be used as model inputs; everything that
# remains (features + 'Class') is handed to PyCaret.
non_feature_columns = [
    'Unnamed: 0',
    'papyrus_SMILES',
    'InChIKey',
    'connectivity',
    'pchembl_value_Mean',
]
df_train = df.drop(columns=non_feature_columns)

df_train.head()

Random split

In [None]:
# Random split: for every seed in session_ids, let PyCaret split the data,
# export the raw train/test partitions, train and rank all classifiers, and
# save the fitted models together with the comparison table.

# Create the output folders up front so a missing directory cannot abort the
# run after the (slow) model comparison has already finished.
for out_dir in ('experiments', f'models/{itarget}', 'metrics'):
    os.makedirs(out_dir, exist_ok=True)

for s_id in session_ids:
    print(f"Setting up PyCaret session:  ID-{s_id}, Transport - {itarget}")
    target_col = "Class"

    # Setup the environment with the specific session ID
    grid = setup(
        data=df_train,
        target=target_col,
        session_id=s_id,
        html=True,
        verbose=True,
        fold=5,
        data_split_shuffle=True,
        remove_multicollinearity=True,
        multicollinearity_threshold=0.9,
        low_variance_threshold=0.05,
    )

    # Export the training partition, joined with the 'connectivity' column on
    # the row index. The seed is part of the file name so that runs with
    # different seeds do not overwrite each other.
    train_raw = get_config('X_train')
    train_raw_inchi = pd.merge(train_raw, df[["connectivity"]], left_index=True, right_index=True, how='left')
    train_raw_file_name = f'experiments/{itarget}_{feature}_raw_train_{s_id}.csv'
    train_raw_inchi.to_csv(train_raw_file_name, index=False)

    # Export the test partition with its identifiers and class label
    test_raw = get_config('X_test')
    test_raw_inchi = pd.merge(test_raw, df[['connectivity', target_col, 'papyrus_SMILES']], left_index=True, right_index=True, how='left')
    test_raw_file_name = f'experiments/{itarget}_{feature}_raw_test_{s_id}.csv'
    test_raw_inchi.to_csv(test_raw_file_name, index=False)

    # Add the custom metrics to pycaret (needed after every setup() call)
    add_metric('balanced_acc', 'Balanced Accuracy', balanced_accuracy, target='pred')
    add_metric('balanced_mcc', 'Balanced MCC', balanced_mcc, greater_is_better=True, target='pred')

    # Compare all models; `exclude` is documented as a list of model IDs
    models_comparison = compare_models(sort="Balanced MCC", n_select=16, exclude=['catboost'])

    # Grab the comparison table right away, before any other PyCaret call
    # can replace the "last displayed" score grid that pull() returns.
    metrics_df = pull()

    # Save every trained model
    for model in models_comparison:
        model_name = f"models/{itarget}/{itarget}_random_{feature}_{model.__class__.__name__}_session_{s_id}"
        save_model(model, model_name)

    # Save comparison metrics for each session as a CSV
    metrics_filename = f"metrics/{itarget}_random_{feature}_train_raw_metrics_session_{s_id}.csv"
    metrics_df.to_csv(metrics_filename)

Temporal split

In [None]:
# Split label: selects the pre-split train/test files to load and is embedded
# in the names of the saved models and metric files.
split = "temporal"

In [None]:
# Read the pre-split train and test tables for the chosen target, split and
# feature type, then replace the stored index with a fresh 0..n-1 range.
file_name_train = f'splitted_data/inhibitors_{itarget}_{split}_{feature}_train.csv'
df_train = pd.read_csv(file_name_train, index_col=0).reset_index(drop=True)

file_name_test = f'splitted_data/inhibitors_{itarget}_{split}_{feature}_test.csv'
df_test = pd.read_csv(file_name_test, index_col=0).reset_index(drop=True)

In [None]:
# Strip the same non-feature columns from the train and the test table

columns_to_drop = [
    'papyrus_SMILES',
    'InChIKey',
    'inchi_connectivity',
    'pchembl_value_Mean',
]
df, testing = (frame.drop(columns_to_drop, axis=1) for frame in (df_train, df_test))


In [None]:
# Encode the class labels as integers in both tables
# (Inhibitor -> 1, Non-inhibitor -> 0)
class_codes = {'Inhibitor': 1, 'Non-inhibitor': 0}
for frame in (df, testing):
    frame['Class'] = frame['Class'].replace(class_codes).astype(int)


In [None]:
# Temporal split: train and rank all classifiers on the predefined train/test
# partition, once per seed in session_ids.

# Create the output folders up front so a missing directory cannot abort the
# run after the (slow) model comparison has already finished.
for out_dir in (f'models/{itarget}', 'metrics'):
    os.makedirs(out_dir, exist_ok=True)

for s_id in session_ids:
    print(f"Setting up PyCaret session:  ID-{s_id}, Feature -{feature}, Target - {itarget}, Split - {split}" )

    # Setup the environment with the specific session ID. The loop's seed is
    # passed through so the seed in the saved file names is the seed that was
    # actually used.
    grid = setup(
        data=df,
        target='Class',
        session_id=s_id,
        html=True,
        verbose=True,
        fold=5,
        remove_multicollinearity=True,
        multicollinearity_threshold=0.9,
        low_variance_threshold=0.05,
        test_data=testing,
        index=False,
    )

    # Add the custom metrics to pycaret (needed after every setup() call)
    add_metric('balanced_acc', 'Balanced Accuracy', balanced_accuracy, target='pred')
    add_metric('balanced_mcc', 'Balanced MCC', balanced_mcc, greater_is_better=True, target='pred')

    # Compare all models; `exclude` is documented as a list of model IDs
    models_comparison = compare_models(sort="Balanced MCC", n_select=16, exclude=['catboost'])

    # Grab the comparison table right away, before any other PyCaret call
    # can replace the "last displayed" score grid that pull() returns.
    metrics_df = pull()

    # Save every trained model
    for model in models_comparison:
        model_name = f"models/{itarget}/{itarget}_{split}_{feature}_{model.__class__.__name__}_session_{s_id}"
        save_model(model, model_name)

    # Save comparison metrics for each session as a CSV
    metrics_filename = f"metrics/{itarget}_{split}_{feature}_train_raw_metrics_session_{s_id}.csv"
    metrics_df.to_csv(metrics_filename)

Cluster split

In [None]:
# Split label: selects the pre-split train/test files to load and is embedded
# in the names of the saved models and metric files.
split = "cluster"

In [None]:
# Read the pre-split train and test tables for the chosen target, split and
# feature type, then replace the stored index with a fresh 0..n-1 range.
file_name_train = f'splitted_data/inhibitors_{itarget}_{split}_{feature}_train.csv'
df_train = pd.read_csv(file_name_train, index_col=0).reset_index(drop=True)

file_name_test = f'splitted_data/inhibitors_{itarget}_{split}_{feature}_test.csv'
df_test = pd.read_csv(file_name_test, index_col=0).reset_index(drop=True)

In [None]:
# Strip the same non-feature columns from the train and the test table

columns_to_drop = [
    'papyrus_SMILES',
    'InChIKey',
    'inchi_connectivity',
    'pchembl_value_Mean',
]
df, testing = (frame.drop(columns_to_drop, axis=1) for frame in (df_train, df_test))


In [None]:
# Encode the class labels as integers in both tables
# (Inhibitor -> 1, Non-inhibitor -> 0)
class_codes = {'Inhibitor': 1, 'Non-inhibitor': 0}
for frame in (df, testing):
    frame['Class'] = frame['Class'].replace(class_codes).astype(int)


In [None]:
# Cluster split: train and rank all classifiers on the predefined train/test
# partition, once per seed in session_ids.

# Create the output folders up front so a missing directory cannot abort the
# run after the (slow) model comparison has already finished.
for out_dir in (f'models/{itarget}', 'metrics'):
    os.makedirs(out_dir, exist_ok=True)

for s_id in session_ids:
    print(f"Setting up PyCaret session:  ID-{s_id}, Feature -{feature}, Target - {itarget}, Split - {split}" )

    # Setup the environment with the specific session ID. The loop's seed is
    # passed through so the seed in the saved file names is the seed that was
    # actually used.
    grid = setup(
        data=df,
        target='Class',
        session_id=s_id,
        html=True,
        verbose=True,
        fold=5,
        remove_multicollinearity=True,
        multicollinearity_threshold=0.9,
        low_variance_threshold=0.05,
        test_data=testing,
        index=False,
    )

    # Add the custom metrics to pycaret (needed after every setup() call)
    add_metric('balanced_acc', 'Balanced Accuracy', balanced_accuracy, target='pred')
    add_metric('balanced_mcc', 'Balanced MCC', balanced_mcc, greater_is_better=True, target='pred')

    # Compare all models; `exclude` is documented as a list of model IDs
    models_comparison = compare_models(sort="Balanced MCC", n_select=16, exclude=['catboost'])

    # Grab the comparison table right away, before any other PyCaret call
    # can replace the "last displayed" score grid that pull() returns.
    metrics_df = pull()

    # Save every trained model
    for model in models_comparison:
        model_name = f"models/{itarget}/{itarget}_{split}_{feature}_{model.__class__.__name__}_session_{s_id}"
        save_model(model, model_name)

    # Save comparison metrics for each session as a CSV
    metrics_filename = f"metrics/{itarget}_{split}_{feature}_train_raw_metrics_session_{s_id}.csv"
    metrics_df.to_csv(metrics_filename)