In [None]:
import pycaret
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from pycaret.classification import setup, compare_models
from pycaret.classification import tune_model
from pycaret.classification import *
from sklearn.metrics import balanced_accuracy_score, matthews_corrcoef
from sklearn.metrics import confusion_matrix

from pycaret.classification import load_model

import pickle



In [None]:
#Define the models to use for the different targets

def define_models(t):
    """Return the classifier names used for a given prediction target.

    Parameters
    ----------
    t : str
        Target key: ``"mcl"`` (models applied to fingerprints) or
        ``"bcl"`` (models applied to molecular descriptors).

    Returns
    -------
    list of str
        The three classifier names registered for ``t``. A new list is
        returned on every call, so callers may modify it freely.

    Raises
    ------
    ValueError
        If ``t`` is not one of the known target keys.
    """
    models_by_target = {
        # using fingerprints
        "mcl": ["RandomForestClassifier", 'GaussianNB', "ExtraTreesClassifier"],
        # using molecular descriptors
        "bcl": ["DecisionTreeClassifier", 'LGBMClassifier', "GradientBoostingClassifier"],
    }

    # Fail with a clear message instead of an UnboundLocalError on a bad key.
    if t not in models_by_target:
        raise ValueError(
            f"Unknown target {t!r}; expected one of {sorted(models_by_target)}"
        )

    return list(models_by_target[t])

In [None]:
#Load the model

def load_the_model(t,model_name,f):
    """Load the saved model for one target / feature set / classifier.

    The file name is built from the target key ``t``, the feature-set key
    ``f`` and ``model_name`` and is looked up under
    ``../model_building/inhibitors/models/<t>/``. Whatever ``load_model``
    (imported from ``pycaret.classification``) returns is passed back
    unchanged.
    """
    return load_model(
        f'../model_building/inhibitors/models/{t}/{t}_cluster_{f}_{model_name}_session_16'
    )

In [None]:
#Make predictions

def make_prediction(model,df):
    """Run ``predict_model`` for ``model`` on ``df`` and return its result.

    ``raw_score=True`` is passed through; the cells below read the
    ``prediction_label`` and ``prediction_score_1`` columns of the result.
    """
    return predict_model(model, data=df, raw_score=True)

In [None]:
#Create a clean dataframe for prediction-related information

def clean_df_for_info(df):
    """Return a frame holding only the compound identifier columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains at least the ``papyrus_SMILES`` and
        ``papyrus_inchi_key`` columns.

    Returns
    -------
    pandas.DataFrame
        An independent copy with just those two columns, in that order.

    Raises
    ------
    KeyError
        If either identifier column is missing from ``df``.
    """
    columns_to_keep = ['papyrus_SMILES', 'papyrus_inchi_key']

    # Copy explicitly: callers add prediction columns to the result, and
    # writing to a plain column selection triggers SettingWithCopyWarning.
    return df[columns_to_keep].copy()

MCL-1 prediction

In [None]:
#Making prediction about MCL-1 inhibitory effect
# For every fingerprint file, each model contributes a score column and a class
# column; a compound is flagged positive when more than one model predicts 1.

t = "mcl"
f="fp"

models = define_models(t)

# Load each classifier once: the models do not depend on the input file, so
# re-reading them from disk inside the file loop is wasted work.
mcl_classifiers = {m: load_the_model(t, m, f) for m in models}

# Names of the per-model class columns; the majority vote selects them by
# name rather than by their position in the frame.
mcl_class_columns = [f'{m}_pred_class_{t}_{f}' for m in models]

# Per-file results are collected here and concatenated once after the loop
# (growing a DataFrame with concat inside the loop copies it every iteration).
mcl_chunks = []

#iterate through the compounds with fingerprints files
for i in range(1,48):

    #Load the data
    file_name = f'data/subfp/small_mols_papyrus_460k_chemopy_{i}.feather'
    df_tmp=pd.read_feather(file_name)

    # Explicit copy so the new columns are written to an independent frame
    # and not to a selection of df_tmp.
    df_info_mcl_tmp = clean_df_for_info(df_tmp).copy()

    for m in models:
        predictions = make_prediction(mcl_classifiers[m], df_tmp)

        # Score column first, class column second, for every model.
        column_mod_score = f'{m}_pred_score_{t}_{f}'
        df_info_mcl_tmp[column_mod_score] = predictions['prediction_score_1'].values

        column_mod_class = f'{m}_pred_class_{t}_{f}'
        df_info_mcl_tmp[column_mod_class] = predictions['prediction_label'].values

    #majority vote: positive (1) when more than one model predicts class 1
    counts = df_info_mcl_tmp[mcl_class_columns].eq(1).sum(axis=1)
    column_maj_class = f'status_{t}'
    df_info_mcl_tmp[column_maj_class] = (counts > 1).astype(int)

    mcl_chunks.append(df_info_mcl_tmp)

df_info_mcl = pd.concat(mcl_chunks, ignore_index=True)

#Save the prediction
df_info_mcl.to_feather("inhibitor_pred_per_target/mcl_prediction.feather")

In [None]:

# Count the compounds whose majority-vote MCL-1 status equals 1
specific_value = 1
mcl_pos = df_info_mcl['status_mcl'].eq(specific_value).sum()
print(f'MCL-1 positive hits: {mcl_pos}')

In [None]:
# Total number of compounds in the MCL-1 prediction table
df_info_mcl.shape[0]

BCL-2 prediction

In [None]:
#Load the small molecules with their calculated molecular descriptors
df_submd = pd.read_feather('data/submd/small_mol_460k_submd.feather')
# Overwrite infinite values (+inf and -inf) with 0
df_submd = df_submd.replace([np.inf, -np.inf], 0)

In [None]:
#Making prediction about BCL-2 inhibitory effect
# Each model contributes a score column and a class column; a compound is
# flagged positive when more than one model predicts class 1.

t = "bcl"
f="md"

models = define_models(t)

# Explicit copy so the prediction columns are added to an independent frame
# and not to a column selection of df_submd.
df_info_bcl = clean_df_for_info(df_submd).copy()
print(f'Clean pred df: {len(df_info_bcl)}')

# Names of the per-model class columns, collected for the majority vote so it
# does not depend on column positions.
bcl_class_columns = []

for m in models:
    classifier = load_the_model(t,m,f)
    predictions = make_prediction(classifier,df_submd)

    # Score column first, class column second, for every model.
    column_mod_score = f'{m}_pred_score_{t}_{f}'
    df_info_bcl[column_mod_score] = predictions['prediction_score_1'].values

    column_mod_class = f'{m}_pred_class_{t}_{f}'
    df_info_bcl[column_mod_class] = predictions['prediction_label'].values
    bcl_class_columns.append(column_mod_class)


#majority vote: positive (1) when more than one model predicts class 1
counts = df_info_bcl[bcl_class_columns].eq(1).sum(axis=1)
column_maj_class = f'status_{t}'
df_info_bcl[column_maj_class] = (counts > 1).astype(int)


df_info_bcl.to_feather("inhibitor_pred_per_target/bcl_prediction.feather")

In [None]:
#Check for BCL-2 positive hits
# Counts the rows of the freshly computed table whose 'status_bcl' equals 1

specific_value = 1
bcl_pos = df_info_bcl['status_bcl'].eq(specific_value).sum()
print(f'BCL-2 positive hits: {bcl_pos}')

Load all the predictions

In [None]:
#Load the data
# Previously saved MCL-1 and BCL-2 prediction tables
mcl_pred_path = '460k/mcl_pred_460k.feather'
bcl_pred_path = '460k/bcl_pred_460k.feather'
mcl_info = pd.read_feather(mcl_pred_path)
bcl_info = pd.read_feather(bcl_pred_path)

#compounds = pd.read_csv('../../6_prediction_bbb/small_mols_papyrus_460k.csv', index_col=0)

In [None]:

# Table used below for the BBB filters; the first CSV column becomes the index
bbb_csv_path = '../../6_prediction_bbb/460k/small_mol_460k_bbb_pos_15_effluxsubmd.csv'
bbb_info = pd.read_csv(bbb_csv_path, index_col=0)
len(bbb_info)

In [None]:
# Preview the first five rows of the loaded MCL-1 table
mcl_info.head(n=5)

In [None]:
# Attach the InChIKey identifiers that the overlap checks below match on.
# NOTE(review): `compounds` only exists when the commented-out CSV load in the
# data-loading cell is enabled; without it the unconditional assignment raised
# NameError on a fresh kernel. The keys are therefore copied only when
# `compounds` is available.
if "compounds" in globals():
    # Column assignment aligns on index labels — assumes the prediction tables
    # share the index of `compounds`; TODO confirm.
    mcl_info["papyrus_inchi_key"] = compounds["papyrus_inchi_key"]
    bcl_info["papyrus_inchi_key"] = compounds["papyrus_inchi_key"]

# Fail early, with a clear message, if a table still lacks the key column.
for _table_name, _table in (("mcl_info", mcl_info), ("bcl_info", bcl_info)):
    if "papyrus_inchi_key" not in _table.columns:
        raise KeyError(
            f"{_table_name} has no 'papyrus_inchi_key' column and `compounds` "
            "is not loaded; enable the compounds CSV load first"
        )

Single positive

In [None]:
#Check for BCL-2 positive hits
# Counts the rows of the loaded table whose 'status_bcl' equals 1

specific_value = 1
bcl_pos = bcl_info['status_bcl'].eq(specific_value).sum()
print(f'BCL-2 positive hits: {bcl_pos}')

In [None]:
#Check for MCL-1 positive hits
# Counts the rows of the loaded table whose 'status_mcl' equals 1

specific_value = 1
mcl_pos = mcl_info['status_mcl'].eq(specific_value).sum()
print(f'MCL-1 positive hits: {mcl_pos}')

Double positive

In [None]:
#Filter for positive predictions
# Keep only the rows whose majority-vote status is 1 for each target

df_mcl_pos = mcl_info[mcl_info['status_mcl'] == 1]
df_bcl_pos = bcl_info[bcl_info['status_bcl'] == 1]

#Check for overlapping molecules
# BCL-2 positive rows whose InChIKey also appears among the MCL-1 positives
double_positive = df_bcl_pos[df_bcl_pos['papyrus_inchi_key'].isin(df_mcl_pos['papyrus_inchi_key'])]
print(f'MCL-1 and BCL-2 positive hits: {len(double_positive)}')
double_positive.to_csv('double_positive_pred_460k.csv', index=True)

Filter for BBB permeability

In [None]:
#Check for BBB permeability, MCL-1 and BCL-2 triple positive hits
# Rows of bbb_info whose InChIKey is also present in the double-positive set
in_double_positive = bbb_info['papyrus_inchi_key'].isin(double_positive['papyrus_inchi_key'])
bbb_double_pos = bbb_info[in_double_positive]
print(f'Triple positive hits: {len(bbb_double_pos)}')
bbb_double_pos.head()

In [None]:
# Write the triple-positive rows to CSV, keeping the index column
triple_positive_csv = 'triple_pos_efflux_submd.csv'
bbb_double_pos.to_csv(triple_positive_csv, index=True)

In [None]:
#Check for BBB and MCL-1 positive hits
# NOTE(review): this filter matches on 'papyrus_SMILES', whereas the double-
# and triple-positive filters match on 'papyrus_inchi_key' — confirm that the
# difference is intended.
mcl_smiles_match = bbb_info['papyrus_SMILES'].isin(df_mcl_pos['papyrus_SMILES'])
bbb_mcl_inhibitors = bbb_info[mcl_smiles_match]
print(f'BBB permeable and MCL-1 positive hits: {len(bbb_mcl_inhibitors)}')
bbb_mcl_inhibitors.to_csv("bbb_mcl_inhibitors_460k_efflux_submd.csv", index=True)

In [None]:
#Check for BBB and BCL-2 positive hits
# NOTE(review): this filter matches on 'papyrus_SMILES', whereas the double-
# and triple-positive filters match on 'papyrus_inchi_key' — confirm that the
# difference is intended.

bcl_smiles_match = bbb_info['papyrus_SMILES'].isin(df_bcl_pos['papyrus_SMILES'])
bbb_bcl_inhibitors = bbb_info[bcl_smiles_match]
print(f'BBB permeable and BCL-2 positive hits: {len(bbb_bcl_inhibitors)}')
bbb_bcl_inhibitors.to_csv("bbb_bcl_inhibitors_460k_efflux_submd.csv", index=True)
