#  CDK2 Ligands agaist CDK2 and FXa proteins
### Validation Method

In [1]:
import pandas as pd
import numpy as np
import glob, sys, os
sys.path.append('..')

In [2]:
from modules.plotting_metrics import PlotMetric
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white', context='talk', font_scale=0.8)

## Evaluations using DEKOIS Datasets

#### FXa 

#### CDK2

In [3]:
df_dks_CDK2 = pd.read_csv('../2_Docking_analysis/DEKOIS2_VINARDO_403_prots_1240_mols.csv', index_col = 'ligand')
df_dks_CDK2 = df_dks_CDK2.reset_index().drop_duplicates(subset='ligand', keep='first').set_index('ligand')
# dekois_dksc, y_true_dekois = drop_duplicated_mols(dekois_dksc, 'DEKOIS')
y_true_CDK2 = df_dks_CDK2.activity
df_dks_CDK2.drop('activity', axis=1, inplace=True)
df_dks_CDK2.index.name = 'name'
df_dks_CDK2.fillna(-7, inplace=True)
df_dks_CDK2.shape

(1240, 402)

In [4]:
df_dks_FXA =pd.read_csv('../../../FXa/ANALISIS/Cross_Docking_using_CDK2_mols/DEKOIS_VINARDO_137_prots_1240_mols.csv', index_col = 'ligand') 
df_dks_FXA = df_dks_FXA.reset_index().drop_duplicates(subset='ligand', keep='first').set_index('ligand')
# dekois_dksc, y_true_dekois = drop_duplicated_mols(dekois_dksc, 'DEKOIS')
y_true_FXA = df_dks_FXA.activity
df_dks_FXA.drop('activity', axis=1, inplace=True)
df_dks_FXA.index.name = 'name'
df_dks_FXA.shape

(1240, 136)

In [5]:
(y_true_FXA == y_true_CDK2).all()

True

## Scaffold Splitting

In [6]:
#*************************************************
# Functions to compute stratify scaffold splitting
#*************************************************
sys.path.append('../2_Docking_analysis/')
from scaffold_splitter import train_test_scaffold_split

In [8]:
# Compute or load the dataframe containing the Generic Murcko Scaffolds
file = '../2_Docking_analysis/df_COCRYS_CSAR_DUD_DEKOIS_Murcko_Scaffolds_SMILES.obj'

# df_scff_murcko = pd.read_pickle(file)
# df_scff_murcko.loc['DEKOIS']

### Helper Functions

In [9]:
results_dict = {}

In [10]:
%run ../6_Machine_Learning_Models/4_Helper_Functions_Model_Selection_Grid_Search.ipynb

#  Hyperparameter Tunning: Grid Search

***
<h2 style='background-color: #F9E5AB; padding: 5px;'>
    DEKOIS: CDK2 Molecules against CDK2 protein 
</h2>
<div style='background-color: #FE8460; min-height: 5px'></div>

#### CDK2 Molecules 

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Logistic Regression</h3>
<b>DEKOIS</b>

In [11]:
%%time
from sklearn.linear_model import LogisticRegression

protein='CDK2'
library='DEKOIS'
X = df_dks_CDK2
y = y_true_CDK2

for rep in range(15):
    estimator_name = 'LogReg'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein 
    hyperparams = {'C': [100], 
                   'class_weight': [None, 'balanced']}
    estimator = LogisticRegression(penalty='l1', solver='liblinear')

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, estimator, hyperparams, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 43.6 s, sys: 7.16 s, total: 50.8 s
Wall time: 3min 37s


In [None]:
row_names = ['Train_set', 'Test_set', 'Model name', 'Split', 
             
            'N_mols_train', 'N_actives_train',  'Num_mols_test', 'N_actives_test', 
            'Mean-CV-ROC', 'ROC-AUC_train', 'ROC-AUC_test', 'best_params',
             
            'DkS_max_ROC_train',  'DkSc_med_ROC_train', 'DkSc_mean_ROC_train', 
            'DkS_max_ROC_test',  'DkSc_med_ROC_test', 'DkSc_mean_ROC_test'
            ]

In [None]:
# Bagging Classifier

In [22]:
%%time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

protein='CDK2'
library='DEKOIS'
X = df_dks_CDK2
y = y_true_CDK2

for rep in range(15):
    estimator_name = 'Bagg_LogReg'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    # Base estimator
    estimator = LogisticRegression(
                                   C=100,
                                   penalty='l1', 
                                   solver='liblinear')
    
    # Bagging Classifier
    bag_estimator = BaggingClassifier(estimator, n_estimators=20, max_features=10)

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, 
                 bag_estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 29.9 s, sys: 6.83 s, total: 36.7 s
Wall time: 42 s


In [31]:
%%time
from sklearn.ensemble import RandomForestClassifier

protein='CDK2'
library='DEKOIS'
X = df_dks_CDK2
y = y_true_CDK2

for rep in range(15):
    estimator_name = 'RandForest'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    estimator = RandomForestClassifier(n_estimators=100, max_depth=3, max_features=10)

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 
                 X, y, 
                 estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 17 s, sys: 137 ms, total: 17.1 s
Wall time: 24.7 s


In [36]:
%%time
from xgboost import XGBClassifier

protein='CDK2'
library='DEKOIS'
X = df_dks_CDK2
y = y_true_CDK2

for rep in range(15):
    estimator_name = 'XGB_tree'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    estimator = XGBClassifier(n_estimators=100,
                              learning_rate=0.1,
                              max_depth=3
                             )
    
    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, 
                 estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 43.1 s, sys: 154 ms, total: 43.3 s
Wall time: 39.3 s


***
<h2 style='background-color: #F9E5AB; padding: 5px;'>
    DEKOIS: FXa Molecules against FXa protein 
</h2>
<div style='background-color: #FE8460; min-height: 5px'></div>

#### CDK2 Molecules

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Logistic Regression</h3>
<b>Merged Libraries</b>

In [None]:
%%time
from sklearn.linear_model import LogisticRegression

protein='FXA'
library='DEKOIS'
X = df_dks_FXA
y = y_true_FXA

for rep in range(15):
    estimator_name = 'LogReg'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein 
    hyperparams = {'C': [100]}
    estimator = LogisticRegression(penalty='l1', solver='liblinear')

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, estimator, hyperparams, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

In [26]:
%%time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

protein='FXA'
library='DEKOIS'
X = df_dks_FXA
y = y_true_FXA

for rep in range(15):
    estimator_name = 'Bagg_LogReg'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    # Base estimator
    estimator = LogisticRegression(
                                   C=100,
                                   penalty='l1', 
                                   solver='liblinear')
    
    # Bagging Classifier
    bag_estimator = BaggingClassifier(estimator, n_estimators=20, max_features=10)

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, 
                 bag_estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 23.7 s, sys: 6.96 s, total: 30.7 s
Wall time: 37.7 s


In [32]:
%%time
from sklearn.ensemble import RandomForestClassifier

protein='FXA'
library='DEKOIS'
X = df_dks_FXA
y = y_true_FXA

for rep in range(15):
    estimator_name = 'RandForest'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    estimator = RandomForestClassifier(n_estimators=100, max_depth=3, max_features=10)

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, 
                 estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 8.37 s, sys: 81.5 ms, total: 8.46 s
Wall time: 12.8 s


In [37]:
%%time
from xgboost import XGBClassifier

protein='FXA'
library='DEKOIS'
X = df_dks_FXA
y = y_true_FXA

for rep in range(15):
    estimator_name = 'XGB_tree'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    estimator = XGBClassifier(n_estimators=100,
                              learning_rate=0.1,
                              max_depth=3
                             )
    
    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, 
                 estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 18.3 s, sys: 70 ms, total: 18.4 s
Wall time: 13.6 s


In [41]:
row_names = ['Train_set', 'Test_set', 'Model name', 'Split', 
             
            'N_mols_train', 'N_actives_train',  'Num_mols_test', 'N_actives_test', 
            'Mean-CV-ROC', 'ROC-AUC_train', 'ROC-AUC_test', 'best_params',
             
            'DkS_max_ROC_train',  'DkSc_med_ROC_train', 'DkSc_mean_ROC_train', 
            'DkS_max_ROC_test',  'DkSc_med_ROC_test', 'DkSc_mean_ROC_test'
            ]

df_cross_docking_CDK2_ligs = pd.DataFrame(results_dict, index = row_names).T

# df_cross_docking_CDK2_ligs.to_pickle('df_cross_docking_CDK2_ligs.pkl')


df = df_cross_docking_CDK2_ligs
df['rep'] =  df['Model name'].str.split('_rep').str[1]
df['protein'] =  df['Model name'].str.split('_').str[-1]
df['Model name'] = df['Model name'].str.split('_').str[0]
df = df.reset_index(drop=True)
df.drop(['best_params', 'rep'], axis=1, inplace=True)
df.set_index(['Train_set', 'Test_set', 'Model name', 'Split', 'protein'], inplace=True)
df = df.sort_index(level=1, axis=1)

In [39]:
df_cross_docking_CDK2_ligs

Unnamed: 0,Train_set,Test_set,Model name,Split,N_actives_train,N_actives_test,N_mols_train,Num_mols_test,Mean-CV-ROC,ROC-AUC_train,ROC-AUC_test,best_params,DkS_max_ROC_train,DkSc_med_ROC_train,DkSc_mean_ROC_train,DkS_max_ROC_test,DkSc_med_ROC_test,DkSc_mean_ROC_test,rep,protein
DEKOIS_DEKOIS_LogReg_rep0_CDK2_random,DEKOIS,DEKOIS,LogReg,random,930,30,310,10,0.795185,1,0.801,"{'C': 100, 'class_weight': None}",0.811185,0.696241,0.695978,0.835833,0.69325,0.691632,0_CDK2,CDK2
DEKOIS_DEKOIS_LogReg_rep1_CDK2_random,DEKOIS,DEKOIS,LogReg,random,930,30,310,10,0.79537,1,0.725333,"{'C': 100, 'class_weight': None}",0.794963,0.67638,0.675152,0.902333,0.749667,0.755004,1_CDK2,CDK2
DEKOIS_DEKOIS_LogReg_rep2_CDK2_random,DEKOIS,DEKOIS,LogReg,random,930,30,310,10,0.739259,1,0.842,"{'C': 100, 'class_weight': None}",0.82563,0.719444,0.719734,0.789833,0.630583,0.628427,2_CDK2,CDK2
DEKOIS_DEKOIS_LogReg_rep3_CDK2_random,DEKOIS,DEKOIS,LogReg,random,930,30,310,10,0.783704,1,0.787333,"{'C': 100, 'class_weight': None}",0.799722,0.697426,0.695742,0.839167,0.695583,0.696123,3_CDK2,CDK2
DEKOIS_DEKOIS_LogReg_rep4_CDK2_random,DEKOIS,DEKOIS,LogReg,random,930,30,310,10,0.780556,1,0.703667,"{'C': 100, 'class_weight': None}",0.794259,0.690667,0.688671,0.891833,0.711583,0.715444,4_CDK2,CDK2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DEKOIS_DEKOIS_XGB_tree_rep10_FXA_random,DEKOIS,DEKOIS,XGB,random,930,30,310,10,0.740741,1,0.597,{},0.768204,0.642463,0.64955,0.609667,0.504667,0.504441,10_FXA,FXA
DEKOIS_DEKOIS_XGB_tree_rep11_FXA_random,DEKOIS,DEKOIS,XGB,random,930,30,310,10,0.680833,1,0.738333,{},0.699796,0.602194,0.603827,0.739167,0.646167,0.638925,11_FXA,FXA
DEKOIS_DEKOIS_XGB_tree_rep12_FXA_random,DEKOIS,DEKOIS,XGB,random,930,30,310,10,0.781296,1,0.634667,{},0.739741,0.650093,0.650235,0.638333,0.504917,0.501515,12_FXA,FXA
DEKOIS_DEKOIS_XGB_tree_rep13_FXA_random,DEKOIS,DEKOIS,XGB,random,930,30,310,10,0.655648,1,0.723,{},0.718556,0.615935,0.616251,0.686,0.60675,0.601708,13_FXA,FXA


In [40]:
roc_cols = ['Mean-CV-ROC', 'ROC-AUC_train', 'ROC-AUC_test']

df.astype('float').filter(regex='ROC').groupby(level=[2,4]).aggregate(['mean', 'std'])

Unnamed: 0_level_0,Unnamed: 1_level_0,DkS_max_ROC_test,DkS_max_ROC_train,DkSc_mean_ROC_test,DkSc_mean_ROC_train,DkSc_med_ROC_test,DkSc_med_ROC_train,Mean-CV-ROC,ROC-AUC_test,ROC-AUC_train
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean
Model name,protein,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Bagg,CDK2,0.843367,0.798758,0.696912,0.694884,0.698189,0.69521,0.81242,0.810156,0.867674
Bagg,FXA,0.7062,0.719005,0.585736,0.621527,0.58475,0.619225,0.708704,0.694267,0.785452
LogReg,CDK2,0.846567,0.798417,0.695143,0.695404,0.695472,0.695533,0.78,0.787022,1.0
LogReg,FXA,0.764322,0.699567,0.644214,0.602279,0.643217,0.599538,0.55779,0.553356,0.979684
RandForest,CDK2,0.852289,0.79651,0.708595,0.690698,0.710294,0.691057,0.753556,0.778478,0.933996
RandForest,FXA,0.7335,0.710505,0.620408,0.610355,0.620844,0.607781,0.678284,0.693033,0.954578
XGB,CDK2,0.841489,0.799285,0.700842,0.693356,0.700339,0.693786,0.759037,0.817978,1.0
XGB,FXA,0.730278,0.711214,0.61539,0.611935,0.617878,0.609828,0.683105,0.703822,1.0
