#  CDK2 Ligands agaist CDK2 and FXa proteins
### Validation Method

In [22]:
import pandas as pd
import numpy as np
import glob, sys, os
sys.path.append('..')

In [23]:
from modules.plotting_metrics import PlotMetric
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white', context='talk', font_scale=0.8)

## Evaluations using DEKOIS Datasets: CDK2 Molecules

#### CDK2

In [24]:
df_dks_CDK2 = pd.read_csv('../2_Docking_analysis/DEKOIS2_VINARDO_403_prots_1240_mols.csv', index_col = 'ligand')
df_dks_CDK2 = df_dks_CDK2.reset_index().drop_duplicates(subset='ligand', keep='first').set_index('ligand')
# dekois_dksc, y_true_dekois = drop_duplicated_mols(dekois_dksc, 'DEKOIS')
y_true_CDK2 = df_dks_CDK2.activity
df_dks_CDK2.drop('activity', axis=1, inplace=True)
df_dks_CDK2.index.name = 'name'
df_dks_CDK2.fillna(-7, inplace=True)
df_dks_CDK2.shape

(1240, 402)

In [25]:
df_dks_FXA =pd.read_csv('../../../FXa/ANALISIS/Cross_Docking_using_CDK2_mols/DEKOIS_VINARDO_137_prots_1240_mols.csv', index_col = 'ligand') 
df_dks_FXA = df_dks_FXA.reset_index().drop_duplicates(subset='ligand', keep='first').set_index('ligand')
# dekois_dksc, y_true_dekois = drop_duplicated_mols(dekois_dksc, 'DEKOIS')
y_true_FXA = df_dks_FXA.activity
df_dks_FXA.drop('activity', axis=1, inplace=True)
df_dks_FXA.index.name = 'name'
df_dks_FXA.shape

(1240, 136)

In [26]:

(y_true_FXA == y_true_CDK2).all()

True

# Protein conformations Selection

## CDK2 Conformations

In [27]:
# Read the table of conformations metadata
path_to_json_file = '../data/TABLA_MTDATA_CDK2_402_crys_LIGS_INFO.json'

# Read the metadata table                      
df_prot = pd.read_json(path_to_json_file)
# Select the 20 conformations with the best Resolution
selected_confs = df_prot.sort_values('Inhib_mass', ascending=False).index[:20]
print(selected_confs)

# Keep the selected conformations inside the D scores matrix
df_dks_CDK2 = df_dks_CDK2.loc[:, selected_confs]

Index(['4bcm', '3eid', '2g9x', '1pkd', '4nj3', '4acm', '1aq1', '4ez7', '4erw',
       '3qtu', '1y91', '4bco', '4fkv', '1fvv', '2wih', '1urw', '3ezr', '2r64',
       '5nev', '5a14'],
      dtype='object')


## FXa conformations

In [28]:
# Read the table of conformations metadata
path_to_json_file = '../../../FXa/ANALISIS/1_Fetching_and_generating_data/TABLA_MTDATA_FXA_136_crys_LIGS_INFO.json' 

# Read the metadata table                      
df_prot = pd.read_json(path_to_json_file)
# Select the 20 conformations with the best Resolution
selected_confs = df_prot.sort_values('Inhib_mass', ascending=False).index[:20]
print(selected_confs)

# Keep the selected conformations inside the D scores matrix
df_dks_FXA = df_dks_FXA.loc[:, selected_confs]

Index(['4btt', '1iqe', '4bti', '1iqn', '3ffg', '3kqc', '3kqd', '3kqe', '1lpg',
       '5k0h', '1iqf', '3sw2', '2fzz', '4btu', '1g2l', '3kqb', '1ioe', '2j4i',
       '4y7b', '2vwm'],
      dtype='object')


### Helper Functions

In [29]:
results_dict = {}

In [30]:
%run ../6_Machine_Learning_Models/4_Helper_Functions_Model_Selection_Grid_Search.ipynb

#  Hyperparameter Tunning: Grid Search

***
<h2 style='background-color: #F9E5AB; padding: 5px;'>
    DEKOIS: CDK2 Molecules against 20 confs of CDK2 protein 
</h2>
<div style='background-color: #FE8460; min-height: 5px'></div>

#### CDK2 Molecules 

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Logistic Regression</h3>
<b>DEKOIS</b>

In [31]:
%%time
from sklearn.linear_model import LogisticRegression

protein='CDK2'
library='DEKOIS'
X = df_dks_CDK2
y = y_true_CDK2

for rep in range(15):
    estimator_name = 'LogReg'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein 
    hyperparams = {'C': [100], 
                   'class_weight': [None, 'balanced']}
    estimator = LogisticRegression(penalty='l1', solver='liblinear')

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, estimator, hyperparams, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 8.94 s, sys: 5.85 s, total: 14.8 s
Wall time: 16.4 s


In [32]:
# Bagging Classifier

In [33]:
%%time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

protein='CDK2'
library='DEKOIS'
X = df_dks_CDK2
y = y_true_CDK2

for rep in range(15):
    estimator_name = 'Bagg_LogReg'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    # Base estimator
    estimator = LogisticRegression(
                                   C=100,
                                   penalty='l1', 
                                   solver='liblinear')
    
    # Bagging Classifier
    bag_estimator = BaggingClassifier(estimator, n_estimators=20, max_features=10)

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, 
                 bag_estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 17.7 s, sys: 6.05 s, total: 23.7 s
Wall time: 30.5 s


In [34]:
%%time
from sklearn.ensemble import RandomForestClassifier

protein='CDK2'
library='DEKOIS'
X = df_dks_CDK2
y = y_true_CDK2

for rep in range(15):
    estimator_name = 'RandForest'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    estimator = RandomForestClassifier(n_estimators=100, max_depth=3, max_features=10)

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 
                 X, y, 
                 estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 4.46 s, sys: 40.5 ms, total: 4.5 s
Wall time: 10.3 s


In [35]:
%%time
from xgboost import XGBClassifier

protein='CDK2'
library='DEKOIS'
X = df_dks_CDK2
y = y_true_CDK2

for rep in range(15):
    estimator_name = 'XGB_tree'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    estimator = XGBClassifier(n_estimators=100,
                              learning_rate=0.1,
                              max_depth=3
                             )
    
    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, 
                 estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 9.11 s, sys: 25.2 ms, total: 9.14 s
Wall time: 4.05 s


***
<h2 style='background-color: #F9E5AB; padding: 5px;'>
    DEKOIS: FXa Molecules against FXa protein 
</h2>
<div style='background-color: #FE8460; min-height: 5px'></div>

#### CDK2 Molecules

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Logistic Regression</h3>
<b>Merged Libraries</b>

In [36]:
%%time
from sklearn.linear_model import LogisticRegression

protein='FXA'
library='DEKOIS'
X = df_dks_FXA
y = y_true_FXA

for rep in range(15):
    estimator_name = 'LogReg'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein 
    hyperparams = {'C': [100]}
    estimator = LogisticRegression(penalty='l1', solver='liblinear')

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, estimator, hyperparams, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 8.57 s, sys: 5.82 s, total: 14.4 s
Wall time: 7.86 s


In [37]:
%%time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

protein='FXA'
library='DEKOIS'
X = df_dks_FXA
y = y_true_FXA

for rep in range(15):
    estimator_name = 'Bagg_LogReg'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    # Base estimator
    estimator = LogisticRegression(
                                   C=100,
                                   penalty='l1', 
                                   solver='liblinear')
    
    # Bagging Classifier
    bag_estimator = BaggingClassifier(estimator, n_estimators=20, max_features=10)

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, 
                 bag_estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 19.9 s, sys: 6.14 s, total: 26.1 s
Wall time: 37.2 s




In [38]:
%%time
from sklearn.ensemble import RandomForestClassifier

protein='FXA'
library='DEKOIS'
X = df_dks_FXA
y = y_true_FXA

for rep in range(15):
    estimator_name = 'RandForest'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    estimator = RandomForestClassifier(n_estimators=100, max_depth=3, max_features=10)

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, 
                 estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 4.58 s, sys: 41 ms, total: 4.63 s
Wall time: 10.6 s


In [39]:
%%time
from xgboost import XGBClassifier

protein='FXA'
library='DEKOIS'
X = df_dks_FXA
y = y_true_FXA

for rep in range(15):
    estimator_name = 'XGB_tree'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    estimator = XGBClassifier(n_estimators=100,
                              learning_rate=0.1,
                              max_depth=3
                             )
    
    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, 
                 estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 9.65 s, sys: 101 ms, total: 9.75 s
Wall time: 4.09 s


In [40]:
row_names = ['Train_set', 'Test_set', 'Model name', 'Split', 
             
            'N_mols_train', 'N_actives_train',  'Num_mols_test', 'N_actives_test', 
            'Mean-CV-ROC', 'ROC-AUC_train', 'ROC-AUC_test', 'best_params',
             
            'DkS_max_ROC_train',  'DkSc_med_ROC_train', 'DkSc_mean_ROC_train', 
            'DkS_max_ROC_test',  'DkSc_med_ROC_test', 'DkSc_mean_ROC_test'
            ]

df_cross_docking_CDK2_ligs_20confs = pd.DataFrame(results_dict, index = row_names).T

# df_cross_docking_CDK2_ligs.to_pickle('df_cross_docking_CDK2_ligs.pkl')


df = df_cross_docking_CDK2_ligs_20confs
df['rep'] =  df['Model name'].str.split('_rep').str[1]
df['protein'] =  df['Model name'].str.split('_').str[-1]
df['Model name'] = df['Model name'].str.split('_').str[0]
df = df.reset_index(drop=True)
df.drop(['best_params', 'rep'], axis=1, inplace=True)
df.set_index(['Train_set', 'Test_set', 'Model name', 'Split', 'protein'], inplace=True)
df = df.sort_index(level=1, axis=1)

In [41]:
df_cross_docking_CDK2_ligs_20confs

Unnamed: 0,Train_set,Test_set,Model name,Split,N_mols_train,N_actives_train,Num_mols_test,N_actives_test,Mean-CV-ROC,ROC-AUC_train,ROC-AUC_test,best_params,DkS_max_ROC_train,DkSc_med_ROC_train,DkSc_mean_ROC_train,DkS_max_ROC_test,DkSc_med_ROC_test,DkSc_mean_ROC_test,rep,protein
DEKOIS_DEKOIS_LogReg_rep0_CDK2_random,DEKOIS,DEKOIS,LogReg,random,930,30,310,10,0.665741,0.839667,0.884333,"{'C': 100, 'class_weight': None}",0.761907,0.679852,0.687333,0.915333,0.805583,0.807767,0_CDK2,CDK2
DEKOIS_DEKOIS_LogReg_rep1_CDK2_random,DEKOIS,DEKOIS,LogReg,random,930,30,310,10,0.731481,0.868148,0.812667,"{'C': 100, 'class_weight': None}",0.785074,0.723574,0.724669,0.783333,0.6965,0.698733,1_CDK2,CDK2
DEKOIS_DEKOIS_LogReg_rep2_CDK2_random,DEKOIS,DEKOIS,LogReg,random,930,30,310,10,0.813704,0.891259,0.703,"{'C': 100, 'class_weight': None}",0.808315,0.759352,0.758148,0.7145,0.5875,0.601192,2_CDK2,CDK2
DEKOIS_DEKOIS_LogReg_rep3_CDK2_random,DEKOIS,DEKOIS,LogReg,random,930,30,310,10,0.789074,0.873481,0.786667,"{'C': 100, 'class_weight': None}",0.789741,0.732667,0.736927,0.767667,0.651583,0.66585,3_CDK2,CDK2
DEKOIS_DEKOIS_LogReg_rep4_CDK2_random,DEKOIS,DEKOIS,LogReg,random,930,30,310,10,0.79463,0.883148,0.772,"{'C': 100, 'class_weight': None}",0.803185,0.736194,0.73968,0.728167,0.651417,0.65765,4_CDK2,CDK2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DEKOIS_DEKOIS_XGB_tree_rep10_FXA_random,DEKOIS,DEKOIS,XGB,random,930,30,310,10,0.728889,0.998444,0.5495,{},0.698778,0.603917,0.612542,0.6815,0.611333,0.611483,10_FXA,FXA
DEKOIS_DEKOIS_XGB_tree_rep11_FXA_random,DEKOIS,DEKOIS,XGB,random,930,30,310,10,0.630833,0.999407,0.781,{},0.698407,0.625278,0.626106,0.663833,0.572833,0.571958,11_FXA,FXA
DEKOIS_DEKOIS_XGB_tree_rep12_FXA_random,DEKOIS,DEKOIS,XGB,random,930,30,310,10,0.693056,0.999593,0.565667,{},0.722815,0.619398,0.617671,0.6675,0.597417,0.600317,12_FXA,FXA
DEKOIS_DEKOIS_XGB_tree_rep13_FXA_random,DEKOIS,DEKOIS,XGB,random,930,30,310,10,0.61,0.998926,0.661333,{},0.683167,0.618852,0.61323,0.7085,0.605583,0.609692,13_FXA,FXA


In [43]:
roc_cols = ['Mean-CV-ROC', 'ROC-AUC_train', 'ROC-AUC_test']

df.astype('float').filter(regex='ROC').groupby(level=[2,4]).aggregate(['mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,DkS_max_ROC_test,DkS_max_ROC_train,DkSc_mean_ROC_test,DkSc_mean_ROC_train,DkSc_med_ROC_test,DkSc_med_ROC_train,Mean-CV-ROC,ROC-AUC_test,ROC-AUC_train
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean
Model name,protein,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Bagg,CDK2,0.811978,0.786735,0.712063,0.720761,0.711194,0.719383,0.784691,0.7952,0.853242
Bagg,FXA,0.702856,0.690496,0.604042,0.616139,0.605939,0.617392,0.669988,0.662756,0.762091
LogReg,CDK2,0.806389,0.784726,0.716001,0.719701,0.713811,0.715855,0.767185,0.770089,0.872993
LogReg,FXA,0.712244,0.688648,0.623474,0.609794,0.623178,0.613683,0.639802,0.625978,0.774709
RandForest,CDK2,0.801733,0.789388,0.707937,0.722469,0.707133,0.719491,0.747401,0.773944,0.930423
RandForest,FXA,0.698422,0.694791,0.612656,0.613186,0.613817,0.61386,0.650864,0.648744,0.918077
XGB,CDK2,0.805978,0.78266,0.715014,0.71996,0.712422,0.718501,0.733648,0.7381,0.999437
XGB,FXA,0.707167,0.691364,0.612396,0.613189,0.614706,0.613331,0.660599,0.663633,0.998756
