#  FXa Ligands agaist CDK2 and FXa proteins
### Validation Method

In [27]:
import pandas as pd
import numpy as np
import glob, sys, os
sys.path.append('..')

In [28]:
from modules.plotting_metrics import PlotMetric
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white', context='talk', font_scale=0.8)

## Evaluations using DEKOIS Datasets: FXa Molecules

#### FXa 

In [29]:
df_dks_FXA =pd.read_csv('../../../FXa/ANALISIS/2_Docking_analysis/DEKOIS2_VINARDO_137_prots_1240_mols.csv', index_col = 'ligand') 
df_dks_FXA = df_dks_FXA.reset_index().drop_duplicates(subset='ligand', keep='first').set_index('ligand')
# dekois_dksc, y_true_dekois = drop_duplicated_mols(dekois_dksc, 'DEKOIS')
y_true_FXA = df_dks_FXA.activity
df_dks_FXA.drop('activity', axis=1, inplace=True)
df_dks_FXA.index.name = 'name'
df_dks_FXA.shape

(1240, 136)

#### CDK2

In [30]:
df_dks_CDK2 = pd.read_csv('./DEKOIS_VINARDO_403_prots_1240_mols.csv', index_col = 'ligand')
df_dks_CDK2 = df_dks_CDK2.reset_index().drop_duplicates(subset='ligand', keep='first').set_index('ligand')
# dekois_dksc, y_true_dekois = drop_duplicated_mols(dekois_dksc, 'DEKOIS')
y_true_CDK2 = df_dks_CDK2.activity
df_dks_CDK2.drop('activity', axis=1, inplace=True)
df_dks_CDK2.index.name = 'name'
df_dks_CDK2.fillna(-7, inplace=True)
df_dks_CDK2.shape

(1240, 402)

In [31]:
(y_true_FXA == y_true_CDK2).all()

True

# Protein conformations Selection

## CDK2 Conformations

In [32]:
# Read the table of conformations metadata
path_to_json_file = '../data/TABLA_MTDATA_CDK2_402_crys_LIGS_INFO.json'

# Read the metadata table                      
df_prot = pd.read_json(path_to_json_file)
# Select the 20 conformations with the best Resolution
selected_confs = df_prot.sort_values('Inhib_mass', ascending=False).index[:20]
print(selected_confs)

# Keep the selected conformations inside the D scores matrix
df_dks_CDK2 = df_dks_CDK2.loc[:, selected_confs]

Index(['4bcm', '3eid', '2g9x', '1pkd', '4nj3', '4acm', '1aq1', '4ez7', '4erw',
       '3qtu', '1y91', '4bco', '4fkv', '1fvv', '2wih', '1urw', '3ezr', '2r64',
       '5nev', '5a14'],
      dtype='object')


## FXa conformations

In [33]:
# Read the table of conformations metadata
path_to_json_file = '../../../FXa/ANALISIS/1_Fetching_and_generating_data/TABLA_MTDATA_FXA_136_crys_LIGS_INFO.json' 

# Read the metadata table                      
df_prot = pd.read_json(path_to_json_file)
# Select the 20 conformations with the best Resolution
selected_confs = df_prot.sort_values('Inhib_mass', ascending=False).index[:20]
print(selected_confs)

# Keep the selected conformations inside the D scores matrix
df_dks_FXA = df_dks_FXA.loc[:, selected_confs]

Index(['4btt', '1iqe', '4bti', '1iqn', '3ffg', '3kqc', '3kqd', '3kqe', '1lpg',
       '5k0h', '1iqf', '3sw2', '2fzz', '4btu', '1g2l', '3kqb', '1ioe', '2j4i',
       '4y7b', '2vwm'],
      dtype='object')


### Helper Functions

In [34]:
results_dict = {}

In [35]:
%run ../6_Machine_Learning_Models/4_Helper_Functions_Model_Selection_Grid_Search.ipynb

#  Hyperparameter Tunning: Grid Search

***
<h2 style='background-color: #F9E5AB; padding: 5px;'>
    DEKOIS: CDK2 Molecules against 20 confs of CDK2 protein 
</h2>
<div style='background-color: #FE8460; min-height: 5px'></div>

#### CDK2 Molecules 

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Logistic Regression</h3>
<b>DEKOIS</b>

In [36]:
%%time
from sklearn.linear_model import LogisticRegression

protein='CDK2'
library='DEKOIS'
X = df_dks_CDK2
y = y_true_CDK2

for rep in range(15):
    estimator_name = 'LogReg'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein 
    hyperparams = {'C': [100], 
                   'class_weight': [None, 'balanced']}
    estimator = LogisticRegression(penalty='l1', solver='liblinear')

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, estimator, hyperparams, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 8.67 s, sys: 6.16 s, total: 14.8 s
Wall time: 14.1 s


In [37]:
# Bagging Classifier

In [38]:
%%time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

protein='CDK2'
library='DEKOIS'
X = df_dks_CDK2
y = y_true_CDK2

for rep in range(15):
    estimator_name = 'Bagg_LogReg'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    # Base estimator
    estimator = LogisticRegression(
                                   C=100,
                                   penalty='l1', 
                                   solver='liblinear')
    
    # Bagging Classifier
    bag_estimator = BaggingClassifier(estimator, n_estimators=20, max_features=10)

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, 
                 bag_estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 14.8 s, sys: 6.26 s, total: 21.1 s
Wall time: 23.7 s


In [39]:
%%time
from sklearn.ensemble import RandomForestClassifier

protein='CDK2'
library='DEKOIS'
X = df_dks_CDK2
y = y_true_CDK2

for rep in range(15):
    estimator_name = 'RandForest'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    estimator = RandomForestClassifier(n_estimators=100, max_depth=3, max_features=10)

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 
                 X, y, 
                 estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 4.61 s, sys: 48.4 ms, total: 4.66 s
Wall time: 9.96 s


In [40]:
%%time
from xgboost import XGBClassifier

protein='CDK2'
library='DEKOIS'
X = df_dks_CDK2
y = y_true_CDK2

for rep in range(15):
    estimator_name = 'XGB_tree'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    estimator = XGBClassifier(n_estimators=100,
                              learning_rate=0.1,
                              max_depth=3
                             )
    
    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, 
                 estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 10.6 s, sys: 44.5 ms, total: 10.7 s
Wall time: 4.35 s


***
<h2 style='background-color: #F9E5AB; padding: 5px;'>
    DEKOIS: FXa Molecules against FXa protein 
</h2>
<div style='background-color: #FE8460; min-height: 5px'></div>

#### CDK2 Molecules

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Logistic Regression</h3>
<b>Merged Libraries</b>

In [41]:
%%time
from sklearn.linear_model import LogisticRegression

protein='FXA'
library='DEKOIS'
X = df_dks_FXA
y = y_true_FXA

for rep in range(15):
    estimator_name = 'LogReg'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein 
    hyperparams = {'C': [100]}
    estimator = LogisticRegression(penalty='l1', solver='liblinear')

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, estimator, hyperparams, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 8.65 s, sys: 5.78 s, total: 14.4 s
Wall time: 8.73 s


In [42]:
%%time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

protein='FXA'
library='DEKOIS'
X = df_dks_FXA
y = y_true_FXA

for rep in range(15):
    estimator_name = 'Bagg_LogReg'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    # Base estimator
    estimator = LogisticRegression(
                                   C=100,
                                   penalty='l1', 
                                   solver='liblinear')
    
    # Bagging Classifier
    bag_estimator = BaggingClassifier(estimator, n_estimators=20, max_features=10)

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, 
                 bag_estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 21 s, sys: 6.27 s, total: 27.3 s
Wall time: 40.9 s


In [43]:
%%time
from sklearn.ensemble import RandomForestClassifier

protein='FXA'
library='DEKOIS'
X = df_dks_FXA
y = y_true_FXA

for rep in range(15):
    estimator_name = 'RandForest'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    estimator = RandomForestClassifier(n_estimators=100, max_depth=3, max_features=10)

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, 
                 estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 4.44 s, sys: 46.6 ms, total: 4.49 s
Wall time: 10.3 s


In [44]:
%%time
from xgboost import XGBClassifier

protein='FXA'
library='DEKOIS'
X = df_dks_FXA
y = y_true_FXA

for rep in range(15):
    estimator_name = 'XGB_tree'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    estimator = XGBClassifier(n_estimators=100,
                              learning_rate=0.1,
                              max_depth=3
                             )
    
    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, 
                 estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 10.1 s, sys: 39.4 ms, total: 10.2 s
Wall time: 4.14 s


In [45]:
row_names = ['Train_set', 'Test_set', 'Model name', 'Split', 
             
            'N_mols_train', 'N_actives_train',  'Num_mols_test', 'N_actives_test', 
            'Mean-CV-ROC', 'ROC-AUC_train', 'ROC-AUC_test', 'best_params',
             
            'DkS_max_ROC_train',  'DkSc_med_ROC_train', 'DkSc_mean_ROC_train', 
            'DkS_max_ROC_test',  'DkSc_med_ROC_test', 'DkSc_mean_ROC_test'
            ]

df_cross_docking_FXA_ligs_20confs = pd.DataFrame(results_dict, index = row_names).T

# df_cross_docking_CDK2_ligs.to_pickle('df_cross_docking_CDK2_ligs.pkl')


df = df_cross_docking_FXA_ligs_20confs
df['rep'] =  df['Model name'].str.split('_rep').str[1]
df['protein'] =  df['Model name'].str.split('_').str[-1]
df['Model name'] = df['Model name'].str.split('_').str[0]
df = df.reset_index(drop=True)
df.drop(['best_params', 'rep'], axis=1, inplace=True)
df.set_index(['Train_set', 'Test_set', 'Model name', 'Split', 'protein'], inplace=True)
df = df.sort_index(level=1, axis=1)

In [46]:
df_cross_docking_FXA_ligs_20confs

Unnamed: 0,Train_set,Test_set,Model name,Split,N_mols_train,N_actives_train,Num_mols_test,N_actives_test,Mean-CV-ROC,ROC-AUC_train,ROC-AUC_test,best_params,DkS_max_ROC_train,DkSc_med_ROC_train,DkSc_mean_ROC_train,DkS_max_ROC_test,DkSc_med_ROC_test,DkSc_mean_ROC_test,rep,protein
DEKOIS_DEKOIS_LogReg_rep0_CDK2_random,DEKOIS,DEKOIS,LogReg,random,930,30,310,10,0.695185,0.80737,0.702333,"{'C': 100, 'class_weight': None}",0.772759,0.711685,0.705339,0.793667,0.731,0.725317,0_CDK2,CDK2
DEKOIS_DEKOIS_LogReg_rep1_CDK2_random,DEKOIS,DEKOIS,LogReg,random,930,30,310,10,0.710185,0.850111,0.657667,"{'C': 100, 'class_weight': 'balanced'}",0.815111,0.733222,0.737431,0.694167,0.614,0.6291,1_CDK2,CDK2
DEKOIS_DEKOIS_LogReg_rep2_CDK2_random,DEKOIS,DEKOIS,LogReg,random,930,30,310,10,0.68,0.805222,0.677,"{'C': 100, 'class_weight': None}",0.757093,0.709657,0.705459,0.803333,0.70425,0.720767,2_CDK2,CDK2
DEKOIS_DEKOIS_LogReg_rep3_CDK2_random,DEKOIS,DEKOIS,LogReg,random,930,30,310,10,0.592963,0.807296,0.704,"{'C': 100, 'class_weight': None}",0.74013,0.696185,0.693567,0.833667,0.760833,0.759375,3_CDK2,CDK2
DEKOIS_DEKOIS_LogReg_rep4_CDK2_random,DEKOIS,DEKOIS,LogReg,random,930,30,310,10,0.715,0.842296,0.573667,"{'C': 100, 'class_weight': None}",0.783833,0.71975,0.717953,0.748,0.698917,0.68485,4_CDK2,CDK2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DEKOIS_DEKOIS_XGB_tree_rep10_FXA_random,DEKOIS,DEKOIS,XGB,random,930,30,310,10,0.830185,0.998815,0.839,{},0.901278,0.854296,0.855906,0.9045,0.83525,0.833108,10_FXA,FXA
DEKOIS_DEKOIS_XGB_tree_rep11_FXA_random,DEKOIS,DEKOIS,XGB,random,930,30,310,10,0.769722,0.999667,0.908333,{},0.877778,0.841815,0.842256,0.933167,0.867333,0.87315,11_FXA,FXA
DEKOIS_DEKOIS_XGB_tree_rep12_FXA_random,DEKOIS,DEKOIS,XGB,random,930,30,310,10,0.821852,0.999481,0.8185,{},0.89437,0.866787,0.865156,0.8765,0.823083,0.804542,12_FXA,FXA
DEKOIS_DEKOIS_XGB_tree_rep13_FXA_random,DEKOIS,DEKOIS,XGB,random,930,30,310,10,0.84,0.999778,0.877667,{},0.876167,0.836389,0.829871,0.946167,0.91525,0.91,13_FXA,FXA


In [48]:
roc_cols = ['Mean-CV-ROC', 'ROC-AUC_train', 'ROC-AUC_test']

df.astype('float').filter(regex='ROC').groupby(level=[2,4]).aggregate(['mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,DkS_max_ROC_test,DkS_max_ROC_train,DkSc_mean_ROC_test,DkSc_mean_ROC_train,DkSc_med_ROC_test,DkSc_med_ROC_train,Mean-CV-ROC,ROC-AUC_test,ROC-AUC_train
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean
Model name,protein,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Bagg,CDK2,0.7774,0.776567,0.688159,0.716497,0.691717,0.718745,0.741148,0.6938,0.814706
Bagg,FXA,0.887744,0.893189,0.827245,0.857541,0.829772,0.860622,0.887469,0.867356,0.919731
LogReg,CDK2,0.780333,0.774172,0.688335,0.716464,0.687111,0.716993,0.687247,0.636044,0.824336
LogReg,FXA,0.903467,0.892056,0.843597,0.851917,0.846667,0.854315,0.850173,0.854156,0.925706
RandForest,CDK2,0.8154,0.758219,0.715396,0.707601,0.714428,0.709812,0.729019,0.774989,0.889246
RandForest,FXA,0.914389,0.883993,0.854004,0.848334,0.856361,0.849284,0.85929,0.872833,0.95148
XGB,CDK2,0.821456,0.758128,0.732157,0.701882,0.732189,0.704026,0.680019,0.707178,0.999862
XGB,FXA,0.910211,0.889464,0.854572,0.848203,0.858656,0.849306,0.822716,0.850278,0.999237
