# FXa Ligands agaist FXa and CDK2 proteins
### Validation Method

In [1]:
import pandas as pd
import numpy as np
import glob, sys, os
sys.path.append('..')

In [2]:
from modules.plotting_metrics import PlotMetric
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white', context='talk', font_scale=0.8)

## Evaluations using DEKOIS Datasets

#### FXa 

In [3]:
df_dks_FXA =pd.read_csv('../../../FXa/ANALISIS/2_Docking_analysis/DEKOIS2_VINARDO_137_prots_1240_mols.csv', index_col = 'ligand') 
df_dks_FXA = df_dks_FXA.reset_index().drop_duplicates(subset='ligand', keep='first').set_index('ligand')
# dekois_dksc, y_true_dekois = drop_duplicated_mols(dekois_dksc, 'DEKOIS')
y_true_FXA = df_dks_FXA.activity
df_dks_FXA.drop('activity', axis=1, inplace=True)
df_dks_FXA.index.name = 'name'
df_dks_FXA.shape

(1240, 136)

#### CDK2

In [4]:
df_dks_CDK2 = pd.read_csv('./DEKOIS_VINARDO_403_prots_1240_mols.csv', index_col = 'ligand')
df_dks_CDK2 = df_dks_CDK2.reset_index().drop_duplicates(subset='ligand', keep='first').set_index('ligand')
# dekois_dksc, y_true_dekois = drop_duplicated_mols(dekois_dksc, 'DEKOIS')
y_true_CDK2 = df_dks_CDK2.activity
df_dks_CDK2.drop('activity', axis=1, inplace=True)
df_dks_CDK2.index.name = 'name'
df_dks_CDK2.fillna(-7, inplace=True)
df_dks_CDK2.shape

(1240, 402)

In [5]:
(y_true_FXA == y_true_CDK2).all()

True

## Scaffold Splitting

In [6]:
#*************************************************
# Functions to compute stratify scaffold splitting
#*************************************************
sys.path.append('../2_Docking_analysis/')
from scaffold_splitter import train_test_scaffold_split

In [7]:
# Compute or load the dataframe containing the Generic Murcko Scaffolds
file = '../../../FXa/ANALISIS/2_Docking_analysis/df_COCRYS_DUD_DEKOIS_Murcko_Scaffolds_SMILES.obj'

# df_scff_murcko = pd.read_pickle(file)
# df_scff_murcko.loc['DEKOIS']

### Helper Functions

In [8]:
results_dict = {}

In [9]:
%run ../6_Machine_Learning_Models/4_Helper_Functions_Model_Selection_Grid_Search.ipynb

#  Hyperparameter Tunning: Grid Search

***
<h2 style='background-color: #F9E5AB; padding: 5px;'>
    DEKOIS: FXa Molecules against FXa protein 
</h2>
<div style='background-color: #FE8460; min-height: 5px'></div>

#### DEKOIS 

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Logistic Regression</h3>
<b>Merged Libraries</b>

In [10]:
%%time
from sklearn.linear_model import LogisticRegression

protein='FXA'
library='DEKOIS'
X = df_dks_FXA
y = y_true_FXA

for rep in range(15):
    estimator_name = 'LogReg'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    hyperparams = {'C': [100], 
                   'class_weight': [None, 'balanced']}
    estimator = LogisticRegression(penalty='l1', solver='liblinear')

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, estimator, hyperparams, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)
    

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 51.7 s, sys: 6.72 s, total: 58.4 s
Wall time: 3min 17s


In [18]:
%%time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

protein='FXA'
library='DEKOIS'
X = df_dks_FXA
y = y_true_FXA

for rep in range(15):
    estimator_name = 'Bagg_LogReg'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    # Base estimator
    estimator = LogisticRegression(
                                   C=100,
                                   penalty='l1', 
                                   solver='liblinear')
    
    # Bagging Classifier
    bag_estimator = BaggingClassifier(estimator, n_estimators=20, max_features=10)

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, 
                 bag_estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2




rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 23.6 s, sys: 6.67 s, total: 30.2 s
Wall time: 39.7 s


In [45]:
%%time
from sklearn.ensemble import RandomForestClassifier

protein='FXA'
library='DEKOIS'
X = df_dks_FXA
y = y_true_FXA

for rep in range(15):
    estimator_name = 'RandForest'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    estimator = RandomForestClassifier(n_estimators=100, max_depth=3, max_features=10)

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, 
                 estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 8.1 s, sys: 72.2 ms, total: 8.17 s
Wall time: 12.6 s


In [53]:
%%time
from xgboost import XGBClassifier

protein='FXA'
library='DEKOIS'
X = df_dks_FXA
y = y_true_FXA

for rep in range(15):
    estimator_name = 'XGB_tree'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    estimator = XGBClassifier(n_estimators=100,
                              learning_rate=0.1,
                              max_depth=3
                             )
    
    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, 
                 estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 16.5 s, sys: 67.3 ms, total: 16.5 s
Wall time: 12.9 s


***
<h2 style='background-color: #F9E5AB; padding: 5px;'>
    DEKOIS: FXa Molecules against CDK2 protein 
</h2>
<div style='background-color: #FE8460; min-height: 5px'></div>

#### DEKOIS 

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Logistic Regression</h3>
<b>DEKOIS</b>

In [12]:
%%time
from sklearn.linear_model import LogisticRegression

protein='CDK2'
library='DEKOIS'
X = df_dks_CDK2
y = y_true_CDK2

for rep in range(15):
    estimator_name = 'LogReg'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    hyperparams = {'C': [100], 
                   'class_weight': [None, 'balanced']}
    estimator = LogisticRegression(penalty='l1', solver='liblinear')

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, estimator, hyperparams, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 35.6 s, sys: 6.56 s, total: 42.2 s
Wall time: 2min 22s


In [22]:
%%time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

protein='CDK2'
library='DEKOIS'
X = df_dks_CDK2
y = y_true_CDK2

for rep in range(15):
    estimator_name = 'Bagg_LogReg'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    # Base estimator
    estimator = LogisticRegression(
                                   C=100,
                                   penalty='l1', 
                                   solver='liblinear')
    
    # Bagging Classifier
    bag_estimator = BaggingClassifier(estimator, n_estimators=20, max_features=10)

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, 
                 bag_estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 28.8 s, sys: 6.72 s, total: 35.5 s
Wall time: 37.4 s


In [49]:
%%time
from xgboost import XGBClassifier

protein='CDK2'
library='DEKOIS'
X = df_dks_CDK2
y = y_true_CDK2

for rep in range(15):
    estimator_name = 'XGB_tree'
    estimator_name = estimator_name + '_rep' + str(rep) + '_' + protein
    
    estimator = XGBClassifier(n_estimators=100,
                              learning_rate=0.1,
                              max_depth=3
                             )
    
    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y, 
                 estimator, {}, 
                 splitting='random', test_size=0.25, 
                 verbose=False,
                 scaffold_series=None)
    print('rep', rep)

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
CPU times: user 41.9 s, sys: 179 ms, total: 42 s
Wall time: 39.1 s


In [57]:
row_names = ['Train_set', 'Test_set', 'Model name', 'Split', 
             
            'N_mols_train', 'N_actives_train',  'Num_mols_test', 'N_actives_test', 
            'Mean-CV-ROC', 'ROC-AUC_train', 'ROC-AUC_test', 'best_params',
             
            'DkS_max_ROC_train',  'DkSc_med_ROC_train', 'DkSc_mean_ROC_train', 
            'DkS_max_ROC_test',  'DkSc_med_ROC_test', 'DkSc_mean_ROC_test'
            ]

df_cross_docking_Fxa_ligs = pd.DataFrame(results_dict, index = row_names).T
# df_cross_docking_Fxa_ligs.to_pickle('df_cross_docking_Fxa_ligs.pkl')

df = df_cross_docking_Fxa_ligs
df['rep'] =  df['Model name'].str.split('_rep').str[1]
df['protein'] =  df['Model name'].str.split('_').str[-1]
df['Model name'] = df['Model name'].str.split('_').str[0]
df = df.reset_index(drop=True)
df.drop(['best_params', 'rep'], axis=1, inplace=True)
df.set_index(['Train_set', 'Test_set', 'Model name', 'Split', 'protein'], inplace=True)
df = df.sort_index(level=1, axis=1)

In [55]:
df_cross_docking_Fxa_ligs.head()

Unnamed: 0,Train_set,Test_set,Model name,Split,N_actives_train,N_actives_test,N_mols_train,Num_mols_test,Mean-CV-ROC,ROC-AUC_train,ROC-AUC_test,best_params,DkS_max_ROC_train,DkSc_med_ROC_train,DkSc_mean_ROC_train,DkS_max_ROC_test,DkSc_med_ROC_test,DkSc_mean_ROC_test,rep,protein
DEKOIS_DEKOIS_LogReg_rep0_FXA_random,DEKOIS,DEKOIS,LogReg,random,930,30,310,10,0.702037,1,0.821,"{'C': 100, 'class_weight': None}",0.907852,0.843287,0.843344,0.936,0.804167,0.798301,0_FXA,FXA
DEKOIS_DEKOIS_LogReg_rep1_FXA_random,DEKOIS,DEKOIS,LogReg,random,930,30,310,10,0.857222,1,0.829,"{'C': 100, 'class_weight': None}",0.907519,0.84513,0.842408,0.9095,0.809667,0.801376,1_FXA,FXA
DEKOIS_DEKOIS_LogReg_rep2_FXA_random,DEKOIS,DEKOIS,LogReg,random,930,30,310,10,0.817593,1,0.728,"{'C': 100, 'class_weight': None}",0.908667,0.84388,0.839692,0.898667,0.815583,0.805751,2_FXA,FXA
DEKOIS_DEKOIS_LogReg_rep3_FXA_random,DEKOIS,DEKOIS,LogReg,random,930,30,310,10,0.785,1,0.645667,"{'C': 100, 'class_weight': None}",0.896037,0.836481,0.832214,0.954333,0.8435,0.833152,3_FXA,FXA
DEKOIS_DEKOIS_LogReg_rep4_FXA_random,DEKOIS,DEKOIS,LogReg,random,930,30,310,10,0.789815,1,0.819667,"{'C': 100, 'class_weight': None}",0.896278,0.831037,0.829223,0.9355,0.841417,0.84039,4_FXA,FXA


In [56]:
roc_cols = ['Mean-CV-ROC', 'ROC-AUC_train', 'ROC-AUC_test']

df.astype('float').filter(regex='ROC').groupby(level=[2, 4]).aggregate(['mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,DkS_max_ROC_test,DkS_max_ROC_train,DkSc_mean_ROC_test,DkSc_mean_ROC_train,DkSc_med_ROC_test,DkSc_med_ROC_train,Mean-CV-ROC,ROC-AUC_test,ROC-AUC_train
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean
Model name,protein,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
AdaBoost,CDK2,0.868011,0.815904,0.722248,0.722876,0.7241,0.724314,0.71958,0.722978,0.999851
AdaBoost,FXA,0.922667,0.900989,0.827256,0.833909,0.830694,0.837422,0.818488,0.804311,0.997642
Bagg,CDK2,0.859411,0.815811,0.721163,0.72324,0.723589,0.724246,0.80737,0.8058,0.85582
Bagg,FXA,0.921133,0.903923,0.823018,0.835489,0.830422,0.839094,0.892827,0.8844,0.919146
LogReg,CDK2,0.877178,0.814216,0.726086,0.721744,0.7275,0.723217,0.82763,0.850756,1.0
LogReg,FXA,0.933767,0.899415,0.839885,0.829528,0.846506,0.832299,0.792346,0.802756,1.0
RandForest,CDK2,0.865922,0.812504,0.726237,0.721208,0.727839,0.722398,0.763031,0.786689,0.937301
RandForest,FXA,0.917656,0.906131,0.825425,0.83462,0.83,0.837715,0.878809,0.870767,0.95643
XGB,CDK2,0.875233,0.812332,0.726259,0.721623,0.728328,0.722637,0.76171,0.764489,1.0
XGB,FXA,0.908278,0.907042,0.802489,0.842088,0.808006,0.846431,0.859877,0.813567,1.0
