## Compute Consensus Scoring using Protein Conformational Selection

In [1]:
import pandas as pd
import numpy as np
import glob, os, sys
sys.path.append('..')
from modules.run_or_load_decorator import run_or_load

### Load the data
Load the data frame containing the raw docking scoring results.

In [2]:
file_name = './df_DkSc_results_COCRYS_CSAR_DEKOIS_DUD.pkl'
X_merged_dksc = pd.read_pickle(file_name)
# Extract activity column
y_true_merged = X_merged_dksc['activity']
# Drop column from merged_dkksc
X_merged_dksc = X_merged_dksc.drop('activity', axis=1)
X_merged_dksc.shape

(3466, 402)

### Load train test splitting murcko data frame
Load the murcko scaffolds dataframe to perform scaffold splitting

In [3]:
sys.path.append('../2_Docking_analysis/')
from scaffold_splitter import train_test_scaffold_split

# Compute or load the dataframe containing the Generic Murcko Scaffolds
file = '../2_Docking_analysis/df_COCRYS_CSAR_DUD_DEKOIS_Murcko_Scaffolds_SMILES.obj'

df_scff_murcko = pd.read_pickle(file)
df_scff_murcko.shape

(3466, 3)

### Read RFE Selectors to get the preselected features

In [4]:
%run ./5_Helper_get_RFE_preselected_conformations.ipynb

In [5]:
# Instead of been a list as with ML results, here selectors is a dictionary containing selector name and preselected conformations
rfe_preselections.keys()

dict_keys(['LR_rand', 'RF_rand', 'XGB_rand', 'LR_scff', 'RF_scff', 'XGB_scff'])

### Import plotmetrics module to evaluate docking results

In [6]:
%run ../modules/plotting_metrics.py

### Import Consensus Scoring Related Functions

In [7]:
%run ./5_Helper_Consensus_Scoring.ipynb

### List of parameters to evaluate Consensus scoring

In [8]:
# none performs Consensus scoring using all molecules, scff and rand applies CS method over test set
splitting_methods = ['none', 'scff', 'rand'] 

scaffold_series = df_scff_murcko['scff_generic']

# Conf. Selectors
selectors = ['rand', 'LR', 'RF', 'XGB']

# Consensus Scoring Methods
consensus_methods = {
    'MEAN': get_mean_score,
    'MED':  get_median_score,
    'RANK': get_rank_score,
    'MIN':  get_min_score,
    'MAX':  get_max_score,
    'EUN':  get_euc_norm_score,
    'VOTE': get_vote_score,
    'ECR':  get_exp_consensus_ranking
}

# List of parameters to compute
roc_params = {'metric_name': 'roc_auc'}
nef_params = {'metric_name': 'nef_auc'}
pr_params = {'metric_name': 'pr_auc'}

# The Ra value for the testing set in FXa is 75/1559 = 0.05
# Therefore the maximum value of alpha for bedroc could be a=20
bedroc_20 = {'metric_name': 'bedroc', 'alpha': 20}
bedroc_10 = {'metric_name': 'bedroc', 'alpha': 10}
bedroc_2 = {'metric_name': 'bedroc', 'alpha': 2}
bedroc_05 = {'metric_name': 'bedroc', 'alpha': 0.5}

# ef values 0.001, 0.005, 0.02, 0.1, 0.2
ef_0001 = {'metric_name': 'ef', 'fraction': 0.001}
ef_0005 = {'metric_name': 'ef', 'fraction': 0.005}
ef_002 = {'metric_name': 'ef', 'fraction': 0.02}
ef_02 = {'metric_name': 'ef', 'fraction': 0.2}

# List of metrics
metrics = [roc_params, nef_params, pr_params,
           bedroc_20, bedroc_10, bedroc_2, bedroc_05,
           ef_0001, ef_0005, ef_002, ef_02]

# Run Analysis

In [9]:
%%time

prot_name = 'CDK2'
base_filename = f'./consensus_scoring_results/{prot_name}_CS_results_conformational_selection'

for metric_eval in metrics:
    metric_name = '_'.join([str(i) for i in metric_eval.values()])
    print(metric_name)
    df = aggregate_conf_selection_results_CS(f'{base_filename}-{metric_name}.obj', 
                                             X, y, 
                                             splitting_methods=splitting_methods, 
                                             selectors=selectors,
                                             cs_methods=consensus_methods, 
                                             metrics=[metric_eval], 
                                             nreps=10, 
                                             scaffold_series=scaffold_series)

roc_auc
File loaded: ./consensus_scoring_results/CDK2_CS_results_conformational_selection-roc_auc.obj
nef_auc
File loaded: ./consensus_scoring_results/CDK2_CS_results_conformational_selection-nef_auc.obj
pr_auc
File loaded: ./consensus_scoring_results/CDK2_CS_results_conformational_selection-pr_auc.obj
bedroc_20
File loaded: ./consensus_scoring_results/CDK2_CS_results_conformational_selection-bedroc_20.obj
bedroc_10
File loaded: ./consensus_scoring_results/CDK2_CS_results_conformational_selection-bedroc_10.obj
bedroc_2
File loaded: ./consensus_scoring_results/CDK2_CS_results_conformational_selection-bedroc_2.obj
bedroc_0.5
none/rand/MEAN/bedroc_0.5
none/rand/MED/bedroc_0.5
none/rand/RANK/bedroc_0.5
none/rand/MIN/bedroc_0.5
none/rand/MAX/bedroc_0.5
none/rand/EUN/bedroc_0.5
none/rand/VOTE/bedroc_0.5
none/rand/ECR/bedroc_0.5
scff/rand/MEAN/bedroc_0.5
scff/rand/MED/bedroc_0.5
scff/rand/RANK/bedroc_0.5
scff/rand/MIN/bedroc_0.5
scff/rand/MAX/bedroc_0.5
scff/rand/EUN/bedroc_0.5
scff/rand/VOTE

scff/XGB/MED/ef_0.2
scff/XGB/RANK/ef_0.2
scff/XGB/MIN/ef_0.2
scff/XGB/MAX/ef_0.2
scff/XGB/EUN/ef_0.2
scff/XGB/VOTE/ef_0.2
scff/XGB/ECR/ef_0.2
rand/rand/MEAN/ef_0.2
rand/rand/MED/ef_0.2
rand/rand/RANK/ef_0.2
rand/rand/MIN/ef_0.2
rand/rand/MAX/ef_0.2
rand/rand/EUN/ef_0.2
rand/rand/VOTE/ef_0.2
rand/rand/ECR/ef_0.2
rand/LR/MEAN/ef_0.2
rand/LR/MED/ef_0.2
rand/LR/RANK/ef_0.2
rand/LR/MIN/ef_0.2
rand/LR/MAX/ef_0.2
rand/LR/EUN/ef_0.2
rand/LR/VOTE/ef_0.2
rand/LR/ECR/ef_0.2
rand/RF/MEAN/ef_0.2
rand/RF/MED/ef_0.2
rand/RF/RANK/ef_0.2
rand/RF/MIN/ef_0.2
rand/RF/MAX/ef_0.2
rand/RF/EUN/ef_0.2
rand/RF/VOTE/ef_0.2
rand/RF/ECR/ef_0.2
rand/XGB/MEAN/ef_0.2
rand/XGB/MED/ef_0.2
rand/XGB/RANK/ef_0.2
rand/XGB/MIN/ef_0.2
rand/XGB/MAX/ef_0.2
rand/XGB/EUN/ef_0.2
rand/XGB/VOTE/ef_0.2
rand/XGB/ECR/ef_0.2
File saved: ./consensus_scoring_results/CDK2_CS_results_conformational_selection-ef_0.2.obj
CPU times: user 11h 33min 54s, sys: 8min 48s, total: 11h 42min 43s
Wall time: 11h 42min 45s


## Prepare results 

In [11]:
from glob import glob
prot_name = 'CDK2'
base_filename = f'./consensus_scoring_results/{prot_name}_CS_results_conformational_selection'

files = glob(base_filename + '*')
df = pd.concat([pd.read_pickle(i) for i in files]).round(4)
df.to_pickle(f'./{prot_name}_dash_app_Consensus_results.obj')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,0,1,2,3,4,5,6,7,8,9,...,392,393,394,395,396,397,398,399,400,401
split,selector,consensus,metric,desc,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
none,rand,MEAN,ef_0.001,mean,2.9232,3.1320,3.5496,3.3408,3.1320,2.7144,2.9232,3.7584,2.7144,3.1320,...,2.0880,2.0880,2.0880,2.0880,2.0880,2.0880,2.0880,2.0880,2.0880,2.0880
none,rand,MEAN,ef_0.001,std,2.2445,1.4764,1.0086,1.4599,1.1005,1.0086,1.0782,1.3206,1.0086,1.1005,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
none,rand,MED,ef_0.001,mean,2.9232,2.7144,3.1320,3.7584,3.5496,2.7144,3.5496,3.5496,3.1320,3.5496,...,2.0880,2.0880,2.0880,2.0880,2.0880,2.0880,2.0880,2.0880,2.0880,2.0880
none,rand,MED,ef_0.001,std,1.7608,1.4093,1.1005,1.3206,1.4093,1.0086,1.7190,1.0086,1.4764,1.7190,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
none,rand,RANK,ef_0.001,mean,2.2968,3.9672,3.7584,2.7144,3.7584,3.7584,3.5496,2.9232,3.3408,3.5496,...,4.1760,4.1760,4.1760,4.1760,4.1760,4.1760,4.1760,4.1760,4.1760,4.1760
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rand,XGB,EUN,ef_0.005,std,1.9571,1.7924,1.5818,2.0869,1.7576,1.2304,1.6107,0.8786,1.6108,1.8347,...,1.8009,1.2304,1.8350,1.9253,1.5819,1.7221,1.5818,1.4600,1.8349,1.1787
rand,XGB,VOTE,ef_0.005,mean,3.5013,3.1679,2.5009,3.6682,2.6676,2.8346,4.0017,2.6676,4.3350,4.0016,...,4.0017,3.1681,3.1680,3.5015,3.5015,3.5014,4.0018,3.0011,3.1680,3.3346
rand,XGB,VOTE,ef_0.005,std,2.6597,1.6581,1.1792,1.5322,1.7924,1.3728,1.4059,1.9571,1.9570,2.1089,...,1.4059,0.9466,1.8349,1.6581,1.6581,1.6581,0.8608,2.0496,1.9962,1.5721
rand,XGB,ECR,ef_0.005,mean,3.6681,1.8341,1.8339,2.1674,2.3343,3.1679,3.0012,3.1680,2.5010,2.8345,...,3.1680,3.1679,2.3342,2.5011,3.3347,2.8346,2.3342,2.6678,3.1680,2.1676
