## Compute Consensus Scoring using Protein Conformational Selection

In [1]:
import pandas as pd
import numpy as np
import glob, os, sys
sys.path.append('..')
from modules.run_or_load_decorator import run_or_load

### Load the data
Load the data frame containing the raw docking scoring results.

In [2]:
file_name = './df_DkSc_results_COCRYS_CSAR_DEKOIS_DUD.pkl'
X_merged_dksc = pd.read_pickle(file_name)
# Extract activity column
y_true_merged = X_merged_dksc['activity']
# Drop column from merged_dkksc
X_merged_dksc = X_merged_dksc.drop('activity', axis=1)
X_merged_dksc.shape

(3466, 402)

### Load train test splitting murcko data frame
Load the murcko scaffolds dataframe to perform scaffold splitting

In [3]:
sys.path.append('../2_Docking_analysis/')
from scaffold_splitter import train_test_scaffold_split

# Compute or load the dataframe containing the Generic Murcko Scaffolds
file = '../2_Docking_analysis/df_COCRYS_CSAR_DUD_DEKOIS_Murcko_Scaffolds_SMILES.obj'

df_scff_murcko = pd.read_pickle(file)
df_scff_murcko.shape

(3466, 3)

### Read RFE Selectors to get the preselected features

In [4]:
%run ./5_Helper_get_RFE_preselected_conformations.ipynb

In [5]:
# Instead of been a list as with ML results, here selectors is a dictionary containing selector name and preselected conformations
rfe_preselections.keys()

dict_keys(['LR_rand', 'RF_rand', 'XGB_rand', 'LR_scff', 'RF_scff', 'XGB_scff'])

### Import plotmetrics module to evaluate docking results

In [18]:
%run ../modules/plotting_metrics.py

### Import Consensus Scoring Related Functions

In [19]:
%run ./5_Helper_Consensus_Scoring.ipynb

### List of parameters to evaluate Consensus scoring

In [20]:
# none performs Consensus scoring using all molecules, scff and rand applies CS method over test set
splitting_methods = ['none', 'scff', 'rand'] 

scaffold_series = df_scff_murcko['scff_generic']

# Conf. Selectors
selectors = ['rand', 'LR', 'RF', 'XGB']

# Consensus Scoring Methods
consensus_methods = {
    'MEAN': get_mean_score,
    'MED':  get_median_score,
    'RANK': get_rank_score,
    'VOTE': get_vote_score,
    'MIN':  get_min_score,
    'MAX':  get_max_score,
    'EUN':  get_euc_norm_score,
    'ECR':  get_exp_consensus_ranking
}

# List of parameters to compute
roc_params = {'metric_name': 'roc_auc'}
nef_params = {'metric_name': 'nef_auc'}
pr_params = {'metric_name': 'pr_auc'}

# The Ra value for the testing set in FXa is 75/1559 = 0.05
# Therefore the maximum value of alpha for bedroc could be a=20
bedroc_20 = {'metric_name': 'bedroc', 'alpha': 20}
bedroc_10 = {'metric_name': 'bedroc', 'alpha': 10}
bedroc_2 = {'metric_name': 'bedroc', 'alpha': 2}
bedroc_05 = {'metric_name': 'bedroc', 'alpha': 0.5}

# ef values 0.001, 0.005, 0.02, 0.1, 0.2
ef_0001 = {'metric_name': 'ef', 'fraction': 0.001}
ef_0005 = {'metric_name': 'ef', 'fraction': 0.005}
ef_002 = {'metric_name': 'ef', 'fraction': 0.02}
ef_02 = {'metric_name': 'ef', 'fraction': 0.2}

# List of metrics
metrics = [roc_params, nef_params, pr_params,
           bedroc_20, bedroc_10, bedroc_2, bedroc_05,
           ef_0001, ef_0005, ef_002, ef_02]

# Run Analysis

In [21]:
%%time

prot_name = 'CDK2'
base_filename = f'./consensus_scoring_results/{prot_name}_CS_results_conformational_selection'

for metric_eval in metrics:
    metric_name = '_'.join([str(i) for i in metric_eval.values()])
    print(metric_name)
    df = aggregate_conf_selection_results_CS(f'{base_filename}-{metric_name}.obj', 
                                             X, y, 
                                             splitting_methods=splitting_methods, 
                                             selectors=selectors,
                                             cs_methods=consensus_methods, 
                                             metrics=[metric_eval], 
                                             nreps=15, 
                                             scaffold_series=scaffold_series)

roc_auc
File loaded: ./consensus_scoring_results/CDK2_CS_results_conformational_selection-roc_auc.obj
nef_auc
File loaded: ./consensus_scoring_results/CDK2_CS_results_conformational_selection-nef_auc.obj
pr_auc
File loaded: ./consensus_scoring_results/CDK2_CS_results_conformational_selection-pr_auc.obj
bedroc_20
File loaded: ./consensus_scoring_results/CDK2_CS_results_conformational_selection-bedroc_20.obj
bedroc_10
File loaded: ./consensus_scoring_results/CDK2_CS_results_conformational_selection-bedroc_10.obj
bedroc_2
File loaded: ./consensus_scoring_results/CDK2_CS_results_conformational_selection-bedroc_2.obj
bedroc_0.5
File loaded: ./consensus_scoring_results/CDK2_CS_results_conformational_selection-bedroc_0.5.obj
ef_0.001
File loaded: ./consensus_scoring_results/CDK2_CS_results_conformational_selection-ef_0.001.obj
ef_0.005
File loaded: ./consensus_scoring_results/CDK2_CS_results_conformational_selection-ef_0.005.obj
ef_0.02
File loaded: ./consensus_scoring_results/CDK2_CS_results

## Prepare results 

In [27]:
from glob import glob
prot_name = 'CDK2'
base_filename = f'./consensus_scoring_results/{prot_name}_CS_results_conformational_selection'

files = glob(base_filename + '*')
df = pd.concat([pd.read_pickle(i) for i in files]).round(4)
df.to_pickle(f'./{prot_name}_dash_app_Consensus_results.obj')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,0,1,2,3,4,5,6,7,8,9,...,392,393,394,395,396,397,398,399,400,401
split,selector,consensus,metric,desc,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
none,rand,RbN,ef_0.001,mean,3.7584,3.6192,2.7840,3.2016,2.7840,3.0624,3.4800,2.5056,3.0624,2.3664,...,2.0880,2.0880,2.0880,2.0880,2.0880,2.0880,2.0880,2.0880,2.0880,2.0880
none,rand,RbN,ef_0.001,std,1.6174,1.6679,1.2887,1.5519,1.0188,1.3362,1.2887,0.8645,1.3362,0.7347,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
none,rand,RbR,ef_0.001,mean,3.0624,3.0624,3.4800,4.3152,3.8976,3.3408,3.2016,2.9232,3.3408,3.6192,...,4.1760,4.1760,4.1760,4.1760,4.1760,4.1760,4.1760,4.1760,4.1760,4.1760
none,rand,RbR,ef_0.001,std,2.3500,1.5519,1.5112,1.4694,1.5519,1.3206,1.5519,1.3206,1.3206,1.4694,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
none,rand,BS,ef_0.001,mean,2.6448,3.2016,3.4800,3.2016,2.6448,3.3408,2.7840,2.2272,1.6704,3.4800,...,4.1760,3.8976,3.8976,4.1760,4.1760,4.1760,4.1760,4.1760,4.1760,4.1760
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rand,XGB,RbR,ef_0.005,std,1.4736,1.2605,1.2916,2.0711,1.6433,1.8551,1.4733,2.1090,1.7599,1.1271,...,1.8766,1.0669,1.3901,1.7449,1.7676,0.8453,1.6025,1.8337,2.1710,1.5691
rand,XGB,BS,ef_0.005,mean,3.2235,2.6677,2.4453,2.7789,3.2235,2.7788,3.1125,2.7789,2.2229,2.6678,...,3.1123,3.0012,2.2231,2.7789,2.8901,3.2235,2.6678,2.6678,2.2231,2.3343
rand,XGB,BS,ef_0.005,std,1.6027,1.3809,1.3904,1.6273,1.6027,1.5003,1.3903,1.6273,1.3615,1.6434,...,2.0774,1.6911,1.6273,1.8553,1.3320,1.7221,1.2287,2.0710,2.1525,1.2287
rand,XGB,ECR,ef_0.005,mean,2.8899,1.6673,2.6678,2.5566,3.3347,2.8900,3.0013,2.8900,3.3347,3.1124,...,3.4459,2.4455,2.8900,2.8901,3.2235,3.2237,2.6677,2.0007,3.3347,2.3343
