## Compute Consensus Scoring using Protein Conformational Selection

In [1]:
import pandas as pd
import numpy as np
import glob, os, sys
sys.path.append('..')
from modules.run_or_load_decorator import run_or_load

### Load the data
Load the data frame containing the raw docking scoring results.

In [2]:
file_name = './df_DkSc_results_COCRYS_DEKOIS_DUD.pkl'
X_merged_dksc = pd.read_pickle(file_name)
# Extract activity column
y_true_merged = X_merged_dksc['activity']
# Drop column from merged_dkksc
X_merged_dksc = X_merged_dksc.drop('activity', axis=1)
X_merged_dksc.shape

(6233, 136)

### Load train test splitting murcko data frame
Load the murcko scaffolds dataframe to perform scaffold splitting

In [6]:
sys.path.append('../2_Docking_analysis/')
from scaffold_splitter import train_test_scaffold_split

# Compute or load the dataframe containing the Generic Murcko Scaffolds
file = './../2_Docking_analysis/df_COCRYS_DUD_DEKOIS_Murcko_Scaffolds_SMILES.obj'

df_scff_murcko = pd.read_pickle(file)
df_scff_murcko.shape

(6233, 3)

### Read RFE Selectors to get the preselected features

In [9]:
%run ./5_Helper_get_RFE_preselected_conformations.ipynb

In [10]:
# Instead of been a list as with ML results, here selectors is a dictionary containing selector name and preselected conformations
rfe_preselections.keys()

dict_keys(['LR_rand', 'RF_rand', 'XGB_rand', 'LR_scff', 'RF_scff', 'XGB_scff'])

### Import plotmetrics module to evaluate docking results

In [11]:
%run ../modules/plotting_metrics.py

### Import Consensus Scoring Related Functions

In [12]:
%run ./5_Helper_Consensus_Scoring.ipynb

### List of parameters to evaluate Consensus scoring

In [13]:
# none performs Consensus scoring using all molecules, scff and rand applies CS method over test set
splitting_methods = ['none', 'scff', 'rand'] 

scaffold_series = df_scff_murcko['scff_generic']

# Conf. Selectors
selectors = ['rand', 'LR', 'RF', 'XGB']

# Consensus Scoring Methods
consensus_methods = {
    'MEAN': get_mean_score,
    'MED':  get_median_score,
    'RANK': get_rank_score,
    'MIN':  get_min_score,
    'MAX':  get_max_score,
    'EUN':  get_euc_norm_score,
    'VOTE': get_vote_score,
    'ECR':  get_exp_consensus_ranking
}

# List of parameters to compute
roc_params = {'metric_name': 'roc_auc'}
nef_params = {'metric_name': 'nef_auc'}
pr_params = {'metric_name': 'pr_auc'}

# The Ra value for the testing set in FXa is 75/1559 = 0.05
# Therefore the maximum value of alpha for bedroc could be a=20
bedroc_20 = {'metric_name': 'bedroc', 'alpha': 20}
bedroc_10 = {'metric_name': 'bedroc', 'alpha': 10}
bedroc_2 = {'metric_name': 'bedroc', 'alpha': 2}
bedroc_05 = {'metric_name': 'bedroc', 'alpha': 0.5}

# ef values 0.001, 0.005, 0.02, 0.1, 0.2
ef_0001 = {'metric_name': 'ef', 'fraction': 0.001}
ef_0005 = {'metric_name': 'ef', 'fraction': 0.005}
ef_002 = {'metric_name': 'ef', 'fraction': 0.02}
ef_02 = {'metric_name': 'ef', 'fraction': 0.2}

# List of metrics
metrics = [roc_params, nef_params, pr_params,
           bedroc_20, bedroc_10, bedroc_2, bedroc_05,
           ef_0001, ef_0005, ef_002, ef_02]

# Run Analysis

In [9]:
%%time

prot_name = 'FXA'
base_filename = f'./consensus_scoring_results/{prot_name}_CS_results_conformational_selection'

for metric_eval in metrics:
    metric_name = '_'.join([str(i) for i in metric_eval.values()])
    print(metric_name)
    df = aggregate_conf_selection_results_CS(f'{base_filename}-{metric_name}.obj', 
                                             X, y, 
                                             splitting_methods=splitting_methods, 
                                             selectors=selectors,
                                             cs_methods=consensus_methods, 
                                             metrics=[metric_eval], 
                                             nreps=15, 
                                             scaffold_series=scaffold_series)

roc_auc
File loaded: ./consensus_scoring_results/FXA_CS_results_conformational_selection-roc_auc.obj
nef_auc
File loaded: ./consensus_scoring_results/FXA_CS_results_conformational_selection-nef_auc.obj
pr_auc
File loaded: ./consensus_scoring_results/FXA_CS_results_conformational_selection-pr_auc.obj
bedroc_20
File loaded: ./consensus_scoring_results/FXA_CS_results_conformational_selection-bedroc_20.obj
bedroc_10
File loaded: ./consensus_scoring_results/FXA_CS_results_conformational_selection-bedroc_10.obj
bedroc_2
File loaded: ./consensus_scoring_results/FXA_CS_results_conformational_selection-bedroc_2.obj
bedroc_0.5
File loaded: ./consensus_scoring_results/FXA_CS_results_conformational_selection-bedroc_0.5.obj
ef_0.001
File loaded: ./consensus_scoring_results/FXA_CS_results_conformational_selection-ef_0.001.obj
ef_0.005
File loaded: ./consensus_scoring_results/FXA_CS_results_conformational_selection-ef_0.005.obj
ef_0.02
File loaded: ./consensus_scoring_results/FXA_CS_results_conformat

## Processing and saving Results

In [10]:
from glob import glob
prot_name = 'FXA'
base_filename = f'./consensus_scoring_results/{prot_name}_CS_results_conformational_selection'

files = glob(base_filename + '*')
df = pd.concat([pd.read_pickle(i) for i in files]).round(4)
df.to_pickle(f'./{prot_name}_dash_app_Consensus_results.obj')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,0,1,2,3,4,5,6,7,8,9,...,126,127,128,129,130,131,132,133,134,135
split,selector,consensus,metric,desc,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
none,rand,RbN,nef_auc,mean,0.6113,0.6160,0.6257,0.6264,0.6313,0.6419,0.6301,0.6343,0.6371,0.6380,...,0.6451,0.6445,0.6451,0.6449,0.6450,0.6451,0.6449,0.6449,0.6450,0.6450
none,rand,RbN,nef_auc,std,0.0384,0.0244,0.0190,0.0079,0.0181,0.0113,0.0142,0.0102,0.0124,0.0124,...,0.0008,0.0009,0.0006,0.0007,0.0005,0.0006,0.0005,0.0006,0.0004,0.0000
none,rand,RbR,nef_auc,mean,0.6131,0.6281,0.6302,0.6346,0.6302,0.6286,0.6287,0.6362,0.6351,0.6353,...,0.6383,0.6387,0.6387,0.6386,0.6379,0.6385,0.6383,0.6384,0.6381,0.6380
none,rand,RbR,nef_auc,std,0.0222,0.0192,0.0233,0.0124,0.0178,0.0169,0.0139,0.0113,0.0126,0.0070,...,0.0009,0.0008,0.0007,0.0007,0.0007,0.0007,0.0005,0.0005,0.0004,0.0000
none,rand,BS,nef_auc,mean,0.6161,0.6122,0.6282,0.6289,0.6205,0.6253,0.6254,0.6241,0.6249,0.6201,...,0.6230,0.6233,0.6238,0.6237,0.6233,0.6224,0.6232,0.6235,0.6234,0.6240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rand,XGB,RbR,ef_0.2,std,0.2778,0.1957,0.1909,0.2495,0.2392,0.2211,0.2982,0.2230,0.2184,0.1830,...,0.1880,0.1942,0.2042,0.2136,0.2324,0.2880,0.1814,0.2328,0.2554,0.2183
rand,XGB,BS,ef_0.2,mean,1.5722,1.8965,2.1054,1.9719,1.9276,2.0609,1.8655,1.7767,1.9232,1.8432,...,1.9809,2.1096,1.9764,1.9587,1.9587,2.0831,2.0341,2.0964,2.0519,2.0875
rand,XGB,BS,ef_0.2,std,0.2989,0.1508,0.2118,0.2277,0.3048,0.2201,0.2544,0.2290,0.3030,0.2707,...,0.2611,0.2219,0.2291,0.2360,0.2566,0.1771,0.2292,0.2058,0.1535,0.2176
rand,XGB,ECR,ef_0.2,mean,1.4567,2.0475,2.0431,2.1097,2.0565,2.1009,2.1275,2.1630,1.9765,2.1185,...,2.1230,2.2341,2.2297,2.2829,2.1631,2.1763,2.1629,2.0697,2.0831,2.1986
