# ML Evaluation: Conformational selection

In [1]:
import pandas as pd
import numpy as np
import pickle
import glob, sys, os
sys.path.append('..')

In [2]:
from modules.run_or_load_decorator import run_or_load

In [3]:
%run ../modules/plotting_metrics.py

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white', context='talk', font_scale=0.8)

In [5]:
file_name = './df_DkSc_results_COCRYS_CSAR_DEKOIS_DUD.pkl'
X_merged_dksc = pd.read_pickle(file_name)
# Extract activity column
y_true_merged = X_merged_dksc['activity']
# Drop column from merged_dkksc
X_merged_dksc = X_merged_dksc.drop('activity', axis=1)

In [6]:
sys.path.append('../2_Docking_analysis/')
from scaffold_splitter import train_test_scaffold_split

# Compute or load the dataframe containing the Generic Murcko Scaffolds
file = '../2_Docking_analysis/df_COCRYS_CSAR_DUD_DEKOIS_Murcko_Scaffolds_SMILES.obj'

df_scff_murcko = pd.read_pickle(file)

In [7]:
# Read dictionary of results from notebook 6
TRAIN_DB = 'MergedDB'
TEST_DB = 'MergedDB'
SCORE_TYPE = 'DkSc'
scaffold_series = df_scff_murcko['scff_generic']

FILEPATH = f'./ml_models/conf_selection_evaluation/{SCORE_TYPE}_{TRAIN_DB}-{TRAIN_DB}'

def format_filename(*args):
    return '_'.join([*args] ) + '.obj'

def format_name(*args, sep='/'):
    return sep.join([*args])

In [8]:
def process_best_dk_score(splitting_method, 
                           selector_name, 
                           classifier_name,
                           X_dksc_raw,
                           metric='roc_auc', **metric_params):
    SPLIT = splitting_method
    SELEC = selector_name
    CLF = classifier_name
    
    # Read the file
    if selector_name == 'rand':
        sel_str =  f'randSel'
    else:
        sel_str =  f'rfeSel-{SELEC}'
        
    file = FILEPATH + f'_{SPLIT}Split_{CLF}_{sel_str}_nreps15.obj'
    with open(file, 'rb') as f:
        main_dict = pickle.load(f)
        
    ref_scores_list = []
    # Iterate over reps
    for rep in main_dict.keys():
        test_index = main_dict[rep]['y_true_index']
        y_true = main_dict[rep]['y_true']
        
        # Subset docking scores matrix (X_dksc) to evaluate using only those molecules
        X_test_dks = X_dksc_raw.loc[test_index]
        X_test_dks = {col: X_test_dks[col].values for col in X_test_dks.columns}
        
        # Compute the requested metric using the docking scores and keep the max score
        metric_results = PlotMetric(y_true=y_true, y_pred_dict=X_test_dks, 
                                 decreasing=True).format_metric_results(metric_name=metric, **metric_params)
        max_score = metric_results.max().values[0]
        # Save the max score
        ref_scores_list.append(max_score)
        
    # Get the max score and format as dataframe with a multiindex
    # If the metric is EF or BEDROC and depends on a hyperparameter, concat this value to the metric name
    if len({**metric_params}) > 0:
        parm_str = str([*{**metric_params}.values()][0])
        metric = metric + '_' + parm_str
    
    idx = pd.MultiIndex.from_product([[SPLIT], [SELEC], [CLF], [metric]])
    max_score = max(ref_scores_list)
    median_score = np.median(ref_scores_list)
    max_score_df = pd.DataFrame({'best_dksc': max_score, 'median_dksc': median_score}, index=idx)
    return max_score_df

In [9]:
process_best_dk_score('rand', 'RF', 'LogReg', X_merged_dksc, 'nef_auc')

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,best_dksc,median_dksc
rand,RF,LogReg,nef_auc,0.752,0.687


In [10]:
def process_conf_selection_results(splitting_method, 
                                   selector_name, 
                                   classifier_name,
                                   metric, **metric_params):
    '''Reads a pickle file which contains y_pred arrays for a given selection method for 
    a k number if conformations; with k is in range of 1 to N conformations.'''
    SPLIT = splitting_method
    SELEC = selector_name
    CLF = classifier_name
    
    # Read the file
    if selector_name == 'rand':
        sel_str =  f'randSel'
    else:
        sel_str =  f'rfeSel-{SELEC}'
        
    file = FILEPATH + f'_{SPLIT}Split_{CLF}_{sel_str}_nreps15.obj'
    with open(file, 'rb') as f:
        main_dict = pickle.load(f)

    dic_metrics = {}

    # Iterate over reps
    for rep in main_dict.keys():
        # Extract y_true
        y_true = main_dict[rep]['y_true']
        # Extract the dict of predicted values
        dict_y_preds = main_dict[rep].copy()
        del dict_y_preds['y_true_index']

        # Compute the evaluation metric
        metric_results = PlotMetric(y_true=y_true, y_pred_dict=dict_y_preds, 
                                 decreasing=False).format_metric_results(metric_name=metric,
                                                                         **metric_params)
        dic_metrics[rep] = metric_results.T.values[0]
    
    # Get the max score and format as dataframe with a multiindex
    # If the metric is EF or BEDROC and depends on a hyperparameter, concat this value to the metric name
    if len({**metric_params}) > 0:
        parm_str = str([*{**metric_params}.values()][0])
        metric = metric + '_' + parm_str

    # convert results into a dataframe
    df_metrics = pd.DataFrame(dic_metrics)
    # Keep only mean and standard deviation
    df_metrics = df_metrics.apply([np.mean, np.std], axis=1).fillna(0)
    # Rename columns following the split-selection-classifier-metric pattern
    df_metrics.columns = [format_name(SPLIT, SELEC, CLF, metric, 'mean'),
                         format_name(SPLIT, SELEC, CLF, metric, 'std')]
    # Return the transposed matrix
    return df_metrics.T

In [19]:
splitting_methods = ['scff', 'rand']
selectors = ['rand', 'LR', 'RF', 'XGB']
classifiers = ['LogReg', 'rbfSVC', 'XGB_tree', '1NN']

# List of parameters to compute
roc_params = {'metric': 'roc_auc'}
nef_params = {'metric': 'nef_auc'}
pr_params = {'metric': 'pr_auc'}

# The Ra value for the testing set in FXa is 75/1559 = 0.05
# Therefore the maximum value of alpha for bedroc could be a=20
bedroc_20 = {'metric': 'bedroc', 'alpha': 20}
bedroc_10 = {'metric': 'bedroc', 'alpha': 10}
bedroc_2 = {'metric': 'bedroc', 'alpha': 2}
bedroc_05 = {'metric': 'bedroc', 'alpha': 0.5}

# ef values 0.001, 0.005, 0.02, 0.1, 0.2
ef_0001 = {'metric': 'ef', 'fraction': 0.001}
ef_0005 = {'metric': 'ef', 'fraction': 0.005}
ef_002 = {'metric': 'ef', 'fraction': 0.02}
ef_02 = {'metric': 'ef', 'fraction': 0.2}

# List of metrics
metrics = [roc_params, nef_params, pr_params,
           bedroc_20, bedroc_10, bedroc_2, bedroc_05,
           ef_0001, ef_0005, ef_002, ef_02]

## Create the output tables as a pickle object

In [20]:

@run_or_load
def aggregate_conf_selection_results(
    filename,
    splitting_methods,
    selectors,
    classifiers,
    metrics,
    X_dksc_raw):
    
    #****************************************
    # Compute Results from Raw Docking Scores
    #****************************************
    results_dksc = []
    for metric in metrics:
        for split in splitting_methods:
            for selec in selectors:
                for clf in classifiers:
                    result = process_best_dk_score(
                       splitting_method = split, 
                           selector_name = selec, 
                           classifier_name = clf,
                           X_dksc_raw = X_dksc_raw,
                           **metric)
                    results_dksc.append(result)

    # Format the dataframe
    X_dksc = pd.concat(results_dksc)
    X_dksc.rename_axis(("split", "selector", "classifier", "metric"), inplace=True)

    #************************************************
    # Compute Results from ML Models over 1 to N confs
    #************************************************
    results_ml = []
    for metric in metrics:
        for split in splitting_methods:
            for selec in selectors:
                for clf in classifiers:
                    result_ml = process_conf_selection_results(
                        splitting_method = split, 
                           selector_name = selec, 
                           classifier_name = clf,
                           **metric)
                    results_ml.append(result_ml)

    # Format the dataframe       
    X_ml = pd.concat(results_ml)
    X_ml.index = X_ml.index.str.split('/', expand=True)
    X_ml.rename_axis(("split", "selector", "classifier", "metric", "desc"), inplace=True)

    # Return a dictionary of results
    results_dict = {'X_dksc': X_dksc,
                    'X_ml': X_ml}
    
    return results_dict

In [21]:
%%time
filename = './CDK2_ML_results_conformational_selection.obj'

ALL_RESULTS = aggregate_conf_selection_results(
    filename,
    splitting_methods,
    selectors,
    classifiers,
    metrics,
    X_dksc_raw=X_merged_dksc)


File saved: ./CDK2_ML_results_conformational_selection.obj
CPU times: user 27min 20s, sys: 6.35 s, total: 27min 26s
Wall time: 27min 35s


In [16]:
X_merged_dksc

Unnamed: 0_level_0,Unnamed: 1_level_0,1aq1,1b38,1b39,1buh,1ckp,1di8,1dm2,1e1v,1e1x,1e9h,...,6q4b,6q4c,6q4d,6q4e,6q4f,6q4g,6q4h,6q4i,6q4j,6q4k
library,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
COCRYS,STU,-9.3,-7.3,-5.6,-6.4,-6.3,-6.3,-5.6,-7.9,-5.9,-6.1,...,-7.6,-7.1,-6.4,-6.9,-6.9,-7.3,-6.9,-7.4,-6.4,-7.2
COCRYS,ATP,-8.9,-9.3,-9.5,-9.0,-7.3,-8.0,-8.3,-8.2,-9.4,-7.4,...,-9.0,-8.4,-8.1,-8.0,-7.7,-7.3,-9.3,-8.8,-8.4,-7.6
COCRYS,PVB,-8.1,-7.4,-7.2,-6.4,-6.8,-7.8,-8.3,-8.0,-7.9,-7.5,...,-7.0,-6.8,-6.5,-6.7,-7.5,-6.6,-7.8,-7.2,-7.4,-7.4
COCRYS,DTQ,-9.4,-7.6,-7.9,-7.3,-7.9,-9.4,-8.0,-8.8,-9.2,-8.5,...,-7.4,-7.3,-7.5,-7.5,-7.9,-7.3,-8.3,-7.8,-8.6,-7.4
COCRYS,HMD,-7.3,-6.5,-6.4,-6.4,-6.7,-7.0,-7.7,-6.5,-7.2,-7.1,...,-5.7,-5.7,-5.8,-6.3,-6.0,-5.8,-6.4,-5.9,-6.9,-5.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DEKOIS,decoy_1196,-10.2,-9.3,-9.7,-7.6,-8.6,-9.4,-9.0,-6.5,-9.6,-10.0,...,-7.0,-6.9,-7.2,-6.2,-9.8,-8.8,-10.9,-7.6,-8.9,-7.2
DEKOIS,decoy_1197,-10.5,-8.6,-8.0,-8.0,-8.1,-7.8,-9.6,-8.4,-8.7,-9.2,...,-8.0,-6.7,-8.2,-8.0,-6.4,-6.6,-8.3,-8.4,-8.3,-7.6
DEKOIS,decoy_1198,-8.7,-7.2,-8.2,-8.2,-8.1,-8.5,-8.3,-7.1,-8.7,-8.7,...,-7.4,-7.6,-8.1,-7.8,-8.3,-8.9,-9.5,-7.8,-7.9,-8.3
DEKOIS,decoy_1199,-9.4,-9.0,-8.6,-8.3,-8.6,-8.9,-7.9,-9.1,-6.7,-8.0,...,-8.3,-8.3,-8.3,-7.5,-7.8,-8.3,-8.4,-7.4,-8.5,-7.9


In [21]:
import plotly.graph_objects as go

In [27]:
# Drawing a plot

# 
X_dksc = ALL_RESULTS['X_dksc']
X = ALL_RESULTS['X_ml']



# Diccionario de colores
cols_lines = {'LogReg'  : 'rgb(134, 102, 183)',
              'rbfSVC'  : 'rgb(229, 156, 48)',
              'XGB_tree': 'rgb(9, 153, 149)',
              '1NN'     : 'rgb(230, 73, 79)'}

cols_fill = {'LogReg'   : 'rgba(134, 102, 183, 0.25)',
             'rbfSVC'   : 'rgba(216, 143, 48, 0.25)',
             'XGB_tree' : 'rgba(9, 153, 149, 0.25)',
             '1NN'      : 'rgba(230, 73, 79, 0.25)'}

# Dictionary names
split_names = {'rand': 'Random',
               'scff': 'Scaffold'}

selector_names = {'rand' : 'Random',
                  'LR'   : 'RFE (Log. Reg.)',
                  'RF'   : 'RFE (Rand. Forest)',
                  'XGB'  : 'RFE (Grad. Boost)'}

clf_names_dict = {'LogReg'  : 'Log. Regression',
                  'rbfSVC'  : 'RBF SVM',
                  'XGB_tree': 'Gradient Boosting',
                  '1NN'     : '1-NN Classifier'}

metric_names = {'roc_auc'   : 'ROC-AUC',
                'nef_auc'   : 'NEF-AUC',
                'pr_auc'    : 'Pr & Rcll-AUC',
                'bedroc_20' : 'BEDROC (a=20)',
                'bedroc_10' : 'BEDROC (a=10)',
                'bedroc_2'  : 'BEDROC (a=2)',
                'bedroc_0.5': 'BEDROC (a=0.5)',
                'ef_0.001'  : 'EF (chi=0.1%)',
                'ef_0.005'  : 'EF (chi=0.5%)',
                'ef_0.02'   : 'EF (chi=2.0%)',
                'ef_0.2'    : 'EF (chi=20.0%)',
               }

# Si es nef o roc; 0.4 a 1,
# si es pr o bedroc 0 a 1
# si es ef, omitir limites
def line_plot_metrics(split, selector, metric):
    query = f"split == '{split}' & selector == '{selector}' & metric == '{metric}'"

    # Ref score
    best_ref = X_dksc.query(query).max()['best_dksc']
    median_ref = X_dksc.query(query).median()['median_dksc']

    # Results
    X_subset = X.query(query)
    X_subset = X_subset.reset_index().drop(['split', 'selector',  'metric', 0], axis=1)
    X_subset = X_subset.set_index(['desc', 'classifier']).T
    X_mean = X_subset.loc[:, 'mean']
    X_std = X_subset.loc[:, 'std']

    # Número de conformaciones
    n_confs = X_mean.shape[0]

    if (metric == 'roc_auc'):
        y_axis_params = dict(range=[0.4, 1], tick0=0.00, dtick=0.05)
    elif (metric == 'nef_auc'):
        y_axis_params = dict(range=[0.2, 1], tick0=0.00, dtick=0.05)
    elif 'ef_' in metric:
        y_axis_params = dict()
    else:
        y_axis_params = dict(range=[0.0, 1], tick0=0.00, dtick=0.1)


    traces = []
    for col in X_mean.columns:
        # Create the upper and lower bounds
        upper = X_mean[col] + X_std[col]
        lower = X_mean[col] - X_std[col]

        upper = go.Scatter(x=X_mean.index, 
                           y=X_mean[col] + X_std[col],
                           mode='lines',
                           name=clf_names_dict[col], 
                           legendgroup=clf_names_dict[col], 
                           showlegend=False,
                           line=dict(width=0),
                           fillcolor=cols_fill[col],
                           hoverinfo='skip',
                           fill='tonexty')

        line = go.Scatter(x=X_mean.index, 
                           y=X_mean[col],
                           mode='lines',
                           name=clf_names_dict[col],
                           hovertemplate = 
                           f'<b style="color: {cols_lines[col]}">{clf_names_dict[col]}</b>' +
                           '<br>' +
                           '<b><i>k</i> confs:</b> %{x}' +
                           '<br>' +
                           f'<b><i>{metric_names[metric]}</i>:</b> ' + 
                           '%{y:.2f}' +
                           '<extra></extra>',
                           legendgroup=clf_names_dict[col], 
                           line=dict(width=2.5,
                                     color=cols_lines[col]),
                           fillcolor=cols_fill[col],
                           fill='tonexty')

        lower = go.Scatter(x=X_mean.index, 
                           y=X_mean[col] - X_std[col],
                           mode='lines',
                           name=clf_names_dict[col], 
                           legendgroup=clf_names_dict[col], 
                           showlegend=False,
                           hoverinfo='skip',
                           line=dict(width=0),
                          )

        traces = traces + [lower, line, upper]

    fig = go.Figure(data=traces)   

    # Add ref DkSc best score
    # Best raw score
    fig.add_shape(dict(type='line', x0=0, x1=n_confs, y0=best_ref, y1=best_ref),
                 line=dict(color="#B7AF9E", width=1.5, dash = 'dot'))
    fig.add_annotation(x=n_confs - 10, y=best_ref,
                       showarrow=False,
                       font=dict(size=9),
                       text='max Dksc: <b>{:.2f}</b>'.format(best_ref), 
                       bgcolor="#CEC9BD")
    # Meadian raw score
    fig.add_shape(dict(type='line', x0=0, x1=n_confs, y0=median_ref, y1=median_ref),
                 line=dict(color="#689AA8", width=1.5, dash = 'dot'))
    fig.add_annotation(x=n_confs - 10, y=median_ref,
                       showarrow=False,
                       font=dict(size=9),
                       text='med Dksc: <b>{:.2f}</b>'.format(median_ref), 
                       bgcolor="#B5D3DC")
    # AXES
    fig.update_xaxes(ticks='outside', showline=True, linewidth=2, linecolor='#43494F', mirror = True)
    # Y axis changes
    fig.update_yaxes(y_axis_params)
    fig.update_yaxes(ticks='outside', showline=True, 
                     linewidth=2, linecolor='black', mirror = True)
    fig.update_layout(template='plotly_white',
                      hoverlabel=dict(
                         bgcolor = 'white',
                         font_size=11.5
                      ),
                      xaxis = dict(
                         title='Number of protein conformations used'
                      ),
                      yaxis = dict(
                         title=f'Metric Score:<br><b>{metric_names[metric]}</b>'
                      ),
                      legend=dict(
                         orientation="h",
                         yanchor="bottom",
                         y=0.02,
                         xanchor="center",
                         x=0.5,
                         bgcolor="#F5F3EF"
                        ),
                     margin=dict(l=20, r=20, t=20, b=20),
                     paper_bgcolor="LightSteelBlue",
                     )
    return fig

metric = 'roc_auc'
split = 'scff'
selector = 'RF'

fig = line_plot_metrics(split, selector, metric)
                  
fig.show()

In [45]:
metric = 'roc_auc'
query_X = f"split == 'rand' & selector == 'rand' & metric == '{metric}'"
best_ref = X_dksc.query(query_X)
median_ref = X_dksc.query(query_X).median()['mean_dksc']
best_ref

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,best_dksc,mean_dksc
split,selector,classifier,metric,Unnamed: 4_level_1,Unnamed: 5_level_1
rand,rand,LogReg,roc_auc,0.759,0.686
rand,rand,rbfSVC,roc_auc,0.723,0.7
rand,rand,XGB_tree,roc_auc,0.738,0.708
rand,rand,1NN,roc_auc,0.747,0.697


In [58]:
'roc' in 'bedroc'

True