# ML Evaluation: Conformational selection

In [1]:
import pandas as pd
import numpy as np
import pickle
import glob, sys, os
sys.path.append('..')

In [2]:
%run ../modules/plotting_metrics.py

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
# Notebook-wide seaborn theme: 'white' style, 'talk' context, font sizes scaled by 0.8
sns.set(style='white', context='talk', font_scale=0.8)

In [4]:
# Load the merged results table from its pickle file
file_name = './df_DkSc_results_COCRYS_DEKOIS_DUD.pkl'
X_merged_dksc = pd.read_pickle(file_name)
# Keep the 'activity' column aside as y_true_merged ...
y_true_merged = X_merged_dksc['activity']
# ... and leave every other column in X_merged_dksc
X_merged_dksc = X_merged_dksc.drop(columns='activity')

In [5]:
from scaffold_splitter import train_test_scaffold_split

# Load the pickled dataframe containing the Generic Murcko Scaffolds.
# Nothing is computed in this cell: the file below must already exist.
file = './df_COCRYS_DUD_DEKOIS_Murcko_Scaffolds_SMILES.obj'

df_scff_murcko = pd.read_pickle(file)

In [6]:
# Settings identifying the dictionaries of results produced by notebook 6
TRAIN_DB = 'MergedDB'
TEST_DB = 'MergedDB'
SCORE_TYPE = 'DkSc'
scaffold_series = df_scff_murcko['scff_generic']

# Common prefix of every results file: <score type>_<train DB>-<test DB>.
# The second database is taken from TEST_DB (it used to repeat TRAIN_DB, which
# only produced the right path while both constants held the same value).
FILEPATH = f'./ml_models/conf_selection/{SCORE_TYPE}_{TRAIN_DB}-{TEST_DB}'

def format_filename(*args):
    '''Join the given string parts with underscores and append the `.obj` extension.'''
    stem = '_'.join(args)
    return stem + '.obj'

def format_name(*args, sep='/'):
    '''Concatenate the given string parts, separated by `sep` (a slash by default).'''
    parts = list(args)
    return sep.join(parts)

In [7]:
def process_best_dk_score(splitting_method,
                          selector_name,
                          classifier_name,
                          X_dks,
                          metric='roc_auc', **metric_params):
    '''Return the best single-column score obtained with the raw values of `X_dks`.

    The pickled results of one split / selector / classifier combination are
    read only to recover, for every repetition, the test-set index
    (`'y_true_index'`) and the labels (`'y_true'`). Each column of `X_dks`,
    restricted to those rows, is scored through `PlotMetric`; the best column
    score of each repetition is kept and the maximum over all repetitions is
    returned.

    Parameters
    ----------
    splitting_method, selector_name, classifier_name : str
        Identify the results file. `selector_name == 'rand'` selects the
        `randSel` file, any other value the `rfeSel-<selector_name>` one.
    X_dks : pandas.DataFrame
        Table that can be row-indexed with the stored test index and whose
        columns are scored one by one (presumably one docking-score column
        per conformation -- confirm with the caller).
    metric : str
        Metric name forwarded to `format_metric_results`.
    **metric_params
        Extra keyword arguments forwarded to `format_metric_results`. The
        first value, if any, is appended to the metric name used in the index.

    Returns
    -------
    pandas.DataFrame
        A single row with one column, `'best_dksc'`, indexed by a MultiIndex
        built from (split, selector, classifier, metric label).
    '''
    # Locate the results file of this combination
    sel_str = 'randSel' if selector_name == 'rand' else f'rfeSel-{selector_name}'
    file = FILEPATH + f'_{splitting_method}Split_{classifier_name}_{sel_str}_nreps15.obj'
    # NOTE(review): pickle.load can execute code stored in the file; only read trusted results
    with open(file, 'rb') as f:
        main_dict = pickle.load(f)

    # One best score per repetition
    ref_scores_list = []
    for rep_results in main_dict.values():
        test_index = rep_results['y_true_index']
        y_true = rep_results['y_true']

        # Keep only the rows evaluated in this repetition, one array per column
        test_rows = X_dks.loc[test_index]
        column_values = {name: test_rows[name].values for name in test_rows.columns}

        # decreasing=True is passed for the raw scores
        # (presumably because lower values rank better -- confirm in PlotMetric)
        evaluator = PlotMetric(y_true=y_true, y_pred_dict=column_values,
                               decreasing=True)
        scores_table = evaluator.format_metric_results(metric_name=metric,
                                                       **metric_params)
        ref_scores_list.append(scores_table.max().values[0])

    # Metrics that take a hyperparameter (e.g. EF or BEDROC) get its first value
    # appended to their name
    metric_label = metric
    if metric_params:
        first_param_value = next(iter(metric_params.values()))
        metric_label = metric + '_' + str(first_param_value)

    idx = pd.MultiIndex.from_product([[splitting_method], [selector_name],
                                      [classifier_name], [metric_label]])
    return pd.DataFrame({'best_dksc': max(ref_scores_list)}, index=idx)

In [8]:
# Quick check on a single combination: random split, RF-based RFE selector, Log. Regression
process_best_dk_score(splitting_method='rand',
                      selector_name='RF',
                      classifier_name='LogReg',
                      X_dks=X_merged_dksc,
                      metric='roc_auc')

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,best_dksc
rand,RF,LogReg,roc_auc,0.731


In [16]:
def process_conf_selection_results(splitting_method,
                                   selector_name,
                                   classifier_name,
                                   metric='roc_auc', **metric_params):
    '''Summarise the metric obtained over repetitions for one results file.

    Reads the pickle file of a split / selector / classifier combination. For
    each repetition it holds the predicted values obtained with a given
    selection method using k conformations, with k ranging from 1 to N
    conformations. Every entry is scored with `PlotMetric` and the mean and
    standard deviation over repetitions are returned.

    Parameters
    ----------
    splitting_method, selector_name, classifier_name : str
        Identify the results file. `selector_name == 'rand'` selects the
        `randSel` file, any other value the `rfeSel-<selector_name>` one.
    metric : str
        Metric name forwarded to `format_metric_results`.
    **metric_params
        Extra keyword arguments forwarded to `format_metric_results`.

    Returns
    -------
    pandas.DataFrame
        Two rows labelled `split/selector/classifier/metric/mean` and
        `split/selector/classifier/metric/std`, with one column per scored entry.
    '''
    # Locate the results file of this combination
    sel_str = 'randSel' if selector_name == 'rand' else f'rfeSel-{selector_name}'
    file = FILEPATH + f'_{splitting_method}Split_{classifier_name}_{sel_str}_nreps15.obj'
    # NOTE(review): pickle.load can execute code stored in the file; only read trusted results
    with open(file, 'rb') as f:
        main_dict = pickle.load(f)

    # Scores of every repetition, keyed by repetition
    scores_by_rep = {}
    for rep, rep_results in main_dict.items():
        y_true = rep_results['y_true']
        # Everything except the test index is handed over as "predictions".
        # NOTE(review): 'y_true' itself stays in this dict; the plotting cell later
        # drops column 0, presumably to discard that entry -- confirm before
        # changing either side.
        dict_y_preds = rep_results.copy()
        dict_y_preds.pop('y_true_index')

        evaluator = PlotMetric(y_true=y_true, y_pred_dict=dict_y_preds,
                               decreasing=False)
        scores_table = evaluator.format_metric_results(metric_name=metric,
                                                       **metric_params)
        scores_by_rep[rep] = scores_table.T.values[0]

    # One column per repetition -> mean and standard deviation across repetitions
    # (missing values are replaced by 0)
    summary = pd.DataFrame(scores_by_rep).apply([np.mean, np.std], axis=1).fillna(0)
    # Label the statistics following the split/selector/classifier/metric pattern
    summary.columns = [
        format_name(splitting_method, selector_name, classifier_name, metric, stat)
        for stat in ('mean', 'std')
    ]
    # One row per statistic
    return summary.T

In [17]:
# Grid of result files to process: every split x selector x classifier combination.
# Splits: 'scff' = scaffold, 'rand' = random
splitting_methods = ['scff', 'rand']
# Conformation selectors: 'rand' = random; the others name the estimator used for RFE
selectors = ['rand', 'LR', 'RF', 'XGB']
# Classifiers whose predictions were stored
classifiers = ['LogReg', 'rbfSVC', 'XGB_tree', '1NN']
# NOTE(review): not referenced by the loops shown below, which pass `params` instead
metrics = ['roc_auc', ]

In [18]:
%%time
# Reference values: best raw-score result for every split / selector / classifier
# params = {'metric': 'ef', 'fraction': 0.02}
params = {'metric': 'roc_auc'}

results_dks = []
for split in splitting_methods:
    for selec in selectors:
        for clf in classifiers:
            result = process_best_dk_score(split, selec, clf, X_merged_dksc, **params)
            results_dks.append(result)

# Stack the one-row frames and name the index levels
X_dks = pd.concat(results_dks).rename_axis(("split", "selector", "classifier", "metric"))

CPU times: user 1min 7s, sys: 581 ms, total: 1min 7s
Wall time: 1min 7s


In [19]:
%%time
# Mean / std of the metric for every split / selector / classifier
# params = {'metric': 'ef', 'fraction': 0.02}
params = {'metric': 'roc_auc'}

results = []
for split in splitting_methods:
    for selec in selectors:
        for clf in classifiers:
            result = process_conf_selection_results(split, selec, clf, **params)
            results.append(result)

X = pd.concat(results)
# Row labels are 'split/selector/classifier/metric/desc' strings: expand them
# into a MultiIndex and name its levels
X.index = X.index.str.split('/', expand=True)
X = X.rename_axis(("split", "selector", "classifier", "metric", "desc"))


CPU times: user 1min 9s, sys: 579 ms, total: 1min 9s
Wall time: 1min 9s


In [20]:
import plotly.graph_objects as go

In [45]:
# Drawing a plot: mean +/- one standard deviation of the metric against the
# number of conformations, one line per classifier, for a single
# split / selector combination
query_X = "split == 'scff' & selector == 'rand'"

# Results
X_subset = X.query(query_X)
# Drop the index levels fixed by the query, together with column 0
# NOTE(review): column 0 is presumably the score of the 'y_true' entry left among
# the predictions by process_conf_selection_results -- confirm
X_subset = X_subset.reset_index().drop(['split', 'selector',  'metric', 0], axis=1)
X_subset = X_subset.set_index(['desc', 'classifier']).T
X_mean = X_subset.loc[:, 'mean']
X_std = X_subset.loc[:, 'std']

# Number of conformations (rows of X_mean)
n_confs = X_mean.shape[0]
# NOTE: despite its name this is the number of columns of X_mean,
# i.e. one per classifier, not a number of molecules
n_mols = X_mean.shape[1]
print(n_mols)

# Colour dictionaries: line colour and translucent band colour per classifier
cols_lines = {'LogReg': 'rgb(134, 102, 183)',
              'rbfSVC': 'rgb(229, 156, 48)',
              'XGB_tree': 'rgb(9, 153, 149)',
              '1NN': 'rgb(230, 73, 79)'}
# NOTE(review): the 'rbfSVC' fill colour is the only one that differs from its
# line colour (216, 143 vs 229, 156) -- confirm whether that is intended
cols_fill = {'LogReg': 'rgba(134, 102, 183, 0.25)',
             'rbfSVC': 'rgba(216, 143, 48, 0.25)',
             'XGB_tree': 'rgba(9, 153, 149, 0.25)',
             '1NN': 'rgba(230, 73, 79, 0.25)'}

# Dictionary names
split_names = {'rand': 'Random', 'scff': 'Scaffold'}

selector_names = {'rand': 'Random',
                  'LR': 'RFE (Log. Reg.)',
                  'RF': 'RFE (Rand. Forest)',
                  'XGB': 'RFE (Grad. Boost)'}

clf_names_dict = {'LogReg': 'Log. Regression',
                  'rbfSVC': 'RBF SVM',
                  'XGB_tree': 'Gradient Boosting',
                  '1NN': '1-NN Classifier'}

metric_names = {'roc_auc': 'ROC-AUC',
                'nef_auc': 'Norm. EF',
                'bedroc': 'BEDROC'}
metric = 'roc_auc'

traces = []
for col in X_mean.columns:
    # Upper bound of the band; 'tonexty' fills down to the previously added trace
    upper = go.Scatter(x=X_mean.index, 
                       y=X_mean[col] + X_std[col],
                       mode='lines',
                       name=clf_names_dict[col], 
                       legendgroup=clf_names_dict[col], 
                       showlegend=False,
                       line=dict(width=0),
                       fillcolor=cols_fill[col],
                       hoverinfo='skip',
                       fill='tonexty')
    
    # Mean line: the only trace of the group shown in the legend and on hover
    line = go.Scatter(x=X_mean.index, 
                       y=X_mean[col],
                       mode='lines',
                       name=clf_names_dict[col],
                       hovertemplate = 
                       f'<b style="color: {cols_lines[col]}">{clf_names_dict[col]}</b>' +
                       '<br>' +
                       '<b><i>k</i> confs:</b> %{x}' +
                       '<br>' +
                       f'<b><i>{metric_names[metric]}</i>:</b> ' + 
                       '%{y:.2f}' +
                       '<extra></extra>',
                       legendgroup=clf_names_dict[col], 
                       line=dict(width=2.5,
                                 color=cols_lines[col]),
                       fillcolor=cols_fill[col],
                       fill='tonexty')
    
    # Lower bound of the band (invisible line, no fill of its own)
    lower = go.Scatter(x=X_mean.index, 
                       y=X_mean[col] - X_std[col],
                       mode='lines',
                       name=clf_names_dict[col], 
                       legendgroup=clf_names_dict[col], 
                       showlegend=False,
                       hoverinfo='skip',
                       line=dict(width=0),
                      )
    
    # The order matters: each 'tonexty' fill reaches the trace added just before it
    traces.extend([lower, line, upper])
    
fig = go.Figure(data=traces)   

# Add ref DkSc best score
# Ref score: highest 'best_dksc' among the rows matching the query.
# .iloc[0] selects the first (only) entry by position; plain [0] on this
# string-labelled Series relies on a deprecated positional fallback.
best_ref = X_dks.query(query_X).max().iloc[0]
fig.add_shape(dict(type='line', x0=0, x1=n_confs, y0=best_ref, y1=best_ref),
             line=dict(color="#B7AF9E", width=1.5, dash = 'dot'))

fig.add_annotation(x=n_confs - 10, y=best_ref,
                   showarrow=False,
                   font=dict(
                       size=9
                   ),
                   text='max Dksc: <b>{:.2f}</b>'.format(best_ref), 
                   bgcolor="#CEC9BD")

fig.update_xaxes(ticks='outside', showline=True, linewidth=2, linecolor='#43494F', mirror = True)
fig.update_yaxes(range=[0.3, 1], tick0=0.00, dtick=0.05, )
fig.update_yaxes(ticks='outside', showline=True, 
                 linewidth=2, linecolor='black', mirror = True)
fig.update_layout(template='plotly_white',
                  hoverlabel=dict(
                     bgcolor = 'white',
                     font_size=11.5
                  ),
                  xaxis = dict(
                     title='Number of protein conformations used'
                  ),
                  yaxis = dict(
                     title=f'Metric Score: <b>{metric_names[metric]}</b>'
                  ),
                  legend=dict(
                     orientation="h",
                     yanchor="bottom",
                     y=0.02,
                     xanchor="center",
                     x=0.5,
                     bgcolor="#F5F3EF"
                    ))
                  
fig.show()

4


In [None]:
# Save both dataframes as a pickle dictionary
# NOTE(review): the braces below build a set holding the single string 'Scores',
# not a dictionary of dataframes -- this statement looks unfinished; TODO complete it
fxa_results = {'Scores'}