# Conformational Selection using *k* conformations

- We will test the Machine Learning Classifiers and the Consensus Strategies

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys
sys.path.append('../..')

In [2]:
# Load some auxiliary functions
# NOTE(review): `run_or_load_joblib`, `nk_rep_cross_validation` and
# `nk_rep_cross_validation_RANDOM_CONFS_SAVE` are used further down but are
# not defined in this notebook; presumably they are provided by this helper
# notebook -- confirm.
%run 1_Helper_functions.ipynb

  **kwargs
  **kwargs


## Compute the SBVS methods performances
### Load docking results

In [3]:
# Target protein name; reused below to build cache/result file names
prot_name = 'cdk2'
# Pickled dataframe with the ensemble docking results
file_name = ('../4_Ensemble_docking_results/'
             'df_DkSc_results_COCRYS_CSAR_DEKOIS_DUD.pkl')
df_dk_res = pd.read_pickle(file_name)

# Response variable: the 'activity' column
y = df_dk_res['activity'].values
# Features: every remaining column (docking scores)
X_dksc = df_dk_res.drop(columns='activity')
X = X_dksc.values

In [4]:
# R_a: proportion of active molecules, rounded to 4 decimals
# (assumes `y` encodes actives as 1 and inactives as 0 -- confirm)
R_a = round(y.sum() / y.shape[0], 4)

### Machine Learning Classifiers

In [5]:
from sklearn.dummy import DummyClassifier

# Dummy classifier configured with the 'stratified' strategy
hyparams = dict(strategy="stratified")
dclf = DummyClassifier(**hyparams)

In [6]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with an L2 penalty, fitted with the lbfgs solver
hyparams = dict(
    C=0.01,
    penalty='l2',
    solver='lbfgs',
    max_iter=400,
)
lr = LogisticRegression(**hyparams)

In [7]:
from xgboost import XGBClassifier

# Gradient boosted trees classifier and its fixed hyperparameters
hyparams = dict(
    subsample=0.5,
    n_estimators=200,
    max_depth=20,
    learning_rate=0.05,
    alpha=0.01,
    gamma=0.01,
    colsample_bytree=0.5,
)
xgb = XGBClassifier(**hyparams)

In [8]:
# Machine learning classifiers, keyed with the 'ml_' prefix.
# Keep the insertion order: the display names defined further down are
# paired with these keys by position.
ml_classifiers = dict(
    ml_lr=lr,
    ml_xgb=xgb,
    ml_dclf=dclf,
)

### Consensus Strategies

In [9]:
from helper_modules.consensus_strategies import \
    get_mean_score, get_min_score, get_geom_mean_score

In [10]:
# Consensus strategy functions, keyed with the 'cs_' prefix.
# Keep the insertion order: the display names defined further down are
# paired with these keys by position.
cs_functions = dict(
    cs_MEAN=get_mean_score,
    cs_GEOM=get_geom_mean_score,
    cs_MIN=get_min_score,
)

### SBVS methods (estimators) dictionary

In [11]:
# All SBVS methods in one dictionary: ML classifiers first, then the
# consensus strategies
estimators = dict(ml_classifiers)
estimators.update(cs_functions)

# Short display names used for plotting, paired by position with the
# keys of `estimators`
full_names = ['LR', 'GBT', 'DClf', 'csAVG', 'csGEO', 'csMIN']
sbvs_names = {key: label for key, label in zip(estimators, full_names)}
sbvs_names

{'ml_lr': 'LR',
 'ml_xgb': 'GBT',
 'ml_dclf': 'DClf',
 'cs_MEAN': 'csAVG',
 'cs_GEOM': 'csGEO',
 'cs_MIN': 'csMIN'}

### Evaluation metrics

In [12]:
# Evaluation metrics, each described by a dictionary of parameters
metrics = {
    # AUC-ROC
    'roc_auc': {'metric_name': 'roc_auc'},
    # Normalized Enrichment Factor, evaluated at fraction R_a
    'nef_12_Ra': {
        'metric_name': 'ef',
        'fraction': R_a,
        'method': 'normalized',
    },
}

## 30x4-fold Validation using k conformations

In [13]:
# Save the results to a file to avoid repeating the analysis
@run_or_load_joblib
def nk_rep_cross_validation_SAVE(filename, **kwargs):
    """Call `nk_rep_cross_validation` with the given keyword arguments.

    `filename` is not used inside the body and is not forwarded to
    `nk_rep_cross_validation`; presumably the `run_or_load_joblib`
    decorator uses it to save/load the returned results -- confirm
    against the helper notebook.
    """
    return nk_rep_cross_validation(**kwargs)

In [14]:
# Directory where the results of each evaluation are stored
main_dir = f'./cachedir/k_confs_30x4cv/{prot_name}'
Path(main_dir).mkdir(parents=True, exist_ok=True)

# Number of available conformations (columns of X)
max_confs = X.shape[1]
# Numbers of conformations (k) to evaluate: powers of two strictly below
# `max_confs`, followed by `max_confs` itself. Filtering keeps the list
# valid (no k larger than the number of conformations, no duplicated last
# value) for targets with 256 or fewer conformations; when `max_confs` is
# larger than 256 the list is the same as the hard-coded one.
k_values = [k for k in (1, 2, 4, 8, 16, 32, 64, 128, 256)
            if k < max_confs] + [max_confs]

# Repetitions and folds of the repeated cross validation (30x4cv)
n_reps = 30
n_splits = 4
# Results container: full_results[test_name][f'k_{k}']
full_results = {}

## Selection Criteria
1. Random Selection
2. Recursive Feature Elimination Ranking
3. Best to worst AUC-ROC conformations (single-conformation docking)
4. Worst to best AUC-ROC conformations (single-conformation docking)
5. Conformations ordered from the largest to the smallest bound ligand (or no ligand)
6. Conformations from largest to smallest pocket volume

### Random Selection

In [15]:
# --- Selection criterion: random conformations ---
test_name = 'random'
evaluation_name = f'k-confs-{test_name}_{n_reps}_reps'
# Start from an empty results container for this criterion
full_results[test_name] = {}

for k in k_values:
    # File associated with this value of k
    filename = f'{main_dir}/{evaluation_name}_{prot_name}_nConfs-{k}'

    # The helper receives the full matrix X together with k and max_confs
    # (it is not defined in this notebook)
    results = nk_rep_cross_validation_RANDOM_CONFS_SAVE(
        filename=filename,
        X=X,
        y=y,
        metrics=metrics,
        n_reps=n_reps,
        n_splits=n_splits,
        k=k,
        max_confs=max_confs,
    )

    print(f'Evaluation with k={k} finished.')
    # Keep the results of this k under the current criterion
    full_results[test_name][f'k_{k}'] = results

File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-random_30_reps_cdk2_nConfs-1
Evaluation with k=1 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-random_30_reps_cdk2_nConfs-2
Evaluation with k=2 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-random_30_reps_cdk2_nConfs-4
Evaluation with k=4 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-random_30_reps_cdk2_nConfs-8
Evaluation with k=8 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-random_30_reps_cdk2_nConfs-16
Evaluation with k=16 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-random_30_reps_cdk2_nConfs-32
Evaluation with k=32 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-random_30_reps_cdk2_nConfs-64
Evaluation with k=64 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-random_30_reps_cdk2_nConfs-128
Evaluation with k=128 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-random_30_reps_cdk2_nConfs-256
Evaluation with k=2

### Recursive Feature Elimination (with GBT)

In [16]:
# joblib is needed here, but the notebook otherwise imports it only in its
# final cell; import it locally so this cell does not rely on another cell
# (or the helper notebook) having imported it first.
import joblib

# Open RFE_estimator
# NOTE(review): `dataset`, `model_name` and `split` are not used by the
# visible code; kept in case other cells rely on them.
dataset    = 'MERGED'
model_name = 'XGB_tree'
split      = 'random'
filename   = f'./cachedir/rfe_selectors/RFE_xgb_{prot_name}.joblib'
# Load the RFE selector (computed in the previous notebook)
rfe_selector = joblib.load(filename)
# Create a dataframe with the protein rankings: one row per column of
# X_dksc, in the same order, so the row index is the column position
df_ranks     = pd.DataFrame({
                     'pdb_id'     : X_dksc.columns, 
                     'rfe_ranking': rfe_selector.ranking_
               })
# Sort by RFE ranking (ascending); the index keeps the original positions
df_ranks     = df_ranks.sort_values('rfe_ranking')

In [17]:
# ************
test_name = 'RFExgb'
full_results[test_name] = {}
# ************
evaluation_name = f'k-confs-{test_name}_{n_reps}_reps'

# NOTE(review): this cell is labelled as RFE-based selection, but the loop
# never reads `df_ranks` (the RFE ranking built in the previous cell) and
# calls the *_RANDOM_CONFS_SAVE helper with the full X, exactly like the
# random-selection cell. Presumably the k best-ranked columns of X should
# be evaluated through `nk_rep_cross_validation_SAVE` instead -- confirm
# against the helper notebook before recomputing the cached files.
for k in k_values:
    # Name the output file
    filename = f'{main_dir}/{evaluation_name}_{prot_name}_nConfs-{k}'
    
    results = nk_rep_cross_validation_RANDOM_CONFS_SAVE(
                        filename  = filename,
                        X = X,  y = y, 
                        n_reps    = n_reps, 
                        n_splits  = n_splits,
                        k = k, 
                        max_confs = max_confs, 
                        metrics   = metrics
    )
        
    print(f'Evaluation with k={k} finished.')
    # *******************************
    # Store the results of this k under the current test name
    full_results[test_name][f'k_{k}'] = results

File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-RFExgb_30_reps_cdk2_nConfs-1
Evaluation with k=1 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-RFExgb_30_reps_cdk2_nConfs-2
Evaluation with k=2 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-RFExgb_30_reps_cdk2_nConfs-4
Evaluation with k=4 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-RFExgb_30_reps_cdk2_nConfs-8
Evaluation with k=8 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-RFExgb_30_reps_cdk2_nConfs-16
Evaluation with k=16 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-RFExgb_30_reps_cdk2_nConfs-32
Evaluation with k=32 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-RFExgb_30_reps_cdk2_nConfs-64
Evaluation with k=64 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-RFExgb_30_reps_cdk2_nConfs-128
Evaluation with k=128 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-RFExgb_30_reps_cdk2_nConfs-256
Evaluation with k=2

### Best to worst AUC-ROC conformations (single-conformation docking)

In [18]:
# Import the conformations features table
DIR     = '../4_Ensemble_docking_results//'
# Join through pathlib so the redundant slashes of DIR are normalised, and
# take the protein name from `prot_name` instead of hard-coding it, as the
# other file names in this notebook do
file    = str(Path(DIR) /
              f'TABLE_Confs_Features_and_performances_{prot_name}.pkl')
df_feat = pd.read_pickle(file)

# Reset index to access each conformation per idx position
# (the cells below use these positions as column indices of X; assumes the
# rows of the table follow the column order of X -- confirm)
df_feat = df_feat.reset_index()
df_feat.head(3)

Unnamed: 0,index,Resolution,Inhib. MW,Pk. Volume,Pk. SASA,Apo,Single Entity,AUC-ROC,NEF
0,1aq1,2.0,440.0,887.0,628.0,holo,single,0.62262,0.24337
1,1b38,2.0,398.0,698.0,587.0,holo,single,0.59115,0.21205
2,1b39,2.1,398.0,672.0,559.0,holo,single,0.61789,0.27711


In [19]:
# ************
test_name = 'singConfAUC'
full_results[test_name] = {}
# ************
evaluation_name = f'k-confs-{test_name}_{n_reps}_reps'

# This will determine which conf. will be selected
# (rows sorted from the highest to the lowest 'AUC-ROC' value)
df_ranks = df_feat.sort_values('AUC-ROC', 
                               ascending = False)

for k in k_values:
    # Name the output file
    filename = f'{main_dir}/{evaluation_name}_{prot_name}_nConfs-{k}'
    
    # Get the conformations using the selector
    # (index labels of the first k ranked rows, used as column positions)
    conformations = df_ranks.index[:k]
    X_sub = X[:, conformations]
    
    # NOTE(review): `X_sub` is computed above but never used; the call
    # below passes the full X to the *_RANDOM_CONFS_SAVE helper, exactly
    # like the random-selection cell. Presumably `X_sub` should be
    # evaluated through `nk_rep_cross_validation_SAVE` instead -- confirm
    # against the helper notebook before recomputing the cached files.
    results = nk_rep_cross_validation_RANDOM_CONFS_SAVE(
                        filename  = filename,
                        X = X,  y = y, 
                        n_reps    = n_reps, 
                        n_splits  = n_splits,
                        k = k, 
                        max_confs = max_confs, 
                        metrics   = metrics
    )
        
    print(f'Evaluation with k={k} finished.')
    # *******************************
    # Store the results of this k under the current test name
    full_results[test_name][f'k_{k}'] = results

File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-singConfAUC_30_reps_cdk2_nConfs-1
Evaluation with k=1 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-singConfAUC_30_reps_cdk2_nConfs-2
Evaluation with k=2 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-singConfAUC_30_reps_cdk2_nConfs-4
Evaluation with k=4 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-singConfAUC_30_reps_cdk2_nConfs-8
Evaluation with k=8 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-singConfAUC_30_reps_cdk2_nConfs-16
Evaluation with k=16 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-singConfAUC_30_reps_cdk2_nConfs-32
Evaluation with k=32 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-singConfAUC_30_reps_cdk2_nConfs-64
Evaluation with k=64 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-singConfAUC_30_reps_cdk2_nConfs-128
Evaluation with k=128 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-singConfAU

### Worst to best AUC-ROC conformations (single-conformation docking)

In [20]:
# ************
test_name = 'singConfAUC-Worst'
full_results[test_name] = {}
# ************
evaluation_name = f'k-confs-{test_name}_{n_reps}_reps'

# This will determine which conf. will be selected
# (rows sorted from the lowest to the highest 'AUC-ROC' value)
df_ranks = df_feat.sort_values('AUC-ROC', 
                               ascending = True)

for k in k_values:
    # Name the output file
    filename = f'{main_dir}/{evaluation_name}_{prot_name}_nConfs-{k}'
    
    # Get the conformations using the selector
    # (index labels of the first k ranked rows, used as column positions)
    conformations = df_ranks.index[:k]
    X_sub = X[:, conformations]
    
    # NOTE(review): `X_sub` is computed above but never used; the call
    # below passes the full X to the *_RANDOM_CONFS_SAVE helper, exactly
    # like the random-selection cell. Presumably `X_sub` should be
    # evaluated through `nk_rep_cross_validation_SAVE` instead -- confirm
    # against the helper notebook before recomputing the cached files.
    results = nk_rep_cross_validation_RANDOM_CONFS_SAVE(
                        filename  = filename,
                        X = X,  y = y, 
                        n_reps    = n_reps, 
                        n_splits  = n_splits,
                        k = k, 
                        max_confs = max_confs, 
                        metrics   = metrics
    )
        
    print(f'Evaluation with k={k} finished.')
    # *******************************
    # Store the results of this k under the current test name
    full_results[test_name][f'k_{k}'] = results

File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-singConfAUC-Worst_30_reps_cdk2_nConfs-1
Evaluation with k=1 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-singConfAUC-Worst_30_reps_cdk2_nConfs-2
Evaluation with k=2 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-singConfAUC-Worst_30_reps_cdk2_nConfs-4
Evaluation with k=4 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-singConfAUC-Worst_30_reps_cdk2_nConfs-8
Evaluation with k=8 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-singConfAUC-Worst_30_reps_cdk2_nConfs-16
Evaluation with k=16 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-singConfAUC-Worst_30_reps_cdk2_nConfs-32
Evaluation with k=32 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-singConfAUC-Worst_30_reps_cdk2_nConfs-64
Evaluation with k=64 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-singConfAUC-Worst_30_reps_cdk2_nConfs-128
Evaluation with k=128 finished.
File loaded: .

### Conformations ordered from the largest to the smallest bound ligand (or no ligand)

In [21]:
# ************
test_name = 'bigLigMW'
full_results[test_name] = {}
# ************
evaluation_name = f'k-confs-{test_name}_{n_reps}_reps'

# This will determine which conf. will be selected
# (rows sorted from the highest to the lowest 'Inhib. MW' value)
df_ranks = df_feat.sort_values('Inhib. MW', 
                               ascending = False)

for k in k_values:
    # Name the output file
    filename = f'{main_dir}/{evaluation_name}_{prot_name}_nConfs-{k}'
    
    # Get the conformations using the selector
    # (index labels of the first k ranked rows, used as column positions)
    conformations = df_ranks.index[:k]
    X_sub = X[:, conformations]
    
    # NOTE(review): `X_sub` is computed above but never used; the call
    # below passes the full X to the *_RANDOM_CONFS_SAVE helper, exactly
    # like the random-selection cell. Presumably `X_sub` should be
    # evaluated through `nk_rep_cross_validation_SAVE` instead -- confirm
    # against the helper notebook before recomputing the cached files.
    results = nk_rep_cross_validation_RANDOM_CONFS_SAVE(
                        filename  = filename,
                        X = X,  y = y, 
                        n_reps    = n_reps, 
                        n_splits  = n_splits,
                        k = k, 
                        max_confs = max_confs, 
                        metrics   = metrics
    )
        
    print(f'Evaluation with k={k} finished.')
    # *******************************
    # Store the results of this k under the current test name
    full_results[test_name][f'k_{k}'] = results

File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-bigLigMW_30_reps_cdk2_nConfs-1
Evaluation with k=1 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-bigLigMW_30_reps_cdk2_nConfs-2
Evaluation with k=2 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-bigLigMW_30_reps_cdk2_nConfs-4
Evaluation with k=4 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-bigLigMW_30_reps_cdk2_nConfs-8
Evaluation with k=8 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-bigLigMW_30_reps_cdk2_nConfs-16
Evaluation with k=16 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-bigLigMW_30_reps_cdk2_nConfs-32
Evaluation with k=32 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-bigLigMW_30_reps_cdk2_nConfs-64
Evaluation with k=64 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-bigLigMW_30_reps_cdk2_nConfs-128
Evaluation with k=128 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-bigLigMW_30_reps_cdk2_nConfs-256
E

### Conformations from largest to smallest pocket volume

In [22]:
# ************
test_name = 'bigPkVolume'
full_results[test_name] = {}
# ************
evaluation_name = f'k-confs-{test_name}_{n_reps}_reps'

# This will determine which conf. will be selected
# (rows sorted from the highest to the lowest 'Pk. Volume' value)
df_ranks = df_feat.sort_values('Pk. Volume', 
                               ascending=False)

for k in k_values:
    # Name the output file
    filename = f'{main_dir}/{evaluation_name}_{prot_name}_nConfs-{k}'
    
    # Get the conformations using the selector
    # (index labels of the first k ranked rows, used as column positions)
    conformations = df_ranks.index[:k]
    X_sub = X[:, conformations]
    
    # NOTE(review): `X_sub` is computed above but never used; the call
    # below passes the full X to the *_RANDOM_CONFS_SAVE helper, exactly
    # like the random-selection cell. Presumably `X_sub` should be
    # evaluated through `nk_rep_cross_validation_SAVE` instead -- confirm
    # against the helper notebook before recomputing the cached files.
    results = nk_rep_cross_validation_RANDOM_CONFS_SAVE(
                        filename  = filename,
                        X = X,  y = y, 
                        n_reps    = n_reps, 
                        n_splits  = n_splits,
                        k = k, 
                        max_confs = max_confs, 
                        metrics   = metrics
    )
        
    print(f'Evaluation with k={k} finished.')
    # *******************************
    # Store the results of this k under the current test name
    full_results[test_name][f'k_{k}'] = results

File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-bigPkVolume_30_reps_cdk2_nConfs-1
Evaluation with k=1 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-bigPkVolume_30_reps_cdk2_nConfs-2
Evaluation with k=2 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-bigPkVolume_30_reps_cdk2_nConfs-4
Evaluation with k=4 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-bigPkVolume_30_reps_cdk2_nConfs-8
Evaluation with k=8 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-bigPkVolume_30_reps_cdk2_nConfs-16
Evaluation with k=16 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-bigPkVolume_30_reps_cdk2_nConfs-32
Evaluation with k=32 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-bigPkVolume_30_reps_cdk2_nConfs-64
Evaluation with k=64 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-bigPkVolume_30_reps_cdk2_nConfs-128
Evaluation with k=128 finished.
File loaded: ./cachedir/k_confs_30x4cv/cdk2/k-confs-bigPkVolum

## Save the results

In [23]:
import joblib
# Output file holding the `full_results` dictionary
path_to_file = './conformational_selection_30x4cv_using_k_confs.obj'

# An existing file is never overwritten. Report the skip explicitly so a
# stale results file is not mistaken for freshly saved results.
if not Path(path_to_file).exists():
    with open(path_to_file, 'wb') as f:
        joblib.dump(value = full_results, filename = f)
else:
    print(f'File already exists, results were NOT saved: {path_to_file}')