# CDK2: Model Selection - y values shuffled
### Validation Method

In [1]:
import pandas as pd
import numpy as np
import glob, sys, os
sys.path.append('..')

In [2]:
from modules.plotting_metrics import PlotMetric
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white', context='talk', font_scale=0.8)

In [3]:
file_name = './df_DkSc_results_COCRYS_CSAR_DEKOIS_DUD.pkl' # Created in 3_Calculating metrics...
X_merged_dksc = pd.read_pickle(file_name)
# Extract activity column
y_true_merged = X_merged_dksc['activity']
# Drop column from merged_dkksc
X_merged_dksc = X_merged_dksc.drop('activity', axis=1)
X_merged_dksc.shape


(3466, 402)

## Scaffold Splitting

In [4]:
#*************************************************
# Functions to compute stratify scaffold splitting
#*************************************************
sys.path.append('../2_Docking_analysis/')
from scaffold_splitter import train_test_scaffold_split

In [5]:
# Compute or load the dataframe containing the Generic Murcko Scaffolds
file = '../2_Docking_analysis/df_COCRYS_CSAR_DUD_DEKOIS_Murcko_Scaffolds_SMILES.obj'

df_scff_murcko = pd.read_pickle(file)
df_scff_murcko.shape

(3466, 3)

In [6]:
%run 4_Helper_Functions_Model_Selection_Grid_Search.ipynb

#  Hyperparameter Tunning: Grid Search

In [7]:
def randomize_y_labels(y_target, random_chi=0.1):
    '''Función para distribuir de forma aleatoria una fracción 
    chi del vector de etiquetas, de forma estratificada'''
    
    # Make a copy of the original vector
    y_copy = y.copy()
    
    # Get the number of actives inside the y_target vector
    n_actives = y_target.sum()
    random_size = np.floor(random_chi * n_actives)
    # Initialize the counters
    act_count = random_size
    inact_count = random_size
    
    # Create the randomized list of idexes
    idx_shuffled = np.random.choice(range(len(y)), len(y), replace=False)
    # iterate over idx_shuffled until act and inact counters == 0
    for l in idx_shuffled:
        if act_count > 0:
            if y_copy[l] == 1: # Is active, then change it to inactive
                y_copy[l] = 0
                act_count = act_count - 1
                continue
            if inact_count > 0: # If is inactive, change it to active
                y_copy[l] = 1
                inact_count = inact_count - 1
                continue
        else:
            break
    return(y_copy)
    

***
<h2 style='background-color: #F9E5AB; padding: 5px;'>
    Merged libraries: Shuffle *y* target values in the train set
</h2>
<div style='background-color: #FE8460; min-height: 5px'></div>

#### DEKOIS, DUD and COCRYS are  treated as one unique library
#### Target values (y) will be shuffled

In [8]:
# Train and test over 
X = X_merged_dksc
# ***** Permutate y values *****
y = y_true_merged#.sample(frac=1, replace=False)

library = 'Merged'
scaffold_series = df_scff_murcko['scff_generic']

# Create an empty dictionary to save results

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Logistic Regression</h3>
<b>Merged Libraries</b>

In [9]:
%%time
from sklearn.linear_model import LogisticRegression

chi_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 1]

for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    print('*'*60, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*60)

    estimator_name = 'LogReg'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = LogisticRegression(max_iter=300)
    hyperparams = {'C':  np.geomspace(1e-8, 1e2, 6),
                   'penalty': ['l1', 'l2'], 
                   'solver': ['lbfgs', 'liblinear']}

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                 splitting='random', test_size=0.25, 
                 scaffold_series=None)
    
    print('\n')

************************************************************ 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
************************************************************
LogReg_chi0 => Train: Merged; Test: Merged; split: random
No. of molecules in train set: 2599, with 311 actives.
No. of molecules in test set: 867, with 104 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.892
- Train ROC-AUC:  	0.941
- Test ROC-AUC:   	0.872
- Best hyperparameters {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.671 	> median: 0.616, mean: 0.610
> Test best conf. ROC-AUC: 0.678 	> median: 0.608, mean: 0.605
****************************************************************


************************************************************ 
 Randomized y fraction (actives): 0.1
Number of actives/inactives shuffled: 41.5
****

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


No. of molecules in train set: 2599, with 311 actives.
No. of molecules in test set: 867, with 104 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.801
- Train ROC-AUC:  	0.881
- Test ROC-AUC:   	0.780
- Best hyperparameters {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.643 	> median: 0.595, mean: 0.591
> Test best conf. ROC-AUC: 0.665 	> median: 0.597, mean: 0.597
****************************************************************


************************************************************ 
 Randomized y fraction (actives): 0.3
Number of actives/inactives shuffled: 124.5
************************************************************
LogReg_chi0.3 => Train: Merged; Test: Merged; split: random
No. of molecules in train set: 2599, with 311 actives.
No. of molecules in test set: 867, with 104 actives.

********** GRID SEARCH RESULTS

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


No. of molecules in train set: 2599, with 311 actives.
No. of molecules in test set: 867, with 104 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.619
- Train ROC-AUC:  	0.800
- Test ROC-AUC:   	0.696
- Best hyperparameters {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.583 	> median: 0.557, mean: 0.555
> Test best conf. ROC-AUC: 0.612 	> median: 0.549, mean: 0.549
****************************************************************


************************************************************ 
 Randomized y fraction (actives): 1
Number of actives/inactives shuffled: 415
************************************************************
LogReg_chi1 => Train: Merged; Test: Merged; split: random
No. of molecules in train set: 2599, with 311 actives.
No. of molecules in test set: 867, with 104 actives.

********** GRID SEARCH RESULTS *****

In [11]:
%%time
from sklearn.linear_model import LogisticRegression

chi_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 1]

for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    print('*'*60, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*60)

    estimator_name = 'LogReg'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = LogisticRegression(max_iter=300)
    hyperparams = {'C':  np.geomspace(1e-8, 1e2, 6),
                   'penalty': ['l1', 'l2'], 
                   'solver': ['lbfgs', 'liblinear']}

    # SCAFFOLD Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                 splitting='scaffold', test_size=0.25, 
                 scaffold_series=scaffold_series)

************************************************************ 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
************************************************************
LogReg_chi0 => Train: Merged; Test: Merged; split: scaffold
No. of molecules in train set: 2599, with 311 actives.
No. of molecules in test set: 867, with 104 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.857
- Train ROC-AUC:  	0.949
- Test ROC-AUC:   	0.821
- Best hyperparameters {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.654 	> median: 0.602, mean: 0.596
> Test best conf. ROC-AUC: 0.746 	> median: 0.642, mean: 0.640
****************************************************************
************************************************************ 
 Randomized y fraction (actives): 0.1
Number of actives/inactives shuffled: 41.5
****

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


No. of molecules in train set: 2599, with 311 actives.
No. of molecules in test set: 867, with 104 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.775
- Train ROC-AUC:  	0.895
- Test ROC-AUC:   	0.694
- Best hyperparameters {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.641 	> median: 0.585, mean: 0.581
> Test best conf. ROC-AUC: 0.645 	> median: 0.569, mean: 0.568
****************************************************************
************************************************************ 
 Randomized y fraction (actives): 0.3
Number of actives/inactives shuffled: 124.5
************************************************************
LogReg_chi0.3 => Train: Merged; Test: Merged; split: scaffold


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


No. of molecules in train set: 2599, with 311 actives.
No. of molecules in test set: 867, with 104 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.738
- Train ROC-AUC:  	0.871
- Test ROC-AUC:   	0.646
- Best hyperparameters {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.622 	> median: 0.579, mean: 0.574
> Test best conf. ROC-AUC: 0.651 	> median: 0.586, mean: 0.585
****************************************************************
************************************************************ 
 Randomized y fraction (actives): 0.4
Number of actives/inactives shuffled: 166.0
************************************************************
LogReg_chi0.4 => Train: Merged; Test: Merged; split: scaffold
No. of molecules in train set: 2599, with 311 actives.
No. of molecules in test set: 867, with 104 actives.

********** GRID SEARCH RESULTS

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


No. of molecules in train set: 2599, with 311 actives.
No. of molecules in test set: 867, with 104 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.639
- Train ROC-AUC:  	0.817
- Test ROC-AUC:   	0.576
- Best hyperparameters {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.590 	> median: 0.555, mean: 0.553
> Test best conf. ROC-AUC: 0.567 	> median: 0.515, mean: 0.515
****************************************************************
************************************************************ 
 Randomized y fraction (actives): 1
Number of actives/inactives shuffled: 415
************************************************************
LogReg_chi1 => Train: Merged; Test: Merged; split: scaffold
No. of molecules in train set: 2599, with 311 actives.
No. of molecules in test set: 867, with 104 actives.

********** GRID SEARCH RESULTS *****

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: X Gradient Boosting </h3>
<b>Merged Libraries</b>

In [12]:
%%time

from xgboost import XGBClassifier

chi_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 1]

for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    print('*'*80, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*80)
    
    estimator_name = 'XGB_tree'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = XGBClassifier()
    hyperparams = {'n_estimators': [200, 300],
                   'max_depth': [2, 3, 10, 20],
                   'learning_rate': [0.05, 0.1],
                   'gamma': [0.01, 0.1, 0.5, 1],
                   'alpha': [0.01, 0.1, 0.5, 1],
                   'subsample': [0.3, 0.5],
                   'colsample_bytree': [0.3, 0.5, 1]
                }

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                 splitting='random', test_size=0.25, 
                 scaffold_series=None,
                 # RandomizedGS
                 randomGS=True, n_iter=30)
    
    print('\n')

******************************************************************************** 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
********************************************************************************
XGB_tree_chi0 => Train: Merged; Test: Merged; split: random
No. of molecules in train set: 2599, with 311 actives.
No. of molecules in test set: 867, with 104 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.898
- Train ROC-AUC:  	1.000
- Test ROC-AUC:   	0.924
- Best hyperparameters {'subsample': 0.5, 'n_estimators': 300, 'max_depth': 20, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.5, 'alpha': 0.1}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.676 	> median: 0.617, mean: 0.611
> Test best conf. ROC-AUC: 0.661 	> median: 0.606, mean: 0.603
****************************************************************


********************

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Radial Basis Function SVM</h3>
<b>Merged Libraries</b>

In [13]:
%%time
from sklearn.svm import SVC

chi_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 1]

for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    estimator_name = 'rbfSVC'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = SVC(kernel = 'rbf', probability=True)
    hyperparams = {'C': [1]} #{'C': np.geomspace(1e0, 1e2, 3), 'gamma': np.geomspace(1e-4, 1e0, 3)}

    
    print('*'*80, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*80)

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                 splitting='random', test_size=0.25, 
                 scaffold_series=None)
    
    print('\n')

******************************************************************************** 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
********************************************************************************
rbfSVC_chi0 => Train: Merged; Test: Merged; split: random
No. of molecules in train set: 2599, with 311 actives.
No. of molecules in test set: 867, with 104 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.903
- Train ROC-AUC:  	0.974
- Test ROC-AUC:   	0.937
- Best hyperparameters {'C': 1}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.667 	> median: 0.610, mean: 0.605
> Test best conf. ROC-AUC: 0.693 	> median: 0.629, mean: 0.623
****************************************************************


******************************************************************************** 
 Randomized y fraction (actives): 0.1
Number of actives/inactive

In [14]:
%%time

for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    estimator_name = 'rbfSVC'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = SVC(kernel = 'rbf', probability=True)
    hyperparams = {'C': [1]} #{'C': np.geomspace(1e0, 1e2, 3), 'gamma': np.geomspace(1e-4, 1e0, 3)}
    
    print('*'*80, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*80)
    
    # SCAFFOLD Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                 splitting='scaffold', test_size=0.25, 
                 scaffold_series=scaffold_series)
    
    print('\n')

******************************************************************************** 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
********************************************************************************
rbfSVC_chi0 => Train: Merged; Test: Merged; split: scaffold
No. of molecules in train set: 2599, with 311 actives.
No. of molecules in test set: 867, with 104 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.860
- Train ROC-AUC:  	0.975
- Test ROC-AUC:   	0.791
- Best hyperparameters {'C': 1}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.654 	> median: 0.602, mean: 0.596
> Test best conf. ROC-AUC: 0.746 	> median: 0.642, mean: 0.640
****************************************************************


******************************************************************************** 
 Randomized y fraction (actives): 0.1
Number of actives/inacti

<h3 style='color: #F84122; padding: 0px;; margin:ade0px'>GS: kNN Calssifier </h3>
<b>Merged Libraries</b>

In [15]:
%%time
from sklearn.neighbors import KNeighborsClassifier 


for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    estimator_name = '1-NN'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = KNeighborsClassifier()
    hyperparams = {'n_neighbors': [1], 
                   'p': [1, 2]
                     }
    
    print('*'*80, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*80)

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                 splitting='random', test_size=0.25, 
                 scaffold_series=None)
    print('\n')

******************************************************************************** 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
********************************************************************************
1-NN_chi0 => Train: Merged; Test: Merged; split: random
No. of molecules in train set: 2599, with 311 actives.
No. of molecules in test set: 867, with 104 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.772
- Train ROC-AUC:  	1.000
- Test ROC-AUC:   	0.804
- Best hyperparameters {'n_neighbors': 1, 'p': 2}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.669 	> median: 0.614, mean: 0.608
> Test best conf. ROC-AUC: 0.675 	> median: 0.620, mean: 0.613
****************************************************************


******************************************************************************** 
 Randomized y fraction (actives): 0.1
Number of 

In [16]:
%%time
from sklearn.neighbors import KNeighborsClassifier 


for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    estimator_name = '1-NN'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = KNeighborsClassifier()
    hyperparams = {'n_neighbors': [1], 
                   'p': [1, 2]
                     }
    
    print('*'*80, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*80)

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                splitting='scaffold', test_size=0.25, 
                 scaffold_series=scaffold_series) 
    print('\n')

******************************************************************************** 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
********************************************************************************
1-NN_chi0 => Train: Merged; Test: Merged; split: scaffold
No. of molecules in train set: 2599, with 311 actives.
No. of molecules in test set: 867, with 104 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.688
- Train ROC-AUC:  	1.000
- Test ROC-AUC:   	0.672
- Best hyperparameters {'n_neighbors': 1, 'p': 2}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.654 	> median: 0.602, mean: 0.596
> Test best conf. ROC-AUC: 0.746 	> median: 0.642, mean: 0.640
****************************************************************


******************************************************************************** 
 Randomized y fraction (actives): 0.1
Number o

***
<h2 style='background-color: #F9E5AB; padding: 5px;'>
    DEKOIS: Shuffle *y* target values in the train set
</h2>
<div style='background-color: #FE8460; min-height: 5px'></div>

#### DEKOIS 
#### Target values (y) will be shuffled

In [17]:
library = 'DEKOIS'

# Train and test over DUDU
X = X_merged_dksc.loc[library]
y = y_true_merged.loc[library]
scaffold_series = df_scff_murcko['scff_generic'].loc[library]

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Logistic Regression</h3>
<b>Merged Libraries</b>

In [18]:
%%time
from sklearn.linear_model import LogisticRegression

chi_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 1]

for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    print('*'*80, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*80)

    estimator_name = 'LogReg'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = LogisticRegression(max_iter=300)
    hyperparams = {'C':  np.geomspace(1e-8, 1e2, 6),
                   'penalty': ['l1', 'l2'], 
                   'solver': ['lbfgs', 'liblinear']}

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                 splitting='random', test_size=0.25, 
                 scaffold_series=None)
    
    print('\n')

******************************************************************************** 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
********************************************************************************
LogReg_chi0 => Train: DEKOIS; Test: DEKOIS; split: random
No. of molecules in train set: 929, with 30 actives.
No. of molecules in test set: 310, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.864
- Train ROC-AUC:  	0.948
- Test ROC-AUC:   	0.864
- Best hyperparameters {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.793 	> median: 0.687, mean: 0.688
> Test best conf. ROC-AUC: 0.855 	> median: 0.722, mean: 0.719
****************************************************************


******************************************************************************** 
 Randomized y fraction (acti

In [20]:
%%time
from sklearn.linear_model import LogisticRegression

chi_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 1]

for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    print('*'*80, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*80)

    estimator_name = 'LogReg'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = LogisticRegression(max_iter=300)
    hyperparams = {'C':  np.geomspace(1e-8, 1e2, 6),
                   'penalty': ['l1', 'l2'], 
                   'solver': ['lbfgs', 'liblinear']}

    # SCAFFOLD Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                 splitting='scaffold', test_size=0.25, 
                 scaffold_series=scaffold_series)

******************************************************************************** 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
********************************************************************************
LogReg_chi0 => Train: DEKOIS; Test: DEKOIS; split: scaffold
No. of molecules in train set: 929, with 30 actives.
No. of molecules in test set: 310, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.832
- Train ROC-AUC:  	0.936
- Test ROC-AUC:   	0.831
- Best hyperparameters {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.827 	> median: 0.728, mean: 0.726
> Test best conf. ROC-AUC: 0.810 	> median: 0.601, mean: 0.603
****************************************************************
******************************************************************************** 
 Randomized y fraction (acti

***
<h2 style='background-color: #F9E5AB; padding: 5px;'>
    DUD: Shuffle *y* target values in the train set
</h2>
<div style='background-color: #FE8460; min-height: 5px'></div>

#### DUD 
#### Target values (y) will be shuffled

In [21]:
library = 'DUD'

# Train and test over DUDU
X = X_merged_dksc.loc[library]
y = y_true_merged.loc[library]
scaffold_series = df_scff_murcko['scff_generic'].loc[library]

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Logistic Regression</h3>
<b>Merged Libraries</b>

In [22]:
%%time
from sklearn.linear_model import LogisticRegression

chi_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 1]

for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    print('*'*80, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*80)

    estimator_name = 'LogReg'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = LogisticRegression(max_iter=300)
    hyperparams = {'C':  np.geomspace(1e-8, 1e2, 6),
                   'penalty': ['l1', 'l2'], 
                   'solver': ['lbfgs', 'liblinear']}

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                 splitting='random', test_size=0.25, 
                 scaffold_series=None)
    
    print('\n')

******************************************************************************** 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
********************************************************************************
LogReg_chi0 => Train: DUD; Test: DUD; split: random


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


No. of molecules in train set: 1368, with 43 actives.
No. of molecules in test set: 457, with 15 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.941
- Train ROC-AUC:  	1.000
- Test ROC-AUC:   	0.956
- Best hyperparameters {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.696 	> median: 0.599, mean: 0.581
> Test best conf. ROC-AUC: 0.701 	> median: 0.538, mean: 0.531
****************************************************************


******************************************************************************** 
 Randomized y fraction (actives): 0.1
Number of actives/inactives shuffled: 5.800000000000001
********************************************************************************
LogReg_chi0.1 => Train: DUD; Test: DUD; split: random


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


No. of molecules in train set: 1368, with 43 actives.
No. of molecules in test set: 457, with 15 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.888
- Train ROC-AUC:  	1.000
- Test ROC-AUC:   	0.875
- Best hyperparameters {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.692 	> median: 0.594, mean: 0.577
> Test best conf. ROC-AUC: 0.600 	> median: 0.454, mean: 0.456
****************************************************************


******************************************************************************** 
 Randomized y fraction (actives): 0.2
Number of actives/inactives shuffled: 11.600000000000001
********************************************************************************
LogReg_chi0.2 => Train: DUD; Test: DUD; split: random
No. of molecules in train set: 1368, with 43 actives.
No. of molecules in test set: 457, with 1

In [23]:
%%time
from sklearn.linear_model import LogisticRegression

chi_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 1]

for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    print('*'*80, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*80)

    estimator_name = 'LogReg'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = LogisticRegression(max_iter=300)
    hyperparams = {'C':  np.geomspace(1e-8, 1e2, 6),
                   'penalty': ['l1', 'l2'], 
                   'solver': ['lbfgs', 'liblinear']}

    # SCAFFOLD Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                 splitting='scaffold', test_size=0.25, 
                 scaffold_series=scaffold_series)

******************************************************************************** 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
********************************************************************************
LogReg_chi0 => Train: DUD; Test: DUD; split: scaffold


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


No. of molecules in train set: 1368, with 43 actives.
No. of molecules in test set: 457, with 15 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.913
- Train ROC-AUC:  	1.000
- Test ROC-AUC:   	0.909
- Best hyperparameters {'C': 100.0, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.644 	> median: 0.528, mean: 0.517
> Test best conf. ROC-AUC: 0.844 	> median: 0.734, mean: 0.716
****************************************************************
******************************************************************************** 
 Randomized y fraction (actives): 0.1
Number of actives/inactives shuffled: 5.800000000000001
********************************************************************************
LogReg_chi0.1 => Train: DUD; Test: DUD; split: scaffold
No. of molecules in train set: 1368, with 43 actives.
No. of molecules in test set: 457, with 

***
<h2 style='background-color: #F9E5AB; padding: 5px;'>
    CSAR: Shuffle *y* target values in the train set
</h2>
<div style='background-color: #FE8460; min-height: 5px'></div>

#### CSAR 
#### Target values (y) will be shuffled

In [25]:
library = 'CSAR'

# Train and test over DUDU
X = X_merged_dksc.loc[library]
y = y_true_merged.loc[library]
scaffold_series = df_scff_murcko['scff_generic'].loc[library]

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Logistic Regression</h3>
<b>Merged Libraries</b>

In [26]:
%%time
from sklearn.linear_model import LogisticRegression

chi_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 1]

for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    print('*'*80, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*80)

    estimator_name = 'LogReg'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = LogisticRegression(max_iter=300)
    hyperparams = {'C':  np.geomspace(1e-8, 1e2, 6),
                   'penalty': ['l1', 'l2'], 
                   'solver': ['lbfgs', 'liblinear']}

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                 splitting='random', test_size=0.25, 
                 scaffold_series=None)
    
    print('\n')

******************************************************************************** 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
********************************************************************************
LogReg_chi0 => Train: CSAR; Test: CSAR; split: random
No. of molecules in train set: 82, with 19 actives.
No. of molecules in test set: 28, with 6 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.809
- Train ROC-AUC:  	1.000
- Test ROC-AUC:   	0.917
- Best hyperparameters {'C': 100.0, 'penalty': 'l2', 'solver': 'liblinear'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.667 	> median: 0.521, mean: 0.520
> Test best conf. ROC-AUC: 0.799 	> median: 0.621, mean: 0.621
****************************************************************


******************************************************************************** 
 Randomized y fraction (active

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


No. of molecules in train set: 82, with 19 actives.
No. of molecules in test set: 28, with 6 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.810
- Train ROC-AUC:  	1.000
- Test ROC-AUC:   	0.591
- Best hyperparameters {'C': 100.0, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.656 	> median: 0.515, mean: 0.514
> Test best conf. ROC-AUC: 0.515 	> median: 0.303, mean: 0.312
****************************************************************


******************************************************************************** 
 Randomized y fraction (actives): 0.3
Number of actives/inactives shuffled: 7.5
********************************************************************************
LogReg_chi0.3 => Train: CSAR; Test: CSAR; split: random
No. of molecules in train set: 82, with 19 actives.
No. of molecules in test set: 28, with 6 actives.

*******

In [27]:
%%time
from sklearn.linear_model import LogisticRegression

chi_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 1]

for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    print('*'*80, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*80)

    estimator_name = 'LogReg'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = LogisticRegression(max_iter=300)
    hyperparams = {'C':  np.geomspace(1e-8, 1e2, 6),
                   'penalty': ['l1', 'l2'], 
                   'solver': ['lbfgs', 'liblinear']}

    # SCAFFOLD Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                 splitting='scaffold', test_size=0.25, 
                 scaffold_series=scaffold_series)

******************************************************************************** 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
********************************************************************************
LogReg_chi0 => Train: CSAR; Test: CSAR; split: scaffold
No. of molecules in train set: 82, with 18 actives.
No. of molecules in test set: 28, with 7 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.735
- Train ROC-AUC:  	1.000
- Test ROC-AUC:   	0.905
- Best hyperparameters {'C': 100.0, 'penalty': 'l1', 'solver': 'liblinear'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.752 	> median: 0.619, mean: 0.616
> Test best conf. ROC-AUC: 0.721 	> median: 0.337, mean: 0.342
****************************************************************
******************************************************************************** 
 Randomized y fraction (active

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


No. of molecules in train set: 82, with 18 actives.
No. of molecules in test set: 28, with 7 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.688
- Train ROC-AUC:  	1.000
- Test ROC-AUC:   	0.912
- Best hyperparameters {'C': 100.0, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.691 	> median: 0.564, mean: 0.562
> Test best conf. ROC-AUC: 0.752 	> median: 0.378, mean: 0.383
****************************************************************
******************************************************************************** 
 Randomized y fraction (actives): 0.3
Number of actives/inactives shuffled: 7.5
********************************************************************************
LogReg_chi0.3 => Train: CSAR; Test: CSAR; split: scaffold
No. of molecules in train set: 82, with 18 actives.
No. of molecules in test set: 28, with 7 actives.

*******

# Results

In [28]:

row_names = ['Train_set', 'Test_set', 'Model name', 'Split', 
             'N_actives_train', 'N_actives_test', 'N_mols_train', 'Num_mols_test',
            'Mean-CV-ROC', 'ROC-AUC_train', 'ROC-AUC_test', 'best_params',
            'DkS_max_ROC_train',  'DkSc_med_ROC_train', 'DkSc_mean_ROC_train', 
            'DkS_max_ROC_test',  'DkSc_med_ROC_test', 'DkSc_mean_ROC_test'
            ]

y_shuffled_ml_model_selecion = pd.DataFrame(results_dict, index = row_names).T
y_shuffled_ml_model_selecion.to_pickle('df_y_shuffled_ml_model_selecion.pkl')

Unnamed: 0,Train_set,Test_set,Model name,Split,N_actives_train,N_actives_test,N_mols_train,Num_mols_test,Mean-CV-ROC,ROC-AUC_train,ROC-AUC_test,best_params,DkS_max_ROC_train,DkSc_med_ROC_train,DkSc_mean_ROC_train,DkS_max_ROC_test,DkSc_med_ROC_test,DkSc_mean_ROC_test
Merged_Merged_LogReg_chi0_random,Merged,Merged,LogReg_chi0,random,2599,311,867,104,0.892267,0.94111,0.871799,"{'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}",0.671243,0.616292,0.610195,0.678408,0.608346,0.605304
Merged_Merged_LogReg_chi0.1_random,Merged,Merged,LogReg_chi0.1,random,2599,311,867,104,0.845772,0.910249,0.839261,"{'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}",0.646154,0.597902,0.592605,0.660878,0.60128,0.597059
Merged_Merged_LogReg_chi0.2_random,Merged,Merged,LogReg_chi0.2,random,2599,311,867,104,0.800964,0.880588,0.779627,"{'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}",0.642749,0.594762,0.590658,0.664545,0.597379,0.596545
Merged_Merged_LogReg_chi0.3_random,Merged,Merged,LogReg_chi0.3,random,2599,311,867,104,0.753632,0.858808,0.767731,"{'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}",0.630674,0.592439,0.587313,0.630986,0.575067,0.574174
Merged_Merged_LogReg_chi0.4_random,Merged,Merged,LogReg_chi0.4,random,2599,311,867,104,0.686639,0.828816,0.715533,"{'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}",0.575917,0.536211,0.534196,0.659687,0.584812,0.582878
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CSAR_CSAR_LogReg_chi0.2_scaffold,CSAR,CSAR,LogReg_chi0.2,scaffold,82,18,28,7,0.688141,1,0.911565,"{'C': 100.0, 'penalty': 'l2', 'solver': 'lbfgs'}",0.690972,0.563802,0.561663,0.751701,0.377551,0.383296
CSAR_CSAR_LogReg_chi0.3_scaffold,CSAR,CSAR,LogReg_chi0.3,scaffold,82,18,28,7,0.558654,0.598958,0.588435,"{'C': 0.01, 'penalty': 'l1', 'solver': 'liblin...",0.536024,0.44401,0.442518,0.659864,0.472789,0.47042
CSAR_CSAR_LogReg_chi0.4_scaffold,CSAR,CSAR,LogReg_chi0.4,scaffold,82,18,28,7,0.742628,0.690104,0.537415,"{'C': 1e-08, 'penalty': 'l2', 'solver': 'lbfgs'}",0.756076,0.659505,0.656449,0.687075,0.517007,0.512328
CSAR_CSAR_LogReg_chi0.5_scaffold,CSAR,CSAR,LogReg_chi0.5,scaffold,82,18,28,7,0.5,0.5,0.5,"{'C': 1e-08, 'penalty': 'l1', 'solver': 'libli...",0.644097,0.484809,0.485898,0.758503,0.547619,0.545267
