# CDK2: Model Selection - y values shuffled
### Validation Method

In [1]:
import pandas as pd
import numpy as np
import glob, sys, os
sys.path.append('..')

In [2]:
from modules.plotting_metrics import PlotMetric
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white', context='talk', font_scale=0.8)

In [3]:
file_name = './df_DkSc_results_COCRYS_DEKOIS_DUD.pkl'
X_merged_dksc = pd.read_pickle(file_name)
# Extract activity column
y_true_merged = X_merged_dksc['activity']
# Drop column from merged_dkksc
X_merged_dksc = X_merged_dksc.drop('activity', axis=1)
X_merged_dksc.shape
y_true_merged.loc['DEKOIS'].sum()

40

## Scaffold Splitting

In [4]:
#*************************************************
# Functions to compute stratify scaffold splitting
#*************************************************
sys.path.append('../2_Docking_analysis/')
from scaffold_splitter import train_test_scaffold_split

In [5]:
# Compute or load the dataframe containing the Generic Murcko Scaffolds
file = '../2_Docking_analysis/df_COCRYS_DUD_DEKOIS_Murcko_Scaffolds_SMILES.obj'

df_scff_murcko = pd.read_pickle(file)
df_scff_murcko.shape

(6233, 3)

In [6]:
%run 4_Helper_Functions_Model_Selection_Grid_Search.ipynb

#  Hyperparameter Tunning: Grid Search

In [7]:
def randomize_y_labels(y_target, random_chi=0.1):
    '''Función para distribuir de forma aleatoria una fracción 
    chi del vector de etiquetas, de forma estratificada'''
    
    # Make a copy of the original vector
    y_copy = y.copy()
    
    # Get the number of actives inside the y_target vector
    n_actives = y_target.sum()
    random_size = np.floor(random_chi * n_actives)
    # Initialize the counters
    act_count = random_size
    inact_count = random_size
    
    # Create the randomized list of idexes
    idx_shuffled = np.random.choice(range(len(y)), len(y), replace=False)
    # iterate over idx_shuffled until act and inact counters == 0
    for l in idx_shuffled:
        if act_count > 0:
            if y_copy[l] == 1: # Is active, then change it to inactive
                y_copy[l] = 0
                act_count = act_count - 1
                continue
            if inact_count > 0: # If is inactive, change it to active
                y_copy[l] = 1
                inact_count = inact_count - 1
                continue
        else:
            break
    return(y_copy)
    

***
<h2 style='background-color: #F9E5AB; padding: 5px;'>
    Merged libraries: Shuffle *y* target values in the train set
</h2>
<div style='background-color: #FE8460; min-height: 5px'></div>

#### DEKOIS, DUD and COCRYS are  treated as one unique library
#### Target values (y) will be shuffled

In [8]:
# Train and test over 
X = X_merged_dksc
# ***** Permutate y values *****
y = y_true_merged#.sample(frac=1, replace=False)

library = 'Merged'
scaffold_series = df_scff_murcko['scff_generic']

# Create an empty dictionary to save results

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Logistic Regression</h3>
<b>Merged Libraries</b>

In [9]:
%%time
from sklearn.linear_model import LogisticRegression

chi_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 1]

for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    print('*'*60, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*60)

    estimator_name = 'LogReg'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = LogisticRegression(max_iter=300)
    hyperparams = {'C':  np.geomspace(1e-8, 1e2, 6),
                   'penalty': ['l1', 'l2'], 
                   'solver': ['lbfgs', 'liblinear']}

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                 splitting='random', test_size=0.25, 
                 scaffold_series=None)
    
    print('\n')

************************************************************ 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
************************************************************
LogReg_chi0 => Train: Merged; Test: Merged; split: random
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.822
- Train ROC-AUC:  	0.884
- Test ROC-AUC:   	0.854
- Best hyperparameters {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.677 	> median: 0.598, mean: 0.598
> Test best conf. ROC-AUC: 0.713 	> median: 0.642, mean: 0.635
****************************************************************


************************************************************ 
 Randomized y fraction (actives): 0.1
Number of actives/inactives shuffled: 30.0
*

In [10]:
%%time
from sklearn.linear_model import LogisticRegression

chi_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 1]

for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    print('*'*60, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*60)

    estimator_name = 'LogReg'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = LogisticRegression(max_iter=300)
    hyperparams = {'C':  np.geomspace(1e-8, 1e2, 6),
                   'penalty': ['l1', 'l2'], 
                   'solver': ['lbfgs', 'liblinear']}

    # SCAFFOLD Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                 splitting='scaffold', test_size=0.25, 
                 scaffold_series=scaffold_series)

************************************************************ 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
************************************************************
LogReg_chi0 => Train: Merged; Test: Merged; split: scaffold
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.818
- Train ROC-AUC:  	0.903
- Test ROC-AUC:   	0.745
- Best hyperparameters {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.687 	> median: 0.594, mean: 0.593
> Test best conf. ROC-AUC: 0.716 	> median: 0.654, mean: 0.647
****************************************************************
************************************************************ 
 Randomized y fraction (actives): 0.1
Number of actives/inactives shuffled: 30.0
*

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: X Gradient Boosting </h3>
<b>Merged Libraries</b>

In [11]:
%%time

from xgboost import XGBClassifier

chi_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 1]

for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    print('*'*80, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*80)
    
    estimator_name = 'XGB_tree'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = XGBClassifier()
    hyperparams = {'n_estimators': [200, 300],
                   'max_depth': [2, 3, 10, 20],
                   'learning_rate': [0.05, 0.1],
                   'gamma': [0.01, 0.1, 0.5, 1],
                   'alpha': [0.01, 0.1, 0.5, 1],
                   'subsample': [0.3, 0.5],
                   'colsample_bytree': [0.3, 0.5, 1]
                }

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                 splitting='random', test_size=0.25, 
                 scaffold_series=None,
                 # RandomizedGS
                 randomGS=True, n_iter=30)
    
    print('\n')

******************************************************************************** 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
********************************************************************************
XGB_tree_chi0 => Train: Merged; Test: Merged; split: random
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.880
- Train ROC-AUC:  	1.000
- Test ROC-AUC:   	0.892
- Best hyperparameters {'subsample': 0.5, 'n_estimators': 200, 'max_depth': 20, 'learning_rate': 0.1, 'gamma': 0.5, 'colsample_bytree': 1, 'alpha': 0.5}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.690 	> median: 0.610, mean: 0.609
> Test best conf. ROC-AUC: 0.672 	> median: 0.603, mean: 0.600
****************************************************************


***********************

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Radial Basis Function SVM</h3>
<b>Merged Libraries</b>

In [12]:
%%time
from sklearn.svm import SVC

chi_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 1]

for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    estimator_name = 'rbfSVC'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = SVC(kernel = 'rbf', probability=True)
    hyperparams = {'C': [1]} #{'C': np.geomspace(1e0, 1e2, 3), 'gamma': np.geomspace(1e-4, 1e0, 3)}

    
    print('*'*80, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*80)

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                 splitting='random', test_size=0.25, 
                 scaffold_series=None)
    
    print('\n')

******************************************************************************** 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
********************************************************************************
rbfSVC_chi0 => Train: Merged; Test: Merged; split: random
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.843
- Train ROC-AUC:  	0.957
- Test ROC-AUC:   	0.872
- Best hyperparameters {'C': 1}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.684 	> median: 0.602, mean: 0.601
> Test best conf. ROC-AUC: 0.696 	> median: 0.629, mean: 0.626
****************************************************************


******************************************************************************** 
 Randomized y fraction (actives): 0.1
Number of actives/inactive

In [13]:
%%time

for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    estimator_name = 'rbfSVC'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = SVC(kernel = 'rbf', probability=True)
    hyperparams = {'C': [1]} #{'C': np.geomspace(1e0, 1e2, 3), 'gamma': np.geomspace(1e-4, 1e0, 3)}
    
    print('*'*80, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*80)
    
    # SCAFFOLD Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                 splitting='scaffold', test_size=0.25, 
                 scaffold_series=scaffold_series)
    
    print('\n')

******************************************************************************** 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
********************************************************************************
rbfSVC_chi0 => Train: Merged; Test: Merged; split: scaffold
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.833
- Train ROC-AUC:  	0.966
- Test ROC-AUC:   	0.718
- Best hyperparameters {'C': 1}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.687 	> median: 0.594, mean: 0.593
> Test best conf. ROC-AUC: 0.716 	> median: 0.654, mean: 0.647
****************************************************************


******************************************************************************** 
 Randomized y fraction (actives): 0.1
Number of actives/inacti

<h3 style='color: #F84122; padding: 0px;; margin:ade0px'>GS: kNN Calssifier </h3>
<b>Merged Libraries</b>

In [14]:
%%time
from sklearn.neighbors import KNeighborsClassifier 


for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    estimator_name = '1-NN'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = KNeighborsClassifier()
    hyperparams = {'n_neighbors': [1], 
                   'p': [1, 2]
                     }
    
    print('*'*80, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*80)

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                 splitting='random', test_size=0.25, 
                 scaffold_series=None)
    print('\n')

******************************************************************************** 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
********************************************************************************
1-NN_chi0 => Train: Merged; Test: Merged; split: random
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.714
- Train ROC-AUC:  	1.000
- Test ROC-AUC:   	0.682
- Best hyperparameters {'n_neighbors': 1, 'p': 1}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.695 	> median: 0.618, mean: 0.615
> Test best conf. ROC-AUC: 0.691 	> median: 0.581, mean: 0.584
****************************************************************


******************************************************************************** 
 Randomized y fraction (actives): 0.1
Number of 

In [15]:
%%time
from sklearn.neighbors import KNeighborsClassifier 


for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    estimator_name = '1-NN'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = KNeighborsClassifier()
    hyperparams = {'n_neighbors': [1], 
                   'p': [1, 2]
                     }
    
    print('*'*80, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*80)

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                splitting='scaffold', test_size=0.25, 
                 scaffold_series=scaffold_series) 
    print('\n')

******************************************************************************** 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
********************************************************************************
1-NN_chi0 => Train: Merged; Test: Merged; split: scaffold
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.628
- Train ROC-AUC:  	1.000
- Test ROC-AUC:   	0.581
- Best hyperparameters {'n_neighbors': 1, 'p': 1}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.687 	> median: 0.594, mean: 0.593
> Test best conf. ROC-AUC: 0.716 	> median: 0.654, mean: 0.647
****************************************************************


******************************************************************************** 
 Randomized y fraction (actives): 0.1
Number o

***
<h2 style='background-color: #F9E5AB; padding: 5px;'>
    DEKOIS: Shuffle *y* target values in the train set
</h2>
<div style='background-color: #FE8460; min-height: 5px'></div>

#### DEKOIS 
#### Target values (y) will be shuffled

In [16]:
library = 'DEKOIS'

# Train and test over DUDU
X = X_merged_dksc.loc[library]
y = y_true_merged.loc[library]
scaffold_series = df_scff_murcko['scff_generic'].loc[library]

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Logistic Regression</h3>
<b>Merged Libraries</b>

In [17]:
%%time
from sklearn.linear_model import LogisticRegression

chi_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 1]

for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    print('*'*80, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*80)

    estimator_name = 'LogReg'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = LogisticRegression(max_iter=300)
    hyperparams = {'C':  np.geomspace(1e-8, 1e2, 6),
                   'penalty': ['l1', 'l2'], 
                   'solver': ['lbfgs', 'liblinear']}

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                 splitting='random', test_size=0.25, 
                 scaffold_series=None)
    
    print('\n')

******************************************************************************** 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
********************************************************************************
LogReg_chi0 => Train: DEKOIS; Test: DEKOIS; split: random
No. of molecules in train set: 915, with 30 actives.
No. of molecules in test set: 306, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.888
- Train ROC-AUC:  	0.890
- Test ROC-AUC:   	0.886
- Best hyperparameters {'C': 1e-08, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.899 	> median: 0.842, mean: 0.841
> Test best conf. ROC-AUC: 0.954 	> median: 0.813, mean: 0.798
****************************************************************


******************************************************************************** 
 Randomized y fraction (act

In [18]:
%%time
from sklearn.linear_model import LogisticRegression

chi_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 1]

for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    print('*'*80, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*80)

    estimator_name = 'LogReg'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = LogisticRegression(max_iter=300)
    hyperparams = {'C':  np.geomspace(1e-8, 1e2, 6),
                   'penalty': ['l1', 'l2'], 
                   'solver': ['lbfgs', 'liblinear']}

    # SCAFFOLD Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                 splitting='scaffold', test_size=0.25, 
                 scaffold_series=scaffold_series)

******************************************************************************** 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
********************************************************************************
LogReg_chi0 => Train: DEKOIS; Test: DEKOIS; split: scaffold
No. of molecules in train set: 915, with 30 actives.
No. of molecules in test set: 306, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.866
- Train ROC-AUC:  	0.870
- Test ROC-AUC:   	0.950
- Best hyperparameters {'C': 0.0001, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.883 	> median: 0.816, mean: 0.811
> Test best conf. ROC-AUC: 0.967 	> median: 0.897, mean: 0.893
****************************************************************
******************************************************************************** 
 Randomized y fraction (ac

***
<h2 style='background-color: #F9E5AB; padding: 5px;'>
    DUD: Shuffle *y* target values in the train set
</h2>
<div style='background-color: #FE8460; min-height: 5px'></div>

#### DUD 
#### Target values (y) will be shuffled

In [19]:
library = 'DUD'

# Train and test over DUDU
X = X_merged_dksc.loc[library]
y = y_true_merged.loc[library]
scaffold_series = df_scff_murcko['scff_generic'].loc[library]

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Logistic Regression</h3>
<b>Merged Libraries</b>

In [20]:
%%time
from sklearn.linear_model import LogisticRegression

chi_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 1]

for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    print('*'*80, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*80)

    estimator_name = 'LogReg'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = LogisticRegression(max_iter=300)
    hyperparams = {'C':  np.geomspace(1e-8, 1e2, 6),
                   'penalty': ['l1', 'l2'], 
                   'solver': ['lbfgs', 'liblinear']}

    # RANDOM Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                 splitting='random', test_size=0.25, 
                 scaffold_series=None)
    
    print('\n')

******************************************************************************** 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
********************************************************************************
LogReg_chi0 => Train: DUD; Test: DUD; split: random
No. of molecules in train set: 3669, with 106 actives.
No. of molecules in test set: 1224, with 35 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.835
- Train ROC-AUC:  	0.873
- Test ROC-AUC:   	0.836
- Best hyperparameters {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.720 	> median: 0.584, mean: 0.582
> Test best conf. ROC-AUC: 0.717 	> median: 0.532, mean: 0.538
****************************************************************


******************************************************************************** 
 Randomized y fraction (actives

In [21]:
%%time
from sklearn.linear_model import LogisticRegression

chi_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 1]

for chi in chi_values:
    y_rd = randomize_y_labels(y, chi)
    
    print('*'*80, '\n', 'Randomized y fraction (actives):', chi)
    print('Number of actives/inactives shuffled:', chi*y.sum())
    print('*'*80)

    estimator_name = 'LogReg'
    estimator_name = estimator_name + '_chi' + str(chi)
    estimator = LogisticRegression(max_iter=300)
    hyperparams = {'C':  np.geomspace(1e-8, 1e2, 6),
                   'penalty': ['l1', 'l2'], 
                   'solver': ['lbfgs', 'liblinear']}

    # SCAFFOLD Train test splitting
    split_and_gs(library, library, estimator_name,
                 X, y_rd, estimator, hyperparams,
                 splitting='scaffold', test_size=0.25, 
                 scaffold_series=scaffold_series)

******************************************************************************** 
 Randomized y fraction (actives): 0
Number of actives/inactives shuffled: 0
********************************************************************************
LogReg_chi0 => Train: DUD; Test: DUD; split: scaffold
No. of molecules in train set: 3669, with 105 actives.
No. of molecules in test set: 1224, with 36 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.886
- Train ROC-AUC:  	0.967
- Test ROC-AUC:   	0.638
- Best hyperparameters {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.747 	> median: 0.545, mean: 0.547
> Test best conf. ROC-AUC: 0.718 	> median: 0.636, mean: 0.634
****************************************************************
******************************************************************************** 
 Randomized y fraction (acti

# Results

In [23]:

row_names = ['Train_set', 'Test_set', 'Model name', 'Split', 
             'N_actives_train', 'N_actives_test', 'N_mols_train', 'Num_mols_test',
            'Mean-CV-ROC', 'ROC-AUC_train', 'ROC-AUC_test', 'best_params',
            'DkS_max_ROC_train',  'DkSc_med_ROC_train', 'DkSc_mean_ROC_train', 
            'DkS_max_ROC_test',  'DkSc_med_ROC_test', 'DkSc_mean_ROC_test'
            ]


y_shuffled_ml_model_selecion = pd.DataFrame(results_dict, index = row_names).T
y_shuffled_ml_model_selecion.to_pickle('df_y_shuffled_ml_model_selecion.pkl')

Unnamed: 0,Train_set,Test_set,Model name,Split,N_actives_train,N_actives_test,N_mols_train,Num_mols_test,Mean-CV-ROC,ROC-AUC_train,ROC-AUC_test,best_params,DkS_max_ROC_train,DkSc_med_ROC_train,DkSc_mean_ROC_train,DkS_max_ROC_test,DkSc_med_ROC_test,DkSc_mean_ROC_test
Merged_Merged_LogReg_chi0_random,Merged,Merged,LogReg_chi0,random,4674,225,1559,75,0.822164,0.883594,0.853836,"{'C': 1.0, 'penalty': 'l1', 'solver': 'libline...",0.67739,0.598271,0.597901,0.71296,0.641739,0.635027
Merged_Merged_LogReg_chi0.1_random,Merged,Merged,LogReg_chi0.1,random,4674,225,1559,75,0.786952,0.854789,0.763666,"{'C': 1.0, 'penalty': 'l1', 'solver': 'libline...",0.683756,0.608788,0.607451,0.652147,0.588576,0.586541
Merged_Merged_LogReg_chi0.2_random,Merged,Merged,LogReg_chi0.2,random,4674,225,1559,75,0.746461,0.809871,0.72124,"{'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}",0.642746,0.583939,0.583097,0.637269,0.566651,0.567458
Merged_Merged_LogReg_chi0.3_random,Merged,Merged,LogReg_chi0.3,random,4674,225,1559,75,0.734265,0.802656,0.65823,"{'C': 0.01, 'penalty': 'l2', 'solver': 'liblin...",0.631774,0.57297,0.571391,0.651572,0.587819,0.588254
Merged_Merged_LogReg_chi0.4_random,Merged,Merged,LogReg_chi0.4,random,4674,225,1559,75,0.664255,0.741662,0.681815,"{'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}",0.620917,0.581907,0.581113,0.60376,0.548845,0.547769
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DUD_DUD_LogReg_chi0.2_scaffold,DUD,DUD,LogReg_chi0.2,scaffold,3669,105,1224,36,0.839057,0.924117,0.583216,"{'C': 0.01, 'penalty': 'l2', 'solver': 'liblin...",0.740615,0.562833,0.563833,0.574986,0.489917,0.489191
DUD_DUD_LogReg_chi0.3_scaffold,DUD,DUD,LogReg_chi0.3,scaffold,3669,105,1224,36,0.763736,0.853143,0.454288,"{'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}",0.722719,0.577601,0.578109,0.579054,0.494657,0.496354
DUD_DUD_LogReg_chi0.4_scaffold,DUD,DUD,LogReg_chi0.4,scaffold,3669,105,1224,36,0.761527,0.904195,0.541059,"{'C': 1.0, 'penalty': 'l1', 'solver': 'libline...",0.687605,0.54984,0.548905,0.577581,0.444146,0.445062
DUD_DUD_LogReg_chi0.5_scaffold,DUD,DUD,LogReg_chi0.5,scaffold,3669,105,1224,36,0.737072,0.834255,0.510078,"{'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}",0.700665,0.598624,0.596589,0.609556,0.486029,0.48855
