# FXa: Model Selection

In [1]:
import pandas as pd
import numpy as np
import glob, sys, os
sys.path.append('..')

In [2]:
from modules.plotting_metrics import PlotMetric
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white', context='talk', font_scale=0.8)

In [3]:
file_name = './df_DkSc_results_COCRYS_DEKOIS_DUD.pkl'
X_merged_dksc = pd.read_pickle(file_name)
# Extract activity column
y_true_merged = X_merged_dksc['activity']
# Drop column from merged_dkksc
X_merged_dksc = X_merged_dksc.drop('activity', axis=1)
X_merged_dksc.shape
y_true_merged.loc['DEKOIS'].sum()

40

### Timeout Decorator

In [4]:
import signal
from functools import wraps

def timeout(n_seconds=300):
    '''Stops a function execution after n seconds'''
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Set alarm for n seconds
            signal.alarm(n_seconds)
            try:
                # Call decorated func
                return func(*args, **kwargs)
#             except TimeoutError as e:
                print(f'Execution finished after {n_seconds}:', e)
            finally:
                # Cancel Alarm
                signal.alarm(0)
        return wrapper
    return decorator

## Scaffold Splitting

In [5]:
#*************************************************
# Functions to compute stratify scaffold splitting
#*************************************************
from scaffold_splitter import train_test_scaffold_split

In [6]:
# Compute or load the dataframe containing the Generic Murcko Scaffolds
file = './df_COCRYS_DUD_DEKOIS_Murcko_Scaffolds_SMILES.obj'

df_scff_murcko = pd.read_pickle(file)

## Train/test on the same dataset 

### Learning Curves

In [7]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import learning_curve

In [8]:
def plot_learning_curves(estimator, X, y, title, ylim=[0.5,1], axes=None,
                         cv=3, train_sizes=np.linspace(0.1, 1.0, 10), 
                         scoring='roc_auc', n_jobs=4):
    '''
    Plot estimator performance on the training and validation
    sets as a function of the training set size.

    Parameters
    ----------
    estimator: sklearn estimator object type
       Object type that implements the "fit" and "predict method"

    X: array-like, shape (m_samples, n_features)
        Training array with m_samples and n_features.
    y: array-like, shape (m_samples)
        Target array relative to X with m labels.
    axes: array of 3 axes
        matplotlib axes array to append the generated plot
    ylim: array
       
    '''
    if axes == None:
        _, axes = plt.subplots(1, 1, figsize=(5, 5))
        
    axes.set(title=title, ylim=ylim, xlabel='Training examples', ylabel=f'Metric: {scoring}')
    # Use learning_curve function from sklearn
    train_sizes, train_scores, test_scores, fit_times, _ = \
       learning_curve(estimator, X, y, scoring=scoring, return_times = True,
                      cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    # Compute useful metrics
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
#     fit_times_mean = np.mean(fit_times, axis=1)
#     fit_times_std = np.std(fit_times, axis=1)
    # Plot the learning curve
    axes.grid()
    axes.fill_between(train_sizes, train_scores_mean - train_scores_std,
                        train_scores_mean + train_scores_std, alpha=0.1, color='r')
    axes.fill_between(train_sizes, test_scores_mean - test_scores_std,
                        test_scores_mean + test_scores_std, alpha=0.1, color='g')
    axes.plot(train_sizes, train_scores_mean, 'o-', color='r',
                label='Train score')
    axes.plot(train_sizes, test_scores_mean, 'o-', color='g',
                label='Cross-validation score')
    axes.legend(loc='lower right')
    

### Hyperparameters tunning: Grid Search

In [9]:
def run_grid_search(estimator, X_train, y_train, X_test, y_test, hyperparams,  cv_value=5, 
                    scoring='roc_auc', splitting='random', std_scale=False,
                   randomGS = False, n_iter=10):
    # Format the hyperparms
    hyperparams_dict = {'estimator__' + key: val for key, val in hyperparams.items()}
    
    if std_scale:
        # Create the Pipe
        scaler = StandardScaler()
        pipe = Pipeline([('scaler', scaler),
                         ('estimator', estimator)])
    else:
        pipe = estimator
    
    # Do Grid Search
    if randomGS:
        gs = RandomizedSearchCV(estimator = pipe,  param_distributions = hyperparams,
                     cv = cv_value, scoring = scoring, n_jobs = 6, refit = True, n_iter=n_iter)
    else:
        gs = GridSearchCV(estimator = pipe, param_grid = hyperparams,
                     cv = cv_value, scoring = scoring, n_jobs = 6, refit = True)
    # Train the model
    gs.fit(X_train, y_train) 

    # Predictions
    y_train_predict = gs.predict_proba(X_train)
    y_test_predict = gs.predict_proba(X_test)
    
    # Values to return
    n_train_mols    = y_train.shape[0]
    n_train_actives = y_train.sum()
    n_test_mols     = y_test.shape[0]
    n_test_actives  = y_test.sum()
    mean_cv_roc     = gs.best_score_
    train_roc       = roc_auc_score(y_train, y_train_predict[:, 1])
    test_roc        = roc_auc_score(y_test, y_test_predict[:, 1])
    best_params     = gs.best_params_

    # Print some values
    print(f'No. of molecules in train set: {n_train_mols}, with {n_train_actives} actives.')
    print(f'No. of molecules in test set: {n_test_mols}, with {n_test_actives} actives.')
    print('')
    print('*'*10, 'GRID SEARCH RESULTS', '*'*10)
    print('- Mean CV ROC-AUC:\t{:.3f}'.format(mean_cv_roc))
    print('- Train ROC-AUC:  \t{:.3f}'.format(train_roc))
    print('- Test ROC-AUC:   \t{:.3f}'.format(test_roc))
    print('- Best hyperparameters', best_params)
    print('**'*21)
    print('')
    
    return [n_train_mols, n_train_actives, n_test_mols, n_test_actives,
            mean_cv_roc, train_roc, test_roc, best_params]

### Function to report the Best Conformation's ROC-AUC for a given subset of samples

In [10]:
#************************************************************************
# Returns the best conformatio's ROC-AUC value of a given subset X and y
#************************************************************************

def get_roc_auc_DkSc(X_train, y_train, X_test, y_test, verbose=True):
    roc_auc_train = X_train.apply(
        lambda x: roc_auc_score(y_true= y_train, y_score= -x), axis=0)
    roc_auc_test = X_test.apply(
        lambda x: roc_auc_score(y_true= y_test, y_score= -x), axis=0)
    # Values to return
    train_best_roc = roc_auc_train.max()
    train_median   = roc_auc_train.median()
    train_mean     = roc_auc_train.mean()
    test_best_roc = roc_auc_test.max()
    test_median   = roc_auc_test.median()
    test_mean     = roc_auc_test.mean()
    
    if verbose:
        print("***** Best Conformation's ROC-AUC using docking scores *****")
        
        print("> Train best conf. ROC-AUC: {:.3f}".format(train_best_roc) +
              " \t> median: {:.3f}".format(train_median) +
              ', mean: {:.3f}'.format(train_mean))
        
        print("> Test best conf. ROC-AUC: {:.3f}".format(test_best_roc) +
              " \t> median: {:.3f}".format(test_median) +
              ', mean: {:.3f}'.format(test_mean))
        print('**'*32)
    # return a list of results to capture by the  wrapper function
    return [train_best_roc, train_median, train_mean,
            test_best_roc, test_median, test_mean]
        

In [28]:
#******************************************
# Decorator functions to capture GS results
#******************************************
from functools import wraps

def capture_GS_results(results_dict=None, capture=True):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            if results_dict != None and capture:
                results = func(*args, **kwargs)
                # Create a key with the first four values
                key = '_'.join(results[:4])
                # Append results to the results dictionary
                results_dict[key] = results
            else:
                return func(*args, **kwargs)
        return wrapper
    return decorator
            
#******************************************************
# Dictionary to capture GS results using the decorator 
#******************************************************
results_dict = {}

#******************************************************
# Function to Split and run Grid Search
#******************************************************
@capture_GS_results(results_dict)
def split_and_gs(train_name, test_name, estimator_name,
                 X, y, estimator, hyperparams, splitting='random', 
                 test_size=0.25, scaffold_series=None, random_state=None, **kwargs):
    '''Given a X and y sets, a sklean estimator and an splitting method, 
    performs Grid Search CV using the parsed hyperparams'''
    #**************
    # Do the split
    #**************
    if splitting == 'scaffold':
        X_train, X_test, y_train, y_test = \
            train_test_scaffold_split(X, y, scaffold_series = scaffold_series,
                test_size=test_size, stratify=y)
    elif splitting == 'random':
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=test_size, stratify=y,
                            random_state=random_state)
        
    # Verbose
    print(f'{estimator_name} => Train: {train_name}; Test: {test_name}; split: {splitting}')
    
    # Function to run Grid Search
    #----------------------------
    gs_results = run_grid_search(estimator, 
                    X_train, y_train, X_test, y_test, 
                    hyperparams = hyperparams,  **kwargs)

    # Function to extract ROC results from DkSc values 
    #-------------------------------------------------
    dksc_results = get_roc_auc_DkSc(X_train, y_train, X_test, y_test)
    
    # Return both list of results to captured by the decorador function
    return [train_name, test_name, estimator_name, splitting] + gs_results + dksc_results

In [12]:
results_dict

{}

#  Hyperparameter Tunning: Grid Search
<h2 style='background-color: #F9E5AB; padding: 5px;'>
    Train-Test with DEKOIS Library
</h2>
<div style='background-color: #FE8460; min-height: 5px'></div>

In [13]:
#### library = 'DEKOIS'
library = 'DEKOIS'

# Train and test over DEKOIS
X = X_merged_dksc.loc[library]
y = y_true_merged.loc[library]
scaffold_series = df_scff_murcko['scff_generic'].loc[library]

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Linear SVM </h3>
<b>DEKOIS  - Random and Stratified Scaffold Splitting</b>

In [14]:
%%time
from sklearn.svm import SVC

estimator_name = 'LinearSVC'
estimator = SVC(kernel = 'linear', probability=True)
hyperparams = {'C': np.geomspace(1e-8, 1e4, 7)}

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

LinearSVC => Train: DEKOIS; Test: DEKOIS; split: random
No. of molecules in train set: 915, with 30 actives.
No. of molecules in test set: 306, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.889
- Train ROC-AUC:  	0.893
- Test ROC-AUC:   	0.888
- Best hyperparameters {'C': 1e-08}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.902 	> median: 0.835, mean: 0.832
> Test best conf. ROC-AUC: 0.916 	> median: 0.839, mean: 0.828
****************************************************************
CPU times: user 913 ms, sys: 419 ms, total: 1.33 s
Wall time: 5.73 s


In [1]:
%%time
# from sklearn.svm import SVC

estimator_name = 'LinearSVC'
estimator = SVC(kernel = 'linear', probability=True)
hyperparams = {'C': np.geomspace(1e-8, 1e4, 7)}

# SCAFFOLD Train test splitting
split_and_gs(library, library, estimator_name,
              X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

NameError: name 'SVC' is not defined

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Radial Basis Function SVM </h3>
<b>DEKOIS  - Random and Stratified Scaffold Splitting</b>

In [16]:
%%time
estimator_name = 'rbfSVC'
estimator = SVC(kernel = 'rbf', probability=True)
hyperparams = {'C': np.geomspace(1e-8, 1e4, 7), 
               'gamma': np.geomspace(1e-8, 1e4, 7)}

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

rbfSVC => Train: DEKOIS; Test: DEKOIS; split: random
No. of molecules in train set: 915, with 30 actives.
No. of molecules in test set: 306, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.899
- Train ROC-AUC:  	0.907
- Test ROC-AUC:   	0.874
- Best hyperparameters {'C': 1e-08, 'gamma': 0.0001}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.918 	> median: 0.835, mean: 0.831
> Test best conf. ROC-AUC: 0.930 	> median: 0.832, mean: 0.831
****************************************************************
CPU times: user 1.64 s, sys: 414 ms, total: 2.06 s
Wall time: 12 s


In [17]:
%%time
estimator_name = 'rbfSVC'
estimator = SVC(kernel = 'rbf', probability=True)
hyperparams = {'C': np.geomspace(1e-8, 1e4, 7), 
               'gamma': np.geomspace(1e-8, 1e4, 7)}

# SCAFFOLD Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

rbfSVC => Train: DEKOIS; Test: DEKOIS; split: scaffold
No. of molecules in train set: 915, with 30 actives.
No. of molecules in test set: 306, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.864
- Train ROC-AUC:  	0.875
- Test ROC-AUC:   	0.954
- Best hyperparameters {'C': 1e-08, 'gamma': 1e-06}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.883 	> median: 0.816, mean: 0.811
> Test best conf. ROC-AUC: 0.967 	> median: 0.897, mean: 0.893
****************************************************************
CPU times: user 1.83 s, sys: 432 ms, total: 2.26 s
Wall time: 12.6 s


<h3 style='color: #F84122; padding: 0px;; margin: 0px'> GS: Logistic Regression </h3>
<b>DEKOIS  - Random and Stratified Scaffold Splitting</b>

In [18]:
%%time
from sklearn.linear_model import LogisticRegression

estimator_name = 'LogReg'
estimator = LogisticRegression(max_iter=150)
hyperparams = {'C': np.geomspace(1e-8, 1e4, 7), 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

LogReg => Train: DEKOIS; Test: DEKOIS; split: random
No. of molecules in train set: 915, with 30 actives.
No. of molecules in test set: 306, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.873
- Train ROC-AUC:  	0.874
- Test ROC-AUC:   	0.934
- Best hyperparameters {'C': 0.0001, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.880 	> median: 0.824, mean: 0.818
> Test best conf. ROC-AUC: 0.971 	> median: 0.868, mean: 0.867
****************************************************************
CPU times: user 1.2 s, sys: 477 ms, total: 1.68 s
Wall time: 11.4 s


In [19]:
%%time
# from sklearn.linear_model import LogisticRegression

estimator_name = 'LogReg'
estimator = LogisticRegression(max_iter=150)
hyperparams = {'C': np.geomspace(1e-8, 1e4, 7), 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# SCAFFOLD Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

LogReg => Train: DEKOIS; Test: DEKOIS; split: scaffold
No. of molecules in train set: 915, with 30 actives.
No. of molecules in test set: 306, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.866
- Train ROC-AUC:  	0.870
- Test ROC-AUC:   	0.950
- Best hyperparameters {'C': 0.0001, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.883 	> median: 0.816, mean: 0.811
> Test best conf. ROC-AUC: 0.967 	> median: 0.897, mean: 0.893
****************************************************************
CPU times: user 1.61 s, sys: 680 ms, total: 2.29 s
Wall time: 10.3 s


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: K-Neighbors Classifier</h3>
<b>DEKOIS  - Random and Stratified Scaffold Splitting</b>

In [20]:
%%time
from sklearn.neighbors import KNeighborsClassifier 

estimator_name = 'kNN'
estimator = KNeighborsClassifier()
hyperparams = {'n_neighbors': [25, 55, 125, 225], 
               'p': [1, 2]
             }

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='random', test_size=0.25, 
             scaffold_series=None)

kNN => Train: DEKOIS; Test: DEKOIS; split: random
No. of molecules in train set: 915, with 30 actives.
No. of molecules in test set: 306, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.902
- Train ROC-AUC:  	0.910
- Test ROC-AUC:   	0.826
- Best hyperparameters {'n_neighbors': 225, 'p': 2}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.915 	> median: 0.853, mean: 0.847
> Test best conf. ROC-AUC: 0.887 	> median: 0.788, mean: 0.784
****************************************************************
CPU times: user 1.03 s, sys: 121 ms, total: 1.15 s
Wall time: 1.65 s


In [21]:
%%time
from sklearn.neighbors import KNeighborsClassifier 

estimator_name = 'kNN'
estimator = KNeighborsClassifier()
hyperparams = {'n_neighbors': [25, 55, 125, 225], 
               'p': [1, 2]
             }

# SCAFFOLD Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

kNN => Train: DEKOIS; Test: DEKOIS; split: scaffold
No. of molecules in train set: 915, with 30 actives.
No. of molecules in test set: 306, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.842
- Train ROC-AUC:  	0.870
- Test ROC-AUC:   	0.947
- Best hyperparameters {'n_neighbors': 225, 'p': 2}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.883 	> median: 0.816, mean: 0.811
> Test best conf. ROC-AUC: 0.967 	> median: 0.897, mean: 0.893
****************************************************************
CPU times: user 1.08 s, sys: 75 ms, total: 1.16 s
Wall time: 1.5 s


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Decision Tree Classifier</h3>
<b>DEKOIS  - Random and Stratified Scaffold Splitting</b>

In [22]:
%%time
from sklearn.tree import DecisionTreeClassifier

estimator_name = 'DTree'
estimator = DecisionTreeClassifier(splitter='best')
hyperparams = {'criterion': ['gini', 'entropy'], 
               'max_depth': [2, 3, 5],
               'min_samples_split': [0.2, 0.25, 0.3],
               'max_features': [None, 'sqrt', 'log2']}

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='random', test_size=0.25, 
             scaffold_series=None)

DTree => Train: DEKOIS; Test: DEKOIS; split: random
No. of molecules in train set: 915, with 30 actives.
No. of molecules in test set: 306, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.850
- Train ROC-AUC:  	0.859
- Test ROC-AUC:   	0.870
- Best hyperparameters {'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 0.25}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.895 	> median: 0.837, mean: 0.832
> Test best conf. ROC-AUC: 0.936 	> median: 0.832, mean: 0.827
****************************************************************
CPU times: user 1.6 s, sys: 404 ms, total: 2 s
Wall time: 2.28 s


In [23]:
%%time
from sklearn.tree import DecisionTreeClassifier

estimator_name = 'DTree'
estimator = DecisionTreeClassifier(splitter='best')
hyperparams = {'criterion': ['gini', 'entropy'], 
               'max_depth': [2, 3, 5],
               'min_samples_split': [0.2, 0.25, 0.3],
               'max_features': [None, 'sqrt', 'log2']}

# SCAFFOLD Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

DTree => Train: DEKOIS; Test: DEKOIS; split: scaffold
No. of molecules in train set: 915, with 30 actives.
No. of molecules in test set: 306, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.822
- Train ROC-AUC:  	0.895
- Test ROC-AUC:   	0.883
- Best hyperparameters {'criterion': 'entropy', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 0.2}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.883 	> median: 0.816, mean: 0.811
> Test best conf. ROC-AUC: 0.967 	> median: 0.897, mean: 0.893
****************************************************************
CPU times: user 1.19 s, sys: 259 ms, total: 1.45 s
Wall time: 1.73 s


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Bagging Classifier (k-NN as base estimator) </h3>
<b>DEKOIS  - Random and Stratified Scaffold Splitting</b>

In [24]:
%%time
from sklearn.ensemble import BaggingClassifier

estimator_name = 'BagClf-kNN'
knn = KNeighborsClassifier(n_neighbors=125, p=1, 
                           weights='distance')
estimator = BaggingClassifier(base_estimator=knn, 
                              n_jobs=6, oob_score=True)

hyperparams = {'n_estimators': [300]}

# RANDOM Train test splitting
# split_and_gs(X, y, estimator, hyperparams, 
#              splitting='random', test_size=0.25, 
#              scaffold_series=None)

CPU times: user 124 ms, sys: 69.4 ms, total: 193 ms
Wall time: 189 ms


In [25]:
%%time
# from sklearn.ensemble import BaggingClassifier

estimator_name = 'BagClf-kNN'
knn = KNeighborsClassifier(n_neighbors=125, p=1, 
                           weights='distance')
estimator = BaggingClassifier(base_estimator=knn, 
                              n_jobs=6, oob_score=True)

hyperparams = {'n_estimators': [300]}

# SCAFFOLD Train test splitting
# split_and_gs(X, y, estimator, hyperparams, 
#              splitting='scaffold', test_size=0.25, 
#              scaffold_series=scaffold_series)

CPU times: user 33 µs, sys: 7 µs, total: 40 µs
Wall time: 47.7 µs


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS:Random Forest </h3>
<b>DEKOIS  - Random and Stratified Scaffold Splitting</b>

In [26]:
%%time
from sklearn.ensemble import RandomForestClassifier

estimator_name = 'RandForest'
estimator = RandomForestClassifier()
hyperparams = {'n_estimators': [300, 500],
             'max_depth': [2,  5],
             'min_samples_leaf': [0.1, 0.2],
             'max_features': ['log2', 'sqrt']
            }

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='random', test_size=0.25, 
             scaffold_series=None)

RandForest => Train: DEKOIS; Test: DEKOIS; split: random
No. of molecules in train set: 915, with 30 actives.
No. of molecules in test set: 306, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.908
- Train ROC-AUC:  	0.941
- Test ROC-AUC:   	0.820
- Best hyperparameters {'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 0.1, 'n_estimators': 500}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.921 	> median: 0.857, mean: 0.853
> Test best conf. ROC-AUC: 0.890 	> median: 0.767, mean: 0.765
****************************************************************
CPU times: user 1.99 s, sys: 260 ms, total: 2.25 s
Wall time: 21 s


In [27]:
%%time
from sklearn.ensemble import RandomForestClassifier

estimator_name = 'RandForest'
estimator = RandomForestClassifier()
hyperparams = {'n_estimators': [300, 500],
             'max_depth': [2,  5],
             'min_samples_leaf': [0.1, 0.2],
             'max_features': ['log2', 'sqrt']
            }

# SCAFFOLD Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

RandForest => Train: DEKOIS; Test: DEKOIS; split: scaffold
No. of molecules in train set: 915, with 30 actives.
No. of molecules in test set: 306, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.858
- Train ROC-AUC:  	0.896
- Test ROC-AUC:   	0.941
- Best hyperparameters {'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 0.2, 'n_estimators': 300}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.883 	> median: 0.816, mean: 0.811
> Test best conf. ROC-AUC: 0.967 	> median: 0.897, mean: 0.893
****************************************************************
CPU times: user 1.64 s, sys: 250 ms, total: 1.89 s
Wall time: 21.4 s


***
<h2 style='background-color: #F9E5AB; padding: 5px;'>
    Train-Test with DUD Library
</h2>
<div style='background-color: #FE8460; min-height: 5px'></div>

In [28]:
library = 'DUD'

# Train and test over DUDU
X = X_merged_dksc.loc[library]
y = y_true_merged.loc[library]
scaffold_series = df_scff_murcko['scff_generic'].loc[library]

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Linear SVM </h3>
<b>DUD  - Random and Stratified Scaffold Splitting</b>

In [29]:
%%time
from sklearn.svm import SVC

estimator_name = 'LinearSVC'
estimator = SVC(kernel = 'linear', probability=True)
hyperparams = {'C': np.geomspace(1e-8, 1e2, 6)}

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='random', test_size=0.25, 
             scaffold_series=None)

LinearSVC => Train: DUD; Test: DUD; split: random
No. of molecules in train set: 3669, with 106 actives.
No. of molecules in test set: 1224, with 35 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.850
- Train ROC-AUC:  	0.899
- Test ROC-AUC:   	0.817
- Best hyperparameters {'C': 1.0}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.709 	> median: 0.542, mean: 0.544
> Test best conf. ROC-AUC: 0.818 	> median: 0.652, mean: 0.652
****************************************************************
CPU times: user 6.88 s, sys: 142 ms, total: 7.02 s
Wall time: 12min 36s


In [30]:
%%time
# from sklearn.svm import SVC

estimator_name = 'LinearSVC'
estimator = SVC(kernel = 'linear', probability=True)
hyperparams = {'C': np.geomspace(1e-8, 1e2, 6)}

# SCAFFOLD Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

LinearSVC => Train: DUD; Test: DUD; split: scaffold
No. of molecules in train set: 3669, with 105 actives.
No. of molecules in test set: 1224, with 36 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.895
- Train ROC-AUC:  	0.891
- Test ROC-AUC:   	0.609
- Best hyperparameters {'C': 1e-06}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.747 	> median: 0.545, mean: 0.547
> Test best conf. ROC-AUC: 0.718 	> median: 0.636, mean: 0.634
****************************************************************
CPU times: user 1.56 s, sys: 208 ms, total: 1.77 s
Wall time: 3min 54s


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Radial Basis Function SVM </h3>
<b>DUD  - Random and Stratified Scaffold Splitting</b>

In [31]:
%%time

estimator_name = 'rbfSVC'
estimator = SVC(kernel = 'rbf', probability=True)
hyperparams = {'C': np.geomspace(1e-8, 1e-2, 4), 
               'gamma': np.geomspace(1e-6, 1e0, 4)}

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='random', test_size=0.25, 
             scaffold_series=None)

rbfSVC => Train: DUD; Test: DUD; split: random
No. of molecules in train set: 3669, with 106 actives.
No. of molecules in test set: 1224, with 35 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.881
- Train ROC-AUC:  	0.980
- Test ROC-AUC:   	0.910
- Best hyperparameters {'C': 0.01, 'gamma': 0.01}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.704 	> median: 0.552, mean: 0.553
> Test best conf. ROC-AUC: 0.799 	> median: 0.626, mean: 0.625
****************************************************************
CPU times: user 3.6 s, sys: 207 ms, total: 3.81 s
Wall time: 34.6 s


In [32]:
%%time

estimator_name = 'rbfSVC'
estimator = SVC(kernel = 'rbf', probability=True)
hyperparams = {'C': np.geomspace(1e-8, 1e-2, 4), 
               'gamma': np.geomspace(1e-6, 1e0, 4)}

# SCAFFOLD Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

rbfSVC => Train: DUD; Test: DUD; split: scaffold
No. of molecules in train set: 3669, with 105 actives.
No. of molecules in test set: 1224, with 36 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.941
- Train ROC-AUC:  	0.997
- Test ROC-AUC:   	0.588
- Best hyperparameters {'C': 0.01, 'gamma': 0.01}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.747 	> median: 0.545, mean: 0.547
> Test best conf. ROC-AUC: 0.718 	> median: 0.636, mean: 0.634
****************************************************************
CPU times: user 3.44 s, sys: 181 ms, total: 3.62 s
Wall time: 36.1 s


<h3 style='color: #F84122; padding: 0px;; margin: 0px'> GS: Logistic Regression </h3>
<b>DUD  - Random and Stratified Scaffold Splitting</b>

In [33]:
%%time
from sklearn.linear_model import LogisticRegression

estimator_name = 'LogReg'
estimator = LogisticRegression(max_iter=150)
hyperparams = {'C': np.geomspace(1e-8, 1e2, 6), 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='random', test_size=0.25, 
             scaffold_series=None)

LogReg => Train: DUD; Test: DUD; split: random
No. of molecules in train set: 3669, with 106 actives.
No. of molecules in test set: 1224, with 35 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.844
- Train ROC-AUC:  	0.889
- Test ROC-AUC:   	0.818
- Best hyperparameters {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.718 	> median: 0.563, mean: 0.564
> Test best conf. ROC-AUC: 0.743 	> median: 0.590, mean: 0.591
****************************************************************
CPU times: user 1.52 s, sys: 464 ms, total: 1.99 s
Wall time: 34.3 s


In [34]:
%%time
# from sklearn.linear_model import LogisticRegression

estimator_name = 'LogReg'
estimator = LogisticRegression(max_iter=150)
hyperparams = {'C': np.geomspace(1e-8, 1e2, 6), 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# SCAFFOLD Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

LogReg => Train: DUD; Test: DUD; split: scaffold
No. of molecules in train set: 3669, with 105 actives.
No. of molecules in test set: 1224, with 36 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.886
- Train ROC-AUC:  	0.967
- Test ROC-AUC:   	0.638
- Best hyperparameters {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.747 	> median: 0.545, mean: 0.547
> Test best conf. ROC-AUC: 0.718 	> median: 0.636, mean: 0.634
****************************************************************
CPU times: user 5.59 s, sys: 442 ms, total: 6.04 s
Wall time: 58.6 s


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: K-Neighbors Classifier</h3>
<b>DUD  - Random and Stratified Scaffold Splitting</b>

In [35]:
%%time
from sklearn.neighbors import KNeighborsClassifier 

estimator_name = 'kNN'
estimator = KNeighborsClassifier()
hyperparams = {'n_neighbors': [25, 55, 125, 225], 
               'p': [1, 2]
             }

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='random', test_size=0.25, 
             scaffold_series=None)

kNN => Train: DUD; Test: DUD; split: random
No. of molecules in train set: 3669, with 106 actives.
No. of molecules in test set: 1224, with 35 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.812
- Train ROC-AUC:  	0.875
- Test ROC-AUC:   	0.839
- Best hyperparameters {'n_neighbors': 125, 'p': 1}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.718 	> median: 0.566, mean: 0.568
> Test best conf. ROC-AUC: 0.739 	> median: 0.573, mean: 0.578
****************************************************************
CPU times: user 5.11 s, sys: 184 ms, total: 5.29 s
Wall time: 17.9 s


In [36]:
%%time
from sklearn.neighbors import KNeighborsClassifier 

estimator_name = 'kNN'
estimator = KNeighborsClassifier()
hyperparams = {'n_neighbors': [25, 55, 125, 225], 
               'p': [1, 2]
             }

# SCAFFOLD Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

kNN => Train: DUD; Test: DUD; split: scaffold
No. of molecules in train set: 3669, with 105 actives.
No. of molecules in test set: 1224, with 36 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.828
- Train ROC-AUC:  	0.951
- Test ROC-AUC:   	0.603
- Best hyperparameters {'n_neighbors': 55, 'p': 1}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.747 	> median: 0.545, mean: 0.547
> Test best conf. ROC-AUC: 0.718 	> median: 0.636, mean: 0.634
****************************************************************
CPU times: user 4.91 s, sys: 85.8 ms, total: 5 s
Wall time: 15.6 s


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Decision Tree Classifier</h3>
<b>DUD  - Random and Stratified Scaffold Splitting</b>

In [37]:
%%time
from sklearn.tree import DecisionTreeClassifier

estimator_name = 'DTree'
estimator = DecisionTreeClassifier(splitter='best')
hyperparams = {'criterion': ['gini', 'entropy'], 
               'max_depth': [2, 3, 5],
               'min_samples_split': [0.2, 0.25, 0.3],
               'max_features': [None, 'sqrt', 'log2']}

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='random', test_size=0.25, 
             scaffold_series=None)

DTree => Train: DUD; Test: DUD; split: random
No. of molecules in train set: 3669, with 106 actives.
No. of molecules in test set: 1224, with 35 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.774
- Train ROC-AUC:  	0.854
- Test ROC-AUC:   	0.779
- Best hyperparameters {'criterion': 'entropy', 'max_depth': 5, 'max_features': None, 'min_samples_split': 0.25}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.740 	> median: 0.588, mean: 0.586
> Test best conf. ROC-AUC: 0.690 	> median: 0.523, mean: 0.524
****************************************************************
CPU times: user 1.47 s, sys: 135 ms, total: 1.61 s
Wall time: 2.9 s


In [38]:
%%time
from sklearn.tree import DecisionTreeClassifier

estimator_name = 'DTree'
estimator = DecisionTreeClassifier(splitter='best')
hyperparams = {'criterion': ['gini', 'entropy'], 
               'max_depth': [2, 3, 5],
               'min_samples_split': [0.2, 0.25, 0.3],
               'max_features': [None, 'sqrt', 'log2']}

# SCAFFOLD Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

DTree => Train: DUD; Test: DUD; split: scaffold
No. of molecules in train set: 3669, with 105 actives.
No. of molecules in test set: 1224, with 36 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.809
- Train ROC-AUC:  	0.874
- Test ROC-AUC:   	0.579
- Best hyperparameters {'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'min_samples_split': 0.2}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.747 	> median: 0.545, mean: 0.547
> Test best conf. ROC-AUC: 0.718 	> median: 0.636, mean: 0.634
****************************************************************
CPU times: user 1.46 s, sys: 144 ms, total: 1.6 s
Wall time: 2.75 s


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Bagging Classifier (k-NN as base estimator) </h3>
<b>DUD  - Random and Stratified Scaffold Splitting</b>

In [39]:
%%time
from sklearn.ensemble import BaggingClassifier

estimator_name = 'BagClf-kNN'
knn = KNeighborsClassifier(n_neighbors=125, p=1, 
                           weights='distance')
estimator = BaggingClassifier(base_estimator=knn, 
                              n_jobs=6, oob_score=True)

hyperparams = {'n_estimators': [300]}

# RANDOM Train test splitting
# split_and_gs(X, y, estimator, hyperparams, 
#              splitting='random', test_size=0.25, 
#              scaffold_series=None)

CPU times: user 129 µs, sys: 16 µs, total: 145 µs
Wall time: 156 µs


In [40]:
%%time
# from sklearn.ensemble import BaggingClassifier

estimator_name = 'BagClf-kNN'
knn = KNeighborsClassifier(n_neighbors=125, p=1, 
                           weights='distance')
estimator = BaggingClassifier(base_estimator=knn, 
                              n_jobs=6, oob_score=True)

hyperparams = {'n_estimators': [300]}

# SCAFFOLD Train test splitting
# split_and_gs(X, y, estimator, hyperparams, 
#              splitting='scaffold', test_size=0.25, 
#              scaffold_series=scaffold_series)

CPU times: user 36 µs, sys: 5 µs, total: 41 µs
Wall time: 47.4 µs


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS:Random Forest </h3>
<b>DUD  - Random and Stratified Scaffold Splitting</b>

In [41]:
%%time
from sklearn.ensemble import RandomForestClassifier

estimator_name = 'RandForest'
estimator = RandomForestClassifier()
hyperparams = {'n_estimators': [300, 500],
             'max_depth': [2,  5],
             'min_samples_leaf': [0.1, 0.2],
             'max_features': ['log2', 'sqrt']
            }

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='random', test_size=0.25, 
             scaffold_series=None)

RandForest => Train: DUD; Test: DUD; split: random
No. of molecules in train set: 3669, with 106 actives.
No. of molecules in test set: 1224, with 35 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.776
- Train ROC-AUC:  	0.833
- Test ROC-AUC:   	0.868
- Best hyperparameters {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'n_estimators': 300}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.699 	> median: 0.558, mean: 0.558
> Test best conf. ROC-AUC: 0.809 	> median: 0.606, mean: 0.611
****************************************************************
CPU times: user 2.95 s, sys: 94.6 ms, total: 3.04 s
Wall time: 32.5 s


In [42]:
%%time
from sklearn.ensemble import RandomForestClassifier

estimator_name = 'RandForest'
estimator = RandomForestClassifier()
hyperparams = {'n_estimators': [300, 500],
             'max_depth': [2,  5],
             'min_samples_leaf': [0.1, 0.2],
             'max_features': ['log2', 'sqrt']
            }

# SCAFFOLD Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

RandForest => Train: DUD; Test: DUD; split: scaffold
No. of molecules in train set: 3669, with 105 actives.
No. of molecules in test set: 1224, with 36 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.808
- Train ROC-AUC:  	0.890
- Test ROC-AUC:   	0.650
- Best hyperparameters {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'n_estimators': 500}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.747 	> median: 0.545, mean: 0.547
> Test best conf. ROC-AUC: 0.718 	> median: 0.636, mean: 0.634
****************************************************************
CPU times: user 3.56 s, sys: 81.5 ms, total: 3.64 s
Wall time: 34 s


***

<h2 style='background-color: #F9E5AB; padding: 5px;'>
    Train-Test with different libraries 
</h2>
<div style='background-color: #FE8460; min-height: 5px'></div>

##### Here, Random or Scaffold Splitting are not used

In [43]:
# Train DEKOIS, test DUD
@capture_GS_results(results_dict)
def split_by_library_and_gs(train_name, test_name, estimator_name,
                            X, y, lib_train_name, lib_test_name, **kwargs):
    '''Train-Test "split" by library, and run Grid Search. Splits the main dataframe by library
    using different molecular libraries for Training and Testing.'''
    splitting='by_library'
    
    X_train = X.loc[lib_train_name]
    y_train = y.loc[lib_train_name]

    X_test = X.loc[lib_test_name]
    y_test = y.loc[lib_test_name]

    # Verbose
    print(f'{estimator_name} => Train: {train_name}; Test: {test_name}; split: {splitting}')
    
    # Function to run Grid Search
    #----------------------------
    gs_results = run_grid_search(estimator, 
                    X_train, y_train, X_test, y_test, 
                    hyperparams = hyperparams,  **kwargs)

    # Function to extract ROC results from DkSc values 
    #-------------------------------------------------
    dksc_results = get_roc_auc_DkSc(X_train, y_train, X_test, y_test)
    
    # Return both list of results to captured by the decorador function
    return [train_name, test_name, estimator_name, splitting] + gs_results + dksc_results

In [44]:
# **********************************************
# Train and Test sets from diferent libraries
# **********************************************
X = X_merged_dksc
y = y_true_merged

<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Linear SVM  </h3>
<b>Train and Test with different libraries</b>

In [45]:
%%time
from sklearn.svm import SVC

estimator_name = 'LinearSVC'
estimator = SVC(kernel = 'linear', probability=True)
hyperparams = {'C': np.geomspace(1e-8, 1e2, 6)}
# Train DEKOIS, test DUD

train_lib = 'DEKOIS'
test_lib = 'DUD'
split_by_library_and_gs(
    train_lib, test_lib, estimator_name,
    X, y, train_lib, test_lib)

LinearSVC => Train: DEKOIS; Test: DUD; split: by_library
No. of molecules in train set: 1221, with 40 actives.
No. of molecules in test set: 4893, with 141 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.889
- Train ROC-AUC:  	0.894
- Test ROC-AUC:   	0.607
- Best hyperparameters {'C': 1e-08}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.894 	> median: 0.833, mean: 0.831
> Test best conf. ROC-AUC: 0.720 	> median: 0.571, mean: 0.571
****************************************************************
CPU times: user 754 ms, sys: 59.6 ms, total: 814 ms
Wall time: 1.78 s


In [46]:
%%time
# from sklearn.svm import SVC

estimator_name = 'LinearSVC'
estimator = SVC(kernel = 'linear', probability=True)
hyperparams = {'C': np.geomspace(1e-8, 1e2, 6)}

# Train DEKOIS, test DUD
train_lib = 'DUD'
test_lib = 'DEKOIS'
split_by_library_and_gs(
    train_lib, test_lib, estimator_name,
    X, y, train_lib, test_lib)

LinearSVC => Train: DUD; Test: DEKOIS; split: by_library
No. of molecules in train set: 4893, with 141 actives.
No. of molecules in test set: 1221, with 40 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.821
- Train ROC-AUC:  	0.871
- Test ROC-AUC:   	0.716
- Best hyperparameters {'C': 0.0001}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.720 	> median: 0.571, mean: 0.571
> Test best conf. ROC-AUC: 0.894 	> median: 0.833, mean: 0.831
****************************************************************
CPU times: user 2.86 s, sys: 147 ms, total: 3.01 s
Wall time: 19min 10s


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Radial Basis Function SVM </h3>
<b>Train and Test with different libraries</b>

In [47]:
%%time
estimator = SVC(kernel = 'rbf', probability=True)

estimator_name = 'rbfSVC'
hyperparams = {'C': np.geomspace(1e-8, 1e2, 6), 
               'gamma': np.geomspace(1e-8, 1e2, 6)}

# Train DEKOIS, test DUD
train_lib = 'DEKOIS'
test_lib = 'DUD'
split_by_library_and_gs(
    train_lib, test_lib, estimator_name,
    X, y, train_lib, test_lib)

rbfSVC => Train: DEKOIS; Test: DUD; split: by_library
No. of molecules in train set: 1221, with 40 actives.
No. of molecules in test set: 4893, with 141 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.891
- Train ROC-AUC:  	0.895
- Test ROC-AUC:   	0.604
- Best hyperparameters {'C': 1e-06, 'gamma': 1e-08}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.894 	> median: 0.833, mean: 0.831
> Test best conf. ROC-AUC: 0.720 	> median: 0.571, mean: 0.571
****************************************************************
CPU times: user 1.69 s, sys: 214 ms, total: 1.9 s
Wall time: 15.7 s


In [48]:
%%time
estimator = SVC(kernel = 'rbf', probability=True)

estimator_name = 'rbfSVC'
hyperparams = {'C': np.geomspace(1e-8, 1e2, 6), 
               'gamma': np.geomspace(1e-8, 1e2, 6)}

# Train DEKOIS, test DUD
train_lib = 'DUD'
test_lib = 'DEKOIS'
split_by_library_and_gs(
    train_lib, test_lib, estimator_name,
    X, y, train_lib, test_lib)

rbfSVC => Train: DUD; Test: DEKOIS; split: by_library
No. of molecules in train set: 4893, with 141 actives.
No. of molecules in test set: 1221, with 40 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.869
- Train ROC-AUC:  	0.986
- Test ROC-AUC:   	0.658
- Best hyperparameters {'C': 1.0, 'gamma': 0.01}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.720 	> median: 0.571, mean: 0.571
> Test best conf. ROC-AUC: 0.894 	> median: 0.833, mean: 0.831
****************************************************************
CPU times: user 9.19 s, sys: 363 ms, total: 9.55 s
Wall time: 5min 20s


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Logistic Regression</h3>
<b>Train and Test with different libraries</b>

In [49]:
%%time
from sklearn.linear_model import LogisticRegression

estimator_name = 'LogReg'
estimator = LogisticRegression(max_iter=150)
hyperparams = {'C': np.geomspace(1e-8, 1e2, 6),
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# Train DEKOIS, test DUD
train_lib = 'DEKOIS'
test_lib = 'DUD'
split_by_library_and_gs(
    train_lib, test_lib, estimator_name,
    X, y, train_lib, test_lib)

LogReg => Train: DEKOIS; Test: DUD; split: by_library
No. of molecules in train set: 1221, with 40 actives.
No. of molecules in test set: 4893, with 141 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.891
- Train ROC-AUC:  	0.890
- Test ROC-AUC:   	0.603
- Best hyperparameters {'C': 0.0001, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.894 	> median: 0.833, mean: 0.831
> Test best conf. ROC-AUC: 0.720 	> median: 0.571, mean: 0.571
****************************************************************
CPU times: user 1.43 s, sys: 379 ms, total: 1.81 s
Wall time: 12.7 s


In [50]:
%%time
# from sklearn.linear_model import LogisticRegression

estimator_name = 'LogReg'
estimator = LogisticRegression(max_iter=150)
hyperparams = {'C': np.geomspace(1e-8, 1e2, 6), 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# Train DEKOIS, test DUD
train_lib = 'DUD'
test_lib = 'DEKOIS'
split_by_library_and_gs(
    train_lib, test_lib, estimator_name,
    X, y, train_lib, test_lib)

LogReg => Train: DUD; Test: DEKOIS; split: by_library
No. of molecules in train set: 4893, with 141 actives.
No. of molecules in test set: 1221, with 40 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.815
- Train ROC-AUC:  	0.922
- Test ROC-AUC:   	0.758
- Best hyperparameters {'C': 100.0, 'penalty': 'l2', 'solver': 'liblinear'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.720 	> median: 0.571, mean: 0.571
> Test best conf. ROC-AUC: 0.894 	> median: 0.833, mean: 0.831
****************************************************************
CPU times: user 1.93 s, sys: 460 ms, total: 2.39 s
Wall time: 34 s


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS:Random Forest  </h3>
<b>Train and Test with different libraries</b>

In [51]:
%%time
from sklearn.ensemble import RandomForestClassifier

estimator_name = 'RandForest'
estimator = RandomForestClassifier()
hyperparams = {'n_estimators': [300, 500],
             'max_depth': [3,  5, 7],
             'min_samples_leaf': [0.03, 0.05],
               'min_samples_split': [0.2,  0.3],
             'max_features': ['sqrt']
            }

# Train DEKOIS, test DUD
train_lib = 'DEKOIS'
test_lib = 'DUD'
split_by_library_and_gs(
    train_lib, test_lib, estimator_name,
    X, y, train_lib, test_lib)

RandForest => Train: DEKOIS; Test: DUD; split: by_library
No. of molecules in train set: 1221, with 40 actives.
No. of molecules in test set: 4893, with 141 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.884
- Train ROC-AUC:  	0.930
- Test ROC-AUC:   	0.573
- Best hyperparameters {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 0.03, 'min_samples_split': 0.3, 'n_estimators': 500}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.894 	> median: 0.833, mean: 0.831
> Test best conf. ROC-AUC: 0.720 	> median: 0.571, mean: 0.571
****************************************************************
CPU times: user 2.84 s, sys: 137 ms, total: 2.98 s
Wall time: 43 s


In [52]:
%%time
from sklearn.ensemble import RandomForestClassifier

estimator_name = 'RandForest'
estimator = RandomForestClassifier()
hyperparams = {'n_estimators': [300, 500],
             'max_depth': [3,  5, 7],
             'min_samples_leaf': [0.03, 0.05],
               'min_samples_split': [0.2,  0.3],
             'max_features': ['sqrt']
            }

# Train DEKOIS, test DUD
train_lib = 'DUD'
test_lib = 'DEKOIS'
split_by_library_and_gs(
    train_lib, test_lib, estimator_name,
    X, y, train_lib, test_lib)

RandForest => Train: DUD; Test: DEKOIS; split: by_library
No. of molecules in train set: 4893, with 141 actives.
No. of molecules in test set: 1221, with 40 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.801
- Train ROC-AUC:  	0.866
- Test ROC-AUC:   	0.671
- Best hyperparameters {'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 0.03, 'min_samples_split': 0.2, 'n_estimators': 300}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.720 	> median: 0.571, mean: 0.571
> Test best conf. ROC-AUC: 0.894 	> median: 0.833, mean: 0.831
****************************************************************
CPU times: user 4.32 s, sys: 550 ms, total: 4.87 s
Wall time: 1min 28s


***
<h2 style='background-color: #F9E5AB; padding: 5px;'>
    Merged libraries 
</h2>
<div style='background-color: #FE8460; min-height: 5px'></div>

#### DEKOIS, DUD and COCRYS are  treated as one unique library

In [26]:
# Train and test over 
X = X_merged_dksc
# **** Original y values ****
y = y_true_merged

library = 'Merged'
scaffold_series = df_scff_murcko['scff_generic']


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Linear SVM </h3>
<b>Merged Libraries</b>

In [14]:
%%time
from sklearn.svm import SVC

estimator_name = 'LinearSVC'
estimator = SVC(kernel = 'linear', probability=True)
hyperparams = {'C': np.geomspace(1e-8, 1e2, 6)}

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='random', test_size=0.25, 
             scaffold_series=None)

LinearSVC => Train: Merged; Test: Merged; split: random
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.817
- Train ROC-AUC:  	0.855
- Test ROC-AUC:   	0.808
- Best hyperparameters {'C': 1.0}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.686 	> median: 0.610, mean: 0.608
> Test best conf. ROC-AUC: 0.680 	> median: 0.608, mean: 0.606
****************************************************************
CPU times: user 20.2 s, sys: 879 ms, total: 21.1 s
Wall time: 43min 34s


In [37]:
%%time
from sklearn.svm import SVC

estimator_name = 'LinearSVC'
estimator = SVC(kernel = 'linear', probability=True)
hyperparams = {'C': np.geomspace(1e-8, 1e2, 6)}

# SCAFFOLD Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

CPU times: user 0 ns, sys: 791 µs, total: 791 µs
Wall time: 751 µs


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Radial Basis Function SVM</h3>
<b>Merged Libraries</b>

In [15]:
%%time
from sklearn.svm import SVC

estimator_name = 'rbfSVC'
estimator = SVC(kernel = 'rbf', probability=True)
hyperparams = {'C':np.geomspace(1e-8, 1e2, 6), 
               'gamma': np.geomspace(1e-8, 1e4, 7)}

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='random', test_size=0.25, 
             scaffold_series=None)

rbfSVC => Train: Merged; Test: Merged; split: random
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.874
- Train ROC-AUC:  	1.000
- Test ROC-AUC:   	0.905
- Best hyperparameters {'C': 100.0, 'gamma': 0.01}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.699 	> median: 0.621, mean: 0.618
> Test best conf. ROC-AUC: 0.672 	> median: 0.574, mean: 0.573
****************************************************************
CPU times: user 13.8 s, sys: 1.34 s, total: 15.1 s
Wall time: 8min 31s


In [16]:
%%time

estimator_name = 'rbfSVC'
estimator = SVC(kernel = 'rbf', probability=True)
hyperparams = {'C': np.geomspace(1e-8, 1e2, 6), 
               'gamma': np.geomspace(1e-8, 1e2, 6)}

# SCAFFOLD Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

rbfSVC => Train: Merged; Test: Merged; split: scaffold
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.844
- Train ROC-AUC:  	1.000
- Test ROC-AUC:   	0.771
- Best hyperparameters {'C': 100.0, 'gamma': 0.01}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.687 	> median: 0.594, mean: 0.593
> Test best conf. ROC-AUC: 0.716 	> median: 0.654, mean: 0.647
****************************************************************
CPU times: user 11 s, sys: 517 ms, total: 11.5 s
Wall time: 6min 40s


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Logistic Regression</h3>
<b>Merged Libraries</b>

In [27]:
%%time
from sklearn.linear_model import LogisticRegression

estimator_name = 'LogReg'
estimator = LogisticRegression(max_iter=250)
hyperparams = {'C': np.geomspace(1e-8, 1e2, 6), 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='random', test_size=0.25, 
             scaffold_series=None, random_state=42)

LogReg => Train: Merged; Test: Merged; split: random
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.845
- Train ROC-AUC:  	0.900
- Test ROC-AUC:   	0.801
- Best hyperparameters {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.689 	> median: 0.614, mean: 0.613
> Test best conf. ROC-AUC: 0.678 	> median: 0.592, mean: 0.590
****************************************************************
CPU times: user 6.35 s, sys: 473 ms, total: 6.82 s
Wall time: 33.2 s


In [18]:
%%time
# from sklearn.linear_model import LogisticRegression

estimator_name = 'LogReg'
estimator = LogisticRegression(max_iter=150)
hyperparams = {'C': np.geomspace(1e-8, 1e2, 6), 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# SCAFFOLD Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

LogReg => Train: Merged; Test: Merged; split: scaffold
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.818
- Train ROC-AUC:  	0.903
- Test ROC-AUC:   	0.745
- Best hyperparameters {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.687 	> median: 0.594, mean: 0.593
> Test best conf. ROC-AUC: 0.716 	> median: 0.654, mean: 0.647
****************************************************************
CPU times: user 10.7 s, sys: 658 ms, total: 11.4 s
Wall time: 49.2 s


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Decision Tree Classifier</h3>
<b>Merged Libraries</b>

In [19]:
%%time
from sklearn.tree import DecisionTreeClassifier

estimator_name = 'DTree'
estimator = DecisionTreeClassifier(splitter='best')
hyperparams = {'criterion': ['gini', 'entropy'], 
               'max_depth': [2, 3, 5],
               'min_samples_split': [0.2,  0.3],
               'min_samples_leaf': [0.02, 0.05, 0.1],
               'max_features': [None, 'sqrt', 'log2']}

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='random', test_size=0.25, 
             scaffold_series=None)

DTree => Train: Merged; Test: Merged; split: random
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.750
- Train ROC-AUC:  	0.811
- Test ROC-AUC:   	0.662
- Best hyperparameters {'criterion': 'entropy', 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 0.02, 'min_samples_split': 0.2}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.699 	> median: 0.621, mean: 0.619
> Test best conf. ROC-AUC: 0.660 	> median: 0.575, mean: 0.572
****************************************************************
CPU times: user 3.56 s, sys: 343 ms, total: 3.9 s
Wall time: 9.16 s


In [20]:
%%time
from sklearn.tree import DecisionTreeClassifier

estimator_name = 'DTree'
estimator = DecisionTreeClassifier(splitter='best')
hyperparams = {'criterion': ['gini', 'entropy'], 
               'max_depth': [2, 3, 5],
               'min_samples_split': [0.2,  0.3],
               'min_samples_leaf': [0.02, 0.05, 0.1],
               'max_features': [None, 'sqrt', 'log2']}

# SCAFFOLD Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

DTree => Train: Merged; Test: Merged; split: scaffold
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.715
- Train ROC-AUC:  	0.796
- Test ROC-AUC:   	0.617
- Best hyperparameters {'criterion': 'entropy', 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 0.02, 'min_samples_split': 0.3}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.687 	> median: 0.594, mean: 0.593
> Test best conf. ROC-AUC: 0.716 	> median: 0.654, mean: 0.647
****************************************************************
CPU times: user 3.08 s, sys: 237 ms, total: 3.32 s
Wall time: 7.12 s


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: KKN - 1-NN as reference estimator </h3>
<b>Merged Libraries</b>

In [21]:
%%time
from sklearn.neighbors import KNeighborsClassifier 

estimator_name = '1-NN'
estimator = KNeighborsClassifier()
hyperparams = {'n_neighbors': [1], 
               'p': [1, 2]
             }

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='random', test_size=0.25, 
             scaffold_series=None)

1-NN => Train: Merged; Test: Merged; split: random
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.689
- Train ROC-AUC:  	1.000
- Test ROC-AUC:   	0.737
- Best hyperparameters {'n_neighbors': 1, 'p': 1}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.683 	> median: 0.605, mean: 0.604
> Test best conf. ROC-AUC: 0.700 	> median: 0.621, mean: 0.616
****************************************************************
CPU times: user 2.98 s, sys: 32.1 ms, total: 3.01 s
Wall time: 7.57 s


In [36]:
%%time
from sklearn.neighbors import KNeighborsClassifier 

estimator_name = '1-NN'
estimator = KNeighborsClassifier()
hyperparams = {'n_neighbors': [1], 
               'p': [1, 2]
             }

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

1-NN => Train: Merged; Test: Merged; split: scaffold
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.628
- Train ROC-AUC:  	1.000
- Test ROC-AUC:   	0.581
- Best hyperparameters {'n_neighbors': 1, 'p': 1}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.687 	> median: 0.594, mean: 0.593
> Test best conf. ROC-AUC: 0.716 	> median: 0.654, mean: 0.647
****************************************************************
CPU times: user 3.31 s, sys: 378 ms, total: 3.69 s
Wall time: 20.5 s


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Random Forest </h3>
<b>Merged Libraries</b>

In [23]:
%%time
from sklearn.ensemble import RandomForestClassifier

estimator_name = 'RandForest'
estimator = RandomForestClassifier()
hyperparams = {'n_estimators': [300, 400],
               'max_depth': [2, 3],
               'min_samples_split': [0.1,  0.3],
               'min_samples_leaf': [0.02, 0.05],
               'max_features': ['sqrt']
            }

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='random', test_size=0.25, 
             scaffold_series=None)

RandForest => Train: Merged; Test: Merged; split: random
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.739
- Train ROC-AUC:  	0.774
- Test ROC-AUC:   	0.732
- Best hyperparameters {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 0.05, 'min_samples_split': 0.1, 'n_estimators': 300}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.677 	> median: 0.593, mean: 0.593
> Test best conf. ROC-AUC: 0.721 	> median: 0.655, mean: 0.651
****************************************************************
CPU times: user 3.02 s, sys: 135 ms, total: 3.15 s
Wall time: 40.7 s


In [24]:
%%time
from sklearn.ensemble import RandomForestClassifier
np.random.seed(42)
estimator_name = 'RandForest'
estimator = RandomForestClassifier()
hyperparams = {'n_estimators': [300, 400],
               'max_depth': [2, 3],
               'min_samples_split': [0.1,  0.3],
               'min_samples_leaf': [0.02, 0.05],
               'max_features': ['sqrt']
            }

# SCAFFOLD Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

RandForest => Train: Merged; Test: Merged; split: scaffold
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.733
- Train ROC-AUC:  	0.796
- Test ROC-AUC:   	0.659
- Best hyperparameters {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 0.02, 'min_samples_split': 0.3, 'n_estimators': 400}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.687 	> median: 0.594, mean: 0.593
> Test best conf. ROC-AUC: 0.716 	> median: 0.654, mean: 0.647
****************************************************************
CPU times: user 4.2 s, sys: 114 ms, total: 4.31 s
Wall time: 41.5 s


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: X Gradient Boosting</h3>
<b>Merged Libraries</b>

In [25]:
%%time

from xgboost import XGBClassifier

estimator_name = 'XGB_tree'
estimator = XGBClassifier()
hyperparams = {'n_estimators': [200, 300],
               'max_depth': [2, 3, 10, 20],
               'learning_rate': [0.05, 0.1],
               'gamma': [0.01, 0.1, 0.5, 1],
               'alpha': [0.01, 0.1, 0.5, 1],
               'subsample': [0.3, 0.5],
               'colsample_bytree': [0.3, 0.5, 1]
            }

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='random', test_size=0.25, 
             scaffold_series=None,
             # RandomizedGS
            randomGS=True, n_iter=50)

XGB_tree => Train: Merged; Test: Merged; split: random
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.879
- Train ROC-AUC:  	1.000
- Test ROC-AUC:   	0.881
- Best hyperparameters {'subsample': 0.5, 'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0.5, 'colsample_bytree': 1, 'alpha': 0.5}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.677 	> median: 0.595, mean: 0.597
> Test best conf. ROC-AUC: 0.715 	> median: 0.647, mean: 0.639
****************************************************************
CPU times: user 22.9 s, sys: 2.58 s, total: 25.4 s
Wall time: 4min 7s


In [26]:
%%time

from xgboost import XGBClassifier

estimator_name = 'XGB_tree'
estimator = XGBClassifier()
hyperparams = {'n_estimators': [200, 300],
               'max_depth': [2, 3, 10],
               'learning_rate': [0.05, 0.1],
               'gamma': [0.01, 0.1, 0.5, 1],
               'alpha': [0.01, 0.1, 0.5, 1],
               'subsample': [0.3, 0.5],
               'colsample_bytree': [0.3, 0.5, 1]
            }

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series,
             # RandomizedGS
             randomGS=True, n_iter=50)

XGB_tree => Train: Merged; Test: Merged; split: scaffold
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.825
- Train ROC-AUC:  	1.000
- Test ROC-AUC:   	0.774
- Best hyperparameters {'subsample': 0.5, 'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.3, 'alpha': 0.5}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.687 	> median: 0.594, mean: 0.593
> Test best conf. ROC-AUC: 0.716 	> median: 0.654, mean: 0.647
****************************************************************
CPU times: user 10.8 s, sys: 2.39 s, total: 13.2 s
Wall time: 3min 44s


## Get, Format, and Save the Results

In [1]:
import pandas as pd
### Save Results
row_names = ['Train_set', 'Test_set', 'Model name', 'Split', 
             'N_actives_train', 'N_actives_test', 'N_mols_train', 'Num_mols_test',
            'Mean-CV-ROC', 'ROC-AUC_train', 'ROC-AUC_test', 'best_params',
            'DkS_max_ROC_train',  'DkSc_med_ROC_train', 'DkSc_mean_ROC_train', 
            'DkS_max_ROC_test',  'DkSc_med_ROC_test', 'DkSc_mean_ROC_test']


# Load or save
save=False
filename='./GRID_SEARCH_CV_results.csv'
if save:
    results_df = pd.DataFrame(results_dict, index=row_names).T.reset_index().drop('index', axis=1)
    results_df.to_csv(filename)
else:
    results_df = pd.read_csv(filename)
    
results_df

Unnamed: 0.1,Unnamed: 0,Train_set,Test_set,Model name,Split,N_actives_train,N_actives_test,N_mols_train,Num_mols_test,Mean-CV-ROC,ROC-AUC_train,ROC-AUC_test,best_params,DkS_max_ROC_train,DkSc_med_ROC_train,DkSc_mean_ROC_train,DkS_max_ROC_test,DkSc_med_ROC_test,DkSc_mean_ROC_test
0,0,Merged,Merged,rbfSVC,random,4674,225,1559,75,0.874269,1.0,0.905337,"{'C': 100.0, 'gamma': 0.01}",0.698589,0.621331,0.618417,0.67186,0.574059,0.573174
1,1,Merged,Merged,rbfSVC,scaffold,4674,225,1559,75,0.843799,1.0,0.771222,"{'C': 100.0, 'gamma': 0.01}",0.686529,0.59396,0.592621,0.715836,0.653908,0.647123
2,2,Merged,Merged,LogReg,random,4674,225,1559,75,0.834655,0.896315,0.824537,"{'C': 1.0, 'penalty': 'l1', 'solver': 'libline...",0.676247,0.597546,0.59652,0.7281,0.639944,0.638174
3,3,Merged,Merged,LogReg,scaffold,4674,225,1559,75,0.817614,0.902569,0.745346,"{'C': 1.0, 'penalty': 'l1', 'solver': 'libline...",0.686529,0.59396,0.592621,0.715836,0.653908,0.647123
4,4,Merged,Merged,DTree,random,4674,225,1559,75,0.74964,0.810925,0.661563,"{'criterion': 'entropy', 'max_depth': 5, 'max_...",0.69911,0.620677,0.618846,0.660328,0.575258,0.571804
5,5,Merged,Merged,DTree,scaffold,4674,225,1559,75,0.71549,0.795888,0.616662,"{'criterion': 'entropy', 'max_depth': 5, 'max_...",0.686529,0.59396,0.592621,0.715836,0.653908,0.647123
6,6,Merged,Merged,1-NN,random,4674,225,1559,75,0.686099,1.0,0.718365,"{'n_neighbors': 1, 'p': 1}",0.688421,0.612761,0.610996,0.679672,0.595384,0.59515
7,7,Merged,Merged,RandForest,random,4674,225,1559,75,0.738817,0.773626,0.732255,"{'max_depth': 3, 'max_features': 'sqrt', 'min_...",0.677436,0.592768,0.592647,0.720979,0.65491,0.650579
8,8,Merged,Merged,RandForest,scaffold,4674,225,1559,75,0.733276,0.795698,0.65903,"{'max_depth': 3, 'max_features': 'sqrt', 'min_...",0.686529,0.59396,0.592621,0.715836,0.653908,0.647123
9,9,Merged,Merged,XGB_tree,random,4674,225,1559,75,0.878854,0.99996,0.880809,"{'subsample': 0.5, 'n_estimators': 200, 'max_d...",0.677461,0.595364,0.596622,0.71451,0.647307,0.638951


***
<h2 style='background-color: #F9E5AB; padding: 5px;'>
    Merged libraries: Shuffle *y* target values in the train set
</h2>
<div style='background-color: #FE8460; min-height: 5px'></div>

#### DEKOIS, DUD and COCRYS are  treated as one unique library

In [32]:
# Train and test over 
X = X_merged_dksc
# ***** Permutate y values *****
y = y_true_merged.sample(frac=1)

library = 'Merged'
scaffold_series = df_scff_murcko['scff_generic']


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Linear SVM </h3>
<b>Merged Libraries</b>

In [29]:
%%time
from sklearn.svm import SVC

estimator_name = 'LinearSVC'
estimator = SVC(kernel = 'linear', probability=True)
hyperparams = {'C': np.geomspace(1e-15, 1e-9, 3)}

# RANDOM Train test splitting
# split_and_gs(library, library, estimator_name,
#              X, y, estimator, hyperparams,
#              splitting='random', test_size=0.25, 
#              scaffold_series=None)

CPU times: user 306 µs, sys: 31 µs, total: 337 µs
Wall time: 298 µs


In [30]:
%%time
# from sklearn.svm import SVC

estimator_name = 'LinearSVC'
estimator = SVC(kernel = 'linear', probability=True)
hyperparams = {'C': np.geomspace(1e-15, 1e-9, 3)}

# SCAFFOLD Train test splitting
# split_and_gs(library, library, estimator_name,
#              X, y, estimator, hyperparams,
#              splitting='scaffold', test_size=0.25, 
#              scaffold_series=scaffold_series)

CPU times: user 0 ns, sys: 1.08 ms, total: 1.08 ms
Wall time: 689 µs


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Radial Basis Function SVM</h3>
<b>Merged Libraries</b>

In [31]:
%%time
from sklearn.svm import SVC

estimator_name = 'rbfSVC'
estimator = SVC(kernel = 'rbf', probability=True)
hyperparams = {'C': np.geomspace(1e0, 1e2, 3), 
               'gamma': np.geomspace(1e-4, 1e0, 3)}

# RANDOM Train test splitting
# split_and_gs(library, library, estimator_name,
#              X, y, estimator, hyperparams,
#              splitting='random', test_size=0.25, 
#              scaffold_series=None)

CPU times: user 389 µs, sys: 39 µs, total: 428 µs
Wall time: 391 µs


In [32]:
%%time

estimator_name = 'rbfSVC'
estimator = SVC(kernel = 'rbf', probability=True)
hyperparams = {'C': np.geomspace(1e0, 1e2, 3), 
               'gamma': np.geomspace(1e-4, 1e0, 3)}

# SCAFFOLD Train test splitting
# split_and_gs(library, library, estimator_name,
#              X, y, estimator, hyperparams,
#              splitting='scaffold', test_size=0.25, 
#              scaffold_series=scaffold_series)

CPU times: user 810 µs, sys: 0 ns, total: 810 µs
Wall time: 745 µs


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: Logistic Regression</h3>
<b>Merged Libraries</b>

In [31]:
%%time
from sklearn.linear_model import LogisticRegression

estimator_name = 'LogReg'
estimator = LogisticRegression(max_iter=250)
hyperparams = {'C': np.geomspace(1e-8, 1e2, 6), 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='random', test_size=0.25, 
             scaffold_series=None, random_state=42)

LogReg => Train: Merged; Test: Merged; split: random
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.533
- Train ROC-AUC:  	0.544
- Test ROC-AUC:   	0.459
- Best hyperparameters {'C': 1e-08, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.555 	> median: 0.525, mean: 0.525
> Test best conf. ROC-AUC: 0.515 	> median: 0.462, mean: 0.464
****************************************************************
CPU times: user 2.06 s, sys: 650 ms, total: 2.71 s
Wall time: 20.9 s


In [34]:
%%time
# from sklearn.linear_model import LogisticRegression

estimator_name = 'LogReg'
estimator = LogisticRegression()
hyperparams = {'C': np.geomspace(1e-4, 1e5, 4), 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# SCAFFOLD Train test splitting
# split_and_gs(library, library, estimator_name,
#              X, y, estimator, hyperparams,
#              splitting='scaffold', test_size=0.25, 
#              scaffold_series=scaffold_series)

CPU times: user 271 µs, sys: 27 µs, total: 298 µs
Wall time: 259 µs


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: KKN - 1-NN as reference estimator </h3>
<b>Merged Libraries</b>

In [20]:
%%time
from sklearn.neighbors import KNeighborsClassifier 

estimator_name = '1-NN'
estimator = KNeighborsClassifier()
hyperparams = {'n_neighbors': [1], 
               'p': [1, 2]
             }

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='random', test_size=0.25, 
             scaffold_series=None)

1-NN => Train: Merged; Test: Merged; split: random
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.494
- Train ROC-AUC:  	1.000
- Test ROC-AUC:   	0.496
- Best hyperparameters {'n_neighbors': 1, 'p': 2}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.519 	> median: 0.485, mean: 0.486
> Test best conf. ROC-AUC: 0.583 	> median: 0.524, mean: 0.527
****************************************************************
CPU times: user 2.84 s, sys: 15.4 ms, total: 2.85 s
Wall time: 7.1 s


<h3 style='color: #F84122; padding: 0px;; margin: 0px'>GS: X Gradient Boosting</h3>
<b>Merged Libraries</b>

In [33]:
%%time

from xgboost import XGBClassifier

estimator_name = 'XGB_tree'
estimator = XGBClassifier()
hyperparams = {'n_estimators': [200, 300],
               'max_depth': [2, 3, 10, 20],
               'learning_rate': [0.05, 0.1],
               'gamma': [0.01, 0.1, 0.5, 1],
               'alpha': [0.01, 0.1, 0.5, 1],
               'subsample': [0.3, 0.5],
               'colsample_bytree': [0.3, 0.5, 1]
            }

# RANDOM Train test splitting
split_and_gs(library, library, estimator_name,
             X, y, estimator, hyperparams,
             splitting='random', test_size=0.25, 
             scaffold_series=None,
             # RandomizedGS
            randomGS=True, n_iter=50)

XGB_tree => Train: Merged; Test: Merged; split: random
No. of molecules in train set: 4674, with 225 actives.
No. of molecules in test set: 1559, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.569
- Train ROC-AUC:  	1.000
- Test ROC-AUC:   	0.484
- Best hyperparameters {'subsample': 0.5, 'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.05, 'gamma': 0.01, 'colsample_bytree': 0.3, 'alpha': 0.01}
******************************************

***** Best Conformation's ROC-AUC using docking scores *****
> Train best conf. ROC-AUC: 0.513 	> median: 0.469, mean: 0.471
> Test best conf. ROC-AUC: 0.539 	> median: 0.493, mean: 0.493
****************************************************************
CPU times: user 22.8 s, sys: 162 ms, total: 22.9 s
Wall time: 4min 27s
