# Model Selection

In [1]:
import pandas as pd
import numpy as np
import glob, sys, os
sys.path.append('..')

In [2]:
from modules.plotting_metrics import PlotMetric
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white', context='talk', font_scale=0.8)

In [3]:
## Run the helper functions from the local CDK2 directory
# %run ./helper_1_load_data.ipynb

In [4]:
# Helper function to train and test
# %run ./helper_functions_S6.ipynb

In [5]:
# Load the merged results table from a local pickle file.
# NOTE(review): only unpickle files from a trusted source -- loading a pickle
# can execute arbitrary code.
file_name = './df_DkSc_results_COCRYS_DEKOIS_DUD.pkl'
X_merged_dksc = pd.read_pickle(file_name)
# Extract the 'activity' column; it is used as the target vector below
y_true_merged = X_merged_dksc['activity']
# Drop the target column so X_merged_dksc holds the feature columns only
X_merged_dksc = X_merged_dksc.drop('activity', axis=1)
# Display (n_rows, n_columns) of the feature table as the cell output
X_merged_dksc.shape

(5839, 136)

### Timeout Decorator

In [6]:
import signal
from functools import wraps

def timeout(n_seconds=7*60):
    '''Decorator factory: stop a function's execution after `n_seconds` seconds.

    The wrapper arms a SIGALRM alarm before calling the decorated function and
    installs a handler that raises `TimeoutError` when the alarm fires. Without
    such a handler the default action for SIGALRM terminates the whole Python
    process, so the `except TimeoutError` branch could never run.

    Parameters
    ----------
    n_seconds: int
        Time limit in whole seconds (passed to `signal.alarm`).

    Returns
    -------
    A decorator. The decorated function returns whatever the original function
    returns, or None (after printing a message) if a `TimeoutError` was raised
    while it was running.

    Notes
    -----
    Relies on `signal.alarm` / `signal.SIGALRM`, which are available on Unix
    only, and on `signal.signal`, which must be called from the main thread.
    '''
    def _raise_timeout(signum, frame):
        # Called when the alarm fires: convert the signal into an exception
        raise TimeoutError(f'time limit of {n_seconds} seconds exceeded')

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Install our handler and remember the one that was active before
            previous_handler = signal.signal(signal.SIGALRM, _raise_timeout)
            if previous_handler is None:
                # Handler was not installed from Python; restore the default later
                previous_handler = signal.SIG_DFL
            # Set alarm for n seconds
            signal.alarm(n_seconds)
            try:
                # Call decorated func
                return func(*args, **kwargs)
            except TimeoutError as e:
                print(f'Execution finished after {n_seconds}:', e)
            finally:
                # Cancel Alarm and put the previous handler back
                signal.alarm(0)
                signal.signal(signal.SIGALRM, previous_handler)
        return wrapper
    return decorator

## Scaffold Splitting

In [7]:
#*************************************************
# Functions to compute stratified scaffold splitting
#*************************************************
from scaffold_splitter import train_test_scaffold_split

In [8]:
%%time
# Load the dataframe containing the Generic Murcko Scaffolds from a local
# pickle file (nothing is computed in this cell; the file must already exist)
file = './df_COCRYS_DUD_DEKOIS_Murcko_Scaffolds_SMILES.pkl'

df_scff_murcko = pd.read_pickle(file)

CPU times: user 418 ms, sys: 360 ms, total: 777 ms
Wall time: 838 ms


## Train/test on the same dataset 

### Learning Curves

In [9]:
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import learning_curve

In [10]:
def plot_learning_curves(estimator, X, y, title, ylim=[0.5,1], axes=None,
                         cv=3, train_sizes=np.linspace(0.1, 1.0, 10), 
                         scoring='roc_auc', n_jobs=4):
    '''
    Plot estimator performance on the training and validation
    sets as a function of the training set size.

    Parameters
    ----------
    estimator: sklearn estimator object type
        Estimator passed as-is to sklearn's `learning_curve`.
    X: array-like, shape (m_samples, n_features)
        Training array with m_samples and n_features.
    y: array-like, shape (m_samples)
        Target array relative to X with m labels.
    title: str
        Title of the plot.
    ylim: sequence of two numbers
        Lower and upper limits of the y axis.
    axes: matplotlib Axes or None
        Single Axes to draw on. When None, a new 5x5 inch figure is created.
    cv, train_sizes, scoring, n_jobs:
        Forwarded unchanged to `learning_curve`.

    Returns
    -------
    None. The curves are drawn on `axes`.
    '''
    # Identity check: `== None` would be evaluated element-wise on array-likes
    if axes is None:
        _, axes = plt.subplots(1, 1, figsize=(5, 5))

    axes.set(title=title, ylim=ylim, xlabel='Training examples', ylabel=f'Metric: {scoring}')
    # Use learning_curve function from sklearn; `train_sizes` is rebound to the
    # training-set sizes that learning_curve reports back
    train_sizes, train_scores, test_scores = \
       learning_curve(estimator, X, y, scoring=scoring,
                      cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    # Mean and spread of the scores across the CV folds (axis=1) for each size
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    # Plot the learning curve: shaded band = mean +/- one standard deviation
    axes.grid()
    axes.fill_between(train_sizes, train_scores_mean - train_scores_std,
                        train_scores_mean + train_scores_std, alpha=0.1, color='r')
    axes.fill_between(train_sizes, test_scores_mean - test_scores_std,
                        test_scores_mean + test_scores_std, alpha=0.1, color='g')
    axes.plot(train_sizes, train_scores_mean, 'o-', color='r',
                label='Train score')
    axes.plot(train_sizes, test_scores_mean, 'o-', color='g',
                label='Cross-validation score')
    axes.legend(loc='lower right')
    

### Hyperparameter tuning: Grid Search

In [11]:
@timeout(300) #Finish execution after 5 mins
def run_grid_search(estimator, X_train, y_train, X_test, y_test, hyperparams,  cv_value=5, 
                    scoring='roc_auc', splitting='random', n_jobs=6):
    '''Tune `estimator` with a cross-validated grid search and print a report.

    The estimator is wrapped in a Pipeline behind a StandardScaler, so scaling
    is re-fitted inside every cross-validation fold.

    Parameters
    ----------
    estimator: sklearn estimator
        Must expose `predict_proba`, which is called on the fitted search.
    X_train, y_train: training features and labels used for the grid search.
    X_test, y_test: held-out features and labels, only used for the final report.
    hyperparams: dict
        Maps estimator parameter names to lists of candidate values. The
        'estimator__' prefix needed by the Pipeline is added here.
    cv_value: int or CV splitter
        Passed to GridSearchCV as `cv`.
    scoring: str
        Passed to GridSearchCV as `scoring`. Note that the printed train/test
        values are always ROC-AUC.
    splitting: str
        Currently unused by this function; kept for interface compatibility.
    n_jobs: int
        Number of parallel jobs for GridSearchCV (previously hard-coded to 6).

    Returns
    -------
    The fitted GridSearchCV object, so callers can inspect `cv_results_` or
    reuse `best_estimator_`. Callers that ignore the return value are unaffected.
    '''
    # Format the hyperparams: prefix with the name of the pipeline step
    hyperparams_dict = {'estimator__' + key: val for key, val in hyperparams.items()}

    # Create the Pipe
    scaler = StandardScaler()
    pipe = Pipeline([('scaler', scaler),
                     ('estimator', estimator)])

    # refit=True means to train the final model using the whole training set
    gs = GridSearchCV(estimator = pipe, param_grid = hyperparams_dict,
                     cv = cv_value, scoring = scoring, n_jobs = n_jobs, refit = True)

    # Train the model
    gs.fit(X_train, y_train) 

    # Predictions (class probabilities from the refitted best pipeline)
    y_train_predict = gs.predict_proba(X_train)
    y_test_predict = gs.predict_proba(X_test)

    # Print some values
    print(f'No. of molecules in train set: {y_train.shape[0]}, with {y_train.sum()} actives.')
    print(f'No. of molecules in test set: {y_test.shape[0]}, with {y_test.sum()} actives.')
    print('')
    print('*'*10, 'GRID SEARCH RESULTS', '*'*10)
    print('- Mean CV ROC-AUC score: {:.3f}'.format(gs.best_score_))
    # Column 1 of predict_proba is used as the score of the positive class
    print('- Train ROC-AUC: {:.3f}'.format(roc_auc_score(y_train, y_train_predict[:, 1])))
    print('- Test ROC-AUC: {:.3f}'.format(roc_auc_score(y_test, y_test_predict[:, 1])))
    print('- Best hyperparameters', gs.best_params_)
    print('**'*21)
    print('')

    return gs

In [12]:
def run_cross_val_bias_var_tradeoff(estimator, X_train, y_train, X_test, y_test, cv_value=5, scoring='roc_auc',
                                   score_type='Dk_sc', pred_prob = True):
    '''Print cross-validation, train and test ROC-AUC for a fixed estimator.

    The estimator is placed in a Pipeline behind a StandardScaler, scored with
    `cross_val_score` on the training data, then fitted on the whole training
    set and evaluated on both the training and the test data.

    Parameters
    ----------
    estimator: sklearn estimator
    X_train, y_train: training features and labels.
    X_test, y_test: held-out features and labels.
    cv_value: int or CV splitter, passed to `cross_val_score` as `cv`.
    scoring: str, passed to `cross_val_score` as `scoring`.
    score_type: str
        Currently unused by this function; kept for interface compatibility.
    pred_prob: bool
        If True, ROC-AUC is computed from column 1 of `predict_proba`.
        If False, it is computed from the 1-D output of `predict`.

    Returns
    -------
    None. Results are printed.
    '''
    # Print some values
    print(f'No. of molecules in train set: {y_train.shape[0]}, with {y_train.sum()} actives.')
    print(f'No. of molecules in test set: {y_test.shape[0]}, with {y_test.sum()} actives.')

    # Scaler
    scaler = StandardScaler()

    # Set the pipe
    pipe = Pipeline([('scaler', scaler), ('estimator', estimator)])

    # Set the CV score
    roc_cv = cross_val_score(pipe, X_train, y_train, cv=cv_value, scoring=scoring, n_jobs= -1)

    pipe.fit(X_train, y_train)

    if pred_prob:
        # Column 1 of predict_proba is used as the score of the positive class
        y_score_train = pipe.predict_proba(X_train)[:, 1]
        y_score_test = pipe.predict_proba(X_test)[:, 1]
    else:
        # predict() returns a 1-D array, so it must not be indexed with [:, 1]
        y_score_train = pipe.predict(X_train)
        y_score_test = pipe.predict(X_test)

    # CV Mean ROC-AUC
    print('CV={} ROC-AUC: {:.4f}'.format(cv_value, roc_cv.mean()))

    # Training ROC-AUC:
    print('Train ROC-AUC: {:.4f}'.format(roc_auc_score(y_train, y_score_train)))

    # Test ROC-AUC:
    print('Test ROC-AUC: {:.4f}'.format(roc_auc_score(y_test, y_score_test)))

### Function to report the Best Conformation's ROC-AUC for a given subset of samples

In [13]:
#************************************************************************
# Reports the best conformation's ROC-AUC value for a given subset X and y
#************************************************************************
def _get_best_conf_roc_auc(X_train, y_train, X_test, y_test, verbose=True):
    '''Print the best and median single-column ROC-AUC of the train and test sets.

    Every column of `X_train` / `X_test` is negated and used on its own as a
    score for `roc_auc_score` against the matching labels. Nothing is returned;
    when `verbose` is true, the maximum and the median of those per-column
    values are printed for each set.
    '''
    def _column_scorer(labels):
        # Build the per-column callback; the column is negated before scoring
        def _score(column):
            return roc_auc_score(y_true=labels, y_score=-column)
        return _score

    train_aucs = X_train.apply(_column_scorer(y_train), axis=0)
    test_aucs = X_test.apply(_column_scorer(y_test), axis=0)

    if not verbose:
        return

    rule = '--'*30
    print(rule)
    print("**** Best Conformation's ROC-AUC using docking scores ****")
    train_best, train_median = train_aucs.max(), train_aucs.median()
    test_best, test_median = test_aucs.max(), test_aucs.median()
    print(f"> Train best conf. ROC-AUC: {round(train_best, 3)}" +
          f" \t> Median ROC-AUC: {round(train_median, 3)}")
    print(f"> Test best conf. ROC-AUC: {round(test_best, 3)}" +
          f" \t> Median ROC-AUC: {round(test_median, 3)}")
    print(rule)
        
        
def split_and_gs(X, y, estimator, hyperparams, splitting='random', 
                 test_size=0.25, scaffold_series=None):
    '''Split X/y into train and test sets, then run the grid search report.

    Parameters
    ----------
    X, y: features and labels to split.
    estimator: sklearn estimator forwarded to `run_grid_search`.
    hyperparams: dict of candidate values forwarded to `run_grid_search`.
    splitting: {'random', 'scaffold'}
        'random' uses sklearn's `train_test_split`; 'scaffold' uses
        `train_test_scaffold_split` with `scaffold_series`. Both are
        stratified on `y`.
    test_size: float, forwarded to the chosen splitter.
    scaffold_series: scaffold labels, only used when splitting == 'scaffold'.

    Raises
    ------
    ValueError
        If `splitting` is not one of the supported strategies. Previously an
        unknown value fell through both branches and failed later with an
        unbound-variable error.
    '''
    # Validate first so a typo fails fast with a clear message
    if splitting not in ('scaffold', 'random'):
        raise ValueError(
            f"Unknown splitting strategy {splitting!r}; expected 'random' or 'scaffold'.")

    if splitting == 'scaffold':
        X_train, X_test, y_train, y_test = \
            train_test_scaffold_split(X, y, scaffold_series = scaffold_series,
                test_size=test_size, stratify=y)
    else:
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=test_size, stratify=y)

    run_grid_search(estimator, 
                    X_train, y_train, X_test, y_test, 
                    hyperparams = hyperparams)

    # Baseline: best/median ROC-AUC of the individual feature columns
    _get_best_conf_roc_auc(X_train, y_train, X_test, y_test)

##  Grid Search
### Hyperparameter Tuning
#### DEKOIS Library

In [14]:
# Library used for both training and testing in this section
library = 'DEKOIS'

# Train and test over DEKOIS: select the entries indexed under `library`
# from the feature table, the target vector and the 'scff_generic' scaffold column
X = X_merged_dksc.loc[library]
y = y_true_merged.loc[library]
scaffold_series = df_scff_murcko['scff_generic'].loc[library]

### GS: Linear SVM 
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [15]:
%%time
from sklearn.svm import SVC
# Linear-kernel SVM; probability=True is needed because run_grid_search
# calls predict_proba on the fitted model
estimator = SVC(kernel = 'linear', probability=True)
# Four log-spaced candidate values of C between 1e-15 and 1e-6
hyperparams = {'C': np.geomspace(1e-15, 1e-6, 4)}

# RANDOM Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.873
- Train ROC-AUC: 0.881
- Test ROC-AUC: 0.923
- Best hyperparameters {'estimator__C': 1e-12}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.887 	> Median ROC-AUC: 0.815
> Test best conf. ROC-AUC: 0.956 	> Median ROC-AUC: 0.868
------------------------------------------------------------
CPU times: user 617 ms, sys: 295 ms, total: 912 ms
Wall time: 2.29 s


In [16]:
%%time
# SVC is imported in the random-splitting cell above
# Same linear SVM and C grid as the random-splitting cell
estimator = SVC(kernel = 'linear', probability=True)
hyperparams = {'C': np.geomspace(1e-15, 1e-6, 4)}

# SCAFFOLD Train test splitting (uses the `scaffold_series` selected for the library)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.858
- Train ROC-AUC: 0.867
- Test ROC-AUC: 0.956
- Best hyperparameters {'estimator__C': 1e-12}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 525 ms, sys: 87.3 ms, total: 612 ms
Wall time: 705 ms


### GS: Radial Basis Function SVM
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [17]:
%%time
# RBF-kernel SVM (SVC is imported in an earlier cell); probability=True is
# needed because run_grid_search calls predict_proba
estimator = SVC(kernel = 'rbf', probability=True)

# 4 x 4 grid: log-spaced C in [1e-8, 1e-2] and gamma in [1e-6, 1]
hyperparams = {'C': np.geomspace(1e-8, 1e-2, 4), 
               'gamma': np.geomspace(1e-6, 1e0, 4)}

# RANDOM Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.882
- Train ROC-AUC: 0.892
- Test ROC-AUC: 0.890
- Best hyperparameters {'estimator__C': 1e-08, 'estimator__gamma': 0.0001}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.825
> Test best conf. ROC-AUC: 0.939 	> Median ROC-AUC: 0.854
------------------------------------------------------------
CPU times: user 868 ms, sys: 158 ms, total: 1.03 s
Wall time: 2.08 s


In [18]:
%%time
# Same RBF SVM and C/gamma grid as the random-splitting cell
estimator = SVC(kernel = 'rbf', probability=True)

hyperparams = {'C': np.geomspace(1e-8, 1e-2, 4), 
               'gamma': np.geomspace(1e-6, 1e0, 4)}

# SCAFFOLD Train test splitting (uses the `scaffold_series` selected for the library)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.853
- Train ROC-AUC: 0.867
- Test ROC-AUC: 0.956
- Best hyperparameters {'estimator__C': 1e-08, 'estimator__gamma': 1e-06}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 863 ms, sys: 315 ms, total: 1.18 s
Wall time: 2.11 s


### GS: Logistic Regression
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [19]:
%%time
from sklearn.linear_model import LogisticRegression
estimator = LogisticRegression()

# Full cross product of C x penalty x solver (5 x 2 x 2 combinations).
# NOTE(review): scikit-learn's 'lbfgs' solver does not support the 'l1'
# penalty, so those combinations are expected to fail to fit -- confirm
# whether they should stay in the grid.
hyperparams = {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1], 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# RANDOM Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.888
- Train ROC-AUC: 0.911
- Test ROC-AUC: 0.883
- Best hyperparameters {'estimator__C': 0.001, 'estimator__penalty': 'l2', 'estimator__solver': 'liblinear'}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.903 	> Median ROC-AUC: 0.842
> Test best conf. ROC-AUC: 0.929 	> Median ROC-AUC: 0.808
------------------------------------------------------------
CPU times: user 1.11 s, sys: 520 ms, total: 1.63 s
Wall time: 1.22 s


In [20]:
%%time
# LogisticRegression is imported in the random-splitting cell above
estimator = LogisticRegression()

# Same grid as the random-splitting cell.
# NOTE(review): scikit-learn's 'lbfgs' solver does not support the 'l1'
# penalty, so those combinations are expected to fail to fit -- confirm.
hyperparams = {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1], 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# SCAFFOLD Train test splitting (uses the `scaffold_series` selected for the library)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.862
- Train ROC-AUC: 0.868
- Test ROC-AUC: 0.958
- Best hyperparameters {'estimator__C': 0.0001, 'estimator__penalty': 'l2', 'estimator__solver': 'liblinear'}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 1.16 s, sys: 484 ms, total: 1.65 s
Wall time: 1.16 s


### GS: K-Neighbors Classifier
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [21]:
%%time
from sklearn.neighbors import KNeighborsClassifier 
estimator = KNeighborsClassifier()

# 4 x 2 grid: number of neighbors and the `p` parameter of the distance metric
hyperparams = {'n_neighbors': [25, 55, 125, 225], 
               'p': [1, 2]
             }

# RANDOM Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.843
- Train ROC-AUC: 0.875
- Test ROC-AUC: 0.959
- Best hyperparameters {'estimator__n_neighbors': 125, 'estimator__p': 2}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.873 	> Median ROC-AUC: 0.806
> Test best conf. ROC-AUC: 0.977 	> Median ROC-AUC: 0.903
------------------------------------------------------------
CPU times: user 893 ms, sys: 134 ms, total: 1.03 s
Wall time: 1.42 s


In [22]:
%%time
# Re-import is redundant after the random-splitting cell, but harmless
from sklearn.neighbors import KNeighborsClassifier 
estimator = KNeighborsClassifier()

# Same grid as the random-splitting cell
hyperparams = {'n_neighbors': [25, 55, 125, 225], 
               'p': [1, 2]
             }

# SCAFFOLD Train test splitting (uses the `scaffold_series` selected for the library)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.838
- Train ROC-AUC: 0.862
- Test ROC-AUC: 0.956
- Best hyperparameters {'estimator__n_neighbors': 225, 'estimator__p': 2}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 846 ms, sys: 93.1 ms, total: 939 ms
Wall time: 1.3 s


### GS: Decision Tree Classifier
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [23]:
%%time
from sklearn.tree import DecisionTreeClassifier

estimator = DecisionTreeClassifier(splitter='best')
# 2 x 3 x 3 x 3 grid. The float values of min_samples_split are fractions of
# the number of samples (scikit-learn convention for float values).
hyperparams = {'criterion': ['gini', 'entropy'], 
               'max_depth': [2, 3, 5],
               'min_samples_split': [0.2, 0.25, 0.3],
               'max_features': [None, 'sqrt', 'log2']}

# RANDOM Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.847
- Train ROC-AUC: 0.910
- Test ROC-AUC: 0.706
- Best hyperparameters {'estimator__criterion': 'entropy', 'estimator__max_depth': 3, 'estimator__max_features': 'sqrt', 'estimator__min_samples_split': 0.2}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.894 	> Median ROC-AUC: 0.826
> Test best conf. ROC-AUC: 0.939 	> Median ROC-AUC: 0.857
------------------------------------------------------------
CPU times: user 1.2 s, sys: 271 ms, total: 1.48 s
Wall time: 1.87 s


In [24]:
%%time
# Re-import is redundant after the random-splitting cell, but harmless
from sklearn.tree import DecisionTreeClassifier

estimator = DecisionTreeClassifier(splitter='best')
# Same grid as the random-splitting cell
hyperparams = {'criterion': ['gini', 'entropy'], 
               'max_depth': [2, 3, 5],
               'min_samples_split': [0.2, 0.25, 0.3],
               'max_features': [None, 'sqrt', 'log2']}

# SCAFFOLD Train test splitting (uses the `scaffold_series` selected for the library)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.824
- Train ROC-AUC: 0.844
- Test ROC-AUC: 0.829
- Best hyperparameters {'estimator__criterion': 'entropy', 'estimator__max_depth': 2, 'estimator__max_features': 'sqrt', 'estimator__min_samples_split': 0.2}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 1.18 s, sys: 289 ms, total: 1.47 s
Wall time: 1.79 s


### GS: Bagging Classifier (k-NN as base estimator) 
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [25]:
%%time
from sklearn.ensemble import BaggingClassifier

# Base learner: distance-weighted k-NN (KNeighborsClassifier is imported in an earlier cell)
knn = KNeighborsClassifier(n_neighbors=125, p=1, 
                           weights='distance')

# NOTE(review): newer scikit-learn releases renamed `base_estimator` to
# `estimator` -- confirm against the installed version before re-running.
estimator = BaggingClassifier(base_estimator=knn, 
                              n_jobs=6, oob_score=True)

# Single-point grid: only n_estimators=300 is evaluated
hyperparams = {'n_estimators': [300]}

# The grid search call is commented out, so this cell only (re)defines the
# notebook-level `knn`, `estimator` and `hyperparams`.
# RANDOM Train test splitting
# split_and_gs(X, y, estimator, hyperparams, 
#              splitting='random', test_size=0.25, 
#              scaffold_series=None)

CPU times: user 10.2 ms, sys: 8.9 ms, total: 19.1 ms
Wall time: 15.7 ms


In [26]:
%%time
# BaggingClassifier is imported in the random-splitting cell above

# Base learner: distance-weighted k-NN
knn = KNeighborsClassifier(n_neighbors=125, p=1, 
                           weights='distance')

# NOTE(review): newer scikit-learn releases renamed `base_estimator` to
# `estimator` -- confirm against the installed version before re-running.
estimator = BaggingClassifier(base_estimator=knn, 
                              n_jobs=6, oob_score=True)

# Single-point grid: only n_estimators=300 is evaluated
hyperparams = {'n_estimators': [300]}

# The grid search call is commented out, so this cell only (re)defines the
# notebook-level `knn`, `estimator` and `hyperparams`.
# SCAFFOLD Train test splitting
# split_and_gs(X, y, estimator, hyperparams, 
#              splitting='scaffold', test_size=0.25, 
#              scaffold_series=scaffold_series)

CPU times: user 34 µs, sys: 12 µs, total: 46 µs
Wall time: 54.4 µs


### GS: Random Forest
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [27]:
%%time
from sklearn.ensemble import RandomForestClassifier

# No random_state is set, so results vary between runs
estimator = RandomForestClassifier()

# 2 x 2 x 2 x 2 grid (16 combinations)
hyperparams = {'n_estimators': [300, 500],
             'max_depth': [2,  5],
             'min_samples_leaf': [0.1, 0.2],
             'max_features': ['log2', 'sqrt']
            }

# RANDOM Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.889
- Train ROC-AUC: 0.929
- Test ROC-AUC: 0.854
- Best hyperparameters {'estimator__max_depth': 2, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 0.1, 'estimator__n_estimators': 500}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.903 	> Median ROC-AUC: 0.83
> Test best conf. ROC-AUC: 0.908 	> Median ROC-AUC: 0.848
------------------------------------------------------------
CPU times: user 2.11 s, sys: 253 ms, total: 2.36 s
Wall time: 18.1 s


In [28]:
%%time
# Re-import is redundant after the random-splitting cell, but harmless
from sklearn.ensemble import RandomForestClassifier

# No random_state is set, so results vary between runs
estimator = RandomForestClassifier()

# Same grid as the random-splitting cell
hyperparams = {'n_estimators': [300, 500],
             'max_depth': [2,  5],
             'min_samples_leaf': [0.1, 0.2],
             'max_features': ['log2', 'sqrt']
            }

# SCAFFOLD Train test splitting (uses the `scaffold_series` selected for the library)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.854
- Train ROC-AUC: 0.906
- Test ROC-AUC: 0.955
- Best hyperparameters {'estimator__max_depth': 2, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 0.1, 'estimator__n_estimators': 300}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 1.63 s, sys: 172 ms, total: 1.8 s
Wall time: 17.7 s


##  Grid Search
### Hyperparameter Tuning
#### Train and Test sets from different libraries 

In [29]:
# Train on one molecular library and test on another

def split_by_library_and_gs(X, y, lib_train_name, lib_test_name,
                            estimator=None, hyperparams=None):
    '''Train-Test "split" by library, and run Grid Search. Splits the main dataframe by library
    using different molecular libraries for Training and Testing.

    Parameters
    ----------
    X, y: features and labels; `.loc[<library name>]` must select the entries
        of one library.
    lib_train_name, lib_test_name: index labels of the libraries used for
        training and for testing.
    estimator: sklearn estimator, optional
        Estimator forwarded to `run_grid_search`.
    hyperparams: dict, optional
        Candidate values forwarded to `run_grid_search`.

    When `estimator` or `hyperparams` is omitted, the notebook-level variable
    of the same name is used, which is how the function originally behaved.
    Passing them explicitly is preferred because it does not depend on which
    cell was executed last.

    Raises
    ------
    NameError
        If an argument is omitted and no notebook-level variable of that name exists.
    '''
    # Backward-compatible fallback to the notebook globals
    if estimator is None:
        try:
            estimator = globals()['estimator']
        except KeyError:
            raise NameError("name 'estimator' is not defined; pass it explicitly") from None
    if hyperparams is None:
        try:
            hyperparams = globals()['hyperparams']
        except KeyError:
            raise NameError("name 'hyperparams' is not defined; pass it explicitly") from None

    X_train = X.loc[lib_train_name]
    y_train = y.loc[lib_train_name]

    X_test = X.loc[lib_test_name]
    y_test = y.loc[lib_test_name]

    run_grid_search(estimator, 
                        X_train, y_train, X_test, y_test, 
                        hyperparams = hyperparams)

    # Baseline: best/median ROC-AUC of the individual feature columns
    _get_best_conf_roc_auc(X_train, y_train, X_test, y_test)

In [30]:
# **********************************************
# Train and Test sets from different libraries
# **********************************************
# X and y are rebound to the full merged data; earlier in the notebook they
# referred to the single-library selection made with .loc[library]
X = X_merged_dksc
y = y_true_merged

### GS: Linear SVM 
#### DEKOIS and DUD 

In [31]:
%%time
from sklearn.svm import SVC
# split_by_library_and_gs takes no estimator/grid arguments here: it picks up
# the notebook-level `estimator` and `hyperparams` defined in this cell
estimator = SVC(kernel = 'linear', probability=True)
hyperparams = {'C': np.geomspace(1e-15, 1e-6, 4)}

# Train DEKOIS, test DUD
train_lib = 'DEKOIS'
test_lib = 'DUD'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 1219, with 39 actives.
No. of molecules in test set: 4495, with 136 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.884
- Train ROC-AUC: 0.891
- Test ROC-AUC: 0.595
- Best hyperparameters {'estimator__C': 1e-12}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
> Test best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
------------------------------------------------------------
CPU times: user 675 ms, sys: 9.5 ms, total: 685 ms
Wall time: 1.14 s


In [32]:
%%time
# SVC is imported in the previous cell
# split_by_library_and_gs picks up the notebook-level `estimator` and
# `hyperparams` defined in this cell
estimator = SVC(kernel = 'linear', probability=True)
hyperparams = {'C': np.geomspace(1e-15, 1e-6, 4)}

# Train DUD, test DEKOIS
train_lib = 'DUD'
test_lib = 'DEKOIS'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 4495, with 136 actives.
No. of molecules in test set: 1219, with 39 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.816
- Train ROC-AUC: 0.815
- Test ROC-AUC: 0.856
- Best hyperparameters {'estimator__C': 1e-15}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
> Test best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
------------------------------------------------------------
CPU times: user 1.71 s, sys: 71 ms, total: 1.78 s
Wall time: 9.47 s


### GS: Radial Basis Function SVM
#### DEKOIS and DUD

In [33]:
%%time
# split_by_library_and_gs picks up the notebook-level `estimator` and
# `hyperparams` defined in this cell
estimator = SVC(kernel = 'rbf', probability=True)

# 4 x 4 grid: log-spaced C in [1e-8, 1e-2] and gamma in [1e-6, 1]
hyperparams = {'C': np.geomspace(1e-8, 1e-2, 4), 
               'gamma': np.geomspace(1e-6, 1e0, 4)}

# Train DEKOIS, test DUD
train_lib = 'DEKOIS'
test_lib = 'DUD'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 1219, with 39 actives.
No. of molecules in test set: 4495, with 136 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.884
- Train ROC-AUC: 0.891
- Test ROC-AUC: 0.595
- Best hyperparameters {'estimator__C': 1e-08, 'estimator__gamma': 1e-06}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
> Test best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
------------------------------------------------------------
CPU times: user 1.02 s, sys: 48.7 ms, total: 1.06 s
Wall time: 3.51 s


In [34]:
%%time
# split_by_library_and_gs picks up the notebook-level `estimator` and
# `hyperparams` defined in this cell
estimator = SVC(kernel = 'rbf', probability=True)

# 4 x 4 grid: log-spaced C in [1e-8, 1e-2] and gamma in [1e-6, 1]
hyperparams = {'C': np.geomspace(1e-8, 1e-2, 4), 
               'gamma': np.geomspace(1e-6, 1e0, 4)}

# Train DUD, test DEKOIS
train_lib = 'DUD'
test_lib = 'DEKOIS'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 4495, with 136 actives.
No. of molecules in test set: 1219, with 39 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.887
- Train ROC-AUC: 0.984
- Test ROC-AUC: 0.601
- Best hyperparameters {'estimator__C': 0.01, 'estimator__gamma': 0.01}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
> Test best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
------------------------------------------------------------
CPU times: user 4.45 s, sys: 139 ms, total: 4.59 s
Wall time: 57.4 s


### GS: Logistic Regression
#### DEKOIS and DUD

In [35]:
%%time
from sklearn.linear_model import LogisticRegression
# split_by_library_and_gs picks up the notebook-level `estimator` and
# `hyperparams` defined in this cell
estimator = LogisticRegression()

# NOTE(review): scikit-learn's 'lbfgs' solver does not support the 'l1'
# penalty, so those combinations are expected to fail to fit -- confirm.
hyperparams = {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1], 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# Train DEKOIS, test DUD
train_lib = 'DEKOIS'
test_lib = 'DUD'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 1219, with 39 actives.
No. of molecules in test set: 4495, with 136 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.887
- Train ROC-AUC: 0.913
- Test ROC-AUC: 0.581
- Best hyperparameters {'estimator__C': 0.001, 'estimator__penalty': 'l2', 'estimator__solver': 'liblinear'}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
> Test best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
------------------------------------------------------------
CPU times: user 1.16 s, sys: 488 ms, total: 1.65 s
Wall time: 1.41 s


In [36]:
%%time
# LogisticRegression is imported in the previous cell
# split_by_library_and_gs picks up the notebook-level `estimator` and
# `hyperparams` defined in this cell
estimator = LogisticRegression()

# NOTE(review): scikit-learn's 'lbfgs' solver does not support the 'l1'
# penalty, so those combinations are expected to fail to fit -- confirm.
hyperparams = {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1], 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# Train DUD, test DEKOIS
train_lib = 'DUD'
test_lib = 'DEKOIS'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 4495, with 136 actives.
No. of molecules in test set: 1219, with 39 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.841
- Train ROC-AUC: 0.921
- Test ROC-AUC: 0.722
- Best hyperparameters {'estimator__C': 0.1, 'estimator__penalty': 'l2', 'estimator__solver': 'liblinear'}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
> Test best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
------------------------------------------------------------
CPU times: user 1.56 s, sys: 613 ms, total: 2.17 s
Wall time: 4.36 s


### GS: Random Forest
#### DEKOIS and DUD

In [37]:
%%time
from sklearn.ensemble import RandomForestClassifier

# Random forest to be tuned by the grid search below.
# The seed is fixed so that the forest's own randomness (bootstrap samples and
# per-split feature subsets) is repeatable when the cell is re-run; without it
# the reported ROC-AUC values change from run to run.
estimator = RandomForestClassifier(random_state=42)

# Grid explored by the search. Fractional `min_samples_leaf` values are read by
# scikit-learn as a fraction of the training samples.
hyperparams = {'n_estimators': [300, 500],
               'max_depth': [3, 5, 7],
               'min_samples_leaf': [0.03, 0.05],
               'max_features': ['sqrt']}

# Train DEKOIS, test DUD
# NOTE(review): `estimator` and `hyperparams` are not passed to the call below;
# presumably the helper reads them from the notebook namespace -- confirm.
train_lib = 'DEKOIS'
test_lib = 'DUD'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 1219, with 39 actives.
No. of molecules in test set: 4495, with 136 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.870
- Train ROC-AUC: 0.928
- Test ROC-AUC: 0.551
- Best hyperparameters {'estimator__max_depth': 3, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 0.05, 'estimator__n_estimators': 300}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
> Test best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
------------------------------------------------------------
CPU times: user 1.67 s, sys: 41.2 ms, total: 1.71 s
Wall time: 19.1 s


In [38]:
%%time
from sklearn.ensemble import RandomForestClassifier

# Random forest to be tuned by the grid search below.
# The seed is fixed so that the forest's own randomness (bootstrap samples and
# per-split feature subsets) is repeatable when the cell is re-run; without it
# the reported ROC-AUC values change from run to run.
estimator = RandomForestClassifier(random_state=42)

# Grid explored by the search. Fractional `min_samples_leaf` values are read by
# scikit-learn as a fraction of the training samples.
hyperparams = {'n_estimators': [300, 500],
               'max_depth': [3, 5, 7],
               'min_samples_leaf': [0.03, 0.05],
               'max_features': ['sqrt']}

# Train DUD, test DEKOIS (the reverse direction of the previous cell)
# NOTE(review): `estimator` and `hyperparams` are not passed to the call below;
# presumably the helper reads them from the notebook namespace -- confirm.
train_lib = 'DUD'
test_lib = 'DEKOIS'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 4495, with 136 actives.
No. of molecules in test set: 1219, with 39 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.840
- Train ROC-AUC: 0.912
- Test ROC-AUC: 0.638
- Best hyperparameters {'estimator__max_depth': 7, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 0.03, 'estimator__n_estimators': 300}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
> Test best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
------------------------------------------------------------
CPU times: user 3.33 s, sys: 64.2 ms, total: 3.4 s
Wall time: 40.9 s


***
##  Grid Search
### Hyperparameter Tuning
#### Merged Libraries 

In [39]:
# Train and test over the merged libraries: from here on the grid-search cells
# use the merged feature table and its activity labels.
X = X_merged_dksc
y = y_true_merged
# Generic Murcko scaffold column, handed to the scaffold-based splits below.
scaffold_series = df_scff_murcko['scff_generic']

### GS: Linear SVM 
#### Merged libraries - Random and Stratified Scaffold Splitting

In [40]:
%%time
from sklearn.svm import SVC

# Linear-kernel SVM; probability estimates are switched on.
estimator = SVC(probability=True, kernel='linear')

# Four geometrically spaced candidates for C between 1e-15 and 1e-6.
# NOTE(review): the recorded run selected the largest value in this grid,
# which may indicate the range should be extended upwards -- confirm.
hyperparams = dict(C=np.geomspace(1e-15, 1e-6, num=4))

# Random train/test split with 25% of the molecules held out; no scaffold
# information is supplied for this splitting mode.
split_and_gs(
    X, y, estimator, hyperparams,
    scaffold_series=None,
    test_size=0.25,
    splitting='random',
)

No. of molecules in train set: 4379, with 225 actives.
No. of molecules in test set: 1460, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.782
- Train ROC-AUC: 0.835
- Test ROC-AUC: 0.743
- Best hyperparameters {'estimator__C': 1e-06}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.697 	> Median ROC-AUC: 0.621
> Test best conf. ROC-AUC: 0.65 	> Median ROC-AUC: 0.554
------------------------------------------------------------
CPU times: user 2.76 s, sys: 103 ms, total: 2.87 s
Wall time: 15.4 s


In [41]:
%%time
# SVC is not imported in this cell; it must already be in the notebook
# namespace (from sklearn.svm import SVC).
# Linear-kernel SVM; probability estimates are switched on.
estimator = SVC(probability=True, kernel='linear')

# Four geometrically spaced candidates for C between 1e-15 and 1e-6.
hyperparams = dict(C=np.geomspace(1e-15, 1e-6, num=4))

# Scaffold-based train/test split with a requested test fraction of 0.25,
# driven by the scaffold assigned to each molecule.
split_and_gs(
    X, y, estimator, hyperparams,
    scaffold_series=scaffold_series,
    test_size=0.25,
    splitting='scaffold',
)

No. of molecules in train set: 4154, with 225 actives.
No. of molecules in test set: 1685, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.770
- Train ROC-AUC: 0.824
- Test ROC-AUC: 0.698
- Best hyperparameters {'estimator__C': 1e-06}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.683 	> Median ROC-AUC: 0.588
> Test best conf. ROC-AUC: 0.704 	> Median ROC-AUC: 0.641
------------------------------------------------------------
CPU times: user 2.58 s, sys: 93.3 ms, total: 2.67 s
Wall time: 14.1 s


### GS: Radial Basis Function SVM
#### Merged libraries - Random and Stratified Scaffold Splitting

In [42]:
%%time
# RBF-kernel SVM; probability estimates are switched on.
# SVC is not imported in this cell; it must already be in the notebook namespace.
estimator = SVC(probability=True, kernel='rbf')

# 4 x 4 grid: C spans 1e-8..1e-2 and gamma spans 1e-6..1, both geometrically spaced.
c_candidates = np.geomspace(1e-8, 1e-2, num=4)
gamma_candidates = np.geomspace(1e-6, 1e0, num=4)
hyperparams = dict(C=c_candidates, gamma=gamma_candidates)

# Random train/test split with 25% of the molecules held out; no scaffold
# information is supplied for this splitting mode.
split_and_gs(
    X, y, estimator, hyperparams,
    scaffold_series=None,
    test_size=0.25,
    splitting='random',
)

No. of molecules in train set: 4379, with 225 actives.
No. of molecules in test set: 1460, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.846
- Train ROC-AUC: 0.975
- Test ROC-AUC: 0.839
- Best hyperparameters {'estimator__C': 0.01, 'estimator__gamma': 0.01}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.678 	> Median ROC-AUC: 0.611
> Test best conf. ROC-AUC: 0.686 	> Median ROC-AUC: 0.586
------------------------------------------------------------
CPU times: user 6.72 s, sys: 295 ms, total: 7.01 s
Wall time: 1min 32s


In [43]:
%%time
# RBF-kernel SVM; probability estimates are switched on.
# SVC is not imported in this cell; it must already be in the notebook namespace.
estimator = SVC(probability=True, kernel='rbf')

# 4 x 4 grid: C spans 1e-8..1e-2 and gamma spans 1e-6..1, both geometrically spaced.
c_candidates = np.geomspace(1e-8, 1e-2, num=4)
gamma_candidates = np.geomspace(1e-6, 1e0, num=4)
hyperparams = dict(C=c_candidates, gamma=gamma_candidates)

# Scaffold-based train/test split with a requested test fraction of 0.25,
# driven by the scaffold assigned to each molecule.
split_and_gs(
    X, y, estimator, hyperparams,
    scaffold_series=scaffold_series,
    test_size=0.25,
    splitting='scaffold',
)

No. of molecules in train set: 4154, with 225 actives.
No. of molecules in test set: 1685, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.815
- Train ROC-AUC: 0.974
- Test ROC-AUC: 0.704
- Best hyperparameters {'estimator__C': 0.01, 'estimator__gamma': 0.01}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.683 	> Median ROC-AUC: 0.588
> Test best conf. ROC-AUC: 0.704 	> Median ROC-AUC: 0.641
------------------------------------------------------------
CPU times: user 5.83 s, sys: 244 ms, total: 6.07 s
Wall time: 1min 25s


### GS: Logistic Regression
#### Merged libraries - Random and Stratified Scaffold Splitting

In [44]:
%%time
from sklearn.linear_model import LogisticRegression

# Unfitted logistic-regression model to be tuned by the grid search below.
estimator = LogisticRegression()

# Grid explored by the search: regularisation parameter C, penalty type and solver.
# NOTE(review): per the scikit-learn docs the 'lbfgs' solver does not support
# the 'l1' penalty, so the ('l1', 'lbfgs') combinations presumably fail to
# fit -- confirm how split_and_gs treats failed fits.
hyperparams = dict(
    C=[1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    penalty=['l1', 'l2'],
    solver=['lbfgs', 'liblinear'],
)

# Random train/test split with 25% of the molecules held out; no scaffold
# information is supplied for this splitting mode.
split_and_gs(
    X, y, estimator, hyperparams,
    scaffold_series=None,
    test_size=0.25,
    splitting='random',
)

No. of molecules in train set: 4379, with 225 actives.
No. of molecules in test set: 1460, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.831
- Train ROC-AUC: 0.888
- Test ROC-AUC: 0.803
- Best hyperparameters {'estimator__C': 0.1, 'estimator__penalty': 'l2', 'estimator__solver': 'lbfgs'}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.688 	> Median ROC-AUC: 0.611
> Test best conf. ROC-AUC: 0.662 	> Median ROC-AUC: 0.582
------------------------------------------------------------
CPU times: user 1.75 s, sys: 631 ms, total: 2.39 s
Wall time: 4.35 s


In [45]:
%%time
# LogisticRegression is not imported in this cell; it must already be in the
# notebook namespace (from sklearn.linear_model import LogisticRegression).
# Unfitted logistic-regression model to be tuned by the grid search below.
estimator = LogisticRegression()

# Grid explored by the search: regularisation parameter C, penalty type and solver.
# NOTE(review): per the scikit-learn docs the 'lbfgs' solver does not support
# the 'l1' penalty, so the ('l1', 'lbfgs') combinations presumably fail to
# fit -- confirm how split_and_gs treats failed fits.
hyperparams = dict(
    C=[1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    penalty=['l1', 'l2'],
    solver=['lbfgs', 'liblinear'],
)

# Scaffold-based train/test split with a requested test fraction of 0.25,
# driven by the scaffold assigned to each molecule.
split_and_gs(
    X, y, estimator, hyperparams,
    scaffold_series=scaffold_series,
    test_size=0.25,
    splitting='scaffold',
)

No. of molecules in train set: 4154, with 225 actives.
No. of molecules in test set: 1685, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.823
- Train ROC-AUC: 0.903
- Test ROC-AUC: 0.741
- Best hyperparameters {'estimator__C': 0.1, 'estimator__penalty': 'l2', 'estimator__solver': 'liblinear'}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.683 	> Median ROC-AUC: 0.588
> Test best conf. ROC-AUC: 0.704 	> Median ROC-AUC: 0.641
------------------------------------------------------------
CPU times: user 1.85 s, sys: 510 ms, total: 2.36 s
Wall time: 4.37 s


### GS: Decision Tree Classifier
#### Merged libraries - Random and Stratified Scaffold Splitting

In [46]:
%%time
from sklearn.tree import DecisionTreeClassifier

# Single decision tree to be tuned by the grid search below.
# The seed is fixed because tree construction is randomised: the 'sqrt' and
# 'log2' settings of `max_features` subsample candidate features, and
# scikit-learn permutes features at every split even with splitter='best'.
# Without a seed the reported scores change from run to run.
estimator = DecisionTreeClassifier(splitter='best', random_state=42)

# Grid explored by the search. Fractional `min_samples_split` values are read
# by scikit-learn as a fraction of the training samples.
hyperparams = {'criterion': ['gini', 'entropy'],
               'max_depth': [2, 3, 5],
               'min_samples_split': [0.2, 0.25, 0.3],
               'max_features': [None, 'sqrt', 'log2']}

# Random train/test split with 25% of the molecules held out; no scaffold
# information is supplied for this splitting mode.
split_and_gs(X, y, estimator, hyperparams,
             splitting='random', test_size=0.25,
             scaffold_series=None)

No. of molecules in train set: 4379, with 225 actives.
No. of molecules in test set: 1460, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.749
- Train ROC-AUC: 0.797
- Test ROC-AUC: 0.718
- Best hyperparameters {'estimator__criterion': 'entropy', 'estimator__max_depth': 5, 'estimator__max_features': None, 'estimator__min_samples_split': 0.2}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.682 	> Median ROC-AUC: 0.601
> Test best conf. ROC-AUC: 0.68 	> Median ROC-AUC: 0.614
------------------------------------------------------------
CPU times: user 3.25 s, sys: 385 ms, total: 3.63 s
Wall time: 5.76 s


In [47]:
%%time
from sklearn.tree import DecisionTreeClassifier

# Single decision tree to be tuned by the grid search below.
# The seed is fixed because tree construction is randomised: the 'sqrt' and
# 'log2' settings of `max_features` subsample candidate features, and
# scikit-learn permutes features at every split even with splitter='best'.
# Without a seed the reported scores change from run to run.
estimator = DecisionTreeClassifier(splitter='best', random_state=42)

# Grid explored by the search. Fractional `min_samples_split` values are read
# by scikit-learn as a fraction of the training samples.
hyperparams = {'criterion': ['gini', 'entropy'],
               'max_depth': [2, 3, 5],
               'min_samples_split': [0.2, 0.25, 0.3],
               'max_features': [None, 'sqrt', 'log2']}

# Scaffold-based train/test split with a requested test fraction of 0.25,
# driven by the scaffold assigned to each molecule.
split_and_gs(X, y, estimator, hyperparams,
             splitting='scaffold', test_size=0.25,
             scaffold_series=scaffold_series)

No. of molecules in train set: 4154, with 225 actives.
No. of molecules in test set: 1685, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.702
- Train ROC-AUC: 0.799
- Test ROC-AUC: 0.605
- Best hyperparameters {'estimator__criterion': 'entropy', 'estimator__max_depth': 5, 'estimator__max_features': None, 'estimator__min_samples_split': 0.25}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.683 	> Median ROC-AUC: 0.588
> Test best conf. ROC-AUC: 0.704 	> Median ROC-AUC: 0.641
------------------------------------------------------------
CPU times: user 2.93 s, sys: 294 ms, total: 3.23 s
Wall time: 5.28 s


### GS: Bagging Classifier (k-NN as base estimator) 
#### Merged libraries - Random and Stratified Scaffold Splitting

In [48]:
%%time
from sklearn.ensemble import BaggingClassifier

# Base learner for the bagging ensemble: k-NN with 125 neighbours,
# Minkowski p=1 and distance-weighted votes.
# NOTE(review): KNeighborsClassifier is not imported in this cell; presumably
# an earlier cell imports it -- confirm, otherwise this raises NameError.
knn = KNeighborsClassifier(n_neighbors=125, p=1, 
                           weights='distance')

# Bagging ensemble around the k-NN learner, fitted on 3 parallel jobs with
# out-of-bag scoring enabled.
# NOTE(review): `base_estimator` was renamed to `estimator` in newer
# scikit-learn releases -- check against the pinned version.
estimator = BaggingClassifier(base_estimator=knn, 
                              n_jobs=3, oob_score=True)

# Only a single ensemble size is explored.
hyperparams = {'n_estimators': [300]}

# The grid search itself is disabled: this cell only builds the objects above
# and produces no results. The original call is kept below for reference.
# RANDOM Train test splitting
# split_and_gs(X, y, estimator, hyperparams, 
#              splitting='random', test_size=0.25, 
#              scaffold_series=None)

CPU times: user 63 µs, sys: 9 µs, total: 72 µs
Wall time: 79.4 µs


In [49]:
%%time
# from sklearn.ensemble import BaggingClassifier

# Base learner for the bagging ensemble: k-NN with 125 neighbours,
# Minkowski p=1 and distance-weighted votes.
# NOTE(review): neither KNeighborsClassifier nor BaggingClassifier is imported
# in this cell; both must already be in the notebook namespace -- confirm.
knn = KNeighborsClassifier(n_neighbors=125, p=1, 
                           weights='distance')

# Bagging ensemble around the k-NN learner, fitted on 3 parallel jobs with
# out-of-bag scoring enabled.
# NOTE(review): `base_estimator` was renamed to `estimator` in newer
# scikit-learn releases -- check against the pinned version.
estimator = BaggingClassifier(base_estimator=knn, 
                              n_jobs=3, oob_score=True)

# Only a single ensemble size is explored.
hyperparams = {'n_estimators': [300]}

# The grid search itself is disabled: this cell only builds the objects above
# and produces no results. The original call is kept below for reference.
# SCAFFOLD Train test splitting
# split_and_gs(X, y, estimator, hyperparams, 
#              splitting='scaffold', test_size=0.25, 
#              scaffold_series=scaffold_series)

CPU times: user 37 µs, sys: 6 µs, total: 43 µs
Wall time: 50.8 µs


### GS: Random Forest
#### Merged libraries - Random and Stratified Scaffold Splitting

In [50]:
%%time
from sklearn.ensemble import RandomForestClassifier

# Random forest to be tuned by the grid search below.
# The seed is fixed so that the forest's own randomness (bootstrap samples and
# per-split feature subsets) is repeatable when the cell is re-run; without it
# the reported ROC-AUC values change from run to run.
estimator = RandomForestClassifier(random_state=42)

# Grid explored by the search. Fractional `min_samples_leaf` values are read by
# scikit-learn as a fraction of the training samples.
hyperparams = {'n_estimators': [300, 500],
               'max_depth': [3, 5, 7],
               'min_samples_leaf': [0.03, 0.05, 0.1],
               'max_features': ['sqrt']}

# Random train/test split with 25% of the molecules held out; no scaffold
# information is supplied for this splitting mode.
split_and_gs(X, y, estimator, hyperparams,
             splitting='random', test_size=0.25,
             scaffold_series=None)

No. of molecules in train set: 4379, with 225 actives.
No. of molecules in test set: 1460, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.756
- Train ROC-AUC: 0.865
- Test ROC-AUC: 0.778
- Best hyperparameters {'estimator__max_depth': 7, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 0.03, 'estimator__n_estimators': 500}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.682 	> Median ROC-AUC: 0.606
> Test best conf. ROC-AUC: 0.696 	> Median ROC-AUC: 0.596
------------------------------------------------------------
CPU times: user 5.75 s, sys: 132 ms, total: 5.89 s
Wall time: 58.3 s


In [51]:
%%time
from sklearn.ensemble import RandomForestClassifier

# Random forest to be tuned by the grid search below.
# The seed is fixed so that the forest's own randomness (bootstrap samples and
# per-split feature subsets) is repeatable when the cell is re-run; without it
# the reported ROC-AUC values change from run to run.
estimator = RandomForestClassifier(random_state=42)

# Grid explored by the search. Fractional `min_samples_leaf` values are read by
# scikit-learn as a fraction of the training samples.
hyperparams = {'n_estimators': [300, 500],
               'max_depth': [3, 5, 7],
               'min_samples_leaf': [0.03, 0.05, 0.1],
               'max_features': ['sqrt']}

# Scaffold-based train/test split with a requested test fraction of 0.25,
# driven by the scaffold assigned to each molecule.
split_and_gs(X, y, estimator, hyperparams,
             splitting='scaffold', test_size=0.25,
             scaffold_series=scaffold_series)

No. of molecules in train set: 4154, with 225 actives.
No. of molecules in test set: 1685, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.757
- Train ROC-AUC: 0.874
- Test ROC-AUC: 0.669
- Best hyperparameters {'estimator__max_depth': 7, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 0.03, 'estimator__n_estimators': 300}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.683 	> Median ROC-AUC: 0.588
> Test best conf. ROC-AUC: 0.704 	> Median ROC-AUC: 0.641
------------------------------------------------------------
CPU times: user 3.9 s, sys: 141 ms, total: 4.04 s
Wall time: 54.8 s
