# Model Selection

In [10]:
import pandas as pd
import numpy as np
import glob, sys, os
sys.path.append('..')

In [11]:
from modules.plotting_metrics import PlotMetric
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white', context='talk', font_scale=0.8)

In [12]:
## Run the helper functions from the local CDK2 directory
# %run ./helper_1_load_data.ipynb

In [13]:
# Helper function to train and test
# %run ./helper_functions_S6.ipynb

In [14]:
# Load the merged results table and separate the target from the features.
# NOTE: pd.read_pickle can execute arbitrary code stored in the file, so only
# load pickles that come from a trusted source.
file_name = './df_DkSc_results_COCRYS_DEKOIS_DUD.pkl'
X_merged_dksc = pd.read_pickle(file_name)
# Keep the 'activity' column apart; later cells pass it around as the target `y`
y_true_merged = X_merged_dksc['activity']
# Drop the target column from X_merged_dksc so only the remaining columns are left
X_merged_dksc = X_merged_dksc.drop('activity', axis=1)
# Last expression of the cell: displays (n_rows, n_columns)
X_merged_dksc.shape

(5839, 136)

### Timeout Decorator

In [15]:
import signal
from functools import wraps

def timeout(n_seconds=300):
    '''Decorator factory that abandons a call once it has run for n_seconds.

    A SIGALRM handler is installed for the duration of the call. Without one,
    the default action of SIGALRM terminates the whole process (i.e. the
    notebook kernel) instead of interrupting just the decorated function.

    Parameters
    ----------
    n_seconds: int or float
        Time budget for one call. Fractional values are accepted.

    Returns
    -------
    A decorator. The decorated function returns whatever the original
    function returns, or None (after printing a message) when the budget
    is exhausted.

    Notes
    -----
    - Relies on SIGALRM, so it only works on Unix and only when the
      decorated function is called from the main thread.
    - Calls are not nestable: an inner decorated call replaces the timer of
      the outer one.
    '''
    class _CallTimedOut(TimeoutError):
        '''Raised by the alarm handler only, so that a TimeoutError raised
        by the decorated function itself is never swallowed here.'''

    def _raise_timeout(signum, frame):
        raise _CallTimedOut(f'time limit of {n_seconds} seconds reached')

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Install our handler and remember the previous one to restore it
            previous_handler = signal.signal(signal.SIGALRM, _raise_timeout)
            # Set alarm for n seconds (setitimer also accepts fractions)
            signal.setitimer(signal.ITIMER_REAL, n_seconds)
            try:
                # Call decorated func
                return func(*args, **kwargs)
            except _CallTimedOut as e:
                print(f'Execution finished after {n_seconds}:', e)
            finally:
                # Cancel Alarm
                signal.setitimer(signal.ITIMER_REAL, 0)
                # getsignal/signal report None for handlers not installed
                # from Python; fall back to the default action in that case
                if previous_handler is None:
                    previous_handler = signal.SIG_DFL
                signal.signal(signal.SIGALRM, previous_handler)
        return wrapper
    return decorator

## Scaffold Splitting

In [16]:
#*************************************************
# Functions to compute stratified scaffold splitting
#*************************************************
from scaffold_splitter import train_test_scaffold_split

In [17]:
# Load the precomputed dataframe containing the Generic Murcko Scaffolds.
# (Nothing is computed in this cell; the pickle must already exist.)
# NOTE: pd.read_pickle can execute arbitrary code stored in the file, so only
# load pickles that come from a trusted source.
file = './df_COCRYS_DUD_DEKOIS_Murcko_Scaffolds_SMILES.pkl'

df_scff_murcko = pd.read_pickle(file)

## Train/test on the same dataset 

### Learning Curves

In [18]:
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import learning_curve

In [19]:
def plot_learning_curves(estimator, X, y, title, ylim=[0.5,1], axes=None,
                         cv=3, train_sizes=np.linspace(0.1, 1.0, 10),
                         scoring='roc_auc', n_jobs=4):
    '''
    Plot estimator performance on the training and validation
    sets as a function of the training set size.

    Parameters
    ----------
    estimator: sklearn estimator object type
        Estimator handed to sklearn's learning_curve.
    X: array-like, shape (m_samples, n_features)
        Training array with m_samples and n_features.
    y: array-like, shape (m_samples)
        Target array relative to X with m labels.
    title: str
        Title of the plot.
    ylim: sequence of two numbers
        Lower and upper limit of the y axis.
    axes: matplotlib Axes or None
        A single Axes to draw on. When None a new 5x5 inch figure with one
        Axes is created.
    cv, train_sizes, scoring, n_jobs:
        Forwarded unchanged to sklearn's learning_curve.

    Returns
    -------
    None. The curves are drawn on `axes`.
    '''
    # Identity check: `axes == None` would go through the object's __eq__,
    # which is ambiguous for array-like inputs.
    if axes is None:
        _, axes = plt.subplots(1, 1, figsize=(5, 5))

    axes.set(title=title, ylim=ylim, xlabel='Training examples', ylabel=f'Metric: {scoring}')
    # Use learning_curve function from sklearn (fit times are not needed)
    train_sizes, train_scores, test_scores = \
       learning_curve(estimator, X, y, scoring=scoring,
                      cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    # Mean and spread over the CV folds, one value per training size
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    # Plot the learning curve: shaded band = mean +/- one standard deviation
    axes.grid()
    axes.fill_between(train_sizes, train_scores_mean - train_scores_std,
                        train_scores_mean + train_scores_std, alpha=0.1, color='r')
    axes.fill_between(train_sizes, test_scores_mean - test_scores_std,
                        test_scores_mean + test_scores_std, alpha=0.1, color='g')
    axes.plot(train_sizes, train_scores_mean, 'o-', color='r',
                label='Train score')
    axes.plot(train_sizes, test_scores_mean, 'o-', color='g',
                label='Cross-validation score')
    axes.legend(loc='lower right')
    

### Hyperparameter tuning: Grid Search

In [20]:
@timeout(360) #Finish execution after 6 mins
def run_grid_search(estimator, X_train, y_train, X_test, y_test, hyperparams,  cv_value=5,
                    scoring='roc_auc', splitting='random', std_scale=True):
    '''Tune `estimator` with GridSearchCV and print train/test ROC-AUC.

    Parameters
    ----------
    estimator: sklearn estimator
        Must expose predict_proba, which is used for the reported scores.
    X_train, y_train, X_test, y_test:
        Train and test features/labels. The labels must support `.shape`
        and `.sum()` (used for the printed summary).
    hyperparams: dict, or list/tuple of dicts
        Grid(s) keyed by the estimator's own parameter names; the
        'estimator__' pipeline prefix is added here. A list of dicts is
        forwarded as several independent grids.
    cv_value: int
        Passed to GridSearchCV as `cv`.
    scoring: str
        Passed to GridSearchCV as `scoring`. The printed labels always say
        ROC-AUC regardless of this value.
    splitting: str
        Accepted for symmetry with the callers; not used inside this function.
    std_scale: bool
        When True a StandardScaler step precedes the estimator.

    Returns
    -------
    None. Results are printed.
    '''
    # Format the hyperparams: prefix every key with the pipeline step name
    if isinstance(hyperparams, dict):
        hyperparams_dict = {'estimator__' + key: val for key, val in hyperparams.items()}
    else:
        hyperparams_dict = [{'estimator__' + key: val for key, val in grid.items()}
                            for grid in hyperparams]

    # Create the Pipe. Pipeline expects a list of (name, step) tuples, also
    # when there is a single step.
    steps = [('estimator', estimator)]
    if std_scale:
        steps.insert(0, ('scaler', StandardScaler()))
    pipe = Pipeline(steps)

    # Do grid search
    gs = GridSearchCV(estimator = pipe, param_grid = hyperparams_dict,
                     cv = cv_value, scoring = scoring, n_jobs = 6, refit = True)
                    # refit=True means to train the final model using the whole training set

    # Train the model
    gs.fit(X_train, y_train)

    # Predictions (made with the refitted best pipeline)
    y_train_predict = gs.predict_proba(X_train)
    y_test_predict = gs.predict_proba(X_test)

    # Print some values
    print(f'No. of molecules in train set: {y_train.shape[0]}, with {y_train.sum()} actives.')
    print(f'No. of molecules in test set: {y_test.shape[0]}, with {y_test.sum()} actives.')
    print('')
    print('*'*10, 'GRID SEARCH RESULTS', '*'*10)
    print('- Mean CV ROC-AUC score: {:.3f}'.format(gs.best_score_))
    # Column 1 of predict_proba is used as the score of the positive class
    print('- Train ROC-AUC: {:.3f}'.format(roc_auc_score(y_train, y_train_predict[:, 1])))
    print('- Test ROC-AUC: {:.3f}'.format(roc_auc_score(y_test, y_test_predict[:, 1])))
    print('- Best hyperparameters', gs.best_params_)
    print('**'*21)
    print('')

In [21]:
def run_cross_val_bias_var_tradeoff(estimator, X_train, y_train, X_test, y_test, cv_value=5, scoring='roc_auc',
                                   score_type='Dk_sc', pred_prob = True):
    '''Print the CV, train and test scores of `estimator` to compare them.

    The estimator is wrapped in a StandardScaler + estimator pipeline,
    cross-validated on the training set, then fitted on the whole training
    set and evaluated on both the training and the test set.

    Parameters
    ----------
    estimator: sklearn estimator
    X_train, y_train, X_test, y_test:
        Train and test features/labels. The labels must support `.shape`
        and `.sum()` (used for the printed summary).
    cv_value: int
        Passed to cross_val_score as `cv`.
    scoring: str
        Passed to cross_val_score as `scoring`.
    score_type: str
        Not used inside this function.
    pred_prob: bool
        When True the train/test ROC-AUC is computed from column 1 of
        predict_proba; when False from the output of predict.

    Returns
    -------
    None. Results are printed.
    '''
    # Print some values
    print(f'No. of molecules in train set: {y_train.shape[0]}, with {y_train.sum()} actives.')
    print(f'No. of molecules in test set: {y_test.shape[0]}, with {y_test.sum()} actives.')

    # Set the pipe: scaling followed by the estimator
    pipe = Pipeline([('scaler', StandardScaler()), ('estimator', estimator)])

    # Set the CV score
    roc_cv = cross_val_score(pipe, X_train, y_train, cv=cv_value, scoring=scoring, n_jobs= -1)

    pipe.fit(X_train, y_train)

    if pred_prob:
        # predict_proba is 2-D; column 1 is used as the positive-class score
        score_train = pipe.predict_proba(X_train)[:, 1]
        score_test = pipe.predict_proba(X_test)[:, 1]
    else:
        # predict is 1-D already; indexing it with [:, 1] would raise
        score_train = pipe.predict(X_train)
        score_test = pipe.predict(X_test)

    # CV Mean ROC-AUC
    print('CV={} ROC-AUC: {:.4f}'.format(cv_value, roc_cv.mean()))

    # Training ROC-AUC:
    print('Train ROC-AUC: {:.4f}'.format(roc_auc_score(y_train, score_train)))

    # Test ROC-AUC:
    print('Test ROC-AUC: {:.4f}'.format(roc_auc_score(y_test, score_test)))

### Function to report the Best Conformation's ROC-AUC for a given subset of samples

In [22]:
#************************************************************************
# Reports the best conformation's ROC-AUC value of a given subset X and y
#************************************************************************

def _get_best_conf_roc_auc(X_train, y_train, X_test, y_test, verbose=True):
    best_roc_auc_train = X_train.apply(
        lambda x: roc_auc_score(y_true= y_train, y_score= -x), axis=0)
    best_roc_auc_test = X_test.apply(
        lambda x: roc_auc_score(y_true= y_test, y_score= -x), axis=0)
    if verbose:
        print('--'*30)
        print("**** Best Conformation's ROC-AUC using docking scores ****")
        print(f"> Train best conf. ROC-AUC: {round(best_roc_auc_train.max(), 3)}" +
              f" \t> Median ROC-AUC: {round(best_roc_auc_train.median(), 3)}")
        print(f"> Test best conf. ROC-AUC: {round(best_roc_auc_test.max(), 3)}" +
              f" \t> Median ROC-AUC: {round(best_roc_auc_test.median(), 3)}")
        print('--'*30)
        
        
def split_and_gs(X, y, estimator, hyperparams, splitting='random',
                 test_size=0.25, scaffold_series=None, **kwargs):
    '''Split X/y into train and test sets, run the grid search and report
    the single-column ROC-AUC baseline on the same split.

    Parameters
    ----------
    X, y:
        Features and labels to split.
    estimator, hyperparams:
        Forwarded to run_grid_search.
    splitting: {'random', 'scaffold'}
        'random' uses sklearn's train_test_split, 'scaffold' uses
        train_test_scaffold_split. Both are stratified on y.
    test_size: float
        Forwarded to the chosen splitter.
    scaffold_series:
        Forwarded to train_test_scaffold_split; only used when
        splitting == 'scaffold'.
    **kwargs:
        Extra keyword arguments forwarded to run_grid_search.

    Raises
    ------
    ValueError
        If `splitting` is not one of the supported strategies.

    Notes
    -----
    No random seed is passed to the splitters here, so repeated calls can
    produce different splits.
    '''
    if splitting == 'scaffold':
        X_train, X_test, y_train, y_test = \
            train_test_scaffold_split(X, y, scaffold_series = scaffold_series,
                test_size=test_size, stratify=y)
    elif splitting == 'random':
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=test_size, stratify=y)
    else:
        # Fail early with a clear message instead of an unbound-variable
        # error further down
        raise ValueError(
            f"Unknown splitting strategy {splitting!r}; expected 'random' or 'scaffold'.")

    run_grid_search(estimator,
                    X_train, y_train, X_test, y_test,
                    hyperparams = hyperparams,  **kwargs)

    _get_best_conf_roc_auc(X_train, y_train, X_test, y_test)

##  Grid Search
### Hyperparameter Tuning
#### DEKOIS Library

In [14]:
# Library used for both training and testing in this section
library = 'DEKOIS'

# Train and test over DEKOIS: keep only the rows stored under the
# `library` index label
X = X_merged_dksc.loc[library]
y = y_true_merged.loc[library]
# Generic Murcko scaffold of the same rows, used by the scaffold splitter
scaffold_series = df_scff_murcko['scff_generic'].loc[library]

### GS: Linear SVM 
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [15]:
%%time
from sklearn.svm import SVC
# probability=True because run_grid_search scores the model with predict_proba
estimator = SVC(kernel = 'linear', probability=True)
# Four C candidates: 1e-15, 1e-12, 1e-9 and 1e-6
hyperparams = {'C': np.geomspace(1e-15, 1e-6, 4)}

# RANDOM Train test splitting (stratified on y; no seed is passed, so the
# split and the reported scores can change between runs)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.905
- Train ROC-AUC: 0.904
- Test ROC-AUC: 0.854
- Best hyperparameters {'estimator__C': 1e-12}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.906 	> Median ROC-AUC: 0.839
> Test best conf. ROC-AUC: 0.907 	> Median ROC-AUC: 0.802
------------------------------------------------------------
CPU times: user 726 ms, sys: 349 ms, total: 1.07 s
Wall time: 2.7 s


In [16]:
%%time
# SVC is already in scope from the import in the random-split cell
# probability=True because run_grid_search scores the model with predict_proba
estimator = SVC(kernel = 'linear', probability=True)
# Four C candidates: 1e-15, 1e-12, 1e-9 and 1e-6
hyperparams = {'C': np.geomspace(1e-15, 1e-6, 4)}

# SCAFFOLD Train test splitting: the split is delegated to
# train_test_scaffold_split, driven by the scaffolds in scaffold_series
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.858
- Train ROC-AUC: 0.867
- Test ROC-AUC: 0.956
- Best hyperparameters {'estimator__C': 1e-12}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 707 ms, sys: 88.6 ms, total: 796 ms
Wall time: 956 ms


### GS: Radial Basis Function SVM
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [17]:
%%time
# probability=True because run_grid_search scores the model with predict_proba
estimator = SVC(kernel = 'rbf', probability=True)

# 4 x 4 grid: C in {1e-8, 1e-6, 1e-4, 1e-2}, gamma in {1e-6, 1e-4, 1e-2, 1}
hyperparams = {'C': np.geomspace(1e-8, 1e-2, 4), 
               'gamma': np.geomspace(1e-6, 1e0, 4)}

# RANDOM Train test splitting (stratified on y; no seed is passed, so the
# split and the reported scores can change between runs)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.912
- Train ROC-AUC: 0.916
- Test ROC-AUC: 0.818
- Best hyperparameters {'estimator__C': 1e-08, 'estimator__gamma': 1e-06}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.909 	> Median ROC-AUC: 0.849
> Test best conf. ROC-AUC: 0.882 	> Median ROC-AUC: 0.775
------------------------------------------------------------
CPU times: user 1.01 s, sys: 182 ms, total: 1.19 s
Wall time: 2.38 s


In [18]:
%%time
# probability=True because run_grid_search scores the model with predict_proba
estimator = SVC(kernel = 'rbf', probability=True)

# 4 x 4 grid: C in {1e-8, 1e-6, 1e-4, 1e-2}, gamma in {1e-6, 1e-4, 1e-2, 1}
hyperparams = {'C': np.geomspace(1e-8, 1e-2, 4), 
               'gamma': np.geomspace(1e-6, 1e0, 4)}

# SCAFFOLD Train test splitting: the split is delegated to
# train_test_scaffold_split, driven by the scaffolds in scaffold_series
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.853
- Train ROC-AUC: 0.867
- Test ROC-AUC: 0.956
- Best hyperparameters {'estimator__C': 1e-08, 'estimator__gamma': 1e-06}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 979 ms, sys: 176 ms, total: 1.16 s
Wall time: 2.38 s


### GS: Logistic Regression
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [19]:
%%time
from sklearn.linear_model import LogisticRegression
estimator = LogisticRegression()

# NOTE(review): the grid is a full cross product, so it contains
# penalty='l1' together with solver='lbfgs', a pair that scikit-learn's
# LogisticRegression does not support. Those candidates cannot be fitted;
# confirm how the installed version reports them (error vs. NaN score).
hyperparams = {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1], 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# RANDOM Train test splitting (stratified on y; no seed is passed, so the
# split and the reported scores can change between runs)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.900
- Train ROC-AUC: 0.917
- Test ROC-AUC: 0.873
- Best hyperparameters {'estimator__C': 0.001, 'estimator__penalty': 'l2', 'estimator__solver': 'liblinear'}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.898 	> Median ROC-AUC: 0.838
> Test best conf. ROC-AUC: 0.908 	> Median ROC-AUC: 0.811
------------------------------------------------------------
CPU times: user 1.23 s, sys: 547 ms, total: 1.78 s
Wall time: 1.58 s


In [20]:
%%time
# LogisticRegression is already in scope from the random-split cell
estimator = LogisticRegression()

# NOTE(review): the grid is a full cross product, so it contains
# penalty='l1' together with solver='lbfgs', a pair that scikit-learn's
# LogisticRegression does not support. Those candidates cannot be fitted;
# confirm how the installed version reports them (error vs. NaN score).
hyperparams = {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1], 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# SCAFFOLD Train test splitting: the split is delegated to
# train_test_scaffold_split, driven by the scaffolds in scaffold_series
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.862
- Train ROC-AUC: 0.868
- Test ROC-AUC: 0.958
- Best hyperparameters {'estimator__C': 0.0001, 'estimator__penalty': 'l2', 'estimator__solver': 'liblinear'}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 1.26 s, sys: 563 ms, total: 1.82 s
Wall time: 1.5 s


### GS: K-Neighbors Classifier
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [21]:
%%time
from sklearn.neighbors import KNeighborsClassifier 
estimator = KNeighborsClassifier()

# 4 x 2 grid; p is the power of the Minkowski metric
# (1 = Manhattan distance, 2 = Euclidean distance)
hyperparams = {'n_neighbors': [25, 55, 125, 225], 
               'p': [1, 2]
             }

# RANDOM Train test splitting (stratified on y; no seed is passed, so the
# split and the reported scores can change between runs)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.902
- Train ROC-AUC: 0.932
- Test ROC-AUC: 0.830
- Best hyperparameters {'estimator__n_neighbors': 55, 'estimator__p': 2}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.907 	> Median ROC-AUC: 0.85
> Test best conf. ROC-AUC: 0.916 	> Median ROC-AUC: 0.787
------------------------------------------------------------
CPU times: user 1.04 s, sys: 108 ms, total: 1.15 s
Wall time: 1.74 s


In [22]:
%%time
# Redundant with the import in the random-split cell; harmless on re-run
from sklearn.neighbors import KNeighborsClassifier 
estimator = KNeighborsClassifier()

# 4 x 2 grid; p is the power of the Minkowski metric
# (1 = Manhattan distance, 2 = Euclidean distance)
hyperparams = {'n_neighbors': [25, 55, 125, 225], 
               'p': [1, 2]
             }

# SCAFFOLD Train test splitting: the split is delegated to
# train_test_scaffold_split, driven by the scaffolds in scaffold_series
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.838
- Train ROC-AUC: 0.862
- Test ROC-AUC: 0.956
- Best hyperparameters {'estimator__n_neighbors': 225, 'estimator__p': 2}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 947 ms, sys: 138 ms, total: 1.09 s
Wall time: 1.7 s


### GS: Decision Tree Classifier
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [23]:
%%time
from sklearn.tree import DecisionTreeClassifier

estimator = DecisionTreeClassifier(splitter='best')
# 2 x 3 x 3 x 3 grid. The float values of min_samples_split are read by
# scikit-learn as a fraction of the training samples, not as a count.
hyperparams = {'criterion': ['gini', 'entropy'], 
               'max_depth': [2, 3, 5],
               'min_samples_split': [0.2, 0.25, 0.3],
               'max_features': [None, 'sqrt', 'log2']}

# RANDOM Train test splitting (stratified on y; no seed is passed to the
# split or to the tree, so the reported scores can change between runs)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.802
- Train ROC-AUC: 0.854
- Test ROC-AUC: 0.939
- Best hyperparameters {'estimator__criterion': 'entropy', 'estimator__max_depth': 2, 'estimator__max_features': 'sqrt', 'estimator__min_samples_split': 0.3}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.883 	> Median ROC-AUC: 0.808
> Test best conf. ROC-AUC: 0.974 	> Median ROC-AUC: 0.9
------------------------------------------------------------
CPU times: user 1.61 s, sys: 356 ms, total: 1.96 s
Wall time: 2.56 s


In [24]:
%%time
# Redundant with the import in the random-split cell; harmless on re-run
from sklearn.tree import DecisionTreeClassifier

estimator = DecisionTreeClassifier(splitter='best')
# 2 x 3 x 3 x 3 grid. The float values of min_samples_split are read by
# scikit-learn as a fraction of the training samples, not as a count.
hyperparams = {'criterion': ['gini', 'entropy'], 
               'max_depth': [2, 3, 5],
               'min_samples_split': [0.2, 0.25, 0.3],
               'max_features': [None, 'sqrt', 'log2']}

# SCAFFOLD Train test splitting: the split is delegated to
# train_test_scaffold_split, driven by the scaffolds in scaffold_series
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.808
- Train ROC-AUC: 0.849
- Test ROC-AUC: 0.914
- Best hyperparameters {'estimator__criterion': 'entropy', 'estimator__max_depth': 3, 'estimator__max_features': 'log2', 'estimator__min_samples_split': 0.3}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 1.53 s, sys: 381 ms, total: 1.91 s
Wall time: 2.58 s


### GS: Bagging Classifier (k-NN as base estimator) 
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [25]:
%%time
from sklearn.ensemble import BaggingClassifier

# KNeighborsClassifier comes from the import in the k-NN section above
knn = KNeighborsClassifier(n_neighbors=125, p=1, 
                           weights='distance')

# NOTE(review): newer scikit-learn releases renamed `base_estimator` to
# `estimator`; confirm against the installed version before running.
estimator = BaggingClassifier(base_estimator=knn, 
                              n_jobs=6, oob_score=True)

hyperparams = {'n_estimators': [300]}

# The grid search call below is disabled, so this cell only builds the
# estimator and the grid (hence the millisecond timing in its output).
# RANDOM Train test splitting
# split_and_gs(X, y, estimator, hyperparams, 
#              splitting='random', test_size=0.25, 
#              scaffold_series=None)

CPU times: user 8.54 ms, sys: 12.2 ms, total: 20.8 ms
Wall time: 17.7 ms


In [26]:
%%time
# BaggingClassifier is already in scope from the previous cell

# KNeighborsClassifier comes from the import in the k-NN section above
knn = KNeighborsClassifier(n_neighbors=125, p=1, 
                           weights='distance')

# NOTE(review): newer scikit-learn releases renamed `base_estimator` to
# `estimator`; confirm against the installed version before running.
estimator = BaggingClassifier(base_estimator=knn, 
                              n_jobs=6, oob_score=True)

hyperparams = {'n_estimators': [300]}

# The grid search call below is disabled, so this cell only builds the
# estimator and the grid (hence the microsecond timing in its output).
# SCAFFOLD Train test splitting
# split_and_gs(X, y, estimator, hyperparams, 
#              splitting='scaffold', test_size=0.25, 
#              scaffold_series=scaffold_series)

CPU times: user 43 µs, sys: 14 µs, total: 57 µs
Wall time: 65.6 µs


### GS: Random Forest
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [27]:
%%time
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier()

# 2 x 2 x 2 x 2 grid. The float values of min_samples_leaf are read by
# scikit-learn as a fraction of the training samples, not as a count.
hyperparams = {'n_estimators': [300, 500],
             'max_depth': [2,  5],
             'min_samples_leaf': [0.1, 0.2],
             'max_features': ['log2', 'sqrt']
            }

# RANDOM Train test splitting (stratified on y; no seed is passed to the
# split or to the forest, so the reported scores can change between runs)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.879
- Train ROC-AUC: 0.921
- Test ROC-AUC: 0.875
- Best hyperparameters {'estimator__max_depth': 2, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 0.1, 'estimator__n_estimators': 500}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.894 	> Median ROC-AUC: 0.826
> Test best conf. ROC-AUC: 0.942 	> Median ROC-AUC: 0.856
------------------------------------------------------------
CPU times: user 2.51 s, sys: 245 ms, total: 2.75 s
Wall time: 23.8 s


In [28]:
%%time
# Redundant with the import in the random-split cell; harmless on re-run
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier()

# 2 x 2 x 2 x 2 grid. The float values of min_samples_leaf are read by
# scikit-learn as a fraction of the training samples, not as a count.
hyperparams = {'n_estimators': [300, 500],
             'max_depth': [2,  5],
             'min_samples_leaf': [0.1, 0.2],
             'max_features': ['log2', 'sqrt']
            }

# SCAFFOLD Train test splitting: the split is delegated to
# train_test_scaffold_split, driven by the scaffolds in scaffold_series
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.854
- Train ROC-AUC: 0.901
- Test ROC-AUC: 0.960
- Best hyperparameters {'estimator__max_depth': 2, 'estimator__max_features': 'log2', 'estimator__min_samples_leaf': 0.1, 'estimator__n_estimators': 300}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 1.97 s, sys: 247 ms, total: 2.22 s
Wall time: 23.5 s


##  Grid Search
### Hyperparameter Tuning
#### Train and Test sets from different libraries 

In [29]:
# Train on one library, test on another (e.g. train DEKOIS, test DUD)

def split_by_library_and_gs(X, y, lib_train_name, lib_test_name,
                            estimator=None, hyperparams=None, **kwargs):
    '''Train-Test "split" by library, and run Grid Search. Splits the main dataframe by library
    using different molecular libraries for Training and Testing.

    Parameters
    ----------
    X, y:
        Features and labels indexed so that `.loc[library_name]` selects the
        rows of one library.
    lib_train_name, lib_test_name:
        Index labels of the libraries used for training and for testing.
    estimator, hyperparams: optional
        Estimator and grid forwarded to run_grid_search. When left as None
        the notebook-level variables `estimator` / `hyperparams` are used,
        which keeps existing four-argument calls working. Passing them
        explicitly avoids depending on which cell ran last.
    **kwargs:
        Extra keyword arguments forwarded to run_grid_search.

    Raises
    ------
    NameError
        If a value is neither passed nor defined at notebook level.
    '''
    # Resolve values that were not passed explicitly from the module globals
    module_scope = globals()
    resolved = {'estimator': estimator, 'hyperparams': hyperparams}
    for name, value in resolved.items():
        if value is None:
            if name not in module_scope:
                raise NameError(
                    f"'{name}' was not passed and is not defined at notebook level.")
            resolved[name] = module_scope[name]

    X_train = X.loc[lib_train_name]
    y_train = y.loc[lib_train_name]

    X_test = X.loc[lib_test_name]
    y_test = y.loc[lib_test_name]

    run_grid_search(resolved['estimator'],
                        X_train, y_train, X_test, y_test,
                        hyperparams = resolved['hyperparams'], **kwargs)

    _get_best_conf_roc_auc(X_train, y_train, X_test, y_test)

In [30]:
# **********************************************
# Train and Test sets from different libraries
# **********************************************
# Rebind X and y to the full merged table; the previous section had
# narrowed them to the rows of a single library.
X = X_merged_dksc
y = y_true_merged

### GS: Linear SVM 
#### DEKOIS and DUD 

In [31]:
%%time
from sklearn.svm import SVC
# split_by_library_and_gs reads `estimator` and `hyperparams` from the
# notebook namespace, so both must be assigned before the call below.
# probability=True because run_grid_search scores the model with predict_proba
estimator = SVC(kernel = 'linear', probability=True)
# Four C candidates: 1e-15, 1e-12, 1e-9 and 1e-6
hyperparams = {'C': np.geomspace(1e-15, 1e-6, 4)}

# Train DEKOIS, test DUD
train_lib = 'DEKOIS'
test_lib = 'DUD'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 1219, with 39 actives.
No. of molecules in test set: 4495, with 136 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.884
- Train ROC-AUC: 0.891
- Test ROC-AUC: 0.595
- Best hyperparameters {'estimator__C': 1e-12}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
> Test best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
------------------------------------------------------------
CPU times: user 864 ms, sys: 21.7 ms, total: 886 ms
Wall time: 1.44 s


In [32]:
%%time
# SVC is already in scope from the import in the previous cell
# split_by_library_and_gs reads `estimator` and `hyperparams` from the
# notebook namespace, so both must be assigned before the call below.
estimator = SVC(kernel = 'linear', probability=True)
# Four C candidates: 1e-15, 1e-12, 1e-9 and 1e-6
hyperparams = {'C': np.geomspace(1e-15, 1e-6, 4)}

# Train DUD, test DEKOIS
train_lib = 'DUD'
test_lib = 'DEKOIS'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 4495, with 136 actives.
No. of molecules in test set: 1219, with 39 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.816
- Train ROC-AUC: 0.815
- Test ROC-AUC: 0.856
- Best hyperparameters {'estimator__C': 1e-15}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
> Test best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
------------------------------------------------------------
CPU times: user 2.34 s, sys: 106 ms, total: 2.45 s
Wall time: 13.7 s


### GS: Radial Basis Function SVM
#### DEKOIS and DUD

In [33]:
%%time
# split_by_library_and_gs reads `estimator` and `hyperparams` from the
# notebook namespace, so both must be assigned before the call below.
estimator = SVC(kernel = 'rbf', probability=True)

# 4 x 4 grid: C in {1e-8, 1e-6, 1e-4, 1e-2}, gamma in {1e-6, 1e-4, 1e-2, 1}
hyperparams = {'C': np.geomspace(1e-8, 1e-2, 4), 
               'gamma': np.geomspace(1e-6, 1e0, 4)}

# Train DEKOIS, test DUD
train_lib = 'DEKOIS'
test_lib = 'DUD'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 1219, with 39 actives.
No. of molecules in test set: 4495, with 136 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.884
- Train ROC-AUC: 0.891
- Test ROC-AUC: 0.595
- Best hyperparameters {'estimator__C': 1e-08, 'estimator__gamma': 1e-06}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
> Test best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
------------------------------------------------------------
CPU times: user 1.37 s, sys: 85.7 ms, total: 1.45 s
Wall time: 4.59 s


In [34]:
%%time
# split_by_library_and_gs reads `estimator` and `hyperparams` from the
# notebook namespace, so both must be assigned before the call below.
estimator = SVC(kernel = 'rbf', probability=True)

# 4 x 4 grid: C in {1e-8, 1e-6, 1e-4, 1e-2}, gamma in {1e-6, 1e-4, 1e-2, 1}
hyperparams = {'C': np.geomspace(1e-8, 1e-2, 4), 
               'gamma': np.geomspace(1e-6, 1e0, 4)}

# Train DUD, test DEKOIS
train_lib = 'DUD'
test_lib = 'DEKOIS'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 4495, with 136 actives.
No. of molecules in test set: 1219, with 39 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.887
- Train ROC-AUC: 0.984
- Test ROC-AUC: 0.601
- Best hyperparameters {'estimator__C': 0.01, 'estimator__gamma': 0.01}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
> Test best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
------------------------------------------------------------
CPU times: user 5.98 s, sys: 249 ms, total: 6.23 s
Wall time: 1min 20s


### GS: Logistic Regression
#### DEKOIS and DUD

In [35]:
%%time
from sklearn.linear_model import LogisticRegression
# split_by_library_and_gs reads `estimator` and `hyperparams` from the
# notebook namespace, so both must be assigned before the call below.
estimator = LogisticRegression()

# C candidates: 1e-4, 1e-1, 1e2 and 1e5.
# NOTE(review): the grid is a full cross product, so it contains
# penalty='l1' together with solver='lbfgs', a pair that scikit-learn's
# LogisticRegression does not support. Those candidates cannot be fitted;
# confirm how the installed version reports them (error vs. NaN score).
hyperparams = {'C': np.geomspace(1e-4, 1e5, 4),
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# Train DEKOIS, test DUD
train_lib = 'DEKOIS'
test_lib = 'DUD'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 1219, with 39 actives.
No. of molecules in test set: 4495, with 136 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.886
- Train ROC-AUC: 0.891
- Test ROC-AUC: 0.588
- Best hyperparameters {'estimator__C': 0.0001, 'estimator__penalty': 'l2', 'estimator__solver': 'liblinear'}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
> Test best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
------------------------------------------------------------
CPU times: user 1.46 s, sys: 444 ms, total: 1.91 s
Wall time: 5.07 s


In [36]:
%%time
# LogisticRegression is already in scope from the previous cell
# split_by_library_and_gs reads `estimator` and `hyperparams` from the
# notebook namespace, so both must be assigned before the call below.
estimator = LogisticRegression()

# C candidates: 1e-4, 1e-1, 1e2 and 1e5.
# NOTE(review): the grid is a full cross product, so it contains
# penalty='l1' together with solver='lbfgs', a pair that scikit-learn's
# LogisticRegression does not support. Those candidates cannot be fitted;
# confirm how the installed version reports them (error vs. NaN score).
hyperparams = {'C': np.geomspace(1e-4, 1e5, 4), 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# Train DUD, test DEKOIS
train_lib = 'DUD'
test_lib = 'DEKOIS'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 4495, with 136 actives.
No. of molecules in test set: 1219, with 39 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.841
- Train ROC-AUC: 0.921
- Test ROC-AUC: 0.722
- Best hyperparameters {'estimator__C': 0.1, 'estimator__penalty': 'l2', 'estimator__solver': 'liblinear'}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
> Test best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
------------------------------------------------------------
CPU times: user 1.81 s, sys: 534 ms, total: 2.35 s
Wall time: 9.21 s


### GS: Random Forest
#### DEKOIS and DUD

In [37]:
%%time
from sklearn.ensemble import RandomForestClassifier

# split_by_library_and_gs reads `estimator` and `hyperparams` from the
# notebook namespace, so both must be assigned before the call below.
estimator = RandomForestClassifier()

# 2 x 3 x 2 x 2 x 1 grid. The float values of min_samples_leaf and
# min_samples_split are read by scikit-learn as fractions of the training
# samples, not as counts.
hyperparams = {'n_estimators': [300, 500],
             'max_depth': [3,  5, 7],
             'min_samples_leaf': [0.03, 0.05],
               'min_samples_split': [0.2,  0.3],
             'max_features': ['sqrt']
            }

# Train DEKOIS, test DUD
train_lib = 'DEKOIS'
test_lib = 'DUD'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 1219, with 39 actives.
No. of molecules in test set: 4495, with 136 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.871
- Train ROC-AUC: 0.936
- Test ROC-AUC: 0.565
- Best hyperparameters {'estimator__max_depth': 5, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 0.03, 'estimator__min_samples_split': 0.2, 'estimator__n_estimators': 300}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
> Test best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
------------------------------------------------------------
CPU times: user 2.61 s, sys: 112 ms, total: 2.73 s
Wall time: 48.6 s


In [38]:
%%time
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier()

hyperparams = {'n_estimators': [300, 500],
             'max_depth': [3,  5, 7],
             'min_samples_leaf': [0.03, 0.05],
               'min_samples_split': [0.2,  0.3],
             'max_features': ['sqrt']
            }

# Train DEKOIS, test DUD
train_lib = 'DUD'
test_lib = 'DEKOIS'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 4495, with 136 actives.
No. of molecules in test set: 1219, with 39 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.825
- Train ROC-AUC: 0.880
- Test ROC-AUC: 0.636
- Best hyperparameters {'estimator__max_depth': 7, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 0.03, 'estimator__min_samples_split': 0.2, 'estimator__n_estimators': 500}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
> Test best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
------------------------------------------------------------
CPU times: user 6.21 s, sys: 181 ms, total: 6.39 s
Wall time: 1min 27s


***
##  Grid Search
### Hyperparameter Tuning
#### Merged Libraries 

In [23]:
# Train and test over the merged libraries (X_merged_dksc / y_true_merged)
X = X_merged_dksc
y = y_true_merged
# Generic Murcko scaffolds, passed to the scaffold-based train/test splits below
scaffold_series = df_scff_murcko['scff_generic']

### GS: Linear SVM
#### Merged libraries - Random and Stratified Scaffold Splitting

In [40]:
%%time
from sklearn.svm import SVC
estimator = SVC(kernel = 'linear', probability=True)
hyperparams = {'C': np.geomspace(1e-15, 1e-9, 3)}

# RANDOM Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 4379, with 225 actives.
No. of molecules in test set: 1460, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.742
- Train ROC-AUC: 0.699
- Test ROC-AUC: 0.624
- Best hyperparameters {'estimator__C': 1e-12}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.686 	> Median ROC-AUC: 0.605
> Test best conf. ROC-AUC: 0.691 	> Median ROC-AUC: 0.608
------------------------------------------------------------
CPU times: user 2.43 s, sys: 92.8 ms, total: 2.53 s
Wall time: 11.5 s


In [41]:
%%time
# from sklearn.svm import SVC
estimator = SVC(kernel = 'linear', probability=True)
hyperparams = {'C': np.geomspace(1e-15, 1e-9, 3)}

# SCAFFOLD Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 4154, with 225 actives.
No. of molecules in test set: 1685, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.735
- Train ROC-AUC: 0.742
- Test ROC-AUC: 0.690
- Best hyperparameters {'estimator__C': 1e-12}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.683 	> Median ROC-AUC: 0.588
> Test best conf. ROC-AUC: 0.704 	> Median ROC-AUC: 0.641
------------------------------------------------------------
CPU times: user 3.62 s, sys: 113 ms, total: 3.74 s
Wall time: 12.1 s


### GS: Radial Basis Function SVM
#### Merged libraries - Random and Stratified Scaffold Splitting

In [30]:
import numpy as np

# Scratch preview of log-spaced candidate grids; only the value of the last
# expression is echoed as the cell output.
np.geomspace(1.0, 1e4, num=4)
np.geomspace(1e-4, 1.0, num=3)

array([1.e-04, 1.e-02, 1.e+00])

In [31]:
%%time
from sklearn.svm import SVC
estimator = SVC(kernel = 'rbf', probability=True)

hyperparams = {'C': np.geomspace(1e0, 1e2, 3), 
               'gamma': np.geomspace(1e-4, 1e0, 3)}

# RANDOM Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 4379, with 225 actives.
No. of molecules in test set: 1460, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.867
- Train ROC-AUC: 0.999
- Test ROC-AUC: 0.873
- Best hyperparameters {'estimator__C': 10.0, 'estimator__gamma': 0.01}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.693 	> Median ROC-AUC: 0.612
> Test best conf. ROC-AUC: 0.711 	> Median ROC-AUC: 0.571
------------------------------------------------------------
CPU times: user 7.15 s, sys: 0 ns, total: 7.15 s
Wall time: 2min 40s


In [32]:
%%time
estimator = SVC(kernel = 'rbf', probability=True)

hyperparams = {'C': np.geomspace(1e0, 1e2, 3), 
               'gamma': np.geomspace(1e-4, 1e0, 3)}

# SCAFFOLD Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 4154, with 225 actives.
No. of molecules in test set: 1685, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.845
- Train ROC-AUC: 1.000
- Test ROC-AUC: 0.745
- Best hyperparameters {'estimator__C': 10.0, 'estimator__gamma': 0.01}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.683 	> Median ROC-AUC: 0.588
> Test best conf. ROC-AUC: 0.704 	> Median ROC-AUC: 0.641
------------------------------------------------------------
CPU times: user 6.23 s, sys: 0 ns, total: 6.23 s
Wall time: 2min 23s


### GS: Logistic Regression
#### Merged libraries - Random and Stratified Scaffold Splitting

In [44]:
%%time
from sklearn.linear_model import LogisticRegression
estimator = LogisticRegression()

hyperparams = {'C': np.geomspace(1e-4, 1e4, 3), 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# RANDOM Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 4379, with 225 actives.
No. of molecules in test set: 1460, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.799
- Train ROC-AUC: 0.883
- Test ROC-AUC: 0.870
- Best hyperparameters {'estimator__C': 1.0, 'estimator__penalty': 'l1', 'estimator__solver': 'liblinear'}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.674 	> Median ROC-AUC: 0.605
> Test best conf. ROC-AUC: 0.708 	> Median ROC-AUC: 0.604
------------------------------------------------------------
CPU times: user 1.57 s, sys: 553 ms, total: 2.13 s
Wall time: 5.1 s


In [45]:
%%time
# from sklearn.linear_model import LogisticRegression
estimator = LogisticRegression()

hyperparams = {'C': np.geomspace(1e-4, 1e5, 4), 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# SCAFFOLD Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 4154, with 225 actives.
No. of molecules in test set: 1685, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.823
- Train ROC-AUC: 0.903
- Test ROC-AUC: 0.741
- Best hyperparameters {'estimator__C': 0.1, 'estimator__penalty': 'l2', 'estimator__solver': 'liblinear'}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.683 	> Median ROC-AUC: 0.588
> Test best conf. ROC-AUC: 0.704 	> Median ROC-AUC: 0.641
------------------------------------------------------------
CPU times: user 1.82 s, sys: 560 ms, total: 2.38 s
Wall time: 6.25 s


### GS: Decision Tree Classifier
#### Merged libraries - Random and Stratified Scaffold Splitting

In [46]:
%%time
from sklearn.tree import DecisionTreeClassifier

estimator = DecisionTreeClassifier(splitter='best')
hyperparams = {'criterion': ['gini', 'entropy'], 
               'max_depth': [2, 3, 5],
               'min_samples_split': [0.2,  0.3],
               'min_samples_leaf': [0.02, 0.05, 0.1],
               'max_features': [None, 'sqrt', 'log2']}

# RANDOM Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 4379, with 225 actives.
No. of molecules in test set: 1460, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.743
- Train ROC-AUC: 0.810
- Test ROC-AUC: 0.631
- Best hyperparameters {'estimator__criterion': 'gini', 'estimator__max_depth': 5, 'estimator__max_features': None, 'estimator__min_samples_leaf': 0.05, 'estimator__min_samples_split': 0.2}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.692 	> Median ROC-AUC: 0.612
> Test best conf. ROC-AUC: 0.651 	> Median ROC-AUC: 0.575
------------------------------------------------------------
CPU times: user 5.31 s, sys: 526 ms, total: 5.84 s
Wall time: 10.8 s


In [47]:
%%time
from sklearn.tree import DecisionTreeClassifier

estimator = DecisionTreeClassifier(splitter='best')
hyperparams = {'criterion': ['gini', 'entropy'], 
               'max_depth': [2, 3, 5],
               'min_samples_split': [0.2,  0.3],
               'min_samples_leaf': [0.02, 0.05, 0.1],
               'max_features': [None, 'sqrt', 'log2']}

# SCAFFOLD Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 4154, with 225 actives.
No. of molecules in test set: 1685, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.718
- Train ROC-AUC: 0.762
- Test ROC-AUC: 0.613
- Best hyperparameters {'estimator__criterion': 'entropy', 'estimator__max_depth': 5, 'estimator__max_features': 'log2', 'estimator__min_samples_leaf': 0.02, 'estimator__min_samples_split': 0.3}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.683 	> Median ROC-AUC: 0.588
> Test best conf. ROC-AUC: 0.704 	> Median ROC-AUC: 0.641
------------------------------------------------------------
CPU times: user 5.54 s, sys: 625 ms, total: 6.17 s
Wall time: 10.3 s


### GS: Bagging Classifier (k-NN as base estimator)
#### Merged libraries - Random and Stratified Scaffold Splitting

In [48]:
%%time
from sklearn.ensemble import BaggingClassifier

knn = KNeighborsClassifier(n_neighbors=125, p=1, 
                           weights='distance')

estimator = BaggingClassifier(base_estimator=knn, 
                              n_jobs=3, oob_score=True)

hyperparams = {'n_estimators': [300]}

# RANDOM Train test splitting
# split_and_gs(X, y, estimator, hyperparams, 
#              splitting='random', test_size=0.25, 
#              scaffold_series=None)

CPU times: user 57 µs, sys: 7 µs, total: 64 µs
Wall time: 76.5 µs


In [49]:
%%time
# from sklearn.ensemble import BaggingClassifier

knn = KNeighborsClassifier(n_neighbors=125, p=1, 
                           weights='distance')

estimator = BaggingClassifier(base_estimator=knn, 
                              n_jobs=3, oob_score=True)

hyperparams = {'n_estimators': [300]}

# SCAFFOLD Train test splitting
# split_and_gs(X, y, estimator, hyperparams, 
#              splitting='scaffold', test_size=0.25, 
#              scaffold_series=scaffold_series)

CPU times: user 60 µs, sys: 7 µs, total: 67 µs
Wall time: 76.3 µs


### GS: Random Forest
#### Merged libraries - Random and Stratified Scaffold Splitting

In [50]:
%%time
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier()

hyperparams = {'n_estimators': [300, 400],
               'max_depth': [2, 3],
               'min_samples_split': [0.1,  0.3],
               'min_samples_leaf': [0.02, 0.05],
               'max_features': ['sqrt']
            }

# RANDOM Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 4379, with 225 actives.
No. of molecules in test set: 1460, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.733
- Train ROC-AUC: 0.772
- Test ROC-AUC: 0.708
- Best hyperparameters {'estimator__max_depth': 3, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 0.02, 'estimator__min_samples_split': 0.1, 'estimator__n_estimators': 300}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.686 	> Median ROC-AUC: 0.612
> Test best conf. ROC-AUC: 0.663 	> Median ROC-AUC: 0.583
------------------------------------------------------------
CPU times: user 3.01 s, sys: 125 ms, total: 3.13 s
Wall time: 42 s


In [38]:
%%time
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier()

hyperparams = {'n_estimators': [300, 400],
               'max_depth': [2, 3],
               'min_samples_split': [0.1,  0.3],
               'min_samples_leaf': [0.02, 0.05],
               'max_features': ['sqrt']
            }

# SCAFFOLD Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 4154, with 225 actives.
No. of molecules in test set: 1685, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.733
- Train ROC-AUC: 0.805
- Test ROC-AUC: 0.642
- Best hyperparameters {'estimator__max_depth': 3, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 0.02, 'estimator__min_samples_split': 0.1, 'estimator__n_estimators': 400}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.683 	> Median ROC-AUC: 0.588
> Test best conf. ROC-AUC: 0.704 	> Median ROC-AUC: 0.641
------------------------------------------------------------
CPU times: user 3.96 s, sys: 362 ms, total: 4.33 s
Wall time: 40.7 s
