# Model Selection

In [1]:
import pandas as pd
import numpy as np
import glob, sys, os
sys.path.append('..')

In [2]:
from modules.plotting_metrics import PlotMetric
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white', context='talk', font_scale=0.8)

In [3]:
## Run the helper functions from the local CDK2 directory
# %run ./helper_1_load_data.ipynb

In [4]:
# Helper function to train and test
# %run ./helper_functions_S6.ipynb

In [5]:
# Load the merged table from a pickle file.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling —
# only load files that come from a trusted source.
file_name = './df_DkSc_results_COCRYS_DEKOIS_DUD.pkl'
X_merged_dksc = pd.read_pickle(file_name)
# Extract the 'activity' column; it is used as the target (y) in later cells
y_true_merged = X_merged_dksc['activity']
# Drop the 'activity' column so X_merged_dksc keeps only the remaining columns
X_merged_dksc = X_merged_dksc.drop('activity', axis=1)
# Display the (rows, columns) shape of the resulting table
X_merged_dksc.shape

(5839, 136)

### Timeout Decorator

In [6]:
import signal
from functools import wraps

def timeout(n_seconds=300):
    '''Abort the decorated function if it runs longer than `n_seconds`.

    A SIGALRM handler that raises `TimeoutError` is installed for the
    duration of each call, so an expired timer surfaces as an ordinary
    exception instead of triggering the default SIGALRM action (which
    terminates the process).

    Parameters
    ----------
    n_seconds: int or float
        Maximum running time in seconds. A value of 0 disables the timer.

    Returns
    -------
    decorator: callable
        Decorator that wraps a function with the time limit.

    Raises
    ------
    TimeoutError
        Raised inside the decorated call when the time limit is exceeded.

    Notes
    -----
    Relies on `signal.SIGALRM`, so it only works on Unix and only when the
    decorated function is called from the main thread.
    '''
    def _raise_timeout(signum, frame):
        raise TimeoutError(f'Execution exceeded {n_seconds} seconds')

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Install our handler and remember the one that was active before
            previous_handler = signal.signal(signal.SIGALRM, _raise_timeout)
            # setitimer (unlike signal.alarm) also accepts fractional seconds
            signal.setitimer(signal.ITIMER_REAL, n_seconds)
            try:
                # Call decorated func
                return func(*args, **kwargs)
            finally:
                # Cancel the timer and put the previous handler back, whether
                # the call returned, raised, or timed out
                signal.setitimer(signal.ITIMER_REAL, 0)
                if previous_handler is None:
                    # Previous handler was not installed from Python
                    previous_handler = signal.SIG_DFL
                signal.signal(signal.SIGALRM, previous_handler)
        return wrapper
    return decorator

## Scaffold Splitting

In [7]:
#*************************************************
# Functions to compute stratify scaffold splitting
#*************************************************
from scaffold_splitter import train_test_scaffold_split

In [8]:
# Load the dataframe containing the Generic Murcko Scaffolds; its
# 'scff_generic' column is used later as the scaffold series for splitting.
# NOTE(review): nothing is computed here — the table is only read from disk.
file = './df_COCRYS_DUD_DEKOIS_Murcko_Scaffolds_SMILES.pkl'

df_scff_murcko = pd.read_pickle(file)

## Train/test on the same dataset 

### Learning Curves

In [9]:
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import learning_curve

In [10]:
def plot_learning_curves(estimator, X, y, title, ylim=[0.5,1], axes=None,
                         cv=3, train_sizes=np.linspace(0.1, 1.0, 10), 
                         scoring='roc_auc', n_jobs=4):
    '''
    Plot estimator performance on the training and validation
    sets as a function of the training set size.

    Parameters
    ----------
    estimator: sklearn estimator object type
        Estimator passed to sklearn's `learning_curve`.
    X: array-like, shape (m_samples, n_features)
        Training array with m_samples and n_features.
    y: array-like, shape (m_samples)
        Target array relative to X with m labels.
    title: str
        Title of the plot.
    ylim: sequence of two numbers
        Lower and upper limits of the y axis.
    axes: matplotlib Axes, optional
        Single Axes to draw on. A new 5x5 inch figure is created
        when it is omitted.
    cv: int or cross-validation generator
        Cross-validation strategy forwarded to `learning_curve`.
    train_sizes: array-like
        Training set sizes forwarded to `learning_curve`.
    scoring: str
        Metric forwarded to `learning_curve`; also shown in the y label.
    n_jobs: int
        Number of parallel jobs forwarded to `learning_curve`.

    Returns
    -------
    None. The curves are drawn on `axes`.
    '''
    # Identity check: `axes == None` fails for objects that override `==`
    # (e.g. an array of Axes compares element-wise)
    if axes is None:
        _, axes = plt.subplots(1, 1, figsize=(5, 5))

    axes.set(title=title, ylim=ylim, xlabel='Training examples', ylabel=f'Metric: {scoring}')
    # Use learning_curve function from sklearn (fit times are not needed)
    train_sizes, train_scores, test_scores = \
        learning_curve(estimator, X, y, scoring=scoring,
                       cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    # Mean and spread of the scores across the CV folds
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    # Plot the learning curve: shaded bands are mean +/- one standard deviation
    axes.grid()
    axes.fill_between(train_sizes, train_scores_mean - train_scores_std,
                      train_scores_mean + train_scores_std, alpha=0.1, color='r')
    axes.fill_between(train_sizes, test_scores_mean - test_scores_std,
                      test_scores_mean + test_scores_std, alpha=0.1, color='g')
    axes.plot(train_sizes, train_scores_mean, 'o-', color='r',
              label='Train score')
    axes.plot(train_sizes, test_scores_mean, 'o-', color='g',
              label='Cross-validation score')
    axes.legend(loc='lower right')
    

### Hyperparameter tuning: Grid Search

In [11]:
@timeout(360) #Finish execution after 6 mins
def run_grid_search(estimator, X_train, y_train, X_test, y_test, hyperparams,  cv_value=5, 
                    scoring='roc_auc', splitting='random', std_scale=False):
    '''
    Run a cross-validated grid search on the training set and print the
    mean CV score, the train/test ROC-AUC and the best hyperparameters.

    Parameters
    ----------
    estimator: sklearn estimator
        Estimator to tune. It must implement `predict_proba`; column 1 of
        the predicted probabilities is used as the score of each sample.
    X_train, y_train: array-like
        Training features and labels used to fit the grid search.
    X_test, y_test: array-like
        Held-out features and labels used only for the reported test score.
    hyperparams: dict
        Grid of estimator parameters, keyed by the plain parameter names.
    cv_value: int
        Cross-validation setting passed to GridSearchCV as `cv`.
    scoring: str
        Metric optimized by the grid search.
    splitting: str
        Accepted for interface compatibility; not used by this function.
    std_scale: bool
        If True, the estimator is wrapped in a Pipeline preceded by a
        StandardScaler.

    Returns
    -------
    None. Results are printed.
    '''
    if std_scale:
        # Create the Pipe
        pipe = Pipeline([('scaler', StandardScaler()),
                         ('estimator', estimator)])
        # Parameters of a pipeline step are addressed as '<step>__<param>',
        # so the grid keys must carry the 'estimator__' prefix
        param_grid = {'estimator__' + key: val for key, val in hyperparams.items()}
    else:
        pipe = estimator
        param_grid = hyperparams

    # Do Grid Search
    gs = GridSearchCV(estimator = pipe, param_grid = param_grid,
                     cv = cv_value, scoring = scoring, n_jobs = 6, refit = True)

    # Train the model (refit=True retrains the best configuration on all of X_train)
    gs.fit(X_train, y_train) 

    # Predictions
    y_train_predict = gs.predict_proba(X_train)
    y_test_predict = gs.predict_proba(X_test)

    # Print some values
    print(f'No. of molecules in train set: {y_train.shape[0]}, with {y_train.sum()} actives.')
    print(f'No. of molecules in test set: {y_test.shape[0]}, with {y_test.sum()} actives.')
    print('')
    print('*'*10, 'GRID SEARCH RESULTS', '*'*10)
    print('- Mean CV ROC-AUC score: {:.3f}'.format(gs.best_score_))
    print('- Train ROC-AUC: {:.3f}'.format(roc_auc_score(y_train, y_train_predict[:, 1])))
    print('- Test ROC-AUC: {:.3f}'.format(roc_auc_score(y_test, y_test_predict[:, 1])))
    print('- Best hyperparameters', gs.best_params_)
    print('**'*21)
    print('')

In [12]:
def run_cross_val_bias_var_tradeoff(estimator, X_train, y_train, X_test, y_test, cv_value=5, scoring='roc_auc',
                                   score_type='Dk_sc', pred_prob = True):
    '''
    Print the cross-validated, train and test ROC-AUC of a standardized
    estimator, to compare its fit on seen and unseen samples.

    Parameters
    ----------
    estimator: sklearn estimator
        Estimator evaluated after a StandardScaler step.
    X_train, y_train: array-like
        Training features and labels.
    X_test, y_test: array-like
        Held-out features and labels.
    cv_value: int
        Cross-validation setting passed to `cross_val_score` as `cv`.
    scoring: str
        Metric used by `cross_val_score`.
    score_type: str
        Accepted for interface compatibility; not used by this function.
    pred_prob: bool
        If True, samples are scored with column 1 of `predict_proba`;
        otherwise the output of `predict` is used directly.

    Returns
    -------
    None. Results are printed.
    '''
    # Print some values
    print(f'No. of molecules in train set: {y_train.shape[0]}, with {y_train.sum()} actives.')
    print(f'No. of molecules in test set: {y_test.shape[0]}, with {y_test.sum()} actives.')

    # Set the pipe: scaling followed by the estimator
    pipe = Pipeline([('scaler', StandardScaler()), ('estimator', estimator)])

    # Set the CV score
    roc_cv = cross_val_score(pipe, X_train, y_train, cv=cv_value, scoring=scoring, n_jobs= -1)

    pipe.fit(X_train, y_train)

    if pred_prob:
        # predict_proba returns one column per class; keep column 1
        train_scores = pipe.predict_proba(X_train)[:, 1]
        test_scores = pipe.predict_proba(X_test)[:, 1]
    else:
        # predict returns a 1-D array, so it must not be indexed by column
        train_scores = pipe.predict(X_train)
        test_scores = pipe.predict(X_test)

    # CV Mean ROC-AUC
    print('CV={} ROC-AUC: {:.4f}'.format(cv_value, roc_cv.mean()))

    # Training ROC-AUC:
    print('Train ROC-AUC: {:.4f}'.format(roc_auc_score(y_train, train_scores)))

    # Test ROC-AUC:
    print('Test ROC-AUC: {:.4f}'.format(roc_auc_score(y_test, test_scores)))

### Function to report the Best Conformation's ROC-AUC for a given subset of samples

In [13]:
#************************************************************************
# Prints the best conformation's ROC-AUC value for the given train and test subsets
#************************************************************************

def _get_best_conf_roc_auc(X_train, y_train, X_test, y_test, verbose=True):
    best_roc_auc_train = X_train.apply(
        lambda x: roc_auc_score(y_true= y_train, y_score= -x), axis=0)
    best_roc_auc_test = X_test.apply(
        lambda x: roc_auc_score(y_true= y_test, y_score= -x), axis=0)
    if verbose:
        print('--'*30)
        print("**** Best Conformation's ROC-AUC using docking scores ****")
        print(f"> Train best conf. ROC-AUC: {round(best_roc_auc_train.max(), 3)}" +
              f" \t> Median ROC-AUC: {round(best_roc_auc_train.median(), 3)}")
        print(f"> Test best conf. ROC-AUC: {round(best_roc_auc_test.max(), 3)}" +
              f" \t> Median ROC-AUC: {round(best_roc_auc_test.median(), 3)}")
        print('--'*30)
        
        
def split_and_gs(X, y, estimator, hyperparams, splitting='random', 
                 test_size=0.25, scaffold_series=None, **kwargs):
    '''
    Split X and y into train and test sets, run the grid search on the
    split and report the best single-conformation ROC-AUC for comparison.

    Parameters
    ----------
    X, y: pandas objects
        Features and labels to split.
    estimator: sklearn estimator
        Estimator forwarded to `run_grid_search`.
    hyperparams: dict
        Hyperparameter grid forwarded to `run_grid_search`.
    splitting: {'random', 'scaffold'}
        'random' uses a stratified `train_test_split`; 'scaffold' uses
        `train_test_scaffold_split` with `scaffold_series`.
    test_size: float
        Test set proportion passed to the splitter.
    scaffold_series: pandas Series, optional
        Scaffold of each sample; only used when splitting='scaffold'.
    **kwargs
        Extra keyword arguments forwarded to `run_grid_search`.

    Raises
    ------
    ValueError
        If `splitting` is neither 'random' nor 'scaffold'.
    '''
    if splitting == 'scaffold':
        X_train, X_test, y_train, y_test = \
            train_test_scaffold_split(X, y, scaffold_series = scaffold_series,
                test_size=test_size, stratify=y)
    elif splitting == 'random':
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=test_size, stratify=y)
    else:
        # Fail early with a clear message instead of an UnboundLocalError below
        raise ValueError(
            f"Unknown splitting '{splitting}': expected 'random' or 'scaffold'.")

    run_grid_search(estimator, 
                    X_train, y_train, X_test, y_test, 
                    hyperparams = hyperparams,  **kwargs)

    _get_best_conf_roc_auc(X_train, y_train, X_test, y_test)

##  Grid Search
### Hyperparameter Tuning
#### DEKOIS Library

In [14]:
# Library used for the same-dataset train/test experiments below
library = 'DEKOIS'

# Train and test over DEKOIS: keep only the rows selected by the library label
X = X_merged_dksc.loc[library]
y = y_true_merged.loc[library]
# Scaffolds ('scff_generic' column) of the same library, passed to the
# scaffold-based splitting in the cells below
scaffold_series = df_scff_murcko['scff_generic'].loc[library]

### GS: Linear SVM 
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [15]:
%%time
from sklearn.svm import SVC
estimator = SVC(kernel = 'linear', probability=True)
hyperparams = {'C': np.geomspace(1e-15, 1e-6, 4)}

# RANDOM Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.877
- Train ROC-AUC: 0.881
- Test ROC-AUC: 0.921
- Best hyperparameters {'C': 1e-15}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.896 	> Median ROC-AUC: 0.826
> Test best conf. ROC-AUC: 0.937 	> Median ROC-AUC: 0.845
------------------------------------------------------------
CPU times: user 539 ms, sys: 321 ms, total: 860 ms
Wall time: 2 s


In [16]:
%%time
# from sklearn.svm import SVC
estimator = SVC(kernel = 'linear', probability=True)
hyperparams = {'C': np.geomspace(1e-15, 1e-6, 4)}

# SCAFFOLD Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.695
- Train ROC-AUC: 0.868
- Test ROC-AUC: 0.957
- Best hyperparameters {'C': 1e-12}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 512 ms, sys: 65.4 ms, total: 577 ms
Wall time: 623 ms


### GS: Radial Basis Function SVM
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [17]:
%%time
estimator = SVC(kernel = 'rbf', probability=True)

hyperparams = {'C': np.geomspace(1e-8, 1e-2, 4), 
               'gamma': np.geomspace(1e-6, 1e0, 4)}

# RANDOM Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.859
- Train ROC-AUC: 0.866
- Test ROC-AUC: 0.962
- Best hyperparameters {'C': 1e-08, 'gamma': 1e-06}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.876 	> Median ROC-AUC: 0.809
> Test best conf. ROC-AUC: 0.977 	> Median ROC-AUC: 0.901
------------------------------------------------------------
CPU times: user 801 ms, sys: 130 ms, total: 931 ms
Wall time: 1.84 s


In [18]:
%%time
estimator = SVC(kernel = 'rbf', probability=True)

hyperparams = {'C': np.geomspace(1e-8, 1e-2, 4), 
               'gamma': np.geomspace(1e-6, 1e0, 4)}

# SCAFFOLD Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.809
- Train ROC-AUC: 0.873
- Test ROC-AUC: 0.957
- Best hyperparameters {'C': 1e-08, 'gamma': 0.0001}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 806 ms, sys: 107 ms, total: 913 ms
Wall time: 2.17 s


### GS: Logistic Regression 
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [19]:
%%time
from sklearn.linear_model import LogisticRegression
# LogisticRegression with default settings; C, penalty and solver are
# all varied by the grid below
estimator = LogisticRegression()

# NOTE(review): scikit-learn's 'lbfgs' solver does not support the 'l1'
# penalty, so the ('l1', 'lbfgs') combinations are expected to fail to fit
# and be scored as NaN by the grid search — confirm against the installed
# scikit-learn version.
hyperparams = {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1], 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# RANDOM Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.876
- Train ROC-AUC: 0.916
- Test ROC-AUC: 0.938
- Best hyperparameters {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.886 	> Median ROC-AUC: 0.82
> Test best conf. ROC-AUC: 0.945 	> Median ROC-AUC: 0.869
------------------------------------------------------------
CPU times: user 1.21 s, sys: 588 ms, total: 1.8 s
Wall time: 1.62 s


In [20]:
%%time
# from sklearn.linear_model import LogisticRegression
estimator = LogisticRegression()

hyperparams = {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1], 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# SCAFFOLD Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.861
- Train ROC-AUC: 0.864
- Test ROC-AUC: 0.955
- Best hyperparameters {'C': 0.0001, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 1.05 s, sys: 552 ms, total: 1.61 s
Wall time: 1.41 s


### GS: K-Neighbors Classifier
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [21]:
%%time
from sklearn.neighbors import KNeighborsClassifier 
estimator = KNeighborsClassifier()

hyperparams = {'n_neighbors': [25, 55, 125, 225], 
               'p': [1, 2]
             }

# RANDOM Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.872
- Train ROC-AUC: 0.922
- Test ROC-AUC: 0.912
- Best hyperparameters {'n_neighbors': 55, 'p': 2}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.889 	> Median ROC-AUC: 0.821
> Test best conf. ROC-AUC: 0.937 	> Median ROC-AUC: 0.86
------------------------------------------------------------
CPU times: user 908 ms, sys: 109 ms, total: 1.02 s
Wall time: 1.41 s


In [22]:
%%time
from sklearn.neighbors import KNeighborsClassifier 
estimator = KNeighborsClassifier()

hyperparams = {'n_neighbors': [25, 55, 125, 225], 
               'p': [1, 2]
             }

# SCAFFOLD Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.837
- Train ROC-AUC: 0.865
- Test ROC-AUC: 0.953
- Best hyperparameters {'n_neighbors': 225, 'p': 2}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 853 ms, sys: 125 ms, total: 978 ms
Wall time: 1.41 s


### GS: Decision Tree Classifier
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [23]:
%%time
from sklearn.tree import DecisionTreeClassifier

estimator = DecisionTreeClassifier(splitter='best')
hyperparams = {'criterion': ['gini', 'entropy'], 
               'max_depth': [2, 3, 5],
               'min_samples_split': [0.2, 0.25, 0.3],
               'max_features': [None, 'sqrt', 'log2']}

# RANDOM Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.836
- Train ROC-AUC: 0.879
- Test ROC-AUC: 0.802
- Best hyperparameters {'criterion': 'entropy', 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_split': 0.25}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.905 	> Median ROC-AUC: 0.837
> Test best conf. ROC-AUC: 0.908 	> Median ROC-AUC: 0.814
------------------------------------------------------------
CPU times: user 1.08 s, sys: 257 ms, total: 1.34 s
Wall time: 1.56 s


In [24]:
%%time
from sklearn.tree import DecisionTreeClassifier

estimator = DecisionTreeClassifier(splitter='best')
hyperparams = {'criterion': ['gini', 'entropy'], 
               'max_depth': [2, 3, 5],
               'min_samples_split': [0.2, 0.25, 0.3],
               'max_features': [None, 'sqrt', 'log2']}

# SCAFFOLD Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.838
- Train ROC-AUC: 0.909
- Test ROC-AUC: 0.953
- Best hyperparameters {'criterion': 'gini', 'max_depth': 5, 'max_features': 'log2', 'min_samples_split': 0.3}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 1.27 s, sys: 412 ms, total: 1.69 s
Wall time: 1.92 s


### GS: Bagging Classifier (k-NN as base estimator) 
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [25]:
%%time
from sklearn.ensemble import BaggingClassifier

knn = KNeighborsClassifier(n_neighbors=125, p=1, 
                           weights='distance')

estimator = BaggingClassifier(base_estimator=knn, 
                              n_jobs=6, oob_score=True)

hyperparams = {'n_estimators': [300]}

# RANDOM Train test splitting
# split_and_gs(X, y, estimator, hyperparams, 
#              splitting='random', test_size=0.25, 
#              scaffold_series=None)

CPU times: user 11.4 ms, sys: 5.28 ms, total: 16.7 ms
Wall time: 13.3 ms


In [26]:
%%time
# from sklearn.ensemble import BaggingClassifier

knn = KNeighborsClassifier(n_neighbors=125, p=1, 
                           weights='distance')

estimator = BaggingClassifier(base_estimator=knn, 
                              n_jobs=6, oob_score=True)

hyperparams = {'n_estimators': [300]}

# SCAFFOLD Train test splitting
# split_and_gs(X, y, estimator, hyperparams, 
#              splitting='scaffold', test_size=0.25, 
#              scaffold_series=scaffold_series)

CPU times: user 36 µs, sys: 13 µs, total: 49 µs
Wall time: 58.7 µs


### GS:Random Forest 
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [27]:
%%time
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier()

hyperparams = {'n_estimators': [300, 500],
             'max_depth': [2,  5],
             'min_samples_leaf': [0.1, 0.2],
             'max_features': ['log2', 'sqrt']
            }

# RANDOM Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.853
- Train ROC-AUC: 0.907
- Test ROC-AUC: 0.942
- Best hyperparameters {'max_depth': 2, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'n_estimators': 300}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.875 	> Median ROC-AUC: 0.813
> Test best conf. ROC-AUC: 0.972 	> Median ROC-AUC: 0.894
------------------------------------------------------------
CPU times: user 1.93 s, sys: 229 ms, total: 2.16 s
Wall time: 21.9 s


In [28]:
%%time
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier()

hyperparams = {'n_estimators': [300, 500],
             'max_depth': [2,  5],
             'min_samples_leaf': [0.1, 0.2],
             'max_features': ['log2', 'sqrt']
            }

# SCAFFOLD Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.855
- Train ROC-AUC: 0.893
- Test ROC-AUC: 0.939
- Best hyperparameters {'max_depth': 2, 'max_features': 'log2', 'min_samples_leaf': 0.2, 'n_estimators': 300}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 1.61 s, sys: 167 ms, total: 1.77 s
Wall time: 20.2 s


##  Grid Search
### Hyperparameter Tuning
#### Train and Test sets from different libraries 

In [29]:
# Train on one library and test on a different one

def split_by_library_and_gs(X, y, lib_train_name, lib_test_name,
                            estimator=None, hyperparams=None, **kwargs):
    '''Train-Test "split" by library, and run Grid Search. Splits the main dataframe by library
    using different molecular libraries for Training and Testing.

    Parameters
    ----------
    X, y: pandas objects
        Features and labels; `.loc[<library name>]` must select the rows
        of one library.
    lib_train_name: str
        Label of the library used for training.
    lib_test_name: str
        Label of the library used for testing.
    estimator: sklearn estimator, optional
        Estimator to tune. When omitted, the module-level variable named
        `estimator` is used (the behavior of earlier versions).
    hyperparams: dict, optional
        Hyperparameter grid. When omitted, the module-level variable
        named `hyperparams` is used.
    **kwargs
        Extra keyword arguments forwarded to `run_grid_search`.

    Raises
    ------
    ValueError
        If the estimator or the grid is neither passed nor defined at
        module level.
    '''
    # Backward-compatible fallback: existing calls rely on notebook globals
    if estimator is None:
        estimator = globals().get('estimator')
    if hyperparams is None:
        hyperparams = globals().get('hyperparams')
    if estimator is None or hyperparams is None:
        raise ValueError(
            "Pass `estimator` and `hyperparams`, or define them at module level.")

    X_train = X.loc[lib_train_name]
    y_train = y.loc[lib_train_name]

    X_test = X.loc[lib_test_name]
    y_test = y.loc[lib_test_name]

    run_grid_search(estimator, 
                    X_train, y_train, X_test, y_test, 
                    hyperparams = hyperparams, **kwargs)

    _get_best_conf_roc_auc(X_train, y_train, X_test, y_test)

In [30]:
# **********************************************
# Train and Test sets from different libraries
# **********************************************
X = X_merged_dksc
y = y_true_merged

### GS: Linear SVM 
#### DEKOIS and DUD 

In [31]:
%%time
from sklearn.svm import SVC
estimator = SVC(kernel = 'linear', probability=True)
hyperparams = {'C': np.geomspace(1e-15, 1e-6, 4)}

# Train DEKOIS, test DUD
train_lib = 'DEKOIS'
test_lib = 'DUD'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 1219, with 39 actives.
No. of molecules in test set: 4495, with 136 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.884
- Train ROC-AUC: 0.892
- Test ROC-AUC: 0.593
- Best hyperparameters {'C': 1e-15}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
> Test best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
------------------------------------------------------------
CPU times: user 671 ms, sys: 23.2 ms, total: 694 ms
Wall time: 1.04 s


In [32]:
%%time
# from sklearn.svm import SVC
estimator = SVC(kernel = 'linear', probability=True)
hyperparams = {'C': np.geomspace(1e-15, 1e-6, 4)}

# Train DUD, test DEKOIS
train_lib = 'DUD'
test_lib = 'DEKOIS'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 4495, with 136 actives.
No. of molecules in test set: 1219, with 39 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.798
- Train ROC-AUC: 0.796
- Test ROC-AUC: 0.286
- Best hyperparameters {'C': 1e-12}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
> Test best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
------------------------------------------------------------
CPU times: user 1.78 s, sys: 81.1 ms, total: 1.86 s
Wall time: 10.3 s


### GS: Radial Basis Function SVM
#### DEKOIS and DUD

In [33]:
%%time
estimator = SVC(kernel = 'rbf', probability=True)

hyperparams = {'C': np.geomspace(1e-8, 1e-2, 4), 
               'gamma': np.geomspace(1e-6, 1e0, 4)}

# Train DEKOIS, test DUD
train_lib = 'DEKOIS'
test_lib = 'DUD'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 1219, with 39 actives.
No. of molecules in test set: 4495, with 136 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.883
- Train ROC-AUC: 0.892
- Test ROC-AUC: 0.593
- Best hyperparameters {'C': 1e-06, 'gamma': 1e-06}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
> Test best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
------------------------------------------------------------
CPU times: user 975 ms, sys: 71.7 ms, total: 1.05 s
Wall time: 3.34 s


In [34]:
%%time
estimator = SVC(kernel = 'rbf', probability=True)

hyperparams = {'C': np.geomspace(1e-8, 1e-2, 4), 
               'gamma': np.geomspace(1e-6, 1e0, 4)}

# Train DUD, test DEKOIS
train_lib = 'DUD'
test_lib = 'DEKOIS'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 4495, with 136 actives.
No. of molecules in test set: 1219, with 39 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.889
- Train ROC-AUC: 0.987
- Test ROC-AUC: 0.607
- Best hyperparameters {'C': 0.01, 'gamma': 0.01}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
> Test best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
------------------------------------------------------------
CPU times: user 4.9 s, sys: 253 ms, total: 5.15 s
Wall time: 1min 1s


### GS: Logistic Regression 
#### DEKOIS and DUD

In [35]:
%%time
from sklearn.linear_model import LogisticRegression
estimator = LogisticRegression()

hyperparams = {'C': np.geomspace(1e-4, 1e5, 4),
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# Train DEKOIS, test DUD
train_lib = 'DEKOIS'
test_lib = 'DUD'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 1219, with 39 actives.
No. of molecules in test set: 4495, with 136 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.885
- Train ROC-AUC: 0.887
- Test ROC-AUC: 0.587
- Best hyperparameters {'C': 0.0001, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
> Test best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
------------------------------------------------------------
CPU times: user 1.09 s, sys: 432 ms, total: 1.52 s
Wall time: 21.5 s


In [36]:
%%time
# from sklearn.linear_model import LogisticRegression
estimator = LogisticRegression()

hyperparams = {'C': np.geomspace(1e-4, 1e5, 4), 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# Train DUD, test DEKOIS
train_lib = 'DUD'
test_lib = 'DEKOIS'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 4495, with 136 actives.
No. of molecules in test set: 1219, with 39 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.842
- Train ROC-AUC: 0.923
- Test ROC-AUC: 0.556
- Best hyperparameters {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
> Test best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
------------------------------------------------------------
CPU times: user 1.47 s, sys: 422 ms, total: 1.89 s
Wall time: 46.1 s


### GS:Random Forest 
#### DEKOIS and DUD

In [37]:
%%time
from sklearn.ensemble import RandomForestClassifier

# Candidate values explored by the grid search. Float values for
# min_samples_leaf / min_samples_split are read by scikit-learn as
# fractions of the training samples.
hyperparams = dict(
    n_estimators=[300, 500],
    max_depth=[3, 5, 7],
    min_samples_leaf=[0.03, 0.05],
    min_samples_split=[0.2, 0.3],
    max_features=['sqrt'],
)

# NOTE(review): no random_state is given, so results can vary between
# runs unless the global NumPy seed is fixed elsewhere -- confirm.
estimator = RandomForestClassifier()

# Train on DEKOIS, evaluate on DUD
train_lib, test_lib = 'DEKOIS', 'DUD'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 1219, with 39 actives.
No. of molecules in test set: 4495, with 136 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.872
- Train ROC-AUC: 0.927
- Test ROC-AUC: 0.553
- Best hyperparameters {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 0.05, 'min_samples_split': 0.2, 'n_estimators': 500}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
> Test best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
------------------------------------------------------------
CPU times: user 2.7 s, sys: 101 ms, total: 2.8 s
Wall time: 41.3 s


In [38]:
%%time
from sklearn.ensemble import RandomForestClassifier

# Candidate values explored by the grid search. Float values for
# min_samples_leaf / min_samples_split are read by scikit-learn as
# fractions of the training samples.
hyperparams = dict(
    n_estimators=[300, 500],
    max_depth=[3, 5, 7],
    min_samples_leaf=[0.03, 0.05],
    min_samples_split=[0.2, 0.3],
    max_features=['sqrt'],
)

# NOTE(review): no random_state is given, so results can vary between
# runs unless the global NumPy seed is fixed elsewhere -- confirm.
estimator = RandomForestClassifier()

# Train on DUD, evaluate on DEKOIS
train_lib, test_lib = 'DUD', 'DEKOIS'
split_by_library_and_gs(X, y, train_lib, test_lib)

No. of molecules in train set: 4495, with 136 actives.
No. of molecules in test set: 1219, with 39 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.826
- Train ROC-AUC: 0.881
- Test ROC-AUC: 0.643
- Best hyperparameters {'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 0.03, 'min_samples_split': 0.2, 'n_estimators': 300}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.717 	> Median ROC-AUC: 0.557
> Test best conf. ROC-AUC: 0.892 	> Median ROC-AUC: 0.831
------------------------------------------------------------
CPU times: user 4 s, sys: 143 ms, total: 4.15 s
Wall time: 1min 19s


***
##  Grid Search
### Hyperparameter Tuning
#### Merged Libraries 

In [39]:
# Inputs for the merged-libraries experiments:
#  - scaffold_series: the 'scff_generic' column of the Murcko scaffold table,
#    used by the scaffold-based splitting
#  - X, y: merged feature table (activity column removed) and activity labels
scaffold_series = df_scff_murcko['scff_generic']
X, y = X_merged_dksc, y_true_merged

### GS: Linear SVM 
#### Merged libraries - Random and Stratified Scaffold Splitting

In [40]:
%%time
from sklearn.svm import SVC

# Regularisation strengths to try: 3 values, geometrically spaced
# between 1e-15 and 1e-9
hyperparams = dict(C=np.geomspace(1e-15, 1e-9, num=3))

# Linear-kernel SVM with probability estimates enabled
estimator = SVC(kernel='linear', probability=True)

# Random train/test split (25 % held out) followed by the grid search
split_and_gs(
    X, y, estimator, hyperparams,
    splitting='random',
    test_size=0.25,
    scaffold_series=None,
)

No. of molecules in train set: 4379, with 225 actives.
No. of molecules in test set: 1460, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.742
- Train ROC-AUC: 0.692
- Test ROC-AUC: 0.652
- Best hyperparameters {'C': 1e-12}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.688 	> Median ROC-AUC: 0.613
> Test best conf. ROC-AUC: 0.677 	> Median ROC-AUC: 0.579
------------------------------------------------------------
CPU times: user 2.58 s, sys: 105 ms, total: 2.68 s
Wall time: 12.7 s


In [41]:
%%time
# from sklearn.svm import SVC

# Regularisation strengths to try: 3 values, geometrically spaced
# between 1e-15 and 1e-9
hyperparams = dict(C=np.geomspace(1e-15, 1e-9, num=3))

# Linear-kernel SVM with probability estimates enabled
estimator = SVC(kernel='linear', probability=True)

# Scaffold-based train/test split (25 % held out) followed by the grid search
split_and_gs(
    X, y, estimator, hyperparams,
    splitting='scaffold',
    test_size=0.25,
    scaffold_series=scaffold_series,
)

No. of molecules in train set: 4154, with 225 actives.
No. of molecules in test set: 1685, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.709
- Train ROC-AUC: 0.742
- Test ROC-AUC: 0.511
- Best hyperparameters {'C': 1e-15}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.683 	> Median ROC-AUC: 0.588
> Test best conf. ROC-AUC: 0.704 	> Median ROC-AUC: 0.641
------------------------------------------------------------
CPU times: user 2.58 s, sys: 87.1 ms, total: 2.67 s
Wall time: 11.4 s


### GS: Radial Basis Function SVM
#### Merged libraries - Random and Stratified Scaffold Splitting

In [42]:
import numpy as np

# Preview two geometrically spaced candidate grids.
# Only the last expression of a cell is echoed, so the first grid used to be
# computed and silently discarded; both are now returned together as a tuple.
(
    np.geomspace(1e0, 1e4, 4),
    np.geomspace(1e-4, 1e0, 3),
)

array([1.e-04, 1.e-02, 1.e+00])

In [43]:
%%time
from sklearn.svm import SVC

# Search space: 3 geometrically spaced values each for C (1e0 .. 1e2)
# and for the RBF kernel coefficient gamma (1e-4 .. 1e0)
hyperparams = dict(
    C=np.geomspace(1e0, 1e2, num=3),
    gamma=np.geomspace(1e-4, 1e0, num=3),
)

# RBF-kernel SVM with probability estimates enabled
estimator = SVC(kernel='rbf', probability=True)

# Random train/test split (25 % held out) followed by the grid search
split_and_gs(
    X, y, estimator, hyperparams,
    splitting='random',
    test_size=0.25,
    scaffold_series=None,
)

No. of molecules in train set: 4379, with 225 actives.
No. of molecules in test set: 1460, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.874
- Train ROC-AUC: 1.000
- Test ROC-AUC: 0.874
- Best hyperparameters {'C': 10.0, 'gamma': 0.01}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.684 	> Median ROC-AUC: 0.61
> Test best conf. ROC-AUC: 0.669 	> Median ROC-AUC: 0.588
------------------------------------------------------------
CPU times: user 7.98 s, sys: 134 ms, total: 8.12 s
Wall time: 3min 2s


In [44]:
%%time
# Search space: 3 geometrically spaced values each for C (1e0 .. 1e2)
# and for the RBF kernel coefficient gamma (1e-4 .. 1e0)
hyperparams = dict(
    C=np.geomspace(1e0, 1e2, num=3),
    gamma=np.geomspace(1e-4, 1e0, num=3),
)

# RBF-kernel SVM with probability estimates enabled
estimator = SVC(kernel='rbf', probability=True)

# Scaffold-based train/test split (25 % held out) followed by the grid search
split_and_gs(
    X, y, estimator, hyperparams,
    splitting='scaffold',
    test_size=0.25,
    scaffold_series=scaffold_series,
)

No. of molecules in train set: 4154, with 225 actives.
No. of molecules in test set: 1685, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.843
- Train ROC-AUC: 1.000
- Test ROC-AUC: 0.750
- Best hyperparameters {'C': 100.0, 'gamma': 0.01}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.683 	> Median ROC-AUC: 0.588
> Test best conf. ROC-AUC: 0.704 	> Median ROC-AUC: 0.641
------------------------------------------------------------
CPU times: user 6.33 s, sys: 80 ms, total: 6.41 s
Wall time: 2min 31s


### GS: Logistic Regression
#### Merged libraries - Random and Stratified Scaffold Splitting

In [45]:
%%time
from sklearn.linear_model import LogisticRegression
estimator = LogisticRegression()

# Search space for logistic regression.
# The lbfgs solver only supports the l2 penalty, so crossing
# penalty=['l1', 'l2'] with solver=['lbfgs', 'liblinear'] produces
# candidates whose fits fail. The space is split into one grid per solver
# so only valid (penalty, solver) pairs are evaluated.
# NOTE(review): assumes split_and_gs forwards `hyperparams` unchanged to
# GridSearchCV(param_grid=...), which accepts a list of dicts -- confirm.
C_values = np.geomspace(1e-4, 1e4, 3)
hyperparams = [
    {'C': C_values, 'penalty': ['l2'], 'solver': ['lbfgs']},
    {'C': C_values, 'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
]

# RANDOM Train test splitting
split_and_gs(X, y, estimator, hyperparams,
             splitting='random', test_size=0.25,
             scaffold_series=None)

No. of molecules in train set: 4379, with 225 actives.
No. of molecules in test set: 1460, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.833
- Train ROC-AUC: 0.890
- Test ROC-AUC: 0.831
- Best hyperparameters {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.687 	> Median ROC-AUC: 0.603
> Test best conf. ROC-AUC: 0.688 	> Median ROC-AUC: 0.603
------------------------------------------------------------
CPU times: user 6.12 s, sys: 505 ms, total: 6.62 s
Wall time: 34 s


In [46]:
%%time
# from sklearn.linear_model import LogisticRegression
# NOTE(review): the recorded output of this cell shows lbfgs stopping at its
# iteration limit; consider raising max_iter or scaling the features.
estimator = LogisticRegression()

# Search space for logistic regression.
# The lbfgs solver only supports the l2 penalty, so crossing
# penalty=['l1', 'l2'] with solver=['lbfgs', 'liblinear'] produces
# candidates whose fits fail. The space is split into one grid per solver
# so only valid (penalty, solver) pairs are evaluated.
# NOTE(review): assumes split_and_gs forwards `hyperparams` unchanged to
# GridSearchCV(param_grid=...), which accepts a list of dicts -- confirm.
# NOTE(review): the random-split cell for this model searches
# np.geomspace(1e-4, 1e4, 3) for C, so the two grids differ -- confirm
# this is intended.
C_values = np.geomspace(1e-4, 1e5, 4)
hyperparams = [
    {'C': C_values, 'penalty': ['l2'], 'solver': ['lbfgs']},
    {'C': C_values, 'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
]

# SCAFFOLD Train test splitting
split_and_gs(X, y, estimator, hyperparams,
             splitting='scaffold', test_size=0.25,
             scaffold_series=scaffold_series)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


No. of molecules in train set: 4154, with 225 actives.
No. of molecules in test set: 1685, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.823
- Train ROC-AUC: 0.905
- Test ROC-AUC: 0.726
- Best hyperparameters {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.683 	> Median ROC-AUC: 0.588
> Test best conf. ROC-AUC: 0.704 	> Median ROC-AUC: 0.641
------------------------------------------------------------
CPU times: user 1.93 s, sys: 609 ms, total: 2.54 s
Wall time: 36.7 s


### GS: Decision Tree Classifier
#### Merged libraries - Random and Stratified Scaffold Splitting

In [47]:
%%time
from sklearn.tree import DecisionTreeClassifier

# Candidate values explored by the grid search. Float values for
# min_samples_split / min_samples_leaf are read by scikit-learn as
# fractions of the training samples.
hyperparams = dict(
    criterion=['gini', 'entropy'],
    max_depth=[2, 3, 5],
    min_samples_split=[0.2, 0.3],
    min_samples_leaf=[0.02, 0.05, 0.1],
    max_features=[None, 'sqrt', 'log2'],
)

# Single decision tree using the 'best' split strategy at every node
estimator = DecisionTreeClassifier(splitter='best')

# Random train/test split (25 % held out) followed by the grid search
split_and_gs(
    X, y, estimator, hyperparams,
    splitting='random',
    test_size=0.25,
    scaffold_series=None,
)

No. of molecules in train set: 4379, with 225 actives.
No. of molecules in test set: 1460, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.717
- Train ROC-AUC: 0.772
- Test ROC-AUC: 0.715
- Best hyperparameters {'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 0.02, 'min_samples_split': 0.3}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.697 	> Median ROC-AUC: 0.618
> Test best conf. ROC-AUC: 0.659 	> Median ROC-AUC: 0.564
------------------------------------------------------------
CPU times: user 2.95 s, sys: 295 ms, total: 3.25 s
Wall time: 6.07 s


In [48]:
%%time
from sklearn.tree import DecisionTreeClassifier

# Candidate values explored by the grid search. Float values for
# min_samples_split / min_samples_leaf are read by scikit-learn as
# fractions of the training samples.
hyperparams = dict(
    criterion=['gini', 'entropy'],
    max_depth=[2, 3, 5],
    min_samples_split=[0.2, 0.3],
    min_samples_leaf=[0.02, 0.05, 0.1],
    max_features=[None, 'sqrt', 'log2'],
)

# Single decision tree using the 'best' split strategy at every node
estimator = DecisionTreeClassifier(splitter='best')

# Scaffold-based train/test split (25 % held out) followed by the grid search
split_and_gs(
    X, y, estimator, hyperparams,
    splitting='scaffold',
    test_size=0.25,
    scaffold_series=scaffold_series,
)

No. of molecules in train set: 4154, with 225 actives.
No. of molecules in test set: 1685, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.714
- Train ROC-AUC: 0.789
- Test ROC-AUC: 0.624
- Best hyperparameters {'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 0.05, 'min_samples_split': 0.3}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.683 	> Median ROC-AUC: 0.588
> Test best conf. ROC-AUC: 0.704 	> Median ROC-AUC: 0.641
------------------------------------------------------------
CPU times: user 2.58 s, sys: 168 ms, total: 2.75 s
Wall time: 5.62 s


### GS: Bagging Classifier (k-NN as base estimator) 
#### Merged libraries - Random and Stratified Scaffold Splitting

In [49]:
%%time
from sklearn.ensemble import BaggingClassifier

# Only a single ensemble size is listed in the search space
hyperparams = dict(n_estimators=[300])

# Base learner: distance-weighted k-NN (k=125) with the Minkowski p=1 metric
knn = KNeighborsClassifier(
    n_neighbors=125,
    p=1,
    weights='distance',
)

# Bagging ensemble around the k-NN base learner, with out-of-bag scoring
# enabled and 3 parallel jobs
estimator = BaggingClassifier(
    base_estimator=knn,
    n_jobs=3,
    oob_score=True,
)

# The grid search itself is currently disabled in this cell;
# only the estimator and search space are built.
# RANDOM Train test splitting
# split_and_gs(X, y, estimator, hyperparams, 
#              splitting='random', test_size=0.25, 
#              scaffold_series=None)

CPU times: user 47 µs, sys: 6 µs, total: 53 µs
Wall time: 61.8 µs


In [50]:
%%time
# from sklearn.ensemble import BaggingClassifier

# Only a single ensemble size is listed in the search space
hyperparams = dict(n_estimators=[300])

# Base learner: distance-weighted k-NN (k=125) with the Minkowski p=1 metric
knn = KNeighborsClassifier(
    n_neighbors=125,
    p=1,
    weights='distance',
)

# Bagging ensemble around the k-NN base learner, with out-of-bag scoring
# enabled and 3 parallel jobs
estimator = BaggingClassifier(
    base_estimator=knn,
    n_jobs=3,
    oob_score=True,
)

# The grid search itself is currently disabled in this cell;
# only the estimator and search space are built.
# SCAFFOLD Train test splitting
# split_and_gs(X, y, estimator, hyperparams, 
#              splitting='scaffold', test_size=0.25, 
#              scaffold_series=scaffold_series)

CPU times: user 91 µs, sys: 0 ns, total: 91 µs
Wall time: 107 µs


### GS: Random Forest
#### Merged libraries - Random and Stratified Scaffold Splitting

In [51]:
%%time
from sklearn.ensemble import RandomForestClassifier

# Candidate values explored by the grid search. Float values for
# min_samples_split / min_samples_leaf are read by scikit-learn as
# fractions of the training samples.
hyperparams = dict(
    n_estimators=[300, 400],
    max_depth=[2, 3],
    min_samples_split=[0.1, 0.3],
    min_samples_leaf=[0.02, 0.05],
    max_features=['sqrt'],
)

# NOTE(review): no random_state is given, so results can vary between
# runs unless the global NumPy seed is fixed elsewhere -- confirm.
estimator = RandomForestClassifier()

# Random train/test split (25 % held out) followed by the grid search
split_and_gs(
    X, y, estimator, hyperparams,
    splitting='random',
    test_size=0.25,
    scaffold_series=None,
)

No. of molecules in train set: 4379, with 225 actives.
No. of molecules in test set: 1460, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.739
- Train ROC-AUC: 0.780
- Test ROC-AUC: 0.750
- Best hyperparameters {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 0.02, 'min_samples_split': 0.3, 'n_estimators': 400}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.682 	> Median ROC-AUC: 0.604
> Test best conf. ROC-AUC: 0.692 	> Median ROC-AUC: 0.605
------------------------------------------------------------
CPU times: user 3.37 s, sys: 91.7 ms, total: 3.46 s
Wall time: 39 s


In [52]:
%%time
from sklearn.ensemble import RandomForestClassifier

# Candidate values explored by the grid search. Float values for
# min_samples_split / min_samples_leaf are read by scikit-learn as
# fractions of the training samples.
hyperparams = dict(
    n_estimators=[300, 400],
    max_depth=[2, 3],
    min_samples_split=[0.1, 0.3],
    min_samples_leaf=[0.02, 0.05],
    max_features=['sqrt'],
)

# NOTE(review): no random_state is given, so results can vary between
# runs unless the global NumPy seed is fixed elsewhere -- confirm.
estimator = RandomForestClassifier()

# Scaffold-based train/test split (25 % held out) followed by the grid search
split_and_gs(
    X, y, estimator, hyperparams,
    splitting='scaffold',
    test_size=0.25,
    scaffold_series=scaffold_series,
)

No. of molecules in train set: 4154, with 225 actives.
No. of molecules in test set: 1685, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.733
- Train ROC-AUC: 0.800
- Test ROC-AUC: 0.638
- Best hyperparameters {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 0.02, 'min_samples_split': 0.3, 'n_estimators': 300}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.683 	> Median ROC-AUC: 0.588
> Test best conf. ROC-AUC: 0.704 	> Median ROC-AUC: 0.641
------------------------------------------------------------
CPU times: user 2.96 s, sys: 112 ms, total: 3.08 s
Wall time: 36.6 s
