# Helper Functions: Model Selection 

In [1]:
import pandas as pd
import numpy as np
import glob, sys, os
sys.path.append('..')

In [2]:
from modules.plotting_metrics import PlotMetric
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white', context='talk', font_scale=0.8)

### Timeout decorator

In [None]:
import signal
from functools import wraps

def timeout(n_seconds=300):
    '''Build a decorator that aborts the decorated call after `n_seconds`.

    The limit is enforced with `signal.alarm`, so it is only usable on
    platforms that provide SIGALRM and when the decorated function is
    called from the main thread.

    Parameters
    ----------
    n_seconds: int
        Number of seconds the decorated function is allowed to run.
        It is passed unchanged to `signal.alarm`.

    Returns
    -------
    decorator: callable
        Decorator whose wrapper returns whatever the decorated function
        returns.

    Raises
    ------
    TimeoutError
        Raised inside the decorated call when the alarm fires before the
        function has returned.
    '''
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            def _raise_timeout(signum, frame):
                name = getattr(func, '__name__', repr(func))
                raise TimeoutError(
                    f'{name} did not finish within {n_seconds} seconds')

            # A handler must be installed: with the default SIGALRM action
            # the whole process is terminated when the alarm goes off.
            previous_handler = signal.signal(signal.SIGALRM, _raise_timeout)
            # Set alarm for n seconds
            signal.alarm(n_seconds)
            try:
                # Call decorated func
                return func(*args, **kwargs)
            finally:
                # Cancel the alarm and put back whatever handler was active
                signal.alarm(0)
                if previous_handler is None:
                    # The previous handler was not installed from Python
                    previous_handler = signal.SIG_DFL
                signal.signal(signal.SIGALRM, previous_handler)
        return wrapper
    return decorator

### Learning Curves

In [8]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import learning_curve

In [9]:
def plot_learning_curves(estimator, X, y, title, ylim=[0.5,1], axes=None,
                         cv=3, train_sizes=np.linspace(0.1, 1.0, 10),
                         scoring='roc_auc', n_jobs=4):
    '''
    Plot estimator performance on the training and validation
    sets as a function of the training set size.

    Parameters
    ----------
    estimator: sklearn estimator object type
        Estimator handed to sklearn's `learning_curve`.
    X: array-like, shape (m_samples, n_features)
        Training array with m_samples and n_features.
    y: array-like, shape (m_samples)
        Target array relative to X with m labels.
    title: str
        Title of the plot.
    ylim: array
        Lower and upper limits of the y axis.
    axes: matplotlib axes, optional
        Single axes object to draw on. When omitted, a new 5x5 figure
        with one axes is created.
    cv: int or cross-validation generator
        Forwarded to `learning_curve`.
    train_sizes: array-like
        Training set sizes forwarded to `learning_curve`.
    scoring: str
        Metric forwarded to `learning_curve`; also shown in the y label.
    n_jobs: int
        Number of parallel jobs forwarded to `learning_curve`.

    Returns
    -------
    None. The curves are drawn on `axes`.
    '''
    # Identity check: `== None` would be evaluated elementwise if an
    # array of axes were passed by mistake
    if axes is None:
        _, axes = plt.subplots(1, 1, figsize=(5, 5))

    axes.set(title=title, ylim=ylim, xlabel='Training examples',
             ylabel=f'Metric: {scoring}')
    # Use learning_curve function from sklearn (the timing arrays returned
    # because of return_times=True are not used)
    train_sizes, train_scores, test_scores, _, _ = \
        learning_curve(estimator, X, y, scoring=scoring, return_times=True,
                       cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    # Mean and spread across the CV folds for every training size
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    # Plot the learning curve: shaded band = mean +/- one std
    axes.grid()
    axes.fill_between(train_sizes, train_scores_mean - train_scores_std,
                      train_scores_mean + train_scores_std, alpha=0.1, color='r')
    axes.fill_between(train_sizes, test_scores_mean - test_scores_std,
                      test_scores_mean + test_scores_std, alpha=0.1, color='g')
    axes.plot(train_sizes, train_scores_mean, 'o-', color='r',
              label='Train score')
    axes.plot(train_sizes, test_scores_mean, 'o-', color='g',
              label='Cross-validation score')
    axes.legend(loc='lower right')
    

### Hyperparameter tuning: Grid Search

In [10]:
def run_grid_search(estimator, X_train, y_train, X_test, y_test, hyperparams,  cv_value=5, 
                    scoring='roc_auc', splitting='random', std_scale=False,
                   randomGS = False, n_iter=10):
    '''Tune `estimator` with a (randomized) grid search and report ROC-AUC.

    Parameters
    ----------
    estimator: sklearn estimator
        Model to tune; the fitted search must expose `predict_proba`.
    X_train, y_train: array-like
        Data used to fit the search. `y_train` must provide `.shape`
        and `.sum()`.
    X_test, y_test: array-like
        Held-out data used only for the final test ROC-AUC.
    hyperparams: dict
        Hyperparameter grid (or distributions when `randomGS` is True),
        keyed by the estimator's own parameter names.
    cv_value: int or cross-validation generator
        Forwarded as `cv` to the search object.
    scoring: str
        Metric optimized by the search.
    splitting: str
        Not used by this function; kept for interface compatibility.
    std_scale: bool
        If True, the estimator is wrapped in a Pipeline preceded by a
        StandardScaler.
    randomGS: bool
        If True use RandomizedSearchCV, otherwise GridSearchCV.
    n_iter: int
        Number of sampled settings; only used when `randomGS` is True.

    Returns
    -------
    list
        [n_train_mols, n_train_actives, n_test_mols, n_test_actives,
         mean_cv_roc, train_roc, test_roc, best_params]
    '''
    if std_scale:
        # Create the Pipe
        pipe = Pipeline([('scaler', StandardScaler()),
                         ('estimator', estimator)])
        # Inside a Pipeline parameters are addressed as '<step>__<param>'.
        # Keys that already name one of the steps are left untouched so
        # grids written directly for the pipeline keep working; every
        # other key is routed to the 'estimator' step.
        step_names = {'scaler', 'estimator'}
        search_space = {
            (key if '__' in key and key.split('__', 1)[0] in step_names
             else 'estimator__' + key): val
            for key, val in hyperparams.items()
        }
    else:
        pipe = estimator
        search_space = hyperparams

    # Do Grid Search
    common_args = dict(estimator=pipe, cv=cv_value, scoring=scoring,
                       n_jobs=6, refit=True)
    if randomGS:
        gs = RandomizedSearchCV(param_distributions=search_space,
                                n_iter=n_iter, **common_args)
    else:
        gs = GridSearchCV(param_grid=search_space, **common_args)
    # Train the model
    gs.fit(X_train, y_train)

    # Predictions (refit=True, so they come from the best estimator)
    y_train_predict = gs.predict_proba(X_train)
    y_test_predict = gs.predict_proba(X_test)

    # Values to return
    n_train_mols    = y_train.shape[0]
    n_train_actives = y_train.sum()
    n_test_mols     = y_test.shape[0]
    n_test_actives  = y_test.sum()
    mean_cv_roc     = gs.best_score_
    # Column 1 holds the probability of the second class
    train_roc       = roc_auc_score(y_train, y_train_predict[:, 1])
    test_roc        = roc_auc_score(y_test, y_test_predict[:, 1])
    best_params     = gs.best_params_

    # Print some values
    print(f'No. of molecules in train set: {n_train_mols}, with {n_train_actives} actives.')
    print(f'No. of molecules in test set: {n_test_mols}, with {n_test_actives} actives.')
    print('')
    print('*'*10, 'GRID SEARCH RESULTS', '*'*10)
    print('- Mean CV ROC-AUC:\t{:.3f}'.format(mean_cv_roc))
    print('- Train ROC-AUC:  \t{:.3f}'.format(train_roc))
    print('- Test ROC-AUC:   \t{:.3f}'.format(test_roc))
    print('- Best hyperparameters', best_params)
    print('**'*21)
    print('')

    return [n_train_mols, n_train_actives, n_test_mols, n_test_actives,
            mean_cv_roc, train_roc, test_roc, best_params]

In [77]:
def gs_cv(estimator, X_train, y_train,hyperparams,  cv_value=5, 
                    scoring='roc_auc', splitting='random', std_scale=True,
                   randomGS = False, n_iter=10):
    '''Run a (randomized) grid search on the training set and return it.

    Parameters
    ----------
    estimator: sklearn estimator
        Model to tune; the fitted search must expose `predict_proba`.
    X_train, y_train: array-like
        Data used to fit the search. `y_train` must provide `.shape`
        and `.sum()`.
    hyperparams: dict
        Hyperparameter grid (or distributions when `randomGS` is True),
        keyed by the estimator's own parameter names.
    cv_value: int or cross-validation generator
        Forwarded as `cv` to the search object.
    scoring: str
        Metric optimized by the search.
    splitting: str
        Not used by this function; kept for interface compatibility.
    std_scale: bool
        If True, the estimator is wrapped in a Pipeline preceded by a
        StandardScaler and every key of `hyperparams` is prefixed with
        'estimator__'. If False, the estimator and `hyperparams` are
        used as given.
    randomGS: bool
        If True use RandomizedSearchCV, otherwise GridSearchCV.
    n_iter: int
        Number of sampled settings; only used when `randomGS` is True.

    Returns
    -------
    gs: fitted GridSearchCV or RandomizedSearchCV object
    '''
    if std_scale:
        # Create the Pipe; its parameters are addressed as '<step>__<param>'
        hyperparams_dict = {'estimator__' + key: val for key, val in hyperparams.items()}
        scaler = StandardScaler()
        pipe = Pipeline([('scaler', scaler),
                         ('estimator', estimator)])
    else:
        # No pipeline: the grid applies to the bare estimator as given
        hyperparams_dict = hyperparams
        pipe = estimator

    # Do Grid Search
    if randomGS:
        gs = RandomizedSearchCV(estimator = pipe,  param_distributions = hyperparams_dict,
                     cv = cv_value, scoring = scoring, n_jobs = 6, refit = True, n_iter=n_iter)
    else:
        gs = GridSearchCV(estimator = pipe, param_grid = hyperparams_dict,
                     cv = cv_value, scoring = scoring, n_jobs = 6, refit = True)
    # Train the model
    gs.fit(X_train, y_train)
    y_train_predict = gs.predict_proba(X_train)

    # Values to report
    n_train_mols    = y_train.shape[0]
    n_train_actives = y_train.sum()

    mean_cv_roc     = gs.best_score_
    # Column 1 holds the probability of the second class
    train_roc       = roc_auc_score(y_train, y_train_predict[:, 1])

    best_params     = gs.best_params_

    # Print some values
    print(f'No. of molecules in train set: {n_train_mols}, with {n_train_actives} actives.')
    print('')
    print('*'*10, 'GRID SEARCH RESULTS', '*'*10)
    print('- Mean CV ROC-AUC:\t{:.3f}'.format(mean_cv_roc))
    print('- Train ROC-AUC:  \t{:.3f}'.format(train_roc))
    print('- Best hyperparameters', best_params)
    print('**'*21)
    print('')

    return gs

### Function to report the Best Conformation's ROC-AUC for a given subset of samples

In [11]:
#************************************************************************
# Returns the best conformation's ROC-AUC value of a given subset X and y
#************************************************************************

def get_roc_auc_DkSc(X_train, y_train, X_test, y_test, verbose=True):
    '''Summarize the per-column ROC-AUC of the raw scores in X.

    Each column of `X_train` / `X_test` is scored on its own against the
    matching labels, using the negated column values as the ranking
    score. The best, median and mean ROC-AUC over all columns are
    reported for each set.

    Parameters
    ----------
    X_train, X_test: objects providing `.apply(func, axis=0)`
        Score tables, one column per conformation.
    y_train, y_test: array-like
        Labels matching the rows of the corresponding table.
    verbose: bool
        If True, print the summary.

    Returns
    -------
    list
        [train_best, train_median, train_mean,
         test_best, test_median, test_mean]
    '''
    def _column_wise_auc(scores, labels):
        # The sign is flipped so that lower raw values rank higher
        return scores.apply(
            lambda column: roc_auc_score(y_true=labels, y_score=-column),
            axis=0)

    train_aucs = _column_wise_auc(X_train, y_train)
    test_aucs = _column_wise_auc(X_test, y_test)

    # Values to return: best / median / mean for each set
    train_summary = [train_aucs.max(), train_aucs.median(), train_aucs.mean()]
    test_summary = [test_aucs.max(), test_aucs.median(), test_aucs.mean()]

    if verbose:
        print("***** Best Conformation's ROC-AUC using docking scores *****")
        report_rows = (
            ("> Train best conf. ROC-AUC: {:.3f}", train_summary),
            ("> Test best conf. ROC-AUC: {:.3f}", test_summary),
        )
        for header, (best, median, mean) in report_rows:
            print(header.format(best)
                  + " \t> median: {:.3f}".format(median)
                  + ', mean: {:.3f}'.format(mean))
        print('**'*32)

    # Flat list of results, to be captured by the wrapper function
    return train_summary + test_summary
        

In [83]:
#******************************************
# Decorator functions to capture GS results
#******************************************
from functools import wraps

def capture_GS_results(results_dict=None, capture=True):
    '''Build a decorator that stores a function's results in a dictionary.

    Parameters
    ----------
    results_dict: dict, optional
        Dictionary that receives the results. When it is None nothing
        is stored.
    capture: bool
        Set to False to disable storing even if `results_dict` is given.

    Returns
    -------
    decorator: callable
        Decorator whose wrapper calls the decorated function once and
        returns its result. When capturing is active the result is also
        stored in `results_dict` under a key made of its first four
        items joined with '_'.
    '''
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            results = func(*args, **kwargs)
            if results_dict is not None and capture:
                # Create a key with the first four values; str() keeps
                # the join working when an item is not a string
                key = '_'.join(map(str, results[:4]))
                # Append results to the results dictionary
                results_dict[key] = results
            # Hand the results back in both modes so the caller can use them
            return results
        return wrapper
    return decorator
            
#******************************************************
# Dictionary to capture GS results using the decorator.
# It is handed to `capture_GS_results` when the function
# below is decorated, so it has to be defined first.
#******************************************************
results_dict = {}

#******************************************************
# Function to Split and run Grid Search
#******************************************************
@capture_GS_results(results_dict)
def split_and_gs(train_name, test_name, estimator_name,
                 X, y, estimator, hyperparams, splitting='random', 
                 test_size=0.25, scaffold_series=None, random_state=None, **kwargs):
    '''Given X and y sets, a sklearn estimator and a splitting method,
    performs Grid Search CV using the parsed hyperparams.

    Parameters
    ----------
    train_name, test_name, estimator_name: str
        Labels used in the printed header and as the first items of the
        returned list.
    X, y: array-like
        Full data set and labels to be split into train and test parts.
        X is also passed to `get_roc_auc_DkSc`.
    estimator: sklearn estimator
        Model forwarded to `run_grid_search`.
    hyperparams: dict
        Hyperparameter grid forwarded to `run_grid_search`.
    splitting: {'random', 'scaffold'}
        'random' uses sklearn's `train_test_split`; 'scaffold' uses
        `train_test_scaffold_split` (defined outside this cell).
    test_size: float
        Fraction of samples assigned to the test set.
    scaffold_series: optional
        Only forwarded to the scaffold split.
    random_state: optional
        Only forwarded to the random split.
    **kwargs:
        Extra keyword arguments forwarded to `run_grid_search`.

    Returns
    -------
    list
        [train_name, test_name, estimator_name, splitting]
        + results of `run_grid_search` + results of `get_roc_auc_DkSc`.
        This is what the undecorated function produces; what a call
        finally returns is decided by the `capture_GS_results` decorator.

    Raises
    ------
    ValueError
        If `splitting` is neither 'scaffold' nor 'random'.
    '''
    #**************
    # Do the split
    #**************
    if splitting == 'scaffold':
        X_train, X_test, y_train, y_test = \
            train_test_scaffold_split(X, y, scaffold_series = scaffold_series,
                test_size=test_size, stratify=y)
    elif splitting == 'random':
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=test_size, stratify=y,
                            random_state=random_state)
    else:
        # Fail here instead of later with an unbound-variable error
        raise ValueError(
            f"Unknown splitting method {splitting!r}; "
            "expected 'scaffold' or 'random'.")

    # Verbose
    print(f'{estimator_name} => Train: {train_name}; Test: {test_name}; split: {splitting}')

    # Function to run Grid Search
    #----------------------------
    gs_results = run_grid_search(estimator,
                    X_train, y_train, X_test, y_test,
                    hyperparams = hyperparams,  **kwargs)

    # Function to extract ROC results from DkSc values
    #-------------------------------------------------
    dksc_results = get_roc_auc_DkSc(X_train, y_train, X_test, y_test)

    # Return both lists of results, to be captured by the decorator function
    return [train_name, test_name, estimator_name, splitting] + gs_results + dksc_results