# Helper functions

### Import the necessary modules

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, roc_auc_score, make_scorer, f1_score

### Data load

In [None]:
def open_results(path, column_to_drop, active_label='Active'):
    '''Load virtual-screening (VS) results from a CSV file.

    Parameters
    ----------
    path : str
        Path to the CSV file, written with '/' separators. Both relative
        and absolute paths are accepted.
    column_to_drop : str
        Name of the column holding the activity labels. It is removed from
        the returned features.
    active_label : str, default 'Active'
        Value of `column_to_drop` that marks a row as active.

    Returns
    -------
    X_ : pandas.DataFrame
        Every column except `column_to_drop`; the first CSV column is used
        as the index.
    y_ : pandas.Series
        1 where the label equals `active_label`, 0 elsewhere.
    '''
    # normpath adapts the separators to the platform and keeps a leading '/'.
    # Splitting on '/' and re-joining silently turned absolute paths into
    # relative ones.
    path_file = os.path.normpath(path)
    df_results = pd.read_csv(path_file, index_col=0)
    # Binary ground truth derived from the label column.
    y_ = pd.Series(df_results[column_to_drop] == active_label, dtype = int)
    # Everything but the label column is a feature.
    X_ = df_results.drop([column_to_drop], axis = 1)
    return X_, y_

### Training PipeLine

In [1]:
def train_model(X_train, y_train, estimator, estimator_hyparams = None,
               scoring = 'roc_auc', k_folds = 5, standarize = True, 
               split_train = False, test_size = 0.2, random_state = 1, **kwargs):
    '''Build a (scaler +) estimator pipeline and fit it, optionally through grid search.

    Parameters
    ----------
    X_train, y_train :
        Training features and labels.
    estimator : callable
        Estimator class (or factory); it is called as `estimator(**kwargs)`
        to create the last step of the pipeline.
    estimator_hyparams : dict or None
        Candidate values per hyperparameter of the estimator. Keys are
        prefixed with 'estimator__' so GridSearchCV routes them to the
        pipeline step named "estimator". When None, no grid search is run.
    scoring : str
        Scoring method handed to GridSearchCV.
    k_folds : int
        Value handed to GridSearchCV as `cv`.
    standarize : bool
        When True, a StandardScaler step is placed before the estimator.
    split_train : bool
        When True, the given training set is split (stratified on `y_train`)
        and only the training part is used for fitting; the held-out part
        is discarded.
    test_size, random_state :
        Forwarded to `train_test_split`; only used when `split_train` is True.
    **kwargs :
        Forwarded to the estimator constructor.

    Returns
    -------
    The fitted GridSearchCV instance when `estimator_hyparams` is given,
    otherwise the fitted Pipeline.
    '''
    if split_train:
        # Only the training portion is kept; the held-out part is not returned.
        X_train, _, y_train, _ = train_test_split(
            X_train, y_train, test_size = test_size,
            stratify = y_train, random_state = random_state)

    # 1) The pipeline always ends with the estimator; scaling is optional
    #    and, when requested, goes first.
    steps = [("estimator", estimator(**kwargs))]
    if standarize:
        steps.insert(0, ("scaler", StandardScaler()))
    pipe = Pipeline(steps)

    if estimator_hyparams is None:
        # No hyperparameters to tune: fit the bare pipeline.
        model = pipe
    else:
        # 2) Address every hyperparameter to the "estimator" step.
        param_grid = {'estimator__' + key: value
                      for key, value in estimator_hyparams.items()}
        # 3) Grid search with k-fold cross validation over the whole pipeline;
        #    refit=True retrains the best configuration on the full training set.
        model = GridSearchCV(estimator = pipe, param_grid = param_grid,
                             cv = k_folds, scoring = scoring,
                             n_jobs = 6, refit = True)

    model.fit(X_train, y_train)
    return model

def eval_model(model, X_test, y_test, return_proba = True):
    '''Score the samples in `X_test` with a fitted model.

    Parameters
    ----------
    model :
        Fitted estimator. It must expose `predict_proba` when
        `return_proba` is True, or `decision_function` otherwise.
    X_test :
        Samples to score.
    y_test :
        Not used; kept so existing callers do not need to change.
    return_proba : bool
        True  -> return column 1 of `model.predict_proba(X_test)`.
        False -> return `model.decision_function(X_test)`.

    Returns
    -------
    The per-sample scores described above.
    '''
    # The previous implementation also called model.predict(X_test) and
    # threw the result away; that extra inference pass has been removed.
    if return_proba:
        return model.predict_proba(X_test)[:, 1]
    return model.decision_function(X_test)

### ML Wrappers

In [None]:
def train_wrapper(train_key, list_of_scores, estimator,
                  scores_dic, selected_features = None, **kwargs):
    '''Train one model per score type on the `train_key` dataset.

    `scores_dic[train_key][score]` must provide the entries 'X' and 'y'.
    When `selected_features` is given, 'X' is indexed with it before
    training. Remaining keyword arguments are forwarded to `train_model`.

    Returns a dict mapping the formatted score name to the value returned
    by `train_model`.
    '''
    def _training_data(score):
        # Features (optionally restricted) and labels for one score type.
        entry = scores_dic[train_key][score]
        features = entry['X']
        if selected_features is not None:
            features = features[selected_features]
        return features, entry['y']

    fitted = {}
    for score in list_of_scores:
        X_train, y_train = _training_data(score)
        fitted[f'{score}'] = train_model(X_train, y_train, estimator, **kwargs)
    return fitted

In [None]:
def eval_wrapper(trained_model, test_keys, list_of_scores,
                  scores_dic, selected_features = None, return_proba = True, **kwargs):
    '''Evaluate the trained models on every test set.

    For each test key and each score type, the model stored under
    `trained_model[score]` scores `scores_dic[test_key][score]['X']`
    (indexed with `selected_features` when given) through `eval_model`.
    The predictions of one test set are stored under '<test_key>-<score>'
    and wrapped in a `PlotMetric` object, which also receives `**kwargs`.

    Note that `y_true` handed to `PlotMetric` is the 'y' entry of the last
    score type processed for that test set.

    Returns a dict mapping each test key to its `PlotMetric` instance.
    '''
    results = {}
    for test_key in test_keys:
        predictions = {}
        for score in list_of_scores:
            data = scores_dic[test_key][score]
            X_test = data['X'] if selected_features is None \
                else data['X'][selected_features]
            y_test = data['y']
            predictions[f'{test_key}-{score}'] = eval_model(
                trained_model[score], X_test, y_test, return_proba = return_proba)
        # Build the metrics/plotting helper for this test set.
        results[test_key] = PlotMetric(y_true = y_test, y_pred_dict = predictions, **kwargs)
    return results

## Functions for plotting

In [None]:
def plot_predict_results(title, predictions, train_key, plot_rankings = True, plot_nef = False,
                         test_keys = None):
    '''Plot ROC curves (and optionally normalized EF curves) for each test set.

    Parameters
    ----------
    title : str
        Prefix of every ROC subplot title.
    predictions : mapping
        Indexed by test-set name; each value must provide `plot_roc_auc`,
        `plot_ef_auc` and `plot_actives_distribution`.
    train_key : str
        Name of the training set, shown in the subplot titles.
    plot_rankings : bool
        When True, `plot_actives_distribution` is called for every test set
        after the figure is shown.
    plot_nef : bool
        When True, a second row with the normalized EF curves is added.
    test_keys : iterable of str, optional
        Test sets to plot, in order. Defaults to the keys of `predictions`
        (this assumes `predictions` is a dict -- confirm against callers).
        Previously the function read an undeclared global of this name.
    '''
    if test_keys is None:
        test_keys = list(predictions)
    else:
        test_keys = list(test_keys)

    n_rows = 2 if plot_nef else 1
    # At least two columns, as in the original layout; more when needed.
    n_cols = max(2, len(test_keys))
    plt.figure(figsize=(14, 7*n_rows))
    for i, test_set in enumerate(test_keys):
        # Integer (rows, cols, index) form: string position codes are not
        # accepted by current Matplotlib and cannot express indices above 9.
        plt.subplot(n_rows, n_cols, i + 1)
        predictions[test_set].plot_roc_auc(F'{title}:\n{train_key} train, {test_set} test', 
                                     show_by_itself = False, fontsize = 'x-small')
        if plot_nef:
            # Same column, second row.
            plt.subplot(n_rows, n_cols, n_cols + i + 1)
            predictions[test_set].plot_ef_auc('', method = 'normalized', max_chi = 0.1, 
                                         show_by_itself = False, fontsize = 'x-small')
    plt.show()
    if plot_rankings:
        for test_set in test_keys:
            predictions[test_set].plot_actives_distribution(max_position_to_plot=100)

## Function for randomly picking n features and performing SVC

In [3]:
def random_confs_picking(scores_dic,
                             score_type,
                             train_key, 
                             test_keys,
                             model, 
                             n_features,
                             n_reps = 50,
                             metric = 'roc_auc',
                             **kwargs,
                            ):
    '''Train and evaluate `model` on `n_features` randomly chosen columns, `n_reps` times.

    Parameters
    ----------
    scores_dic : mapping
        `scores_dic[key][score_type]` must provide 'X' (with a `columns`
        attribute) and 'y'.
    score_type : str
        Score type used for training and testing.
    train_key : str
        Dataset used for training.
    test_keys : iterable of str
        Datasets used for testing.
    model :
        Estimator class handed to `train_wrapper`.
    n_features : int
        Number of columns drawn (without replacement) per repetition.
    n_reps : int
        Number of repetitions.
    metric : str
        Metric name handed to `format_metric_results`.
    **kwargs :
        Forwarded to `train_wrapper`.

    Returns
    -------
    dict mapping each test key to an array of length `n_reps` holding the
    metric value obtained in every repetition.
    '''
    results_dict = {test: np.zeros(n_reps) for test in test_keys}

    train_columns = scores_dic[train_key][score_type]['X'].columns
    # Draw from the columns actually available instead of a hard-coded 402.
    n_available = len(train_columns)

    for rep in range(n_reps):
        # Random, non-repeating column positions, kept in ascending order.
        random_features = np.sort(
            np.random.choice(a = n_available, size = n_features, replace = False))
        features = train_columns[random_features]

        model_train = train_wrapper(train_key = train_key, list_of_scores = [score_type],
                               scores_dic = scores_dic, estimator = model,
                               selected_features = features,
                               **kwargs)

        model_pred = eval_wrapper(trained_model = model_train, 
                            test_keys = test_keys, list_of_scores = [score_type], 
                            selected_features = features,
                            scores_dic = scores_dic, decreasing = False)

        for test in test_keys:
            # Single score type, so the metric is the only cell of the table.
            results_dict[test][rep] = \
                model_pred[test].format_metric_results(metric).values[0][0]

    return results_dict

## Function to perform k-means


In [1]:

from sklearn.cluster import KMeans

def get_medoids_idx(X, n_dims = 2, n_clusters = 10, random_state = None):
    '''Cluster the samples with k-means and return one representative per cluster.

    Parameters
    ----------
    X : array-like
        Sliced as `X[:n_dims]` and then transposed, so the first axis is
        presumably the embedding dimension and the second the samples
        -- confirm against the caller.
    n_dims : int
        Number of leading rows of `X` used for clustering.
    n_clusters : int
        Number of k-means clusters.
    random_state :
        Forwarded to `KMeans`.

    Returns
    -------
    list
        For every cluster label, in ascending order, the index of the
        sample closest (squared Euclidean distance) to the cluster centroid.
    '''
    # The original sliced with an undefined name (`fd`); `n_dims` is the
    # parameter meant to control how many dimensions are used.
    _X = X[:n_dims].T

    kmc = KMeans(n_clusters = n_clusters, random_state = random_state)
    kmc.fit(_X)

    labels = kmc.labels_
    medoids_idx = []
    for label in np.unique(labels):  # np.unique returns sorted labels
        members = np.where(labels == label)[0]
        centroid = kmc.cluster_centers_[label]
        # Squared distance of every member of the cluster to its centroid.
        sq_distances = ((_X[members] - centroid)**2).sum(axis = 1)
        # Map the position inside the cluster back to the global index.
        medoids_idx.append(members[np.argmin(sq_distances)])

    return medoids_idx

### Functions for ML features selection

In [1]:
def kmeans_picking_ML(ml_model, model_name, scores_dic, train_key, test_keys,
                      df_precomputed_medoids = None,
                      min_confs = 1, max_confs = 402, interval = 1, 
                      score_type = 'Dk_sc', metric = 'roc_auc', 
                      file_path = '../data/ml_evaluations/', file_sufix = '',
                      **kwargs):
    '''Train/evaluate `ml_model` using k features chosen as k-means medoids, for a range of k.

    Results are cached: if the results file already exists it is loaded and
    returned, otherwise the evaluation is run and the results are pickled.

    Parameters
    ----------
    ml_model :
        Estimator class handed to `train_wrapper`.
    model_name : str
        Used only to build the cache file name.
    scores_dic : mapping
        `scores_dic[key][score_type]` must provide 'X' and 'y'.
    train_key : str
        Dataset used for training.
    test_keys : iterable of str
        Datasets used for testing (also part of the file name).
    df_precomputed_medoids : DataFrame or None
        When given, the first cell of row `k - 1` is used as the medoid
        positions for k clusters. When None, `get_medoids_idx` is called.
    min_confs, max_confs, interval : int
        Values of k visited: `range(min_confs, max_confs + 1, interval)`.
    score_type, metric : str
        Score type evaluated and metric requested from `format_metric_results`.
    file_path, file_sufix : str
        Prefix and suffix of the cache file name (plain string concatenation).
    **kwargs :
        Forwarded to `train_wrapper`.

    Returns
    -------
    dict mapping each test key to a one-row DataFrame with columns
    1..max_confs. Columns for values of k that were not visited stay 0.
    '''
    # File name
    file_name = file_path + '_'.join(test_keys) + \
            F"_{model_name}_kmeans_{score_type.replace('_', '')}" + file_sufix + '.obj'
    # Check if the file exists
    if os.path.isfile(file_name):
        # NOTE: pickle can execute code on load; only reuse cache files
        # written by this function.
        with open(file_name, 'rb') as f:
            results_df = pickle.load(f)
    else:
        # One slot per k in 1..max_confs. The array is written at `k - 1`
        # and labelled 1..max_confs, so its width must be max_confs (the
        # former `max_confs - min_confs + 1` broke for min_confs > 1).
        results_dic = {test: np.zeros((1, max_confs)) for test in test_keys}
        train_columns = scores_dic[train_key][score_type]['X'].columns

        for k in range(min_confs, max_confs + 1, interval):
            if df_precomputed_medoids is None:
                # NOTE(review): `data_to_clust`, `n_dims` and `random_state`
                # are not parameters of this function; they are resolved as
                # module-level names and must exist when this branch runs.
                medoids_idx = get_medoids_idx(data_to_clust, n_dims = n_dims, n_clusters = k,
                                         random_state = random_state)
            else:
                # Positions of the conformations from the precomputed k-means.
                medoids_idx = df_precomputed_medoids.iloc[k - 1].values[0]
            features = train_columns[medoids_idx]
            # Perform the training
            model_train = train_wrapper(train_key = train_key, list_of_scores = [score_type],
                                   scores_dic = scores_dic, estimator = ml_model,
                                   selected_features = features,
                                   **kwargs)
            # Perform the testing
            model_pred = eval_wrapper(trained_model = model_train, 
                                test_keys = test_keys, list_of_scores = [score_type], 
                                selected_features = features,
                                scores_dic = scores_dic, decreasing = False)
            # Get the metric values
            for test in test_keys:
                metric_value = model_pred[test].format_metric_results(metric).values[0][0]
                results_dic[test][0, k - 1] = metric_value

        # Create the dataframes
        columns_ = list(range(1, max_confs + 1))
        results_df = {test: pd.DataFrame(data = results_dic[test], columns = columns_)
                      for test in test_keys}
        # Save the dictionary
        with open(file_name, 'wb') as f:
            pickle.dump(results_df, f, protocol=pickle.HIGHEST_PROTOCOL)
    return results_df

In [None]:
def random_picking_ML(ml_model, model_name, scores_dic, train_key, test_keys,
                      min_confs = 1, max_confs = 402, interval = 1, n_reps = 30,  
                      score_type = 'Dk_sc', metric = 'roc_auc',
                      file_path = '../data/ml_evaluations/', file_sufix = '',
                      **kwargs):
    '''Train/evaluate `ml_model` on m randomly chosen features, `n_reps` times, for a range of m.

    Results are cached: if the results file already exists it is loaded and
    returned, otherwise the evaluation is run and the results are pickled.

    Parameters
    ----------
    ml_model :
        Estimator class handed to `train_wrapper`.
    model_name : str
        Used only to build the cache file name.
    scores_dic : mapping
        `scores_dic[key][score_type]` must provide 'X' and 'y'.
    train_key : str
        Dataset used for training.
    test_keys : iterable of str
        Datasets used for testing (also part of the file name).
    min_confs, max_confs, interval : int
        Values of m visited: `range(min_confs, max_confs + 1, interval)`.
        Column positions are drawn from `range(0, max_confs)`.
    n_reps : int
        Random repetitions per value of m.
    score_type, metric : str
        Score type evaluated and metric requested from `format_metric_results`.
    file_path, file_sufix : str
        Prefix and suffix of the cache file name (plain string concatenation).
    **kwargs :
        Forwarded to `train_wrapper`.

    Returns
    -------
    dict mapping each test key to a DataFrame with rows 'rep_0'..'rep_{n_reps-1}'
    and columns 1..max_confs. Columns for values of m that were not visited stay 0.
    '''
    # File name
    file_name = file_path + '_'.join(test_keys) + \
            F"_{model_name}_random_{score_type.replace('_', '')}" + file_sufix + '.obj'
    # Check if the file exists
    if os.path.isfile(file_name):
        # NOTE: pickle can execute code on load; only reuse cache files
        # written by this function.
        with open(file_name, 'rb') as f:
            results_df = pickle.load(f)
    else:
        # One column per m in 1..max_confs. The array is written at `m - 1`
        # and labelled 1..max_confs, so its width must be max_confs (the
        # former `max_confs - min_confs + 1` broke for min_confs > 1).
        results_dic = {test: np.zeros((n_reps, max_confs)) for test in test_keys}
        train_columns = scores_dic[train_key][score_type]['X'].columns

        for m in range(min_confs, max_confs + 1, interval):
            for rep in range(n_reps):
                # Pick m conformations at random, without repetition
                random_features = np.random.choice(a = range(0, max_confs), 
                                                   size = m, replace = False)
                features = train_columns[random_features]
                # Perform the training
                model_train = train_wrapper(train_key = train_key, list_of_scores = [score_type],
                                       scores_dic = scores_dic, estimator = ml_model,
                                       selected_features = features,
                                       **kwargs)
                # Perform the testing
                model_pred = eval_wrapper(trained_model = model_train, 
                                    test_keys = test_keys, list_of_scores = [score_type], 
                                    selected_features = features,
                                    scores_dic = scores_dic, decreasing = False)
                # Get the metric values
                for test in test_keys:
                    metric_value = model_pred[test].format_metric_results(metric).values[0][0]
                    results_dic[test][rep, m - 1] = metric_value

        # Create the dataframes
        index_ = [F'rep_{n}' for n in range(n_reps)]
        columns_ = list(range(1, max_confs + 1))
        results_df = {test: pd.DataFrame(data = results_dic[test],
                                         index = index_, columns = columns_)
                      for test in test_keys}
        # Save the dictionary
        with open(file_name, 'wb') as f:
            pickle.dump(results_df, f, protocol=pickle.HIGHEST_PROTOCOL)
    return results_df

## Functions for consensus scoring

In [None]:
# Random picking conformations for consensus scoring
def random_picking_consensus(f, df_X, df_y, min_confs, max_confs, interval, n_reps, **kwargs):
    '''Run a consensus scoring on random subsets of columns and collect the ROC-AUC values.

    For every subset size n in `range(min_confs, max_confs + 1, interval)`,
    n column positions are drawn `n_reps` times (without replacement) from
    `range(0, max_confs)`. `f` is called with the selected columns as `df`
    plus `**kwargs`, and its output is scored against `df_y` with
    `roc_auc_score`.

    Returns a DataFrame with one column per subset size and one row per
    repetition.
    '''
    aucs_by_size = {}
    for n_confs in range(min_confs, max_confs + 1, interval):
        aucs = np.zeros(n_reps)
        for i in range(n_reps):
            # Random, non-repeating column positions for this repetition.
            picked = np.random.choice(a = range(max_confs), size = n_confs, replace = False)
            # The consensus output is taken as the predicted score.
            consensus = f(df = df_X.iloc[:, picked], **kwargs)
            aucs[i] = roc_auc_score(df_y, consensus)
        aucs_by_size[n_confs] = aucs
    return pd.DataFrame(aucs_by_size)

# K-means picking conformations for consensus scoring
def kmeans_picking_consensus(f, df_X, df_y, min_confs, max_confs, interval, 
                             df_precomputed_medoids = None,
                             data_to_clust = None, n_dims = 2,
                             random_state = 0, **kwargs):
    '''Run a consensus scoring on k columns chosen as k-means medoids, for a range of k.

    Parameters
    ----------
    f : callable
        Consensus function, called as `f(selected_columns, **kwargs)`.
    df_X : DataFrame
        Scores; the columns to use are picked by position from `df_X.columns`.
    df_y :
        True labels handed to `roc_auc_score`.
    min_confs, max_confs, interval : int
        Values of k visited: `range(min_confs, max_confs + 1, interval)`.
    df_precomputed_medoids : DataFrame or None
        When given, the first cell of row `k - 1` is used as the medoid
        positions for k clusters. When None, `get_medoids_idx` is called.
    data_to_clust :
        Data clustered by `get_medoids_idx`. When None (and no precomputed
        medoids are given) the module-level `mds_pisani_402[0]` is used,
        which was the former default value.
    n_dims, random_state :
        Forwarded to `get_medoids_idx`.
    **kwargs :
        Forwarded to `f`.

    Returns
    -------
    Array of length `max_confs`; position `k - 1` holds the ROC-AUC obtained
    with k medoids. Positions for values of k that were not visited stay 0.
    '''
    if df_precomputed_medoids is None and data_to_clust is None:
        # Resolved lazily so defining this function no longer requires the
        # global to exist already.
        data_to_clust = mds_pisani_402[0]

    auc_array = np.zeros(max_confs)
    for k in range(min_confs, max_confs + 1, interval):
        if df_precomputed_medoids is None:
            medoids_idx = get_medoids_idx(data_to_clust, n_dims = n_dims, n_clusters = k,
                                     random_state = random_state)
        else:
            # Positions of the conformations from the precomputed k-means.
            medoids_idx = df_precomputed_medoids.iloc[k - 1].values[0]
        # Column names come from the frame being scored, not from the
        # undeclared globals (scores_dic / train_key / score_type) used before.
        features = df_X.columns[medoids_idx]
        _X = df_X[features]
        # The consensus output is taken as the predicted score.
        y_pred = f(_X, **kwargs)
        auc_array[k - 1] = roc_auc_score(df_y, y_pred)
    return auc_array

In [None]:
import pickle, os

def consensus_wrapper(func, scores_dic, score_types, dataset_names, consensus_name, 
                             file_path = '../data/ml_evaluations/', file_sufix = '', method = 'random',
                             min_confs = 1, max_confs = 402, interval = 1, n_reps = 50, 
                             **kwargs):
    '''Run a consensus scoring for every dataset/score type, caching the results on disk.

    If the results file already exists it is loaded and returned; otherwise
    the evaluation is run and the results are pickled to that file.

    Parameters
    ----------
    func : callable
        Consensus function handed to the picking routine.
    scores_dic : mapping
        `scores_dic[dataset][score]` must provide 'X' and 'y'.
    score_types, dataset_names : iterable of str
        Score types and datasets to evaluate.
    consensus_name : str
        Label used in the file name and in the result keys.
    file_path, file_sufix : str
        Prefix and suffix of the cache file name (plain string concatenation).
    method : {'random', 'kmeans'}
        'random' uses `random_picking_consensus` (with `n_reps`);
        'kmeans' uses `kmeans_picking_consensus`.
    min_confs, max_confs, interval, n_reps : int
        Forwarded to the picking routine (`n_reps` only for 'random').
    **kwargs :
        Forwarded to the picking routine.

    Returns
    -------
    dict mapping '<dataset>_<score without "_">_<method>_Cons_<consensus_name>'
    to the value returned by the picking routine.

    Raises
    ------
    ValueError
        If results must be computed and `method` is neither 'random' nor 'kmeans'.
    '''
    # File name
    file_name = file_path + '_'.join(dataset_names) + \
            F'_{method}_Cons_{consensus_name}' + file_sufix + '.obj'
    # Check if the file exists
    if os.path.isfile(file_name):
        # NOTE: pickle can execute code on load; only reuse cache files
        # written by this function.
        with open(file_name, 'rb') as f:
            _results_dict = pickle.load(f)
    else:
        # Fail before any work is done. Previously an unknown method only
        # printed a message and then stored an unbound or stale result.
        if method not in ('random', 'kmeans'):
            raise ValueError(
                'Wrong method for conformations selection ("random", "kmeans").')
        _results_dict = {}
        for dataset in dataset_names:
            for score in score_types:
                key_name = F'{dataset}_{score.replace("_", "")}_{method}_Cons_{consensus_name}'
                df_X = scores_dic[dataset][score]['X']
                df_y = scores_dic[dataset][score]['y']
                if method == 'random':
                    df_aucs = random_picking_consensus(func, df_X, df_y, 
                                         min_confs, max_confs, interval, n_reps, **kwargs)
                else:
                    df_aucs = kmeans_picking_consensus(func, df_X, df_y, 
                                     min_confs, max_confs, interval, **kwargs)
                # Add to the dictionary of results
                _results_dict[key_name] = df_aucs
        # Save the dictionary
        with open(file_name, 'wb') as f:
            pickle.dump(_results_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
    return _results_dict