Next, we fit several different model types to the data to identify a set of candidate models with high performance. For each one, we run a preliminary, coarse-resolution hyperparameter search over the parameters that are likely to be the most influential.

In [1]:
# import torch
import argparse
import joblib
from pathlib import Path
import os

from tqdm import tqdm

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import Lasso,Ridge,ElasticNet
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.model_selection import GridSearchCV
import xgboost as XGB

import plotnine as p9

In [13]:
# NOTE(review): scratch cell executed out of order (In [13]). `x_train` is only
# assigned inside the cross-validation loop further down, so this line raises
# NameError on a fresh top-to-bottom run (Restart & Run All).
x_train.shape[1]

17879

Specify parameters:

In [2]:
# parser = argparse.ArgumentParser(description='Run different ML models')
# parser.add_argument('--model_types', metavar='N', type=str, nargs='*', help='models to train',default=['PLSR','knn' ,'svmLinear','svmRBF','svmPoly','lasso','ridge','elasticNet','neuralNet','xgboost','rf'])
# parser.add_argument('--X_path', action='store', type=str,help='path to the input training data',default='X.csv')
# parser.add_argument('--Y_path', action='store', type=str,help='path to the output training data',default='y.csv')
# parser.add_argument('--num_folds', action='store', type=int,help='number of folds',default=10)
# parser.add_argument('--res_dir', action='store', type=str,help='Results directory',default='/nobackup/users/hmbaghda/metastatic_potential/interim/')
# parser.add_argument('--seed', action='store', type=int,help='seed',default=42)
# parser.add_argument('--grid_search', action='store', type=bool,help='Nested Hyperparameter Tuning',default=True)
# parser.add_argument('--hyperparam_folds', action='store', type=int,help='number of hyperparametr tuning folds',default=5)
# parser.add_argument('--n_cores', action='store', type=int,help='number of cores for parallelization',default=20)

# args = parser.parse_args()
# model_types = args.model_types
# num_folds = args.num_folds
# X_path = args.X_path
# Y_path = args.Y_path
# res_dir= args.res_dir
# seed = args.seed
# cv_folds = args.hyperparam_folds

# ---- run configuration ----
# Root of the project data tree; all other paths below are derived from it.
data_path = '/nobackup/users/hmbaghda/metastatic_potential/'
res_dir = os.path.join(data_path, 'interim')

# Whether to read the HVG-filtered expression file; the flag only changes the
# file-name prefix of the expression matrix.
hvg_selection = False
hvg_name = 'hvg_' if hvg_selection else ''
X_path = os.path.join(data_path, 'processed', hvg_name + 'expr.csv')
Y_path = os.path.join(data_path, 'processed', 'metastatic_potential.csv')

# Seed for the outer KFold shuffle (and the base seed passed to the mixup augmentation).
seed = 42

num_folds = 10      # outer cross-validation folds
cv_folds = 5        # inner folds used by GridSearchCV
grid_search = True  # wrap every estimator in GridSearchCV
n_cores = 30        # worker count for GridSearchCV and the BLAS/OpenMP thread limits

# Full list of candidate model types ...
model_types = ['PLSR', 'elasticNet', 'svm',
               'rf', 'xgboost', 'knn']
# ... overridden here: only these two are actually run by this notebook.
model_types = ['PLSR', 'ridge']


In [3]:
# Set the thread-count environment variable of each numerical backend to n_cores.
# NOTE(review): numpy is already imported by the time this cell runs; some backends
# may only read these variables when the library is first loaded — confirm the
# limits actually take effect, or move this cell above the imports.
for _thread_env_var in (
    "OMP_NUM_THREADS",
    "MKL_NUM_THREADS",
    "OPENBLAS_NUM_THREADS",
    "VECLIB_MAXIMUM_THREADS",
    "NUMEXPR_NUM_THREADS",
):
    os.environ[_thread_env_var] = str(n_cores)

In [4]:
# The expression table is transposed after loading so that rows of X are the
# units that get split into folds below (presumably samples x features — confirm).
X = pd.read_csv(X_path, index_col=0).T
Y = pd.read_csv(Y_path, index_col=0)

# The cross-validation loop pairs X and Y purely by row position (`.iloc`), so a
# length mismatch would either crash mid-run or silently mis-pair rows.
if X.shape[0] != Y.shape[0]:
    raise ValueError(
        'X has %d rows but Y has %d rows; they must describe the same samples'
        % (X.shape[0], Y.shape[0])
    )
# NOTE(review): row *order* is also assumed to match between the two files;
# verify that X.index and Y.index line up.

In [5]:
def pearson_r(y_true, y_pred):
    """Pearson correlation between two torch tensors, reduced along dim 0.

    Parameters
    ----------
    y_true : torch.Tensor
        reference values
    y_pred : torch.Tensor
        predicted values, same shape as ``y_true``

    Returns
    -------
    torch.Tensor
        correlation coefficient(s): every reduction is over ``dim=0``, so a 1-D
        input yields a 0-dim tensor and a 2-D input yields one value per column.
        No averaging across columns is performed.
    """
    # Imported locally: the module-level `import torch` is commented out in the
    # import cell, so relying on a global name would raise NameError here.
    import torch

    true_centered = y_true - torch.mean(y_true, dim=0)
    pred_centered = y_pred - torch.mean(y_pred, dim=0)

    r_num = torch.sum(true_centered * pred_centered, dim=0)
    true_sq_sum = torch.sum(true_centered * true_centered, dim=0)
    pred_sq_sum = torch.sum(pred_centered * pred_centered, dim=0)
    r_den = torch.sqrt(true_sq_sum * pred_sq_sum)

    return r_num / r_den

def pair_pearsonr(x, y, axis=0):
    """Pearson correlation between ``x`` and ``y`` along ``axis``.

    Parameters
    ----------
    x, y : array-like
        inputs of matching shape
    axis : int, optional
        axis that is reduced when computing the correlation, by default 0

    Returns
    -------
    numpy scalar or ndarray
        correlation with ``axis`` removed (a scalar for 1-D inputs)
    """
    # Center each input along the reduced axis; keepdims lets the mean broadcast back.
    x_mean = np.mean(x, axis=axis, keepdims=True)
    y_mean = np.mean(y, axis=axis, keepdims=True)
    x_dev = x - x_mean
    y_dev = y - y_mean

    cross_sum = np.add.reduce(x_dev * y_dev, axis=axis)
    x_sq_sum = (x_dev * x_dev).sum(axis=axis)
    y_sq_sum = (y_dev * y_dev).sum(axis=axis)

    return cross_sum / np.sqrt(x_sq_sum * y_sq_sum)

def getSamples(N, batchSize):
    """Split a random permutation of ``N`` into consecutive mini-batches.

    Parameters
    ----------
    N : int
        passed to ``np.random.permutation``; for an int this shuffles the
        indices ``0 .. N-1``
    batchSize : int
        number of entries per batch; must be positive

    Returns
    -------
    list of numpy arrays
        batches in permutation order; the last batch may be shorter than
        ``batchSize``. Empty list when the permutation is empty.

    Raises
    ------
    ValueError
        if ``batchSize`` is not positive (the slicing loop would never terminate)
    """
    if batchSize <= 0:
        raise ValueError('batchSize must be a positive integer, got %r' % (batchSize,))

    # Uses the global numpy RNG, so seed with np.random.seed for reproducibility.
    order = np.random.permutation(N)
    return [order[start:start + batchSize]
            for start in range(0, len(order), batchSize)]

def L2Regularization(deepLearningModel, L2):
    """L2 penalty over the weights and biases of every ``torch.nn.Linear`` layer.

    Parameters
    ----------
    deepLearningModel : iterable of torch modules
        iterated directly (e.g. a ``torch.nn.Sequential``); only items that are
        ``torch.nn.Linear`` instances contribute
    L2 : float
        regularization strength multiplying each sum of squares

    Returns
    -------
    torch.Tensor or float
        ``L2 * (sum of squared weights + sum of squared biases)``; plain ``0.0``
        when the model contains no Linear layer
    """
    # Imported locally: the module-level `import torch` is commented out in the
    # import cell, so relying on a global name would raise NameError here.
    import torch

    weightLoss = 0.
    biasLoss = 0.
    for layer in deepLearningModel:
        if not isinstance(layer, torch.nn.Linear):
            continue
        weightLoss = weightLoss + L2 * torch.sum(layer.weight ** 2)
        # Linear(bias=False) stores bias as None; squaring None would raise TypeError.
        if layer.bias is not None:
            biasLoss = biasLoss + L2 * torch.sum(layer.bias ** 2)
    return biasLoss + weightLoss

In [12]:
# Scratch preview of a log-spaced grid (10**-4 ... 10**2); the value is only
# displayed, not assigned to any variable.
[10**i for i in range(-4, 3)]

[0.0001, 0.001, 0.01, 0.1, 1, 10, 100]

In [6]:
# Candidate values shared between several grids.
n_estimators = range(100, 1001, 250)       # 100, 350, 600, 850
svm_c = [10**i for i in range(-3, 3)]      # 1e-3 ... 1e2
svm_gamma = [10**i for i in range(-3, 2)]  # 1e-3 ... 1e1
# svm_epsilon = [10**i for i in range(-2, 1)]
alpha = [10**i for i in range(-3, 3)]      # 1e-3 ... 1e2

# Coarse hyperparameter grids, keyed by the name used to look them up when the
# GridSearchCV objects are built.
grid_search_params = {
    'knn': {
        'n_neighbors': range(5, 41, 5)
    },
    'plsr': {
        'n_components': [15, 25, 50, 100]  # previously range(2, 16, 2)
    },
    'rf': {
        'n_estimators': n_estimators
    },
    'xgboost': {
        'n_estimators': n_estimators
    },
    # One sub-grid per kernel so that parameters a kernel ignores are not crossed
    # with it: `gamma` only matters for rbf/poly and `degree` only for poly. A
    # single flat dict would fit 3*6*5*4 = 360 combinations, most of them
    # duplicates; this list yields 6 + 30 + 120 = 156. GridSearchCV accepts a
    # list of dicts as `param_grid`.
    'svm': [
        {'kernel': ['linear'], 'C': svm_c},
        {'kernel': ['rbf'], 'C': svm_c, 'gamma': svm_gamma},
        {'kernel': ['poly'], 'C': svm_c, 'gamma': svm_gamma,
         'degree': [2, 3, 4, 5]},
        # 'coef0': [0, 0.1, 0.5, 1., 1.2, 2.]  # poly only, not searched at coarse resolution
    ],
    'ridge': {
        'alpha': alpha,
    },
    # No separate lasso grid: l1_ratio=1 below is the lasso penalty.
    'elasticNet': {
        'alpha': alpha,
        'l1_ratio': np.arange(0, 1.01, 0.25)  # with 0 and 1 inclusive, this also does ridge and lasso
    }
}


In [7]:
# Build one estimator per requested model type.
#
# Each entry maps a name accepted in `model_types` to a triple:
#   (factory for the base estimator handed to GridSearchCV,
#    key of its grid in `grid_search_params`,
#    factory for the fixed-hyperparameter estimator used when grid_search is False)
# Factories (rather than instances) are stored so only requested models are created.
# RandomForest and XGBoost receive `random_state=seed` so reruns are reproducible.
model_specs = {
    'knn': (lambda: KNN(),
            'knn',
            lambda: KNN(n_neighbors=5)),  # default value
    'PLSR': (lambda: PLSRegression(scale=False),
             'plsr',
             lambda: PLSRegression(n_components=4, scale=False)),
    'rf': (lambda: RandomForestRegressor(random_state=seed),
           'rf',
           lambda: RandomForestRegressor(n_estimators=800, n_jobs=-1, random_state=seed)),
    'xgboost': (lambda: XGB.XGBRegressor(random_state=seed),
                'xgboost',
                lambda: XGB.XGBRegressor(n_estimators=800, n_jobs=-1, random_state=seed)),
    'svm': (lambda: SVR(),
            'svm',
            lambda: SVR(kernel='rbf')),
    'ridge': (lambda: Ridge(),
              'ridge',
              lambda: Ridge(alpha=0.1)),
    'elasticNet': (lambda: ElasticNet(),
                   'elasticNet',
                   lambda: ElasticNet(alpha=0.1)),
}

models = {}
for mdl in model_types:
    # Fail loudly on an unsupported name. Without this check an unknown entry
    # would silently reuse the `model` left over from the previous iteration
    # and store it under the wrong key.
    if mdl not in model_specs:
        raise ValueError('Unsupported model type %r; expected one of %s'
                         % (mdl, sorted(model_specs)))

    make_base, grid_key, make_default = model_specs[mdl]
    if grid_search:
        # No `scoring` is passed, so GridSearchCV ranks candidates with the
        # estimator's own `score` method rather than the Pearson metric
        # reported by the evaluation loop.
        model = GridSearchCV(estimator=make_base(),
                             param_grid=grid_search_params[grid_key],
                             cv=cv_folds,
                             n_jobs=n_cores)
    else:
        model = make_default()
    models[mdl] = model

Run the iterations:

In [8]:
def mixup_augmentation(x, y,
                       seed: int = 42,
                       n_synthetic: int = 1000,
                       alpha: float = 1):
    """Mixup augmentation strategy

    Each synthetic sample is a convex combination of two distinct, randomly
    chosen original samples; the same mixing coefficient is applied to the
    features and to the target.

    Parameters
    ----------
    x : numpy array
        input X block, shape (n_samples, n_features)
    y : numpy array
        input y block, 1-D with one entry per row of ``x``
    seed : int, optional
        seed of the random generator used to pick sample pairs and mixing
        coefficients, by default 42. The same seed always yields the same
        synthetic data; the global numpy random state is not touched.
    n_synthetic : int, optional
        number of synthetic data points to make, by default 1000
    alpha : float, optional
        controls the parameter for generating the mixup coefficient, by default 1 (which will draw lambda uniformly from 0 to 1)
        must be a positive number. The larger the number, the more the synthetic data deviates from the original

    Returns
    -------
    x_all : numpy array
        the original rows of ``x`` followed by the ``n_synthetic`` synthetic rows
    y_all : numpy array
        the original ``y`` followed by the matching synthetic targets

    Raises
    ------
    ValueError
        if ``x`` and ``y`` have different numbers of samples, or if synthetic
        samples are requested from fewer than two original samples
    """
    x = np.asarray(x)
    y = np.asarray(y)
    n_samples, n_features = x.shape
    if y.shape[0] != n_samples:
        raise ValueError('x has %d samples but y has %d' % (n_samples, y.shape[0]))

    # Local generator: previously `seed` was accepted but never used, so the
    # augmentation silently depended on the global numpy random state.
    rng = np.random.default_rng(seed)

    if n_synthetic > 0:
        if n_samples < 2:
            raise ValueError('mixup needs at least two samples to interpolate between')
        idx1 = rng.integers(0, n_samples, size=n_synthetic)
        # Adding an offset in [1, n_samples) modulo n_samples guarantees
        # idx2 != idx1 while keeping idx2 uniform over the remaining samples.
        idx2 = (idx1 + rng.integers(1, n_samples, size=n_synthetic)) % n_samples
    else:
        idx1 = idx2 = np.zeros(0, dtype=int)

    # alpha = beta means lambda is drawn symmetrically about 0.5
    lambda_ = rng.beta(alpha, alpha, size=n_synthetic)

    synthetic_x = lambda_[:, None] * x[idx1] + (1 - lambda_)[:, None] * x[idx2]
    synthetic_y = lambda_ * y[idx1] + (1 - lambda_) * y[idx2]

    x_all = np.vstack([x, synthetic_x])
    y_all = np.concatenate([y, synthetic_y])
    return x_all, y_all

In [None]:
# Outer cross-validation: every model is fit on each training fold (augmented
# with mixup) and scored by Pearson correlation on the train and test folds.
cv = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

result_columns = ['model_type', 'fold', 'train_pearson', 'test_pearson', 'best_params']
results_path = os.path.join(data_path, 'processed', 'da_coarse_model_tests.csv')

# Rows are collected in a plain list and the frame is rebuilt from it, instead
# of growing a DataFrame one `.loc` row at a time.
records = []
res = pd.DataFrame(columns=result_columns)
for model_type, model in tqdm(models.items()):
    print('Begun fitting and evaluation for model: %s'%model_type)
    for k, (train_index, test_index) in enumerate(cv.split(X)):
        x_train = X.iloc[train_index, :].values
        x_test = X.iloc[test_index, :].values
        y_train = Y.iloc[train_index, :].values.ravel()
        y_test = Y.iloc[test_index, :].values.ravel()

        # Augment the training fold only; the test fold stays untouched.
        # NOTE(review): when `model` is a GridSearchCV, its inner CV splits this
        # augmented set, so synthetic rows in an inner training split can be
        # interpolations of rows held out in the inner validation split. Inner
        # scores (and hence the chosen hyperparameters) may be optimistic — confirm
        # this is acceptable or move the augmentation inside the inner folds.
        x_train, y_train = mixup_augmentation(x=x_train,
                                              y=y_train,
                                              seed=seed + k,
                                              n_synthetic=1000,
                                              alpha=1)

        # fit model and evaluate in validation set
        model.fit(x_train, y_train)
        yhat_train = model.predict(x_train)
        yhat_test = model.predict(x_test)

        # train_pearson is computed on the augmented training set (original + synthetic rows)
        train_pearson = pair_pearsonr(y_train, yhat_train, axis=0).mean()
        test_pearson = pair_pearsonr(y_test, yhat_test, axis=0).mean()

        # Plain estimators (grid_search = False) have no `best_params_`; record None
        # instead of raising AttributeError after the fit has already been paid for.
        best_params = getattr(model, 'best_params_', None)

        records.append([model_type, k, train_pearson, test_pearson, best_params])
        res = pd.DataFrame(records, columns=result_columns)
        # Written after every fold so partial results survive an interrupted run.
        res.to_csv(results_path)



  0%|                                                     | 0/2 [00:00<?, ?it/s]

Begun fitting and evaluation for model: PLSR


