In [135]:
# import torch
import argparse
import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import plotnine as p9
import xgboost as XGB
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.decomposition import SparsePCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso,Ridge,ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.svm import SVR, LinearSVR
from tqdm import tqdm


In [2]:
# Command-line interface from the script form of this analysis, kept for reference;
# in the notebook the same settings are assigned directly in the "params" section below.
# NOTE: if this is re-enabled, argparse's `type=bool` converts any non-empty string
# (including "False") to True, so `--grid_search False` would not disable the search.
# parser = argparse.ArgumentParser(description='Run different ML models')
# parser.add_argument('--model_types', metavar='N', type=str, nargs='*', help='models to train',default=['PLSR','knn' ,'svmLinear','svmRBF','svmPoly','lasso','ridge','elasticNet','neuralNet','xgboost','rf'])
# parser.add_argument('--X_path', action='store', type=str,help='path to the input training data',default='X.csv')
# parser.add_argument('--Y_path', action='store', type=str,help='path to the output training data',default='y.csv')
# parser.add_argument('--num_folds', action='store', type=int,help='number of folds',default=10)
# parser.add_argument('--res_dir', action='store', type=str,help='Results directory',default='/nobackup/users/hmbaghda/metastatic_potential/interim/')
# parser.add_argument('--seed', action='store', type=int,help='seed',default=42)
# parser.add_argument('--grid_search', action='store', type=bool,help='Nested Hyperparameter Tuning',default=True)
# parser.add_argument('--hyperparam_folds', action='store', type=int,help='number of hyperparameter tuning folds',default=5)
# parser.add_argument('--n_cores', action='store', type=int,help='number of cores for parallelization',default=20)

# args = parser.parse_args()
# model_types = args.model_types
# num_folds = args.num_folds
# X_path = args.X_path
# Y_path = args.Y_path
# res_dir= args.res_dir
# seed = args.seed
# cv_folds = args.hyperparam_folds

# params
# Root data directory; every other path below is derived from it.
data_path = '/nobackup/users/hmbaghda/metastatic_potential/'
res_dir = os.path.join(data_path, 'interim')  # not referenced by the cells shown in this notebook section

hvg_selection = False # whether to filter for HVGs
hvg_name = '' if not hvg_selection else 'hvg_'  # filename prefix: 'hvg_' selects the HVG-filtered expression file
X_path = os.path.join(data_path, 'processed',  hvg_name + 'expr.csv')  # feature table (transposed after loading)
Y_path = os.path.join(data_path, 'processed', 'metastatic_potential.csv')  # target table

seed = 42  # random seed (passed as random_state to the KFold split)

num_folds = 10  # number of outer cross-validation folds (KFold n_splits)
grid_search = True  # not referenced by the cells shown in this notebook section
cv_folds = 5  # number of inner folds used by GridSearchCV
n_cores = 30  # used for GridSearchCV n_jobs and for the thread-count environment variables

# NOTE: the second assignment replaces the first, so only ['PLSR', 'ridge'] takes effect.
model_types = ['PLSR','elasticNet', 'svm', 
               'rf', 'xgboost', 'knn']
model_types = ['PLSR', 'ridge']


In [4]:
# Request `n_cores` threads from each numerical backend through its environment variable.
# NOTE(review): such variables are usually read when the library is first loaded, and
# numpy/scikit-learn are imported in an earlier cell -- confirm the setting takes effect here.
os.environ.update({
    thread_var: str(n_cores)
    for thread_var in (
        "OMP_NUM_THREADS",
        "MKL_NUM_THREADS",
        "OPENBLAS_NUM_THREADS",
        "VECLIB_MAXIMUM_THREADS",
        "NUMEXPR_NUM_THREADS",
    )
})

In [5]:
# Read the target and feature tables; in both files the first CSV column becomes the index.
# The feature table is transposed after reading. Later cells select rows of X and Y with the
# same positional indices -- NOTE(review): this assumes both tables list samples in the
# same order; confirm against the preprocessing step.
Y = pd.read_csv(Y_path, index_col=0)
X = pd.read_csv(X_path, index_col=0).transpose()

In [7]:
def pearson_r(y_true, y_pred):
    """Pearson correlation between two torch tensors, computed along dim 0.

    Parameters
    ----------
    y_true, y_pred : torch.Tensor
        Tensors of matching shape; the correlation is taken over the first
        dimension, independently for every remaining position.

    Returns
    -------
    torch.Tensor
        One correlation coefficient per position of the remaining dimensions
        (no averaging is applied). An input with zero variance along dim 0
        yields NaN because the denominator is zero.
    """
    # The module-level `import torch` is commented out, so the name is brought
    # into scope here; without it every call failed with a NameError.
    import torch

    true_centered = y_true - torch.mean(y_true, dim=0)
    pred_centered = y_pred - torch.mean(y_pred, dim=0)

    r_num = torch.sum(true_centered * pred_centered, dim=0)
    true_square_sum = torch.sum(true_centered * true_centered, dim=0)
    pred_square_sum = torch.sum(pred_centered * pred_centered, dim=0)
    r_den = torch.sqrt(true_square_sum * pred_square_sum)
    return r_num / r_den

def pair_pearsonr(x, y, axis=0):
    """Pearson correlation between matching slices of `x` and `y`.

    Both inputs are centred along `axis`; the correlation is the sum of the
    products of the centred values divided by the square root of the product
    of their sums of squares. The reduced axis is dropped from the result, so
    1-D inputs give a scalar and 2-D inputs give one coefficient per slice.
    A slice with zero variance produces NaN (division by zero).
    """
    x_centered = x - np.mean(x, axis=axis, keepdims=True)
    y_centered = y - np.mean(y, axis=axis, keepdims=True)

    cross_sum = np.sum(x_centered * y_centered, axis=axis)
    x_square_sum = np.sum(x_centered * x_centered, axis=axis)
    y_square_sum = np.sum(y_centered * y_centered, axis=axis)

    return cross_sum / np.sqrt(x_square_sum * y_square_sum)

def getSamples(N, batchSize):
    """Split a random permutation of `range(N)` into consecutive mini-batches.

    Parameters
    ----------
    N : int
        Number of samples to shuffle (forwarded to `np.random.permutation`).
    batchSize : int
        Maximum number of indices per batch; must be at least 1.

    Returns
    -------
    list of numpy.ndarray
        Index arrays of length `batchSize`, except possibly the last one,
        which holds the remainder. Empty list when there is nothing to split.

    Raises
    ------
    ValueError
        If `batchSize` is smaller than 1. The slicing loop used previously
        never consumed the permutation in that case and ran forever.
    """
    if batchSize < 1:
        raise ValueError(f"batchSize must be a positive integer, got {batchSize!r}")

    # Uses numpy's global RNG, so np.random.seed(...) controls the shuffling.
    order = np.random.permutation(N)
    return [order[start:start + batchSize]
            for start in range(0, len(order), batchSize)]

def L2Regularization(deepLearningModel, L2):
    """Sum of squared weights and biases over the Linear layers, scaled by `L2`.

    Parameters
    ----------
    deepLearningModel : iterable of torch.nn.Module
        Iterated layer by layer (e.g. a `torch.nn.Sequential`); only
        `torch.nn.Linear` layers contribute to the penalty.
    L2 : float
        Regularisation strength applied to both weights and biases.

    Returns
    -------
    torch.Tensor or float
        The penalty term; the plain float 0.0 when no Linear layer is found.
    """
    # The module-level `import torch` is commented out, so the name is brought
    # into scope here; without it every call failed with a NameError.
    import torch

    weightLoss = 0.
    biasLoss = 0.
    for layer in deepLearningModel:
        if not isinstance(layer, torch.nn.Linear):
            continue
        weightLoss = weightLoss + L2 * torch.sum(layer.weight ** 2)
        # Linear(..., bias=False) stores bias as None; squaring it raised a TypeError.
        if layer.bias is not None:
            biasLoss = biasLoss + L2 * torch.sum(layer.bias ** 2)
    return biasLoss + weightLoss

In [127]:
# PLS latent variables -> random forest, scored with nested cross-validation.
# Outer loop: KFold with `num_folds` shuffled splits over the rows of X / Y.
# Inner loop: GridSearchCV with `cv_folds` folds, fitted on the PLS scores of the
# outer-training rows only.

# Previously tried SVR search space, kept for reference:
# nonlinear_params={
#     'kernel': ['rbf', 'poly'],
#     'C': [10**i for i in range(-3, 3)],
#     'degree': [2,3,4,5],  # only for poly
# }
# nonlinear_model = GridSearchCV(estimator=SVR(gamma = 0.001, kernel = 'rbf'),
#                          param_grid = nonlinear_params,
#                          cv=cv_folds,
#                          n_jobs=n_cores)
nonlinear_params = {'n_estimators': [100, 200]}
# Seed the forest: left unseeded, the selected hyperparameters and the reported
# correlations change from run to run even though the KFold split is seeded.
nonlinear_model = GridSearchCV(estimator=RandomForestRegressor(random_state=seed),
                               param_grid=nonlinear_params,
                               cv=cv_folds,
                               n_jobs=n_cores)
pls_model = PLSRegression(n_components=30, scale=False)

result_columns = ['fold', 'train_pearson', 'test_pearson', 'best_params']
fold_records = []  # one dict per outer fold; converted to `res` once after the loop
cv = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
for k, (train_index, test_index) in enumerate(cv.split(X)):
    x_train = X.iloc[train_index, :].values
    x_test = X.iloc[test_index, :].values
    # ravel() flattens the target to 1-D -- assumes Y holds a single target column (TODO confirm)
    y_train = Y.iloc[train_index, :].values.ravel()
    y_test = Y.iloc[test_index, :].values.ravel()

    # get LVs from PLS (fitted on the training fold only)
    pls_model.fit(x_train, y_train)
    pls_lvs_train = pls_model.x_scores_

    # hyperparameter tuning
    nonlinear_model.fit(pls_lvs_train, y_train)

    # predict
    pls_lvs_test = pls_model.transform(x_test)
    yhat_train = nonlinear_model.predict(pls_lvs_train)
    yhat_test = nonlinear_model.predict(pls_lvs_test)

    train_pearson = pair_pearsonr(y_train, yhat_train, axis=0).mean()
    test_pearson = pair_pearsonr(y_test, yhat_test, axis=0).mean()
    print(test_pearson)

    fold_records.append({
        'fold': k,
        'train_pearson': train_pearson,
        'test_pearson': test_pearson,
        'best_params': nonlinear_model.best_params_,
    })

# Build the results table in one step instead of enlarging a DataFrame row by row.
res = pd.DataFrame(fold_records, columns=result_columns)
# res.to_csv(os.path.join(data_path, 'processed', 'plsr_svr.csv'))

0.3021289880432848
0.31614282477540595
0.5693814903942188
0.24535389078449696
0.5765393989242495
0.14676192292619647
0.36886484039028905
0.2692737685030734
0.5266563874662774
0.27114892900524024


# Startm

In [145]:
# PCA latent variables -> random forest (prototype: only the first outer fold is prepared).

# Previously tried SVR search space, kept for reference:
# nonlinear_params={
#     'kernel': ['rbf', 'poly'],
#     'C': [10**i for i in range(-3, 3)],
#     'degree': [2,3,4,5],  # only for poly
# }
# nonlinear_model = GridSearchCV(estimator=SVR(gamma = 0.001, kernel = 'rbf'),
#                          param_grid = nonlinear_params,
#                          cv=cv_folds,
#                          n_jobs=n_cores)
nonlinear_params = {'n_estimators': [100, 200]}
# Seeded so that repeated runs select the same hyperparameters.
nonlinear_model = GridSearchCV(estimator=RandomForestRegressor(random_state=seed),
                               param_grid=nonlinear_params,
                               cv=cv_folds,
                               n_jobs=n_cores)
# scikit-learn's PCA has no `n_jobs` parameter (SparsePCA does), so passing it
# raised a TypeError; random_state pins the result if a randomized solver is chosen.
linear_model = PCA(n_components=50, random_state=seed)


res = pd.DataFrame(columns=['fold', 'train_pearson', 'test_pearson', 'best_params'])
cv = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
# Take only the first outer split (k == 0); the following cells work on this single fold.
k, (train_index, test_index) = next(enumerate(cv.split(X)))
x_train = X.iloc[train_index, :].values
x_test = X.iloc[test_index, :].values
# ravel() flattens the target to 1-D -- assumes Y holds a single target column (TODO confirm)
y_train = Y.iloc[train_index, :].values.ravel()
y_test = Y.iloc[test_index, :].values.ravel()


In [146]:
# Fit the projection on the training rows of the first fold only.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so the
# saved notebook does not show this fit completing.
linear_model.fit(x_train)

KeyboardInterrupt: 

In [None]:
# Display the estimator (its repr shows the configured parameters).
linear_model

In [None]:

# Single-fold evaluation: relies on x_train / x_test / y_train / y_test, k, res,
# nonlinear_model and pls_model already existing in the kernel from earlier cells.
# NOTE(review): this cell fits and projects with `pls_model`, not with the
# `linear_model` prepared in the preceding cells -- confirm which projection is intended.

# get LVs from PLS
pls_model.fit(x_train, y_train)
pls_lvs_train = pls_model.x_scores_

# hyperparameter tuning
nonlinear_model.fit(pls_lvs_train, y_train)

# predict
pls_lvs_test = pls_model.transform(x_test)
yhat_train = nonlinear_model.predict(pls_lvs_train)
yhat_test = nonlinear_model.predict(pls_lvs_test)

# Pearson correlation between observed and predicted targets on each split
train_pearson=pair_pearsonr(y_train, yhat_train, axis=0).mean()
test_pearson=pair_pearsonr(y_test, yhat_test, axis=0).mean()
print(test_pearson)

# Append this fold's scores and the selected hyperparameters as a new row of `res`
res.loc[res.shape[0], :] = [k, train_pearson, test_pearson, nonlinear_model.best_params_]
#     res.to_csv(os.path.join(data_path, 'processed', 'plsr_svr.csv'))