In [13]:
import os

from tqdm import tqdm 

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

from scipy.stats import pearsonr
import scipy.stats as stats
from sklearn.linear_model import LinearRegression

In [20]:
# Run-wide settings.
seed = 42     # random_state shared by KFold and the random-forest models below
n_cores = 20  # passed as n_jobs to the random forest

# Root directory of the project data. The default is the original cluster location;
# set the METASTATIC_POTENTIAL_DATA environment variable to point the notebook at a
# different copy (e.g. files pulled down with scp) without editing this cell.
data_path = os.environ.get('METASTATIC_POTENTIAL_DATA',
                           '/nobackup/users/hmbaghda/metastatic_potential/')

In [51]:
# Every input of this notebook is read from the 'interim' folder under data_path.
interim_dir = os.path.join(data_path, 'interim')

# Train/validation pool: features and targets stay as DataFrames because the
# cross-validation loop slices them positionally with .iloc.
X_train_val = pd.read_csv(os.path.join(interim_dir, 'X_train_val_selected.csv'), index_col=0)
y_train_val = pd.read_csv(os.path.join(interim_dir, 'y_train_val.csv'), index_col=0)

# Held-out test set: the targets are flattened to a 1-D NumPy array right away.
X_test = pd.read_csv(os.path.join(interim_dir, 'X_test_selected.csv'), index_col=0)
y_test = pd.read_csv(os.path.join(interim_dir, 'y_test.csv'), index_col=0).values.flatten()

# Table read from hyperparams.csv; its hyperparameter columns and 'mse' column are
# what the model-selection cell groups and averages.
res = pd.read_csv(os.path.join(interim_dir, 'hyperparams.csv'), index_col=0)

In [4]:
# Helper (kept disabled): prints one `scp` command per interim input file so the
# files can be copied from the cluster login node into the current directory.
# fns = [os.path.join(data_path, 'interim', 'X_train_val_selected.csv'),
#        os.path.join(data_path, 'interim', 'y_train_val.csv'),
#        os.path.join(data_path, 'interim', 'X_test_selected.csv'),
#        os.path.join(data_path, 'interim', 'y_test.csv'),
#        os.path.join(data_path, 'interim', 'hyperparams.csv')]
# for fn in fns:
#     print('scp hmbaghda@satori-login-002.mit.edu:' + fn + ' .')
# print('')

scp hmbaghda@satori-login-002.mit.edu:/nobackup/users/hmbaghda/metastatic_potential/interim/X_train_val_selected.csv .
scp hmbaghda@satori-login-002.mit.edu:/nobackup/users/hmbaghda/metastatic_potential/interim/y_train_val.csv .
scp hmbaghda@satori-login-002.mit.edu:/nobackup/users/hmbaghda/metastatic_potential/interim/X_test_selected.csv .
scp hmbaghda@satori-login-002.mit.edu:/nobackup/users/hmbaghda/metastatic_potential/interim/y_test.csv .
scp hmbaghda@satori-login-002.mit.edu:/nobackup/users/hmbaghda/metastatic_potential/interim/hyperparams.csv .



In [52]:
# Pick the hyperparameter combination with the lowest mean 'mse' over the rows of `res`.
# Missing entries are replaced by the placeholder string 'None' before grouping
# (presumably so that combinations using None are kept as groups), and the placeholder
# is turned back into a real None afterwards.
res_ = res.infer_objects(copy = False).fillna('None')
hyperparam_names = ['n_estimators', 'max_features', 'max_samples', 'max_depth', 'min_samples_split', 'min_samples_leaf']
best_mse_params = list(res_.groupby(hyperparam_names)['mse'].mean().idxmin())
best_mse_params = [x if x != 'None' else None for x in best_mse_params]
best_mse_params = dict(zip(hyperparam_names, best_mse_params))

# Cast these two to plain ints, but leave None alone: None is a legal value for both
# parameters and int(None) would raise a TypeError.
# NOTE(review): scikit-learn reads an int max_features as a feature count and a float
# as a fraction of the features, so this cast is only right if the search stored
# counts (e.g. 1.0 meaning "one feature") - confirm against the search notebook.
for hp in ['max_features', 'max_depth']:
    if best_mse_params[hp] is not None:
        best_mse_params[hp] = int(best_mse_params[hp])

best_mse_params

{'n_estimators': np.int64(850),
 'max_features': 1,
 'max_samples': None,
 'max_depth': 25,
 'min_samples_split': np.int64(2),
 'min_samples_leaf': np.int64(1)}

In [58]:
# Compare the tuned random forest with two baselines on the held-out test set.
#
# For each of the `n_splits` KFold splits of the train/validation pool, the training
# portion of the split is used to fit:
#   1. the random forest with the selected hyperparameters,
#   2. the same forest fitted on permuted targets (random baseline),
#   3. an ordinary least-squares model (linear baseline).
# The validation portion of each split is not used in this cell. Every model is scored
# on the same test set. The *_mse lists hold floats; the *_pearson / *_spearman lists
# hold the full scipy result objects (statistic and p-value), one entry per split.
rf_mse, rf_pearson, rf_spearman = [], [], []
baseline_mse, baseline_pearson, baseline_spearman = [], [], []
linear_mse, linear_pearson, linear_spearman = [], [], []

n_splits = 10
kf = KFold(n_splits=n_splits, shuffle = True, random_state = seed)

# One column of random-forest test predictions per split; float dtype so the frame
# does not fall back to object columns.
y_predictions = pd.DataFrame(columns = range(n_splits), index = range(y_test.shape[0]), dtype = float)

# Dedicated seeded generator for the target permutation, so the random baseline is
# reproducible from run to run (the global np.random state is never seeded here).
shuffle_rng = np.random.default_rng(seed)

# Both forests share exactly the same configuration; only the training targets differ.
rf_params = dict(n_estimators = best_mse_params['n_estimators'],
                 max_features = best_mse_params['max_features'],
                 max_samples = best_mse_params['max_samples'],
                 max_depth = best_mse_params['max_depth'],
                 min_samples_split = best_mse_params['min_samples_split'],
                 min_samples_leaf = best_mse_params['min_samples_leaf'],
                 n_jobs = n_cores,
                 random_state = seed)

for i, (train_index, _) in tqdm(enumerate(kf.split(X_train_val, y_train_val)), total = n_splits):

    # training portion of this split
    X_train = X_train_val.iloc[train_index]
    y_train = y_train_val.iloc[train_index].values.flatten()

    # ACTUAL MODEL: fit on the real targets, predict the test set
    model = RandomForestRegressor(**rf_params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # store results
    y_predictions.iloc[:, i] = y_pred
    rf_mse.append(mean_squared_error(y_test, y_pred))
    rf_pearson.append(pearsonr(y_test, y_pred))
    rf_spearman.append(stats.spearmanr(y_test, y_pred))

    # RANDOM BASELINE: same forest, but the training targets are permuted so that any
    # feature-target relationship in the training data is broken
    model_rand = RandomForestRegressor(**rf_params)
    y_train_shuffled = shuffle_rng.permutation(y_train)
    model_rand.fit(X_train, y_train_shuffled)

    # predict on the untouched test features
    y_pred_rand = model_rand.predict(X_test)

    baseline_mse.append(mean_squared_error(y_test, y_pred_rand))
    baseline_pearson.append(pearsonr(y_test, y_pred_rand))
    baseline_spearman.append(stats.spearmanr(y_test, y_pred_rand))

    # LINEAR BASELINE: ordinary least squares on the same training portion
    model_linear = LinearRegression()
    model_linear.fit(X_train, y_train)

    # predict on test values
    y_pred_linear = model_linear.predict(X_test)

    # get stats comparing predicted to test values
    linear_mse.append(mean_squared_error(y_test, y_pred_linear))
    linear_pearson.append(pearsonr(y_test, y_pred_linear))
    linear_spearman.append(stats.spearmanr(y_test, y_pred_linear))



10it [00:36,  3.61s/it]


In [62]:
# Rank correlation between the test targets and whatever `y_pred` currently holds
# (presumably the last split's random-forest predictions, since the loop rebinds
# `y_pred` on every iteration - TODO confirm after a fresh Restart & Run All).
stats.spearmanr(a = y_test, b = y_pred)

SignificanceResult(statistic=np.float64(0.18976256552574777), pvalue=np.float64(0.2065437533858929))

In [61]:
# Display the test-set predictions currently bound to `y_pred` (presumably those of
# the last split fitted by the evaluation loop, which rebinds the name each iteration).
y_pred

array([-1.56093728, -0.94204902, -1.6349899 , -1.41862062, -1.76053958,
       -1.18147114, -1.8719264 , -1.37790161, -1.94459033, -1.71356756,
       -1.56643192, -2.20652747, -1.46287334, -1.16829814, -1.29843077,
       -1.04836812, -1.89047599, -2.03876644, -0.78611356, -1.28246168,
       -2.28050832, -1.70636575, -1.46087174, -1.63282035, -1.67581432,
       -1.69941386, -1.89251902, -1.68891085, -1.79536695, -1.68181596,
       -0.98974806, -1.64609567, -1.61547171, -1.50081229, -1.78545324,
       -1.75446019, -1.71656779, -1.96649416, -1.72883712, -2.01120151,
       -1.24736955, -2.06782622, -1.49713663, -1.62209141, -1.38066736,
       -0.89321234])

In [29]:
# NOTE(review): second display of `y_pred`. Its execution count (In [29]) is lower than
# that of the evaluation loop (In [58]), so the output shown is presumably left over
# from an earlier run - consider removing this cell.
y_pred

array([-1.68769417, -0.82605535, -1.69000056, -1.3477291 , -1.70552701,
       -1.18819499, -1.76194308, -1.44865451, -1.93319881, -1.76671943,
       -1.48143044, -2.0771705 , -1.39989795, -1.30339735, -1.34194877,
       -1.00936808, -1.88674267, -2.04838034, -0.7545633 , -1.26668222,
       -2.2501269 , -1.62513068, -1.46957948, -1.62690455, -1.67585733,
       -1.59833999, -1.79910003, -1.63450723, -1.74151919, -1.6270904 ,
       -0.95610886, -1.75031271, -1.61299769, -1.40687082, -1.73162797,
       -1.74344652, -1.73968625, -2.00004862, -1.66168871, -1.9598469 ,
       -1.19895341, -1.97431925, -1.40940372, -1.50373621, -1.34616631,
       -1.03315551])