In [1]:
import os

from tqdm import tqdm 

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

from scipy.stats import pearsonr
import scipy.stats as stats
from sklearn.linear_model import LinearRegression

In [2]:
# Run configuration shared by the cells below.
seed = 42      # random_state for KFold and RandomForestRegressor
n_cores = 30   # n_jobs passed to RandomForestRegressor

# Project data root. Defaults to the original cluster location; set the
# METASTATIC_DATA_PATH environment variable to run the notebook elsewhere
# without editing this cell.
data_path = os.environ.get('METASTATIC_DATA_PATH',
                           '/nobackup/users/hmbaghda/metastatic_potential/')

In [3]:
# Load the train/validation and test matrices from <data_path>/interim.
# The first CSV column is used as the row index in every file.
interim_dir = os.path.join(data_path, 'interim')

X_train_val_all = pd.read_csv(os.path.join(interim_dir, 'depr_X_train_val.csv'), index_col=0)
y_train_val_all = pd.read_csv(os.path.join(interim_dir, 'depr_y_train_val.csv'), index_col=0)
X_test_all = pd.read_csv(os.path.join(interim_dir, 'X_test.csv'), index_col=0)

# The test target is kept as a flat 1-D array rather than a DataFrame.
y_test_frame = pd.read_csv(os.path.join(interim_dir, 'y_test.csv'), index_col=0)
y_test = y_test_frame.values.flatten()

In [74]:
# Build the working matrices from the frames loaded above. Previously X_train_val,
# y_train_val and X_test were used without ever being assigned (their only assignment
# lived in a commented-out cell), so the notebook failed on a fresh kernel. Working on
# copies also keeps the *_all frames pristine and makes this cell safe to re-run.
X_train_val = X_train_val_all.copy()
y_train_val = y_train_val_all.copy()
X_test = X_test_all.copy()

selected_features = pd.read_csv(os.path.join(data_path, 'interim', 'depr_selected_features.csv'), index_col = 0)
# alternative feature set:
# selected_features = pd.read_csv(os.path.join(data_path, 'interim', 'depr_selected_all_features.csv'), index_col = 0)

selected_feature_index = selected_features.index.tolist()
# alternative feature list:
# selected_feature_index = open(os.path.join(data_path, 'interim', 'depre_selected_train.txt')).read().splitlines()

# formatting R --> python: '.' in the exported names becomes '-'
selected_feature_index = [name.replace('.', '-') for name in selected_feature_index]
# 'X5S_rRNA' is mapped back to '5S_rRNA' (presumably R prefixed the leading digit
# with 'X' -- TODO confirm against the export script)
selected_feature_index = ['5S_rRNA' if name == 'X5S_rRNA' else name
                          for name in selected_feature_index]

# apply the same '.' -> '-' normalisation to the matrix columns so the names line up
X_train_val.columns = [col.replace('.', '-') for col in X_train_val.columns]
X_test.columns = [col.replace('.', '-') for col in X_test.columns]

# keep only the selected features, in the selected order (KeyError if any is missing)
X_train_val = X_train_val.loc[:, selected_feature_index]
X_test = X_test.loc[:, selected_feature_index]


In [10]:
# X_train_val_all = pd.read_csv(os.path.join(data_path, 'interim', 'X_train_val.csv'), index_col = 0)
# y_train_val_all = pd.read_csv(os.path.join(data_path, 'interim', 'y_train_val.csv'), index_col = 0)

# X_test_all = pd.read_csv(os.path.join(data_path, 'interim', 'X_test.csv'), index_col = 0)
# y_test = pd.read_csv(os.path.join(data_path, 'interim', 'y_test.csv'), index_col = 0)
# y_test = y_test.values.flatten()

# X_train_val = X_train_val_all.copy()
# y_train_val = y_train_val_all.copy()
# X_test = X_test_all.copy()

# import json
# with open(os.path.join(data_path, 'interim', 'depr_boruta_features.json'), 'r') as file:
#     selected_features_dict = json.load(file)
# selected_feature_index = selected_features_dict['0']['selected_features']
# X_train_val = X_train_val.loc[:, selected_feature_index]
# X_test = X_test.loc[:, selected_feature_index]

In [11]:
# res_ = res.infer_objects(copy = False).fillna('None')
# hyperparam_names = ['n_estimators', 'max_features', 'max_samples', 'max_depth', 'min_samples_split', 'min_samples_leaf']
# best_mse_params = list(res_.groupby(hyperparam_names)['mse'].mean().idxmin())
# best_mse_params = [x if x != 'None' else None for x in best_mse_params]
# best_mse_params = dict(zip(hyperparam_names, best_mse_params))

# for hp in ['max_features', 'max_depth']:
#     best_mse_params[hp] = int(best_mse_params[hp])

# best_mse_params

# Hard-coded random-forest hyperparameters, keyed by RandomForestRegressor argument name
# (presumably pasted from the commented-out grid-search summary above -- TODO confirm).
best_mse_params = dict(
    n_estimators=np.int64(850),
    max_features=1,
    max_samples=None,
    max_depth=25,
    min_samples_split=np.int64(2),
    min_samples_leaf=np.int64(1),
)



In [12]:
# Test-set evaluation: fit a random forest on the training portion of each CV fold
# and score every fitted model on the same held-out test set.
rf_mse = []
rf_pearson = []
rf_spearman = []

n_splits = 10
n_test_folds = 6  # folds actually evaluated; replaces the former `if i > 4: break` (folds 0-5)
kf = KFold(n_splits=n_splits, shuffle = True, random_state = seed)

# zip with range() caps the number of folds, and `total` gives tqdm a real progress bar
# (the fold generator has no length of its own).
fold_iter = zip(range(n_test_folds), kf.split(X_train_val, y_train_val))
for i, (train_index, val_index) in tqdm(fold_iter, total = n_test_folds):

    # only the training rows are needed here; the fold's validation rows are not used
    X_train = X_train_val.iloc[train_index]
    y_train = y_train_val.iloc[train_index].values.flatten()

    # Default-hyperparameter forest. For the tuned model use instead:
    #   RandomForestRegressor(**best_mse_params, n_jobs = n_cores, random_state = seed)
    model = RandomForestRegressor(n_jobs = n_cores,
                                  random_state = seed)

    # Fit on the real (unshuffled) labels. For a random baseline, fit on a shuffled copy:
    #   y_train_shuffled = y_train.copy(); np.random.shuffle(y_train_shuffled)
    model.fit(X_train, y_train)

    # predict on the held-out test set
    y_pred = model.predict(X_test)

    # store results (the correlation result objects are kept whole; read .statistic later)
    rf_mse.append(mean_squared_error(y_test, y_pred))
    rf_pearson.append(pearsonr(y_test, y_pred))
    rf_spearman.append(stats.spearmanr(y_test, y_pred))


5it [00:02,  1.78it/s]


In [13]:
# mean Pearson r over the per-fold results currently held in rf_pearson
np.mean([fold_result.statistic for fold_result in rf_pearson])

np.float64(0.03908738242198852)

In [72]:
# mean Pearson r over the per-fold results currently held in rf_pearson
np.mean([fold_result.statistic for fold_result in rf_pearson])

np.float64(0.8706469041422756)

In [42]:
# mean Pearson r over the per-fold results currently held in rf_pearson
np.mean([fold_result.statistic for fold_result in rf_pearson])

np.float64(0.1840716432473156)

In [22]:
# mean Pearson r over the per-fold results currently held in rf_pearson
np.mean([fold_result.statistic for fold_result in rf_pearson])

np.float64(0.7338203011812585)

Validation (10-fold cross-validation on the train/validation set):

In [146]:
# Cross-validation: for each of the n_splits folds, fit a random forest on the fold's
# training rows and score it on the fold's held-out validation rows.
rf_mse = []
rf_pearson = []
rf_spearman = []

n_splits = 10
kf = KFold(n_splits=n_splits, shuffle = True, random_state = seed)

# `total` gives tqdm a real progress bar (the fold generator has no length of its own)
for i, (train_index, val_index) in tqdm(enumerate(kf.split(X_train_val, y_train_val)), total = n_splits):

    X_train, X_val = X_train_val.iloc[train_index], X_train_val.iloc[val_index]
    # targets are flattened to 1-D arrays for fitting and for the metric functions
    y_train = y_train_val.iloc[train_index].values.flatten()
    y_val = y_train_val.iloc[val_index].values.flatten()

    # Default-hyperparameter forest. For the tuned model use instead:
    #   RandomForestRegressor(**best_mse_params, n_jobs = n_cores, random_state = seed)
    model = RandomForestRegressor(n_jobs = n_cores,
                                  random_state = seed)

    # Fit on the real (unshuffled) labels. For a random baseline, fit on a shuffled copy:
    #   y_train_shuffled = y_train.copy(); np.random.shuffle(y_train_shuffled)
    model.fit(X_train, y_train)

    # predict on this fold's validation rows
    y_pred = model.predict(X_val)

    # store results (the correlation result objects are kept whole; read .statistic later)
    rf_mse.append(mean_squared_error(y_val, y_pred))
    rf_pearson.append(pearsonr(y_val, y_pred))
    rf_spearman.append(stats.spearmanr(y_val, y_pred))

10it [00:05,  1.89it/s]


In [147]:
# mean Pearson r over the per-fold results currently held in rf_pearson
np.mean([fold_result.statistic for fold_result in rf_pearson])

np.float64(0.5809239286874526)

In [139]:
# mean Pearson r over the per-fold results currently held in rf_pearson
np.mean([fold_result.statistic for fold_result in rf_pearson])

np.float64(0.6032467406260676)

In [103]:
# mean Pearson r over the per-fold results currently held in rf_pearson
np.mean([fold_result.statistic for fold_result in rf_pearson])

np.float64(0.6343411831398538)