Here we do nested 10-fold CV. We have an outter loop split into (train + validation, test)  where we have unseen test data. We then have an inner loop splitting the outter train + validation into (train, validation). In the inner loop we do the following:

1. Feature selection: This is conducted only on the train data (to allow for generalization to validation during hyperparameter tuning). We use 10-fold elastic net CV on the train data, selecting those features that were retained 80% of the time across 100 iterations. CV chooses the best alpha, while we set the L1 ratio to 0.1 to help ensure that approximately 10-20% of the features are selected for. 
2. Hyperparameter tuning: The model is fit on the training data for the selected features, and assessed on the validation data. Across the inner 10-folds, we identify the set of hyperparameters that minimize the MSE. 



In [1]:
import os
from multiprocessing import Pool

from tqdm import trange 

import pandas as pd
import numpy as np

from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
n_cores = 20
seed = 42
data_path = '/nobackup/users/hmbaghda/metastatic_potential/'
res_all_fn = os.path.join(data_path, 'interim', 'iteration_all_res.json')

In [4]:
n_splits = 10

# feature selection params
n_iter_elastic = int(1e2)
feature_thresh = 0.8
par_feature = True # parallelization on feature selection

In [5]:
def write_res(res_all):
    with open(res_all_fn, "w") as json_file:
        json.dump(res_all, json_file, indent=4)  

def elastic_net_iteration(X, y, seed_, n_splits, n_cores):
    """
    Perform a single iteration of ElasticNetCV for feature selection.

    Parameters:
        args (tuple): A tuple containing (X_train, y_train, random_seed).

    Returns:
        np.array: Binary mask indicating selected features.
    """
    elastic_net = ElasticNetCV(l1_ratio = 0.1, 
                               cv=n_splits, 
                               n_alphas = 100,
                               random_state=seed, 
                               n_jobs = n_cores)
    elastic_net.fit(X, y.values.ravel())
    selected_features = np.where(elastic_net.coef_ != 0)[0]
    return selected_features

In [7]:
y = pd.read_csv(os.path.join(data_path, 'processed', 'metastatic_potential.csv'), index_col = 0)
X = pd.read_csv(os.path.join(data_path, 'processed', 'expr.csv'), index_col = 0).transpose()

if os.path.isfile(res_all_fn):
    with open(res_all_fn, 'r') as file:
        res_all = json.load(file)
else:
    res_all = {}

In [1]:
outer_idx = 0
inner_idx = 0

In [9]:
outer_cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
for outer_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X)):
    res_all[outer_idx] = {}
    X_outer_train, X_outer_test = X.iloc[train_idx,:], X.iloc[test_idx,:]
    y_outer_train, y_outer_test = y.iloc[train_idx,:], y.iloc[test_idx,:]

    # set up inner loop
    inner_cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    inner_selected_features = []
    inner_best_params = []
    
    for inner_idx, (inner_train_idx, inner_val_idx) in enumerate(inner_cv.split(X_outer_train)):
        print('outer: {}', 'inner: {}'.format(outer_idx, inner_idx))
        res_all[outer_idx][inner_idx] = {}
        X_inner_train, X_inner_val = X_outer_train.iloc[inner_train_idx,:], X_outer_train.iloc[inner_val_idx,:]
        y_inner_train, y_inner_val = y_outer_train.iloc[inner_train_idx,:], y_outer_train.iloc[inner_val_idx,:]
        
        # FEATURE SELECTION - on inner train
        if 'selected_features' not in res_all[outer_idx][inner_idx]:
            ec_seeds = range(n_iter_elastic)
            if not par_feature:
                selected_features_res = []
                for feature_iter in trange(n_iter_elastic):
                    selected_features = elastic_net_iteration(X = X_inner_train, 
                                                              y = y_inner_train, 
                                                              seed_ = ec_seeds[feature_iter], 
                                                              n_splits = n_splits,
                                                              n_cores = n_cores)
                    selected_features_res.append(selected_features)
            else:
                pool = Pool(processes = min(n_cores, n_iter_elastic))
                par_inputs = [(X_inner_train, y_inner_train, seed, n_splits, 1) for seed in ec_seeds]
                selected_features_res = pool.starmap(elastic_net_iteration, par_inputs)
                del par_inputs
                pool.close()
                pool.join()
                gc.collect()


            feature_counter = dict(zip(list(range(X_inner_train.shape[1])), [0]*X_inner_train.shape[1]))
            for selected_features in selected_features_res:
                for sf_idx in selected_features:
                    feature_counter[sf_idx] += 1
            selected_features = [k for k,v in feature_counter.items() if v >= (n_iter_elastic * feature_thresh)]
            selected_features = X_inner_train.columns[selected_features].tolist()

            res_all[outer_idx][inner_idx]['selected_features'] = selected_features
            write_res(res_all) # checkpoint 1
        # HYPERPARAMETER TUNING

outer: {} inner: 0


Process ForkPoolWorker-17:
Traceback (most recent call last):
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
Process ForkPoolWorker-4:
Traceback (most recent call last):
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
                    ^^^^^^^^^^^^^^^^^^^
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Process ForkPoolWorker-5:
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
Process ForkPo

Process ForkPoolWorker-19:
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/sklearn/linear_model/_coordinate_descent.py", line 2407, in fit
    return super().fit(X, y, sample_weight=sample_weight, **params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
                    ^^^^^^^^^^^^^^^^^^^
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3

  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/joblib/parallel.py", line 1918, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/joblib/parallel.py", line 1847, in _get_sequential_output
    res = func(*args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^
Process ForkPoolWorker-10:
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/sklearn/utils/parallel.py", line 136, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/joblib/parallel.py", line 1847, in _get_sequential_output
    res = func(*args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_

Process ForkPoolWorker-11:
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
                    ^^^^^^^^^^^^^^^^^^^
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/multiprocessing/pool.py", line 51, in starmapstar
    return list(itertools.starmap(args[0], args[1]))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_968057/2323763638.py", line 20, in elastic_net_iteration
    elastic_net.fit(X, y.values.ravel())
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/sklea

  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/sklearn/utils/parallel.py", line 74, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/joblib/parallel.py", line 1918, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/joblib/parallel.py", line 1847, in _get_sequential_output
    res = func(*args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/sklearn/linear_model/_coordinate_descent.py", line 697, in enet_path
    model = cd_fast.enet_coordinate_descent(
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_968057/232376

  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/sklearn/utils/parallel.py", line 74, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
                    ^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/joblib/parallel.py", line 1918, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/multiprocessing/pool.py", line 51, in starmapstar
    return list(itertools.starmap(args[0], args[1]))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/nobackup/users/hmbaghda/Software/mi

  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/sklearn/linear_model/_coordinate_descent.py", line 2407, in fit
    return super().fit(X, y, sample_weight=sample_weight, **params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/sklearn/linear_model/_coordinate_descent.py", line 1784, in fit
    mse_paths = Parallel(
                ^^^^^^^^^
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/sklearn/linear_model/_coordinate_descent.py", line 1784, in fit
    mse_paths = Parallel(
                ^^^^^^^^^
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp

  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/sklearn/linear_model/_coordinate_descent.py", line 1784, in fit
    mse_paths = Parallel(
                ^^^^^^^^^
KeyboardInterrupt
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/multiprocessing/pool.py", line 51, in starmapstar
    return list(itertools.starmap(args[0], args[1]))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_968057/2323763638.py", line 20, in elastic_net_iteration
    elastic_net.fit(X, y.values.ravel())
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_968057/2323763638.py", line 20, in elastic_net_iteration
    elastic_net.fit(X, y.values.ravel())
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib

KeyboardInterrupt
KeyboardInterrupt
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/sklearn/utils/parallel.py", line 136, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/sklearn/linear_model/_coordinate_descent.py", line 1464, in _path_residuals
    alphas, coefs, _ = path(X_train, y_train, **path_params)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 186, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/nobackup/users/hmbaghda/Software/miniforge3/envs/mp_2/lib/python3.12/site-packages/sklearn/linear_model/_coordinate_descent.py", line 697, in enet_path
    model = cd_fast.enet_coordinate_descent(
            ^^^^^^^^^^^^^^

KeyboardInterrupt: 

In [236]:
res_all[outer_idx] = {}

In [239]:
write_res(res_all)

In [209]:
n_splits = 3
n_iter_elastic = 2
par = False

In [228]:
res_all = {}
import json

In [226]:
import json
with open(os.path.join(data_path, 'interim', 'iteration_all_res.json'), 'r') as file:
    selected_features_dict = json.load(file)

{0: {0: {'selected_features': ['AAAS',
    'ACER2P1',
    'ACNATP',
    'ACYP1',
    'ADAM18',
    'ADGRL4',
    'ADH5P5',
    'AGTR1',
    'ALDH9A1',
    'AMY2B',
    'ANKDD1B',
    'ANKRD33BP10',
    'ANO3',
    'APOL5',
    'ARF4-AS1',
    'ARGFX',
    'ARPP19P2',
    'ARSDP1',
    'ARSLP1',
    'ASH1L-IT1',
    'ASXL2',
    'ATE1',
    'ATG12P1',
    'ATP5MFP6',
    'ATP5PBP1',
    'ATP6V0E1P1',
    'BACH1',
    'BAG2',
    'BCAP31P2',
    'BCRP4',
    'BHMT2',
    'BMF',
    'BRD7P7',
    'BTC',
    'BTF3-DT',
    'C17orf75',
    'C3orf49',
    'C9orf24',
    'CA3',
    'CACNA2D1-AS1',
    'CARD8',
    'CDH6',
    'CDH8-AS1',
    'CELF4',
    'CH25H',
    'CLCP1',
    'CLSTN3',
    'CNIH3',
    'COPRSP1',
    'COX5BP6',
    'CPSF3',
    'CRTC1P1',
    'CSRP2P2',
    'CT66',
    'CTAGE9',
    'CTNNA2',
    'CXCL6',
    'CYP2C8',
    'CYP2E1',
    'DENND2B-AS1',
    'DIO2',
    'DMGDH',
    'DNAJC19P7',
    'DNAJC28',
    'DNAJC8P4',
    'DNMBP',
    'DPPA4',
    'DPY19L2P4',
    'D

In [217]:
len(og)

397

In [187]:
selected_features

[14,
 15,
 58,
 134,
 248,
 355,
 442,
 914,
 955,
 957,
 1025,
 1120,
 1240,
 1339,
 1436,
 1559,
 1694,
 1704,
 1808,
 1814,
 1878,
 1880,
 2078,
 2175,
 2190,
 2259,
 2591,
 2669,
 2848,
 2850,
 3026,
 3098,
 3112,
 3201,
 3236,
 3327,
 3373,
 3417,
 3449,
 3615,
 3760,
 3956,
 4079,
 4132,
 4141,
 4223,
 4462,
 4570,
 4778,
 4882,
 4914,
 4967,
 5011,
 5032,
 5201,
 5302,
 5426,
 5548,
 5646,
 5770,
 5865,
 5883,
 5914,
 6005,
 6024,
 6059,
 6095,
 6144,
 6149,
 6153,
 6161,
 6354,
 6440,
 6502,
 6544,
 6570,
 6585,
 6732,
 6843,
 6892,
 6903,
 6966,
 6970,
 6994,
 6995,
 7006,
 7032,
 7069,
 7096,
 7163,
 7264,
 7287,
 7328,
 7330,
 7347,
 7371,
 7430,
 7436,
 7591,
 7648,
 7699,
 7701,
 7749,
 7872,
 7874,
 7932,
 7965,
 7979,
 8025,
 8160,
 8200,
 8203,
 8238,
 8382,
 8661,
 8787,
 8794,
 8846,
 8890,
 8961,
 8963,
 9258,
 9302,
 9311,
 9352,
 9721,
 9757,
 9845,
 9847,
 9984,
 10014,
 10064,
 10164,
 10307,
 10308,
 10371,
 10417,
 10437,
 10464,
 10539,
 10632,
 10644,
 10769,

In [169]:
selected_features_all = pd.DataFrame(columns = range(X_inner_train.shape[1]), 
                                     index = range(n_iter_elastic))
selected_features_all = selected_features_all.infer_objects(copy=False).fillna(0)
for idx, selected_features in enumerate(selected_features_res):
    selected_features_all.iloc[feature_iter, selected_features] = 1

KeyboardInterrupt: 

In [167]:
selected_features_res[0] == selected_features_res[1]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [130]:
len(selected_features)

397

In [127]:
len(selected_features)

632

In [104]:
selected_features

(array([   14,    15,    58,   134,   214,   242,   248,   355,   442,
          520,   648,   666,   945,   955,   957,  1025,  1041,  1339,
         1347,  1364,  1436,  1526,  1559,  1651,  1694,  1704,  1808,
         1814,  1878,  1880,  2078,  2175,  2190,  2259,  2476,  2501,
         2525,  2591,  2607,  2669,  2765,  2789,  2809,  2816,  2848,
         2850,  3026,  3098,  3112,  3236,  3327,  3373,  3417,  3449,
         3459,  3507,  3615,  3728,  3760,  3762,  3779,  3895,  3956,
         4044,  4066,  4079,  4120,  4132,  4141,  4223,  4234,  4267,
         4310,  4462,  4561,  4570,  4603,  4738,  4882,  4914,  4960,
         4964,  4967,  5011,  5032,  5086,  5103,  5154,  5201,  5302,
         5548,  5600,  5627,  5646,  5677,  5734,  5770,  5802,  5865,
         5883,  5914,  5933,  6005,  6018,  6024,  6059,  6095,  6096,
         6144,  6149,  6153,  6161,  6209,  6332,  6354,  6440,  6502,
         6544,  6585,  6732,  6843,  6892,  6903,  6920,  6994,  6995,
      

1.6

In [63]:
elastic_net.alpha_

1.1239541562839666

In [67]:
og == selected_features

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [65]:
selected_features

array([   14,    15,    58,   134,   248,   355,   442,   914,   955,
         957,  1025,  1120,  1240,  1339,  1436,  1559,  1694,  1704,
        1808,  1814,  1878,  1880,  2078,  2175,  2190,  2259,  2591,
        2669,  2848,  2850,  3026,  3098,  3112,  3201,  3236,  3327,
        3373,  3417,  3449,  3615,  3760,  3956,  4079,  4132,  4141,
        4223,  4462,  4570,  4778,  4882,  4914,  4967,  5011,  5032,
        5201,  5302,  5426,  5548,  5646,  5770,  5865,  5883,  5914,
        6005,  6024,  6059,  6095,  6144,  6149,  6153,  6161,  6354,
        6440,  6502,  6544,  6570,  6585,  6732,  6843,  6892,  6903,
        6966,  6970,  6994,  6995,  7006,  7032,  7069,  7096,  7163,
        7264,  7287,  7328,  7330,  7347,  7371,  7430,  7436,  7591,
        7648,  7699,  7701,  7749,  7872,  7874,  7932,  7965,  7979,
        8025,  8160,  8200,  8203,  8238,  8382,  8661,  8787,  8794,
        8846,  8890,  8961,  8963,  9258,  9302,  9311,  9352,  9721,
        9757,  9845,

In [55]:
elastic_net.alpha_

1.1239541562839666

In [None]:
X_train

In [48]:
help(ElasticNetCV)

Help on class ElasticNetCV in module sklearn.linear_model._coordinate_descent:

class ElasticNetCV(sklearn.base.RegressorMixin, LinearModelCV)
 |  ElasticNetCV(*, l1_ratio=0.5, eps=0.001, n_alphas=100, alphas=None, fit_intercept=True, precompute='auto', max_iter=1000, tol=0.0001, cv=None, copy_X=True, verbose=0, n_jobs=None, positive=False, random_state=None, selection='cyclic')
 |
 |  Elastic Net model with iterative fitting along a regularization path.
 |
 |  See glossary entry for :term:`cross-validation estimator`.
 |
 |  Read more in the :ref:`User Guide <elastic_net>`.
 |
 |  Parameters
 |  ----------
 |  l1_ratio : float or list of float, default=0.5
 |      Float between 0 and 1 passed to ElasticNet (scaling between
 |      l1 and l2 penalties). For ``l1_ratio = 0``
 |      the penalty is an L2 penalty. For ``l1_ratio = 1`` it is an L1 penalty.
 |      For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2
 |      This parameter can be a list, in which case the dif

In [None]:


# Outer CV loop
n_splits = 10
outer_cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
outer_results = []

for train_idx, test_idx in outer_cv.split(X):
    X_outer_train, X_outer_test = X[train_idx], X[test_idx]
    y_outer_train, y_outer_test = y[train_idx], y[test_idx]

    # Inner CV loop
    inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
    inner_selected_features = []
    inner_best_params = []

    for inner_train_idx, inner_val_idx in inner_cv.split(X_outer_train):
        X_inner_train, X_inner_val = X_outer_train[inner_train_idx], X_outer_train[inner_val_idx]
        y_inner_train, y_inner_val = y_outer_train[inner_train_idx], y_outer_train[inner_val_idx]

        # Step 1: Feature selection with ElasticNetCV -- replace with calling R, or see chatgpt for python version
        # iterate many times
        elastic_net = ElasticNetCV(cv=n_splits, random_state=42)
        elastic_net.fit(X_inner_train, y_inner_train)

        # Identify selected features (non-zero coefficients)
#         selected_features = np.where(elastic_net.coef_ != 0)[0]
#         selected_features = iterative_elastic_net(X_inner_train, y_inner_train, num_iterations=100, selection_threshold=0.8)
        selected_features_mask = iterative_elastic_net_parallel(
            X_train,
            y_train,
            num_iterations=100,
            selection_threshold=0.8,
            n_jobs=4  # Use 4 parallel processes
    )
        X_inner_train_reduced = X_inner_train[:, selected_features]
        X_inner_val_reduced = X_inner_val[:, selected_features]

        inner_selected_features.append(selected_features)

        # Step 2: Hyperparameter tuning (e.g., Random Forest on reduced feature set)
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [None, 10],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2]
        }

        best_params = None
        best_score = float('inf')

        for n_estimators, max_depth, min_samples_split, min_samples_leaf in itertools.product(
                param_grid['n_estimators'], param_grid['max_depth'],
                param_grid['min_samples_split'], param_grid['min_samples_leaf']):
            
            model = RandomForestRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                random_state=42
            )
            model.fit(X_inner_train_reduced, y_inner_train)
            val_predictions = model.predict(X_inner_val_reduced)
            val_score = mean_squared_error(y_inner_val, val_predictions)

            if val_score < best_score:
                best_score = val_score
                best_params = {
                    'n_estimators': n_estimators,
                    'max_depth': max_depth,
                    'min_samples_split': min_samples_split,
                    'min_samples_leaf': min_samples_leaf
                }

        inner_best_params.append(best_params)

    # Aggregate selected features and hyperparameters across inner folds
    feature_counts = Counter([tuple(features) for features in inner_selected_features])
    consensus_features = np.array(feature_counts.most_common(1)[0][0])

    param_counts = Counter(tuple(params.items()) for params in inner_best_params)
    consensus_params = dict(param_counts.most_common(1)[0][0])

    # Step 3: Train final model on outer training set with consensus features
    X_outer_train_reduced = X_outer_train[:, consensus_features]
    X_outer_test_reduced = X_outer_test[:, consensus_features]

    final_model = RandomForestRegressor(**consensus_params, random_state=42)
    final_model.fit(X_outer_train_reduced, y_outer_train)

    # Evaluate on outer test set
    test_predictions = final_model.predict(X_outer_test_reduced)
    test_score = mean_squared_error(y_outer_test, test_predictions)
    test_correlation = np.corrcoef(test_predictions, y_outer_test)[0, 1]
    
    
    # add linear and random baselines here
    
    outer_results.append((test_score, test_correlation))

# Report results
print("Outer Fold Results (MSE, Pearson Correlation):", outer_results)
print("Average MSE:", np.mean([result[0] for result in outer_results]))
print("Average Pearson Correlation:", np.mean([result[1] for result in outer_results]))


In [None]:
from sklearn.ensemble import RandomForestRegressor
from collections import Counter
import numpy as np

# Assume `all_selected_features` and `all_best_params` are from the outer loop
# Step 1: Aggregate consensus features
feature_counts = Counter([tuple(features) for features in all_selected_features])
final_features = np.array(feature_counts.most_common(1)[0][0])

# Step 2: Aggregate consensus hyperparameters
param_counts = Counter(tuple(params.items()) for params in all_best_params)
final_params = dict(param_counts.most_common(1)[0][0])

print("Final Selected Features:", final_features)
print("Final Hyperparameters:", final_params)

# Step 3: Train final model on the entire dataset
X_reduced = X[:, final_features]  # Use the entire dataset with final features
final_model = RandomForestRegressor(**final_params, random_state=42)
final_model.fit(X_reduced, y)  # Train on all data

# Step 4: Save the final model (optional)
import joblib
joblib.dump(final_model, "final_random_forest_model.pkl")
