In [1]:
from sklearn.linear_model import ElasticNetCV
import numpy as np

def iterative_elastic_net(X_train, y_train, num_iterations=100, selection_threshold=0.8, random_state=None):
    """
    Repeat ElasticNetCV on re-shuffled CV folds and keep the features that are selected often enough.

    With an integer ``cv`` ElasticNetCV splits the data into consecutive,
    unshuffled folds, and its ``random_state`` is only consulted when
    ``selection='random'``. Re-fitting with a different seed therefore yields
    the same coefficients every time. To make the repetitions informative,
    each iteration builds its own shuffled 5-fold split from the iteration seed.

    Parameters:
        X_train (np.array): Training feature matrix (samples x features).
        y_train (np.array): Training target vector.
        num_iterations (int): Number of ElasticNetCV fits; must be >= 1.
        selection_threshold (float): A feature is kept when the fraction of
            iterations with a non-zero coefficient is strictly greater than this.
        random_state (int or None): Seed for the per-iteration seeds. ``None``
            (default) draws them from NumPy's global random state, as before.

    Returns:
        np.array: Boolean mask over the columns of ``X_train``.

    Raises:
        ValueError: If ``num_iterations`` is smaller than 1.
    """
    # Local import: the cell defining this function only imports ElasticNetCV.
    from sklearn.model_selection import KFold

    if num_iterations < 1:
        raise ValueError("num_iterations must be at least 1")

    # Global RNG keeps the previous behaviour; a private RandomState makes the
    # whole procedure reproducible when the caller provides a seed.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    n_features = X_train.shape[1]
    feature_selection_counts = np.zeros(n_features)

    for _ in range(num_iterations):
        random_seed = int(rng.randint(0, 10000))
        # Shuffled folds are what actually differs from one iteration to the next.
        folds = KFold(n_splits=5, shuffle=True, random_state=random_seed)
        elastic_net = ElasticNetCV(cv=folds, random_state=random_seed)
        elastic_net.fit(X_train, y_train)

        # Record features with non-zero coefficients
        feature_selection_counts += (elastic_net.coef_ != 0).astype(int)

    # Fraction of iterations in which each feature was selected
    selection_frequency = feature_selection_counts / num_iterations

    # Retain features selected above the threshold
    return selection_frequency > selection_threshold


from sklearn.linear_model import ElasticNetCV
import numpy as np
from multiprocessing import Pool

def elastic_net_iteration(args):
    """
    Perform a single iteration of ElasticNetCV for feature selection.

    The seed drives a shuffled 5-fold split. With a plain ``cv=5`` the folds
    are consecutive and unshuffled, and ElasticNetCV only reads
    ``random_state`` when ``selection='random'``, so the seed alone would not
    change the fitted model.

    Parameters:
        args (tuple): A tuple containing (X_train, y_train, random_seed).
            Packed into one argument so the function can be used with
            ``Pool.map``.

    Returns:
        np.array: Integer (0/1) mask, 1 where the fitted coefficient is non-zero.
    """
    # Local import: the cell defining this function only imports ElasticNetCV.
    from sklearn.model_selection import KFold

    X_train, y_train, random_seed = args
    folds = KFold(n_splits=5, shuffle=True, random_state=random_seed)
    elastic_net = ElasticNetCV(cv=folds, random_state=random_seed)
    elastic_net.fit(X_train, y_train)
    return (elastic_net.coef_ != 0).astype(int)

def iterative_elastic_net_parallel(X_train, y_train, num_iterations=100, selection_threshold=0.8, n_jobs=None,
                                   random_state=None):
    """
    Run repeated ElasticNetCV fits in a process pool and keep the features that are selected often enough.

    Each iteration is delegated to ``elastic_net_iteration`` with its own seed.

    Parameters:
        X_train (np.array): Training feature matrix (samples x features).
        y_train (np.array): Training target vector.
        num_iterations (int): Number of ElasticNetCV iterations; must be >= 1.
        selection_threshold (float): A feature is kept when the fraction of
            iterations that selected it is strictly greater than this.
        n_jobs (int): Number of worker processes (default is None, which lets
            ``multiprocessing.Pool`` use all available cores).
        random_state (int or None): Seed for the per-iteration seeds. ``None``
            (default) draws them from NumPy's global random state, as before.

    Returns:
        np.array: Boolean mask of selected features.

    Raises:
        ValueError: If ``num_iterations`` is smaller than 1.
    """
    if num_iterations < 1:
        raise ValueError("num_iterations must be at least 1")

    # Global RNG keeps the previous behaviour; a private RandomState makes the
    # seed list reproducible when the caller provides a seed.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    random_seeds = rng.randint(0, 10000, size=num_iterations)

    # One argument tuple per iteration (Pool.map passes a single argument)
    args = [(X_train, y_train, seed) for seed in random_seeds]

    # NOTE(review): the worker lives in the notebook's __main__ namespace; this
    # presumably relies on the 'fork' start method -- confirm on non-Linux hosts.
    with Pool(processes=n_jobs) as pool:
        feature_selection_masks = pool.map(elastic_net_iteration, args)

    # Fraction of iterations in which each feature was selected
    feature_selection_counts = np.sum(feature_selection_masks, axis=0)
    selection_frequency = feature_selection_counts / num_iterations

    # Retain features selected above the threshold
    return selection_frequency > selection_threshold

In [2]:
import itertools
import os
from collections import Counter

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [4]:
# Seed value. NOTE(review): the cells visible in this notebook pass
# random_state=42 literally instead of reading this variable -- confirm
# whether it is still needed.
seed = 42
# Root data directory; the export cell writes CSVs to <data_path>/processed.
# NOTE(review): absolute, machine-specific path -- confirm before running elsewhere.
data_path = '/nobackup/users/hmbaghda/metastatic_potential/'

In [5]:
# Fold count; the outer-CV cell re-assigns the same value and passes it to KFold.
n_splits = 10

In [None]:
# Write the processed tables to <data_path>/processed.
# NOTE(review): `nmp` and `nexpr` are not defined in the cells shown here;
# presumably pandas objects created in an earlier (removed?) cell -- confirm.
processed_dir = os.path.join(data_path, 'processed')
# Create the target directory up front so the export does not fail on a fresh checkout.
os.makedirs(processed_dir, exist_ok=True)
nmp.to_csv(os.path.join(processed_dir, 'metastatic_potential.csv'))
nexpr.to_csv(os.path.join(processed_dir, 'expr.csv'))

In [None]:


# Outer CV loop
# Nested cross-validation: the inner folds choose features and random-forest
# hyperparameters, the outer folds estimate generalisation error.
# NOTE(review): `X` and `y` are not defined in the cells shown here; the
# positional indexing below (X[train_idx], X[:, mask]) assumes NumPy arrays -- confirm.
n_splits = 10
outer_cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
outer_results = []

# Per-outer-fold consensus, collected because the final-model cell reads
# `all_selected_features` and `all_best_params`.
all_selected_features = []
all_best_params = []

# Random-forest grid evaluated on every inner fold (loop-invariant, so built once).
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
param_names = list(param_grid)

for train_idx, test_idx in outer_cv.split(X):
    X_outer_train, X_outer_test = X[train_idx], X[test_idx]
    y_outer_train, y_outer_test = y[train_idx], y[test_idx]

    # Inner CV loop
    inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
    inner_selected_features = []
    inner_best_params = []

    for inner_train_idx, inner_val_idx in inner_cv.split(X_outer_train):
        X_inner_train, X_inner_val = X_outer_train[inner_train_idx], X_outer_train[inner_val_idx]
        y_inner_train, y_inner_val = y_outer_train[inner_train_idx], y_outer_train[inner_val_idx]

        # Step 1: Feature selection with repeated ElasticNetCV, fitted on the
        # inner-training rows only so the validation rows stay unseen.
        # TODO (carried over): replace with calling R, or keep this Python version.
        selected_features = np.asarray(
            iterative_elastic_net_parallel(
                X_inner_train,
                y_inner_train,
                num_iterations=100,
                selection_threshold=0.8,
                n_jobs=4,  # Use 4 parallel processes
            ),
            dtype=bool,
        )
        inner_selected_features.append(selected_features)

        if not selected_features.any():
            # A random forest cannot be fitted on zero columns. The empty mask
            # still counts as a "not selected" vote for every feature.
            continue

        X_inner_train_reduced = X_inner_train[:, selected_features]
        X_inner_val_reduced = X_inner_val[:, selected_features]

        # Step 2: Hyperparameter tuning (Random Forest on reduced feature set)
        best_params = None
        best_score = float('inf')

        for values in itertools.product(*(param_grid[name] for name in param_names)):
            params = dict(zip(param_names, values))

            model = RandomForestRegressor(**params, random_state=42)
            model.fit(X_inner_train_reduced, y_inner_train)
            val_predictions = model.predict(X_inner_val_reduced)
            val_score = mean_squared_error(y_inner_val, val_predictions)

            # Strict '<' keeps the first grid point on ties
            if val_score < best_score:
                best_score = val_score
                best_params = params

        inner_best_params.append(best_params)

    if not inner_best_params:
        raise RuntimeError(
            "Feature selection returned no features in any inner fold; "
            "cannot tune or train the random forest."
        )

    # Aggregate selected features across inner folds with a per-feature vote.
    # Counting identical whole feature sets almost never finds agreement in
    # high dimensions, so keep features chosen by a strict majority of folds.
    vote_fraction = np.mean(inner_selected_features, axis=0)
    consensus_features = vote_fraction > 0.5
    if not consensus_features.any():
        # No majority: fall back to the most frequently selected features
        # (non-empty, because at least one fold selected something).
        consensus_features = vote_fraction == vote_fraction.max()

    # Hyperparameters come from a small discrete grid, so the mode is meaningful.
    param_counts = Counter(tuple(params.items()) for params in inner_best_params)
    consensus_params = dict(param_counts.most_common(1)[0][0])

    all_selected_features.append(consensus_features)
    all_best_params.append(consensus_params)

    # Step 3: Train final model on outer training set with consensus features
    X_outer_train_reduced = X_outer_train[:, consensus_features]
    X_outer_test_reduced = X_outer_test[:, consensus_features]

    final_model = RandomForestRegressor(**consensus_params, random_state=42)
    final_model.fit(X_outer_train_reduced, y_outer_train)

    # Evaluate on outer test set
    test_predictions = final_model.predict(X_outer_test_reduced)
    test_score = mean_squared_error(y_outer_test, test_predictions)
    test_correlation = np.corrcoef(test_predictions, y_outer_test)[0, 1]

    # TODO: add linear and random baselines here

    outer_results.append((test_score, test_correlation))

# Report results
print("Outer Fold Results (MSE, Pearson Correlation):", outer_results)
print("Average MSE:", np.mean([result[0] for result in outer_results]))
print("Average Pearson Correlation:", np.mean([result[1] for result in outer_results]))


In [None]:
from sklearn.ensemble import RandomForestRegressor
from collections import Counter
import numpy as np

# Assume `all_selected_features` and `all_best_params` are from the outer loop
# (one entry per outer fold). NOTE(review): confirm the outer-CV cell has
# populated both lists before running this cell.

# Step 1: Aggregate consensus features
# Vote per feature rather than per whole feature set: identical sets almost
# never recur across folds in high dimensions, so an exact-match mode would
# just return one arbitrary fold's selection.
n_total_features = X.shape[1]
feature_counts = np.zeros(n_total_features, dtype=int)
for features in all_selected_features:
    features = np.asarray(features)
    if features.size == 0:
        continue  # this fold selected nothing
    fold_mask = np.zeros(n_total_features, dtype=bool)
    fold_mask[features] = True  # accepts a boolean mask or an array of column indices
    feature_counts += fold_mask

# Keep the columns selected in a strict majority of folds
final_features = np.flatnonzero(feature_counts * 2 > len(all_selected_features))
if final_features.size == 0:
    # No majority: fall back to the most frequently selected columns
    final_features = np.flatnonzero((feature_counts > 0) & (feature_counts == feature_counts.max()))
if final_features.size == 0:
    raise ValueError("No features were selected in any outer fold; cannot train the final model.")

# Step 2: Aggregate consensus hyperparameters
# The grid is small and discrete, so the most common combination is meaningful.
param_counts = Counter(tuple(params.items()) for params in all_best_params)
final_params = dict(param_counts.most_common(1)[0][0])

print("Final Selected Features:", final_features)
print("Final Hyperparameters:", final_params)

# Step 3: Train final model on the entire dataset
X_reduced = X[:, final_features]  # Use the entire dataset with final features
final_model = RandomForestRegressor(**final_params, random_state=42)
final_model.fit(X_reduced, y)  # Train on all data

# Step 4: Save the final model (optional); written to the current working directory
import joblib
joblib.dump(final_model, "final_random_forest_model.pkl")
