In [2]:
import copy
import os
import pickle
import pathlib
import sys

from tqdm import tqdm

import numpy as np
import pandas as pd
from scipy.stats import pearsonr

import optuna
from optuna.samplers import CmaEsSampler, TPESampler, RandomSampler
from optuna.distributions import CategoricalDistribution

from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import ElasticNet, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.utils import shuffle

# Project-local helpers live one directory up from the notebook.
sys.path.insert(1, '../')
from utils import write_pickled_object
from utils import FeatureSelector, MeanCenterer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Root directory holding the project's data files.
data_path = '/nobackup/users/hmbaghda/metastatic_potential/'
# Seed shared by the CV splitters, the samplers and the Ridge model.
random_state = 42 + 2

# Thread budget advertised to the numerical backends through their
# environment variables.
# NOTE(review): these are assigned after numpy/scipy/sklearn were imported in
# the previous cell; confirm the backends in use still pick the values up.
n_cores = 80
os.environ.update({
    env_var: str(n_cores)
    for env_var in (
        "OMP_NUM_THREADS",
        "MKL_NUM_THREADS",
        "OPENBLAS_NUM_THREADS",
        "VECLIB_MAXIMUM_THREADS",
        "NUMEXPR_NUM_THREADS",
    )
})

In [25]:
def pearson_corr_scorer(y_true, y_pred):
    """Return the Pearson correlation coefficient between targets and predictions.

    Only the coefficient from ``scipy.stats.pearsonr`` is returned; the
    accompanying p-value is discarded.
    """
    coefficient, _p_value = pearsonr(y_true, y_pred)
    return coefficient

class PLSRegression_X(PLSRegression):
    """``PLSRegression`` variant whose ``transform`` never returns a tuple.

    When the parent ``transform`` hands back a tuple, only its first element
    is kept; any other return value is passed through unchanged.
    """

    def transform(self, X, y=None):
        """Transform ``X`` (and optionally ``y``) and return a single object."""
        scores = super().transform(X, y)
        return scores[0] if isinstance(scores, tuple) else scores

In [5]:
class HybridSampler(optuna.samplers.BaseSampler):
    """Composite sampler that routes parameters between two wrapped samplers.

    Relative (joint) sampling is delegated entirely to ``primary_sampler``.
    For independent sampling, categorical parameters are drawn by
    ``fallback_sampler`` and every other distribution by ``primary_sampler``.
    The per-trial lifecycle hooks and ``reseed_rng`` are forwarded to both
    wrapped samplers, so neither misses the callbacks it would receive if it
    were attached to the study directly.

    Parameters
    ----------
    primary_sampler : optuna.samplers.BaseSampler
        Sampler used for the relative search space and for non-categorical
        independent parameters (e.g. ``CmaEsSampler``).
    fallback_sampler : optuna.samplers.BaseSampler
        Sampler used for categorical parameters (e.g. ``TPESampler``).
    """

    def __init__(self, primary_sampler, fallback_sampler):
        self.primary_sampler = primary_sampler  # e.g., CmaEsSampler
        self.fallback_sampler = fallback_sampler  # e.g., TPESampler

    def reseed_rng(self):
        """Reseed the random number generators of both wrapped samplers."""
        self.primary_sampler.reseed_rng()
        self.fallback_sampler.reseed_rng()

    def before_trial(self, study, trial):
        """Forward the pre-trial hook to both wrapped samplers."""
        self.primary_sampler.before_trial(study, trial)
        self.fallback_sampler.before_trial(study, trial)

    def after_trial(self, study, trial, state, values):
        """Forward the post-trial hook to both wrapped samplers."""
        self.primary_sampler.after_trial(study, trial, state, values)
        self.fallback_sampler.after_trial(study, trial, state, values)

    def infer_relative_search_space(self, study, trial):
        """Return the relative search space as defined by the primary sampler."""
        return self.primary_sampler.infer_relative_search_space(study, trial)

    def sample_relative(self, study, trial, search_space):
        """Sample the relative search space with the primary sampler."""
        return self.primary_sampler.sample_relative(study, trial, search_space)

    def sample_independent(self, study, trial, param_name, param_distribution):
        """Sample one parameter, choosing the sampler by distribution type."""
        # Categorical parameters are treated as unsupported by the primary
        # sampler and are handed to the fallback sampler instead.
        if isinstance(param_distribution, CategoricalDistribution):
            return self.fallback_sampler.sample_independent(
                study, trial, param_name, param_distribution
            )
        # Everything else stays with the primary sampler.
        return self.primary_sampler.sample_independent(
            study, trial, param_name, param_distribution
        )

class RandomTPESampler(TPESampler):
    """TPE sampler that periodically hands a whole trial to another sampler.

    Parameters
    ----------
    exploration_sampler : sampler object
        Sampler whose ``sample_independent`` is used on exploration trials.
    exploration_freq : int, default 20
        Trials whose number is a multiple of this value are exploration
        trials (trial 0 included).
    *args, **kwargs
        Passed through unchanged to ``TPESampler.__init__``.
    """

    def __init__(self, exploration_sampler, exploration_freq=20, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.exploration_sampler = exploration_sampler
        self.exploration_freq = exploration_freq

    def sample_independent(self, study, trial, param_name, param_distribution):
        """Draw one parameter from the exploration sampler or from TPE."""
        is_exploration_trial = trial.number % self.exploration_freq == 0
        if is_exploration_trial:
            draw = self.exploration_sampler.sample_independent
        else:
            draw = super().sample_independent
        return draw(study, trial, param_name, param_distribution)


def optuna_objective(trial, X, y, inner_cv, n_cores, random_state):
    """Optuna objective: mean inner-CV MSE of one sampled pipeline.

    The pipeline is ``FeatureSelector(method="top_n_cv")`` -> ``MeanCenterer``
    -> one of a linear ``SVR``, ``PLSRegression_X`` or ``Ridge``; the number of
    selected features, the model family and the model's hyperparameters are
    all drawn from ``trial``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw every hyperparameter.
    X, y : array-like
        Training features and targets handed to ``cross_val_score``.
    inner_cv : cross-validation splitter
        Splitter for the evaluation; must expose ``n_splits``.
    n_cores : int or None
        Upper bound on the number of parallel CV workers. At most one worker
        per split is started; ``None`` or a non-positive value leaves the
        worker count at one per split.
    random_state : int
        Seed passed to ``Ridge``.

    Returns
    -------
    float
        Mean squared error averaged over the inner folds (lower is better).
    """
    # Feature selection and mean-centering precede every candidate model.
    n_features = trial.suggest_categorical(
        "FeatureSelector__n_features", [250, 500, 1000, 5000, 12755]
    )
    steps = [
        ("feature_reduction", FeatureSelector(method="top_n_cv", n_features=n_features)),
        ("mean_centering", MeanCenterer()),
    ]

    # Model family and its hyperparameters. A RandomForestRegressor candidate
    # ("RFR") is not part of the current search space.
    #
    # For the stepped ranges, the upper bound is the largest value reachable
    # from `low` in whole multiples of `step`. Optuna truncates a non-aligned
    # bound (previously 100 and 250) to exactly these values and emits a
    # warning, so the search space itself is unchanged.
    model_type = trial.suggest_categorical("model_type", ["SVR", 'PLS', 'Ridge'])
    if model_type == "SVR":
        svr_c = trial.suggest_float("SVR__C", 1e-4, 1e2, log=True)
        svr_epsilon = trial.suggest_float("SVR__epsilon", 1e-3, 10, log=True)
        steps.append(("model", SVR(kernel='linear', C=svr_c, epsilon=svr_epsilon)))
    elif model_type == 'PLS':
        n_components = trial.suggest_int("PLS__n_components", 2, 98, step=3)
        steps.append(("model", PLSRegression_X(n_components=n_components)))
    elif model_type == 'Ridge':
        alpha = trial.suggest_float("Ridge__alpha", 1, 241, step=10)
        steps.append(("model", Ridge(alpha=alpha, random_state=random_state)))

    pipeline = Pipeline(steps)

    # One worker per inner fold, capped by the caller's core budget.
    n_jobs = inner_cv.n_splits
    if n_cores is not None and 0 < n_cores < n_jobs:
        n_jobs = n_cores

    # The scorer yields negated MSE, so flip the sign back before averaging.
    # No intermediate values are reported through `trial.report`, so a pruner
    # attached to the study has nothing to act on.
    mse = -cross_val_score(pipeline, X, y,
                           cv=inner_cv,
                           scoring="neg_mean_squared_error",
                           n_jobs=n_jobs).mean()

    return mse


def generate_best_pipeline(study):
    """Rebuild an unfitted pipeline from the best parameters of ``study``.

    The model family is recognised by which hyperparameter key is present in
    ``study.best_params`` (checked in the order SVR, PLS, Ridge). If none of
    the keys is present, the pipeline contains only the feature-selection and
    mean-centering steps. The Ridge branch reads the module-level
    ``random_state``.
    """
    params = study.best_params

    selector = FeatureSelector(method="top_n_cv",
                               n_features=params["FeatureSelector__n_features"])
    pipeline_steps = [
        ("feature_reduction", selector),
        ("mean_centering", MeanCenterer()),
    ]

    # The RandomForestRegressor ("RFR") family is not handled here.
    model = None
    if "SVR__C" in params:
        model = SVR(kernel='linear',
                    C=params["SVR__C"],
                    epsilon=params['SVR__epsilon'])
    elif 'PLS__n_components' in params:
        model = PLSRegression_X(n_components=params["PLS__n_components"])
    elif 'Ridge__alpha' in params:
        model = Ridge(alpha=params["Ridge__alpha"], random_state=random_state)

    if model is not None:
        pipeline_steps.append(("model", model))

    return Pipeline(pipeline_steps)


In [10]:
# Feature matrix: every column of the expression table (first CSV column is the index).
expr_csv = os.path.join(data_path, 'processed', 'expr_protein.csv')
X = pd.read_csv(expr_csv, index_col=0).values

# Target vector: the 'mean' column of the metastatic-potential table, flattened to 1-D.
potential_csv = os.path.join(data_path, 'processed', 'metastatic_potential_protein.csv')
y = pd.read_csv(potential_csv, index_col=0)['mean'].values.ravel()

In [11]:
# Nested cross-validation layout: splits of the outer and inner KFold.
outer_folds, inner_folds = 10, 5
# Number of Optuna trials run for each outer fold's study.
n_trials = 200

In [12]:
# Uniform random sampler that RandomTPESampler switches to on exploration trials.
exploration_sampler = RandomSampler(seed=random_state)

# TPE with 25 random start-up trials and a fully random trial every 20 trials.
# (A plain TPESampler(seed=random_state, n_startup_trials=20) was the earlier alternative.)
tpe_sampler = RandomTPESampler(
    seed=random_state,
    n_startup_trials=25,
    exploration_sampler=exploration_sampler,
    exploration_freq=20,  # randomly sample every n trials
)

# CMA-ES with the BIPOP restart strategy.
# NOTE(review): newer Optuna releases may deprecate `restart_strategy`;
# confirm against the installed version.
cmaes_sampler = CmaEsSampler(
    seed=random_state,
    warn_independent_sampling=False,
    restart_strategy='bipop',
)



In [13]:
# Nested cross-validation: each outer fold runs its own Optuna study on the
# training portion (scored with the inner CV), then the best pipeline is refit
# on that training portion and evaluated on the held-out outer fold.
outer_cv = KFold(n_splits=outer_folds, shuffle=True, random_state=random_state)
inner_cv = KFold(n_splits=inner_folds, shuffle=True, random_state=random_state)

# Per-fold results are checkpointed to this CSV so an interrupted run can
# resume without recomputing finished folds.
results_path = os.path.join(data_path, 'interim', 'pipeline_model_selection_protein.csv')
if os.path.isfile(results_path):
    res_df = pd.read_csv(results_path, index_col=0)
    results = res_df.to_dict(orient='records')
else:
    results = []
    res_df = None

for k, (train_idx, test_idx) in enumerate(outer_cv.split(X, y)):
    # Skip folds that already have a row in the checkpoint.
    if res_df is not None and res_df[res_df.fold == k].shape[0] != 0:
        continue

    print(k)
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # The module-level samplers are seeded and stateful. Giving each fold's
    # study its own deep copies keeps the fold's search independent of how
    # many folds ran before it in this kernel, so a resumed run reproduces
    # what an uninterrupted run would have done for the same fold.
    fold_sampler = HybridSampler(primary_sampler=copy.deepcopy(cmaes_sampler),
                                 fallback_sampler=copy.deepcopy(tpe_sampler))

    pruner = optuna.pruners.SuccessiveHalvingPruner()
    study = optuna.create_study(direction="minimize",
                                sampler=fold_sampler,
                                pruner=pruner,
                                study_name='{}_optuna'.format(k))
    # Trials that raise ValueError are recorded as failed instead of aborting the study.
    study.optimize(
        lambda trial: optuna_objective(trial, X_train, y_train, inner_cv, n_cores, random_state),
        n_trials=n_trials,
        catch=(ValueError,)
    )
    write_pickled_object(study, os.path.join(data_path, 'interim', study.study_name + '.pickle'))

    # Refit the winning configuration on the full outer-training split.
    best_pipeline = generate_best_pipeline(study)
    best_pipeline.fit(X_train, y_train)

    y_train_pred = best_pipeline.predict(X_train)
    y_test_pred = best_pipeline.predict(X_test)

    train_corr = pearsonr(y_train, y_train_pred)[0]
    test_corr = pearsonr(y_test, y_test_pred)[0]

    # NOTE(review): "inner_cv" holds a whole DataFrame; written through
    # `to_csv` it is stored as that frame's text form. Confirm downstream
    # readers rely on the pickled study rather than on this column.
    results.append({
        "fold": k,
        "train_corr": train_corr,
        "test_corr": test_corr,
        "best_params": study.best_params,
        "inner_cv": study.trials_dataframe()
        })
    # Rewrite the checkpoint after every fold.
    res_df = pd.DataFrame(results)
    res_df.to_csv(results_path)
