[ENH] Allow list of hyperparameter options for tuning (#47)
* [WIP] Multiple hyperparam grids

* fix linter

* increase coverage

* add more tests

* More tests for merger

* add news fragment
fraimondo authored May 25, 2023
1 parent 19af973 commit 099435b
Showing 10 changed files with 965 additions and 201 deletions.
1 change: 1 addition & 0 deletions docs/changes/newsfragments/47.feature
@@ -0,0 +1 @@
Add support for multiple grids in hyperparameter tuning by `Fede Raimondo`_
102 changes: 102 additions & 0 deletions examples/03_complex_models/run_hyperparameter_multiple_grids.py
@@ -0,0 +1,102 @@
"""
Tuning Multiple Hyperparameter Grids
====================================

This example uses the 'fmri' dataset and performs a simple binary
classification using a Support Vector Machine classifier, while tuning
multiple hyperparameter grids at the same time.

References
----------

Waskom, M.L., Frank, M.C., Wagner, A.D. (2016). Adaptive engagement of
cognitive control in context-dependent decision-making. Cerebral Cortex.

.. include:: ../../links.inc
"""
# Authors: Federico Raimondo <f.raimondo@fz-juelich.de>
#
# License: AGPL
import numpy as np
from seaborn import load_dataset

from julearn import run_cross_validation
from julearn.utils import configure_logging
from julearn.pipeline import PipelineCreator

###############################################################################
# Set the logging level to info to see extra information
configure_logging(level="INFO")

###############################################################################
# Set the random seed to always have the same example
np.random.seed(42)

###############################################################################
# Load the dataset
df_fmri = load_dataset("fmri")
print(df_fmri.head())

###############################################################################
# Get the dataframe into the right format
df_fmri = df_fmri.pivot(
    index=["subject", "timepoint", "event"], columns="region", values="signal"
)

df_fmri = df_fmri.reset_index()
print(df_fmri.head())

X = ["frontal", "parietal"]
y = "event"

###############################################################################
# Let's do a first attempt and use a linear SVM with the default parameters.

creator = PipelineCreator(problem_type="classification")
creator.add("zscore")
creator.add("svm", kernel="linear")

scores = run_cross_validation(X=X, y=y, data=df_fmri, model=creator)

print(scores["test_score"].mean())

###############################################################################
# Now let's tune this SVM a bit. We will use a grid search to tune the
# regularization parameter ``C`` and the kernel. We will also tune ``gamma``,
# but since gamma is only used for the rbf kernel, we will use a
# different grid for the rbf kernel.
#
# To specify two different sets of parameters for the same step, we can
# explicitly specify the name of the step. This is done by passing the
# ``name`` parameter to the ``add`` method.
creator = PipelineCreator(problem_type="classification")
creator.add("zscore")
creator.add("svm", kernel="linear", C=[0.01, 0.1], name="svm")
creator.add(
    "svm",
    kernel="rbf",
    C=[0.01, 0.1],
    gamma=["scale", "auto", 1e-2, 1e-3],
    name="svm",
)

search_params = {
    "kind": "grid",
    "cv": 2,  # to speed up the example
}

scores, estimator = run_cross_validation(
    X=X,
    y=y,
    data=df_fmri,
    model=creator,
    search_params=search_params,
    return_estimator="final",
)

print(scores["test_score"].mean())
###############################################################################
# It seems that we might have found a better model, but which one is it?
print(estimator.best_params_)
print(estimator.best_estimator_["svm"]._gamma)
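
###############################################################################
# A hedged sketch beyond the committed example: assuming the returned
# estimator behaves as a scikit-learn ``GridSearchCV`` (as the
# ``best_params_`` access above suggests), the merged search space can be
# inspected directly. It is a list of grids, one per kernel, in which the
# pre-built ``svm`` step itself appears as one of the hyperparameter options.
from pprint import pprint

pprint(estimator.param_grid)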
47 changes: 38 additions & 9 deletions julearn/api.py
@@ -15,6 +15,7 @@
from sklearn.pipeline import Pipeline

from .pipeline import PipelineCreator
from .pipeline.merger import merge_pipelines
from .prepare import check_consistency, prepare_input_data
from .scoring import check_scoring
from .utils import logger, raise_error, _compute_cvmdsum
@@ -24,7 +25,7 @@
def run_cross_validation(
    X: List[str],
    y: str,
    model: Union[str, PipelineCreator, BaseEstimator],
    model: Union[str, PipelineCreator, BaseEstimator, List[PipelineCreator]],
    X_types: Optional[Dict] = None,
    data: Optional[pd.DataFrame] = None,
    problem_type: Optional[str] = None,
@@ -216,24 +217,52 @@ def run_cross_validation(
"model, use PipelineCreator instead"
)

if isinstance(model, PipelineCreator):
if isinstance(model, (PipelineCreator, list)):
if preprocess is not None:
raise_error(
"If model is a PipelineCreator, preprocess should be None"
"If model is a PipelineCreator (or list of), "
"preprocess should be None"
)
if problem_type is not None:
raise_error("Problem type should be set in the PipelineCreator")

if len(model_params) > 0:
raise_error(
"If model is a PipelineCreator, model_params must be None. "
f"Currently, it contains {model_params.keys()}"
"If model is a PipelineCreator (or list of), model_params must"
f" be None. Currently, it contains {model_params.keys()}"
)
if isinstance(model, list):
if any(not isinstance(m, PipelineCreator) for m in model):
raise_error(
"If model is a list, all elements must be PipelineCreator"
)
else:
model = [model]

pipeline = model.to_pipeline(
X_types=X_types, search_params=search_params
)
problem_type = model.problem_type
problem_types = set([m.problem_type for m in model])
if len(problem_types) > 1:
raise_error(
"If model is a list of PipelineCreator, all elements must have"
" the same problem_type"
)

expanded_models = []
for m in model:
expanded_models.extend(m.split())

all_pipelines = [
model.to_pipeline(X_types=X_types, search_params=search_params)
for model in expanded_models
]

if len(all_pipelines) > 1:
pipeline = merge_pipelines(
*all_pipelines, search_params=search_params
)
else:
pipeline = all_pipelines[0]

problem_type = model[0].problem_type

elif not isinstance(model, (str, BaseEstimator)):
raise_error(
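A hedged sketch of the call pattern this change enables (illustrative only, reusing ``X``, ``y`` and ``df_fmri`` from the example above): several ``PipelineCreator`` objects sharing the same ``problem_type`` can now be passed as a list, and their pipelines are expanded and merged into a single searcher internally.

creator_linear = PipelineCreator(problem_type="classification")
creator_linear.add("zscore")
creator_linear.add("svm", kernel="linear", C=[0.01, 0.1])

creator_rbf = PipelineCreator(problem_type="classification")
creator_rbf.add("zscore")
creator_rbf.add("svm", kernel="rbf", C=[0.01, 0.1])

# All creators must share the same problem_type and yield pipelines with the
# same named steps; otherwise run_cross_validation raises an error.
scores = run_cross_validation(
    X=X,
    y=y,
    data=df_fmri,
    model=[creator_linear, creator_rbf],
    search_params={"kind": "grid"},
)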
121 changes: 121 additions & 0 deletions julearn/pipeline/merger.py
@@ -0,0 +1,121 @@
"""Module to merge multiple pipelines into a single one."""

# Authors: Federico Raimondo <f.raimondo@fz-juelich.de>prepa
# License: AGPL

from typing import Dict

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline

from ..utils.logging import raise_error
from ..utils.typing import EstimatorLike
from .pipeline_creator import _prepare_hyperparameter_tuning
from ..prepare import prepare_search_params


def merge_pipelines(
*pipelines: EstimatorLike, search_params: Dict
) -> Pipeline:
"""Merge multiple pipelines into a single one.
Parameters
----------
pipelines : List[EstimatorLike]
List of estimators that will be merged.
search_params : Dict
Dictionary with the search parameters.
Returns
-------
merged : BaseSearchCV
The merged pipeline as a searcher.
"""

# Check that we only merge pipelines and searchers. And if there is a
# searcher, they are all of the same kind and match the search params.

search_params = prepare_search_params(search_params)

for p in pipelines:
if not isinstance(p, (Pipeline, GridSearchCV, RandomizedSearchCV)):
raise_error(
"Only pipelines and searchers are supported. "
f"Found {type(p)} instead.")
if isinstance(p, GridSearchCV):
if search_params["kind"] != "grid":
raise_error(
"At least one of the pipelines to merge is a "
"GridSearchCV, but the search params do not specify a "
"grid search. These pipelines cannot be merged."
)
elif isinstance(p, RandomizedSearchCV):
if search_params["kind"] != "random":
raise_error(
"At least one of the pipelines to merge is a "
"RandomizedSearchCV, but the search params do not specify "
"a random search. These pipelines cannot be merged."
)

# Check that all estimators have the same named steps in their pipelines.
reference_pipeline = pipelines[0]
if isinstance(reference_pipeline, (GridSearchCV, RandomizedSearchCV)):
reference_pipeline = reference_pipeline.estimator

step_names = reference_pipeline.named_steps.keys()

for p in pipelines:
if isinstance(p, (GridSearchCV, RandomizedSearchCV)):
p = p.estimator
if not isinstance(p, Pipeline):
raise_error("All searchers must use a pipeline.")
if step_names != p.named_steps.keys():
raise_error("All pipelines must have the same named steps.")

# The idea behind the merge is to create a list of parameter
# grids/distributions from a list of pipeline and searchers, to then
# wrap them into a single searcher. Since all the searchers have the same
# steps, this is possible. We just need to concatenate the
# grids/distributions from all searchers. If one of the pipelines is not
# a searcher, then this means that it has no hyperparameters to tune, but
# the pipeline is one of the hyperparameter options.

different_steps = []
for t_step_name in step_names:
# Get the transformer/model of the first element
t = reference_pipeline.named_steps[t_step_name]

# Check that all searchers have the same transformer/model.
# TODO: Fix this comparison, as it always returns False.
for s in pipelines[1:]:
if isinstance(s, (GridSearchCV, RandomizedSearchCV)):
if s.estimator.named_steps[t_step_name] != t:
different_steps.append(t_step_name)
break
else:
if s.named_steps[t_step_name] != t:
different_steps.append(t_step_name)
break

# Then, we will update the grid of the searchers that have different
# transformer/model.
all_grids = []
for s in pipelines:
if isinstance(s, GridSearchCV):
t_grid = s.param_grid.copy()
elif isinstance(s, RandomizedSearchCV):
t_grid = s.param_distributions.copy()
else:
t_grid = {}
for t_name in different_steps:
if isinstance(s, (GridSearchCV, RandomizedSearchCV)):
t_grid[t_name] = [s.estimator.named_steps[t_name]]
else:
t_grid[t_name] = [s.named_steps[t_name]]
all_grids.append(t_grid)

# Finally, we will concatenate the grids and create a new searcher.
new_searcher = _prepare_hyperparameter_tuning(
all_grids, search_params, reference_pipeline
)
return new_searcher
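
A minimal usage sketch for ``merge_pipelines`` (an assumption-laden illustration: plain scikit-learn pipelines with identical step names are used here, whereas in julearn they would normally come from ``PipelineCreator.to_pipeline``):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

pipe_linear = Pipeline(
    [("scaler", StandardScaler()), ("svm", SVC(kernel="linear"))]
)
pipe_rbf = Pipeline(
    [("scaler", StandardScaler()), ("svm", SVC(kernel="rbf"))]
)

# Each plain pipeline carries no grid of its own, so it becomes one entry in
# the merged parameter grid; the result is a single grid searcher that treats
# each candidate pipeline as one hyperparameter option.
merged = merge_pipelines(pipe_linear, pipe_rbf, search_params={"kind": "grid"})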