[ENH] Allow list of hyperparameter options for tuning (#47)
* [WIP] Multiple hyperparam grids

* fix linter

* increase coverage

* add more tests

* More tests for merger

* add news fragment
fraimondo authored May 25, 2023
1 parent 19af973 commit 099435b
Showing 10 changed files with 965 additions and 201 deletions.
1 change: 1 addition & 0 deletions docs/changes/newsfragments/47.feature
@@ -0,0 +1 @@
Add support for multiple grids in hyperparameter tuning by `Fede Raimondo`_
102 changes: 102 additions & 0 deletions examples/03_complex_models/run_hyperparameter_multiple_grids.py
@@ -0,0 +1,102 @@
"""
Tuning Multiple Hyperparameter Grids
====================================

This example uses the 'fmri' dataset and performs a simple binary
classification using a Support Vector Machine classifier, while tuning
multiple hyperparameter grids at the same time.

References
----------

Waskom, M.L., Frank, M.C., Wagner, A.D. (2016). Adaptive engagement of
cognitive control in context-dependent decision-making. Cerebral Cortex.

.. include:: ../../links.inc
"""
# Authors: Federico Raimondo <f.raimondo@fz-juelich.de>
#
# License: AGPL
import numpy as np
from seaborn import load_dataset

from julearn import run_cross_validation
from julearn.utils import configure_logging
from julearn.pipeline import PipelineCreator

###############################################################################
# Set the logging level to info to see extra information
configure_logging(level="INFO")

###############################################################################
# Set the random seed to always have the same example
np.random.seed(42)

###############################################################################
# Load the dataset
df_fmri = load_dataset("fmri")
print(df_fmri.head())

###############################################################################
# Get the dataframe into the right format
df_fmri = df_fmri.pivot(
    index=["subject", "timepoint", "event"], columns="region", values="signal"
)

df_fmri = df_fmri.reset_index()
print(df_fmri.head())

X = ["frontal", "parietal"]
y = "event"

###############################################################################
# Let's do a first attempt and use a linear SVM with the default parameters.

creator = PipelineCreator(problem_type="classification")
creator.add("zscore")
creator.add("svm", kernel="linear")

scores = run_cross_validation(X=X, y=y, data=df_fmri, model=creator)

print(scores["test_score"].mean())

###############################################################################
# Now let's tune this SVM a bit. We will use a grid search to tune the
# regularization parameter ``C`` and the kernel. We will also tune ``gamma``,
# but since gamma is only used for the rbf kernel, we will use a
# different grid for the rbf kernel.
#
# To specify two different sets of parameters for the same step, we can
# explicitly specify the name of the step. This is done by passing the
# ``name`` parameter to the ``add`` method.
creator = PipelineCreator(problem_type="classification")
creator.add("zscore")
creator.add("svm", kernel="linear", C=[0.01, 0.1], name="svm")
creator.add(
    "svm",
    kernel="rbf",
    C=[0.01, 0.1],
    gamma=["scale", "auto", 1e-2, 1e-3],
    name="svm",
)

search_params = {
    "kind": "grid",
    "cv": 2,  # to speed up the example
}

scores, estimator = run_cross_validation(
    X=X,
    y=y,
    data=df_fmri,
    model=creator,
    search_params=search_params,
    return_estimator="final",
)

print(scores["test_score"].mean())
###############################################################################
# It seems that we might have found a better model, but which one is it?
print(estimator.best_params_)
print(estimator.best_estimator_["svm"]._gamma)
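
###############################################################################
# A hedged sketch beyond the committed example: assuming the returned
# estimator behaves as a scikit-learn ``GridSearchCV`` (as the
# ``best_params_`` access above suggests), the merged search space can be
# inspected directly. It is a list of grids, one per kernel, in which the
# pre-built ``svm`` step itself appears as one of the hyperparameter options.
from pprint import pprint

pprint(estimator.param_grid)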
47 changes: 38 additions & 9 deletions julearn/api.py
@@ -15,6 +15,7 @@
from sklearn.pipeline import Pipeline

from .pipeline import PipelineCreator
from .pipeline.merger import merge_pipelines
from .prepare import check_consistency, prepare_input_data
from .scoring import check_scoring
from .utils import logger, raise_error, _compute_cvmdsum
@@ -24,7 +25,7 @@
def run_cross_validation(
    X: List[str],
    y: str,
    model: Union[str, PipelineCreator, BaseEstimator],
    model: Union[str, PipelineCreator, BaseEstimator, List[PipelineCreator]],
    X_types: Optional[Dict] = None,
    data: Optional[pd.DataFrame] = None,
    problem_type: Optional[str] = None,
@@ -216,24 +217,52 @@ def run_cross_validation(
"model, use PipelineCreator instead"
)

if isinstance(model, PipelineCreator):
if isinstance(model, (PipelineCreator, list)):
if preprocess is not None:
raise_error(
"If model is a PipelineCreator, preprocess should be None"
"If model is a PipelineCreator (or list of), "
"preprocess should be None"
)
if problem_type is not None:
raise_error("Problem type should be set in the PipelineCreator")

if len(model_params) > 0:
raise_error(
"If model is a PipelineCreator, model_params must be None. "
f"Currently, it contains {model_params.keys()}"
"If model is a PipelineCreator (or list of), model_params must"
f" be None. Currently, it contains {model_params.keys()}"
)
if isinstance(model, list):
if any(not isinstance(m, PipelineCreator) for m in model):
raise_error(
"If model is a list, all elements must be PipelineCreator"
)
else:
model = [model]

pipeline = model.to_pipeline(
X_types=X_types, search_params=search_params
)
problem_type = model.problem_type
problem_types = set([m.problem_type for m in model])
if len(problem_types) > 1:
raise_error(
"If model is a list of PipelineCreator, all elements must have"
" the same problem_type"
)

expanded_models = []
for m in model:
expanded_models.extend(m.split())

all_pipelines = [
model.to_pipeline(X_types=X_types, search_params=search_params)
for model in expanded_models
]

if len(all_pipelines) > 1:
pipeline = merge_pipelines(
*all_pipelines, search_params=search_params
)
else:
pipeline = all_pipelines[0]

problem_type = model[0].problem_type

elif not isinstance(model, (str, BaseEstimator)):
raise_error(
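A hedged sketch of the call pattern this change enables (illustrative only, reusing ``X``, ``y`` and ``df_fmri`` from the example above): several ``PipelineCreator`` objects sharing the same ``problem_type`` can now be passed as a list, and their pipelines are expanded and merged into a single searcher internally.

creator_linear = PipelineCreator(problem_type="classification")
creator_linear.add("zscore")
creator_linear.add("svm", kernel="linear", C=[0.01, 0.1])

creator_rbf = PipelineCreator(problem_type="classification")
creator_rbf.add("zscore")
creator_rbf.add("svm", kernel="rbf", C=[0.01, 0.1])

# All creators must share the same problem_type and yield pipelines with the
# same named steps; otherwise run_cross_validation raises an error.
scores = run_cross_validation(
    X=X,
    y=y,
    data=df_fmri,
    model=[creator_linear, creator_rbf],
    search_params={"kind": "grid"},
)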
121 changes: 121 additions & 0 deletions julearn/pipeline/merger.py
@@ -0,0 +1,121 @@
"""Module to merge multiple pipelines into a single one."""

# Authors: Federico Raimondo <f.raimondo@fz-juelich.de>prepa
# License: AGPL

from typing import Dict

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline

from ..utils.logging import raise_error
from ..utils.typing import EstimatorLike
from .pipeline_creator import _prepare_hyperparameter_tuning
from ..prepare import prepare_search_params


def merge_pipelines(
*pipelines: EstimatorLike, search_params: Dict
) -> Pipeline:
"""Merge multiple pipelines into a single one.
Parameters
----------
pipelines : List[EstimatorLike]
List of estimators that will be merged.
search_params : Dict
Dictionary with the search parameters.
Returns
-------
merged : BaseSearchCV
The merged pipeline as a searcher.
"""

# Check that we only merge pipelines and searchers. And if there is a
# searcher, they are all of the same kind and match the search params.

search_params = prepare_search_params(search_params)

for p in pipelines:
if not isinstance(p, (Pipeline, GridSearchCV, RandomizedSearchCV)):
raise_error(
"Only pipelines and searchers are supported. "
f"Found {type(p)} instead.")
if isinstance(p, GridSearchCV):
if search_params["kind"] != "grid":
raise_error(
"At least one of the pipelines to merge is a "
"GridSearchCV, but the search params do not specify a "
"grid search. These pipelines cannot be merged."
)
elif isinstance(p, RandomizedSearchCV):
if search_params["kind"] != "random":
raise_error(
"At least one of the pipelines to merge is a "
"RandomizedSearchCV, but the search params do not specify "
"a random search. These pipelines cannot be merged."
)

# Check that all estimators have the same named steps in their pipelines.
reference_pipeline = pipelines[0]
if isinstance(reference_pipeline, (GridSearchCV, RandomizedSearchCV)):
reference_pipeline = reference_pipeline.estimator

step_names = reference_pipeline.named_steps.keys()

for p in pipelines:
if isinstance(p, (GridSearchCV, RandomizedSearchCV)):
p = p.estimator
if not isinstance(p, Pipeline):
raise_error("All searchers must use a pipeline.")
if step_names != p.named_steps.keys():
raise_error("All pipelines must have the same named steps.")

# The idea behind the merge is to create a list of parameter
# grids/distributions from a list of pipeline and searchers, to then
# wrap them into a single searcher. Since all the searchers have the same
# steps, this is possible. We just need to concatenate the
# grids/distributions from all searchers. If one of the pipelines is not
# a searcher, then this means that it has no hyperparameters to tune, but
# the pipeline is one of the hyperparameter options.

different_steps = []
for t_step_name in step_names:
# Get the transformer/model of the first element
t = reference_pipeline.named_steps[t_step_name]

# Check that all searchers have the same transformer/model.
# TODO: Fix this comparison, as it always returns False.
for s in pipelines[1:]:
if isinstance(s, (GridSearchCV, RandomizedSearchCV)):
if s.estimator.named_steps[t_step_name] != t:
different_steps.append(t_step_name)
break
else:
if s.named_steps[t_step_name] != t:
different_steps.append(t_step_name)
break

# Then, we will update the grid of the searchers that have different
# transformer/model.
all_grids = []
for s in pipelines:
if isinstance(s, GridSearchCV):
t_grid = s.param_grid.copy()
elif isinstance(s, RandomizedSearchCV):
t_grid = s.param_distributions.copy()
else:
t_grid = {}
for t_name in different_steps:
if isinstance(s, (GridSearchCV, RandomizedSearchCV)):
t_grid[t_name] = [s.estimator.named_steps[t_name]]
else:
t_grid[t_name] = [s.named_steps[t_name]]
all_grids.append(t_grid)

# Finally, we will concatenate the grids and create a new searcher.
new_searcher = _prepare_hyperparameter_tuning(
all_grids, search_params, reference_pipeline
)
return new_searcher
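
A minimal usage sketch for ``merge_pipelines`` (an assumption-laden illustration: plain scikit-learn pipelines with identical step names are used here, whereas in julearn they would normally come from ``PipelineCreator.to_pipeline``):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

pipe_linear = Pipeline(
    [("scaler", StandardScaler()), ("svm", SVC(kernel="linear"))]
)
pipe_rbf = Pipeline(
    [("scaler", StandardScaler()), ("svm", SVC(kernel="rbf"))]
)

# Each plain pipeline carries no grid of its own, so it becomes one entry in
# the merged parameter grid; the result is a single grid searcher that treats
# each candidate pipeline as one hyperparameter option.
merged = merge_pipelines(pipe_linear, pipe_rbf, search_params={"kind": "grid"})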