[ENH] Allow list of hyperparameter options for tuning #47

Merged
6 commits merged on May 25, 2023
Changes from 3 commits
102 changes: 102 additions & 0 deletions examples/03_complex_models/run_hyperparameter_multiple_grids.py
@@ -0,0 +1,102 @@
"""
Tuning Multiple Hyperparameter Grids
====================================

This example uses the 'fmri' dataset and performs a simple binary
classification using a Support Vector Machine classifier, while tuning
multiple hyperparameter grids at the same time.


References
----------
Waskom, M.L., Frank, M.C., Wagner, A.D. (2016). Adaptive engagement of
cognitive control in context-dependent decision-making. Cerebral Cortex.


.. include:: ../../links.inc
"""
# Authors: Federico Raimondo <f.raimondo@fz-juelich.de>
#
# License: AGPL
import numpy as np
from seaborn import load_dataset

from julearn import run_cross_validation
from julearn.utils import configure_logging
from julearn.pipeline import PipelineCreator

###############################################################################
# Set the logging level to info to see extra information
configure_logging(level="INFO")

###############################################################################
# Set the random seed to always have the same example
np.random.seed(42)

###############################################################################
# Load the dataset
df_fmri = load_dataset("fmri")
print(df_fmri.head())

###############################################################################
# Set the dataframe in the right format
df_fmri = df_fmri.pivot(
    index=["subject", "timepoint", "event"], columns="region", values="signal"
)

df_fmri = df_fmri.reset_index()
print(df_fmri.head())

X = ["frontal", "parietal"]
y = "event"

###############################################################################
# Let's do a first attempt and use a linear SVM with the default parameters.

creator = PipelineCreator(problem_type="classification")
creator.add("zscore")
creator.add("svm", kernel="linear")

scores = run_cross_validation(X=X, y=y, data=df_fmri, model=creator)

print(scores["test_score"].mean())

###############################################################################
# Now let's tune this SVM a bit. We will use a grid search to tune the
# regularization parameter C and the kernel. We will also tune gamma, but
# since gamma is only used by the rbf kernel, we will use a different grid
# for the rbf kernel.
#
# To specify two different sets of parameters for the same step, we can
# explicitly specify the name of the step. This is done by passing the
# ``name`` parameter to the ``add`` method.
creator = PipelineCreator(problem_type="classification")
creator.add("zscore")
creator.add("svm", kernel="linear", C=[0.01, 0.1], name="svm")
creator.add(
"svm",
kernel="rbf",
C=[0.01, 0.1],
gamma=["scale", "auto", 1e-2, 1e-3],
name="svm",
)

search_params = {
"kind": "grid",
"cv": 2, # to speed up the example
}

scores, estimator = run_cross_validation(
    X=X,
    y=y,
    data=df_fmri,
    model=creator,
    search_params=search_params,
    return_estimator="final",
)

print(scores["test_score"].mean())
###############################################################################
# It seems that we might have found a better model, but which one is it?
print(estimator.best_params_)
print(estimator.best_estimator_["svm"]._gamma)
45 changes: 37 additions & 8 deletions julearn/api.py
@@ -15,6 +15,7 @@
 from sklearn.pipeline import Pipeline
 
 from .pipeline import PipelineCreator
+from .pipeline.merger import merge_pipelines
 from .prepare import check_consistency, prepare_input_data
 from .scoring import check_scoring
 from .utils import logger, raise_error, _compute_cvmdsum
@@ -216,24 +217,52 @@ def run_cross_validation(
             "model, use PipelineCreator instead"
         )
 
-    if isinstance(model, PipelineCreator):
+    if isinstance(model, (PipelineCreator, list)):
         if preprocess is not None:
             raise_error(
-                "If model is a PipelineCreator, preprocess should be None"
+                "If model is a PipelineCreator (or list of), "
+                "preprocess should be None"
             )
         if problem_type is not None:
             raise_error("Problem type should be set in the PipelineCreator")
 
         if len(model_params) > 0:
             raise_error(
-                "If model is a PipelineCreator, model_params must be None. "
-                f"Currently, it contains {model_params.keys()}"
+                "If model is a PipelineCreator (or list of), model_params must"
+                f" be None. Currently, it contains {model_params.keys()}"
             )
+        if isinstance(model, list):
+            if any(not isinstance(m, PipelineCreator) for m in model):
+                raise_error(
+                    "If model is a list, all elements must be PipelineCreator"
+                )
+        else:
+            model = [model]
 
-        pipeline = model.to_pipeline(
-            X_types=X_types, search_params=search_params
-        )
-        problem_type = model.problem_type
+        problem_types = set([m.problem_type for m in model])
+        if len(problem_types) > 1:
+            raise_error(
+                "If model is a list of PipelineCreator, all elements must have"
+                " the same problem_type"
+            )
+
+        expanded_models = []
+        for m in model:
+            expanded_models.extend(m.split())
+
+        all_pipelines = [
+            x.to_pipeline(X_types=X_types, search_params=search_params)
+            for x in expanded_models
+        ]
+
+        if len(all_pipelines) > 1:
+            pipeline = merge_pipelines(
+                *all_pipelines, search_params=search_params
+            )
+        else:
+            pipeline = all_pipelines[0]
+
+        problem_type = model[0].problem_type
 
     elif not isinstance(model, (str, BaseEstimator)):
         raise_error(
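For context, with this change ``run_cross_validation`` also accepts a list of ``PipelineCreator`` objects (all sharing the same ``problem_type``); each creator is expanded via ``split()`` and the resulting pipelines are merged into a single searcher. A minimal sketch of that usage, reusing ``X``, ``y`` and ``df_fmri`` from the example above (the creator names are illustrative and not part of this diff):

from julearn import run_cross_validation
from julearn.pipeline import PipelineCreator

# One creator per kernel; both must share the same problem_type.
creator_linear = PipelineCreator(problem_type="classification")
creator_linear.add("zscore")
creator_linear.add("svm", kernel="linear", C=[0.01, 0.1])

creator_rbf = PipelineCreator(problem_type="classification")
creator_rbf.add("zscore")
creator_rbf.add("svm", kernel="rbf", C=[0.01, 0.1], gamma=["scale", "auto"])

scores = run_cross_validation(
    X=X,
    y=y,
    data=df_fmri,
    model=[creator_linear, creator_rbf],  # list of PipelineCreator objects
    search_params={"kind": "grid"},
)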
116 changes: 116 additions & 0 deletions julearn/pipeline/merger.py
@@ -0,0 +1,116 @@
from typing import Dict

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline

from ..utils.logging import raise_error
from ..utils.typing import EstimatorLike
from .pipeline_creator import _prepare_hyperparameter_tuning
from ..prepare import prepare_search_params


def merge_pipelines(
    *pipelines: EstimatorLike, search_params: Dict
) -> Pipeline:
    """Merge multiple pipelines into a single one.

    Parameters
    ----------
    pipelines : List[EstimatorLike]
        List of estimators that will be merged.
    search_params : Dict
        Dictionary with the search parameters.

    Returns
    -------
    merged : BaseSearchCV
        The merged pipeline as a searcher.
    """

    # Check that we only merge pipelines and searchers. And if there is a
    # searcher, they are all of the same kind and match the search params.

    search_params = prepare_search_params(search_params)

    for p in pipelines:
        if not isinstance(p, (Pipeline, GridSearchCV, RandomizedSearchCV)):
            raise_error(
                "Only pipelines and searchers are supported. "
                f"Found {type(p)} instead.")
        if isinstance(p, GridSearchCV):
            if search_params["kind"] != "grid":
                raise_error(
                    "At least one of the pipelines to merge is a "
                    "GridSearchCV, but the search params do not specify a "
                    "grid search. These pipelines cannot be merged."
                )
        elif isinstance(p, RandomizedSearchCV):
            if search_params["kind"] != "random":
                raise_error(
                    "At least one of the pipelines to merge is a "
                    "RandomizedSearchCV, but the search params do not specify "
                    "a random search. These pipelines cannot be merged."
                )

    # Check that all estimators have the same named steps in their pipelines.
    reference_pipeline = pipelines[0]
    if isinstance(reference_pipeline, (GridSearchCV, RandomizedSearchCV)):
        reference_pipeline = reference_pipeline.estimator

    step_names = reference_pipeline.named_steps.keys()

    for p in pipelines:
        if isinstance(p, (GridSearchCV, RandomizedSearchCV)):
            p = p.estimator
        if not isinstance(p, Pipeline):
            raise_error("All searchers must use a pipeline.")
        if step_names != p.named_steps.keys():
            raise_error("All pipelines must have the same named steps.")

    # The idea behind the merge is to create a list of parameter
    # grids/distributions from a list of pipelines and searchers, and then
    # wrap them into a single searcher. Since all the searchers have the same
    # steps, this is possible. We just need to concatenate the
    # grids/distributions from all searchers. If one of the pipelines is not
    # a searcher, it has no hyperparameters to tune, but the pipeline itself
    # is still one of the hyperparameter options.

    different_steps = []
    for t_name in step_names:
        # Get the transformer/model of the first element
        t = reference_pipeline.named_steps[t_name]

        # Check that all searchers have the same transformer/model.
        # TODO: Fix this comparison, as it always returns False.
        for s in pipelines[1:]:
            if isinstance(s, (GridSearchCV, RandomizedSearchCV)):
                if s.estimator.named_steps[t_name] != t:
                    different_steps.append(t_name)
                    break
            else:
                if s.named_steps[t_name] != t:
                    different_steps.append(t_name)
                    break

    # Then, we will update the grid of the searchers that have a different
    # transformer/model.
    all_grids = []
    for s in pipelines:
        if isinstance(s, GridSearchCV):
            t_grid = s.param_grid.copy()
        elif isinstance(s, RandomizedSearchCV):
            t_grid = s.param_distributions.copy()
        else:
            t_grid = {}
        for t_name in different_steps:
            if isinstance(s, (GridSearchCV, RandomizedSearchCV)):
                t_grid[t_name] = [s.estimator.named_steps[t_name]]
            else:
                t_grid[t_name] = [s.named_steps[t_name]]
        all_grids.append(t_grid)

    # Finally, we will concatenate the grids and create a new searcher.
    new_searcher = _prepare_hyperparameter_tuning(
        all_grids, search_params, reference_pipeline
    )
    return new_searcher
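To illustrate the merging idea in plain scikit-learn terms, here is a standalone sketch (scikit-learn only, not julearn's internal code): ``GridSearchCV`` accepts a list of grids, and a pipeline step can itself be listed as a parameter, so a single searcher can cover both kernels while swapping the step object.

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

pipe = Pipeline([("zscore", StandardScaler()), ("svm", SVC())])

# Each dict is one grid; the "svm" entry swaps the step itself, mirroring
# how merge_pipelines adds the differing step as a one-element list.
merged_grid = [
    {"svm": [SVC(kernel="linear")], "svm__C": [0.01, 0.1]},
    {
        "svm": [SVC(kernel="rbf")],
        "svm__C": [0.01, 0.1],
        "svm__gamma": ["scale", "auto", 1e-2, 1e-3],
    },
]
search = GridSearchCV(pipe, merged_grid, cv=2)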