diff --git a/examples/integrations/sktime_regression_example.py b/examples/integrations/sktime_regression_example.py
new file mode 100644
index 00000000..77825311
--- /dev/null
+++ b/examples/integrations/sktime_regression_example.py
@@ -0,0 +1,41 @@
+"""Example: tune an sktime time series regressor with TSROptCV."""
+
+from sktime.datasets import load_unit_test
+from sktime.regression.distance_based import KNeighborsTimeSeriesRegressor
+
+from hyperactive.integrations.sktime import TSROptCV
+from hyperactive.opt import RandomSearch
+
+# 1. Load data
+# load_unit_test is a small panel classification dataset; the labels are cast
+# to float so they can serve as a regression target in this toy example.
+X_train, y_train = load_unit_test(split="train", return_X_y=True)
+X_test, y_test = load_unit_test(split="test", return_X_y=True)
+y_train = y_train.astype(float)
+y_test = y_test.astype(float)
+
+# 2. Define the search space
+# TSROptCV wraps a single sktime regressor, so we tune
+# KNeighborsTimeSeriesRegressor, a standard sktime time series regressor.
+search_space_knn = {
+    "n_neighbors": list(range(1, 10)),
+    "weights": ["uniform", "distance"],
+}
+
+# 3. Set up the tuner
+tsr_opt = TSROptCV(
+    estimator=KNeighborsTimeSeriesRegressor(),
+    optimizer=RandomSearch(search_space=search_space_knn, n_iter=5),
+    cv=3,
+)
+
+# 4. Run optimization
+tsr_opt.fit(X_train, y_train)
+
+# 5. Check results
+print("Best score:", tsr_opt.best_score_)
+print("Best params:", tsr_opt.best_params_)
+
+# 6. Predict
+y_pred = tsr_opt.predict(X_test)
+print("Predictions shape:", y_pred.shape)
diff --git a/src/hyperactive/experiment/integrations/__init__.py b/src/hyperactive/experiment/integrations/__init__.py
index c302e25a..2c64bece 100644
--- a/src/hyperactive/experiment/integrations/__init__.py
+++ b/src/hyperactive/experiment/integrations/__init__.py
@@ -11,6 +11,9 @@ from hyperactive.experiment.integrations.sktime_forecasting import (
     SktimeForecastingExperiment,
 )
+from hyperactive.experiment.integrations.sktime_regression import (
+    SktimeRegressionExperiment,
+)
 from hyperactive.experiment.integrations.torch_lightning_experiment import (
     TorchExperiment,
 )
@@ -20,5 +23,6 @@
     "SkproProbaRegExperiment",
     "SktimeClassificationExperiment",
     "SktimeForecastingExperiment",
+    "SktimeRegressionExperiment",
     "TorchExperiment",
 ]
diff --git a/src/hyperactive/experiment/integrations/sktime_regression.py b/src/hyperactive/experiment/integrations/sktime_regression.py
new file mode 100644
index 00000000..159f0690
--- /dev/null
+++ b/src/hyperactive/experiment/integrations/sktime_regression.py
@@ -0,0 +1,289 @@
+"""Experiment adapter for sktime regression experiments."""
+
+# copyright: hyperactive developers, MIT License (see LICENSE file)
+
+import numpy as np
+
+from hyperactive.base import BaseExperiment
+
+
+class SktimeRegressionExperiment(BaseExperiment):
+    """Experiment adapter for time series regression experiments.
+
+    This class is used to perform cross-validation experiments using a given
+    sktime regressor. It allows for hyperparameter tuning and evaluation of
+    the model's performance.
+
+    The score returned is the summary cross-validation score, obtained by
+    applying ``sktime`` ``evaluate`` to ``estimator`` with the parameters
+    passed to ``score``.
+
+    The cross-validation scheme is specified by the ``cv`` parameter,
+    and the scoring metric is specified by the ``scoring`` parameter.
+    The ``X`` and ``y`` parameters are the input data and target values,
+    which are used in fit/predict cross-validation.
+
+    Parameters
+    ----------
+    estimator : sktime BaseRegressor descendant (concrete regressor)
+        sktime regressor to benchmark
+
+    X : sktime-compatible panel data (Panel scitype)
+        Panel data container. Supported formats include:
+
+        - ``pd.DataFrame`` with MultiIndex [instance, time] and variable columns
+        - 3D ``np.array`` with shape ``[n_instances, n_dimensions, series_length]``
+        - Other formats listed in ``datatypes.SCITYPE_REGISTER``
+
+    y : sktime-compatible tabular data (Table scitype)
+        Target variable, typically a 1D ``np.ndarray`` or ``pd.Series``
+        of shape ``[n_instances]``.
+
+    cv : int, sklearn cross-validation generator or an iterable, default=3-fold CV
+        Determines the cross-validation splitting strategy.
+        Possible inputs for cv are:
+
+        - None = default = ``KFold(n_splits=3, shuffle=True)``
+        - integer, number of folds in a ``KFold`` splitter with ``shuffle=True``
+        - An iterable yielding (train, test) splits as arrays of indices.
+
+        For integer/None inputs, :class:`KFold` is used.
+
+    scoring : str, callable, default=None
+        Strategy to evaluate the performance of the cross-validated model on
+        the test set. Can be:
+
+        - a single string resolvable to an sklearn scorer
+        - a callable that returns a single value
+        - ``None`` = default = ``mean_squared_error``
+
+    error_score : "raise" or numeric, default=np.nan
+        Value to assign to the score if an exception occurs in estimator fitting.
+        If set to "raise", the exception is raised. If a numeric value is given,
+        FitFailedWarning is raised.
+
+    backend : string, by default "None".
+        Parallelization backend to use for runs.
+        Runs parallel evaluate if specified.
+
+        - "None": executes loop sequentially, simple list comprehension
+        - "loky", "multiprocessing" and "threading": uses ``joblib.Parallel`` loops
+        - "joblib": custom and 3rd party ``joblib`` backends, e.g., ``spark``
+        - "dask": uses ``dask``, requires ``dask`` package in environment
+        - "dask_lazy": same as "dask",
+          but changes the return to (lazy) ``dask.dataframe.DataFrame``.
+        - "ray": uses ``ray``, requires ``ray`` package in environment
+
+        Recommendation: Use "dask" or "loky" for parallel evaluate.
+        "threading" is unlikely to see speed ups due to the GIL and the
+        serialization backend (``cloudpickle``) for "dask" and "loky" is
+        generally more robust than the standard ``pickle`` library used in
+        "multiprocessing".
+
+    backend_params : dict, optional
+        additional parameters passed to the backend as config.
+        Directly passed to ``utils.parallel.parallelize``.
+        Valid keys depend on the value of ``backend``:
+
+        - "None": no additional parameters, ``backend_params`` is ignored
+        - "loky", "multiprocessing" and "threading": default ``joblib`` backends
+          any valid keys for ``joblib.Parallel`` can be passed here, e.g., ``n_jobs``,
+          with the exception of ``backend`` which is directly controlled by ``backend``.
+          If ``n_jobs`` is not passed, it will default to ``-1``, other parameters
+          will default to ``joblib`` defaults.
+        - "joblib": custom and 3rd party ``joblib`` backends, e.g., ``spark``.
+          any valid keys for ``joblib.Parallel`` can be passed here, e.g., ``n_jobs``,
+          ``backend`` must be passed as a key of ``backend_params`` in this case.
+          If ``n_jobs`` is not passed, it will default to ``-1``, other parameters
+          will default to ``joblib`` defaults.
+        - "dask": any valid keys for ``dask.compute``, e.g., ``scheduler``.
+        - "dask_lazy": any valid keys for ``dask.compute``, e.g., ``scheduler``.
+        - "ray": any valid keys for ``ray.init``, e.g., ``num_cpus``.
+    """
+
+    _tags = {
+        "authors": ["fkiraly", "Omswastik-11"],
+        "maintainers": ["SimonBlanke", "fkiraly", "Omswastik-11"],
+        "python_dependencies": "sktime",
+    }
+
+    def __init__(
+        self,
+        estimator,
+        X,
+        y,
+        cv=None,
+        scoring=None,
+        error_score=np.nan,
+        backend=None,
+        backend_params=None,
+    ):
+        self.estimator = estimator
+        self.X = X
+        self.y = y
+        self.cv = cv
+        self.scoring = scoring
+        self.error_score = error_score
+        self.backend = backend
+        self.backend_params = backend_params
+
+        super().__init__()
+
+        self._cv = cv
+        if scoring is None:
+            # default to mean_squared_error, matching the documented default
+            from sklearn.metrics import mean_squared_error
+
+            self._scoring = mean_squared_error
+        else:
+            self._scoring = scoring
+
+        if scoring is None or (
+            hasattr(scoring, "get_tag") and scoring.get_tag("lower_is_better", False)
+        ):
+            higher_or_lower_better = "lower"
+        else:
+            higher_or_lower_better = "higher"
+        self.set_tags(**{"property:higher_or_lower_is_better": higher_or_lower_better})
+
+    def _get_model_parameters(self):
+        """Return the parameters of the model.
+
+        Returns
+        -------
+        list
+            The parameters of the model.
+        """
+        return list(self.estimator.get_params().keys())
+
+    def _evaluate(self, params):
+        """Evaluate the parameters.
+
+        Parameters
+        ----------
+        params : dict with string keys
+            Parameters to evaluate.
+
+        Returns
+        -------
+        float
+            The value of the parameters as per evaluation.
+        dict
+            Additional metadata about the search.
+        """
+        # reuse sktime's panel cross-validation ``evaluate`` utility
+        from sktime.classification.model_evaluation import evaluate
+
+        estimator = self.estimator.clone().set_params(**params)
+
+        # metric function passed to ``evaluate``: use the function attached
+        # by a scorer wrapper, if present
+        metric_func = getattr(self._scoring, "_metric_func", None)
+        if metric_func is None:
+            # fall back to mean_squared_error if no metric function is attached
+            from sklearn.metrics import (
+                mean_squared_error as metric_func,  # type: ignore
+            )
+
+        results = evaluate(
+            estimator,
+            cv=self._cv,
+            X=self.X,
+            y=self.y,
+            scoring=metric_func,
+            error_score=self.error_score,
+            backend=self.backend,
+            backend_params=self.backend_params,
+        )
+
+        metric = metric_func
+        result_name = f"test_{getattr(metric, '__name__', 'score')}"
+
+        res_float = results[result_name].mean()
+
+        return res_float, {"results": results}
+
+    @classmethod
+    def get_test_params(cls, parameter_set="default"):
+        """Return testing parameter settings for the skbase object.
+
+        ``get_test_params`` is a unified interface point to store
+        parameter settings for testing purposes. This function is also
+        used in ``create_test_instance`` and ``create_test_instances_and_names``
+        to construct test instances.
+
+        ``get_test_params`` should return a single ``dict``, or a ``list`` of ``dict``.
+
+        Each ``dict`` is a parameter configuration for testing,
+        and can be used to construct an "interesting" test instance.
+        A call to ``cls(**params)`` should
+        be valid for all dictionaries ``params`` in the return of ``get_test_params``.
+
+        The ``get_test_params`` need not return fixed lists of dictionaries,
+        it can also return dynamic or stochastic parameter settings.
+
+        Parameters
+        ----------
+        parameter_set : str, default="default"
+            Name of the set of test parameters to return, for use in tests. If no
+            special parameters are defined for a value, will return `"default"` set.
+
+        Returns
+        -------
+        params : dict or list of dict, default = {}
+            Parameters to create testing instances of the class.
+            Each dict contains parameters to construct an "interesting" test instance,
+            i.e., `MyClass(**params)` or `MyClass(**params[i])` creates a valid test
+            instance. `create_test_instance` uses the first (or only) dictionary
+            in `params`.
+        """
+        from sklearn.metrics import mean_absolute_error
+        from sklearn.model_selection import KFold
+        from sktime.datasets import load_unit_test
+        from sktime.regression.dummy import DummyRegressor
+
+        X, y = load_unit_test(return_X_y=True, return_type="pd-multiindex")
+        y = y.astype(float)
+
+        params0 = {
+            "estimator": DummyRegressor(strategy="mean"),
+            "X": X,
+            "y": y,
+        }
+
+        params1 = {
+            "estimator": DummyRegressor(strategy="median"),
+            "cv": KFold(n_splits=2),
+            "X": X,
+            "y": y,
+            "scoring": mean_absolute_error,
+        }
+
+        def passthrough_scorer(estimator, X, y):
+            return estimator.score(X, y)
+
+        params2 = {
+            "estimator": DummyRegressor(strategy="quantile", quantile=0.5),
+            "X": X,
+            "y": y,
+            "cv": KFold(n_splits=2),
+            "scoring": passthrough_scorer,
+        }
+
+        return [params0, params1, params2]
+
+    @classmethod
+    def _get_score_params(self):
+        """Return settings for testing score/evaluate functions. Used in tests only.
+
+        Returns a list, the i-th element should be valid arguments for
+        self.evaluate and self.score, of an instance constructed with
+        self.get_test_params()[i].
+
+        Returns
+        -------
+        list of dict
+            The parameters to be used for scoring.
+        """
+        val0 = {}
+        val1 = {"strategy": "mean"}
+        return [val0, val1]
diff --git a/src/hyperactive/integrations/sktime/__init__.py b/src/hyperactive/integrations/sktime/__init__.py
index 256d03ea..b766ef2c 100644
--- a/src/hyperactive/integrations/sktime/__init__.py
+++ b/src/hyperactive/integrations/sktime/__init__.py
@@ -2,5 +2,6 @@
 
 from hyperactive.integrations.sktime._classification import TSCOptCV
 from hyperactive.integrations.sktime._forecasting import ForecastingOptCV
+from hyperactive.integrations.sktime._regression import TSROptCV
 
-__all__ = ["TSCOptCV", "ForecastingOptCV"]
+__all__ = ["TSCOptCV", "ForecastingOptCV", "TSROptCV"]
diff --git a/src/hyperactive/integrations/sktime/_regression.py b/src/hyperactive/integrations/sktime/_regression.py
new file mode 100644
index 00000000..6abb2cad
--- /dev/null
+++ b/src/hyperactive/integrations/sktime/_regression.py
@@ -0,0 +1,247 @@
+# copyright: hyperactive developers, MIT License (see LICENSE file)
+
+import numpy as np
+from skbase.utils.dependencies import _check_soft_dependencies
+
+if _check_soft_dependencies("sktime", severity="none"):
+    from sktime.regression._delegate import _DelegatedRegressor
+else:
+    from skbase.base import BaseEstimator as _DelegatedRegressor
+
+from hyperactive.experiment.integrations.sktime_regression import (
+    SktimeRegressionExperiment,
+)
+
+
+class TSROptCV(_DelegatedRegressor):
+    """Tune an sktime regressor via any optimizer in the hyperactive toolbox.
+
+    ``TSROptCV`` uses any available tuning engine from ``hyperactive``
+    to tune a regressor by cross-validation.
+
+    It passes cross-validation results as scores to the tuning engine,
+    which identifies the best hyperparameters.
+
+    Any available tuning engine from hyperactive can be used, for example:
+
+    * grid search - ``from hyperactive.opt import GridSearchSk as GridSearch``,
+      this results in a plain exhaustive grid search over the parameter grid
+    * hill climbing - ``from hyperactive.opt import HillClimbing``
+    * Optuna TPE (tree-structured Parzen estimator) search -
+      ``from hyperactive.opt.optuna import TPEOptimizer``
+
+    Configuration of the tuning engine is as per the respective documentation.
+
+    Formally, ``TSROptCV`` does the following:
+
+    In ``fit``:
+
+    * wraps the ``estimator``, ``scoring``, and other parameters
+      into a ``SktimeRegressionExperiment`` instance, which is passed to
+      ``optimizer`` as the ``experiment`` argument.
+    * obtains optimal parameters from ``optimizer.solve``, and sets them
+      as the ``best_params_`` and ``best_estimator_`` attributes.
+    * if ``refit=True``, fits ``best_estimator_`` to the entire ``X`` and ``y``.
+
+    In ``predict`` and ``predict``-like methods, calls the respective method
+    of ``best_estimator_`` if ``refit=True``.
+
+    Parameters
+    ----------
+    estimator : sktime regressor, BaseRegressor instance or interface compatible
+        The regressor to tune, must implement the sktime regressor interface.
+
+    optimizer : hyperactive BaseOptimizer
+        The optimizer to be used for hyperparameter search.
+
+    cv : int, sklearn cross-validation generator or an iterable, default=3-fold CV
+        Determines the cross-validation splitting strategy.
+        Possible inputs for cv are:
+
+        - None = default = ``KFold(n_splits=3, shuffle=True)``
+        - integer, number of folds in a ``KFold`` splitter with ``shuffle=True``
+        - An iterable yielding (train, test) splits as arrays of indices.
+
+        For integer/None inputs, :class:`KFold` is used.
+
+    scoring : str, callable, default=None
+        Strategy to evaluate the performance of the cross-validated model on
+        the test set. Can be:
+
+        - a single string resolvable to an sklearn scorer
+        - a callable that returns a single value
+        - ``None`` = default = ``mean_squared_error``
+
+    refit : bool, optional (default=True)
+        True = refit the regressor with the best parameters on the entire data in fit.
+        False = no refitting takes place; the regressor cannot then be used to predict.
+        This is to be used to tune the hyperparameters, and then use the estimator
+        as a parameter estimator, e.g., via ``get_fitted_params``.
+
+    error_score : "raise" or numeric, default=np.nan
+        Value to assign to the score if an exception occurs in estimator fitting.
+        If set to "raise", the exception is raised. If a numeric value is given,
+        FitFailedWarning is raised.
+
+    backend : string, by default "None".
+        Parallelization backend to use for runs.
+        Runs parallel evaluate if specified.
+
+        - "None": executes loop sequentially, simple list comprehension
+        - "loky", "multiprocessing" and "threading": uses ``joblib.Parallel`` loops
+        - "joblib": custom and 3rd party ``joblib`` backends, e.g., ``spark``
+        - "dask": uses ``dask``, requires ``dask`` package in environment
+        - "dask_lazy": same as "dask",
+          but changes the return to (lazy) ``dask.dataframe.DataFrame``.
+        - "ray": uses ``ray``, requires ``ray`` package in environment
+
+        Recommendation: Use "dask" or "loky" for parallel evaluate.
+        "threading" is unlikely to see speed ups due to the GIL and the
+        serialization backend (``cloudpickle``) for "dask" and "loky" is
+        generally more robust than the standard ``pickle`` library used in
+        "multiprocessing".
+
+    backend_params : dict, optional
+        additional parameters passed to the backend as config.
+        Directly passed to ``utils.parallel.parallelize``.
+        Valid keys depend on the value of ``backend``:
+
+        - "None": no additional parameters, ``backend_params`` is ignored
+        - "loky", "multiprocessing" and "threading": default ``joblib`` backends
+          any valid keys for ``joblib.Parallel`` can be passed here, e.g., ``n_jobs``,
+          with the exception of ``backend`` which is directly controlled by ``backend``.
+          If ``n_jobs`` is not passed, it will default to ``-1``, other parameters
+          will default to ``joblib`` defaults.
+        - "joblib": custom and 3rd party ``joblib`` backends, e.g., ``spark``.
+          any valid keys for ``joblib.Parallel`` can be passed here, e.g., ``n_jobs``,
+          ``backend`` must be passed as a key of ``backend_params`` in this case.
+          If ``n_jobs`` is not passed, it will default to ``-1``, other parameters
+          will default to ``joblib`` defaults.
+        - "dask": any valid keys for ``dask.compute``, e.g., ``scheduler``.
+        - "dask_lazy": any valid keys for ``dask.compute``, e.g., ``scheduler``.
+        - "ray": any valid keys for ``ray.init``, e.g., ``num_cpus``.
+    """
+
+    _tags = {
+        "authors": ["Omswastik-11", "fkiraly"],
+        "maintainers": ["fkiraly", "SimonBlanke", "Omswastik-11"],
+        "python_dependencies": "sktime",
+    }
+
+    _delegate_name = "best_estimator_"
+
+    def __init__(
+        self,
+        estimator,
+        optimizer,
+        cv=None,
+        scoring=None,
+        refit=True,
+        error_score=np.nan,
+        backend=None,
+        backend_params=None,
+    ):
+        self.estimator = estimator
+        self.optimizer = optimizer
+        self.cv = cv
+        self.scoring = scoring
+        self.refit = refit
+        self.error_score = error_score
+        self.backend = backend
+        self.backend_params = backend_params
+
+        super().__init__()
+
+    def _fit(self, X, y):
+        """Fit the estimator to the training data.
+
+        Parameters
+        ----------
+        X : sktime-compatible panel data (Panel scitype)
+            Panel data container. Supported formats include:
+
+            - ``pd.DataFrame`` with MultiIndex [instance, time] and variable columns
+            - 3D ``np.array`` with shape ``[n_instances, n_dimensions, series_length]``
+            - Other formats listed in ``datatypes.SCITYPE_REGISTER``
+
+        y : sktime-compatible tabular data (Table scitype)
+            Target variable, typically a 1D ``np.ndarray`` or ``pd.Series``
+            of shape ``[n_instances]``.
+
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
+        self.best_estimator_ = self.estimator.clone()
+
+        # wrap data, splitter, and metric into an experiment for the optimizer
+        experiment = SktimeRegressionExperiment(
+            estimator=self.estimator,
+            X=X,
+            y=y,
+            cv=self.cv,
+            scoring=self.scoring,
+            error_score=self.error_score,
+            backend=self.backend,
+            backend_params=self.backend_params,
+        )
+
+        optimizer = self.optimizer.clone()
+        optimizer.set_params(experiment=experiment)
+        self.best_params_ = optimizer.solve()
+
+        self.best_score_ = getattr(optimizer, "best_score_", np.nan)
+
+        self.best_estimator_.set_params(**self.best_params_)
+
+        if self.refit:
+            self.best_estimator_.fit(X, y)
+
+        return self
+
+    @classmethod
+    def get_test_params(cls, parameter_set="default"):
+        """Return testing parameter settings for the estimator.
+
+        Parameters
+        ----------
+        parameter_set : str, default="default"
+            Name of the set of test parameters to return, for use in tests. If no
+            special parameters are defined for a value, will return ``"default"`` set.
+
+        Returns
+        -------
+        params : dict or list of dict
+        """
+        from sklearn.metrics import mean_squared_error
+        from sklearn.model_selection import KFold
+        from sktime.regression.distance_based import KNeighborsTimeSeriesRegressor
+        from sktime.regression.dummy import DummyRegressor
+
+        from hyperactive.opt.gfo import HillClimbing
+        from hyperactive.opt.gridsearch import GridSearchSk
+        from hyperactive.opt.random_search import RandomSearchSk
+
+        params_gridsearch = {
+            "estimator": DummyRegressor(),
+            "optimizer": GridSearchSk(param_grid={"strategy": ["mean", "median"]}),
+        }
+        params_randomsearch = {
+            "estimator": DummyRegressor(),
+            "cv": 2,
+            "optimizer": RandomSearchSk(
+                param_distributions={"strategy": ["mean", "median"]},
+            ),
+            "scoring": mean_squared_error,
+        }
+        params_hillclimb = {
+            "estimator": KNeighborsTimeSeriesRegressor(),
+            "cv": KFold(n_splits=2, shuffle=False),
+            "optimizer": HillClimbing(
+                search_space={"n_neighbors": [1, 2, 4]},
+                n_iter=10,
+                n_neighbours=5,
+            ),
+            "scoring": mean_squared_error,
+        }
+        return [params_gridsearch, params_randomsearch, params_hillclimb]
diff --git a/src/hyperactive/integrations/sktime/tests/test_sktime_estimators.py b/src/hyperactive/integrations/sktime/tests/test_sktime_estimators.py
index eeed78d3..dc1ae3d8 100644
--- a/src/hyperactive/integrations/sktime/tests/test_sktime_estimators.py
+++ b/src/hyperactive/integrations/sktime/tests/test_sktime_estimators.py
@@ -5,9 +5,9 @@
 from skbase.utils.dependencies import _check_soft_dependencies
 
 if _check_soft_dependencies("sktime", severity="none"):
-    from hyperactive.integrations.sktime import ForecastingOptCV, TSCOptCV
+    from hyperactive.integrations.sktime import ForecastingOptCV, TSCOptCV, TSROptCV
 
-    EST_TO_TEST = [ForecastingOptCV, TSCOptCV]
+    EST_TO_TEST = [ForecastingOptCV, TSCOptCV, TSROptCV]
 else:
     EST_TO_TEST = []
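# ---------------------------------------------------------------------------
# Usage sketch: a minimal end-to-end run of TSROptCV with grid search. This is
# an illustrative sketch, assuming only the API introduced in the diff above
# (TSROptCV, GridSearchSk, sktime's DummyRegressor); the labels of
# load_unit_test are cast to float as in get_test_params to obtain a toy
# regression target.
from sklearn.model_selection import KFold
from sktime.datasets import load_unit_test
from sktime.regression.dummy import DummyRegressor

from hyperactive.integrations.sktime import TSROptCV
from hyperactive.opt.gridsearch import GridSearchSk

X, y = load_unit_test(return_X_y=True, return_type="pd-multiindex")
y = y.astype(float)  # class labels cast to float for a toy regression target

tuned = TSROptCV(
    estimator=DummyRegressor(),
    optimizer=GridSearchSk(param_grid={"strategy": ["mean", "median"]}),
    cv=KFold(n_splits=2),
)
tuned.fit(X, y)  # runs the cross-validated search, then refits the best regressor

print("Best params:", tuned.best_params_)
print("Best score:", tuned.best_score_)
print("First predictions:", tuned.predict(X)[:5])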