[train+data] Remove preprocessor argument from trainers (ray-project#43146)

This PR removes the hard-deprecated `preprocessor` argument fully. This is a follow-up to ray-project#38640.
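
For users migrating, the replacement pattern is to fit and apply a Ray Data preprocessor to the datasets up front and pass the already-transformed datasets to the trainer. A minimal sketch, assuming an XGBoost setup with illustrative column names and parameters:

import ray
from ray.data.preprocessors import StandardScaler
from ray.train import ScalingConfig
from ray.train.xgboost import XGBoostTrainer

# Illustrative in-memory datasets.
train_ds = ray.data.from_items([{"x": float(i), "y": i % 2} for i in range(100)])
valid_ds = ray.data.from_items([{"x": float(i), "y": i % 2} for i in range(20)])

# Fit and apply the preprocessor to the datasets directly,
# instead of passing `preprocessor=` to the trainer.
preprocessor = StandardScaler(columns=["x"])
train_ds = preprocessor.fit_transform(train_ds)
valid_ds = preprocessor.transform(valid_ds)

trainer = XGBoostTrainer(
    scaling_config=ScalingConfig(num_workers=2),
    label_column="y",
    params={"objective": "binary:logistic"},
    datasets={"train": train_ds, "valid": valid_ds},
)
result = trainer.fit()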

---------

Signed-off-by: Justin Yu <justinvyu@anyscale.com>
justinvyu authored and kevin85421 committed Feb 17, 2024
1 parent b9d2481 commit 8b64562
Showing 10 changed files with 33 additions and 150 deletions.
6 changes: 0 additions & 6 deletions python/ray/air/tests/test_api.py
@@ -3,7 +3,6 @@
import ray
from ray.train import Checkpoint, CheckpointConfig, ScalingConfig
from ray.air._internal.config import ensure_only_allowed_dataclass_keys_updated
from ray.data.preprocessor import Preprocessor
from ray.train.trainer import BaseTrainer


@@ -201,11 +200,6 @@ def test_datasets():
DummyTrainer(datasets={"test": DummyDataset()})


def test_preprocessor_deprecated():
with pytest.raises(DeprecationWarning):
DummyTrainer(preprocessor=Preprocessor())


def test_resume_from_checkpoint(tmpdir):
with pytest.raises(ValueError):
DummyTrainer(resume_from_checkpoint="invalid")
10 changes: 0 additions & 10 deletions python/ray/train/_internal/data_config.py
@@ -5,7 +5,6 @@
from ray.actor import ActorHandle
from ray.data import DataIterator, Dataset, ExecutionOptions, NodeIdStr
from ray.data._internal.execution.interfaces.execution_options import ExecutionResources
from ray.data.preprocessor import Preprocessor
from ray.util.annotations import DeveloperAPI, PublicAPI


@@ -132,12 +131,3 @@ def default_ingest_options() -> ExecutionOptions:
preserve_order=ctx.execution_options.preserve_order,
verbose_progress=ctx.execution_options.verbose_progress,
)

def _legacy_preprocessing(
self, datasets: Dict[str, Dataset], preprocessor: Optional[Preprocessor]
) -> Dict[str, Dataset]:
"""Legacy hook for backwards compatiblity.
This will be removed in the future.
"""
return datasets # No-op for non-legacy configs.
75 changes: 12 additions & 63 deletions python/ray/train/base_trainer.py
@@ -21,13 +21,11 @@
from ray.train import Checkpoint
from ray.train._internal.session import _get_session
from ray.train._internal.storage import _exists_at_fs_path, get_fs_and_path
from ray.train.constants import TRAIN_DATASET_KEY
from ray.util import PublicAPI
from ray.util.annotations import DeveloperAPI

if TYPE_CHECKING:
from ray.data import Dataset
from ray.data.preprocessor import Preprocessor
from ray.tune import Trainable

_TRAINER_PKL = "trainer.pkl"
@@ -89,9 +87,7 @@ class BaseTrainer(abc.ABC):
called in sequence on the remote actor.
- ``trainer.setup()``: Any heavyweight Trainer setup should be
specified here.
- ``trainer.preprocess_datasets()``: The datasets passed to the Trainer will be
setup here.
- ``trainer.train_loop()``: Executes the main training logic.
- ``trainer.training_loop()``: Executes the main training logic.
- Calling ``trainer.fit()`` will return a ``ray.result.Result``
object where you can access metrics from your training run, as well
as any checkpoints that may have been saved.
@@ -191,16 +187,13 @@ def __init__(
datasets: Optional[Dict[str, GenDataset]] = None,
metadata: Optional[Dict[str, Any]] = None,
resume_from_checkpoint: Optional[Checkpoint] = None,
# Deprecated.
preprocessor: Optional["Preprocessor"] = None,
):
self.scaling_config = (
scaling_config if scaling_config is not None else ScalingConfig()
)
self.run_config = run_config if run_config is not None else RunConfig()
self.metadata = metadata
self.datasets = datasets if datasets is not None else {}
self.preprocessor = preprocessor
self.starting_checkpoint = resume_from_checkpoint

# These attributes should only be set through `BaseTrainer.restore`
@@ -211,17 +204,13 @@

air_usage.tag_air_trainer(self)

if preprocessor is not None:
raise DeprecationWarning(PREPROCESSOR_DEPRECATION_MESSAGE)

@PublicAPI(stability="alpha")
@classmethod
def restore(
cls: Type["BaseTrainer"],
path: Union[str, os.PathLike],
storage_filesystem: Optional[pyarrow.fs.FileSystem] = None,
datasets: Optional[Dict[str, GenDataset]] = None,
preprocessor: Optional["Preprocessor"] = None,
scaling_config: Optional[ScalingConfig] = None,
**kwargs,
) -> "BaseTrainer":
@@ -351,10 +340,6 @@ def training_loop(self):
)
param_dict["datasets"] = datasets

# If no preprocessor is re-specified, then it will be set to None
# here and loaded from the latest checkpoint
param_dict["preprocessor"] = preprocessor

if scaling_config:
param_dict["scaling_config"] = scaling_config

@@ -470,15 +455,6 @@ def _validate_attributes(self):
f"{self.metadata}: {e}"
)

# Preprocessor
if self.preprocessor is not None and not isinstance(
self.preprocessor, ray.data.Preprocessor
):
raise ValueError(
f"`preprocessor` should be an instance of `ray.data.Preprocessor`, "
f"found {type(self.preprocessor)} with value `{self.preprocessor}`."
)

if self.starting_checkpoint is not None and not isinstance(
self.starting_checkpoint, Checkpoint
):
@@ -511,50 +487,19 @@ def setup(self) -> None:
pass

def preprocess_datasets(self) -> None:
"""Called during fit() to preprocess dataset attributes with preprocessor.
.. note:: This method is run on a remote process.
This method is called prior to entering the training_loop.
If the ``Trainer`` has both a datasets dict and
a preprocessor, the datasets dict contains a training dataset (denoted by
the "train" key), and the preprocessor has not yet
been fit, then it will be fit on the train dataset.
Then, all Trainer's datasets will be transformed by the preprocessor.
The transformed datasets will be set back in the ``self.datasets`` attribute
of the Trainer to be used when overriding ``training_loop``.
"""
# Evaluate all datasets.
self.datasets = {k: d() if callable(d) else d for k, d in self.datasets.items()}

if self.preprocessor:
train_dataset = self.datasets.get(TRAIN_DATASET_KEY, None)
if train_dataset and self.preprocessor.fit_status() in (
ray.data.Preprocessor.FitStatus.NOT_FITTED,
ray.data.Preprocessor.FitStatus.PARTIALLY_FITTED,
):
self.preprocessor.fit(train_dataset)

# Execute dataset transformations serially for now.
# Cannot execute them in remote tasks due to dataset ownership model:
# if datasets are created on a remote node, then if that node fails,
# we cannot recover the dataset.
new_datasets = {}
for key, dataset in self.datasets.items():
new_datasets[key] = self.preprocessor.transform(dataset)

self.datasets = new_datasets
"""Deprecated."""
raise DeprecationWarning(
"`preprocess_datasets` is no longer used, since preprocessors "
f"are no longer accepted by Trainers.\n{PREPROCESSOR_DEPRECATION_MESSAGE}"
)

@abc.abstractmethod
def training_loop(self) -> None:
"""Loop called by fit() to run training and report results to Tune.
.. note:: This method runs on a remote process.
``self.datasets`` have already been preprocessed by ``self.preprocessor``.
``self.datasets`` have already been evaluated if they were wrapped in a factory.
You can use the :ref:`Ray Train utilities <train-loop-api>`
(:func:`train.report() <ray.train.report>` and
@@ -729,8 +674,12 @@ def train_func(config):
# else: Train will restore from the user-provided
# `resume_from_checkpoint` == `starting_checkpoint`.

# Evaluate datasets if they are wrapped in a factory.
trainer.datasets = {
k: d() if callable(d) else d for k, d in self.datasets.items()
}

trainer.setup()
trainer.preprocess_datasets()
trainer.training_loop()

# Change the name of the training function to match the name of the Trainer
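As context for the `train_func` change above, datasets may be passed to a trainer either directly or as zero-argument factories; the factory form is now evaluated on the remote trainer actor just before `setup()`. A minimal sketch, with an illustrative read path and trainer:

import ray
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer

def train_loop_per_worker():
    ...  # training logic goes here

# A zero-argument callable is a valid dataset entry; it is only
# evaluated on the remote trainer actor, right before `trainer.setup()`.
def make_train_dataset():
    return ray.data.read_parquet("s3://my-bucket/train/")  # illustrative path

trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=4),
    datasets={"train": make_train_dataset},
)
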
26 changes: 7 additions & 19 deletions python/ray/train/data_parallel_trainer.py
@@ -1,5 +1,5 @@
import logging
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Type, Union
from typing import Any, Callable, Dict, List, Optional, Type, Union

import ray
from ray._private.thirdparty.tabulate.tabulate import tabulate
@@ -15,9 +15,6 @@
from ray.widgets import Template
from ray.widgets.util import repr_with_fallback

if TYPE_CHECKING:
from ray.data.preprocessor import Preprocessor

logger = logging.getLogger(__name__)


@@ -191,11 +188,12 @@ def __init__(self, train_loop_per_worker, my_backend_config:
dataset_config: Configuration for dataset ingest. This is merged with the
default dataset config for the given trainer (`cls._dataset_config`).
run_config: Configuration for the execution of the training run.
datasets: Any Datasets to use for training. Use
the key "train" to denote which dataset is the training
dataset. If a ``preprocessor`` is provided and has not already been fit,
it will be fit on the training dataset. All datasets will be transformed
by the ``preprocessor`` if one is provided.
datasets: Ray Datasets to use for training and evaluation.
This is a dict where the key is the name of the dataset, which
can be accessed from within the ``train_loop_per_worker`` by calling
``train.get_dataset_shard(dataset_key)``.
By default, all datasets are sharded equally across workers.
This can be configured via ``dataset_config``.
metadata: Dict that should be made available via
`train.get_context().get_metadata()` and in `checkpoint.get_metadata()`
for checkpoints saved from this Trainer. Must be JSON-serializable.
@@ -233,8 +231,6 @@ def __init__(
datasets: Optional[Dict[str, GenDataset]] = None,
metadata: Optional[Dict[str, Any]] = None,
resume_from_checkpoint: Optional[Checkpoint] = None,
# Deprecated.
preprocessor: Optional["Preprocessor"] = None,
):
self._train_loop_per_worker = train_loop_per_worker
self._train_loop_config = train_loop_config
@@ -259,7 +255,6 @@ def __init__(
run_config=run_config,
datasets=datasets,
metadata=metadata,
preprocessor=preprocessor,
resume_from_checkpoint=resume_from_checkpoint,
)

@@ -314,13 +309,6 @@ def _validate_attributes(self):
self._train_loop_per_worker, "train_loop_per_worker"
)

def preprocess_datasets(self) -> None:
# Evaluate all datasets.
self.datasets = {k: d() if callable(d) else d for k, d in self.datasets.items()}
self.datasets = self._data_config._legacy_preprocessing(
self.datasets, self.preprocessor
)

def _validate_train_loop_per_worker(
self, train_loop_per_worker: Callable, fn_name: str
) -> None:
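A minimal sketch of the dataset-access pattern referenced in the updated `datasets` docstring, with an illustrative batch size and dataset (the real optimization step is elided):

import ray
from ray import train
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer

def train_loop_per_worker(config):
    # Each worker receives its shard of the "train" dataset.
    shard = train.get_dataset_shard("train")
    for epoch in range(2):
        for batch in shard.iter_batches(batch_size=32):
            pass  # replace with a real optimization step
        train.report({"epoch": epoch})

train_ds = ray.data.from_items([{"x": float(i), "y": i % 2} for i in range(1000)])

trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=2),
    datasets={"train": train_ds},
)
result = trainer.fit()
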
14 changes: 11 additions & 3 deletions python/ray/train/gbdt_trainer.py
@@ -3,6 +3,8 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Dict, Optional, Type

from packaging.version import Version

from ray.train import Checkpoint, RunConfig, ScalingConfig
from ray.train.constants import TRAIN_DATASET_KEY
from ray.train.trainer import BaseTrainer, GenDataset
@@ -13,7 +15,6 @@
if TYPE_CHECKING:
import xgboost_ray

from ray.data.preprocessor import Preprocessor

_WARN_REPARTITION_THRESHOLD = 10 * 1024**3
_DEFAULT_NUM_ITERATIONS = 10
@@ -149,7 +150,6 @@ def __init__(
num_boost_round: int = _DEFAULT_NUM_ITERATIONS,
scaling_config: Optional[ScalingConfig] = None,
run_config: Optional[RunConfig] = None,
preprocessor: Optional["Preprocessor"] = None, # Deprecated
resume_from_checkpoint: Optional[Checkpoint] = None,
metadata: Optional[Dict[str, Any]] = None,
**train_kwargs,
@@ -165,7 +165,6 @@
scaling_config=scaling_config,
run_config=run_config,
datasets=datasets,
preprocessor=preprocessor,
resume_from_checkpoint=resume_from_checkpoint,
metadata=metadata,
)
@@ -276,6 +275,15 @@ def _repartition_datasets_to_match_num_actors(self):
self._ray_params.num_actors
)

def setup(self) -> None:
import xgboost_ray

# XGBoost/LightGBM-Ray requires each dataset to have at least as many
# blocks as there are workers.
# This is only applicable for xgboost-ray<0.1.16
if Version(xgboost_ray.__version__) < Version("0.1.16"):
self._repartition_datasets_to_match_num_actors()

def training_loop(self) -> None:
config = self.train_kwargs.copy()
config[self._num_iterations_argument] = self.num_boost_round
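The block-count requirement that the new `setup()` guards against can be expressed directly with Ray Data; a sketch with illustrative values:

import ray

num_actors = 4  # illustrative number of training workers

ds = ray.data.range(1000)
# Older xgboost-ray (<0.1.16) needs at least one block per actor, so
# under-partitioned datasets are repartitioned to match the actor count.
if ds.num_blocks() < num_actors:
    ds = ds.repartition(num_actors)
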
8 changes: 1 addition & 7 deletions python/ray/train/horovod/horovod_trainer.py
@@ -1,4 +1,4 @@
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union
from typing import Any, Callable, Dict, Optional, Union

from ray.air.config import RunConfig, ScalingConfig
from ray.train import Checkpoint, DataConfig
@@ -7,9 +7,6 @@
from ray.train.trainer import GenDataset
from ray.util.annotations import PublicAPI

if TYPE_CHECKING:
from ray.data.preprocessor import Preprocessor


@PublicAPI(stability="beta")
class HorovodTrainer(DataParallelTrainer):
@@ -191,8 +188,6 @@ def __init__(
datasets: Optional[Dict[str, GenDataset]] = None,
metadata: Optional[Dict[str, Any]] = None,
resume_from_checkpoint: Optional[Checkpoint] = None,
# Deprecated.
preprocessor: Optional["Preprocessor"] = None,
):
super().__init__(
train_loop_per_worker=train_loop_per_worker,
@@ -202,7 +197,6 @@ def __init__(
dataset_config=dataset_config,
run_config=run_config,
datasets=datasets,
preprocessor=preprocessor,
resume_from_checkpoint=resume_from_checkpoint,
metadata=metadata,
)
16 changes: 0 additions & 16 deletions python/ray/train/lightgbm/lightgbm_trainer.py
@@ -7,11 +7,6 @@
from ray.train.lightgbm import RayTrainReportCallback
from ray.util.annotations import PublicAPI

try:
from packaging.version import Version
except ImportError:
from distutils.version import LooseVersion as Version


@PublicAPI(stability="beta")
class LightGBMTrainer(GBDTTrainer):
@@ -123,14 +118,3 @@ def _model_iteration(
if isinstance(model, lightgbm.Booster):
return model.current_iteration()
return model.booster_.current_iteration()

def preprocess_datasets(self) -> None:
super().preprocess_datasets()

# XGBoost/LightGBM-Ray requires each dataset to have at least as many
# blocks as there are workers.
# This is only applicable for xgboost-ray<0.1.16
import xgboost_ray

if Version(xgboost_ray.__version__) < Version("0.1.16"):
self._repartition_datasets_to_match_num_actors()
