[tune] Fix resume="AUTO" compatibility with the new ResumeConfig implementation (ray-project#43179)

Fixes an issue where `resume="AUTO"` should map to `ResumeConfig()` but was not being handled properly by the API adapter code. Adds a test so that CI will catch these issues in the future.
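
As a rough usage sketch of the behavior being fixed (the trainable, experiment name, and storage path below are placeholders, not part of this commit), a bare `resume="AUTO"` should resume an existing experiment of the same name if one exists and otherwise start fresh:

```python
from ray import train, tune


def train_fn(config):
    # Minimal placeholder trainable.
    train.report({"score": 1})


# Before this fix, a bare resume="AUTO" (no "+..." suffix) was not translated
# into a default ResumeConfig() by the legacy-API adapter in tune.run().
tune.run(
    train_fn,
    name="my_experiment",          # assumed experiment name
    storage_path="~/ray_results",  # assumed storage path
    resume="AUTO",
)
```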

---------

Signed-off-by: Justin Yu <justinvyu@anyscale.com>
justinvyu authored and kevin85421 committed Feb 17, 2024
1 parent 45d37cc commit 012f7f7
Showing 3 changed files with 79 additions and 34 deletions.
2 changes: 0 additions & 2 deletions python/ray/tune/execution/experiment_state.py
@@ -18,8 +18,6 @@
logger = logging.getLogger(__name__)


VALID_RESUME_TYPES = [True, "LOCAL", "REMOTE", "PROMPT", "ERRORED_ONLY", "AUTO"]

_EXPERIMENT_SYNC_TIMEOUT_MESSAGE = (
"If this warning keeps showing up, consider diagnosing the "
"reason behind the hanging sync operation, or increase the "
104 changes: 76 additions & 28 deletions python/ray/tune/tests/test_tune_restore.py
@@ -625,34 +625,6 @@ def f(config):
tune.run(f, fail_fast=TuneController.RAISE)


# For some reason, different tests are coupled through tune.registry.
# After running `ResourceExhaustedTest`, there is always a super huge `training_func` to
# be put through GCS, which will fail subsequent tests.
# tldr, make sure that this test is the last test in the file.
class ResourceExhaustedTest(unittest.TestCase):
def test_resource_exhausted_info(self):
"""This is to test if helpful information is displayed when
the objects captured in trainable/training function are too
large and RESOURCES_EXHAUSTED error of gRPC is triggered."""

# generate some random data to be captured implicitly in training func.
from sklearn.datasets import fetch_olivetti_faces

a_large_array = []
for i in range(50):
a_large_array.append(fetch_olivetti_faces())

def training_func(config):
for item in a_large_array:
assert item

with self.assertRaisesRegex(
TuneError,
"The Trainable/training function is too large for grpc resource limit.",
):
tune.run(training_func)


@pytest.mark.parametrize(
"trial_config", [{}, {"attr": 4}, {"nested": {"key": "value"}}]
)
@@ -687,6 +659,82 @@ def train_fn(config):
assert "Inducing exception for testing purposes." in exc_info.value.output.decode()


@pytest.mark.parametrize(
"resume",
[
True,
"AUTO",
"AUTO+ERRORED",
"AUTO+ERRORED_ONLY",
"AUTO+RESTART_ERRORED",
"AUTO+RESTART_ERRORED_ONLY",
],
)
def test_resume_options(tmp_path, resume):
tmp_path.joinpath("dummy_ckpt").mkdir()

def train_fn(config):
checkpoint = ray.train.get_checkpoint()
if not checkpoint:
ray.train.report(
{"finish_marker": False},
checkpoint=Checkpoint.from_directory(tmp_path / "dummy_ckpt"),
)
raise RuntimeError("failing on the first run!!")
ray.train.report({"finish_marker": True})

analysis = tune.run(
train_fn,
storage_path=str(tmp_path),
name="test_resume_options",
raise_on_failed_trial=False,
)
results = ray.tune.ResultGrid(analysis)
assert not results[0].metrics.get("finish_marker", False)
analysis = tune.run(
train_fn,
storage_path=str(tmp_path),
name="test_resume_options",
resume=resume,
raise_on_failed_trial=False,
)
results = ray.tune.ResultGrid(analysis)
if resume in [True, "AUTO", "AUTO+RESTART_ERRORED", "AUTO+RESTART_ERRORED_ONLY"]:
# These options either don't resume the errored trial,
# or restart it without a checkpoint --> leading to the RuntimeError again
assert not results[0].metrics.get("finish_marker")
else:
assert results[0].metrics.get("finish_marker")


# For some reason, different tests are coupled through tune.registry.
# After running `ResourceExhaustedTest`, there is always a super huge `training_func` to
# be put through GCS, which will fail subsequent tests.
# tldr, make sure that this test is the last test in the file.
class ResourceExhaustedTest(unittest.TestCase):
def test_resource_exhausted_info(self):
"""This is to test if helpful information is displayed when
the objects captured in trainable/training function are too
large and RESOURCES_EXHAUSTED error of gRPC is triggered."""

# generate some random data to be captured implicitly in training func.
from sklearn.datasets import fetch_olivetti_faces

a_large_array = []
for i in range(50):
a_large_array.append(fetch_olivetti_faces())

def training_func(config):
for item in a_large_array:
assert item

with self.assertRaisesRegex(
TuneError,
"The Trainable/training function is too large for grpc resource limit.",
):
tune.run(training_func)


if __name__ == "__main__":
import sys

7 changes: 3 additions & 4 deletions python/ray/tune/tune.py
@@ -134,6 +134,7 @@ def _build_resume_config_from_legacy_config(
"Please pass in one of (True, False, 'AUTO')."
)

resume_config = ResumeConfig()
for setting in resume_settings[1:]:
if setting == "ERRORED":
resume_config = ResumeConfig(errored=ResumeConfig.ResumeType.RESUME)
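
For context, here is a simplified, self-contained sketch of what this adapter does (the field names and helper below are illustrative, not Ray's exact internals). The key point of the fix is that the loop must start from a default `ResumeConfig()` so that a suffix-less "AUTO" still yields a config:

```python
from dataclasses import dataclass
from enum import Enum


class ResumeType(Enum):
    RESUME = "resume"
    RESTART = "restart"
    SKIP = "skip"


@dataclass
class ResumeConfig:
    # Illustrative fields only; the real ResumeConfig may differ.
    unfinished: ResumeType = ResumeType.RESUME
    errored: ResumeType = ResumeType.SKIP


def build_resume_config(resume: str) -> ResumeConfig:
    settings = resume.split("+")  # e.g. "AUTO+ERRORED" -> ["AUTO", "ERRORED"]
    resume_config = ResumeConfig()  # the fix: default config for a bare "AUTO"
    for setting in settings[1:]:
        if setting == "ERRORED":
            resume_config = ResumeConfig(errored=ResumeType.RESUME)
        elif setting == "RESTART_ERRORED":
            resume_config = ResumeConfig(errored=ResumeType.RESTART)
        elif setting == "ERRORED_ONLY":
            resume_config = ResumeConfig(
                unfinished=ResumeType.SKIP, errored=ResumeType.RESUME
            )
        elif setting == "RESTART_ERRORED_ONLY":
            resume_config = ResumeConfig(
                unfinished=ResumeType.SKIP, errored=ResumeType.RESTART
            )
    return resume_config


assert build_resume_config("AUTO") == ResumeConfig()
assert build_resume_config("AUTO+ERRORED").errored is ResumeType.RESUME
```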
@@ -457,17 +458,15 @@ def run(
resume: One of [True, False, "AUTO"]. Can
be suffixed with one or more of ["+ERRORED", "+ERRORED_ONLY",
"+RESTART_ERRORED", "+RESTART_ERRORED_ONLY"] (e.g. ``AUTO+ERRORED``).
"AUTO" will attempt to resume from a checkpoint and otherwise
start a new experiment.
`resume=True` and `resume="AUTO"` will attempt to resume from a
checkpoint and otherwise start a new experiment.
The suffix "+ERRORED" resets and reruns errored trials upon resume -
previous trial artifacts will be left untouched. It will try to continue
from the last observed checkpoint.
The suffix "+RESTART_ERRORED" will instead start the errored trials from
scratch. "+ERRORED_ONLY" and "+RESTART_ERRORED_ONLY" will disable
resuming non-errored trials - they will be added as finished instead. New
trials can still be generated by the search algorithm.
If resume is set but checkpoint does not exist,
ValueError will be thrown.
resume_config: [Experimental] Config object that controls how to resume
trials of different statuses. Can be used as a substitute to the
`resume` suffixes described above.
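
A hedged example of the suffixed options (the trainable, experiment name, and storage path are placeholders, and assume an earlier run of the same experiment exists):

```python
from ray import train, tune


def train_fn(config):
    train.report({"score": 1})  # placeholder trainable


# Resume the experiment and rerun previously errored trials from their last
# observed checkpoint. "+RESTART_ERRORED" would instead restart them from
# scratch; the *_ONLY variants skip resuming non-errored trials entirely.
tune.run(
    train_fn,
    name="my_experiment",          # assumed to match the original run
    storage_path="~/ray_results",  # assumed storage path
    resume="AUTO+ERRORED",
)
```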
