Make log_line_prefix_template Optional in Elastic Launcher for Backward Compatibility (#2888)

* Fix unexpected keyword argument error for elastic launch config

* Update torch version flow

* Remove log prefix template from env vars
yhna940 committed Jul 3, 2024
1 parent 3086e26 commit 404510a
Showing 2 changed files with 10 additions and 3 deletions.
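
In short: notebook_launcher gains an explicit log_line_prefix_template keyword (defaulting to None), stops reading TORCHELASTIC_LOG_LINE_PREFIX_TEMPLATE from the environment, and only forwards the value to torch's LaunchConfig when the installed PyTorch is at least 2.2.0, so older versions no longer hit an unexpected-keyword-argument error. A rough usage sketch (the training function and template value below are illustrative, not part of this commit):

from accelerate import notebook_launcher


def training_function():
    # Placeholder for a real training loop.
    print("training step")


# Works on any supported PyTorch version; the new argument simply defaults to None.
notebook_launcher(training_function, num_processes=2)

# On PyTorch >= 2.2.0, a per-rank log prefix can be requested explicitly.
notebook_launcher(
    training_function,
    num_processes=2,
    log_line_prefix_template="[rank ${rank}] ",  # illustrative value
)
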
12 changes: 9 additions & 3 deletions src/accelerate/launchers.py
@@ -26,8 +26,10 @@
     check_cuda_p2p_ib_support,
     get_gpu_info,
     is_mps_available,
+    is_torch_version,
     patch_environment,
 )
+from .utils.constants import ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION


 def test_launch():
@@ -50,6 +52,7 @@ def notebook_launcher(
     rdzv_id="none",
     max_restarts=0,
     monitor_interval=0.1,
+    log_line_prefix_template=None,
 ):
     """
     Launches a training function, using several processes or multiple nodes if it's possible in the current environment
@@ -96,6 +99,8 @@ def notebook_launcher(
             The maximum amount of restarts that elastic agent will conduct on workers before failure.
         monitor_interval (`float`, *optional*, defaults to 0.1):
             The interval in seconds that is used by the elastic_agent as a period of monitoring workers.
+        log_line_prefix_template (`str`, *optional*, defaults to `None`):
+            The prefix template for elastic launch logging. Available from PyTorch 2.2.0.

     Example:
@@ -223,7 +228,7 @@ def train(*args):
                         rdzv_conf["rank"] = node_rank
                         if not rdzv_endpoint:
                             rdzv_endpoint = f"{master_addr}:{use_port}"
-                    launch_config = LaunchConfig(
+                    launch_config_kwargs = dict(
                         min_nodes=num_nodes,
                         max_nodes=num_nodes,
                         nproc_per_node=num_processes,
@@ -234,9 +239,10 @@
                         max_restarts=max_restarts,
                         monitor_interval=monitor_interval,
                         start_method="fork",
-                        log_line_prefix_template=os.environ.get("TORCHELASTIC_LOG_LINE_PREFIX_TEMPLATE"),
                     )
-                    elastic_launch(config=launch_config, entrypoint=function)(*args)
+                    if is_torch_version(">=", ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION):
+                        launch_config_kwargs["log_line_prefix_template"] = log_line_prefix_template
+                    elastic_launch(config=LaunchConfig(**launch_config_kwargs), entrypoint=function)(*args)
                 except ProcessRaisedException as e:
                     if "Cannot re-initialize CUDA in forked subprocess" in e.args[0]:
                         raise RuntimeError(
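
The core of the launchers.py change is the version gate above: the LaunchConfig keyword arguments are collected in a plain dict, and the PyTorch-2.2-only key is attached only when the running torch supports it. A standalone sketch of the same pattern, substituting packaging.version for accelerate's is_torch_version helper (the function name and the minimal set of kwargs here are illustrative):

import torch
from packaging import version
from torch.distributed.launcher.api import LaunchConfig

# Mirrors ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION defined in
# src/accelerate/utils/constants.py below.
MIN_TORCH_FOR_LOG_PREFIX = "2.2.0"


def build_launch_config(num_nodes, num_processes, log_line_prefix_template=None):
    # Keyword arguments accepted by every supported PyTorch release.
    kwargs = dict(
        min_nodes=num_nodes,
        max_nodes=num_nodes,
        nproc_per_node=num_processes,
        start_method="fork",
    )
    # Attach the newer keyword only when this PyTorch's LaunchConfig defines it,
    # so older releases do not raise "unexpected keyword argument".
    if version.parse(torch.__version__) >= version.parse(MIN_TORCH_FOR_LOG_PREFIX):
        kwargs["log_line_prefix_template"] = log_line_prefix_template
    return LaunchConfig(**kwargs)
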
1 change: 1 addition & 0 deletions src/accelerate/utils/constants.py
@@ -41,6 +41,7 @@
 FSDP_MODEL_NAME = "pytorch_model_fsdp"
 DEEPSPEED_MULTINODE_LAUNCHERS = ["pdsh", "standard", "openmpi", "mvapich", "mpich"]
 TORCH_DYNAMO_MODES = ["default", "reduce-overhead", "max-autotune"]
+ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION = "2.2.0"

 STR_OPERATION_TO_FUNC = {">": op.gt, ">=": op.ge, "==": op.eq, "!=": op.ne, "<=": op.le, "<": op.lt}

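The new constant pins the first PyTorch release whose LaunchConfig accepts log_line_prefix_template; the launcher compares the running torch against it with is_torch_version, which resolves operator strings like those in STR_OPERATION_TO_FUNC into version comparisons. A quick check, assuming torch and an accelerate build containing this commit are installed:

import torch
from accelerate.utils import is_torch_version
from accelerate.utils.constants import ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION

# True on PyTorch 2.2.0 or newer, False otherwise; the launcher uses this check
# to decide whether to forward log_line_prefix_template to LaunchConfig.
supported = is_torch_version(">=", ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION)
print(torch.__version__, "supports log_line_prefix_template:", supported)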
