Make log_line_prefix_template Optional in Elastic Launcher for Backward Compatibility (#2888)

* Fix unexpected keyword argument error for elastic launch config

* Update torch version flow

* Remove log prefix template from env vars
yhna940 committed Jul 3, 2024
1 parent 3086e26 commit 404510a
Showing 2 changed files with 10 additions and 3 deletions.
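
In short: notebook_launcher gains an explicit log_line_prefix_template keyword (defaulting to None), stops reading TORCHELASTIC_LOG_LINE_PREFIX_TEMPLATE from the environment, and only forwards the value to torch's LaunchConfig when the installed PyTorch is at least 2.2.0, so older versions no longer hit an unexpected-keyword-argument error. A rough usage sketch (the training function and template value below are illustrative, not part of this commit):

from accelerate import notebook_launcher


def training_function():
    # Placeholder for a real training loop.
    print("training step")


# Works on any supported PyTorch version; the new argument simply defaults to None.
notebook_launcher(training_function, num_processes=2)

# On PyTorch >= 2.2.0, a per-rank log prefix can be requested explicitly.
notebook_launcher(
    training_function,
    num_processes=2,
    log_line_prefix_template="[rank ${rank}] ",  # illustrative value
)
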
12 changes: 9 additions & 3 deletions src/accelerate/launchers.py
@@ -26,8 +26,10 @@
     check_cuda_p2p_ib_support,
     get_gpu_info,
     is_mps_available,
+    is_torch_version,
     patch_environment,
 )
+from .utils.constants import ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION


 def test_launch():
@@ -50,6 +52,7 @@ def notebook_launcher(
     rdzv_id="none",
     max_restarts=0,
     monitor_interval=0.1,
+    log_line_prefix_template=None,
 ):
     """
     Launches a training function, using several processes or multiple nodes if it's possible in the current environment
@@ -96,6 +99,8 @@ def notebook_launcher(
             The maximum amount of restarts that elastic agent will conduct on workers before failure.
         monitor_interval (`float`, *optional*, defaults to 0.1):
             The interval in seconds that is used by the elastic_agent as a period of monitoring workers.
+        log_line_prefix_template (`str`, *optional*, defaults to `None`):
+            The prefix template for elastic launch logging. Available from PyTorch 2.2.0.

     Example:
@@ -223,7 +228,7 @@ def train(*args):
                         rdzv_conf["rank"] = node_rank
                         if not rdzv_endpoint:
                             rdzv_endpoint = f"{master_addr}:{use_port}"
-                    launch_config = LaunchConfig(
+                    launch_config_kwargs = dict(
                         min_nodes=num_nodes,
                         max_nodes=num_nodes,
                         nproc_per_node=num_processes,
@@ -234,9 +239,10 @@
                         max_restarts=max_restarts,
                         monitor_interval=monitor_interval,
                         start_method="fork",
-                        log_line_prefix_template=os.environ.get("TORCHELASTIC_LOG_LINE_PREFIX_TEMPLATE"),
                     )
-                    elastic_launch(config=launch_config, entrypoint=function)(*args)
+                    if is_torch_version(">=", ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION):
+                        launch_config_kwargs["log_line_prefix_template"] = log_line_prefix_template
+                    elastic_launch(config=LaunchConfig(**launch_config_kwargs), entrypoint=function)(*args)
                 except ProcessRaisedException as e:
                     if "Cannot re-initialize CUDA in forked subprocess" in e.args[0]:
                         raise RuntimeError(
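
The core of the launchers.py change is the version gate above: the LaunchConfig keyword arguments are collected in a plain dict, and the PyTorch-2.2-only key is attached only when the running torch supports it. A standalone sketch of the same pattern, substituting packaging.version for accelerate's is_torch_version helper (the function name and the minimal set of kwargs here are illustrative):

import torch
from packaging import version
from torch.distributed.launcher.api import LaunchConfig

# Mirrors ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION defined in
# src/accelerate/utils/constants.py below.
MIN_TORCH_FOR_LOG_PREFIX = "2.2.0"


def build_launch_config(num_nodes, num_processes, log_line_prefix_template=None):
    # Keyword arguments accepted by every supported PyTorch release.
    kwargs = dict(
        min_nodes=num_nodes,
        max_nodes=num_nodes,
        nproc_per_node=num_processes,
        start_method="fork",
    )
    # Attach the newer keyword only when this PyTorch's LaunchConfig defines it,
    # so older releases do not raise "unexpected keyword argument".
    if version.parse(torch.__version__) >= version.parse(MIN_TORCH_FOR_LOG_PREFIX):
        kwargs["log_line_prefix_template"] = log_line_prefix_template
    return LaunchConfig(**kwargs)
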
1 change: 1 addition & 0 deletions src/accelerate/utils/constants.py
@@ -41,6 +41,7 @@
 FSDP_MODEL_NAME = "pytorch_model_fsdp"
 DEEPSPEED_MULTINODE_LAUNCHERS = ["pdsh", "standard", "openmpi", "mvapich", "mpich"]
 TORCH_DYNAMO_MODES = ["default", "reduce-overhead", "max-autotune"]
+ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION = "2.2.0"

 STR_OPERATION_TO_FUNC = {">": op.gt, ">=": op.ge, "==": op.eq, "!=": op.ne, "<=": op.le, "<": op.lt}

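The new constant pins the first PyTorch release whose LaunchConfig accepts log_line_prefix_template; the launcher compares the running torch against it with is_torch_version, which resolves operator strings like those in STR_OPERATION_TO_FUNC into version comparisons. A quick check, assuming torch and an accelerate build containing this commit are installed:

import torch
from accelerate.utils import is_torch_version
from accelerate.utils.constants import ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION

# True on PyTorch 2.2.0 or newer, False otherwise; the launcher uses this check
# to decide whether to forward log_line_prefix_template to LaunchConfig.
supported = is_torch_version(">=", ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION)
print(torch.__version__, "supports log_line_prefix_template:", supported)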
