From 60d28cde9c9b518ad947a976a7111542def713cf Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Tue, 13 Aug 2019 09:04:26 -0700 Subject: [PATCH 1/2] Changed Gloo mode to attempt changing directory into the current working directory that the command was launched from, if it exists, to be consistent with MPI Signed-off-by: Travis Addair --- horovod/run/gloo_run.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/horovod/run/gloo_run.py b/horovod/run/gloo_run.py index 25def0e176..48f499b37c 100644 --- a/horovod/run/gloo_run.py +++ b/horovod/run/gloo_run.py @@ -141,10 +141,7 @@ def _exec_command(_command, _index, event_): 'message: {message}'.format(message=e)) return 0 - if settings.ssh_port: - ssh_port_arg = "-p {ssh_port}".format(ssh_port=settings.ssh_port) - else: - ssh_port_arg = "" + ssh_port_arg = '-p {ssh_port}'.format(ssh_port=settings.ssh_port) if settings.ssh_port else '' # Create a event for communication between threads event = threading.Event() @@ -171,7 +168,7 @@ def set_event_on_sigterm(signum, frame): local_command = '{horovod_env} {env} {run_command}' .format( horovod_env=horovod_rendez_env, env=' '.join(['%s=%s' % (key, quote(value)) for key, value in env.items() - if env_util.is_exportable(key)]), + if env_util.is_exportable(key)]), run_command=_run_command) if host_name not in remote_host_names: @@ -181,7 +178,8 @@ def set_event_on_sigterm(signum, frame): '{local_command}'.format( host=host_name, ssh_port_arg=ssh_port_arg, - local_command=quote(local_command) + local_command=quote('cd {pwd} ; {local_command}'.format(pwd=os.getcwd(), + local_command=local_command)) ) args_list.append([command, alloc_info.rank, event]) From 24671f1cbf915ccda4f3c5477824571307c9269c Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Tue, 13 Aug 2019 09:57:38 -0700 Subject: [PATCH 2/2] Suppress output from command Signed-off-by: Travis Addair --- horovod/run/gloo_run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/horovod/run/gloo_run.py b/horovod/run/gloo_run.py index 48f499b37c..2c19158885 100644 --- a/horovod/run/gloo_run.py +++ b/horovod/run/gloo_run.py @@ -178,8 +178,8 @@ def set_event_on_sigterm(signum, frame): '{local_command}'.format( host=host_name, ssh_port_arg=ssh_port_arg, - local_command=quote('cd {pwd} ; {local_command}'.format(pwd=os.getcwd(), - local_command=local_command)) + local_command=quote('cd {pwd} >& /dev/null ; {local_command}' + .format(pwd=os.getcwd(), local_command=local_command)) ) args_list.append([command, alloc_info.rank, event])