From c9ee58a5339042dc2b7c1a2996bf46fe400869cd Mon Sep 17 00:00:00 2001 From: Mohamed Yousef <124146+ASDen@users.noreply.github.com> Date: Tue, 1 Mar 2022 16:31:50 +0200 Subject: [PATCH] Catch any exception to prevent the discovery thread from silently dying Signed-off-by: Mohamed Yousef --- horovod/runner/elastic/driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/horovod/runner/elastic/driver.py b/horovod/runner/elastic/driver.py index 20a2dd7b97..18a697ab3f 100644 --- a/horovod/runner/elastic/driver.py +++ b/horovod/runner/elastic/driver.py @@ -187,7 +187,7 @@ def _discover_hosts(self): if update_res != HostUpdateResult.no_update: self._notify_workers_host_changes(self._host_manager.current_hosts, update_res) self._wait_hosts_cond.notify_all() - except RuntimeError as e: + except BaseException as e: if first_update: # Misconfiguration, fail the job immediately self._shutdown.set()