From afb04976e26680b287b70986bd9fdfc884a64543 Mon Sep 17 00:00:00 2001 From: Mohamed Yousef Date: Wed, 30 Mar 2022 10:11:29 +0200 Subject: [PATCH] Catch any exception to prevent the discovery thread from silently dying (#3436) Signed-off-by: Mohamed Yousef --- horovod/runner/elastic/driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/horovod/runner/elastic/driver.py b/horovod/runner/elastic/driver.py index 9a2c9e30c1..b26619f002 100644 --- a/horovod/runner/elastic/driver.py +++ b/horovod/runner/elastic/driver.py @@ -194,7 +194,7 @@ def _discover_hosts(self): if update_res != HostUpdateResult.no_update: self._notify_workers_host_changes(self._host_manager.current_hosts, update_res) self._wait_hosts_cond.notify_all() - except RuntimeError as e: + except BaseException as e: if first_update: # Misconfiguration, fail the job immediately self._shutdown.set()