Add synchronization barriers to the ends of the test_*_duplicate_name_error tests

Without this, deadlocks in the subsequent test were possible: one process would
already have enqueued a collective op like hvd.broadcast(), while the other
would still be blocked in hvd.init() [specifically in _get_process_set_ids_and_ranks()].

I could not use hvd.barrier() for this second barrier because that would somehow
cause a segmentation fault, so I went with an allreduce instead.
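
For context, a minimal sketch of the allreduce-based barrier pattern these tests now end with, assuming Horovod's PyTorch API with hvd.init() already called; the helper name allreduce_barrier is illustrative and not part of the commit:

    import torch
    import horovod.torch as hvd

    def allreduce_barrier(name="synch"):
        # The tensor value is irrelevant; the collective simply forces every
        # rank to reach this point before any rank proceeds to the next test,
        # which avoids the hvd.init()/hvd.broadcast() deadlock described above.
        hvd.allreduce(torch.FloatTensor([0]), name=name)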

Signed-off-by: Max H. Gerlach <git@maxgerlach.de>
maxhgerlach committed Dec 5, 2021
1 parent 952d17a commit 6593da5
Showing 1 changed file with 3 additions and 0 deletions.
test/parallel/test_torch.py (3 additions, 0 deletions)

@@ -631,6 +631,7 @@ def test_horovod_allreduce_duplicate_name_error(self):
             assert False, 'hvd.allreduce_async did not throw error'
         except (torch.FatalError, ValueError):
             pass
+        hvd.allreduce(torch.FloatTensor([0]), name="synch")

     def test_horovod_allreduce_grad(self):
         """Test the correctness of the allreduce gradient."""

@@ -1246,6 +1247,7 @@ def test_horovod_allgather_duplicate_name_error(self):
             assert False, 'hvd.allgather_async did not throw error'
         except (torch.FatalError, ValueError):
             pass
+        hvd.allreduce(torch.FloatTensor([0]), name="synch")

     def test_horovod_allgather_grad(self):
         """Test the correctness of the allgather gradient."""

@@ -1565,6 +1567,7 @@ def test_horovod_broadcast_duplicate_name_error(self):
             assert False, 'hvd.broadcast_async did not throw error'
         except (torch.FatalError, ValueError):
             pass
+        hvd.allreduce(torch.FloatTensor([0]), name="synch")

     def test_horovod_broadcast_grad(self):
         """Test the correctness of the broadcast gradient."""
