Commit

Add synchronization barriers to the ends of the test_*_duplicate_name_error

(including reducescatter test)

Without this, deadlocks in the subsequent test were possible: one process would
already have enqueued a collective op like hvd.broadcast(), while the other
would still be blocked in hvd.init() [specifically in
_get_process_set_ids_and_ranks()].

I could not use hvd.barrier() for this second barrier because it would somehow
cause a segmentation fault, so I went with an allreduce instead.
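The reason an allreduce can stand in for a barrier: the reduction cannot complete until every rank has contributed its tensor, so no rank returns from the call before all ranks have reached it. A minimal toy sketch of that property, using plain Python threads and ints in place of Horovod ranks and tensors (ToyAllreduce and rank_fn are illustrative names, not Horovod API):

```python
# Toy illustration (not Horovod) of why a trivial allreduce doubles as a
# barrier: the SUM cannot be formed until every rank has contributed, so
# no rank returns before all ranks have entered the call.
import threading

class ToyAllreduce:
    def __init__(self, world_size):
        self.world_size = world_size
        self.cond = threading.Condition()
        self.values = []
        self.result = None

    def allreduce(self, value):
        with self.cond:
            self.values.append(value)
            if len(self.values) == self.world_size:
                self.result = sum(self.values)  # SUM reduction
                self.cond.notify_all()
            else:
                # Block until the last rank arrives -- the barrier effect.
                self.cond.wait_for(lambda: self.result is not None)
            return self.result

world_size = 4
op = ToyAllreduce(world_size)
results = [None] * world_size

def rank_fn(rank):
    # Mirrors hvd.allreduce(torch.FloatTensor([0]), name="synch"),
    # but with a plain int per "rank" instead of a tensor.
    results[rank] = op.allreduce(rank)

threads = [threading.Thread(target=rank_fn, args=(r,)) for r in range(world_size)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(results)  # every rank sees the same reduced value: [6, 6, 6, 6]
```

In the actual tests below, the contributed value is a dummy `torch.FloatTensor([0])`; only the synchronization side effect matters, not the reduced result.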

Signed-off-by: Max H. Gerlach <git@maxgerlach.de>
maxhgerlach committed Dec 5, 2021
1 parent 2f51e2d commit a74c56f
Showing 1 changed file with 4 additions and 0 deletions.
4 changes: 4 additions & 0 deletions test/parallel/test_torch.py
@@ -631,6 +631,7 @@ def test_horovod_allreduce_duplicate_name_error(self):
assert False, 'hvd.allreduce_async did not throw error'
except (torch.FatalError, ValueError):
pass
+            hvd.allreduce(torch.FloatTensor([0]), name="synch")

def test_horovod_allreduce_grad(self):
"""Test the correctness of the allreduce gradient."""
@@ -1246,6 +1247,7 @@ def test_horovod_allgather_duplicate_name_error(self):
assert False, 'hvd.allgather_async did not throw error'
except (torch.FatalError, ValueError):
pass
+            hvd.allreduce(torch.FloatTensor([0]), name="synch")

def test_horovod_allgather_grad(self):
"""Test the correctness of the allgather gradient."""
@@ -1565,6 +1567,7 @@ def test_horovod_broadcast_duplicate_name_error(self):
assert False, 'hvd.broadcast_async did not throw error'
except (torch.FatalError, ValueError):
pass
+            hvd.allreduce(torch.FloatTensor([0]), name="synch")

def test_horovod_broadcast_grad(self):
"""Test the correctness of the broadcast gradient."""
@@ -3547,6 +3550,7 @@ def test_horovod_reducescatter_duplicate_name_error(self):
assert False, 'hvd.reducescatter_async did not throw error'
except (torch.FatalError, ValueError):
pass
+            hvd.allreduce(torch.FloatTensor([0]), name="synch")


def test_horovod_reducescatter_grad(self):
