re-enable delta optimizer test and fix a bug in adasum communicator init logic (#3379)

Signed-off-by: TJ <tix@uber.com>
TJ Xu committed Jan 24, 2022
1 parent e0e982f commit a1f17d8
Showing 2 changed files with 6 additions and 12 deletions.
10 changes: 5 additions & 5 deletions horovod/common/ops/adasum/adasum_mpi.cc
@@ -28,8 +28,8 @@ AdasumMPI::~AdasumMPI() {

 void AdasumMPI::InitializeVHDDReductionComms() {
   int rank, size;
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &size);
+  MPI_Comm_rank(mpi_context_->mpi_comm, &rank);
+  MPI_Comm_size(mpi_context_->mpi_comm, &size);

   // Initialize communication groups for the vector halving, distance doubling
   // (VHDD) Adasum reduction. These are used in computing dot products and
@@ -38,10 +38,10 @@ void AdasumMPI::InitializeVHDDReductionComms() {
   // includes two elements: this rank and it's first VHDD neighbor. The
   // subsequent groups grow to include any ranks the previous group
   // communicates with. Thus the sizes of the groups are 2,4,8... up to the
-  // size of MPI_COMM_WORLD. In essence, a reduction group includes all nodes
+  // size of mpi_context->mpi_comm. In essence, a reduction group includes all nodes
   // that a tensor may be split across.
   MPI_Group world_group;
-  MPI_Comm_group(MPI_COMM_WORLD, &world_group);
+  MPI_Comm_group(mpi_context_->mpi_comm, &world_group);
   int nearest_power_2 = 1;
   int log_size;
   for (nearest_power_2 = 1, log_size = 0; (nearest_power_2 << 1) <= size;
@@ -59,7 +59,7 @@ void AdasumMPI::InitializeVHDDReductionComms() {
     }
     MPI_Group red_group;
     MPI_Group_incl(world_group, (level << 1), node_rank, &red_group);
-    MPI_Comm_create_group(MPI_COMM_WORLD, red_group, 0,
+    MPI_Comm_create_group(mpi_context_->mpi_comm, red_group, 0,
                           &reduction_comms_[shift_val - 1]);
     MPI_Group_free(&red_group);
   }
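For reference, the group layout described in the comment above can be reproduced without MPI. The helper below is an illustrative sketch only (the function name is made up; the real code builds MPI groups and communicators over mpi_context_->mpi_comm): at each VHDD level a rank joins a block of 2, 4, 8, ... consecutive ranks, aligned to the block size, up to the communicator size.

# A minimal, MPI-free sketch of the VHDD reduction-group layout.
def vhdd_reduction_groups(rank, size):
    """Ranks grouped with `rank` at each VHDD level: group sizes 2, 4, 8, ..."""
    # Largest power of two not exceeding `size`, mirroring the C++ loop.
    nearest_power_2 = 1
    while (nearest_power_2 << 1) <= size:
        nearest_power_2 <<= 1
    groups = []
    level, shift_val = 1, 1
    while level < nearest_power_2:
        base_rank = (rank >> shift_val) << shift_val  # align to the group size
        groups.append(list(range(base_rank, base_rank + (level << 1))))
        level <<= 1
        shift_val += 1
    return groups

# With 8 ranks, rank 5 lands in groups [4, 5], [4..7], and [0..7].
print(vhdd_reduction_groups(5, 8))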
8 changes: 1 addition & 7 deletions test/parallel/test_torch.py
@@ -43,7 +43,6 @@
 from common import mpi_env_rank_and_size, skip_or_fail_gpu_test, temppath

 _1_5_api = LooseVersion(torch.__version__) >= LooseVersion('1.5.0')
-_1_10_api = LooseVersion(torch.__version__) >= LooseVersion('1.10.0')
 _is_mac = platform.system() == 'Darwin'

 ccl_supported_types = set([torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
@@ -2540,12 +2539,6 @@ def forward(self, x):

     def test_delta_optimizer(self):
         """Test that delta optimizer."""
-        if _1_10_api:
-            # On PyTorch 1.10, if this test is not skipped and when tests are run in alphabetical order, the later
-            # test_dynamic_requires_grad can run into a deadlock.
-            # TODO: Understand and fix the root cause of these deadlocks.
-            self.skipTest("Deadlocks with PyTorch 1.10")
-
         hvd.init()
         if not hvd.mpi_enabled():
             # TODO support non-MPI Adasum operation
@@ -2583,6 +2576,7 @@ def forward(self, x):
                 opt.zero_grad()
                 loss.backward()
                 opt.step()
+        hvd.barrier()

     def test_duplicate_names(self):
         """Test that passing duplicate names to optimizer will fail."""
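The re-enabled test now ends with hvd.barrier(), so a fast rank cannot return and start the next test's collectives while slower ranks are still inside the Adasum reduction. A minimal sketch of that pattern, not the actual Horovod test (the model, optimizer, and loop count are assumptions, and Adasum through DistributedOptimizer needs an MPI-enabled Horovod build):

import torch
import horovod.torch as hvd

hvd.init()
torch.manual_seed(1234)

model = torch.nn.Linear(4, 1)
opt = torch.optim.SGD(model.parameters(), lr=0.1)
opt = hvd.DistributedOptimizer(opt, named_parameters=model.named_parameters(),
                               op=hvd.Adasum)

for _ in range(3):
    loss = model(torch.randn(2, 4)).sum()
    opt.zero_grad()
    loss.backward()
    opt.step()

hvd.barrier()  # keep all ranks in lockstep before the next test runs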
