re-enable delta optimizer test and fix a bug in adasum communicator init logic (#3379)

Signed-off-by: TJ <tix@uber.com>
TJ Xu committed Jan 24, 2022
1 parent e0e982f commit a1f17d8
Showing 2 changed files with 6 additions and 12 deletions.
10 changes: 5 additions & 5 deletions horovod/common/ops/adasum/adasum_mpi.cc
@@ -28,8 +28,8 @@ AdasumMPI::~AdasumMPI() {

 void AdasumMPI::InitializeVHDDReductionComms() {
   int rank, size;
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &size);
+  MPI_Comm_rank(mpi_context_->mpi_comm, &rank);
+  MPI_Comm_size(mpi_context_->mpi_comm, &size);

   // Initialize communication groups for the vector halving, distance doubling
   // (VHDD) Adasum reduction. These are used in computing dot products and
@@ -38,10 +38,10 @@ void AdasumMPI::InitializeVHDDReductionComms() {
   // includes two elements: this rank and it's first VHDD neighbor. The
   // subsequent groups grow to include any ranks the previous group
   // communicates with. Thus the sizes of the groups are 2,4,8... up to the
-  // size of MPI_COMM_WORLD. In essence, a reduction group includes all nodes
+  // size of mpi_context->mpi_comm. In essence, a reduction group includes all nodes
   // that a tensor may be split across.
   MPI_Group world_group;
-  MPI_Comm_group(MPI_COMM_WORLD, &world_group);
+  MPI_Comm_group(mpi_context_->mpi_comm, &world_group);
   int nearest_power_2 = 1;
   int log_size;
   for (nearest_power_2 = 1, log_size = 0; (nearest_power_2 << 1) <= size;
@@ -59,7 +59,7 @@ void AdasumMPI::InitializeVHDDReductionComms() {
     }
     MPI_Group red_group;
     MPI_Group_incl(world_group, (level << 1), node_rank, &red_group);
-    MPI_Comm_create_group(MPI_COMM_WORLD, red_group, 0,
+    MPI_Comm_create_group(mpi_context_->mpi_comm, red_group, 0,
                           &reduction_comms_[shift_val - 1]);
     MPI_Group_free(&red_group);
   }
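For reference, the group layout described in the comment above can be reproduced without MPI. The helper below is an illustrative sketch only (the function name is made up; the real code builds MPI groups and communicators over mpi_context_->mpi_comm): at each VHDD level a rank joins a block of 2, 4, 8, ... consecutive ranks, aligned to the block size, up to the communicator size.

# A minimal, MPI-free sketch of the VHDD reduction-group layout.
def vhdd_reduction_groups(rank, size):
    """Ranks grouped with `rank` at each VHDD level: group sizes 2, 4, 8, ..."""
    # Largest power of two not exceeding `size`, mirroring the C++ loop.
    nearest_power_2 = 1
    while (nearest_power_2 << 1) <= size:
        nearest_power_2 <<= 1
    groups = []
    level, shift_val = 1, 1
    while level < nearest_power_2:
        base_rank = (rank >> shift_val) << shift_val  # align to the group size
        groups.append(list(range(base_rank, base_rank + (level << 1))))
        level <<= 1
        shift_val += 1
    return groups

# With 8 ranks, rank 5 lands in groups [4, 5], [4..7], and [0..7].
print(vhdd_reduction_groups(5, 8))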
8 changes: 1 addition & 7 deletions test/parallel/test_torch.py
@@ -43,7 +43,6 @@
 from common import mpi_env_rank_and_size, skip_or_fail_gpu_test, temppath

 _1_5_api = LooseVersion(torch.__version__) >= LooseVersion('1.5.0')
-_1_10_api = LooseVersion(torch.__version__) >= LooseVersion('1.10.0')
 _is_mac = platform.system() == 'Darwin'

 ccl_supported_types = set([torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
@@ -2540,12 +2539,6 @@ def forward(self, x):

     def test_delta_optimizer(self):
         """Test that delta optimizer."""
-        if _1_10_api:
-            # On PyTorch 1.10, if this test is not skipped and when tests are run in alphabetical order, the later
-            # test_dynamic_requires_grad can run into a deadlock.
-            # TODO: Understand and fix the root cause of these deadlocks.
-            self.skipTest("Deadlocks with PyTorch 1.10")
-
         hvd.init()
         if not hvd.mpi_enabled():
             # TODO support non-MPI Adasum operation
@@ -2583,6 +2576,7 @@ def forward(self, x):
                 opt.zero_grad()
                 loss.backward()
                 opt.step()
+        hvd.barrier()

     def test_duplicate_names(self):
         """Test that passing duplicate names to optimizer will fail."""
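The re-enabled test now ends with hvd.barrier(), so a fast rank cannot return and start the next test's collectives while slower ranks are still inside the Adasum reduction. A minimal sketch of that pattern, not the actual Horovod test (the model, optimizer, and loop count are assumptions, and Adasum through DistributedOptimizer needs an MPI-enabled Horovod build):

import torch
import horovod.torch as hvd

hvd.init()
torch.manual_seed(1234)

model = torch.nn.Linear(4, 1)
opt = torch.optim.SGD(model.parameters(), lr=0.1)
opt = hvd.DistributedOptimizer(opt, named_parameters=model.named_parameters(),
                               op=hvd.Adasum)

for _ in range(3):
    loss = model(torch.randn(2, 4)).sum()
    opt.zero_grad()
    loss.backward()
    opt.step()

hvd.barrier()  # keep all ranks in lockstep before the next test runs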
