re-enable delta optimizer test and fix a bug in adasum communicator init logic #3379

Merged
merged 1 commit on Jan 24, 2022
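The Adasum fix below swaps hard-coded MPI_COMM_WORLD references for the communicator Horovod was actually initialized with. The difference only matters when those two communicators differ, for example when Horovod runs on a sub-communicator. A minimal sketch of such a setup (hypothetical, not part of this PR; assumes mpi4py and a Horovod build whose hvd.init accepts an MPI communicator via comm=):

# Sketch: initialize Horovod on a sub-communicator. Before this fix,
# Adasum's VHDD setup still queried MPI_COMM_WORLD for rank/size and built
# its reduction groups from the world group, so the groups could reference
# ranks that are not part of Horovod's communicator at all.
from mpi4py import MPI
import horovod.torch as hvd

world = MPI.COMM_WORLD
# e.g. split even and odd world ranks into two independent Horovod jobs
subcomm = world.Split(color=world.rank % 2, key=world.rank)
hvd.init(comm=subcomm)  # assumption: comm= accepts an mpi4py communicator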
10 changes: 5 additions & 5 deletions horovod/common/ops/adasum/adasum_mpi.cc
@@ -28,8 +28,8 @@ AdasumMPI::~AdasumMPI() {
 
 void AdasumMPI::InitializeVHDDReductionComms() {
   int rank, size;
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &size);
+  MPI_Comm_rank(mpi_context_->mpi_comm, &rank);
+  MPI_Comm_size(mpi_context_->mpi_comm, &size);
 
   // Initialize communication groups for the vector halving, distance doubling
   // (VHDD) Adasum reduction. These are used in computing dot products and
@@ -38,10 +38,10 @@ void AdasumMPI::InitializeVHDDReductionComms() {
   // includes two elements: this rank and its first VHDD neighbor. The
   // subsequent groups grow to include any ranks the previous group
   // communicates with. Thus the sizes of the groups are 2,4,8... up to the
-  // size of MPI_COMM_WORLD. In essence, a reduction group includes all nodes
+  // size of mpi_context_->mpi_comm. In essence, a reduction group includes all nodes
   // that a tensor may be split across.
   MPI_Group world_group;
-  MPI_Comm_group(MPI_COMM_WORLD, &world_group);
+  MPI_Comm_group(mpi_context_->mpi_comm, &world_group);
   int nearest_power_2 = 1;
   int log_size;
   for (nearest_power_2 = 1, log_size = 0; (nearest_power_2 << 1) <= size;
@@ -59,7 +59,7 @@
     }
     MPI_Group red_group;
     MPI_Group_incl(world_group, (level << 1), node_rank, &red_group);
-    MPI_Comm_create_group(MPI_COMM_WORLD, red_group, 0,
+    MPI_Comm_create_group(mpi_context_->mpi_comm, red_group, 0,
                           &reduction_comms_[shift_val - 1]);
     MPI_Group_free(&red_group);
   }
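As a reading aid, here is a small Python sketch (not part of the PR) of the reduction-group layout the comment in this diff describes, assuming the communicator size is a power of two; the authoritative construction is the C++ loop above:

# Group sizes double per level: 2, 4, 8, ... up to the communicator size.
# At each level a rank belongs to the aligned block of ranks containing it,
# mirroring base_rank = (rank >> shift_val) << shift_val in the C++ code.
def vhdd_groups(rank, size):
    groups = []
    group_size = 2
    while group_size <= size:
        base = (rank // group_size) * group_size
        groups.append(list(range(base, base + group_size)))
        group_size <<= 1
    return groups

print(vhdd_groups(5, 8))  # [[4, 5], [4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7]]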
8 changes: 1 addition & 7 deletions test/parallel/test_torch.py
@@ -43,7 +43,6 @@
 from common import mpi_env_rank_and_size, skip_or_fail_gpu_test, temppath
 
 _1_5_api = LooseVersion(torch.__version__) >= LooseVersion('1.5.0')
-_1_10_api = LooseVersion(torch.__version__) >= LooseVersion('1.10.0')
 _is_mac = platform.system() == 'Darwin'
 
 ccl_supported_types = set([torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
@@ -2540,12 +2539,6 @@ def forward(self, x):
 
     def test_delta_optimizer(self):
         """Test the delta optimizer."""
-        if _1_10_api:
-            # On PyTorch 1.10, if this test is not skipped and tests are run in alphabetical order, the later
-            # test_dynamic_requires_grad can run into a deadlock.
-            # TODO: Understand and fix the root cause of these deadlocks.
-            self.skipTest("Deadlocks with PyTorch 1.10")
-
         hvd.init()
         if not hvd.mpi_enabled():
             # TODO support non-MPI Adasum operation
@@ -2583,6 +2576,7 @@ def forward(self, x):
         opt.zero_grad()
         loss.backward()
         opt.step()
+        hvd.barrier()
 
     def test_duplicate_names(self):
         """Test that passing duplicate names to optimizer will fail."""
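The re-enabled test now ends with hvd.barrier(). A standalone sketch of that pattern (illustrative only; the reading that the barrier keeps fast ranks from racing into the next test's collectives is an assumption based on the deadlock note removed above, not something the PR states):

# Sketch of the end-of-test synchronization pattern. Without the trailing
# barrier, one rank can return from the test and start issuing the next
# test's collectives while another rank is still inside the Adasum
# reduction, which can mismatch operations across ranks and hang.
import torch
import horovod.torch as hvd

hvd.init()
model = torch.nn.Linear(4, 1)
opt = hvd.DistributedOptimizer(torch.optim.SGD(model.parameters(), lr=0.1),
                               named_parameters=model.named_parameters())

loss = model(torch.ones(2, 4)).sum()
opt.zero_grad()
loss.backward()
opt.step()
hvd.barrier()  # all ranks finish their collectives before the test returns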