Skip to content

Commit

Permalink
Reinit every torch test (#3194)
Browse files: browse the repository at this point in the history
  • Loading branch information
tgaddair committed Oct 5, 2021
1 parent 7798660 commit 062aaa0
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 11 deletions.
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,8 @@ def build_extensions(self):
'pyspark>=3.0.0;python_version>="3.8"']
# Pin h5py: https://github.com/h5py/h5py/issues/1732
spark_require_list = ['h5py<3', 'numpy', 'petastorm>=0.11.0', 'pyarrow>=0.15.0', 'fsspec']
ray_require_list = ['ray']
# https://github.com/ray-project/ray/pull/17465
ray_require_list = ['ray', 'aioredis<2']
pytorch_spark_require_list = pytorch_require_list + \
spark_require_list + \
pyspark_require_list
Expand Down
11 changes: 7 additions & 4 deletions test/parallel/test_torch.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@

_1_5_api = LooseVersion(torch.__version__) >= LooseVersion('1.5.0')
_1_10_api = LooseVersion(torch.__version__) >= LooseVersion('1.10.0')
_is_mac = platform.system() == 'Darwin'

ccl_supported_types = set([torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
torch.IntTensor, torch.LongTensor, torch.FloatTensor,
Expand All @@ -63,10 +64,13 @@ def __init__(self, *args, **kwargs):
super(TorchTests, self).__init__(*args, **kwargs)
warnings.simplefilter('module')

def setup(self):
hvd.init()

def tearDown(self):
if _1_10_api and hvd.is_initialized():
# To fix https://github.com/horovod/horovod/issues/3149
hvd.join()
gloo_rank = int(os.getenv('HOROVOD_RANK', -1))
if hvd.is_initialized() and not _is_mac and gloo_rank != -1:
hvd.shutdown()

def convert_cpu_fp16_to_fp32(self, *values):
# PyTorch doesn't support any CPU ops on FP16 tensors.
Expand All @@ -93,7 +97,6 @@ def test_gpu_required(self):
if not torch.cuda.is_available():
skip_or_fail_gpu_test(self, "No GPUs available")

@pytest.mark.skipif(platform.system() == 'Darwin', reason='Reinit not supported on macOS')
def test_horovod_reinit(self):
"""Test that Horovod can init -> shutdown -> init successfully."""
mpi_rank, _ = mpi_env_rank_and_size()
Expand Down
8 changes: 2 additions & 6 deletions test/single/test_ray_elastic.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,9 +215,7 @@ def fault_tolerance_patches():

@pytest.mark.skipif(
not gloo_built(), reason='Gloo is required for Ray integration')
@pytest.mark.skipif(
os.environ.get('GITHUB_ACTIONS', 'false') == 'true',
reason='This test fails on GitHub Workflow, see https://github.com/horovod/horovod/issues/2813')
@pytest.mark.skip(reason='https://github.com/horovod/horovod/issues/3197')
def test_fault_tolerance_hosts_added_and_removed(ray_8_cpus):
with fault_tolerance_patches():
discovery_schedule = [
Expand Down Expand Up @@ -245,9 +243,7 @@ def test_fault_tolerance_hosts_added_and_removed(ray_8_cpus):

@pytest.mark.skipif(
not gloo_built(), reason='Gloo is required for Ray integration')
@pytest.mark.skipif(
os.environ.get('GITHUB_ACTIONS', 'false') == 'true',
reason='This test fails on GitHub Workflow, see https://github.com/horovod/horovod/issues/2813')
@pytest.mark.skip(reason='https://github.com/horovod/horovod/issues/3197')
def test_fault_tolerance_hosts_remove_and_add(ray_8_cpus):
with fault_tolerance_patches():
discovery_schedule = [
Expand Down

0 comments on commit 062aaa0

Please sign in to comment.