From 062aaa0d38b2380904cef68e98bb8607c0ecd79f Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Tue, 5 Oct 2021 14:28:50 -0700 Subject: [PATCH] Reinit every torch test (#3194) --- setup.py | 3 ++- test/parallel/test_torch.py | 11 +++++++---- test/single/test_ray_elastic.py | 8 ++------ 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/setup.py b/setup.py index 4d4ff499c4..51ab6b39e2 100644 --- a/setup.py +++ b/setup.py @@ -124,7 +124,8 @@ def build_extensions(self): 'pyspark>=3.0.0;python_version>="3.8"'] # Pin h5py: https://github.com/h5py/h5py/issues/1732 spark_require_list = ['h5py<3', 'numpy', 'petastorm>=0.11.0', 'pyarrow>=0.15.0', 'fsspec'] -ray_require_list = ['ray'] +# https://github.com/ray-project/ray/pull/17465 +ray_require_list = ['ray', 'aioredis<2'] pytorch_spark_require_list = pytorch_require_list + \ spark_require_list + \ pyspark_require_list diff --git a/test/parallel/test_torch.py b/test/parallel/test_torch.py index a44246fe0b..84005f14a5 100644 --- a/test/parallel/test_torch.py +++ b/test/parallel/test_torch.py @@ -43,6 +43,7 @@ _1_5_api = LooseVersion(torch.__version__) >= LooseVersion('1.5.0') _1_10_api = LooseVersion(torch.__version__) >= LooseVersion('1.10.0') +_is_mac = platform.system() == 'Darwin' ccl_supported_types = set([torch.ByteTensor, torch.CharTensor, torch.ShortTensor, torch.IntTensor, torch.LongTensor, torch.FloatTensor, @@ -63,10 +64,13 @@ def __init__(self, *args, **kwargs): super(TorchTests, self).__init__(*args, **kwargs) warnings.simplefilter('module') + def setup(self): + hvd.init() + def tearDown(self): - if _1_10_api and hvd.is_initialized(): - # To fix https://github.com/horovod/horovod/issues/3149 - hvd.join() + gloo_rank = int(os.getenv('HOROVOD_RANK', -1)) + if hvd.is_initialized() and not _is_mac and gloo_rank != -1: + hvd.shutdown() def convert_cpu_fp16_to_fp32(self, *values): # PyTorch doesn't support any CPU ops on FP16 tensors. @@ -93,7 +97,6 @@ def test_gpu_required(self): if not torch.cuda.is_available(): skip_or_fail_gpu_test(self, "No GPUs available") - @pytest.mark.skipif(platform.system() == 'Darwin', reason='Reinit not supported on macOS') def test_horovod_reinit(self): """Test that Horovod can init -> shutdown -> init successfully.""" mpi_rank, _ = mpi_env_rank_and_size() diff --git a/test/single/test_ray_elastic.py b/test/single/test_ray_elastic.py index 5c70fb0d38..3a514ad2ed 100644 --- a/test/single/test_ray_elastic.py +++ b/test/single/test_ray_elastic.py @@ -215,9 +215,7 @@ def fault_tolerance_patches(): @pytest.mark.skipif( not gloo_built(), reason='Gloo is required for Ray integration') -@pytest.mark.skipif( - os.environ.get('GITHUB_ACTIONS', 'false') == 'true', - reason='This test fails on GitHub Workflow, see https://github.com/horovod/horovod/issues/2813') +@pytest.mark.skip(reason='https://github.com/horovod/horovod/issues/3197') def test_fault_tolerance_hosts_added_and_removed(ray_8_cpus): with fault_tolerance_patches(): discovery_schedule = [ @@ -245,9 +243,7 @@ def test_fault_tolerance_hosts_added_and_removed(ray_8_cpus): @pytest.mark.skipif( not gloo_built(), reason='Gloo is required for Ray integration') -@pytest.mark.skipif( - os.environ.get('GITHUB_ACTIONS', 'false') == 'true', - reason='This test fails on GitHub Workflow, see https://github.com/horovod/horovod/issues/2813') +@pytest.mark.skip(reason='https://github.com/horovod/horovod/issues/3197') def test_fault_tolerance_hosts_remove_and_add(ray_8_cpus): with fault_tolerance_patches(): discovery_schedule = [