spark: check all ranks have same device type
Signed-off-by: Chongxiao Cao <chongxiaoc@uber.com>
chongxiaoc committed May 27, 2021
1 parent b218973 commit 8efcb87
Showing 3 changed files with 24 additions and 0 deletions.
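All three files below add the same guard: each rank gathers whether it sees a GPU, and rank 0 asserts that the answers agree, since Horovod's allreduce cannot mix CPU and GPU gradient tensors across ranks. For reference, a minimal standalone sketch of that pattern outside Spark (the script name and horovodrun launch line are illustrative, not part of this commit):

# check_device_types.py -- minimal sketch of the consistency check this commit adds,
# run outside Spark. Assumes Horovod is built with PyTorch support and the script is
# launched on every worker, e.g.: horovodrun -np 4 python check_device_types.py
import torch
import horovod.torch as hvd

hvd.init()

# Each rank reports whether it can see a CUDA device.
cuda_available = torch.cuda.is_available()

# allgather_object collects one picklable Python object from every rank.
cuda_avail_list = hvd.allgather_object(cuda_available, name='device type')

# Rank 0 verifies that every rank reported the same device type.
if hvd.rank() == 0:
    assert cuda_avail_list.count(cuda_available) == hvd.size(), \
        "All ranks don't have same device type!"

Only rank 0 raises the assertion, but allgather_object is a collective call, so every rank must reach this check before training proceeds.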
12 changes: 12 additions & 0 deletions horovod/spark/keras/remote.py
@@ -322,6 +322,12 @@ def _pin_gpu_fn():
 def _pin_gpu_tensorflow2_fn():
     def fn(hvd, tf, keras):
         gpus = tf.config.experimental.list_physical_devices('GPU')
+        # We need to check that all ranks have the same device type for training.
+        # Horovod doesn't support heterogeneous allreduce for gradients.
+        cuda_available = True if gpus else False
+        cuda_avail_list = hvd.allgather_object(cuda_available, name='device type')
+        if hvd.rank() == 0:
+            assert cuda_avail_list.count(cuda_available) == hvd.size(), "All ranks don't have same device type!"
         for gpu in gpus:
             tf.config.experimental.set_memory_growth(gpu, True)
         if gpus:
@@ -332,6 +338,12 @@ def fn(hvd, tf, keras):
 
 def _pin_gpu_tensorflow1_fn():
     def fn(hvd, tf, keras):
+        # We need to check that all ranks have the same device type for training.
+        # Horovod doesn't support heterogeneous allreduce for gradients.
+        cuda_available = tf.test.is_gpu_available()
+        cuda_avail_list = hvd.allgather_object(cuda_available, name='device type')
+        if hvd.rank() == 0:
+            assert cuda_avail_list.count(cuda_available) == hvd.size(), "All ranks don't have same device type!"
         config = tf.ConfigProto()
         config.gpu_options.allow_growth = True
         config.gpu_options.visible_device_list = \
6 changes: 6 additions & 0 deletions horovod/spark/lightning/remote.py
@@ -121,6 +121,12 @@ def train(serialized_model):
         _val_steps_per_epoch = val_steps_per_epoch if val_steps_per_epoch else 1.0
 
         cuda_available = torch.cuda.is_available()
+        # We need to check that all ranks have the same device type for training.
+        # Horovod doesn't support heterogeneous allreduce for gradients.
+        cuda_avail_list = hvd.allgather_object(cuda_available, name='device type')
+        if hvd.rank() == 0:
+            assert cuda_avail_list.count(cuda_available) == hvd.size(), "All ranks don't have same device type!"
+
         if cuda_available:
             # Horovod: pin GPU to local rank or the assigned GPU from spark.
             torch.cuda.set_device(_get_assigned_gpu_or_default(default=hvd.local_rank()))
6 changes: 6 additions & 0 deletions horovod/spark/torch/remote.py
@@ -123,6 +123,12 @@ def train(serialized_model, optimizer_cls, model_opt_state_serialized,
         shuffle_buffer_size = user_shuffle_buffer_size
 
         cuda_available = torch.cuda.is_available()
+        # We need to check that all ranks have the same device type for training.
+        # Horovod doesn't support heterogeneous allreduce for gradients.
+        cuda_avail_list = hvd.allgather_object(cuda_available, name='device type')
+        if hvd.rank() == 0:
+            assert cuda_avail_list.count(cuda_available) == hvd.size(), "All ranks don't have same device type!"
+
         if cuda_available:
             # Horovod: pin GPU to local rank or the assigned GPU from spark.
             torch.cuda.set_device(_get_assigned_gpu_or_default(default=hvd.local_rank()))
