spark: check all ranks have same device type
Signed-off-by: Chongxiao Cao <chongxiaoc@uber.com>
chongxiaoc committed May 27, 2021
1 parent b218973 commit 8efcb87
Showing 3 changed files with 24 additions and 0 deletions.
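All three files below add the same guard: each rank gathers whether it sees a GPU, and rank 0 asserts that the answers agree, since Horovod's allreduce cannot mix CPU and GPU gradient tensors across ranks. For reference, a minimal standalone sketch of that pattern outside Spark (the script name and horovodrun launch line are illustrative, not part of this commit):

# check_device_types.py -- minimal sketch of the consistency check this commit adds,
# run outside Spark. Assumes Horovod is built with PyTorch support and the script is
# launched on every worker, e.g.: horovodrun -np 4 python check_device_types.py
import torch
import horovod.torch as hvd

hvd.init()

# Each rank reports whether it can see a CUDA device.
cuda_available = torch.cuda.is_available()

# allgather_object collects one picklable Python object from every rank.
cuda_avail_list = hvd.allgather_object(cuda_available, name='device type')

# Rank 0 verifies that every rank reported the same device type.
if hvd.rank() == 0:
    assert cuda_avail_list.count(cuda_available) == hvd.size(), \
        "All ranks don't have same device type!"

Only rank 0 raises the assertion, but allgather_object is a collective call, so every rank must reach this check before training proceeds.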
12 changes: 12 additions & 0 deletions horovod/spark/keras/remote.py
@@ -322,6 +322,12 @@ def _pin_gpu_fn():
 def _pin_gpu_tensorflow2_fn():
     def fn(hvd, tf, keras):
         gpus = tf.config.experimental.list_physical_devices('GPU')
+        # We need to check that all ranks have the same device type for training.
+        # Horovod doesn't support heterogeneous allreduce for gradients.
+        cuda_available = True if gpus else False
+        cuda_avail_list = hvd.allgather_object(cuda_available, name='device type')
+        if hvd.rank() == 0:
+            assert cuda_avail_list.count(cuda_available) == hvd.size(), "All ranks don't have same device type!"
         for gpu in gpus:
             tf.config.experimental.set_memory_growth(gpu, True)
         if gpus:
@@ -332,6 +338,12 @@ def fn(hvd, tf, keras):
 
 def _pin_gpu_tensorflow1_fn():
     def fn(hvd, tf, keras):
+        # We need to check that all ranks have the same device type for training.
+        # Horovod doesn't support heterogeneous allreduce for gradients.
+        cuda_available = tf.test.is_gpu_available()
+        cuda_avail_list = hvd.allgather_object(cuda_available, name='device type')
+        if hvd.rank() == 0:
+            assert cuda_avail_list.count(cuda_available) == hvd.size(), "All ranks don't have same device type!"
         config = tf.ConfigProto()
         config.gpu_options.allow_growth = True
         config.gpu_options.visible_device_list = \
6 changes: 6 additions & 0 deletions horovod/spark/lightning/remote.py
@@ -121,6 +121,12 @@ def train(serialized_model):
         _val_steps_per_epoch = val_steps_per_epoch if val_steps_per_epoch else 1.0
 
         cuda_available = torch.cuda.is_available()
+        # We need to check that all ranks have the same device type for training.
+        # Horovod doesn't support heterogeneous allreduce for gradients.
+        cuda_avail_list = hvd.allgather_object(cuda_available, name='device type')
+        if hvd.rank() == 0:
+            assert cuda_avail_list.count(cuda_available) == hvd.size(), "All ranks don't have same device type!"
+
         if cuda_available:
             # Horovod: pin GPU to local rank or the assigned GPU from spark.
             torch.cuda.set_device(_get_assigned_gpu_or_default(default=hvd.local_rank()))
6 changes: 6 additions & 0 deletions horovod/spark/torch/remote.py
@@ -123,6 +123,12 @@ def train(serialized_model, optimizer_cls, model_opt_state_serialized,
         shuffle_buffer_size = user_shuffle_buffer_size
 
         cuda_available = torch.cuda.is_available()
+        # We need to check that all ranks have the same device type for training.
+        # Horovod doesn't support heterogeneous allreduce for gradients.
+        cuda_avail_list = hvd.allgather_object(cuda_available, name='device type')
+        if hvd.rank() == 0:
+            assert cuda_avail_list.count(cuda_available) == hvd.size(), "All ranks don't have same device type!"
+
         if cuda_available:
             # Horovod: pin GPU to local rank or the assigned GPU from spark.
             torch.cuda.set_device(_get_assigned_gpu_or_default(default=hvd.local_rank()))
