Fix PyTorch and Keras issues.
Signed-off-by: Josh Romero <joshr@nvidia.com>
romerojosh committed Nov 16, 2020
1 parent dc79a9a commit 219f551
Showing 3 changed files with 10 additions and 3 deletions.
6 changes: 4 additions & 2 deletions horovod/_keras/__init__.py
@@ -28,7 +28,8 @@
def create_distributed_optimizer(keras, optimizer, name, device_dense, device_sparse,
compression, sparse_as_dense, gradient_predivide_factor,
op, backward_passes_per_step=1,
- average_aggregated_gradients=False):
+ average_aggregated_gradients=False,
+ num_groups=0):
class _DistributedOptimizer(keras.optimizers.Optimizer):
_HAS_AGGREGATE_GRAD = True

@@ -43,7 +44,8 @@ def __init__(self, **kwargs):
compression,
sparse_as_dense,
op,
- gradient_predivide_factor)
+ gradient_predivide_factor,
+ num_groups)

self._agg_helper = None
if backward_passes_per_step > 1:
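For context, "explicit grouping" here means partitioning the list of gradient tensors into a fixed number of buckets so each bucket can be reduced as one fused operation. A minimal, hypothetical sketch of that partitioning follows; the helper name and the exact splitting strategy are illustrative assumptions, not Horovod's internal implementation inside `_make_allreduce_grads_fn`:

```python
# Illustrative only: how a list of gradients might be split into num_groups
# buckets, with each bucket then handed to one grouped allreduce call.
def split_into_groups(grads, num_groups):
    """Partition `grads` into `num_groups` roughly equal sublists."""
    if num_groups <= 0:
        return [grads]  # num_groups=0 keeps the default: no explicit groups
    size = -(-len(grads) // num_groups)  # ceiling division
    return [grads[i:i + size] for i in range(0, len(grads), size)]

grads = [f"grad_{i}" for i in range(10)]
for group in split_into_groups(grads, num_groups=3):
    print(group)  # each sublist would become one fused allreduce
```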
6 changes: 5 additions & 1 deletion horovod/keras/__init__.py
@@ -38,7 +38,8 @@ def DistributedOptimizer(optimizer, name=None,
compression=Compression.none,
sparse_as_dense=False,
gradient_predivide_factor=1.0,
- op=Average):
+ op=Average,
+ num_groups=0):
"""
An optimizer that wraps another keras.optimizers.Optimizer, using an allreduce to
average gradient values before applying gradients to model weights.
@@ -65,6 +66,8 @@ def DistributedOptimizer(optimizer, name=None,
gradient_predivide_factor / size after the sum.
op: The reduction operation to use when combining gradients across
different ranks. Defaults to Average.
+ num_groups: Number of groups to assign gradient allreduce ops to for explicit
+     grouping. Defaults to no explicit groups.
"""
if gradient_predivide_factor != 1.0 and rocm_built():
raise ValueError('gradient_predivide_factor not supported yet with ROCm')
@@ -82,6 +85,7 @@
sparse_as_dense=sparse_as_dense,
gradient_predivide_factor=gradient_predivide_factor,
op=op,
+ num_groups=num_groups,
)


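A minimal usage sketch of the new keyword with `horovod.keras`: everything except the `num_groups=2` argument added by this commit is ordinary Horovod Keras boilerplate, and the model, optimizer, and learning-rate scaling are illustrative assumptions:

```python
import keras
import horovod.keras as hvd

hvd.init()

model = keras.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=(784,)),
    keras.layers.Dense(10, activation='softmax'),
])

# Scale the learning rate by the number of ranks, as Horovod recommends.
opt = keras.optimizers.SGD(learning_rate=0.01 * hvd.size())

# num_groups=2 assigns the gradient allreduce ops to two explicit groups;
# num_groups=0 (the default) keeps the previous, ungrouped behavior.
opt = hvd.DistributedOptimizer(opt, num_groups=2)

model.compile(loss='categorical_crossentropy', optimizer=opt)
```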
1 change: 1 addition & 0 deletions test/parallel/test_torch.py
@@ -644,6 +644,7 @@ def test_horovod_grouped_allreduce_average(self):
tensors = [torch.FloatTensor(*([17] * dim)).random_(-100, 100) for _ in range(5)]
tensors = [self.cast_and_place(tensor, dtype) for tensor in tensors]
averaged = hvd.grouped_allreduce(tensors, average=True)
+ tensors, averaged = zip(*[self.convert_cpu_fp16_to_fp32(t, m) for t, m in zip(tensors, averaged)])

# Threshold for floating point equality depends on number of
# ranks, since we're comparing against precise multiplication.
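The test above exercises `hvd.grouped_allreduce` directly; below is a standalone sketch of that call outside the test harness. The tensor count and shape are arbitrary, and `convert_cpu_fp16_to_fp32` is a helper of the test class, so it is omitted here:

```python
import torch
import horovod.torch as hvd

hvd.init()

# Five tensors reduced together as one group, mirroring the test above.
tensors = [torch.FloatTensor(17, 17).random_(-100, 100) for _ in range(5)]

# average=True divides the summed result by the number of ranks, so each
# output tensor holds the elementwise mean across all workers.
averaged = hvd.grouped_allreduce(tensors, average=True)

# On every rank, averaged[i] now matches the mean of tensors[i] across ranks.
print([t.shape for t in averaged])
```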
