Commit 98db066
clear locally accumulated gradients by assigning zeros_like so that infinite gradients are correctly cleared (#3505)

Signed-off-by: Yun Dai <yudai@yudai-ld2.linkedin.biz>
yundai424 committed Apr 15, 2022
1 parent f9d7f77 commit 98db066
Showing 2 changed files with 29 additions and 2 deletions.
4 changes: 2 additions & 2 deletions horovod/tensorflow/gradient_aggregation_eager.py
@@ -119,8 +119,8 @@ def _allreduce_helper(self, grads, vars):
     def _clear_vars(self):
         self.counter.assign(0)
         for idx in self.locally_aggregated_grads.keys():
-            self.locally_aggregated_grads[idx].assign_add(
-                -1 * self.locally_aggregated_grads[idx])
+            self.locally_aggregated_grads[idx].assign(
+                tf.zeros_like(self.locally_aggregated_grads[idx]))
 
     def apply_gradients(self, apply_grads_closure, optimizer, *args, **kwargs):
         def increment_optimizer_iteration():
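Why the change matters, as a minimal sketch in plain TensorFlow eager mode (the variable below is illustrative, not Horovod's actual accumulator): IEEE floating point defines inf + (-inf) as NaN, so the old assign_add(-1 * grad) clearing leaves NaN behind whenever an accumulated gradient is infinite, while assign(tf.zeros_like(grad)) always resets it to zero.

import tensorflow as tf

acc = tf.Variable(float('inf'))   # an accumulated gradient that overflowed to inf

# old clearing strategy: add the negated accumulator to itself
acc.assign_add(-1 * acc)
print(acc.numpy())                # nan, since inf + (-inf) is NaN, not 0

# fixed clearing strategy: overwrite with zeros of the same shape and dtype
acc.assign(float('inf'))          # simulate another overflowed accumulation
acc.assign(tf.zeros_like(acc))
print(acc.numpy())                # 0.0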
27 changes: 27 additions & 0 deletions test/parallel/test_tensorflow2_keras.py
@@ -182,6 +182,33 @@ def test_sparse_as_dense_with_grad_aggregation(self):
         aggregation_counter = opt._agg_helper.counter.numpy()
         assert aggregation_counter == training_steps % backward_passes_per_step
 
+    def test_grad_aggregation_with_inf_grad(self):
+        backward_passes_per_step = 2
+        step_count = tf.Variable(0, trainable=False, dtype=tf.int32)
+        opt = tf.optimizers.SGD()
+        opt = hvd.DistributedOptimizer(
+            opt,
+            backward_passes_per_step=backward_passes_per_step,
+            sparse_as_dense=True
+        )
+        x = tf.Variable(0.)
+        var = [x]
+
+        def loss():
+            step_count.assign_add(1)
+            return tf.cond(
+                pred=tf.greater(step_count, 1),
+                true_fn=lambda: x,
+                false_fn=lambda: x * float('inf')
+            )
+        for _ in range(2 * backward_passes_per_step):
+            # in the first aggregation cycle the gradient is infinite;
+            # it should be cleared to zero after apply_gradients
+            # so it does not affect the 2nd aggregation cycle
+            grads_and_vars = opt._compute_gradients(loss=loss, var_list=var)
+            opt.apply_gradients(grads_and_vars)
+        assert tf.math.is_finite(grads_and_vars[0][0])
 
     def test_from_config(self):
         opt = keras.optimizers.Adam()
         hopt = hvd.DistributedOptimizer(opt)
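For context, a rough standalone sketch of the aggregation cycle the new test exercises, using plain TensorFlow rather than Horovod (the accumulator variable and the fake gradient values are illustrative assumptions): with backward_passes_per_step=2, gradients are accumulated over two backward passes, applied once, and the accumulator is then cleared, so an infinite gradient from the first cycle must not leak into the second.

import tensorflow as tf

backward_passes_per_step = 2
x = tf.Variable(0.)
acc = tf.Variable(0., trainable=False)    # stand-in for the local gradient accumulator
opt = tf.optimizers.SGD()

# the first backward pass yields an infinite gradient, later passes are finite
fake_grads = [float('inf'), 1., 1., 1.]
for step, g in enumerate(fake_grads, start=1):
    acc.assign_add(g)
    if step % backward_passes_per_step == 0:
        opt.apply_gradients([(acc / backward_passes_per_step, x)])
        # the fix: reset with zeros_like; assign_add(-1 * acc) would leave NaN here
        acc.assign(tf.zeros_like(acc))

print(acc.numpy())   # 0.0 with the fixed clearing; the old strategy would leave nan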
