Require initial_lr parameter to LearningRateScheduleCallback #2459

Merged: 4 commits, Nov 20, 2020
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -18,6 +18,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

### Changed

- Changed Keras callbacks to require parameter `initial_lr` of `LearningRateScheduleCallback` and `LearningRateWarmupCallback`. ([#2459](https://github.com/horovod/horovod/pull/2459))

- Changed default cycle time from 5ms to 1ms and fusion threshold from 64MB to 128MB. ([#2468](https://github.com/horovod/horovod/pull/2468))

### Deprecated
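For users migrating, a minimal sketch of the call pattern after this change; the base learning rate, warmup length, and schedule boundaries below are illustrative, not taken from this PR:

    import horovod.keras as hvd

    hvd.init()

    # Linear-scaling convention used in the Horovod examples: the per-worker
    # base LR is multiplied by the number of workers.
    scaled_lr = 0.001 * hvd.size()

    callbacks = [
        # `initial_lr` is now required and comes first in both callbacks.
        hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=5, verbose=1),
        hvd.callbacks.LearningRateScheduleCallback(initial_lr=scaled_lr, multiplier=1e-1,
                                                   start_epoch=30, end_epoch=60),
    ]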
15 changes: 9 additions & 6 deletions examples/keras/keras_imagenet_resnet50.py
@@ -142,15 +142,18 @@
# Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
# accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
# the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
- hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=args.warmup_epochs, initial_lr=initial_lr,
+ hvd.callbacks.LearningRateWarmupCallback(initial_lr=initial_lr,
+ warmup_epochs=args.warmup_epochs,
verbose=verbose),

# Horovod: after the warmup reduce learning rate by 10 on the 30th, 60th and 80th epochs.
- hvd.callbacks.LearningRateScheduleCallback(start_epoch=args.warmup_epochs, end_epoch=30, multiplier=1.,
- initial_lr=initial_lr),
- hvd.callbacks.LearningRateScheduleCallback(start_epoch=30, end_epoch=60, multiplier=1e-1, initial_lr=initial_lr),
- hvd.callbacks.LearningRateScheduleCallback(start_epoch=60, end_epoch=80, multiplier=1e-2, initial_lr=initial_lr),
- hvd.callbacks.LearningRateScheduleCallback(start_epoch=80, multiplier=1e-3, initial_lr=initial_lr),
+ hvd.callbacks.LearningRateScheduleCallback(initial_lr=initial_lr,
+ multiplier=1.,
+ start_epoch=args.warmup_epochs,
+ end_epoch=30),
+ hvd.callbacks.LearningRateScheduleCallback(initial_lr=initial_lr, multiplier=1e-1, start_epoch=30, end_epoch=60),
+ hvd.callbacks.LearningRateScheduleCallback(initial_lr=initial_lr, multiplier=1e-2, start_epoch=60, end_epoch=80),
+ hvd.callbacks.LearningRateScheduleCallback(initial_lr=initial_lr, multiplier=1e-3, start_epoch=80),
]

# Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
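Taken together, the callbacks in this example implement a warmup phase followed by a step schedule; a sketch of the resulting learning rate, derived from the multipliers above:

    # Effective learning rate produced by the callbacks in this example:
    #   epochs [0, warmup_epochs): ramps from initial_lr / hvd.size() up to initial_lr
    #   epochs [warmup_epochs, 30): initial_lr * 1.0
    #   epochs [30, 60):            initial_lr * 1e-1
    #   epochs [60, 80):            initial_lr * 1e-2
    #   epochs [80, ...):           initial_lr * 1e-3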
2 changes: 1 addition & 1 deletion examples/keras/keras_mnist_advanced.py
@@ -106,7 +106,7 @@
# Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
# accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
# the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
- hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, initial_lr=scaled_lr, verbose=1),
+ hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=5, verbose=1),

# Reduce the learning rate if training plateaues.
keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1),
2 changes: 1 addition & 1 deletion examples/ray/tensorflow2_mnist_ray.py
@@ -73,7 +73,7 @@ def train(num_epochs):
# accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
# the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
hvd.callbacks.LearningRateWarmupCallback(
- warmup_epochs=3, initial_lr=scaled_lr, verbose=1),
+ initial_lr=scaled_lr, warmup_epochs=3, verbose=1),
]

# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
2 changes: 1 addition & 1 deletion examples/spark/keras/keras_spark3_rossmann.py
@@ -446,7 +446,7 @@ def train_fn(model_bytes):
# Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
# accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
# the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
- hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, initial_lr=scaled_lr, verbose=verbose),
+ hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=5, verbose=verbose),

# Reduce LR if the metric is not improved for 10 epochs, and stop training
# if it has not improved for 20 epochs.
2 changes: 1 addition & 1 deletion examples/spark/keras/keras_spark_rossmann_run.py
@@ -435,7 +435,7 @@ def train_fn(model_bytes):
# Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
# accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
# the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
- hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, initial_lr=scaled_lr, verbose=verbose),
+ hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=5, verbose=verbose),

# Reduce LR if the metric is not improved for 10 epochs, and stop training
# if it has not improved for 20 epochs.
2 changes: 1 addition & 1 deletion examples/tensorflow2/tensorflow2_keras_mnist.py
@@ -76,7 +76,7 @@
# Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
# accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
# the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
- hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=3, initial_lr=scaled_lr, verbose=1),
+ hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=3, verbose=1),
]

# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
14 changes: 7 additions & 7 deletions horovod/_keras/callbacks.py
@@ -87,8 +87,8 @@ def on_epoch_end(self, epoch, logs=None):


class LearningRateScheduleCallbackImpl(object):
- def __init__(self, backend, multiplier, start_epoch=0, end_epoch=None, staircase=True,
- momentum_correction=True, steps_per_epoch=None, initial_lr=None, *args):
+ def __init__(self, backend, initial_lr, multiplier, start_epoch=0, end_epoch=None, staircase=True,
+ momentum_correction=True, steps_per_epoch=None, *args):
super(LearningRateScheduleCallbackImpl, self).__init__(*args)
self.backend = backend
self.start_epoch = start_epoch
@@ -107,7 +107,7 @@ def __init__(self, backend, multiplier, start_epoch=0, end_epoch=None, staircase
self.multiplier = multiplier

if self.initial_lr is None:
- warnings.warn('Parameter `initial_lr` will be required in v0.21.0', DeprecationWarning)
+ raise ValueError('Parameter `initial_lr` is required')

def _autodetect_steps_per_epoch(self):
if self.params.get('steps'):
@@ -170,16 +170,16 @@ def on_epoch_end(self, epoch, logs=None):


class LearningRateWarmupCallbackImpl(LearningRateScheduleCallbackImpl):
- def __init__(self, backend, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
- verbose=0, initial_lr=None, *args):
+ def __init__(self, backend, initial_lr, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
+ verbose=0, *args):
def multiplier(epoch):
# Adjust epoch to produce round numbers at the end of each epoch, so that TensorBoard
# learning rate graphs look better.
epoch += 1. / self.steps_per_epoch
return 1. / hvd.size() * (epoch * (hvd.size() - 1) / warmup_epochs + 1)
super(LearningRateWarmupCallbackImpl, self).__init__(
- backend, multiplier, start_epoch=0, end_epoch=warmup_epochs, staircase=False,
- momentum_correction=momentum_correction, steps_per_epoch=steps_per_epoch, initial_lr=initial_lr,
+ backend, initial_lr, multiplier, start_epoch=0, end_epoch=warmup_epochs, staircase=False,
+ momentum_correction=momentum_correction, steps_per_epoch=steps_per_epoch,
*args)
self.verbose = verbose

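For context, the `multiplier` closure above interpolates linearly from 1/hvd.size() at the start of training to 1 at the end of warmup, so the learning rate ramps from initial_lr / hvd.size() up to initial_lr. A small self-contained check of that formula (the 1/steps_per_epoch rounding adjustment is dropped for simplicity, and the helper name is illustrative):

    def warmup_multiplier(epoch, size, warmup_epochs):
        # Same expression as in LearningRateWarmupCallbackImpl, minus the
        # per-step rounding term.
        return 1.0 / size * (epoch * (size - 1) / warmup_epochs + 1)

    size, warmup = 8, 5
    assert abs(warmup_multiplier(0, size, warmup) - 1.0 / size) < 1e-12   # lr'(0) == lr / size
    assert abs(warmup_multiplier(warmup, size, warmup) - 1.0) < 1e-12     # lr'(warmup) == lr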
26 changes: 10 additions & 16 deletions horovod/keras/callbacks.py
@@ -82,12 +82,13 @@ class LearningRateScheduleCallback(_impl.LearningRateScheduleCallbackImpl, keras
`initial_lr` is the learning rate of the model optimizer at the start of the training.
"""

- def __init__(self, multiplier, start_epoch=0, end_epoch=None, staircase=True,
- momentum_correction=True, steps_per_epoch=None, initial_lr=None):
+ def __init__(self, initial_lr, multiplier, start_epoch=0, end_epoch=None, staircase=True,
+ momentum_correction=True, steps_per_epoch=None):
"""
Construct a new LearningRateScheduleCallback.

Args:
+ initial_lr: Initial learning rate at the start of training.
multiplier: A constant multiplier or a function `f(epoch) = lr'`
start_epoch: The first epoch this adjustment will be applied to. Defaults to 0.
end_epoch: The epoch this adjustment will stop applying (exclusive end).
@@ -99,14 +100,9 @@ def __init__(self, multiplier, start_epoch=0, end_epoch=None, staircase=True,
steps_per_epoch: The callback will attempt to autodetect number of batches per
epoch with Keras >= 2.0.0. Provide this value if you have an older
version of Keras.
- initial_lr: Initial learning rate at the start of training.

- .. warning:: Will be required in v0.21.0.

"""
- super(LearningRateScheduleCallback, self).__init__(K, multiplier, start_epoch, end_epoch,
- staircase, momentum_correction, steps_per_epoch,
- initial_lr)
+ super(LearningRateScheduleCallback, self).__init__(K, initial_lr, multiplier, start_epoch, end_epoch,
+ staircase, momentum_correction, steps_per_epoch)


class LearningRateWarmupCallback(_impl.LearningRateWarmupCallbackImpl, keras.callbacks.Callback):
@@ -133,25 +129,23 @@ class LearningRateWarmupCallback(_impl.LearningRateWarmupCallbackImpl, keras.cal
lr'(epoch = warmup) &= lr
"""

- def __init__(self, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
- verbose=0, initial_lr=None):
+ def __init__(self, initial_lr, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
+ verbose=0):
"""
Construct a new LearningRateWarmupCallback that will gradually warm up the learning rate.

Args:
+ initial_lr: Initial learning rate at the start of training.
warmup_epochs: The number of epochs of the warmup phase. Defaults to 5.
momentum_correction: Apply momentum correction to optimizers that have momentum.
Defaults to True.
steps_per_epoch: The callback will attempt to autodetect number of batches per
epoch with Keras >= 2.0.0. Provide this value if you have an older
version of Keras.
verbose: verbosity mode, 0 or 1.
- initial_lr: Initial learning rate at the start of training.

- .. warning:: Will be required in v0.21.0.
"""
- super(LearningRateWarmupCallback, self).__init__(K, warmup_epochs, momentum_correction,
- steps_per_epoch, verbose, initial_lr)
+ super(LearningRateWarmupCallback, self).__init__(K, initial_lr, warmup_epochs, momentum_correction,
+ steps_per_epoch, verbose)


class BestModelCheckpoint(keras.callbacks.ModelCheckpoint):
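With the wrapper updated, a short sketch of the new behavior (the learning-rate value and epoch boundaries are illustrative):

    import horovod.keras as hvd

    hvd.init()

    # `initial_lr` is now the first, required parameter.
    cb = hvd.callbacks.LearningRateScheduleCallback(initial_lr=0.01, multiplier=1e-1,
                                                    start_epoch=30, end_epoch=60)

    # Passing None (the old implicit default) now fails fast with
    # ValueError('Parameter `initial_lr` is required'); omitting the argument
    # entirely raises the usual TypeError for a missing positional argument.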
26 changes: 10 additions & 16 deletions horovod/tensorflow/keras/callbacks.py
@@ -82,12 +82,13 @@ class LearningRateScheduleCallback(_impl.LearningRateScheduleCallbackImpl, keras
`initial_lr` is the learning rate of the model optimizer at the start of the training.
"""

- def __init__(self, multiplier, start_epoch=0, end_epoch=None, staircase=True,
- momentum_correction=True, steps_per_epoch=None, initial_lr=None):
+ def __init__(self, initial_lr, multiplier, start_epoch=0, end_epoch=None, staircase=True,
+ momentum_correction=True, steps_per_epoch=None):
"""
Construct a new LearningRateScheduleCallback.

Args:
+ initial_lr: Initial learning rate at the start of training.
multiplier: A constant multiplier or a function `f(epoch) = lr'`
start_epoch: The first epoch this adjustment will be applied to. Defaults to 0.
end_epoch: The epoch this adjustment will stop applying (exclusive end).
@@ -99,14 +100,9 @@ def __init__(self, multiplier, start_epoch=0, end_epoch=None, staircase=True,
steps_per_epoch: The callback will attempt to autodetect number of batches per
epoch with Keras >= 2.0.0. Provide this value if you have an older
version of Keras.
- initial_lr: Initial learning rate at the start of training.

- .. warning:: Will be required in v0.21.0.

"""
- super(LearningRateScheduleCallback, self).__init__(K, multiplier, start_epoch, end_epoch,
- staircase, momentum_correction, steps_per_epoch,
- initial_lr)
+ super(LearningRateScheduleCallback, self).__init__(K, initial_lr, multiplier, start_epoch, end_epoch,
+ staircase, momentum_correction, steps_per_epoch)


class LearningRateWarmupCallback(_impl.LearningRateWarmupCallbackImpl, keras.callbacks.Callback):
@@ -133,25 +129,23 @@ class LearningRateWarmupCallback(_impl.LearningRateWarmupCallbackImpl, keras.cal
lr'(epoch = warmup) &= lr
"""

- def __init__(self, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
- verbose=0, initial_lr=None):
+ def __init__(self, initial_lr, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
+ verbose=0):
"""
Construct a new LearningRateWarmupCallback that will gradually warm up the learning rate.

Args:
+ initial_lr: Initial learning rate at the start of training.
warmup_epochs: The number of epochs of the warmup phase. Defaults to 5.
momentum_correction: Apply momentum correction to optimizers that have momentum.
Defaults to True.
steps_per_epoch: The callback will attempt to autodetect number of batches per
epoch with Keras >= 2.0.0. Provide this value if you have an older
version of Keras.
verbose: verbosity mode, 0 or 1.
- initial_lr: Initial learning rate at the start of training.

- .. warning:: Will be required in v0.21.0.
"""
- super(LearningRateWarmupCallback, self).__init__(K, warmup_epochs, momentum_correction,
- steps_per_epoch, verbose, initial_lr)
+ super(LearningRateWarmupCallback, self).__init__(K, initial_lr, warmup_epochs, momentum_correction,
+ steps_per_epoch, verbose)


class BestModelCheckpoint(keras.callbacks.ModelCheckpoint):
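The tf.keras wrapper mirrors the Keras one; only the import path differs. A brief sketch (values illustrative):

    import horovod.tensorflow.keras as hvd

    hvd.init()
    cb = hvd.callbacks.LearningRateWarmupCallback(initial_lr=0.001 * hvd.size(),
                                                  warmup_epochs=3, verbose=1)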