From e0ab9719cac18ed1abebabfaa47308b239a7ed74 Mon Sep 17 00:00:00 2001
From: Travis Addair
Date: Tue, 17 Nov 2020 13:25:16 -0800
Subject: [PATCH 1/4] Require initial_lr parameter to LearningRateScheduleCallback

Signed-off-by: Travis Addair
---
 horovod/_keras/callbacks.py           |  6 +++---
 horovod/keras/callbacks.py            | 14 +++++---------
 horovod/tensorflow/keras/callbacks.py | 14 +++++---------
 3 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/horovod/_keras/callbacks.py b/horovod/_keras/callbacks.py
index 4b4ca570d5..fce39d944a 100644
--- a/horovod/_keras/callbacks.py
+++ b/horovod/_keras/callbacks.py
@@ -87,8 +87,8 @@ def on_epoch_end(self, epoch, logs=None):
 
 
 class LearningRateScheduleCallbackImpl(object):
-    def __init__(self, backend, multiplier, start_epoch=0, end_epoch=None, staircase=True,
-                 momentum_correction=True, steps_per_epoch=None, initial_lr=None, *args):
+    def __init__(self, backend, initial_lr, multiplier, start_epoch=0, end_epoch=None, staircase=True,
+                 momentum_correction=True, steps_per_epoch=None, *args):
         super(LearningRateScheduleCallbackImpl, self).__init__(*args)
         self.backend = backend
         self.start_epoch = start_epoch
@@ -107,7 +107,7 @@ def __init__(self, backend, multiplier, start_epoch=0, end_epoch=None, staircase
             self.multiplier = multiplier
 
         if self.initial_lr is None:
-            warnings.warn('Parameter `initial_lr` will be required in v0.21.0', DeprecationWarning)
+            raise ValueError('Parameter `initial_lr` is required')
 
     def _autodetect_steps_per_epoch(self):
         if self.params.get('steps'):
diff --git a/horovod/keras/callbacks.py b/horovod/keras/callbacks.py
index 2bf11cd91f..159f887f12 100644
--- a/horovod/keras/callbacks.py
+++ b/horovod/keras/callbacks.py
@@ -82,12 +82,13 @@ class LearningRateScheduleCallback(_impl.LearningRateScheduleCallbackImpl, keras
     `initial_lr` is the learning rate of the model optimizer at the start of the training.
     """
 
-    def __init__(self, multiplier, start_epoch=0, end_epoch=None, staircase=True,
-                 momentum_correction=True, steps_per_epoch=None, initial_lr=None):
+    def __init__(self, initial_lr, multiplier, start_epoch=0, end_epoch=None, staircase=True,
+                 momentum_correction=True, steps_per_epoch=None):
         """
         Construct a new LearningRateScheduleCallback.
 
         Args:
+            initial_lr: Initial learning rate at the start of training.
             multiplier: A constant multiplier or a function `f(epoch) = lr'`
             start_epoch: The first epoch this adjustment will be applied to. Defaults to 0.
             end_epoch: The epoch this adjustment will stop applying (exclusive end).
@@ -99,14 +100,9 @@ def __init__(self, multiplier, start_epoch=0, end_epoch=None, staircase=True,
             steps_per_epoch: The callback will attempt to autodetect number of batches
                              per epoch with Keras >= 2.0.0. Provide this value if you have an older
                              version of Keras.
-            initial_lr: Initial learning rate at the start of training.
-
-                .. warning:: Will be required in v0.21.0.
-
         """
-        super(LearningRateScheduleCallback, self).__init__(K, multiplier, start_epoch, end_epoch,
-                                                           staircase, momentum_correction, steps_per_epoch,
-                                                           initial_lr)
+        super(LearningRateScheduleCallback, self).__init__(K, initial_lr, multiplier, start_epoch, end_epoch,
+                                                           staircase, momentum_correction, steps_per_epoch)
 
 
 class LearningRateWarmupCallback(_impl.LearningRateWarmupCallbackImpl, keras.callbacks.Callback):
diff --git a/horovod/tensorflow/keras/callbacks.py b/horovod/tensorflow/keras/callbacks.py
index dcb2cb3e53..9f1f161165 100644
--- a/horovod/tensorflow/keras/callbacks.py
+++ b/horovod/tensorflow/keras/callbacks.py
@@ -82,12 +82,13 @@ class LearningRateScheduleCallback(_impl.LearningRateScheduleCallbackImpl, keras
     `initial_lr` is the learning rate of the model optimizer at the start of the training.
     """
 
-    def __init__(self, multiplier, start_epoch=0, end_epoch=None, staircase=True,
-                 momentum_correction=True, steps_per_epoch=None, initial_lr=None):
+    def __init__(self, initial_lr, multiplier, start_epoch=0, end_epoch=None, staircase=True,
+                 momentum_correction=True, steps_per_epoch=None):
         """
         Construct a new LearningRateScheduleCallback.
 
         Args:
+            initial_lr: Initial learning rate at the start of training.
             multiplier: A constant multiplier or a function `f(epoch) = lr'`
             start_epoch: The first epoch this adjustment will be applied to. Defaults to 0.
             end_epoch: The epoch this adjustment will stop applying (exclusive end).
@@ -99,14 +100,9 @@ def __init__(self, multiplier, start_epoch=0, end_epoch=None, staircase=True,
             steps_per_epoch: The callback will attempt to autodetect number of batches
                              per epoch with Keras >= 2.0.0. Provide this value if you have an older
                              version of Keras.
-            initial_lr: Initial learning rate at the start of training.
-
-                .. warning:: Will be required in v0.21.0.
-
         """
-        super(LearningRateScheduleCallback, self).__init__(K, multiplier, start_epoch, end_epoch,
-                                                           staircase, momentum_correction, steps_per_epoch,
-                                                           initial_lr)
+        super(LearningRateScheduleCallback, self).__init__(K, initial_lr, multiplier, start_epoch, end_epoch,
+                                                           staircase, momentum_correction, steps_per_epoch)
 
 
 class LearningRateWarmupCallback(_impl.LearningRateWarmupCallbackImpl, keras.callbacks.Callback):

From 68daa9fd33bde1b6790ffa743375ea49b084784a Mon Sep 17 00:00:00 2001
From: Travis Addair
Date: Tue, 17 Nov 2020 13:32:31 -0800
Subject: [PATCH 2/4] Required for LearningRateWarmupCallback

Signed-off-by: Travis Addair
---
 examples/keras/keras_imagenet_resnet50.py        | 15 +++++++++------
 examples/keras/keras_mnist_advanced.py           |  2 +-
 examples/ray/tensorflow2_mnist_ray.py            |  2 +-
 examples/spark/keras/keras_spark3_rossmann.py    |  2 +-
 examples/spark/keras/keras_spark_rossmann_run.py |  2 +-
 examples/tensorflow2/tensorflow2_keras_mnist.py  |  2 +-
 horovod/_keras/callbacks.py                      |  8 ++++----
 horovod/keras/callbacks.py                       | 12 +++++-------
 horovod/tensorflow/keras/callbacks.py            | 12 +++++-------
 9 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/examples/keras/keras_imagenet_resnet50.py b/examples/keras/keras_imagenet_resnet50.py
index c09d13eade..2a687f10e4 100644
--- a/examples/keras/keras_imagenet_resnet50.py
+++ b/examples/keras/keras_imagenet_resnet50.py
@@ -142,15 +142,18 @@
     # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
     # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
     # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
-    hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=args.warmup_epochs, initial_lr=initial_lr,
+    hvd.callbacks.LearningRateWarmupCallback(initial_lr=initial_lr,
+                                             warmup_epochs=args.warmup_epochs,
                                              verbose=verbose),
 
     # Horovod: after the warmup reduce learning rate by 10 on the 30th, 60th and 80th epochs.
-    hvd.callbacks.LearningRateScheduleCallback(start_epoch=args.warmup_epochs, end_epoch=30, multiplier=1.,
-                                               initial_lr=initial_lr),
-    hvd.callbacks.LearningRateScheduleCallback(start_epoch=30, end_epoch=60, multiplier=1e-1, initial_lr=initial_lr),
-    hvd.callbacks.LearningRateScheduleCallback(start_epoch=60, end_epoch=80, multiplier=1e-2, initial_lr=initial_lr),
-    hvd.callbacks.LearningRateScheduleCallback(start_epoch=80, multiplier=1e-3, initial_lr=initial_lr),
+    hvd.callbacks.LearningRateScheduleCallback(initial_lr=initial_lr,
+                                               multiplier=1.,
+                                               start_epoch=args.warmup_epochs,
+                                               end_epoch=30),
+    hvd.callbacks.LearningRateScheduleCallback(initial_lr=initial_lr, multiplier=1e-1, start_epoch=30, end_epoch=60),
+    hvd.callbacks.LearningRateScheduleCallback(initial_lr=initial_lr, multiplier=1e-2, start_epoch=60, end_epoch=80),
+    hvd.callbacks.LearningRateScheduleCallback(initial_lr=initial_lr, multiplier=1e-3, start_epoch=80),
 ]
 
 # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
diff --git a/examples/keras/keras_mnist_advanced.py b/examples/keras/keras_mnist_advanced.py
index de3fa86aee..3685300d1c 100644
--- a/examples/keras/keras_mnist_advanced.py
+++ b/examples/keras/keras_mnist_advanced.py
@@ -106,7 +106,7 @@
     # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
     # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
     # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
-    hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, initial_lr=scaled_lr, verbose=1),
+    hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=5, verbose=1),
 
     # Reduce the learning rate if training plateaues.
     keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1),
diff --git a/examples/ray/tensorflow2_mnist_ray.py b/examples/ray/tensorflow2_mnist_ray.py
index 10c5a5ad2a..4ce279f2dc 100644
--- a/examples/ray/tensorflow2_mnist_ray.py
+++ b/examples/ray/tensorflow2_mnist_ray.py
@@ -73,7 +73,7 @@ def train(num_epochs):
         # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
         # the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
         hvd.callbacks.LearningRateWarmupCallback(
-            warmup_epochs=3, initial_lr=scaled_lr, verbose=1),
+            initial_lr=scaled_lr, warmup_epochs=3, verbose=1),
     ]
 
     # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
diff --git a/examples/spark/keras/keras_spark3_rossmann.py b/examples/spark/keras/keras_spark3_rossmann.py
index 1b5aa86427..d21967f0e2 100644
--- a/examples/spark/keras/keras_spark3_rossmann.py
+++ b/examples/spark/keras/keras_spark3_rossmann.py
@@ -446,7 +446,7 @@ def train_fn(model_bytes):
         # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
         # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
         # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
-        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, initial_lr=scaled_lr, verbose=verbose),
+        hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=5, verbose=verbose),
 
         # Reduce LR if the metric is not improved for 10 epochs, and stop training
         # if it has not improved for 20 epochs.
diff --git a/examples/spark/keras/keras_spark_rossmann_run.py b/examples/spark/keras/keras_spark_rossmann_run.py
index 022fd9053a..2c58bb7753 100644
--- a/examples/spark/keras/keras_spark_rossmann_run.py
+++ b/examples/spark/keras/keras_spark_rossmann_run.py
@@ -435,7 +435,7 @@ def train_fn(model_bytes):
         # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
         # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
         # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
-        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, initial_lr=scaled_lr, verbose=verbose),
+        hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=5, verbose=verbose),
 
         # Reduce LR if the metric is not improved for 10 epochs, and stop training
         # if it has not improved for 20 epochs.
diff --git a/examples/tensorflow2/tensorflow2_keras_mnist.py b/examples/tensorflow2/tensorflow2_keras_mnist.py
index 8ae1ca60b5..4872dce4f8 100644
--- a/examples/tensorflow2/tensorflow2_keras_mnist.py
+++ b/examples/tensorflow2/tensorflow2_keras_mnist.py
@@ -76,7 +76,7 @@
     # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
     # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
    # the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
-    hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=3, initial_lr=scaled_lr, verbose=1),
+    hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=3, verbose=1),
 ]
 
 # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
diff --git a/horovod/_keras/callbacks.py b/horovod/_keras/callbacks.py
index fce39d944a..9ab31b3aeb 100644
--- a/horovod/_keras/callbacks.py
+++ b/horovod/_keras/callbacks.py
@@ -170,16 +170,16 @@ def on_epoch_end(self, epoch, logs=None):
 
 
 class LearningRateWarmupCallbackImpl(LearningRateScheduleCallbackImpl):
-    def __init__(self, backend, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
-                 verbose=0, initial_lr=None, *args):
+    def __init__(self, backend, initial_lr, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
+                 verbose=0, *args):
         def multiplier(epoch):
             # Adjust epoch to produce round numbers at the end of each epoch, so that TensorBoard
             # learning rate graphs look better.
             epoch += 1. / self.steps_per_epoch
             return 1. / hvd.size() * (epoch * (hvd.size() - 1) / warmup_epochs + 1)
         super(LearningRateWarmupCallbackImpl, self).__init__(
-            backend, multiplier, start_epoch=0, end_epoch=warmup_epochs, staircase=False,
-            momentum_correction=momentum_correction, steps_per_epoch=steps_per_epoch, initial_lr=initial_lr,
+            backend, initial_lr, multiplier, start_epoch=0, end_epoch=warmup_epochs, staircase=False,
+            momentum_correction=momentum_correction, steps_per_epoch=steps_per_epoch,
             *args)
         self.verbose = verbose
diff --git a/horovod/keras/callbacks.py b/horovod/keras/callbacks.py
index 159f887f12..63b4321386 100644
--- a/horovod/keras/callbacks.py
+++ b/horovod/keras/callbacks.py
@@ -129,12 +129,13 @@ class LearningRateWarmupCallback(_impl.LearningRateWarmupCallbackImpl, keras.cal
         lr'(epoch = warmup) &= lr
     """
 
-    def __init__(self, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
-                 verbose=0, initial_lr=None):
+    def __init__(self, initial_lr, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
+                 verbose=0):
         """
         Construct a new LearningRateWarmupCallback that will gradually warm up the learning rate.
 
         Args:
+            initial_lr: Initial learning rate at the start of training.
             warmup_epochs: The number of epochs of the warmup phase. Defaults to 5.
             momentum_correction: Apply momentum correction to optimizers that have momentum.
                                  Defaults to True.
@@ -142,12 +143,9 @@ def __init__(self, warmup_epochs=5, momentum_correction=True, steps_per_epoch=No
                              epoch with Keras >= 2.0.0. Provide this value if you have an older
                              version of Keras.
             verbose: verbosity mode, 0 or 1.
-            initial_lr: Initial learning rate at the start of training.
-
-                .. warning:: Will be required in v0.21.0.
         """
-        super(LearningRateWarmupCallback, self).__init__(K, warmup_epochs, momentum_correction,
-                                                         steps_per_epoch, verbose, initial_lr)
+        super(LearningRateWarmupCallback, self).__init__(K, initial_lr, warmup_epochs, momentum_correction,
+                                                         steps_per_epoch, verbose)
 
 
 class BestModelCheckpoint(keras.callbacks.ModelCheckpoint):
diff --git a/horovod/tensorflow/keras/callbacks.py b/horovod/tensorflow/keras/callbacks.py
index 9f1f161165..b91f367088 100644
--- a/horovod/tensorflow/keras/callbacks.py
+++ b/horovod/tensorflow/keras/callbacks.py
@@ -129,12 +129,13 @@ class LearningRateWarmupCallback(_impl.LearningRateWarmupCallbackImpl, keras.cal
         lr'(epoch = warmup) &= lr
     """
 
-    def __init__(self, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
-                 verbose=0, initial_lr=None):
+    def __init__(self, initial_lr, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
+                 verbose=0):
         """
         Construct a new LearningRateWarmupCallback that will gradually warm up the learning rate.
 
         Args:
+            initial_lr: Initial learning rate at the start of training.
             warmup_epochs: The number of epochs of the warmup phase. Defaults to 5.
             momentum_correction: Apply momentum correction to optimizers that have momentum.
                                  Defaults to True.
@@ -142,12 +143,9 @@ def __init__(self, warmup_epochs=5, momentum_correction=True, steps_per_epoch=No
                              epoch with Keras >= 2.0.0. Provide this value if you have an older
                              version of Keras.
             verbose: verbosity mode, 0 or 1.
-            initial_lr: Initial learning rate at the start of training.
-
-                .. warning:: Will be required in v0.21.0.
""" - super(LearningRateWarmupCallback, self).__init__(K, warmup_epochs, momentum_correction, - steps_per_epoch, verbose, initial_lr) + super(LearningRateWarmupCallback, self).__init__(K, initial_lr, warmup_epochs, momentum_correction, + steps_per_epoch, verbose) class BestModelCheckpoint(keras.callbacks.ModelCheckpoint): From a0894ef7e36c62495b298f04cb0ba68ba6d019d7 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Fri, 20 Nov 2020 07:42:58 -0800 Subject: [PATCH 3/4] Updated changelog Signed-off-by: Travis Addair --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c8f6f472a..7c95581dfd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Changed +- Require parameter `initial_lr` of Keras callbacks `LearningRateScheduleCallback` and `LearningRateWarmupCallback` ([#2459](https://github.com/horovod/horovod/pull/2459)) + - Changed default cycle time from 5ms to 1ms and fusion threshold from 64MB to 128MB. ([#2468](https://github.com/horovod/horovod/pull/2468)) ### Deprecated From 881c1bd086edcb81777a31fe11be0ddd7b0bc900 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Fri, 20 Nov 2020 07:44:25 -0800 Subject: [PATCH 4/4] Fixed wording Signed-off-by: Travis Addair --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7c95581dfd..dc6606d39c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,7 +18,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Changed -- Require parameter `initial_lr` of Keras callbacks `LearningRateScheduleCallback` and `LearningRateWarmupCallback` ([#2459](https://github.com/horovod/horovod/pull/2459)) +- Changed Keras callbacks to require parameter `initial_lr` of `LearningRateScheduleCallback` and `LearningRateWarmupCallback`. ([#2459](https://github.com/horovod/horovod/pull/2459)) - Changed default cycle time from 5ms to 1ms and fusion threshold from 64MB to 128MB. ([#2468](https://github.com/horovod/horovod/pull/2468))