From e0ab9719cac18ed1abebabfaa47308b239a7ed74 Mon Sep 17 00:00:00 2001
From: Travis Addair
Date: Tue, 17 Nov 2020 13:25:16 -0800
Subject: [PATCH 1/4] Require initial_lr parameter to LearningRateScheduleCallback

Signed-off-by: Travis Addair
---
 horovod/_keras/callbacks.py           |  6 +++---
 horovod/keras/callbacks.py            | 14 +++++---------
 horovod/tensorflow/keras/callbacks.py | 14 +++++---------
 3 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/horovod/_keras/callbacks.py b/horovod/_keras/callbacks.py
index 4b4ca570d5..fce39d944a 100644
--- a/horovod/_keras/callbacks.py
+++ b/horovod/_keras/callbacks.py
@@ -87,8 +87,8 @@ def on_epoch_end(self, epoch, logs=None):
 
 
 class LearningRateScheduleCallbackImpl(object):
-    def __init__(self, backend, multiplier, start_epoch=0, end_epoch=None, staircase=True,
-                 momentum_correction=True, steps_per_epoch=None, initial_lr=None, *args):
+    def __init__(self, backend, initial_lr, multiplier, start_epoch=0, end_epoch=None, staircase=True,
+                 momentum_correction=True, steps_per_epoch=None, *args):
         super(LearningRateScheduleCallbackImpl, self).__init__(*args)
         self.backend = backend
         self.start_epoch = start_epoch
@@ -107,7 +107,7 @@ def __init__(self, backend, multiplier, start_epoch=0, end_epoch=None, staircase
             self.multiplier = multiplier
 
         if self.initial_lr is None:
-            warnings.warn('Parameter `initial_lr` will be required in v0.21.0', DeprecationWarning)
+            raise ValueError('Parameter `initial_lr` is required')
 
     def _autodetect_steps_per_epoch(self):
         if self.params.get('steps'):
diff --git a/horovod/keras/callbacks.py b/horovod/keras/callbacks.py
index 2bf11cd91f..159f887f12 100644
--- a/horovod/keras/callbacks.py
+++ b/horovod/keras/callbacks.py
@@ -82,12 +82,13 @@ class LearningRateScheduleCallback(_impl.LearningRateScheduleCallbackImpl, keras
     `initial_lr` is the learning rate of the model optimizer at the start of the training.
     """
 
-    def __init__(self, multiplier, start_epoch=0, end_epoch=None, staircase=True,
-                 momentum_correction=True, steps_per_epoch=None, initial_lr=None):
+    def __init__(self, initial_lr, multiplier, start_epoch=0, end_epoch=None, staircase=True,
+                 momentum_correction=True, steps_per_epoch=None):
         """
         Construct a new LearningRateScheduleCallback.
 
         Args:
+            initial_lr: Initial learning rate at the start of training.
             multiplier: A constant multiplier or a function `f(epoch) = lr'`
             start_epoch: The first epoch this adjustment will be applied to. Defaults to 0.
             end_epoch: The epoch this adjustment will stop applying (exclusive end).
@@ -99,14 +100,9 @@ def __init__(self, multiplier, start_epoch=0, end_epoch=None, staircase=True,
             steps_per_epoch: The callback will attempt to autodetect number of batches
                              per epoch with Keras >= 2.0.0. Provide this value if you have an older
                              version of Keras.
-            initial_lr: Initial learning rate at the start of training.
-
-                .. warning:: Will be required in v0.21.0.
-
         """
-        super(LearningRateScheduleCallback, self).__init__(K, multiplier, start_epoch, end_epoch,
-                                                           staircase, momentum_correction, steps_per_epoch,
-                                                           initial_lr)
+        super(LearningRateScheduleCallback, self).__init__(K, initial_lr, multiplier, start_epoch, end_epoch,
+                                                           staircase, momentum_correction, steps_per_epoch)
 
 
 class LearningRateWarmupCallback(_impl.LearningRateWarmupCallbackImpl, keras.callbacks.Callback):
diff --git a/horovod/tensorflow/keras/callbacks.py b/horovod/tensorflow/keras/callbacks.py
index dcb2cb3e53..9f1f161165 100644
--- a/horovod/tensorflow/keras/callbacks.py
+++ b/horovod/tensorflow/keras/callbacks.py
@@ -82,12 +82,13 @@ class LearningRateScheduleCallback(_impl.LearningRateScheduleCallbackImpl, keras
     `initial_lr` is the learning rate of the model optimizer at the start of the training.
     """
 
-    def __init__(self, multiplier, start_epoch=0, end_epoch=None, staircase=True,
-                 momentum_correction=True, steps_per_epoch=None, initial_lr=None):
+    def __init__(self, initial_lr, multiplier, start_epoch=0, end_epoch=None, staircase=True,
+                 momentum_correction=True, steps_per_epoch=None):
         """
         Construct a new LearningRateScheduleCallback.
 
         Args:
+            initial_lr: Initial learning rate at the start of training.
             multiplier: A constant multiplier or a function `f(epoch) = lr'`
             start_epoch: The first epoch this adjustment will be applied to. Defaults to 0.
             end_epoch: The epoch this adjustment will stop applying (exclusive end).
@@ -99,14 +100,9 @@ def __init__(self, multiplier, start_epoch=0, end_epoch=None, staircase=True,
             steps_per_epoch: The callback will attempt to autodetect number of batches
                              per epoch with Keras >= 2.0.0. Provide this value if you have an older
                              version of Keras.
-            initial_lr: Initial learning rate at the start of training.
-
-                .. warning:: Will be required in v0.21.0.
-
         """
-        super(LearningRateScheduleCallback, self).__init__(K, multiplier, start_epoch, end_epoch,
-                                                           staircase, momentum_correction, steps_per_epoch,
-                                                           initial_lr)
+        super(LearningRateScheduleCallback, self).__init__(K, initial_lr, multiplier, start_epoch, end_epoch,
+                                                           staircase, momentum_correction, steps_per_epoch)
 
 
 class LearningRateWarmupCallback(_impl.LearningRateWarmupCallbackImpl, keras.callbacks.Callback):

From 68daa9fd33bde1b6790ffa743375ea49b084784a Mon Sep 17 00:00:00 2001
From: Travis Addair
Date: Tue, 17 Nov 2020 13:32:31 -0800
Subject: [PATCH 2/4] Required for LearningRateWarmupCallback

Signed-off-by: Travis Addair
---
 examples/keras/keras_imagenet_resnet50.py        | 15 +++++++++------
 examples/keras/keras_mnist_advanced.py           |  2 +-
 examples/ray/tensorflow2_mnist_ray.py            |  2 +-
 examples/spark/keras/keras_spark3_rossmann.py    |  2 +-
 examples/spark/keras/keras_spark_rossmann_run.py |  2 +-
 examples/tensorflow2/tensorflow2_keras_mnist.py  |  2 +-
 horovod/_keras/callbacks.py                      |  8 ++++----
 horovod/keras/callbacks.py                       | 12 +++++-------
 horovod/tensorflow/keras/callbacks.py            | 12 +++++-------
 9 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/examples/keras/keras_imagenet_resnet50.py b/examples/keras/keras_imagenet_resnet50.py
index c09d13eade..2a687f10e4 100644
--- a/examples/keras/keras_imagenet_resnet50.py
+++ b/examples/keras/keras_imagenet_resnet50.py
@@ -142,15 +142,18 @@
     # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
     # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
     # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
-    hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=args.warmup_epochs, initial_lr=initial_lr,
+    hvd.callbacks.LearningRateWarmupCallback(initial_lr=initial_lr,
+                                             warmup_epochs=args.warmup_epochs,
                                              verbose=verbose),
 
     # Horovod: after the warmup reduce learning rate by 10 on the 30th, 60th and 80th epochs.
-    hvd.callbacks.LearningRateScheduleCallback(start_epoch=args.warmup_epochs, end_epoch=30, multiplier=1.,
-                                               initial_lr=initial_lr),
-    hvd.callbacks.LearningRateScheduleCallback(start_epoch=30, end_epoch=60, multiplier=1e-1, initial_lr=initial_lr),
-    hvd.callbacks.LearningRateScheduleCallback(start_epoch=60, end_epoch=80, multiplier=1e-2, initial_lr=initial_lr),
-    hvd.callbacks.LearningRateScheduleCallback(start_epoch=80, multiplier=1e-3, initial_lr=initial_lr),
+    hvd.callbacks.LearningRateScheduleCallback(initial_lr=initial_lr,
+                                               multiplier=1.,
+                                               start_epoch=args.warmup_epochs,
+                                               end_epoch=30),
+    hvd.callbacks.LearningRateScheduleCallback(initial_lr=initial_lr, multiplier=1e-1, start_epoch=30, end_epoch=60),
+    hvd.callbacks.LearningRateScheduleCallback(initial_lr=initial_lr, multiplier=1e-2, start_epoch=60, end_epoch=80),
+    hvd.callbacks.LearningRateScheduleCallback(initial_lr=initial_lr, multiplier=1e-3, start_epoch=80),
 ]
 
 # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
diff --git a/examples/keras/keras_mnist_advanced.py b/examples/keras/keras_mnist_advanced.py
index de3fa86aee..3685300d1c 100644
--- a/examples/keras/keras_mnist_advanced.py
+++ b/examples/keras/keras_mnist_advanced.py
@@ -106,7 +106,7 @@
     # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
     # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
     # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
-    hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, initial_lr=scaled_lr, verbose=1),
+    hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=5, verbose=1),
 
     # Reduce the learning rate if training plateaues.
     keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1),
diff --git a/examples/ray/tensorflow2_mnist_ray.py b/examples/ray/tensorflow2_mnist_ray.py
index 10c5a5ad2a..4ce279f2dc 100644
--- a/examples/ray/tensorflow2_mnist_ray.py
+++ b/examples/ray/tensorflow2_mnist_ray.py
@@ -73,7 +73,7 @@ def train(num_epochs):
         # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
         # the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
         hvd.callbacks.LearningRateWarmupCallback(
-            warmup_epochs=3, initial_lr=scaled_lr, verbose=1),
+            initial_lr=scaled_lr, warmup_epochs=3, verbose=1),
     ]
 
     # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
diff --git a/examples/spark/keras/keras_spark3_rossmann.py b/examples/spark/keras/keras_spark3_rossmann.py
index 1b5aa86427..d21967f0e2 100644
--- a/examples/spark/keras/keras_spark3_rossmann.py
+++ b/examples/spark/keras/keras_spark3_rossmann.py
@@ -446,7 +446,7 @@ def train_fn(model_bytes):
         # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
         # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
         # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
-        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, initial_lr=scaled_lr, verbose=verbose),
+        hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=5, verbose=verbose),
 
         # Reduce LR if the metric is not improved for 10 epochs, and stop training
         # if it has not improved for 20 epochs.
diff --git a/examples/spark/keras/keras_spark_rossmann_run.py b/examples/spark/keras/keras_spark_rossmann_run.py
index 022fd9053a..2c58bb7753 100644
--- a/examples/spark/keras/keras_spark_rossmann_run.py
+++ b/examples/spark/keras/keras_spark_rossmann_run.py
@@ -435,7 +435,7 @@ def train_fn(model_bytes):
         # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
         # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
         # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
-        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, initial_lr=scaled_lr, verbose=verbose),
+        hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=5, verbose=verbose),
 
         # Reduce LR if the metric is not improved for 10 epochs, and stop training
         # if it has not improved for 20 epochs.
diff --git a/examples/tensorflow2/tensorflow2_keras_mnist.py b/examples/tensorflow2/tensorflow2_keras_mnist.py
index 8ae1ca60b5..4872dce4f8 100644
--- a/examples/tensorflow2/tensorflow2_keras_mnist.py
+++ b/examples/tensorflow2/tensorflow2_keras_mnist.py
@@ -76,7 +76,7 @@
     # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
     # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
    # the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
-    hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=3, initial_lr=scaled_lr, verbose=1),
+    hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=3, verbose=1),
 ]
 
 # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
diff --git a/horovod/_keras/callbacks.py b/horovod/_keras/callbacks.py
index fce39d944a..9ab31b3aeb 100644
--- a/horovod/_keras/callbacks.py
+++ b/horovod/_keras/callbacks.py
@@ -170,16 +170,16 @@ def on_epoch_end(self, epoch, logs=None):
 
 
 class LearningRateWarmupCallbackImpl(LearningRateScheduleCallbackImpl):
-    def __init__(self, backend, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
-                 verbose=0, initial_lr=None, *args):
+    def __init__(self, backend, initial_lr, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
+                 verbose=0, *args):
         def multiplier(epoch):
             # Adjust epoch to produce round numbers at the end of each epoch, so that TensorBoard
             # learning rate graphs look better.
             epoch += 1. / self.steps_per_epoch
             return 1. / hvd.size() * (epoch * (hvd.size() - 1) / warmup_epochs + 1)
         super(LearningRateWarmupCallbackImpl, self).__init__(
-            backend, multiplier, start_epoch=0, end_epoch=warmup_epochs, staircase=False,
-            momentum_correction=momentum_correction, steps_per_epoch=steps_per_epoch, initial_lr=initial_lr,
+            backend, initial_lr, multiplier, start_epoch=0, end_epoch=warmup_epochs, staircase=False,
+            momentum_correction=momentum_correction, steps_per_epoch=steps_per_epoch,
             *args)
         self.verbose = verbose
diff --git a/horovod/keras/callbacks.py b/horovod/keras/callbacks.py
index 159f887f12..63b4321386 100644
--- a/horovod/keras/callbacks.py
+++ b/horovod/keras/callbacks.py
@@ -129,12 +129,13 @@ class LearningRateWarmupCallback(_impl.LearningRateWarmupCallbackImpl, keras.cal
         lr'(epoch = warmup) &= lr
     """
 
-    def __init__(self, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
-                 verbose=0, initial_lr=None):
+    def __init__(self, initial_lr, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
+                 verbose=0):
         """
         Construct a new LearningRateWarmupCallback that will gradually warm up the learning rate.
 
         Args:
+            initial_lr: Initial learning rate at the start of training.
             warmup_epochs: The number of epochs of the warmup phase. Defaults to 5.
             momentum_correction: Apply momentum correction to optimizers that have momentum.
                                  Defaults to True.
@@ -142,12 +143,9 @@ def __init__(self, warmup_epochs=5, momentum_correction=True, steps_per_epoch=No
                              epoch with Keras >= 2.0.0. Provide this value if you have an older
                              version of Keras.
             verbose: verbosity mode, 0 or 1.
-            initial_lr: Initial learning rate at the start of training.
-
-                .. warning:: Will be required in v0.21.0.
         """
-        super(LearningRateWarmupCallback, self).__init__(K, warmup_epochs, momentum_correction,
-                                                         steps_per_epoch, verbose, initial_lr)
+        super(LearningRateWarmupCallback, self).__init__(K, initial_lr, warmup_epochs, momentum_correction,
+                                                         steps_per_epoch, verbose)
 
 
 class BestModelCheckpoint(keras.callbacks.ModelCheckpoint):
diff --git a/horovod/tensorflow/keras/callbacks.py b/horovod/tensorflow/keras/callbacks.py
index 9f1f161165..b91f367088 100644
--- a/horovod/tensorflow/keras/callbacks.py
+++ b/horovod/tensorflow/keras/callbacks.py
@@ -129,12 +129,13 @@ class LearningRateWarmupCallback(_impl.LearningRateWarmupCallbackImpl, keras.cal
         lr'(epoch = warmup) &= lr
     """
 
-    def __init__(self, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
-                 verbose=0, initial_lr=None):
+    def __init__(self, initial_lr, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
+                 verbose=0):
         """
         Construct a new LearningRateWarmupCallback that will gradually warm up the learning rate.
 
         Args:
+            initial_lr: Initial learning rate at the start of training.
             warmup_epochs: The number of epochs of the warmup phase. Defaults to 5.
             momentum_correction: Apply momentum correction to optimizers that have momentum.
                                  Defaults to True.
@@ -142,12 +143,9 @@ def __init__(self, warmup_epochs=5, momentum_correction=True, steps_per_epoch=No
                              epoch with Keras >= 2.0.0. Provide this value if you have an older
                              version of Keras.
             verbose: verbosity mode, 0 or 1.
-            initial_lr: Initial learning rate at the start of training.
-
-                .. warning:: Will be required in v0.21.0.
""" - super(LearningRateWarmupCallback, self).__init__(K, warmup_epochs, momentum_correction, - steps_per_epoch, verbose, initial_lr) + super(LearningRateWarmupCallback, self).__init__(K, initial_lr, warmup_epochs, momentum_correction, + steps_per_epoch, verbose) class BestModelCheckpoint(keras.callbacks.ModelCheckpoint): From a0894ef7e36c62495b298f04cb0ba68ba6d019d7 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Fri, 20 Nov 2020 07:42:58 -0800 Subject: [PATCH 3/4] Updated changelog Signed-off-by: Travis Addair --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c8f6f472a..7c95581dfd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Changed +- Require parameter `initial_lr` of Keras callbacks `LearningRateScheduleCallback` and `LearningRateWarmupCallback` ([#2459](https://github.com/horovod/horovod/pull/2459)) + - Changed default cycle time from 5ms to 1ms and fusion threshold from 64MB to 128MB. ([#2468](https://github.com/horovod/horovod/pull/2468)) ### Deprecated From 881c1bd086edcb81777a31fe11be0ddd7b0bc900 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Fri, 20 Nov 2020 07:44:25 -0800 Subject: [PATCH 4/4] Fixed wording Signed-off-by: Travis Addair --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7c95581dfd..dc6606d39c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,7 +18,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Changed -- Require parameter `initial_lr` of Keras callbacks `LearningRateScheduleCallback` and `LearningRateWarmupCallback` ([#2459](https://github.com/horovod/horovod/pull/2459)) +- Changed Keras callbacks to require parameter `initial_lr` of `LearningRateScheduleCallback` and `LearningRateWarmupCallback`. ([#2459](https://github.com/horovod/horovod/pull/2459)) - Changed default cycle time from 5ms to 1ms and fusion threshold from 64MB to 128MB. ([#2468](https://github.com/horovod/horovod/pull/2468))