Require initial_lr parameter to LearningRateScheduleCallback #2459

Merged: 4 commits, Nov 20, 2020
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -18,6 +18,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

### Changed

- Changed Keras callbacks to require parameter `initial_lr` of `LearningRateScheduleCallback` and `LearningRateWarmupCallback`. ([#2459](https://github.com/horovod/horovod/pull/2459))

- Changed default cycle time from 5ms to 1ms and fusion threshold from 64MB to 128MB. ([#2468](https://github.com/horovod/horovod/pull/2468))

### Deprecated
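For users migrating, a minimal sketch of the call pattern after this change; the base learning rate, warmup length, and schedule boundaries below are illustrative, not taken from this PR:

    import horovod.keras as hvd

    hvd.init()

    # Linear-scaling convention used in the Horovod examples: the per-worker
    # base LR is multiplied by the number of workers.
    scaled_lr = 0.001 * hvd.size()

    callbacks = [
        # `initial_lr` is now required and comes first in both callbacks.
        hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=5, verbose=1),
        hvd.callbacks.LearningRateScheduleCallback(initial_lr=scaled_lr, multiplier=1e-1,
                                                   start_epoch=30, end_epoch=60),
    ]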
15 changes: 9 additions & 6 deletions examples/keras/keras_imagenet_resnet50.py
@@ -142,15 +142,18 @@
# Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
# accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
# the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
- hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=args.warmup_epochs, initial_lr=initial_lr,
+ hvd.callbacks.LearningRateWarmupCallback(initial_lr=initial_lr,
+ warmup_epochs=args.warmup_epochs,
verbose=verbose),

# Horovod: after the warmup reduce learning rate by 10 on the 30th, 60th and 80th epochs.
- hvd.callbacks.LearningRateScheduleCallback(start_epoch=args.warmup_epochs, end_epoch=30, multiplier=1.,
- initial_lr=initial_lr),
- hvd.callbacks.LearningRateScheduleCallback(start_epoch=30, end_epoch=60, multiplier=1e-1, initial_lr=initial_lr),
- hvd.callbacks.LearningRateScheduleCallback(start_epoch=60, end_epoch=80, multiplier=1e-2, initial_lr=initial_lr),
- hvd.callbacks.LearningRateScheduleCallback(start_epoch=80, multiplier=1e-3, initial_lr=initial_lr),
+ hvd.callbacks.LearningRateScheduleCallback(initial_lr=initial_lr,
+ multiplier=1.,
+ start_epoch=args.warmup_epochs,
+ end_epoch=30),
+ hvd.callbacks.LearningRateScheduleCallback(initial_lr=initial_lr, multiplier=1e-1, start_epoch=30, end_epoch=60),
+ hvd.callbacks.LearningRateScheduleCallback(initial_lr=initial_lr, multiplier=1e-2, start_epoch=60, end_epoch=80),
+ hvd.callbacks.LearningRateScheduleCallback(initial_lr=initial_lr, multiplier=1e-3, start_epoch=80),
]

# Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
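Taken together, the callbacks in this example implement a warmup phase followed by a step schedule; a sketch of the resulting learning rate, derived from the multipliers above:

    # Effective learning rate produced by the callbacks in this example:
    #   epochs [0, warmup_epochs): ramps from initial_lr / hvd.size() up to initial_lr
    #   epochs [warmup_epochs, 30): initial_lr * 1.0
    #   epochs [30, 60):            initial_lr * 1e-1
    #   epochs [60, 80):            initial_lr * 1e-2
    #   epochs [80, ...):           initial_lr * 1e-3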
2 changes: 1 addition & 1 deletion examples/keras/keras_mnist_advanced.py
@@ -106,7 +106,7 @@
# Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
# accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
# the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
- hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, initial_lr=scaled_lr, verbose=1),
+ hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=5, verbose=1),

# Reduce the learning rate if training plateaues.
keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1),
2 changes: 1 addition & 1 deletion examples/ray/tensorflow2_mnist_ray.py
@@ -73,7 +73,7 @@ def train(num_epochs):
# accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
# the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
hvd.callbacks.LearningRateWarmupCallback(
- warmup_epochs=3, initial_lr=scaled_lr, verbose=1),
+ initial_lr=scaled_lr, warmup_epochs=3, verbose=1),
]

# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
2 changes: 1 addition & 1 deletion examples/spark/keras/keras_spark3_rossmann.py
@@ -446,7 +446,7 @@ def train_fn(model_bytes):
# Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
# accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
# the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
- hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, initial_lr=scaled_lr, verbose=verbose),
+ hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=5, verbose=verbose),

# Reduce LR if the metric is not improved for 10 epochs, and stop training
# if it has not improved for 20 epochs.
2 changes: 1 addition & 1 deletion examples/spark/keras/keras_spark_rossmann_run.py
@@ -435,7 +435,7 @@ def train_fn(model_bytes):
# Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
# accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
# the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
- hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, initial_lr=scaled_lr, verbose=verbose),
+ hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=5, verbose=verbose),

# Reduce LR if the metric is not improved for 10 epochs, and stop training
# if it has not improved for 20 epochs.
2 changes: 1 addition & 1 deletion examples/tensorflow2/tensorflow2_keras_mnist.py
@@ -76,7 +76,7 @@
# Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
# accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
# the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
- hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=3, initial_lr=scaled_lr, verbose=1),
+ hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=3, verbose=1),
]

# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
14 changes: 7 additions & 7 deletions horovod/_keras/callbacks.py
@@ -87,8 +87,8 @@ def on_epoch_end(self, epoch, logs=None):


class LearningRateScheduleCallbackImpl(object):
- def __init__(self, backend, multiplier, start_epoch=0, end_epoch=None, staircase=True,
- momentum_correction=True, steps_per_epoch=None, initial_lr=None, *args):
+ def __init__(self, backend, initial_lr, multiplier, start_epoch=0, end_epoch=None, staircase=True,
+ momentum_correction=True, steps_per_epoch=None, *args):
super(LearningRateScheduleCallbackImpl, self).__init__(*args)
self.backend = backend
self.start_epoch = start_epoch
@@ -107,7 +107,7 @@ def __init__(self, backend, multiplier, start_epoch=0, end_epoch=None, staircase
self.multiplier = multiplier

if self.initial_lr is None:
- warnings.warn('Parameter `initial_lr` will be required in v0.21.0', DeprecationWarning)
+ raise ValueError('Parameter `initial_lr` is required')

def _autodetect_steps_per_epoch(self):
if self.params.get('steps'):
@@ -170,16 +170,16 @@ def on_epoch_end(self, epoch, logs=None):


class LearningRateWarmupCallbackImpl(LearningRateScheduleCallbackImpl):
- def __init__(self, backend, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
- verbose=0, initial_lr=None, *args):
+ def __init__(self, backend, initial_lr, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
+ verbose=0, *args):
def multiplier(epoch):
# Adjust epoch to produce round numbers at the end of each epoch, so that TensorBoard
# learning rate graphs look better.
epoch += 1. / self.steps_per_epoch
return 1. / hvd.size() * (epoch * (hvd.size() - 1) / warmup_epochs + 1)
super(LearningRateWarmupCallbackImpl, self).__init__(
- backend, multiplier, start_epoch=0, end_epoch=warmup_epochs, staircase=False,
- momentum_correction=momentum_correction, steps_per_epoch=steps_per_epoch, initial_lr=initial_lr,
+ backend, initial_lr, multiplier, start_epoch=0, end_epoch=warmup_epochs, staircase=False,
+ momentum_correction=momentum_correction, steps_per_epoch=steps_per_epoch,
*args)
self.verbose = verbose

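For context, the `multiplier` closure above interpolates linearly from 1/hvd.size() at the start of training to 1 at the end of warmup, so the learning rate ramps from initial_lr / hvd.size() up to initial_lr. A small self-contained check of that formula (the 1/steps_per_epoch rounding adjustment is dropped for simplicity, and the helper name is illustrative):

    def warmup_multiplier(epoch, size, warmup_epochs):
        # Same expression as in LearningRateWarmupCallbackImpl, minus the
        # per-step rounding term.
        return 1.0 / size * (epoch * (size - 1) / warmup_epochs + 1)

    size, warmup = 8, 5
    assert abs(warmup_multiplier(0, size, warmup) - 1.0 / size) < 1e-12   # lr'(0) == lr / size
    assert abs(warmup_multiplier(warmup, size, warmup) - 1.0) < 1e-12     # lr'(warmup) == lr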
26 changes: 10 additions & 16 deletions horovod/keras/callbacks.py
@@ -82,12 +82,13 @@ class LearningRateScheduleCallback(_impl.LearningRateScheduleCallbackImpl, keras
`initial_lr` is the learning rate of the model optimizer at the start of the training.
"""

- def __init__(self, multiplier, start_epoch=0, end_epoch=None, staircase=True,
- momentum_correction=True, steps_per_epoch=None, initial_lr=None):
+ def __init__(self, initial_lr, multiplier, start_epoch=0, end_epoch=None, staircase=True,
+ momentum_correction=True, steps_per_epoch=None):
"""
Construct a new LearningRateScheduleCallback.

Args:
+ initial_lr: Initial learning rate at the start of training.
multiplier: A constant multiplier or a function `f(epoch) = lr'`
start_epoch: The first epoch this adjustment will be applied to. Defaults to 0.
end_epoch: The epoch this adjustment will stop applying (exclusive end).
@@ -99,14 +100,9 @@ def __init__(self, multiplier, start_epoch=0, end_epoch=None, staircase=True,
steps_per_epoch: The callback will attempt to autodetect number of batches per
epoch with Keras >= 2.0.0. Provide this value if you have an older
version of Keras.
- initial_lr: Initial learning rate at the start of training.

- .. warning:: Will be required in v0.21.0.

"""
- super(LearningRateScheduleCallback, self).__init__(K, multiplier, start_epoch, end_epoch,
- staircase, momentum_correction, steps_per_epoch,
- initial_lr)
+ super(LearningRateScheduleCallback, self).__init__(K, initial_lr, multiplier, start_epoch, end_epoch,
+ staircase, momentum_correction, steps_per_epoch)


class LearningRateWarmupCallback(_impl.LearningRateWarmupCallbackImpl, keras.callbacks.Callback):
@@ -133,25 +129,23 @@ class LearningRateWarmupCallback(_impl.LearningRateWarmupCallbackImpl, keras.cal
lr'(epoch = warmup) &= lr
"""

- def __init__(self, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
- verbose=0, initial_lr=None):
+ def __init__(self, initial_lr, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
+ verbose=0):
"""
Construct a new LearningRateWarmupCallback that will gradually warm up the learning rate.

Args:
+ initial_lr: Initial learning rate at the start of training.
warmup_epochs: The number of epochs of the warmup phase. Defaults to 5.
momentum_correction: Apply momentum correction to optimizers that have momentum.
Defaults to True.
steps_per_epoch: The callback will attempt to autodetect number of batches per
epoch with Keras >= 2.0.0. Provide this value if you have an older
version of Keras.
verbose: verbosity mode, 0 or 1.
- initial_lr: Initial learning rate at the start of training.

- .. warning:: Will be required in v0.21.0.
"""
- super(LearningRateWarmupCallback, self).__init__(K, warmup_epochs, momentum_correction,
- steps_per_epoch, verbose, initial_lr)
+ super(LearningRateWarmupCallback, self).__init__(K, initial_lr, warmup_epochs, momentum_correction,
+ steps_per_epoch, verbose)


class BestModelCheckpoint(keras.callbacks.ModelCheckpoint):
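With the wrapper updated, a short sketch of the new behavior (the learning-rate value and epoch boundaries are illustrative):

    import horovod.keras as hvd

    hvd.init()

    # `initial_lr` is now the first, required parameter.
    cb = hvd.callbacks.LearningRateScheduleCallback(initial_lr=0.01, multiplier=1e-1,
                                                    start_epoch=30, end_epoch=60)

    # Passing None (the old implicit default) now fails fast with
    # ValueError('Parameter `initial_lr` is required'); omitting the argument
    # entirely raises the usual TypeError for a missing positional argument.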
26 changes: 10 additions & 16 deletions horovod/tensorflow/keras/callbacks.py
@@ -82,12 +82,13 @@ class LearningRateScheduleCallback(_impl.LearningRateScheduleCallbackImpl, keras
`initial_lr` is the learning rate of the model optimizer at the start of the training.
"""

- def __init__(self, multiplier, start_epoch=0, end_epoch=None, staircase=True,
- momentum_correction=True, steps_per_epoch=None, initial_lr=None):
+ def __init__(self, initial_lr, multiplier, start_epoch=0, end_epoch=None, staircase=True,
+ momentum_correction=True, steps_per_epoch=None):
"""
Construct a new LearningRateScheduleCallback.

Args:
+ initial_lr: Initial learning rate at the start of training.
multiplier: A constant multiplier or a function `f(epoch) = lr'`
start_epoch: The first epoch this adjustment will be applied to. Defaults to 0.
end_epoch: The epoch this adjustment will stop applying (exclusive end).
@@ -99,14 +100,9 @@ def __init__(self, multiplier, start_epoch=0, end_epoch=None, staircase=True,
steps_per_epoch: The callback will attempt to autodetect number of batches per
epoch with Keras >= 2.0.0. Provide this value if you have an older
version of Keras.
- initial_lr: Initial learning rate at the start of training.

- .. warning:: Will be required in v0.21.0.

"""
- super(LearningRateScheduleCallback, self).__init__(K, multiplier, start_epoch, end_epoch,
- staircase, momentum_correction, steps_per_epoch,
- initial_lr)
+ super(LearningRateScheduleCallback, self).__init__(K, initial_lr, multiplier, start_epoch, end_epoch,
+ staircase, momentum_correction, steps_per_epoch)


class LearningRateWarmupCallback(_impl.LearningRateWarmupCallbackImpl, keras.callbacks.Callback):
@@ -133,25 +129,23 @@ class LearningRateWarmupCallback(_impl.LearningRateWarmupCallbackImpl, keras.cal
lr'(epoch = warmup) &= lr
"""

- def __init__(self, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
- verbose=0, initial_lr=None):
+ def __init__(self, initial_lr, warmup_epochs=5, momentum_correction=True, steps_per_epoch=None,
+ verbose=0):
"""
Construct a new LearningRateWarmupCallback that will gradually warm up the learning rate.

Args:
+ initial_lr: Initial learning rate at the start of training.
warmup_epochs: The number of epochs of the warmup phase. Defaults to 5.
momentum_correction: Apply momentum correction to optimizers that have momentum.
Defaults to True.
steps_per_epoch: The callback will attempt to autodetect number of batches per
epoch with Keras >= 2.0.0. Provide this value if you have an older
version of Keras.
verbose: verbosity mode, 0 or 1.
- initial_lr: Initial learning rate at the start of training.

- .. warning:: Will be required in v0.21.0.
"""
- super(LearningRateWarmupCallback, self).__init__(K, warmup_epochs, momentum_correction,
- steps_per_epoch, verbose, initial_lr)
+ super(LearningRateWarmupCallback, self).__init__(K, initial_lr, warmup_epochs, momentum_correction,
+ steps_per_epoch, verbose)


class BestModelCheckpoint(keras.callbacks.ModelCheckpoint):
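The tf.keras wrapper mirrors the Keras one; only the import path differs. A brief sketch (values illustrative):

    import horovod.tensorflow.keras as hvd

    hvd.init()
    cb = hvd.callbacks.LearningRateWarmupCallback(initial_lr=0.001 * hvd.size(),
                                                  warmup_epochs=3, verbose=1)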