From 35d80f7993a238e0fcffdd2d1f652a96be8c7103 Mon Sep 17 00:00:00 2001
From: Ilya Kulikov
Date: Thu, 19 May 2016 22:33:12 +0200
Subject: [PATCH 1/2] Nadam optimizer and test for it added

---
 keras/optimizers.py            | 77 ++++++++++++++++++++++++++++++++++
 tests/keras/test_optimizers.py |  6 ++-
 2 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/keras/optimizers.py b/keras/optimizers.py
index 05dbc2625a1f..f336beaf3739 100644
--- a/keras/optimizers.py
+++ b/keras/optimizers.py
@@ -415,6 +415,82 @@ def get_config(self):
         return dict(list(base_config.items()) + list(config.items()))
 
 
+class Nadam(Optimizer):
+    '''
+    Nesterov Adam optimizer: Adam ~ RMSProp + momentum, Nadam ~ RMSProp + NAG
+
+    Default parameters follow those provided in the paper.
+
+    # Arguments
+        lr: float >= 0. Learning rate.
+        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
+        epsilon: float >= 0. Fuzz factor.
+
+    # References
+        [1] Nadam report - http://cs229.stanford.edu/proj2015/054_report.pdf
+        [2] On the importance of initialization and momentum in deep learning -
+            http://www.cs.toronto.edu/~fritz/absps/momentum.pdf
+    '''
+    def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999,
+                 epsilon=1e-8, **kwargs):
+        super(Nadam, self).__init__(**kwargs)
+        self.__dict__.update(locals())
+        self.iterations = K.variable(0.)
+        self.m_schedule = K.variable(1.)
+        self.lr = K.variable(lr)
+        self.beta_1 = K.variable(beta_1)
+        self.beta_2 = K.variable(beta_2)
+
+    def get_updates(self, params, constraints, loss):
+        grads = self.get_gradients(loss, params)
+        self.updates = [ (self.iterations, self.iterations + 1) ]
+
+        t = self.iterations + 1
+
+        # Due to the recommendations in [2], i.e. warming momentum schedule
+        schedule_decay = 0.004  # Exactly given in [1] and [2]
+        momentum_cache_t = self.beta_1 * (1. - 0.5 * (K.pow(0.96, t * schedule_decay)))
+        momentum_cache_t_1 = self.beta_1 * (1. - 0.5 * (K.pow(0.96, (t + 1) * schedule_decay)))
+        m_schedule_new = self.m_schedule * momentum_cache_t
+        m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
+        self.updates.append((self.m_schedule, m_schedule_new))
+
+        ms = [ K.variable(np.zeros(K.get_value(p).shape)) for p in params ]
+        vs = [ K.variable(np.zeros(K.get_value(p).shape)) for p in params ]
+
+        self.weights = ms + vs
+
+        for p, g, m, v in zip(params, grads, ms, vs):
+            # the following equations given in [1]
+            g_prime = g / (1. - m_schedule_new)
+            m_t = self.beta_1 * m + (1. - self.beta_1) * g
+            m_t_prime = m_t / (1. - m_schedule_next)
+            v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(g)
+            v_t_prime = v_t / (1. - K.pow(self.beta_2, t))
+            m_t_bar = (1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime
+
+            self.updates.append((m, m_t))
+            self.updates.append((v, v_t))
+
+            p_t = p - self.lr * m_t_bar / (K.sqrt(v_t_prime) + self.epsilon)
+            new_p = p_t
+
+            # apply constraints
+            if p in constraints:
+                c = constraints[ p ]
+                new_p = c(new_p)
+            self.updates.append((p, new_p))
+        return self.updates
+
+    def get_config(self):
+        config = { 'lr': float(K.get_value(self.lr)),
+                   'beta_1': float(K.get_value(self.beta_1)),
+                   'beta_2': float(K.get_value(self.beta_2)),
+                   'epsilon': self.epsilon }
+        base_config = super(Nadam, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+
 # aliases
 sgd = SGD
 rmsprop = RMSprop
@@ -422,6 +498,7 @@ def get_config(self):
 adadelta = Adadelta
 adam = Adam
 adamax = Adamax
+nadam = Nadam
 
 
 def get(identifier, kwargs=None):
diff --git a/tests/keras/test_optimizers.py b/tests/keras/test_optimizers.py
index 36e8a5a6702e..fffd5cc86e2a 100644
--- a/tests/keras/test_optimizers.py
+++ b/tests/keras/test_optimizers.py
@@ -2,7 +2,7 @@
 import pytest
 
 from keras.utils.test_utils import get_test_data
-from keras.optimizers import SGD, RMSprop, Adagrad, Adadelta, Adam, Adamax
+from keras.optimizers import SGD, RMSprop, Adagrad, Adadelta, Adam, Adamax, Nadam
 from keras.models import Sequential
 from keras.layers.core import Dense, Activation
 from keras.utils.np_utils import to_categorical
@@ -63,5 +63,9 @@ def test_adamax():
     _test_optimizer(Adamax())
 
 
+def test_nadam():
+    _test_optimizer(Nadam())
+
+
 if __name__ == '__main__':
     pytest.main([__file__])

From d0341fc52a635fb01885d70979f58553473f2537 Mon Sep 17 00:00:00 2001
From: Ilya Kulikov
Date: Thu, 19 May 2016 23:23:10 +0200
Subject: [PATCH 2/2] pep8 fix

---
 keras/optimizers.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/keras/optimizers.py b/keras/optimizers.py
index f336beaf3739..112184b90e1e 100644
--- a/keras/optimizers.py
+++ b/keras/optimizers.py
@@ -443,7 +443,7 @@ def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999,
 
     def get_updates(self, params, constraints, loss):
         grads = self.get_gradients(loss, params)
-        self.updates = [ (self.iterations, self.iterations + 1) ]
+        self.updates = [(self.iterations, self.iterations + 1)]
 
         t = self.iterations + 1
 
@@ -455,8 +455,8 @@ def get_updates(self, params, constraints, loss):
         m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
         self.updates.append((self.m_schedule, m_schedule_new))
 
-        ms = [ K.variable(np.zeros(K.get_value(p).shape)) for p in params ]
-        vs = [ K.variable(np.zeros(K.get_value(p).shape)) for p in params ]
+        ms = [K.variable(np.zeros(K.get_value(p).shape)) for p in params]
+        vs = [K.variable(np.zeros(K.get_value(p).shape)) for p in params]
 
         self.weights = ms + vs
 
@@ -477,16 +477,16 @@ def get_updates(self, params, constraints, loss):
 
             # apply constraints
            if p in constraints:
-                c = constraints[ p ]
+                c = constraints[p]
                 new_p = c(new_p)
             self.updates.append((p, new_p))
         return self.updates
 
     def get_config(self):
-        config = { 'lr': float(K.get_value(self.lr)),
-                   'beta_1': float(K.get_value(self.beta_1)),
-                   'beta_2': float(K.get_value(self.beta_2)),
-                   'epsilon': self.epsilon }
+        config = {'lr': float(K.get_value(self.lr)),
+                  'beta_1': float(K.get_value(self.beta_1)),
+                  'beta_2': float(K.get_value(self.beta_2)),
+                  'epsilon': self.epsilon}
         base_config = super(Nadam, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))
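
Usage note (not part of the patches above): a minimal sketch of how the new optimizer could be exercised through the Keras 1.x Sequential API already used in tests/keras/test_optimizers.py. The toy data shapes, layer sizes, and training settings below are illustrative assumptions, not taken from this PR.

    import numpy as np

    from keras.models import Sequential
    from keras.layers.core import Dense, Activation
    from keras.optimizers import Nadam
    from keras.utils.np_utils import to_categorical

    # Toy data: 100 samples, 10 features, 2 classes (assumed shapes).
    X = np.random.random((100, 10))
    y = to_categorical(np.random.randint(2, size=(100,)), nb_classes=2)

    model = Sequential()
    model.add(Dense(16, input_dim=10))
    model.add(Activation('relu'))
    model.add(Dense(2))
    model.add(Activation('softmax'))

    # Defaults mirror the signature added in PATCH 1/2:
    # lr=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-8.
    model.compile(loss='categorical_crossentropy', optimizer=Nadam())
    model.fit(X, y, nb_epoch=2, batch_size=16, verbose=0)

Because PATCH 1/2 also registers the lowercase alias (nadam = Nadam), passing the string 'nadam' as the optimizer argument should resolve to the same class with its default parameters.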