From be2dead729122aa728c81b53f395f93d2411c02d Mon Sep 17 00:00:00 2001
From: Valentin Bisson
Date: Tue, 10 Jul 2012 16:16:08 -0400
Subject: [PATCH 1/4] added 'adagrad' adaptive learning rate scheme to
 SGDOptimizer.

---
 pylearn2/optimizer.py | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/pylearn2/optimizer.py b/pylearn2/optimizer.py
index e4d8e5b088..1126066f56 100644
--- a/pylearn2/optimizer.py
+++ b/pylearn2/optimizer.py
@@ -3,6 +3,7 @@
 import sys
 
 # Third-party imports
+import numpy
 from numpy import inf
 import theano
 from theano import tensor
@@ -19,7 +20,8 @@ class SGDOptimizer(Optimizer):
     Supports constant learning rates, or decreasing like 1/t after an initial
     period.
     """
-    def __init__(self, params, base_lr, anneal_start=None, **kwargs):
+    def __init__(self, params, base_lr, anneal_start=None, use_adagrad=False,
+                 ** kwargs):
         """
         Construct an SGDOptimizer.
 
@@ -60,6 +62,16 @@ def __init__(self, params, base_lr, anneal_start=None, **kwargs):
         else:
             self.anneal_start = as_floatX(anneal_start)
 
+        # Create accumulators and epsilon0's
+        self.use_adagrad = use_adagrad
+        if self.use_adagrad:
+            self.accumulators = {}
+            self.e0s = {}
+            for param in self.params:
+                self.accumulators[param] = theano.shared(value=as_floatX(0),
+                                                         name='acc_%s' % param.name)
+                self.e0s[param] = as_floatX(base_lr)
+
         # Set up the clipping values
         self.clipping_values = {}
         # Keep track of names already seen
@@ -206,8 +218,16 @@ def updates(self, gradients):
         l_ups, learn_rates = self.learning_rate_updates()
         safe_update(ups, l_ups)
 
-        # Get the updates from sgd_updates
-        p_up = dict(self.sgd_updates(self.params, gradients, learn_rates))
+        if self.use_adagrad:
+            p_up = {}
+            for param, gp in zip(self.params, gradients):
+                acc = self.accumulators[param]
+                p_up[acc] = acc + (gp ** 2).sum()
+                adagrad = self.e0s[param] / (p_up[acc] ** .5)
+                p_up[param] = param - adagrad * gp
+        else:
+            # Get the updates from sgd_updates, a PyLearn library function.
+            p_up = dict(sgd_updates(self.params, gradients, learn_rates))
 
         # Add the things in p_up to ups
         safe_update(ups, p_up)

From c3d2c217c5bf3a78a1dc42645f3a865cd37cd4cd Mon Sep 17 00:00:00 2001
From: Valentin Bisson
Date: Tue, 10 Jul 2012 16:29:24 -0400
Subject: [PATCH 2/4] Fixed wrong assumption on pylearn2.utils.as_floatX's
 behavior.

---
 pylearn2/optimizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pylearn2/optimizer.py b/pylearn2/optimizer.py
index 1126066f56..c42b943b72 100644
--- a/pylearn2/optimizer.py
+++ b/pylearn2/optimizer.py
@@ -68,7 +68,7 @@ def __init__(self, params, base_lr, anneal_start=None, use_adagrad=False,
             self.accumulators = {}
             self.e0s = {}
             for param in self.params:
-                self.accumulators[param] = theano.shared(value=as_floatX(0),
+                self.accumulators[param] = theano.shared(value=as_floatX(0.),
                                                          name='acc_%s' % param.name)
                 self.e0s[param] = as_floatX(base_lr)
 

From e91eaf29cbe51b215ab628791ffea809c567b0f8 Mon Sep 17 00:00:00 2001
From: Valentin Bisson
Date: Tue, 10 Jul 2012 16:31:43 -0400
Subject: [PATCH 3/4] pep8'ified the rest of the file.
---
 pylearn2/optimizer.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pylearn2/optimizer.py b/pylearn2/optimizer.py
index c42b943b72..136385c1ba 100644
--- a/pylearn2/optimizer.py
+++ b/pylearn2/optimizer.py
@@ -269,7 +269,6 @@ def cost_updates(self, cost):
         grads = [tensor.grad(cost, p) for p in self.params]
         return self.updates(gradients=grads)
 
-
     def sgd_updates(self, params, grads, stepsizes):
         """Return a list of (pairs) that can be used as updates in theano.function
         to implement stochastic gradient descent.
@@ -306,5 +305,5 @@ def sgd_momentum_updates(self, params, grads, stepsizes, momentum=0.9):
         updates = []
         for s, p, gp, m, h in zip(stepsizes, params, grads, momentum, headings):
             updates.append((p, p + s * h))
-            updates.append((h, m*h - (1.0-m)*gp))
+            updates.append((h, m * h - (1.0 - m) * gp))
         return updates

From ea61fd2251b626b203858aa1d5b5c5def6cebcef Mon Sep 17 00:00:00 2001
From: Valentin Bisson
Date: Tue, 10 Jul 2012 17:04:55 -0400
Subject: [PATCH 4/4] Wrote some doc, better integrated in the class.

---
 pylearn2/optimizer.py | 65 +++++++++++++++++++++++--------------------
 1 file changed, 35 insertions(+), 30 deletions(-)

diff --git a/pylearn2/optimizer.py b/pylearn2/optimizer.py
index 136385c1ba..d2d22b57fb 100644
--- a/pylearn2/optimizer.py
+++ b/pylearn2/optimizer.py
@@ -4,7 +4,6 @@
 
 # Third-party imports
 import numpy
-from numpy import inf
 import theano
 from theano import tensor
 
@@ -37,6 +36,9 @@ def __init__(self, params, base_lr, anneal_start=None, use_adagrad=False,
             Number of steps after which to start annealing the learning
             rate at a 1/t schedule, where t is the number of stochastic
             gradient updates.
+        use_adagrad : bool
+            If True, the 'adagrad' adaptive learning rate scheme is used;
+            base_lr is then used as e0.
 
         Notes
         -----
@@ -49,6 +51,10 @@ def __init__(self, params, base_lr, anneal_start=None, use_adagrad=False,
 
         Parameter-specific bounding values can be specified by passing
         keyword arguments <param>_clip, which should be a (min, max) pair.
+
+        Adagrad is recommended with sparse inputs. It normalizes the base
+        learning rate of a parameter theta_i by the accumulated 2-norm of its
+        gradient: e_{ti} = e0 / sqrt( sum_t (dL_t / dtheta_i)^2 )
         """
         if hasattr(params, '__iter__'):
             self.params = params
@@ -156,7 +162,7 @@ def learning_rates_setup(self, base_lr, **kwargs):
         # to lower the learning rate gradually after a certain amount of time.
         self.annealed = sharedX(base_lr, 'annealed')
 
-    def learning_rate_updates(self):
+    def learning_rate_updates(self, gradients):
         """
         Compute a dictionary of shared variable updates related to annealing
         the learning rate.
@@ -170,24 +176,31 @@ def learning_rate_updates(self):
 
         """
         ups = {}
 
-        # Annealing coefficient. Here we're using a formula of
-        # min(base_lr, anneal_start / (iteration + 1))
-        if self.anneal_start is None:
-            annealed = sharedX(self.base_lr)
+        if self.use_adagrad:
+            learn_rates = []
+            for param, gp in zip(self.params, gradients):
+                acc = self.accumulators[param]
+                ups[acc] = acc + (gp ** 2).sum()
+                learn_rates.append(self.e0s[param] / (ups[acc] ** .5))
         else:
-            frac = self.anneal_start / (self.iteration + 1.)
-            annealed = tensor.minimum(
-                as_floatX(frac),
-                self.base_lr  # maximum learning rate
-            )
-
-        # Update the shared variable for the annealed learning rate.
-        ups[self.annealed] = annealed
-        ups[self.iteration] = self.iteration + 1
-
-        # Calculate the learning rates for each parameter, in the order
-        # they appear in self.params
-        learn_rates = [annealed * self.learning_rates[p] for p in self.params]
+            # Annealing coefficient. Here we're using a formula of
+            # min(base_lr, anneal_start / (iteration + 1))
+            if self.anneal_start is None:
+                annealed = sharedX(self.base_lr)
+            else:
+                frac = self.anneal_start / (self.iteration + 1.)
+                annealed = tensor.minimum(
+                    as_floatX(frac),
+                    self.base_lr  # maximum learning rate
+                )
+
+            # Update the shared variable for the annealed learning rate.
+            ups[self.annealed] = annealed
+            ups[self.iteration] = self.iteration + 1
+
+            # Calculate the learning rates for each parameter, in the order
+            # they appear in self.params
+            learn_rates = [annealed * self.learning_rates[p] for p in self.params]
         return ups, learn_rates
 
@@ -215,19 +228,11 @@ def updates(self, gradients):
         """
         ups = {}
         # Add the learning rate/iteration updates
-        l_ups, learn_rates = self.learning_rate_updates()
+        l_ups, learn_rates = self.learning_rate_updates(gradients)
         safe_update(ups, l_ups)
 
-        if self.use_adagrad:
-            p_up = {}
-            for param, gp in zip(self.params, gradients):
-                acc = self.accumulators[param]
-                p_up[acc] = acc + (gp ** 2).sum()
-                adagrad = self.e0s[param] / (p_up[acc] ** .5)
-                p_up[param] = param - adagrad * gp
-        else:
-            # Get the updates from sgd_updates, a PyLearn library function.
-            p_up = dict(sgd_updates(self.params, gradients, learn_rates))
+        # Get the updates from sgd_updates, a PyLearn library function.
+        p_up = dict(self.sgd_updates(self.params, gradients, learn_rates))
 
         # Add the things in p_up to ups
         safe_update(ups, p_up)
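
For reference, the adagrad rule these patches add to SGDOptimizer keeps one scalar
accumulator per parameter (the running sum of squared gradient 2-norms) and divides
the base learning rate by its square root, e_{ti} = e0 / sqrt( sum_t ||dL_t/dtheta_i||^2 ).
A minimal numpy sketch of that rule follows; the helper name adagrad_updates and the
toy quadratic cost are illustrative only, not pylearn2 code.

import numpy as np


def adagrad_updates(params, grads, accumulators, e0=0.01):
    # One adagrad step, mirroring the SGDOptimizer branch in the patches:
    # each parameter has a single scalar accumulator, incremented by the
    # squared 2-norm of its gradient, and its step size is
    # e0 / sqrt(accumulator).
    new_params = []
    for i, (p, g) in enumerate(zip(params, grads)):
        accumulators[i] += float(np.sum(g ** 2))
        learn_rate = e0 / np.sqrt(accumulators[i])
        new_params.append(p - learn_rate * g)
    return new_params


# Toy usage on L(w) = 0.5 * ||w||^2, whose gradient is w itself.
params = [np.array([1.0, -2.0]), np.array([0.5])]
accumulators = [0.0 for _ in params]
for _ in range(5):
    grads = [p.copy() for p in params]
    params = adagrad_updates(params, grads, accumulators, e0=0.1)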
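
A hypothetical end-to-end wiring of the new use_adagrad flag is sketched below,
assuming the patched pylearn2.optimizer module and a working Theano install. The
data variables, shared parameters and cost (X, y, W, b) are made up for
illustration; only SGDOptimizer(..., use_adagrad=True) and cost_updates() come
from the patches above.

import numpy
import theano
from theano import tensor

from pylearn2.optimizer import SGDOptimizer

# Symbolic inputs and named shared parameters (names matter because the
# optimizer creates one 'acc_<name>' accumulator per parameter).
X = tensor.matrix('X')
y = tensor.vector('y')
W = theano.shared(numpy.zeros(5, dtype=theano.config.floatX), name='W')
b = theano.shared(numpy.asarray(0., dtype=theano.config.floatX), name='b')

# A simple mean squared error cost on a linear model.
cost = tensor.mean((tensor.dot(X, W) + b - y) ** 2)

# base_lr plays the role of e0 when use_adagrad=True.
optimizer = SGDOptimizer([W, b], base_lr=0.01, use_adagrad=True)
train = theano.function([X, y], cost, updates=optimizer.cost_updates(cost))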