added 'adagrad' adaptive learning rate scheme to SGDOptimizer. #97

Merged: 4 commits merged on Jul 12, 2012
pylearn2/optimizer.py: 72 changes (48 additions, 24 deletions)
@@ -3,7 +3,7 @@
import sys

# Third-party imports
-from numpy import inf
+import numpy
import theano
from theano import tensor

@@ -19,7 +19,8 @@ class SGDOptimizer(Optimizer):
Supports constant learning rates, or decreasing like 1/t after an initial
period.
"""
-    def __init__(self, params, base_lr, anneal_start=None, **kwargs):
+    def __init__(self, params, base_lr, anneal_start=None, use_adagrad=False,
+                 **kwargs):
"""
Construct an SGDOptimizer.

@@ -35,6 +36,9 @@ def __init__(self, params, base_lr, anneal_start=None, **kwargs):
Number of steps after which to start annealing the learning
rate at a 1/t schedule, where t is the number of stochastic
gradient updates.
+        use_adagrad : bool
+            If True, use the 'adagrad' adaptive learning rate scheme;
+            base_lr is then used as e0.

Notes
-----
@@ -47,6 +51,10 @@ def __init__(self, params, base_lr, anneal_start=None, **kwargs):

Parameter-specific bounding values can be specified by passing
keyword arguments <param>_clip, which should be a (min, max) pair.

+        Adagrad is recommended with sparse inputs. It normalizes the base
+        learning rate of a parameter theta_i by the accumulated 2-norm of its
+        gradient: e_{t,i} = e_0 / sqrt( sum_{s<=t} (dL_s / dtheta_i)^2 )
"""
if hasattr(params, '__iter__'):
self.params = params
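To make the schedule described in the docstring above concrete, here is a small plain-NumPy sketch (not part of the patch) of the adagrad scaling for a single scalar parameter; the gradient values are invented for the example.

import numpy

e0 = 0.1                                        # base_lr, reused as e0
grads = numpy.array([0.5, -1.0, 0.25, 2.0])     # made-up per-step gradients dL_s/dtheta_i

acc = numpy.cumsum(grads ** 2)                  # running sum of squared gradients
rates = e0 / numpy.sqrt(acc)                    # e_{t,i} = e0 / sqrt(accumulated squares)
print(rates)                                    # step sizes shrink as gradients accumulate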
@@ -60,6 +68,16 @@ def __init__(self, params, base_lr, anneal_start=None, **kwargs):
else:
self.anneal_start = as_floatX(anneal_start)

+        # Create accumulators and epsilon0's
+        self.use_adagrad = use_adagrad
+        if self.use_adagrad:
+            self.accumulators = {}
+            self.e0s = {}
+            for param in self.params:
+                self.accumulators[param] = theano.shared(value=as_floatX(0.),
+                                                         name='acc_%s' % param.name)
+                self.e0s[param] = as_floatX(base_lr)
+
# Set up the clipping values
self.clipping_values = {}
# Keep track of names already seen
@@ -144,7 +162,7 @@ def learning_rates_setup(self, base_lr, **kwargs):
# to lower the learning rate gradually after a certain amount of time.
self.annealed = sharedX(base_lr, 'annealed')

-    def learning_rate_updates(self):
+    def learning_rate_updates(self, gradients):
"""
Compute a dictionary of shared variable updates related to annealing
the learning rate.
@@ -158,24 +176,31 @@
"""
ups = {}

-        # Annealing coefficient. Here we're using a formula of
-        # min(base_lr, anneal_start / (iteration + 1))
-        if self.anneal_start is None:
-            annealed = sharedX(self.base_lr)
+        if self.use_adagrad:
+            learn_rates = []
+            for param, gp in zip(self.params, gradients):
+                acc = self.accumulators[param]
+                ups[acc] = acc + (gp ** 2).sum()
+                learn_rates.append(self.e0s[param] / (ups[acc] ** .5))
        else:
-            frac = self.anneal_start / (self.iteration + 1.)
-            annealed = tensor.minimum(
-                as_floatX(frac),
-                self.base_lr  # maximum learning rate
-            )
-
-        # Update the shared variable for the annealed learning rate.
-        ups[self.annealed] = annealed
-        ups[self.iteration] = self.iteration + 1
-
-        # Calculate the learning rates for each parameter, in the order
-        # they appear in self.params
-        learn_rates = [annealed * self.learning_rates[p] for p in self.params]
+            # Annealing coefficient. Here we're using a formula of
+            # min(base_lr, anneal_start / (iteration + 1))
+            if self.anneal_start is None:
+                annealed = sharedX(self.base_lr)
+            else:
+                frac = self.anneal_start / (self.iteration + 1.)
+                annealed = tensor.minimum(
+                    as_floatX(frac),
+                    self.base_lr  # maximum learning rate
+                )
+
+            # Update the shared variable for the annealed learning rate.
+            ups[self.annealed] = annealed
+            ups[self.iteration] = self.iteration + 1
+
+            # Calculate the learning rates for each parameter, in the order
+            # they appear in self.params
+            learn_rates = [annealed * self.learning_rates[p] for p in self.params]
return ups, learn_rates

def updates(self, gradients):
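A note for readers tracing the new use_adagrad branch above: the accumulator created in __init__ is a single scalar per parameter tensor, because the squared gradient is .sum()-ed, so each tensor shares one adaptive rate rather than getting an elementwise one. A rough plain-Python sketch of what the branch accumulates (made-up gradient values, no Theano):

import numpy

e0 = 0.1                                    # self.e0s[param], i.e. base_lr
acc = 0.0                                   # self.accumulators[param], starts at 0
for gp in [numpy.array([0.5, -1.0]), numpy.array([0.0, 2.0])]:
    acc = acc + (gp ** 2).sum()             # mirrors ups[acc] = acc + (gp ** 2).sum()
    learn_rate = e0 / (acc ** .5)           # mirrors e0s[param] / (ups[acc] ** .5)
    print(learn_rate)                       # one shared rate for the whole tensor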
@@ -203,10 +228,10 @@ def updates(self, gradients):
"""
ups = {}
# Add the learning rate/iteration updates
-        l_ups, learn_rates = self.learning_rate_updates()
+        l_ups, learn_rates = self.learning_rate_updates(gradients)
safe_update(ups, l_ups)

-        # Get the updates from sgd_updates
+        # Get the updates from sgd_updates, a PyLearn library function.
p_up = dict(self.sgd_updates(self.params, gradients, learn_rates))

# Add the things in p_up to ups
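A hypothetical end-to-end usage sketch (not part of the patch): compiling a Theano training function with the adagrad updates. The least-squares model here is invented purely to provide concrete parameters and a scalar cost; only the SGDOptimizer calls reflect the API shown in this diff, assuming the patched module is importable as pylearn2.optimizer.

import numpy
import theano
from theano import tensor

from pylearn2.optimizer import SGDOptimizer

# A made-up least-squares model, just to have a parameter and a scalar cost.
x = tensor.matrix('x')
y = tensor.vector('y')
w = theano.shared(numpy.zeros(3, dtype=theano.config.floatX), name='w')
cost = ((tensor.dot(x, w) - y) ** 2).mean()

optimizer = SGDOptimizer([w], base_lr=0.01, use_adagrad=True)
train = theano.function([x, y], cost, updates=optimizer.cost_updates(cost))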
@@ -249,7 +274,6 @@ def cost_updates(self, cost):
grads = [tensor.grad(cost, p) for p in self.params]
return self.updates(gradients=grads)


def sgd_updates(self, params, grads, stepsizes):
"""Return a list of (pairs) that can be used as updates in theano.function to
implement stochastic gradient descent.
@@ -286,5 +310,5 @@ def sgd_momentum_updates(self, params, grads, stepsizes, momentum=0.9):
updates = []
for s, p, gp, m, h in zip(stepsizes, params, grads, momentum, headings):
updates.append((p, p + s * h))
-            updates.append((h, m*h - (1.0-m)*gp))
+            updates.append((h, m * h - (1.0 - m) * gp))
return updates
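As a footnote on the reformatted momentum line: sgd_momentum_updates keeps a "heading" h per parameter, updates it as a leaky average of the negative gradient, and steps the parameter along it. A tiny numeric sketch with invented values (the two assignments below both read the old h, matching Theano's simultaneous update semantics):

m, s = 0.9, 0.1             # momentum and stepsize
h, p, gp = 0.0, 1.0, 2.0    # heading, parameter, gradient at this step

p = p + s * h                   # updates.append((p, p + s * h))
h = m * h - (1.0 - m) * gp      # updates.append((h, m * h - (1.0 - m) * gp))
print(p, h)                     # 1.0, -0.2: the heading moves against the gradient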