Add Boltzmann-Gumbel exploration policy and fix compatibility issue (#156)

* Add BoltzmannGumbelQPolicy

Based on https://arxiv.org/pdf/1705.10257.pdf.

* Improve BGE documentation, checks

* Fix compatibility issue with most recent keras

* Make BGE work nicely when testing

* Only allow BGE for training

* Include information on testing

* Fix math typo
evhub authored and matthiasplappert committed Nov 30, 2017
1 parent 5981750 commit 7b7e511
Showing 2 changed files with 60 additions and 2 deletions.
53 changes: 53 additions & 0 deletions rl/policy.py
@@ -119,3 +119,56 @@ def get_config(self):
        config['tau'] = self.tau
        config['clip'] = self.clip
        return config


class BoltzmannGumbelQPolicy(Policy):
    """Implements Boltzmann-Gumbel exploration (BGE) adapted for Q learning
    based on the paper Boltzmann Exploration Done Right
    (https://arxiv.org/pdf/1705.10257.pdf).
    BGE is invariant with respect to the mean of the rewards but not their
    variance. The parameter C, which defaults to 1, can be used to correct for
    this, and should be set to the least upper bound on the standard deviation
    of the rewards.
    BGE is only available for training, not testing. For testing purposes, you
    can achieve approximately the same result as BGE after training for N steps
    on K actions with parameter C by using the BoltzmannQPolicy and setting
    tau = C/sqrt(N/K)."""

    def __init__(self, C=1.0):
        assert C > 0, "BoltzmannGumbelQPolicy C parameter must be > 0, not " + repr(C)
        super(BoltzmannGumbelQPolicy, self).__init__()
        self.C = C
        self.action_counts = None

    def select_action(self, q_values):
        # We can't use BGE during testing, since we don't have access to the
        # action_counts at the end of training.
        assert self.agent.training, "BoltzmannGumbelQPolicy should only be used for training, not testing"

        assert q_values.ndim == 1, q_values.ndim
        q_values = q_values.astype('float64')

        # If we are starting training, we should reset the action_counts.
        # Otherwise, action_counts should already be initialized, since we
        # always do so when we begin training.
        if self.agent.step == 0:
            self.action_counts = np.ones(q_values.shape)
        assert self.action_counts is not None, self.agent.step
        assert self.action_counts.shape == q_values.shape, (self.action_counts.shape, q_values.shape)

        beta = self.C/np.sqrt(self.action_counts)
        Z = np.random.gumbel(size=q_values.shape)

        perturbation = beta * Z
        perturbed_q_values = q_values + perturbation
        action = np.argmax(perturbed_q_values)

        self.action_counts[action] += 1
        return action

    def get_config(self):
        config = super(BoltzmannGumbelQPolicy, self).get_config()
        config['C'] = self.C
        return config
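
For context, a minimal usage sketch (not part of this commit) of the training/testing split described in the docstring above. It assumes keras-rl's existing BoltzmannQPolicy and an agent that accepts separate policy and test_policy arguments; the values of C, N, and K below are placeholders:

import numpy as np
from rl.policy import BoltzmannGumbelQPolicy, BoltzmannQPolicy

# Train with BGE. C should be an upper bound on the standard deviation
# of the rewards (assumed to be 1.0 here).
C = 1.0
train_policy = BoltzmannGumbelQPolicy(C=C)

# BGE cannot be used at test time. As the docstring notes, training for
# N steps over K actions is approximated at test time by a Boltzmann
# policy with tau = C / sqrt(N / K). N and K here are placeholder values.
N, K = 50000, 4
test_policy = BoltzmannQPolicy(tau=C / np.sqrt(N / K))

# These would then be passed to an agent, e.g.
# DQNAgent(..., policy=train_policy, test_policy=test_policy).
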
9 changes: 7 additions & 2 deletions rl/util.py
Expand Up @@ -48,7 +48,7 @@ def get_soft_target_model_updates(target, source, tau):
def get_object_config(o):
    if o is None:
        return None

    config = {
        'class_name': o.__class__.__name__,
        'config': o.get_config()
@@ -90,7 +90,12 @@ def __init__(self, optimizer, additional_updates):
         self.optimizer = optimizer
         self.additional_updates = additional_updates

-    def get_updates(self, params, constraints, loss):
+    # Keras sometimes passes params and loss as keyword arguments,
+    # expecting constraints to be optional, so there must be a default
+    # value for constraints here; see for example:
+    # https://github.com/fchollet/keras/blob/master/keras/engine/training.py#L988-L990
+    def get_updates(self, params, constraints=None, loss=None):
+        assert loss is not None, (params, constraints, loss)
         updates = self.optimizer.get_updates(params, constraints, loss)
         updates += self.additional_updates
         self.updates = updates
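
To illustrate the call-signature issue these defaults address, here is a small self-contained sketch (not from the repository; StubOptimizer and AdditionalUpdatesOptimizerSketch are hypothetical stand-ins) showing the two call styles the wrapper now has to accept:

class StubOptimizer(object):
    """Stand-in for a Keras optimizer with the old get_updates signature."""
    def get_updates(self, params, constraints, loss):
        return ['base_update']

class AdditionalUpdatesOptimizerSketch(object):
    """Simplified wrapper mirroring the default-argument fix above."""
    def __init__(self, optimizer, additional_updates):
        self.optimizer = optimizer
        self.additional_updates = additional_updates

    def get_updates(self, params, constraints=None, loss=None):
        assert loss is not None, (params, constraints, loss)
        updates = self.optimizer.get_updates(params, constraints, loss)
        updates += self.additional_updates
        self.updates = updates
        return self.updates

opt = AdditionalUpdatesOptimizerSketch(StubOptimizer(), ['extra_update'])
# Older Keras versions call get_updates positionally, with constraints:
print(opt.get_updates(['w'], {}, 'loss'))
# Newer Keras versions pass params and loss as keyword arguments and omit
# constraints, which only works because constraints has a default value:
print(opt.get_updates(params=['w'], loss='loss'))

Without the constraints=None default, the keyword-style call would raise a TypeError for the missing positional argument.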
