diff --git a/docs/autogen.py b/docs/autogen.py
index f4053d7ed..766648e08 100644
--- a/docs/autogen.py
+++ b/docs/autogen.py
@@ -16,6 +16,7 @@
 
 import rl
 import rl.core
+import rl.processors
 import rl.agents
 
 
@@ -28,6 +29,10 @@
         'page': 'core.md',
         'all_module_classes': [rl.core],
     },
+    {
+        'page': 'processors.md',
+        'all_module_classes': [rl.processors],
+    },
     {
         'page': 'agents/overview.md',
         'functions': [
diff --git a/docs/sources/core.md b/docs/sources/core.md
index 9118a96bf..4d007bcd4 100644
--- a/docs/sources/core.md
+++ b/docs/sources/core.md
@@ -50,33 +50,7 @@ or write your own.
 
 ----
 
-[[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L529)
-### MultiInputProcessor
-
-```python
-rl.core.MultiInputProcessor(nb_inputs)
-```
-
-Converts observations from an environment with multiple observations for use in a neural network
-policy.
-
-In some cases, you have environments that return multiple different observations per timestep
-(in a robotics context, for example, a camera may be used to view the scene and a joint encoder may
-be used to report the angles for each joint). Usually, this can be handled by a policy that has
-multiple inputs, one for each modality. However, observations are returned by the environment
-in the form of a tuple `[(modality1_t, modality2_t, ..., modalityn_t) for t in T]` but the neural network
-expects them in per-modality batches like so: `[[modality1_1, ..., modality1_T], ..., [[modalityn_1, ..., modalityn_T]]`.
-This processor converts observations appropriate for this use case.
-
-__Arguments__
-
-- __nb_inputs__ (integer): The number of inputs, that is different modalities, to be used.
-    Your neural network that you use for the policy must have a corresponding number of
-    inputs.
-
-----
-
-[[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L566)
+[[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L533)
 ### Env
 
 ```python
@@ -90,7 +64,7 @@ implementation.
 
 ----
 
-[[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L642)
+[[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L609)
 ### Space
 
 ```python
diff --git a/docs/sources/processors.md b/docs/sources/processors.md
new file mode 100644
index 000000000..d2845d81e
--- /dev/null
+++ b/docs/sources/processors.md
@@ -0,0 +1,41 @@
+[[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/processors.py#L7)
+### MultiInputProcessor
+
+```python
+rl.processors.MultiInputProcessor(nb_inputs)
+```
+
+Converts observations from an environment with multiple observations for use in a neural network
+policy.
+
+In some cases, you have environments that return multiple different observations per timestep
+(in a robotics context, for example, a camera may be used to view the scene and a joint encoder may
+be used to report the angles for each joint). Usually, this can be handled by a policy that has
+multiple inputs, one for each modality. However, observations are returned by the environment
+in the form of a tuple `[(modality1_t, modality2_t, ..., modalityn_t) for t in T]` but the neural network
+expects them in per-modality batches like so: `[[modality1_1, ..., modality1_T], ..., [modalityn_1, ..., modalityn_T]]`.
+This processor converts observations into a format appropriate for this use case.
+
+__Arguments__
+
+- __nb_inputs__ (integer): The number of inputs, that is different modalities, to be used.
+    Your neural network that you use for the policy must have a corresponding number of
+    inputs.
+
+----
+
+[[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/processors.py#L40)
+### WhiteningNormalizerProcessor
+
+```python
+rl.processors.WhiteningNormalizerProcessor()
+```
+
+Normalizes the observations to have zero mean and standard deviation of one,
+i.e. it applies whitening to the inputs.
+
+This typically helps significantly with learning, especially if different dimensions are
+on different scales. However, it complicates training in the sense that you will have to store
+these weights alongside the policy if you intend to load it later. It is the responsibility of
+the user to do so.
+
diff --git a/docs/templates/processors.md b/docs/templates/processors.md
new file mode 100644
index 000000000..6bfff75d4
--- /dev/null
+++ b/docs/templates/processors.md
@@ -0,0 +1 @@
+{{autogenerated}}
diff --git a/examples/ddpg_mujoco.py b/examples/ddpg_mujoco.py
new file mode 100644
index 000000000..39344b9e7
--- /dev/null
+++ b/examples/ddpg_mujoco.py
@@ -0,0 +1,77 @@
+import numpy as np
+
+import gym
+from gym import wrappers
+
+from keras.models import Sequential, Model
+from keras.layers import Dense, Activation, Flatten, Input
+from keras.optimizers import Adam
+
+from rl.processors import WhiteningNormalizerProcessor
+from rl.agents import DDPGAgent
+from rl.memory import SequentialMemory
+from rl.random import OrnsteinUhlenbeckProcess
+from rl.keras_future import concatenate
+
+
+class MujocoProcessor(WhiteningNormalizerProcessor):
+    def process_action(self, action):
+        return np.clip(action, -1., 1.)
+
+
+ENV_NAME = 'HalfCheetah-v1'
+gym.undo_logger_setup()
+
+
+# Get the environment and extract the number of actions.
+env = gym.make(ENV_NAME)
+env = wrappers.Monitor(env, '/tmp/{}'.format(ENV_NAME), force=True)
+np.random.seed(123)
+env.seed(123)
+assert len(env.action_space.shape) == 1
+nb_actions = env.action_space.shape[0]
+
+# Next, we build a very simple model.
+actor = Sequential()
+actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
+actor.add(Dense(400))
+actor.add(Activation('relu'))
+actor.add(Dense(300))
+actor.add(Activation('relu'))
+actor.add(Dense(nb_actions))
+actor.add(Activation('tanh'))
+print(actor.summary())
+
+action_input = Input(shape=(nb_actions,), name='action_input')
+observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
+flattened_observation = Flatten()(observation_input)
+x = Dense(400)(flattened_observation)
+x = Activation('relu')(x)
+x = concatenate([x, action_input])
+x = Dense(300)(x)
+x = Activation('relu')(x)
+x = Dense(1)(x)
+x = Activation('linear')(x)
+critic = Model(input=[action_input, observation_input], output=x)
+print(critic.summary())
+
+# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
+# even the metrics!
+memory = SequentialMemory(limit=100000, window_length=1)
+random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.1)
+agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
+                  memory=memory, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000,
+                  random_process=random_process, gamma=.99, target_model_update=1e-3,
+                  processor=MujocoProcessor())
+agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=['mae'])
+
+# Okay, now it's time to learn something! We don't visualize the training here, since that
+# slows down training quite a lot. You can always safely abort the training prematurely using
+# Ctrl + C.
+agent.fit(env, nb_steps=1000000, visualize=False, verbose=1)
+
+# After training is done, we save the final weights.
+agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)
+
+# Finally, evaluate our algorithm for 5 episodes.
+agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)
diff --git a/rl/core.py b/rl/core.py
index 5b6d1e2f6..c04ab6bfb 100644
--- a/rl/core.py
+++ b/rl/core.py
@@ -526,39 +526,6 @@ def metrics_names(self):
         return []
 
 
-class MultiInputProcessor(Processor):
-    """Converts observations from an environment with multiple observations for use in a neural network
-    policy.
-
-    In some cases, you have environments that return multiple different observations per timestep
-    (in a robotics context, for example, a camera may be used to view the scene and a joint encoder may
-    be used to report the angles for each joint). Usually, this can be handled by a policy that has
-    multiple inputs, one for each modality. However, observations are returned by the environment
-    in the form of a tuple `[(modality1_t, modality2_t, ..., modalityn_t) for t in T]` but the neural network
-    expects them in per-modality batches like so: `[[modality1_1, ..., modality1_T], ..., [[modalityn_1, ..., modalityn_T]]`.
-    This processor converts observations appropriate for this use case.
-
-    # Arguments
-        nb_inputs (integer): The number of inputs, that is different modalities, to be used.
-            Your neural network that you use for the policy must have a corresponding number of
-            inputs.
-    """
-    def __init__(self, nb_inputs):
-        self.nb_inputs = nb_inputs
-
-    def process_state_batch(self, state_batch):
-        input_batches = [[] for x in range(self.nb_inputs)]
-        for state in state_batch:
-            processed_state = [[] for x in range(self.nb_inputs)]
-            for observation in state:
-                assert len(observation) == self.nb_inputs
-                for o, s in zip(observation, processed_state):
-                    s.append(o)
-            for idx, s in enumerate(processed_state):
-                input_batches[idx].append(s)
-        return [np.array(x) for x in input_batches]
-
-
 # Note: the API of the `Env` and `Space` classes are taken from the OpenAI Gym implementation.
 # https://github.com/openai/gym/blob/master/gym/core.py
diff --git a/rl/processors.py b/rl/processors.py
new file mode 100644
index 000000000..4b99735b4
--- /dev/null
+++ b/rl/processors.py
@@ -0,0 +1,57 @@
+import numpy as np
+
+from rl.core import Processor
+from rl.util import WhiteningNormalizer
+
+
+class MultiInputProcessor(Processor):
+    """Converts observations from an environment with multiple observations for use in a neural network
+    policy.
+
+    In some cases, you have environments that return multiple different observations per timestep
+    (in a robotics context, for example, a camera may be used to view the scene and a joint encoder may
+    be used to report the angles for each joint). Usually, this can be handled by a policy that has
+    multiple inputs, one for each modality. However, observations are returned by the environment
+    in the form of a tuple `[(modality1_t, modality2_t, ..., modalityn_t) for t in T]` but the neural network
+    expects them in per-modality batches like so: `[[modality1_1, ..., modality1_T], ..., [modalityn_1, ..., modalityn_T]]`.
+    This processor converts observations into a format appropriate for this use case.
+
+    # Arguments
+        nb_inputs (integer): The number of inputs, that is different modalities, to be used.
+            Your neural network that you use for the policy must have a corresponding number of
+            inputs.
+    """
+    def __init__(self, nb_inputs):
+        self.nb_inputs = nb_inputs
+
+    def process_state_batch(self, state_batch):
+        input_batches = [[] for x in range(self.nb_inputs)]
+        for state in state_batch:
+            processed_state = [[] for x in range(self.nb_inputs)]
+            for observation in state:
+                assert len(observation) == self.nb_inputs
+                for o, s in zip(observation, processed_state):
+                    s.append(o)
+            for idx, s in enumerate(processed_state):
+                input_batches[idx].append(s)
+        return [np.array(x) for x in input_batches]
+
+
+class WhiteningNormalizerProcessor(Processor):
+    """Normalizes the observations to have zero mean and standard deviation of one,
+    i.e. it applies whitening to the inputs.
+
+    This typically helps significantly with learning, especially if different dimensions are
+    on different scales. However, it complicates training in the sense that you will have to store
+    these weights alongside the policy if you intend to load it later. It is the responsibility of
+    the user to do so.
+    """
+    def __init__(self):
+        self.normalizer = None
+
+    def process_state_batch(self, batch):
+        if self.normalizer is None:
+            self.normalizer = WhiteningNormalizer(shape=batch.shape[1:], dtype=batch.dtype)
+        self.normalizer.update(batch)
+        return self.normalizer.normalize(batch)
+
diff --git a/rl/util.py b/rl/util.py
index 8319412a1..a1c991827 100644
--- a/rl/util.py
+++ b/rl/util.py
@@ -98,3 +98,36 @@ def get_updates(self, params, constraints, loss):
 
     def get_config(self):
         return self.optimizer.get_config()
+
+
+# Based on https://github.com/openai/baselines/blob/master/baselines/common/mpi_running_mean_std.py
+class WhiteningNormalizer(object):
+    def __init__(self, shape, eps=1e-2, dtype=np.float64):
+        self.eps = eps
+        self.shape = shape
+        self.dtype = dtype
+
+        self._sum = np.zeros(shape, dtype=dtype)
+        self._sumsq = np.zeros(shape, dtype=dtype)
+        self._count = 0
+
+        self.mean = np.zeros(shape, dtype=dtype)
+        self.std = np.ones(shape, dtype=dtype)
+
+    def normalize(self, x):
+        return (x - self.mean) / self.std
+
+    def denormalize(self, x):
+        return self.std * x + self.mean
+
+    def update(self, x):
+        if x.ndim == len(self.shape):
+            x = x.reshape(-1, *self.shape)
+        assert x.shape[1:] == self.shape
+
+        self._count += x.shape[0]
+        self._sum += np.sum(x, axis=0)
+        self._sumsq += np.sum(np.square(x), axis=0)
+
+        self.mean = self._sum / float(self._count)
+        self.std = np.sqrt(np.maximum(np.square(self.eps), self._sumsq / float(self._count) - np.square(self.mean)))
diff --git a/tests/rl/agents/test_cem.py b/tests/rl/agents/test_cem.py
index b71a49030..05add261d 100644
--- a/tests/rl/agents/test_cem.py
+++ b/tests/rl/agents/test_cem.py
@@ -10,7 +10,7 @@
 
 from rl.agents.cem import CEMAgent
 from rl.memory import EpisodeParameterMemory
-from rl.core import MultiInputProcessor
+from rl.processors import MultiInputProcessor
 from ..util import MultiInputTestEnv
 
 
diff --git a/tests/rl/agents/test_ddpg.py b/tests/rl/agents/test_ddpg.py
index 00840ab76..ef53e76e4 100644
--- a/tests/rl/agents/test_ddpg.py
+++ b/tests/rl/agents/test_ddpg.py
@@ -10,7 +10,7 @@
 
 from rl.agents.ddpg import DDPGAgent
 from rl.memory import SequentialMemory
-from rl.core import MultiInputProcessor
+from rl.processors import MultiInputProcessor
 from ..util import MultiInputTestEnv
 
 
diff --git a/tests/rl/agents/test_dqn.py b/tests/rl/agents/test_dqn.py
index e2858984d..bf82c9f6d 100644
--- a/tests/rl/agents/test_dqn.py
+++ b/tests/rl/agents/test_dqn.py
@@ -10,7 +10,7 @@
 
 from rl.agents.dqn import NAFLayer, DQNAgent, NAFAgent
 from rl.memory import SequentialMemory
-from rl.core import MultiInputProcessor
+from rl.processors import MultiInputProcessor
 from rl.keras_future import concatenate, Model
 from ..util import MultiInputTestEnv
 
diff --git a/tests/rl/test_util.py b/tests/rl/test_util.py
index 7229cae10..3990c166f 100644
--- a/tests/rl/test_util.py
+++ b/tests/rl/test_util.py
@@ -8,7 +8,7 @@
 from keras.optimizers import SGD
 import keras.backend as K
 
-from rl.util import clone_optimizer, clone_model, huber_loss
+from rl.util import clone_optimizer, clone_model, huber_loss, WhiteningNormalizer
 
 
 def test_clone_sequential_model():
@@ -68,5 +68,22 @@ def test_huber_loss():
     assert_allclose(K.eval(huber_loss(a, b, np.inf)), np.array([.125, .125, 2., 2.]))
 
 
+def test_whitening_normalizer():
+    x = np.random.normal(loc=.2, scale=2., size=(1000, 5))
+    normalizer = WhiteningNormalizer(shape=(5,))
+    normalizer.update(x[:500])
+    normalizer.update(x[500:])
+
+    assert_allclose(normalizer.mean, np.mean(x, axis=0))
+    assert_allclose(normalizer.std, np.std(x, axis=0))
+
+    x_norm = normalizer.normalize(x)
+    assert_allclose(np.mean(x_norm, axis=0), np.zeros(5, dtype=normalizer.dtype), atol=1e-5)
+    assert_allclose(np.std(x_norm, axis=0), np.ones(5, dtype=normalizer.dtype), atol=1e-5)
+
+    x_denorm = normalizer.denormalize(x_norm)
+    assert_allclose(x_denorm, x)
+
+
 if __name__ == '__main__':
     pytest.main([__file__])
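
To make the per-modality batching described in the `MultiInputProcessor` docstring concrete, here is a small usage sketch. The two-modality layout (a 16x16 "camera" image plus a 3-dimensional joint reading), the window length of 2, and the batch size of 4 are made-up values for illustration; only `MultiInputProcessor` itself comes from this change.

```python
import numpy as np

from rl.processors import MultiInputProcessor

# Hypothetical setup: every observation is a (camera, joints) tuple.
processor = MultiInputProcessor(nb_inputs=2)

# A batch of 4 states, each a window of 2 timesteps, each timestep a 2-modality tuple.
state_batch = [
    [(np.zeros((16, 16)), np.zeros(3)) for _ in range(2)]
    for _ in range(4)
]

# The processor regroups the batch into one array per modality.
camera_batch, joints_batch = processor.process_state_batch(state_batch)
print(camera_batch.shape)  # (4, 2, 16, 16) -> fed to the first input of the policy network
print(joints_batch.shape)  # (4, 2, 3)      -> fed to the second input
```

Each returned array lines up with one input of a multi-input Keras policy, which is the layout the docstring describes. `WhiteningNormalizerProcessor` needs no such setup: it is simply passed as the `processor=` argument of an agent, as `examples/ddpg_mujoco.py` above does via its `MujocoProcessor` subclass.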