In [1]:
import numpy as np
import gym

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Input, Concatenate
from keras.optimizers import Adam

from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess

# Identifier of the gym environment used throughout the notebook.
ENV_NAME = 'CoolingFin-v0'
# Single seed shared by NumPy's global RNG and the environment.
SEED = 123

# NOTE(review): presumably stops gym from reconfiguring the logging setup --
# confirm against the installed gym version.
gym.undo_logger_setup()

# Create the environment first, then seed NumPy and the environment (in that order).
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)

# The networks below need a one-dimensional action space; its only dimension
# is the number of action components.
action_shape = env.action_space.shape
assert len(action_shape) == 1
nb_actions = action_shape[0]

# Actor network: flattens a window of one observation and maps it through three
# 16-unit ReLU layers to one linear output per action component.
# Layers are listed in the same order the original added them one by one.
actor = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])
# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line.
actor.summary()

# Critic network: takes an action and a window of one observation, concatenates
# them, and maps the result through three 32-unit ReLU layers to a single linear
# output (the value logged as mean_q during training).
action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
flattened_observation = Flatten()(observation_input)
x = Concatenate()([action_input, flattened_observation])
# Three identical hidden stages; each Dense is created and applied before its activation.
for hidden_units in (32, 32, 32):
    x = Dense(hidden_units)(x)
    x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line.
critic.summary()

# Memory handed to the agent: at most 100000 entries, one observation per state window.
memory = SequentialMemory(limit=100000, window_length=1)

# Ornstein-Uhlenbeck process passed to the agent as its random_process,
# with one component per action dimension.
random_process = OrnsteinUhlenbeckProcess(
    size=nb_actions,
    theta=0.15,
    mu=0.0,
    sigma=0.3,
)

# Assemble the DDPG agent from the actor/critic pair. critic_action_input tells
# the agent which of the critic's two inputs carries the action.
agent = DDPGAgent(
    nb_actions=nb_actions,
    actor=actor,
    critic=critic,
    critic_action_input=action_input,
    memory=memory,
    nb_steps_warmup_critic=100,
    nb_steps_warmup_actor=100,
    random_process=random_process,
    gamma=0.5,
    target_model_update=1e-3,
)

# Compile with Adam (gradient norm clipped to 1) and track mean absolute error.
agent.compile(Adam(lr=1e-4, clipnorm=1.0), metrics=['mae'])

Using TensorFlow backend.


WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.
WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 42)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                688       
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)            

In [2]:
# Train the agent for 5000 steps, capping each episode at 200 steps and logging
# every 500 steps. Rendering is switched off (visualize=False) because it slows
# training down considerably. Training can be aborted early with Ctrl + C.
agent.fit(
    env,
    nb_steps=5000,
    log_interval=500,
    visualize=False,
    verbose=1,
    nb_max_episode_steps=200,
)

# Persist the final weights, replacing any file left by an earlier run.
agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

Training for 5000 steps ...
Interval 1 (0 steps performed)
2 episodes - episode_reward: 8617706.312 [4217017.935, 13018394.690] - loss: 1100701516.204 - mean_absolute_error: 29220.968 - mean_q: 3122.647

Interval 2 (500 steps performed)
3 episodes - episode_reward: 17135817.345 [17132095.418, 17141907.054] - loss: 819714880.000 - mean_absolute_error: 29902.082 - mean_q: 49054.848

Interval 3 (1000 steps performed)
2 episodes - episode_reward: 16957943.521 [16783791.623, 17132095.418] - loss: 624456448.000 - mean_absolute_error: 24062.760 - mean_q: 73750.469

Interval 4 (1500 steps performed)
3 episodes - episode_reward: 16975154.862 [16941485.421, 17016546.901] - loss: 505610656.000 - mean_absolute_error: 14962.451 - mean_q: 98231.336

Interval 5 (2000 steps performed)
2 episodes - episode_reward: 14146590.485 [11344003.025, 16949177.944] - loss: 620141824.000 - mean_absolute_error: 14379.436 - mean_q: 120911.016

Interval 6 (2500 steps performed)
3 episodes - episode_reward: -53811.06

In [3]:
# Evaluate the trained agent for 5 episodes (at most 200 steps each) without rendering.
# The returned History object is bound to a name so the cell does not echo its repr,
# which embeds a memory address and therefore differs on every run.
test_history = agent.test(env, nb_episodes=5, visualize=False, nb_max_episode_steps=200)

Testing for 5 episodes ...
Episode 1: reward: 7845391.894, steps: 200
Episode 2: reward: 7845391.894, steps: 200
Episode 3: reward: 7845391.894, steps: 200
Episode 4: reward: 7845391.894, steps: 200
Episode 5: reward: 7845391.894, steps: 200


<keras.callbacks.History at 0x11300668>

In [4]:
# Show the critic's observation input tensor next to the environment's raw
# observation shape, one per line.
print(observation_input, env.observation_space.shape, sep='\n')

Tensor("observation_input:0", shape=(?, 1, 42), dtype=float32)
(42,)


In [5]:
# Show the number of action components the agent holds (passed in as
# nb_actions when the agent was constructed).
print(agent.nb_actions)

20
