In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from keras.models import Model
from keras.layers import concatenate, Dense, Input, Flatten
from keras.optimizers import Adam
import gym
from rl.agents import DDPGAgent
from rl.memory import SequentialMemory

Using TensorFlow backend.


In [2]:
# GymのPendulum環境を作成
env = gym.make("Pendulum-v0")

# 取りうる”打ち手”のアクション数と値の定義
nb_actions = 2
ACT_ID_TO_VALUE = {0: [-1], 1: [+1]}

print("Action Space: %s" % env.action_space)
#action  dim = 1
#critic dim = 3 with ??
print( env.observation_space.shape[0])

Action Space: Box(1,)
3


In [3]:
def actor_net(a_shape, s_shape):
    action_input = Input(shape=(1,)+s_shape)
    x = Flatten()(action_input)
    x = Dense(16, activation="relu")(x)
    x = Dense(16, activation="relu")(x)
    x = Dense(a_shape[0], activation="linear")(x)
    actor = Model(inputs=action_input, outputs=x)
    return actor

def critic_net(a_shape, s_shape):
    action_input = Input(a_shape)
    observation_input = Input(shape=(1,)+s_shape)
    flattened_observation = Flatten()(observation_input)
    x = concatenate([action_input, flattened_observation])
    x = Dense(32, activation="relu")(x)
    x = Dense(32, activation="relu")(x)
    x = Dense(1, activation="linear")(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)
    return (critic, action_input)

In [4]:
def agent(a_shape, s_shape):
    actor = actor_net(a_shape, s_shape)
    critic,  critic_action_input = critic_net(a_shape, s_shape)
    memory = SequentialMemory(limit = 50000, window_length = 1)
    agent = DDPGAgent(
        a_shape[0],
        actor,
        critic,
        critic_action_input,
        memory,
        target_model_update=.01
    )
    return agent

In [5]:
agent = agent(env.action_space.shape, env.observation_space.shape)
print(env.action_space.shape, env.observation_space.shape)
agent.compile(Adam(lr=0.001, clipnorm=1., decay = 0.001), metrics=["mse"])
agent.fit(env, nb_steps=50000, visualize=0, verbose=1, nb_max_episode_steps=200)
#agent.test(env, nb_episodes=5, visualize=0, nb_max_episode_steps=200)

W1010 19:57:27.034594 140735584400256 deprecation_wrapper.py:119] From /Users/admin/.pyenv/versions/3.6.6/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1010 19:57:27.054357 140735584400256 deprecation_wrapper.py:119] From /Users/admin/.pyenv/versions/3.6.6/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1010 19:57:27.071994 140735584400256 deprecation_wrapper.py:119] From /Users/admin/.pyenv/versions/3.6.6/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4158: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1010 19:57:27.194554 140735584400256 deprecation_wrapper.py:119] From /Users/admin/.pyenv/versions/3.6.6/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is de

(1,) (3,)


W1010 19:57:27.482129 140735584400256 deprecation_wrapper.py:119] From /Users/admin/.pyenv/versions/3.6.6/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



<keras.optimizers.Adam object at 0x12b186d68>
Training for 50000 steps ...
Interval 1 (0 steps performed)
50 episodes - episode_reward: -994.765 [-1687.718, -1.917] - loss: 69.893 - mean_squared_error: 139.786 - mean_q: -150.064

Interval 2 (10000 steps performed)
50 episodes - episode_reward: -258.381 [-1501.185, -0.793] - loss: 61.988 - mean_squared_error: 123.975 - mean_q: -142.281

Interval 3 (20000 steps performed)
50 episodes - episode_reward: -173.464 [-400.803, -1.116] - loss: 19.124 - mean_squared_error: 38.249 - mean_q: -47.912

Interval 4 (30000 steps performed)
50 episodes - episode_reward: -171.420 [-438.786, -1.266] - loss: 18.267 - mean_squared_error: 36.534 - mean_q: 28.752

Interval 5 (40000 steps performed)
done, took 246.138 seconds


<keras.callbacks.History at 0x12b857f28>

In [7]:
agent.test(env, nb_episodes=5, visualize=1, nb_max_episode_steps=200)

Testing for 5 episodes ...
Episode 1: reward: -745.911, steps: 200
Episode 2: reward: -734.510, steps: 200
Episode 3: reward: -124.128, steps: 200
Episode 4: reward: -648.875, steps: 200
Episode 5: reward: -765.605, steps: 200


<keras.callbacks.History at 0x12faac908>

In [6]:
agent.fit(env, nb_steps=30000, visualize=0, verbose=1, nb_max_episode_steps=200)

Training for 30000 steps ...
Interval 1 (0 steps performed)
50 episodes - episode_reward: -1498.042 [-1657.866, -945.195] - loss: 7465498288767733071872.000 - mean_squared_error: 14930996577535466143744.000 - mean_q: 9759891516.987

Interval 2 (10000 steps performed)
50 episodes - episode_reward: -1391.066 [-1657.149, -858.739] - loss: 389024682807153416732672.000 - mean_squared_error: 778049365614306833465344.000 - mean_q: 106456883200.000

Interval 3 (20000 steps performed)


<keras.callbacks.History at 0x12b027518>