In [1]:
import gym
import keras
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from keras.layers import concatenate, Dense, Input, Flatten
from keras.models import Model
from keras.optimizers import Adam
from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess

Using TensorFlow backend.


In [2]:
# Discrete action bookkeeping: number of available "moves" and the value each id maps to.
ACT_ID_TO_VALUE = {0: [-1], 1: [1]}
nb_actions = 2

# Create the Gym Pendulum environment.
env = gym.make("Pendulum-v0")

print("Action Space: %s" % env.action_space)
# The action space is a 1-dimensional Box; the value printed next is the
# length of a single observation vector.
print(env.observation_space.shape[0])

Action Space: Box(1,)
3


In [3]:
def actor_net(a_shape, s_shape):
    """Build the actor network that maps an observation to an action vector.

    Args:
        a_shape: Action-space shape tuple; its first entry is the output width.
        s_shape: Observation-space shape tuple; the network input has shape
            ``(1,) + s_shape``.

    Returns:
        An uncompiled Keras ``Model`` with two 16-unit ReLU hidden layers and
        a linear output layer of ``a_shape[0]`` units.
    """
    # The leading 1 presumably corresponds to the memory's window_length=1 -- TODO confirm.
    state_in = Input(shape=(1,) + s_shape)
    hidden = Flatten()(state_in)
    for _ in range(2):
        hidden = Dense(16, activation="relu")(hidden)
    action_out = Dense(a_shape[0], activation="linear")(hidden)
    return Model(inputs=state_in, outputs=action_out)

In [4]:
def critic_net(a_shape, s_shape):
    """Build the critic network that scores an (action, observation) pair.

    Args:
        a_shape: Action-space shape tuple, used directly as the action input shape.
        s_shape: Observation-space shape tuple; the observation input has
            shape ``(1,) + s_shape``.

    Returns:
        A ``(critic, action_input)`` tuple: the uncompiled Keras ``Model`` and
        the action ``Input`` tensor it was built with.
    """
    act_in = Input(a_shape)
    obs_in = Input(shape=(1,) + s_shape)
    # Flatten the observation window so it can be joined with the action vector.
    hidden = concatenate([act_in, Flatten()(obs_in)])
    for _ in range(2):
        hidden = Dense(32, activation="relu")(hidden)
    q_value = Dense(1, activation="linear")(hidden)
    return (Model(inputs=[act_in, obs_in], outputs=q_value), act_in)


In [5]:
def agent(a_shape, s_shape, memory_limit=50000, nb_steps_warmup=100):
    """Assemble a DDPG agent from freshly built actor and critic networks.

    Args:
        a_shape: Action-space shape tuple; ``a_shape[0]`` is the action dimension.
        s_shape: Observation-space shape tuple.
        memory_limit: Maximum number of transitions kept in the replay memory.
        nb_steps_warmup: Steps collected before the actor and critic start
            training. Must be smaller than the ``nb_steps`` passed to ``fit``,
            otherwise no gradient update is ever performed.

    Returns:
        An uncompiled ``DDPGAgent``.
    """
    actor = actor_net(a_shape, s_shape)
    critic, critic_action_input = critic_net(a_shape, s_shape)
    action_dim = a_shape[0]
    memory = SequentialMemory(limit=memory_limit, window_length=1)
    # DDPG's policy is deterministic, so exploration has to come from noise
    # added to the actions; these values follow the keras-rl Pendulum example.
    exploration_noise = OrnsteinUhlenbeckProcess(
        size=action_dim, theta=0.15, mu=0.0, sigma=0.3
    )
    return DDPGAgent(
        nb_actions=action_dim,
        actor=actor,
        critic=critic,
        critic_action_input=critic_action_input,
        memory=memory,
        # The library default of 1000 warm-up steps exceeds short training
        # runs, which would leave both networks untrained.
        nb_steps_warmup_critic=nb_steps_warmup,
        nb_steps_warmup_actor=nb_steps_warmup,
        random_process=exploration_noise,
    )

In [6]:
# Bind the instance to its own name: assigning it back to `agent` would
# overwrite the factory function and make this cell fail when re-run.
ddpg_agent = agent(env.action_space.shape, env.observation_space.shape)
print(env.action_space.shape, env.observation_space.shape)
ddpg_agent.compile(Adam(lr=0.001, clipnorm=1.), metrics=["mae"])
# NOTE: the agent only starts updating its networks once the step count
# exceeds its warm-up setting, so nb_steps must be larger than that value.
ddpg_agent.fit(env, nb_steps=500, visualize=True, verbose=1, nb_max_episode_steps=200)
ddpg_agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)

W0922 17:41:23.932668 140736146887552 deprecation_wrapper.py:119] From /Users/admin/.pyenv/versions/3.6.6/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0922 17:41:23.950040 140736146887552 deprecation_wrapper.py:119] From /Users/admin/.pyenv/versions/3.6.6/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0922 17:41:23.964315 140736146887552 deprecation_wrapper.py:119] From /Users/admin/.pyenv/versions/3.6.6/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4158: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.



(1,) (3,)


W0922 17:41:24.105174 140736146887552 deprecation_wrapper.py:119] From /Users/admin/.pyenv/versions/3.6.6/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0922 17:41:24.107983 140736146887552 deprecation_wrapper.py:119] From /Users/admin/.pyenv/versions/3.6.6/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:181: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.

W0922 17:41:24.409943 140736146887552 deprecation_wrapper.py:119] From /Users/admin/.pyenv/versions/3.6.6/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



Training for 500 steps ...
Interval 1 (0 steps performed)
step =  0
(1,)
    1/10000 [..............................] - ETA: 4:20:39 - reward: -5.2892step =  1
(1,)
step =  2
(1,)
step =  3
(1,)
step =  4
(1,)
    5/10000 [..............................] - ETA: 54:22 - reward: -6.2394  step =  5
(1,)
step =  6
(1,)
step =  7
(1,)
    8/10000 [..............................] - ETA: 35:01 - reward: -7.5451step =  8
(1,)
step =  9
(1,)
step =  10
(1,)
step =  11
(1,)
   12/10000 [..............................] - ETA: 24:14 - reward: -7.9176step =  12
(1,)
step =  13
(1,)
step =  14
(1,)
step =  15
(1,)
   16/10000 [..............................] - ETA: 18:51 - reward: -7.2789step =  16
(1,)
step =  17
(1,)
step =  18
(1,)
step =  19
(1,)
   20/10000 [..............................] - ETA: 15:38 - reward: -6.7475step =  20
(1,)
step =  21
(1,)
step =  22
(1,)
   23/10000 [..............................] - ETA: 13:57 - reward: -6.7218step =  23
(1,)
step =  24
(1,)
step =  25
(1,)
   26/1

  215/10000 [..............................] - ETA: 3:53 - reward: -6.2478step =  215
(1,)
step =  216
(1,)
step =  217
(1,)
step =  218
(1,)
  219/10000 [..............................] - ETA: 3:52 - reward: -6.2246step =  219
(1,)
step =  220
(1,)
step =  221
(1,)
  222/10000 [..............................] - ETA: 3:51 - reward: -6.1624step =  222
(1,)
step =  223
(1,)
step =  224
(1,)
step =  225
(1,)
  226/10000 [..............................] - ETA: 3:49 - reward: -6.0662step =  226
(1,)
step =  227
(1,)
step =  228
(1,)
step =  229
(1,)
  230/10000 [..............................] - ETA: 3:48 - reward: -5.9885step =  230
(1,)
step =  231
(1,)
step =  232
(1,)
  233/10000 [..............................] - ETA: 3:47 - reward: -5.9775step =  233
(1,)
step =  234
(1,)
step =  235
(1,)
  236/10000 [..............................] - ETA: 3:46 - reward: -6.0536step =  236
(1,)
step =  237
(1,)
step =  238
(1,)
step =  239
(1,)
  240/10000 [..............................] - ETA: 3:45 

  432/10000 [>.............................] - ETA: 3:16 - reward: -6.5892step =  432
(1,)
step =  433
(1,)
step =  434
(1,)
step =  435
(1,)
  436/10000 [>.............................] - ETA: 3:15 - reward: -6.5985step =  436
(1,)
step =  437
(1,)
step =  438
(1,)
step =  439
(1,)
  440/10000 [>.............................] - ETA: 3:15 - reward: -6.6246step =  440
(1,)
step =  441
(1,)
step =  442
(1,)
step =  443
(1,)
  444/10000 [>.............................] - ETA: 3:15 - reward: -6.6415step =  444
(1,)
step =  445
(1,)
step =  446
(1,)
  447/10000 [>.............................] - ETA: 3:14 - reward: -6.6437step =  447
(1,)
step =  448
(1,)
step =  449
(1,)
step =  450
(1,)
  451/10000 [>.............................] - ETA: 3:14 - reward: -6.6454step =  451
(1,)
step =  452
(1,)
step =  453
(1,)
  454/10000 [>.............................] - ETA: 3:14 - reward: -6.6557step =  454
(1,)
step =  455
(1,)
step =  456
(1,)
  457/10000 [>.............................] - ETA: 3:13 

<keras.callbacks.History at 0x1297077f0>

In [7]:
# Shape tuples are plain Python tuples, so `+` concatenates them.
a, b, c = (np.array(values).shape for values in ([1, 2], [1, 2, 3], [1, 2, 3, 4]))
print(a + b + c)

(2, 3, 4)


In [8]:
# Slicing [1:3] keeps the elements at indices 1 and 2.
a = [1, 1, 3]
b = a[slice(1, 3)]
print(b)
print(*b[:2])

[1, 3]
1 3
