In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from keras.models import Model
from keras.layers import concatenate, Dense, Input, Flatten
from keras.optimizers import Adam
import gym
from rl.agents import eventDDPGAgent
from rl.memory import SequentialMemory

Using TensorFlow backend.


In [2]:
# GymのPendulum環境を作成
env = gym.make("Pendulum-v0")

# 取りうる”打ち手”のアクション数と値の定義
nb_actions = 2
ACT_ID_TO_VALUE = {0: [-1], 1: [+1]}

print("Action Space: %s" % env.action_space)
#action  dim = 1
#critic dim = 3 with ??
print( env.observation_space.shape[0])

Action Space: Box(1,)
3


In [3]:
def actor_net(a_shape, s_shape):
    action_input = Input(shape=(1,)+s_shape)
    x = Flatten()(action_input)
    x = Dense(32, activation="relu")(x)
    x = Dense(16, activation="relu")(x)
    x = Dense(3, activation="tanh")(x)
    #x = Dense(a_shape[0], activation="linear")(x)
    actor = Model(inputs=action_input, outputs=x)
    return actor

In [4]:
def critic_net(a_shape , s_shape):
    action_input = Input(a_shape)
    observation_input = Input(shape=(1,)+s_shape)
    flattened_observation = Flatten()(observation_input)
    x = concatenate([action_input, flattened_observation])
    x = Dense(32, activation="relu")(x)
    x = Dense(32, activation="relu")(x)
    x = Dense(1, activation="linear")(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)
    return (critic, action_input)


In [5]:
def agent(a_shape, s_shape):
    actor = actor_net(a_shape, s_shape)
    critic,  critic_action_input = critic_net(a_shape, s_shape)
    memory = SequentialMemory(limit = 50000, window_length = 1)
    print('critic_action_input = ', critic_action_input)
    agent = eventDDPGAgent(
        a_shape[0],
        actor,
        critic,
        critic_action_input,
        memory
    )
    return agent

In [6]:
agent = agent((3,), env.observation_space.shape)
print(env.action_space.shape, env.observation_space.shape)
agent.compile(Adam(lr=0.001, clipnorm=1.), metrics=["mae"])
agent.fit(env, nb_steps=100000, visualize=True, verbose=1, nb_max_episode_steps=200)
print('Finish Learning. We start test phase.')
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)

W0924 16:54:45.896171 140736146887552 deprecation_wrapper.py:119] From /Users/admin/.pyenv/versions/3.6.6/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0924 16:54:45.941265 140736146887552 deprecation_wrapper.py:119] From /Users/admin/.pyenv/versions/3.6.6/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0924 16:54:45.992090 140736146887552 deprecation_wrapper.py:119] From /Users/admin/.pyenv/versions/3.6.6/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4158: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0924 16:54:46.149750 140736146887552 deprecation_wrapper.py:119] From /Users/admin/.pyenv/versions/3.6.6/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is de

network.py shape =  (None, 1, 3)
network.py shape =  (None, 3)
network.py shape =  (None, 1, 3)
critic_action_input =  Tensor("input_2:0", shape=(?, 3), dtype=float32)
(1,) (3,)


W0924 16:54:46.559592 140736146887552 deprecation_wrapper.py:119] From /Users/admin/.pyenv/versions/3.6.6/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



network.py shape =  (None, 1, 3)
network.py shape =  (None, 3)
network.py shape =  (None, 1, 3)
Training for 100000 steps ...

episode =  0
Interval 1 (0 steps performed)
  199/10000 [..............................] - ETA: 4:03 - reward: -5.9631
episode =  1
  399/10000 [>.............................] - ETA: 3:19 - reward: -6.9572
episode =  2
  599/10000 [>.............................] - ETA: 3:02 - reward: -6.7801
episode =  3
  797/10000 [=>............................] - ETA: 2:52 - reward: -6.6200
episode =  4
  999/10000 [=>............................] - ETA: 2:44 - reward: -6.5039
episode =  5
 1200/10000 [==>...........................] - ETA: 2:49 - reward: -6.4109
episode =  6
 1399/10000 [===>..........................] - ETA: 2:42 - reward: -6.3138
episode =  7
 1599/10000 [===>..........................] - ETA: 2:37 - reward: -6.3208
episode =  8
 1797/10000 [====>.........................] - ETA: 2:31 - reward: -6.1485
episode =  9
 1998/10000 [====>...................

episode =  89
episode =  90
episode =  91
episode =  92
episode =  93
episode =  94
episode =  95
episode =  96
episode =  97
episode =  98
episode =  99

episode =  100
50 episodes - episode_reward: -1065.114 [-1733.314, -569.502] - loss: 13.860 - mean_absolute_error: 0.913 - mean_q: -74.158

Interval 3 (20000 steps performed)
  197/10000 [..............................] - ETA: 2:42 - reward: -8.1160
episode =  101
  399/10000 [>.............................] - ETA: 2:39 - reward: -6.8585
episode =  102
  597/10000 [>.............................] - ETA: 2:36 - reward: -5.6582
episode =  103
  797/10000 [=>............................] - ETA: 2:33 - reward: -5.1942
episode =  104
 1000/10000 [==>...........................] - ETA: 2:29 - reward: -5.0361
episode =  105
 1198/10000 [==>...........................] - ETA: 2:26 - reward: -4.8273
episode =  106
 1398/10000 [===>..........................] - ETA: 2:23 - reward: -4.6775
episode =  107
 1600/10000 [===>.......................

episode =  177
episode =  178
episode =  179
episode =  180
episode =  181
episode =  182
episode =  183
episode =  184
episode =  185
episode =  186
episode =  187
episode =  188
episode =  189
episode =  190
episode =  191
episode =  192
episode =  193
episode =  194
episode =  195
episode =  196
episode =  197
episode =  198
episode =  199

episode =  200
50 episodes - episode_reward: -933.163 [-1644.734, -1.928] - loss: 36.580 - mean_absolute_error: 1.962 - mean_q: -113.680

Interval 5 (40000 steps performed)
  200/10000 [..............................] - ETA: 2:43 - reward: -3.6654
episode =  201
  400/10000 [>.............................] - ETA: 2:39 - reward: -4.9683
episode =  202
  600/10000 [>.............................] - ETA: 2:36 - reward: -4.5589
episode =  203
  800/10000 [=>............................] - ETA: 2:33 - reward: -4.0362
episode =  204
  998/10000 [=>............................] - ETA: 2:30 - reward: -4.5754
episode =  205
 1200/10000 [==>...............

episode =  264
episode =  265
episode =  266
episode =  267
episode =  268
episode =  269
episode =  270
episode =  271
episode =  272
episode =  273
episode =  274
episode =  275
episode =  276
episode =  277
episode =  278
episode =  279
episode =  280
episode =  281
episode =  282
episode =  283
episode =  284
episode =  285
episode =  286
episode =  287
episode =  288
episode =  289
episode =  290
episode =  291
episode =  292
episode =  293
episode =  294
episode =  295
episode =  296
episode =  297
episode =  298
episode =  299

episode =  300
50 episodes - episode_reward: -360.976 [-1363.887, -1.129] - loss: 41.285 - mean_absolute_error: 2.294 - mean_q: -114.445

Interval 7 (60000 steps performed)
  198/10000 [..............................] - ETA: 2:43 - reward: -2.4068
episode =  301
  399/10000 [>.............................] - ETA: 2:39 - reward: -2.5095
episode =  302
  599/10000 [>.............................] - ETA: 2:36 - reward: -1.8734
episode =  303
  800/10000 [=>.

  396/10000 [>.............................] - ETA: 1:16 - reward: -2.9885
episode =  352
  599/10000 [>.............................] - ETA: 1:11 - reward: -3.3055
episode =  353
  796/10000 [=>............................] - ETA: 1:07 - reward: -2.6460
episode =  354
 1000/10000 [==>...........................] - ETA: 1:04 - reward: -2.1094
episode =  355
 1199/10000 [==>...........................] - ETA: 1:02 - reward: -2.1674
episode =  356
 1396/10000 [===>..........................] - ETA: 1:00 - reward: -2.1217
episode =  357
 1595/10000 [===>..........................] - ETA: 59s - reward: -1.9366
episode =  358
 1799/10000 [====>.........................] - ETA: 57s - reward: -1.8510
episode =  359
 1994/10000 [====>.........................] - ETA: 56s - reward: -1.8583
episode =  360
 2199/10000 [=====>........................] - ETA: 59:41 - reward: -1.7428
episode =  361
episode =  362
episode =  363
episode =  364
episode =  365

episode =  366
episode =  367
episode =  

episode =  442
episode =  443
episode =  444
episode =  445
episode =  446
episode =  447
episode =  448
episode =  449

episode =  450
50 episodes - episode_reward: -579.976 [-1372.586, -0.262] - loss: 10.272 - mean_absolute_error: 1.873 - mean_q: -47.096

Interval 10 (90000 steps performed)
  199/10000 [..............................] - ETA: 2:42 - reward: -5.2575
episode =  451
  399/10000 [>.............................] - ETA: 2:39 - reward: -5.6174
episode =  452
  600/10000 [>.............................] - ETA: 2:36 - reward: -5.8883
episode =  453
  797/10000 [=>............................] - ETA: 2:33 - reward: -6.1116
episode =  454
  998/10000 [=>............................] - ETA: 2:29 - reward: -6.2067
episode =  455
 1197/10000 [==>...........................] - ETA: 2:26 - reward: -6.3141
episode =  456
 1398/10000 [===>..........................] - ETA: 2:23 - reward: -6.3699
episode =  457
 1598/10000 [===>..........................] - ETA: 2:19 - reward: -6.2246
e

<keras.callbacks.History at 0x137c87400>

In [8]:
a = np.array([1,2]).shape
b = np.array([1,2,3]).shape
c = np.array([1,2,3,4]).shape
print(a + b + c)

(2, 3, 4)


In [9]:
a = np.array([ [ [ 1,2 ] ], [ [ 2,1 ] ] ])
print(a.shape)
print(a[0][1])

(2, 1, 2)


IndexError: index 1 is out of bounds for axis 0 with size 1

In [None]:
def array(n):
    a = []
    for i in range(n):
        a.append([1,2])
    return a

print(array(3))
print(array(3)[0])
print(array[0](3))

In [None]:
a = np.array([[1,2],[3,4],[5,6]])
b = np.zeros((3,1))
for i in range(a.shape[0]):
    b[i] = a[i][0]
print(b)

In [None]:
a = np.array([[[1,2]],[[3,2]]])
print(a)
print(a.shape)

In [None]:
a = 1
a[0]

In [None]:
critic net のビルド時に_feed_input_shape が変わるのでは。 a_shape = (3,)にすれば よいい