In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from keras.models import Model
from keras.layers import concatenate, Dense, Input, Flatten
from keras.optimizers import Adam
import gym
from rl.agents import DDPGAgent
from rl.memory import SequentialMemory

Using TensorFlow backend.


In [2]:
# Discrete action count and id -> value table.
# NOTE(review): neither name is referenced by the other visible cells — confirm
# whether they are still needed.
nb_actions = 2
ACT_ID_TO_VALUE = {0: [-1], 1: [+1]}

# Create the Gym Pendulum environment.
env = gym.make("Pendulum-v0")

# Report the action space and the length of the observation vector; the
# recorded run shows a one-dimensional Box action space and 3 observation
# components.
print("Action Space: %s" % (env.action_space,))
print(env.observation_space.shape[0])

Action Space: Box(1,)
3


In [3]:
def actor_net(a_shape, s_shape):
    """Build the actor network that maps an observation window to an action.

    Args:
        a_shape: Shape tuple of the action space. Only ``a_shape[0]`` is read;
            it sets the number of units in the output layer.
        s_shape: Shape tuple of the observation space. A leading axis of
            length 1 is prepended to it to form the network's input shape.

    Returns:
        An uncompiled Keras ``Model`` with one input (the observation window)
        and one linear output of width ``a_shape[0]``.
    """
    # The actor consumes observations, not actions.
    # NOTE(review): the leading 1 presumably mirrors the replay memory's
    # window_length — confirm against the agent construction.
    observation_input = Input(shape=(1,) + s_shape)
    hidden = Flatten()(observation_input)
    # Two identical fully connected ReLU layers.
    for _ in range(2):
        hidden = Dense(16, activation="relu")(hidden)
    action_output = Dense(a_shape[0], activation="linear")(hidden)
    return Model(inputs=observation_input, outputs=action_output)

In [4]:
def critic_net(a_shape, s_shape):
    """Build the critic network that scores an (action, observation) pair.

    Args:
        a_shape: Shape tuple of the action space, used as the shape of the
            action input.
        s_shape: Shape tuple of the observation space. A leading axis of
            length 1 is prepended to it to form the observation input shape.

    Returns:
        A ``(critic, action_input)`` tuple: the uncompiled Keras ``Model``
        taking ``[action_input, observation_input]`` and producing a single
        linear output, plus the action input tensor itself so the caller can
        hand it on separately.
    """
    action_input = Input(shape=a_shape)
    observation_input = Input(shape=(1,) + s_shape)

    # Join the action with the flattened observation into one feature vector.
    hidden = concatenate([action_input, Flatten()(observation_input)])
    # Two identical fully connected ReLU layers.
    for _ in range(2):
        hidden = Dense(32, activation="relu")(hidden)
    value_output = Dense(1, activation="linear")(hidden)

    critic = Model(inputs=[action_input, observation_input], outputs=value_output)
    return critic, action_input


In [5]:
def agent(a_shape, s_shape, memory_limit=50000, **ddpg_kwargs):
    """Assemble a DDPG agent from freshly built actor and critic networks.

    Args:
        a_shape: Shape tuple of the action space; ``a_shape[0]`` is passed to
            ``DDPGAgent`` as the number of actions.
        s_shape: Shape tuple of the observation space.
        memory_limit: Capacity of the ``SequentialMemory`` replay buffer.
            Defaults to 50000, the value that was previously hard-coded.
        **ddpg_kwargs: Extra keyword arguments forwarded unchanged to
            ``DDPGAgent`` (for example ``random_process`` or the warm-up step
            counts). With none given, the agent is built exactly as before.

    Returns:
        An uncompiled ``DDPGAgent``.
    """
    actor = actor_net(a_shape, s_shape)
    critic, critic_action_input = critic_net(a_shape, s_shape)
    # window_length stays fixed at 1 because both networks are built with a
    # leading input axis of length 1.
    memory = SequentialMemory(limit=memory_limit, window_length=1)
    # Return directly instead of binding a local called `agent`, which would
    # shadow this function's own name inside its body.
    return DDPGAgent(
        nb_actions=a_shape[0],
        actor=actor,
        critic=critic,
        critic_action_input=critic_action_input,
        memory=memory,
        **ddpg_kwargs
    )

In [6]:
# Bind the instance to its own name. Assigning it back to `agent` would replace
# the factory function with the instance, so re-running this cell would fail
# because the instance is not callable.
ddpg_agent = agent(env.action_space.shape, env.observation_space.shape)
print(env.action_space.shape, env.observation_space.shape)
ddpg_agent.compile(Adam(lr=0.001, clipnorm=1.), metrics=["mae"])
# NOTE(review): the agent is built with DDPGAgent's default warm-up settings;
# if those exceed most of the 1200 steps below, very little learning happens
# in this run — confirm against the keras-rl defaults.
ddpg_agent.fit(env, nb_steps=1200, visualize=False, verbose=1, nb_max_episode_steps=200)
# Last expression of the cell: the History object returned by test() is displayed.
ddpg_agent.test(env, nb_episodes=5, visualize=False, nb_max_episode_steps=200)

W0923 17:23:55.242219 140736146887552 deprecation_wrapper.py:119] From /Users/admin/.pyenv/versions/3.6.6/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0923 17:23:55.261725 140736146887552 deprecation_wrapper.py:119] From /Users/admin/.pyenv/versions/3.6.6/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0923 17:23:55.281327 140736146887552 deprecation_wrapper.py:119] From /Users/admin/.pyenv/versions/3.6.6/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4158: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0923 17:23:55.409113 140736146887552 deprecation_wrapper.py:119] From /Users/admin/.pyenv/versions/3.6.6/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is de

(1,) (3,)


W0923 17:23:55.651964 140736146887552 deprecation_wrapper.py:119] From /Users/admin/.pyenv/versions/3.6.6/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



Training for 1200 steps ...
Interval 1 (0 steps performed)
 1196/10000 [==>...........................] - ETA: 21s - reward: -6.7720done, took 2.925 seconds
Testing for 5 episodes ...
Episode 1: reward: -1625.792, steps: 200
Episode 2: reward: -1292.137, steps: 200
Episode 3: reward: -1611.689, steps: 200
Episode 4: reward: -1185.354, steps: 200
Episode 5: reward: -1619.661, steps: 200


<keras.callbacks.History at 0x128ddf7b8>

In [7]:
# Scratch check: array shapes are plain tuples, so `+` concatenates them.
a, b, c = (np.shape(values) for values in ([1, 2], [1, 2, 3], [1, 2, 3, 4]))
print(a + b + c)

(2, 3, 4)


In [8]:
# Scratch check: slicing from index 1 to the end of a three-item list keeps
# the last two items.
a = [1, 1, 3]
b = a[1:]
print(b)
# Unpack the two-item slice into print's positional arguments.
print(*b)

[1, 3]
1 3
