In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import sys
sys.path.append('../../../module/')

from keras2.models import Model
from keras2.layers import concatenate, Dense, Input, Flatten
from keras2.optimizers import Adam
import csv
from util import *
import gym2
from rl2.agents import selfDDPGAgent, selfDDPGAgent2
from rl2.memory import SequentialMemory

Using TensorFlow backend.
Using TensorFlow backend.


In [2]:
# Create the environment registered under the id 'Pendulum-v2' through gym2
# (a module imported in the first cell after '../../../module/' is added to sys.path).
# NOTE(review): how 'Pendulum-v2' differs from a stock Pendulum env is not visible here — confirm in gym2.
env = gym2.make('Pendulum-v2')

In [3]:
def critic_net(a_shape, s_shape):
    """Build the critic model: (action, observation) -> one linear output unit.

    The action input is concatenated with the flattened observation and fed
    through two 16-unit ReLU layers followed by a single linear unit.

    Args:
        a_shape: Shape tuple handed as-is to the action ``Input``.
        s_shape: Observation shape tuple. The observation ``Input`` is declared
            with shape ``(1,) + s_shape`` and flattened before use.
            (The leading 1 presumably matches the memory's window_length=1
            used in ``agent2`` — TODO confirm.)

    Returns:
        tuple: ``(critic, action_input)`` — the model and the action input
        tensor it was built with.
    """
    act_in = Input(a_shape)
    obs_in = Input(shape=(1,) + s_shape)

    # Action first, flattened observation second.
    hidden = concatenate([act_in, Flatten()(obs_in)])

    # Two hidden ReLU layers, then the single linear output unit.
    for width, activation in ((16, "relu"), (16, "relu"), (1, "linear")):
        hidden = Dense(width, activation=activation)(hidden)

    return Model(inputs=[act_in, obs_in], outputs=hidden), act_in

def branch_actor(a_shape, s_shape):
    """Build a two-headed actor model over the observation.

    The observation is flattened and fed to two structurally identical
    branches (8 ReLU -> 8 ReLU -> 1 unit). The first branch ends in the
    "multiple_tanh" activation (action signal), the second in "tau_output"
    (tau). The two 1-unit outputs are concatenated, first branch first.

    Args:
        a_shape: Accepted for signature symmetry with ``critic_net``; not
            used inside this function.
        s_shape: Observation shape tuple; the model input is declared with
            shape ``(1,) + s_shape``.

    Returns:
        The actor ``Model`` mapping the observation input to the concatenated
        two-unit output.

    Note:
        "multiple_tanh" and "tau_output" are not standard Keras activation
        names; presumably they are provided by the keras2 module — TODO confirm.
    """
    obs_in = Input(shape=(1,) + s_shape)
    features = Flatten()(obs_in)  # effectively the input layer

    def _head(output_activation):
        # One branch: two 8-unit ReLU layers, then a single output unit.
        h = Dense(8, activation="relu")(features)
        h = Dense(8, activation="relu")(h)
        return Dense(1, activation=output_activation)(h)

    # Branches are built in this order on purpose: layer creation order is kept
    # identical to the saved checkpoints' layout.
    u_head = _head("multiple_tanh")  # action signal
    tau_head = _head("tau_output")   # tau

    return Model(inputs=obs_in, outputs=concatenate([u_head, tau_head]))


def agent2(a_shape, s_shape):
    """Assemble a ``selfDDPGAgent2`` from a fresh actor, critic and replay memory.

    Args:
        a_shape: Action shape tuple. Forwarded to both network builders;
            ``a_shape[0]`` is passed as the agent's first positional argument
            (presumably the number of action dimensions — TODO confirm).
        s_shape: Observation shape tuple, forwarded to both network builders.

    Returns:
        The constructed (not yet compiled) agent.
    """
    actor_model = branch_actor(a_shape, s_shape)
    critic_model, critic_act_in = critic_net(a_shape, s_shape)
    replay = SequentialMemory(limit=50000, window_length=1)

    # Keyword options forwarded verbatim to selfDDPGAgent2; their exact
    # semantics are defined by rl2.agents and are not visible from here.
    settings = dict(
        mb_noise=False,
        coef_u=.01,
        coef_tau=.001,
        action_clipper=[-10., 10.],
        tau_clipper=[0.001, 1.],
        params_logging=False,
        gradient_logging=False,
        batch_size=128,
    )

    return selfDDPGAgent2(
        a_shape[0],
        actor_model,
        critic_model,
        critic_act_in,
        replay,
        **settings
    )

In [4]:
# Learning setup
l = .1                # forwarded to a.fit(..., l=l); its meaning is defined by the agent
step = 1000000        # total number of training steps (passed as nb_steps)
episode_step = step   # per-episode step cap equals the full step budget
a = agent2((2,), (2,))

# Original author's note: the actor optimizer's settings can be anything.
# NOTE(review): presumably because action_lr / tau_lr given to compile() drive
# the actor updates instead — confirm in selfDDPGAgent2.
actor_optimizer = Adam(lr=100., clipnorm=1.)
critic_optimizer = Adam(lr=0.001, clipnorm=1.)
optimizer = [actor_optimizer, critic_optimizer]

a.compile(
    optimizer=optimizer,
    metrics=["mse"],
    action_lr=0.0001,
    tau_lr=0.001,
)

Instructions for updating:
Colocations handled automatically by placer.


In [6]:
# Initialise only the actor from a saved checkpoint, then train on `env`.
# Disabled alternative (whole-agent weights): a.load_weights('../saved_agent/linear_init.h5')
a.actor.load_weights('../saved_agent/sample_02.h5')

out = a.fit(
    env,
    l=l,
    nb_steps=step,
    visualize=0,
    verbose=1,
    nb_max_episode_steps=episode_step,
    episode_time=10.,
)

Training for 1000000 steps ...
Interval 1 (0 steps performed)
Instructions for updating:
Use tf.cast instead.
52 episodes - episode_reward: -7.112 [-24.164, 0.598] - loss: 0.019 - mean_squared_error: 0.039 - mean_q: -0.618

Interval 2 (10000 steps performed)
132 episodes - episode_reward: -3.051 [-8.300, 0.207] - loss: 0.003 - mean_squared_error: 0.006 - mean_q: -0.117

Interval 3 (20000 steps performed)
164 episodes - episode_reward: -3.109 [-8.219, -0.182] - loss: 0.004 - mean_squared_error: 0.008 - mean_q: 0.198

Interval 4 (30000 steps performed)
117 episodes - episode_reward: -2.408 [-7.313, 0.214] - loss: 0.005 - mean_squared_error: 0.010 - mean_q: 0.335

Interval 5 (40000 steps performed)
75 episodes - episode_reward: -2.406 [-6.702, 0.152] - loss: 0.005 - mean_squared_error: 0.011 - mean_q: 0.436

Interval 6 (50000 steps performed)
21 episodes - episode_reward: -3.023 [-10.353, 0.560] - loss: 0.005 - mean_squared_error: 0.010 - mean_q: 0.468

Interval 7 (60000 steps performed)


