In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import sys
sys.path.append('../../../module/')

from keras2.models import Model
from keras2.layers import concatenate, Dense, Input, Flatten
from keras2.optimizers import Adam, Optimizer
import keras2.backend as K
import tensorflow as tf
import csv
import itertools
from util import *
import gym2
from rl2.agents import selfDDPGAgent, selfDDPGAgent2, selfDDPGAgent3
from rl2.memory import SequentialMemory

Using TensorFlow backend.
Using TensorFlow backend.


In [2]:
# Environment under study plus two cost-weight constants.
# NOTE(review): Q and R look like quadratic state/action cost weights — confirm
# against the environment definition; neither is referenced again in the visible cells.
env = gym2.make('Linear-v1')
Q = 0.01 * np.eye(2)
R = 0.01

In [86]:
def critic_net(a_shape, s_shape):
    """Build a small fully connected critic that scores an (action, observation) pair.

    Args:
        a_shape: shape tuple handed to the action ``Input`` layer.
        s_shape: shape tuple of one observation; an axis of length 1 is
            prepended to it and the result is flattened before use.

    Returns:
        A tuple ``(critic, action_input)``: the Keras ``Model`` and the
        action input tensor the model was built on.
    """
    act_in = Input(a_shape)
    obs_in = Input(shape=(1,) + s_shape)

    # Action and flattened observation are joined into one feature vector.
    hidden = concatenate([act_in, Flatten()(obs_in)])

    # Two 16-unit ReLU layers followed by a single linear output unit.
    for units, activation in ((16, "relu"), (16, "relu"), (1, "linear")):
        hidden = Dense(units, activation=activation)(hidden)

    return (Model(inputs=[act_in, obs_in], outputs=hidden), act_in)

def value_net(s_shape):
    """Build a small fully connected state-value network.

    Args:
        s_shape: shape tuple of one observation; an axis of length 1 is
            prepended to it and the result is flattened before use.

    Returns:
        A Keras ``Model`` mapping the state input to a single linear output.
    """
    state_input = Input(shape=(1,) + s_shape)
    x = Flatten()(state_input)

    x = Dense(16, activation='relu')(x)
    x = Dense(16, activation='relu')(x)
    x = Dense(1, activation="linear")(x)
    # Use the plural `outputs=` keyword like the other builders in this
    # notebook; the singular `output=` is the legacy Keras 1 spelling.
    value = Model(inputs=state_input, outputs=x)
    return value

def branch_actor(a_shape, s_shape):
    """Build a two-headed actor whose output concatenates an action signal and tau.

    Both heads share the flattened state input and otherwise have separate
    16-16-1 dense stacks.

    Args:
        a_shape: accepted for signature symmetry; not used in the body.
        s_shape: shape tuple of one observation; an axis of length 1 is
            prepended to it and the result is flattened before use.

    Returns:
        A Keras ``Model`` whose output is ``concatenate([action_head, tau_head])``,
        i.e. the action signal comes first and tau second.
    """
    state_in = Input(shape=(1,) + s_shape)
    flat_state = Flatten()(state_in)  # effectively the input layer

    # Head 1: action signal.
    # NOTE(review): "multiple_tanh" is not a stock Keras activation name —
    # presumably registered by the keras2 fork; confirm.
    action_head = Dense(16, activation="relu")(flat_state)
    action_head = Dense(16, activation="relu")(action_head)
    action_head = Dense(1, activation="multiple_tanh")(action_head)

    # Head 2: tau.
    # NOTE(review): "tau_output_large" is likewise assumed to come from the fork.
    tau_head = Dense(16, activation="relu")(flat_state)
    tau_head = Dense(16, activation="relu")(tau_head)
    tau_head = Dense(1, activation="tau_output_large")(tau_head)

    merged = concatenate([action_head, tau_head])
    return Model(inputs=state_in, outputs=merged)

In [87]:
# Instantiate both networks with 2-element shapes; only the critic is compiled here.
actor = branch_actor(a_shape=(2,), s_shape=(2,))
critic = critic_net(a_shape=(2,), s_shape=(2,))[0]  # drop the returned action input tensor
critic.compile(
    Adam(lr=0.001, clipnorm=1.0),
    loss='mean_squared_error',
)

In [88]:
# Scratch check: with transpose_a=True, multiplying two (N, 1) column
# placeholders yields a (1, 1) tensor holding their dot product.
x = tf.placeholder(tf.float32, shape=(None, 1))
y = tf.placeholder(tf.float32, shape=(None, 1))
d = tf.matmul(x, y, transpose_a=True)

print(x)
print(d)

with tf.Session() as sess:
    print(
        sess.run(
            d,
            feed_dict={
                x: [[0.3], [0.2]],
                y: [[3], [4]],
            },
        )
    )

Tensor("Placeholder_91:0", shape=(?, 1), dtype=float32)
Tensor("MatMul_16:0", shape=(1, 1), dtype=float32)
[[1.7]]


In [89]:
# Scratch check: two nested 1x1 blocks give an array of shape (2, 1, 1).
print(np.array([[[1]], [[2]]]).shape)

(2, 1, 1)


In [226]:
# `alpha` is the rate inside the exp(-alpha * ...) discount terms used by the
# gradient helpers. `beta` is not referenced in the visible cells.
alpha = 0.4
beta = 1.0
# Bare optimizer instance; the helpers only call its get_gradients() method.
dummy_optimizer = Optimizer()
def gradient_for_one_data(state, next_state, actor, critic):
    """Build the actor-gradient tensors for a single transition.

    The result is the sum of two terms, each taken with respect to
    ``actor.trainable_weights``:

    * the gradient of ``-mean(critic([actor(state), state]))``;
    * the gradient of ``-exp(-alpha * tau)`` scaled by the critic value at
      ``next_state``, where ``tau`` is the second entry of the actor output.

    Relies on the notebook-level names ``alpha`` and ``dummy_optimizer``.

    Args:
        state: 1-D numpy array (``.tolist()`` is called on it).
        next_state: 1-D numpy array (``.tolist()`` is called on it).
        actor: Keras model called on a ``(1, 1, len(state))`` constant.
        critic: Keras model called on ``[actor_output, state_input]``.

    Returns:
        A list of symbolic tensors, one per actor trainable weight. Nothing
        is evaluated here.
    """
    params = actor.trainable_weights

    # Q-function term: gradient of the negated critic output.
    state_input = tf.constant([[state.tolist()]], tf.float32)
    actor_output = actor(state_input)
    critic_output_tensor = critic([actor_output, state_input])
    loss = -K.mean(critic_output_tensor)
    gradient_for_q = dummy_optimizer.get_gradients(loss, params)

    # Discount-factor term. Only tau (index 1 of the concatenated actor
    # output) belongs in the exponent; indexing just `[0]` would also pull
    # the action signal into the gradient.
    discount = -tf.exp(-alpha * actor_output[0][1])
    gradient_for_d = dummy_optimizer.get_gradients(discount, params)
    next_state_input = tf.constant([[next_state.tolist()]], tf.float32)
    next_action = actor(next_state_input)
    # Scalar critic value at the next state, used as a plain multiplier.
    next_value = critic([next_action, next_state_input])[0][0]
    gradient_for_d = [next_value * g for g in gradient_for_d]

    return [K.add(gq, gd) for gq, gd in zip(gradient_for_q, gradient_for_d)]

def gradient(states, ts, next_states, actor, critic):
    """Evaluate the discounted, batch-summed actor gradient.

    For each transition ``i`` the tensors from ``gradient_for_one_data`` are
    scaled by ``exp(-alpha * ts[i])`` and summed over the batch.

    Relies on the notebook-level name ``alpha``.

    Args:
        states: sequence of 1-D numpy arrays.
        ts: sequence of scalars, one per transition.
        next_states: sequence of 1-D numpy arrays, aligned with ``states``.
        actor: Keras actor model.
        critic: Keras critic model.

    Returns:
        A list of numpy arrays, one per actor trainable weight.

    Raises:
        ValueError: if ``states`` is empty.
    """
    batch_size = len(states)
    if batch_size == 0:
        raise ValueError("gradient() needs at least one transition")

    out = None
    for i in range(batch_size):
        state, t, next_state = states[i], ts[i], next_states[i]
        discount = np.exp(- alpha * np.array(t))
        g_tensor = gradient_for_one_data(state, next_state, actor, critic)
        g_tensor = [discount * g for g in g_tensor]
        if out is None:
            out = g_tensor
        else:
            out = [K.add(o, g) for o, g in zip(out, g_tensor)]

    # Evaluate through the Keras backend so the current model weights are
    # used. Opening a new tf.Session and running
    # global_variables_initializer() would evaluate the gradients on freshly
    # initialised weights instead.
    func = K.function([], out)
    return func([[]])

In [220]:
# Build (but do not evaluate) the gradient tensors for one hand-written
# transition; the trailing expression displays the list of tensors.
state = np.array([1, 2])
next_state = np.array([2, 3])
g = gradient_for_one_data(state, next_state, actor, critic)
g

[<tf.Tensor 'Add_765:0' shape=(2, 16) dtype=float32>,
 <tf.Tensor 'Add_766:0' shape=(16,) dtype=float32>,
 <tf.Tensor 'Add_767:0' shape=(2, 16) dtype=float32>,
 <tf.Tensor 'Add_768:0' shape=(16,) dtype=float32>,
 <tf.Tensor 'Add_769:0' shape=(16, 16) dtype=float32>,
 <tf.Tensor 'Add_770:0' shape=(16,) dtype=float32>,
 <tf.Tensor 'Add_771:0' shape=(16, 16) dtype=float32>,
 <tf.Tensor 'Add_772:0' shape=(16,) dtype=float32>,
 <tf.Tensor 'Add_773:0' shape=(16, 1) dtype=float32>,
 <tf.Tensor 'Add_774:0' shape=(1,) dtype=float32>,
 <tf.Tensor 'Add_775:0' shape=(16, 1) dtype=float32>,
 <tf.Tensor 'Add_776:0' shape=(1,) dtype=float32>]

In [227]:
# Two identical states with random times in [0, 10); the same list is passed
# as both `states` and `next_states`.
states = [np.array([1, 2]) for _ in range(2)]
ts = np.random.uniform(low=0, high=10, size=2)
# `discounts` is computed for inspection only; gradient() derives its own.
discounts = np.exp(-alpha * ts)
out = gradient(states, ts, states, actor, critic)

In [157]:
# Backend function mapping a (1, 1, 2) state batch to exp(-0.4 * tau), where
# tau is entry [0][1] of the actor output. 0.4 equals the value given to
# `alpha` elsewhere in this notebook.
state_input = tf.placeholder(tf.float32, shape=(1, 1, 2))
b = actor(state_input)[0][1]
c = tf.exp(-0.4 * b)
f = K.function([state_input], [c])

In [245]:

# Scalar critic value at `next_state`, with the action chosen by the actor.
# Depends on `next_state` assigned in an earlier cell.
next_state_input = tf.constant([[next_state.tolist()]], tf.float32)
next_action = actor(next_state_input)
next_value = critic(
    [next_action, next_state_input]
)[0][0]

In [246]:
# Evaluate `next_value` in a freshly opened session and display the result.
# NOTE(review): global_variables_initializer() runs in this new session, so the
# value is presumably computed from freshly initialised weights rather than the
# weights held by the Keras session — confirm whether that is intended.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    o = sess.run(next_value)
o

1.9683669

In [274]:
def gradient_for_one_data(state, next_state, actor, critic):
    """Build the actor-gradient tensors for a single transition.

    Sums two gradients taken with respect to ``actor.trainable_weights``:
    the gradient of the negated mean critic output at ``state``, and the
    gradient of ``-exp(-alpha * tau)`` multiplied by the scalar critic value
    at ``next_state``. ``tau`` is entry ``[0][1]`` of the actor output.

    Uses the notebook-level names ``alpha`` and ``dummy_optimizer``.

    Args:
        state: 1-D numpy array (``.tolist()`` is called on it).
        next_state: 1-D numpy array (``.tolist()`` is called on it).
        actor: Keras actor model.
        critic: Keras critic model taking ``[action, state]``.

    Returns:
        A list of symbolic tensors, one per actor trainable weight.
    """
    weights = actor.trainable_weights

    # --- Q-function term ---------------------------------------------------
    current_input = tf.constant([[state.tolist()]], tf.float32)
    policy_out = actor(current_input)
    q_value = critic([policy_out, current_input])
    q_loss = -K.mean(q_value)
    q_grads = dummy_optimizer.get_gradients(q_loss, weights)

    # --- discount-factor term ----------------------------------------------
    tau = policy_out[0][1]
    neg_discount = -tf.exp(-alpha * tau)
    d_grads = dummy_optimizer.get_gradients(neg_discount, weights)
    following_input = tf.constant([[next_state.tolist()]], tf.float32)
    following_action = actor(following_input)
    following_value = critic([following_action, following_input])[0][0]
    d_grads = [following_value * grad for grad in d_grads]

    # Element-wise sum of the two gradient lists.
    combined = [K.add(q_part, d_part) for q_part, d_part in zip(q_grads, d_grads)]
    return combined

def gradient(states, next_states, actor, critic):
    """Evaluate the batch-summed actor gradient, with no time discounting.

    Args:
        states: sequence of 1-D numpy arrays.
        next_states: sequence of 1-D numpy arrays, aligned with ``states``.
        actor: Keras actor model.
        critic: Keras critic model.

    Returns:
        A list of numpy arrays (one per actor trainable weight) produced by a
        Keras backend function.
    """
    n_samples = len(states)
    for idx in range(n_samples):
        per_sample = gradient_for_one_data(states[idx], next_states[idx], actor, critic)
        if idx == 0:
            # Seed the running sum with scalar zeros on the first sample.
            accumulated = [tf.constant(0.)] * len(per_sample)
        accumulated = [K.add(acc, grad) for acc, grad in zip(accumulated, per_sample)]

    # The backend function has no inputs, so it is called with a placeholder argument.
    evaluate = K.function([], accumulated)
    return evaluate([[]])

In [273]:
# NOTE(review): in the visible cells `gradient_for_d` is only assigned inside
# gradient_for_one_data, so this cell appears to rely on leftover kernel
# state — confirm before re-running on a fresh kernel.
func = K.function([state_input], gradient_for_d)
states = np.random.randn(10, 1, 2)

[array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       dtype=float32),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       dtype=float32),
 array([[-0.11229111, -0.24341993, -0.03630567, -0.08787848, -0.10941689,
          0.21040806,  0.02520856, -0.06745154, -0.06868149,  0.01851335,
          0.07598334,  0.0614257 ,  0.09258272, -0.05652423,  0.10875043,
         -0.11025535],
        [-0.13694474, -0.11092255, -0.06447151, -0.03315563, -0.04780806,
          0.03805738, -0.00338256, -0.02910255,  0.07443681,  0.00394833,
          0.03462461,  0.03452566, -0.06600756,  0.00651356,  0.04361254,
         -0.0466548 ]], dtype=float32),
 array([ 0.1351693 ,  0.19580382,  0.04206249, -0.07052041,  0.08830704,
        -0.16184129,  0.0344316 , -0.04228456, -0.03849045, -0.01294317,
         0.04166566,  0.03302442,  0.19174244, -0.0796766 ,  0.07889966,
      

In [272]:
def gradient(states, ts, next_states, actor, critic):
    """Evaluate the discounted, batch-summed actor gradient.

    Each transition's tensors from ``gradient_for_one_data`` are scaled by
    ``exp(-alpha * ts[i])`` before being added to the running sum.

    Uses the notebook-level name ``alpha``.

    Args:
        states: sequence of 1-D numpy arrays.
        ts: sequence of scalars, one per transition.
        next_states: sequence of 1-D numpy arrays, aligned with ``states``.
        actor: Keras actor model.
        critic: Keras critic model.

    Returns:
        A list of numpy arrays (one per actor trainable weight) produced by a
        Keras backend function.
    """
    n_samples = len(states)
    for idx in range(n_samples):
        state = states[idx]
        t = ts[idx]
        next_state = next_states[idx]

        weight = np.exp(- alpha * t)
        per_sample = gradient_for_one_data(state, next_state, actor, critic)
        per_sample = [weight * grad for grad in per_sample]

        if idx == 0:
            # Seed the running sum with scalar zeros on the first sample.
            accumulated = [tf.constant(0.)] * len(per_sample)
        accumulated = [K.add(acc, grad) for acc, grad in zip(accumulated, per_sample)]

    # The backend function has no inputs, so it is called with a placeholder argument.
    evaluate = K.function([], accumulated)
    return evaluate([[]])

array([[[ 0.30088118, -0.38656207]],

       [[ 0.02471239,  0.30542027]],

       [[-0.19578595, -1.40778848]],

       [[-0.40148604, -1.25714324]],

       [[ 1.57934584, -0.79851163]],

       [[ 0.24536596,  0.46897948]],

       [[-0.38385112,  1.8293963 ]],

       [[-0.7633261 ,  0.02787994]],

       [[-0.54199855, -1.81471219]],

       [[ 1.54494108,  0.79743481]]])