In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import sys
sys.path.append('../../../module/')

from keras2.models import Model
from keras2.layers import concatenate, Dense, Input, Flatten
from keras2.optimizers import Adam, Optimizer
import keras2.backend as K
import tensorflow as tf
import csv
import itertools
from util import *
import gym2
from rl2.agents import selfDDPGAgent, selfDDPGAgent2, selfDDPGAgent3
from rl2.memory import SequentialMemory

Using TensorFlow backend.
Using TensorFlow backend.


In [2]:
# Environment instance shared by the later cells (train_data and policy_gradient read this global).
env = gym2.make('Linear-v1')
# Q and R are not referenced by any other cell visible in this notebook.
# NOTE(review): presumably quadratic state/input cost weights for the linear task — confirm before relying on them.
Q = .01 * np.eye(2)
R = .01

In [3]:
def critic_net(a_shape , s_shape):
    """Build the Q-network that scores an (action, observation) pair.

    Args:
        a_shape: shape tuple of the action input (without the batch axis).
        s_shape: shape tuple of one observation; the observation input is
            declared as ``(1,) + s_shape`` and flattened before use.

    Returns:
        ``(critic, action_input)``: the model taking
        ``[action_input, observation_input]`` and producing one linear
        output, together with the action input tensor.
    """
    action_input = Input(a_shape)
    obs_input = Input(shape=(1,)+s_shape)
    obs_flat = Flatten()(obs_input)

    # Action first, then the flattened observation, on the feature axis.
    hidden = concatenate([action_input, obs_flat])
    for _ in range(2):
        hidden = Dense(16, activation="relu")(hidden)
    q_value = Dense(1, activation="linear")(hidden)

    return (Model(inputs=[action_input, obs_input], outputs=q_value), action_input)

def value_net(s_shape):
    """Build a state-value network V(s).

    Args:
        s_shape: shape tuple of one state; the model input is declared as
            ``(1,) + s_shape`` and flattened before the dense layers.

    Returns:
        A model mapping the state input to a single linear output through
        two 16-unit ReLU layers.
    """
    state_input = Input((1,)+s_shape)
    x = Flatten()(state_input)

    x = Dense(16, activation='relu')(x)
    x = Dense(16, activation='relu')(x)
    x = Dense(1, activation="linear")(x)
    # Use the `outputs=` keyword like critic_net/branch_actor; the singular
    # `output=` spelling is the legacy constructor form.
    value = Model(inputs=state_input, outputs=x)
    return value

def branch_actor(a_shape, s_shape):
    """Build the two-headed actor: one head for the control signal, one for tau.

    Both heads read the same flattened state and each has its own pair of
    16-unit ReLU layers. The heads are created in a fixed order (action head
    completely first, then the tau head).

    Args:
        a_shape: accepted for signature symmetry with ``critic_net``; it is
            not used inside this function.
        s_shape: shape tuple of one state; the model input is declared as
            ``(1,) + s_shape``.

    Returns:
        A model whose output concatenates ``[action_signal, tau]``.
    """
    state_input = Input(shape=(1,)+s_shape)
    features = Flatten()(state_input)  # effectively the input layer

    heads = []
    # First head: action signal; second head: tau.
    for head_activation in ("multiple_tanh", "tau_output_large"):
        h = features
        for _ in range(2):
            h = Dense(16, activation="relu")(h)
        heads.append(Dense(1, activation=head_activation)(h))

    merged = concatenate(heads)
    return Model(inputs=state_input, outputs=merged)

In [28]:
dummy_optimizer = Optimizer()

# Compiled gradient functions, keyed by (id(actor), id(critic)). Each entry
# also stores the two models so they stay alive and their ids cannot be
# recycled for different objects while the entry exists.
_q_gradient_cache = {}


def q_gradient(state, actor, critic):
    """Gradient of ``-mean(Q(s, actor(s)))`` w.r.t. the actor's trainable weights.

    The symbolic graph (placeholder -> actor -> critic -> gradients) is built
    once per ``(actor, critic)`` pair and reused. Building it on every call
    keeps adding ops to the default graph, which makes each call slower than
    the last when this function is called in a loop. The cached function reads
    the current weight values each time it runs, so weight updates made
    through ``set_weights`` are reflected.

    NOTE(review): the cache must be cleared by hand
    (``_q_gradient_cache.clear()``) if the backend session/graph is reset or
    if the set of trainable weights of ``actor`` changes.

    Args:
        state: one state or a batch of states with 2 features each; it is
            converted to float32 and reshaped to ``(-1, 1, 2)`` to match the
            placeholder.
        actor: model mapping a ``(None, 1, 2)`` state tensor to an action tensor.
        critic: model called with ``[action, state]``.

    Returns:
        List of gradient arrays, one per entry of ``actor.trainable_weights``.
    """
    key = (id(actor), id(critic))
    entry = _q_gradient_cache.get(key)
    if entry is None:
        params = actor.trainable_weights

        # tensor
        state_input = tf.placeholder(tf.float32, shape=(None, 1, 2))
        actor_output = actor(state_input)
        critic_output_tensor = critic([actor_output, state_input])
        # Minimising -Q is ascending Q.
        loss = -K.mean(critic_output_tensor)
        gradient_tensor = dummy_optimizer.get_gradients(loss, params)

        grad_calc_func = K.function([state_input], gradient_tensor)
        entry = (actor, critic, grad_calc_func)
        _q_gradient_cache[key] = entry

    # calc: always feed the rank-3 shape the placeholder was declared with.
    state_batch = np.asarray(state, dtype=np.float32).reshape((-1, 1, 2))
    q_g = entry[2]([state_batch])

    return q_g

# Custom Adam optimizer and supporting helpers
def learning_rate_arr(actor, size=674, u_lr=.00001, tau_lr=.0001):
    """Build a flat per-parameter learning-rate vector for ``actor``.

    Layers that hold weights are visited in ``actor.layers`` order. The
    1st, 3rd, 5th, ... such layer gets ``u_lr`` and the 2nd, 4th, ... gets
    ``tau_lr``, each over a span of ``len(w) + len(b)`` entries.

    Args:
        actor: object exposing ``layers``; every weighted layer must return
            exactly ``[w, b]`` from ``get_weights()``.
        size: length of the returned vector; entries beyond the parameters
            that were visited stay 0.
        u_lr: rate for even-indexed weighted layers (action-signal branch).
        tau_lr: rate for odd-indexed weighted layers (tau branch).

    Returns:
        1-D float array of length ``size``.
    """
    rates = np.zeros((size,))
    branch_rates = (u_lr, tau_lr)
    start = 0
    n_weighted = 0
    for layer in actor.layers:
        weights = layer.get_weights()
        if len(weights) == 0:
            continue
        kernel, bias = weights
        stop = start + len(kernel.ravel()) + len(bias.ravel())
        rates[start:stop] = branch_rates[n_weighted % 2]
        start = stop
        n_weighted += 1
    return rates

    
def flatten_gradient(gradient):
    """Flatten a ``[w0, b0, w1, b1, ...]`` gradient list into one 1-D array.

    Entries are consumed in (w, b) pairs; a trailing unpaired entry is
    ignored. The result is promoted against an empty float64 array, so
    float32 inputs come back as float64.

    Args:
        gradient: sequence of arrays ordered as weight/bias pairs.

    Returns:
        1-D array with every paired entry raveled and joined in order.
    """
    n_pairs = len(gradient) // 2
    # The leading empty float64 array fixes the dtype promotion and makes the
    # empty-input case return a shape-(0,) array.
    pieces = [np.array([])]
    for k in range(n_pairs):
        pieces.append(gradient[2 * k].ravel())
        pieces.append(gradient[2 * k + 1].ravel())
    return np.concatenate(pieces)


def get_nn_params(actor):
    """Return all weights of ``actor`` as one flat float array.

    Weighted layers are visited in ``actor.layers`` order; for each one the
    raveled ``w`` is followed by the raveled ``b``. Layers whose
    ``get_weights()`` is empty are skipped.

    Args:
        actor: object exposing ``layers``; every weighted layer must return
            exactly ``[w, b]`` from ``get_weights()``.

    Returns:
        1-D array of all parameters (shape ``(0,)`` if there are none).
    """
    # The leading empty float64 array fixes the dtype promotion and covers
    # the case of a model with no weighted layers.
    chunks = [np.array([])]
    for layer in actor.layers:
        weights = layer.get_weights()
        if len(weights) == 0:
            continue
        kernel, bias = weights
        chunks.extend((kernel.ravel(), bias.ravel()))
    return np.concatenate(chunks)

def set_nn_params(actor, params):
    """Write a flat parameter vector back into the layers of ``actor``.

    This is the inverse of ``get_nn_params``: for each weighted layer, in
    ``actor.layers`` order, the next ``w.size`` values become the new ``w``
    and the following ``b.size`` values become the new ``b``.

    Args:
        actor: object exposing ``layers``; every weighted layer must return
            exactly ``[w, b]`` from ``get_weights()`` and accept the same
            layout in ``set_weights()``.
        params: 1-D array holding exactly as many values as the layers need.

    Returns:
        The same ``actor`` object, modified in place.

    Raises:
        AssertionError: if ``params`` has entries left over once every layer
            has been filled (checked after the layers were already updated).
    """
    cursor = 0
    for layer in actor.layers:
        current = layer.get_weights()
        if len(current) == 0:
            continue
        kernel, bias = current

        # Carve out the kernel, then the bias, advancing the cursor each time.
        n_kernel = kernel.ravel().shape[0]
        new_kernel = params[cursor:cursor + n_kernel].reshape(kernel.shape)
        cursor += n_kernel

        n_bias = bias.ravel().shape[0]
        new_bias = params[cursor:cursor + n_bias].reshape(bias.shape)
        cursor += n_bias

        layer.set_weights([new_kernel, new_bias])
    assert params.shape[0] == cursor
    return actor

class Adam():
    """NumPy Adam-style optimizer with a per-parameter learning-rate vector.

    It operates on the flat parameter vector produced by ``get_nn_params``
    and writes the result back with ``set_nn_params``. No bias correction is
    applied to the moment estimates, which start at zero.

    Note: defining this class rebinds the name ``Adam`` that the import cell
    also brings in from ``keras2.optimizers``.
    """

    def __init__(self, lrs, beta_1=.9, beta_2=.999, epsilon=1e-6):
        """
        Args:
            lrs: learning rate(s); a scalar, or one rate per flat parameter.
            beta_1: decay factor of the first-moment average.
            beta_2: decay factor of the second-moment average.
            epsilon: constant added to the denominator for stability.
        """
        self.lrs = lrs  # learning rate(s)
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon

        # Moment buffers follow the shape of `lrs` instead of a hard-coded
        # parameter count, so the optimizer works for any network size.
        moment_shape = np.shape(lrs)
        self.m = np.zeros(moment_shape)  # running average of the gradient
        self.v = np.zeros(moment_shape)  # running average of the squared gradient

    def update(self, actor, pg):
        """Apply one optimisation step to ``actor``.

        Args:
            actor: model accepted by ``get_nn_params`` / ``set_nn_params``.
            pg: flat gradient, aligned with the flat parameter vector.

        Returns:
            The same ``actor`` with its weights updated.
        """
        # update m
        self.m = (self.beta_1 * self.m) + (1. - self.beta_1) * pg

        # update v
        self.v = (self.beta_2 * self.v) + (1. - self.beta_2) * pg**2

        # size of the step for every parameter
        ag = self.lrs * self.m / (np.sqrt(self.v) + self.epsilon)

        # current parameters (the helpers are named without a leading underscore)
        ps = get_nn_params(actor)

        # gradient-descent step
        ps = ps - ag

        # write the new parameters back into the model
        actor = set_nn_params(actor, ps)

        return actor

In [29]:
# Continuous-time discount rate: rewards, bootstrapped values and gradients
# in the functions below are weighted by exp(-alpha * elapsed_time).
alpha = 0.4
# Constant subtracted from the reward of every interaction (see `interaction`).
beta = 1.

# One agent step: hold the action `u` for the interval `tau`
def interaction(state, u, tau, env):
    """Apply action ``u`` from ``state`` and return the discounted reward.

    The environment is reset and forced to ``state``. ``tau`` is clipped to
    [0.01, 10] and ``env.step`` is then called ``ceil(100 * tau)`` times with
    a fixed ``dt`` of 0.01. The k-th step reward is weighted by
    ``exp(-alpha * k * dt)``; the sum is multiplied by ``dt`` and ``beta``
    is subtracted.

    Args:
        state: state to start from (converted with ``np.array``).
        u: action value, passed to the environment as ``np.array([u])``.
        tau: requested holding interval before clipping.
        env: environment exposing ``reset``, ``set_state``, ``step`` and ``state``.

    Returns:
        ``(reward, state1)`` where ``state1`` is ``env.state`` after the last step.
    """
    env.reset()
    env.set_state(np.array(state))

    tau = np.clip(tau, 0.01, 10.)
    dt = .01
    n_steps = int(np.ceil(100 * tau))  # number of fixed-size (dt) sub-steps

    total = 0
    for k in range(n_steps):
        _, step_reward, _, _ = env.step(np.array([u]), dt, tau, 0)
        step_reward *= np.exp(- alpha * k * dt)
        total += step_reward

    # Rectangle-rule scaling of the summed rewards, then the constant offset.
    total *= dt
    total -= beta
    return total, env.state


# Training data
def train_data(actor):
    """Collect one transition from every point of a 10x10 state grid.

    The grid spans [-7, 7] in both state dimensions. For each grid state the
    actor's ``(u, tau)`` output is applied once through ``interaction`` on
    the global ``env``.

    Args:
        actor: model whose ``predict_on_batch`` maps a ``(1, 1, 2)`` state
            batch to one ``[u, tau]`` row.

    Returns:
        Object array of shape ``(100, 4)``; each row is
        ``[state0, np.array([u, tau]), reward, state1]``.
    """
    grid_1d = np.linspace(-7, 7, 10)
    S1, S2 = np.meshgrid(grid_1d, grid_1d)

    transitions = []
    for x in zip(S1.flatten(), S2.flatten()):
        state0 = np.array(x)
        a_agent, tau = actor.predict_on_batch(state0.reshape((1,1,)+state0.shape))[0]
        reward, state1 = interaction(state0, a_agent, tau, env)
        transitions.append((state0, np.array([a_agent, tau]), reward, state1))

    # Rows mix arrays and scalars. Fill an object array cell by cell rather
    # than calling np.array() on the ragged list: recent NumPy raises
    # ValueError for such input unless dtype=object is given.
    memory = np.empty((len(transitions), 4), dtype=object)
    for row, transition in enumerate(transitions):
        for col, item in enumerate(transition):
            memory[row, col] = item
    return memory

# Q-function learning
def td_learning(memory, actor, critic, epoch=10, batch_size=32):
    """Fit ``critic`` to one-step TD targets sampled from ``memory``.

    For each mini-batch the target is
    ``reward + exp(-alpha * tau) * Q(state1, actor(state1))``, where ``tau``
    is the second component of the stored action. The same critic is used to
    compute the bootstrap value and to be trained.

    Args:
        memory: array whose rows are ``[state0, action(u, tau), reward, state1]``.
        actor: model providing ``predict_on_batch`` for next actions.
        critic: compiled model providing ``predict_on_batch`` and
            ``train_on_batch`` on ``[action, state]`` inputs. Compile it
            before calling this function.
        epoch: number of mini-batch updates to run.
        batch_size: mini-batch size; capped at the number of rows in ``memory``.

    Returns:
        ``(critic, loss)`` where ``loss`` is the value returned by the last
        ``train_on_batch`` call (``None`` if ``epoch`` is 0).

    Raises:
        ValueError: if ``memory`` has no rows.
    """
    # TODO: check that a small loss really means the learned function is correct
    # The former `assert critic.compile` was always true (a bound method is
    # truthy) and so verified nothing; it has been dropped.
    n_rows = memory.shape[0]
    if n_rows == 0:
        raise ValueError('memory is empty: nothing to learn from')
    # Sampling without replacement cannot draw more rows than exist.
    batch_size = min(batch_size, n_rows)
    arr = np.arange(n_rows)

    loss = None
    for _ in range(epoch):
        # make mini_batch
        mem = memory[np.random.choice(arr, batch_size, replace=False)]
        state0_batch = np.array([[m[0]] for m in mem])   # (B, 1, state_dim)
        action_batch = np.array([m[1] for m in mem])     # (B, 2): u, tau
        reward_batch = np.array([[m[2]] for m in mem])   # (B, 1)
        state1_batch = np.array([[m[3]] for m in mem])   # (B, 1, state_dim)

        # TD target
        next_action = actor.predict_on_batch(state1_batch)
        target = critic.predict_on_batch([next_action, state1_batch])
        # Discount over the interval tau that the stored action was held for.
        discount = np.exp(- alpha * action_batch[:,1].reshape(batch_size, 1))
        r = reward_batch + np.multiply(discount, target)

        # learn
        loss = critic.train_on_batch([action_batch, state0_batch], r)
    return critic, loss

# Gradient computation
def policy_gradient(actor, critic):
    """Estimate the policy gradient along one rollout from the state [1, 2].

    The actor is rolled out on the global ``env`` through ``interaction``
    until the accumulated interval time reaches 1. Every visited state
    (including the final one) contributes
    ``exp(-alpha * t) * dQ/dtheta`` evaluated by ``q_gradient``.

    Args:
        actor: model producing ``[u, tau]`` for a ``(1, 1, 2)`` state batch.
        critic: Q-network passed through to ``q_gradient``.

    Returns:
        1-D array: the discounted sum of the flattened gradients.

    Raises:
        ValueError: if the actor outputs a non-finite ``tau``.
    """
    # Which state do we start the rollout from?
    init_state = np.array([1,2])
    # Simulation from the initial state
    x = init_state
    episode_time = 0
    log = []
    while True:
        a_agent, tau = actor.predict_on_batch(x.reshape(1,1,2))[0]
        if not np.isfinite(tau):
            raise ValueError('actor produced a non-finite tau: %r' % (tau,))
        # Use the same clipping as `interaction`, so the clock advances by the
        # interval actually applied. Accumulating the raw value would never
        # reach the horizon when the actor outputs tau <= 0.
        tau = np.clip(tau, 0.01, 10.)
        log.append([x, episode_time])
        reward, x = interaction(x, a_agent, tau, env)
        episode_time += tau
        if episode_time >= 1.:
            log.append([x, episode_time])
            break

    pg = 0
    # Accumulate the time-discounted gradient of the Q function
    for x, t in log:
        g = q_gradient([x], actor, critic)
        g = flatten_gradient(g)
        g *= np.exp(- alpha * t)
        pg += g
    return pg

# Evaluation
def evaluation(actor):
    """Placeholder evaluation hook: ignores ``actor`` and always returns 1."""
    return 1


In [30]:
# Two-headed actor (action signal, tau) over a 2-dimensional state.
actor = branch_actor((2,),(2,))
# critic_net returns (model, action_input); only the model is kept.
critic = critic_net((2,),(2,))[0]
# td_learning calls train_on_batch, so the critic is compiled here.
critic.compile(loss='mse', optimizer='adam')

In [31]:
# Start from previously saved actor weights.
actor.load_weights('../saved_agent/linear_init_extend_actor.h5')
# One learning rate per flat actor parameter (674 in total for this actor).
lrs = learning_rate_arr(actor, size=674, u_lr=.00001, tau_lr=.0001)
opt = Adam(lrs)
# Each iteration: collect grid transitions, fit the critic, estimate the
# policy gradient, then take one optimizer step on the actor.
for i in range(10):
    d_train = train_data(actor)
    print('data')
    # td_learning returns the critic object it was given; `learned_q_net` is not read again in this cell.
    learned_q_net, loss = td_learning(d_train, actor, critic)
    print('q_n')
    pg = policy_gradient(actor, critic)
    print('pg')
    actor = opt.update(actor, pg)
    print('adam')
    # evaluation() is currently a stub that always returns 1.
    ev = evaluation(actor)
    print(i, ev, loss)

data
q_n


KeyboardInterrupt: 

In [54]:
%%time
# Time a single q_gradient call on one state given as a (1, 2) array.
state = np.array([[1,2]])
qg = q_gradient(state, actor, critic)

CPU times: user 501 ms, sys: 12.5 ms, total: 513 ms
Wall time: 540 ms


In [56]:
%%time
# Time the flattening of the gradient list into one 1-D array.
g = flatten_gradient(qg)

CPU times: user 311 µs, sys: 132 µs, total: 443 µs
Wall time: 428 µs


In [34]:
# Display the flattened gradient vector.
flatten_gradient(qg)

array([ 4.39446598e-01,  8.08119923e-02,  0.00000000e+00,  0.00000000e+00,
       -7.81188458e-02, -1.96768388e-01,  1.54914409e-01,  0.00000000e+00,
       -2.50273287e-01, -2.97086030e-01,  0.00000000e+00,  5.94549596e-01,
       -1.71956420e-02, -3.93255949e-02,  0.00000000e+00, -4.06553000e-01,
        8.78893197e-01,  1.61623985e-01,  0.00000000e+00,  0.00000000e+00,
       -1.56237692e-01, -3.93536776e-01,  3.09828818e-01,  0.00000000e+00,
       -5.00546575e-01, -5.94172060e-01,  0.00000000e+00,  1.18909919e+00,
       -3.43912840e-02, -7.86511898e-02,  0.00000000e+00, -8.13106000e-01,
        4.39446598e-01,  8.08119923e-02, -0.00000000e+00, -0.00000000e+00,
       -7.81188458e-02, -1.96768388e-01,  1.54914409e-01,  0.00000000e+00,
       -2.50273287e-01, -2.97086030e-01,  0.00000000e+00,  5.94549596e-01,
       -1.71956420e-02, -3.93255949e-02, -0.00000000e+00, -4.06553000e-01,
       -2.39064448e-08,  0.00000000e+00, -2.83665322e-08, -3.95387048e-08,
        0.00000000e+00, -

In [51]:
# Scratch check: a ragged nested list (rows of length 2 and 3) turned into an array.
an = np.array([[1,2],[3,4,5]])

In [52]:
# Scratch check: np.resize to length 5 repeats the two elements cyclically (see the displayed result).
an = np.resize(an,(5,))

In [53]:
# Display the resized array.
an

array([list([1, 2]), list([3, 4, 5]), list([1, 2]), list([3, 4, 5]),
       list([1, 2])], dtype=object)