## confirm $\min_{s}V^{\pi_{\textrm{RL}}}(s) > \max_sV^{\pi_{\textrm{init}}}(s)$

In [16]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import sys
sys.path.append('../../../module/')

from keras2.models import Model
from keras2.layers import concatenate, Dense, Input, Flatten
from keras2.optimizers import Adam
import csv
from util import *
import gym2
from rl2.agents import selfDDPGAgent, selfDDPGAgent2
from rl2.memory import SequentialMemory
from scipy import linalg as slinalg

In [17]:
env = gym2.make('Linear-v0')
Q = .01 * np.eye(2)
R = .01 * np.eye(1)
l = 1.

In [3]:
def critic_net(a_shape , s_shape):
    action_input = Input(a_shape)
    observation_input = Input(shape=(1,)+s_shape)
    flattened_observation = Flatten()(observation_input)
    x = concatenate([action_input, flattened_observation])
    x = Dense(16, activation="relu")(x)
    x = Dense(16, activation="relu")(x)
    x = Dense(1, activation="linear")(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)
    return (critic, action_input)

def branch_actor(a_shape, s_shape):
    action_input = Input(shape=(1,)+s_shape)
    x = Flatten()(action_input) # 実質的なinput layer
    
    x1 = Dense(8, activation="relu")(x)
    x1 = Dense(8, activation="relu")(x1)
    x1 = Dense(1, activation="multiple_tanh")(x1) # action signal
    
    x2 = Dense(8, activation="relu")(x)
    x2 = Dense(8, activation="relu")(x2)
    x2 = Dense(1, activation="tau_output")(x2) # tau
    
    output = concatenate([x1, x2])
    actor = Model(inputs=action_input, outputs=output)
    return actor


def agent2(a_shape, s_shape):
    actor = branch_actor(a_shape, s_shape)
    critic, critic_action_input = critic_net(a_shape, s_shape)
    memory = SequentialMemory(limit = 50000, window_length = 1)
    agent = selfDDPGAgent2(
        a_shape[0],
        actor,
        critic,
        critic_action_input,
        memory,
        mb_noise=False,
        coef_u=.01,
        coef_tau=.001,
        action_clipper=[-10., 10.],
        tau_clipper=[0.001, 1.],
        params_logging=False,
        gradient_logging=False,
        batch_size=128,
    )
    return agent

In [4]:
#learning   
l = .1
step = 1000000  # num of interval
episode_step = step
a = agent2((2,), (2,))
actor_optimizer, critic_optimizer = Adam(lr=100., clipnorm=1.), Adam(lr=0.001, clipnorm=1.) # actorの方は何でもいい
optimizer = [actor_optimizer, critic_optimizer]
a.compile(optimizer=optimizer, metrics=["mse"], action_lr=0.0001, tau_lr=0.001)

b = agent2((2,), (2,))
b.compile(optimizer=optimizer, metrics=["mse"], action_lr=0.0001, tau_lr=0.001)

Instructions for updating:
Colocations handled automatically by placer.


In [20]:
l = 1.
def value_function(state, agent, env, gamma=.99, step_limit=1000):
    env.reset()
    env.set_state(state)
    value = 0
    
    for step in range(step_limit):
        reward = 0
        x = env.state
        a_agent, tau = agent.forward(x)
        
        action_repetition = int(np.ceil(20 * tau))  # minimum natural number which makes `dt` smaller than 0.05
        dt = tau / action_repetition
        for p in range(action_repetition):
            _,r,_,_ = env.step(np.array([a_agent]), dt, tau)
            reward += r
        reward *= dt
        reward += - tau * 0.01 * a_agent**2 + l * tau
        
        value += pow(gamma, step) * reward
    
    return value

In [21]:
%%time
a.load_weights('../saved_agent/linear_init.h5')
b.load_weights('../saved_agent/learned_self_linear0.h5')

s1 = np.linspace(-7, 7, 100)
s2 = np.linspace(-7, 7, 100)
S1, S2 = np.meshgrid(s1, s2)

values_init = []
values_rl = []
i = 0

for state in zip(S1.flatten(), S2.flatten()):
    print(f'{int(i*100/S1.shape[0]**2)}%\r', end='')
    i += 1
    state = np.array(state)
    values_init.append(value_function(state, a, env))
    values_rl.append(value_function(state, b, env))
    
values_init = np.array(values_init).reshape(S1.shape)
values_rl = np.array(values_rl).reshape(S1.shape)

CPU times: user 4h 24min 15s, sys: 1h 14min 49s, total: 5h 39min 5s
Wall time: 3h 35min 55s


In [22]:
print(np.min(values_rl), np.max(values_init))

32.940298968662816 1.8869420942565018


## Model Based

In [18]:
def array_exp(A):
    v, p = np.linalg.eig(A)
    align = np.array([[v[0], 0],[0, v[1]]])
    exp = np.exp(align)
    exp[~np.eye(exp.shape[0],dtype=bool)] = 0
    out = np.dot(np.dot(p, exp), np.linalg.inv(p))
    return out


def next_state(state, action, tau, env):
    # next state when const action and tau is input
    A = env.A
    B = env.B
    eAt = array_exp(A*tau)
    A_inv = np.linalg.inv(A)
    
    integral = np.dot(np.dot(eAt, A_inv), B) \
             - np.dot(np.dot(eAt, A_inv), np.dot(array_exp(-A*tau), B))
    s_prime = np.dot(eAt, state) + integral *action 
    
    return s_prime
    
    
def control_law(state, env, Q=Q, R=R, l=1.):
    A = env.A
    B = env.B
    next_value_weight = 100
    
    taus = np.linspace(.01, 1., 100)
    evaluation = np.inf
    
    # riccati
    P = slinalg.solve_continuous_are(A, B.reshape(B.shape[0],1), Q, R)
    A_inv = np.linalg.inv(A)
    
    for tau in taus:
        # calculate optimal action with fixed `tau`
        eAt = array_exp(A*tau)
        
        # ∂s'/∂u
        dsdu = np.dot(np.dot(eAt, A_inv), B) \
             - np.dot(np.dot(eAt, A_inv), np.dot(array_exp(-A*tau), B))
        # int exp(A(t-τ))Budt
        integral = dsdu

        # ∂c/∂u = first + second * u, c: instant cost
        first = 2 * np.dot(np.dot(dsdu, P), np.dot(eAt, state))
        first *= next_value_weight
        second = 2 * np.dot(np.dot(dsdu, P), integral)
        second = next_value_weight * second + 2

        # optimal action
        u = - first / second
        
        # evaluation
        s_prime = next_state(state, u, tau, env)
        ev = u**2 - l * tau + next_value_weight * np.dot(np.dot(s_prime, P), s_prime)
        #print(s_prime, ev, [u, tau])
        if ev < evaluation:
            control = np.array([u, tau])
            evaluation = ev
    
    return control

def value_function_mb(state, env, gamma=.99, step_limit=2000):
    env.reset()
    env.set_state(state)
    value = 0
    
    for step in range(step_limit):
        reward = 0
        x = env.state
        a_agent, tau = control_law(x, env, l=l)
        
        action_repetition = int(np.ceil(20 * tau))  # minimum natural number which makes `dt` smaller than 0.05
        dt = tau / action_repetition
        for p in range(action_repetition):
            _,r,_,_ = env.step(np.array([a_agent]), dt, tau)
            reward += r
        reward *= dt
        reward += - 0.01 * a_agent**2 + l * tau
        
        value += pow(gamma, step) * reward
 
    return value

In [13]:
values_mb = []
i = 0

for state in zip(S1.flatten(), S2.flatten()):
    print(f'{int(i*100/S1.shape[0]**2)}%\r', end='')
    i += 1
    state = np.array(state)
    values_mb.append(value_function(state, env))
    
values_mb = np.array(values_mb).reshape(S1.shape)

0%

TypeError: value_function() missing 1 required positional argument: 'env'

In [53]:
%%time
## ノイズのやつ
state = np.array([6.,6.])

a.load_weights('../saved_agent/large_noise0.h5')
print(value_function(state, a, env))

a.load_weights('../saved_agent/small_noise0.h5')
print(value_function(state, a, env))

-9.697294025257953
34.39552520415214
-14.709444689856836
-6.630478880023541
CPU times: user 5.56 s, sys: 323 ms, total: 5.88 s
Wall time: 5.56 s
