In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import linalg as slinalg
from joblib import Parallel, delayed
%matplotlib inline

import sys
sys.path.append('../../../module')
import gym2
import util as U

In [2]:
env = gym2.make('Linear-v1')
Q = .01 * np.eye(2)
R = .01 * np.eye(1)
lam = .5

In [3]:
def array_exp(A):
    v, p = np.linalg.eig(A)
    align = np.array([[v[0], 0],[0, v[1]]])
    exp = np.exp(align)
    exp[~np.eye(exp.shape[0],dtype=bool)] = 0
    out = np.dot(np.dot(p, exp), np.linalg.inv(p))
    return out


def next_state(state, action, tau, env):
    # mean of next state when const action and tau is input
    A = env.A
    B = env.B
    eAt = array_exp(A*tau)
    A_inv = np.linalg.inv(A)
    
    integral = np.dot(np.dot(eAt, A_inv), B) \
             - np.dot(np.dot(eAt, A_inv), np.dot(array_exp(-A*tau), B))
    s_prime = np.dot(eAt, state) + integral *action 
    
    return s_prime


def variance(A, D, tau):
    tmp = np.dot(array_exp(A*tau), D)
    Q = np.dot(D.reshape(2,1), D.reshape(1,2)) - np.dot(tmp, tmp)
    V = slinalg.solve_lyapunov(A, -Q)
    return V
    
    
def control_law(state, env, Q=Q, R=R, l=lam):
    A = env.A
    B = env.B
    D = env.D
    next_value_weight = 100
    
    taus = np.linspace(.01, 10., 1000)
    evaluation = np.inf
    
    # riccati
    P = slinalg.solve_continuous_are(A, B.reshape(B.shape[0],1), Q, R)
    A_inv = np.linalg.inv(A)
    
    for tau in taus:
        # calculate optimal action with fixed `tau`
        eAt = array_exp(A*tau)
        
        # ∂s'/∂u
        dsdu = np.dot(np.dot(eAt, A_inv), B) \
             - np.dot(np.dot(eAt, A_inv), np.dot(array_exp(-A*tau), B))
        # int exp(A(t-τ))Budt
        integral = dsdu

        # ∂c/∂u = first + second * u, c: instant cost
        first = 2 * np.dot(np.dot(dsdu, P), np.dot(eAt, state))
        first *= next_value_weight
        second = 2 * np.dot(np.dot(dsdu, P), integral)
        second = next_value_weight * second + 2

        # optimal action
        u = - first / second
        
        # evaluation
        s_prime = next_state(state, u, tau, env)
        V_lqr = np.dot(np.dot(s_prime, P), s_prime) + np.trace(np.dot(P, variance(A, D, tau)))
        ev = u**2 - l * tau + next_value_weight * V_lqr
        #print(s_prime, ev, [u, tau])
        if ev < evaluation:
            control = np.array([u, tau])
            evaluation = ev
    
    return control

In [4]:
import warnings
warnings.filterwarnings('ignore')
from keras2.models import Model
from keras2.layers import concatenate, Dense, Input, Flatten
from keras2.optimizers import Adam

def branch_actor(a_shape, s_shape):
    action_input = Input(shape=(1,)+s_shape)
    x = Flatten()(action_input) # 実質的なinput layer
    
    x1 = Dense(16, activation="relu")(x)
    x1 = Dense(16, activation="relu")(x1)
    x1 = Dense(1, activation="multiple_tanh")(x1) # action signal
    
    x2 = Dense(16, activation="relu")(x)
    x2 = Dense(16, activation="relu")(x2)
    x2 = Dense(1, activation="tau_output_large")(x2) # tau
    
    output = concatenate([x1, x2])
    actor = Model(inputs=action_input, outputs=output)
    return actor

Using TensorFlow backend.


In [5]:
# make train data

ls = [0.01, 0.1, 0.5, 1, 5, 10, 50, 100]

def actor_learning_different_param(lam):
    actor = branch_actor((2,),(2,))
    actor.compile(loss='mse', optimizer='adam')
    
    s1 = np.linspace(-7,7,100)
    s2 = np.linspace(-7,7,100)
    S1, S2 = np.meshgrid(s1, s2)
    batch = []

    for state in zip(S1.flatten(), S2.flatten()):
        state = np.array(state)
        action = control_law(state, env, l=lam)
        batch.append([state, action])

    batch = np.array(batch)

    batch_size = 1

    epochs = 10000
    losses = []
    for _ in range(epochs):
        indices = np.random.choice(range(len(s1)**2),batch_size,replace=False)
        states = batch[:,0].reshape(len(s1)**2,1,2)
        actions = batch[:,1]

        loss = actor.train_on_batch(states, actions)
        losses.append(loss)
        
    actor.save_weights(f'../saved_agent/mb/mb_self_{int(100*lam)}_actor.h5')
    
    return  np.mean(losses)

def parallel_learning(ls):
    actors = Parallel(n_jobs=-1)([delayed(actor_learning_different_param)(l) for l in ls])
    return actors

In [6]:
%%time
losses = parallel_learning(ls)

Wall time: 1h 44min 25s


In [14]:
losses

[0.18585317,
 0.17999224,
 0.038967192,
 0.07602325,
 0.04381613,
 0.07941709,
 0.062217005,
 0.020934818]

In [7]:
alpha = 0.1
beta = 0.5

def interaction(state, u, tau, env, ln=0.):
    env.reset()
    x = np.array(state)
    env.set_state(x)
    reward = 0
    a_agent, tau = u, tau
    tau = np.clip(tau, 0.01, 10.)
    action_repetition = int(np.ceil(100 * tau))  # minimum natural number which makes `dt` smaller than 0.005
    dt = .01
    for p in range(action_repetition):
        _,r,_,_ = env.step(np.array([a_agent]), dt, tau, ln)
        r *= np.exp(- alpha * p * dt)
        reward += r
    reward *= dt
    reward -= beta
    state1 = env.state
    return reward, state1

def value_function(actor, n_episodes=5, init_state=np.array([1,2])):
    average_reward = 0
    for _ in range(n_episodes):
        x = init_state
        episode_time = 0
        episode_reward = 0
        log = []
        while True:
            a_agent, tau = actor.predict_on_batch(x.reshape(1,1,2))[0]
            log.append([x, episode_time])
            reward, x = interaction(x, a_agent, tau, env, ln=1.)
            episode_reward += np.exp(- alpha * episode_time) * reward
            episode_time += tau
            if episode_time >= 30.:
                log.append([x, episode_time])
                break
        average_reward += episode_reward
    return average_reward / n_episodes

def evaluation_function(lam):
    actor = branch_actor((2,),(2,))
    actor.load_weights(f'../saved_agent/mb/mb_self_{int(100*lam)}_actor.h5')
    output = []
    for _ in range(50):
        s1 = np.linspace(-7, 7, 3)
        s2 = np.linspace(-7, 7, 3)
        S1, S2 = np.meshgrid(s1, s2)
        S1, S2 = S1.flatten(), S2.flatten()
        average_value = 0
        for i, x in enumerate(zip(S1, S2)):
            x = np.array(x)
            average_value += value_function(actor, init_state=x)
        output.append(average_value / len(S1)**2)
    return output

def all_actor_evaluation(ls):
    evaluations = Parallel(n_jobs=-1)([delayed(evaluation_function)(l) for l in ls])
    return evaluations # len(ls) * 50 array

In [8]:
%%time
evs = all_actor_evaluation(ls)

Wall time: 33min 10s


In [9]:
evs # 行の平均がJ(pi), 行はそれぞれのpi

[[-198.02568154526537,
  -201.41764576730475,
  -203.3511858108967,
  -198.25640324264126,
  -200.07251134555992,
  -199.15864179710553,
  -197.84739514542395,
  -197.11602819764437,
  -202.55525747492644,
  -203.27145888191734,
  -198.63969864130064,
  -201.405270411683,
  -196.94589341926547,
  -199.3335249742777,
  -196.41666707399065,
  -197.07446112202365,
  -199.70884982831302,
  -198.9378924170711,
  -199.30144853100043,
  -199.58131587696914,
  -199.02489581439798,
  -198.3180344818329,
  -198.82035504013945,
  -200.27040602708112,
  -199.84677725244052,
  -200.4364100213085,
  -200.4224716598839,
  -199.99312541995897,
  -201.47392377614634,
  -197.5450778260359,
  -199.16907149517266,
  -199.53653761118798,
  -199.58034600626976,
  -199.38931343532172,
  -199.52015245158157,
  -203.24445414648798,
  -198.5798860313763,
  -200.20710179490612,
  -198.80346771110035,
  -194.99531902088108,
  -196.03739792107007,
  -199.16966414546113,
  -198.27676059261574,
  -195.52836489773927

In [20]:
evs = np.array(evs)

In [21]:
print(np.mean(evs,axis=1))
print(np.var(evs,axis=1))

[-22.1369227  -80.02708822 -17.31114273 -11.32283132 -12.23011358
 -15.72344864 -32.20886764 -37.38010559]
[0.04571349 9.41146889 0.0284256  0.02677617 0.13306111 0.29675948
 1.27465945 1.23788651]


In [23]:
np.save('evs.npy', evs)