In [16]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import sys
sys.path.append('../../module/')

from keras2.models import Model
from keras2.layers import concatenate, Dense, Input, Flatten
from keras2.optimizers import Adam
import csv
from util import *
import gym2
from rl2.agents import selfDDPGAgent, selfDDPGAgent2
from rl2.memory import SequentialMemory
from scipy import linalg as slinalg

In [12]:
# Environment under study, created through the project's `gym2` registry.
env = gym2.make('Linear-v0')
# Default `Q` / `R` arguments of `control_law`, which hands them to the
# continuous-time Riccati solver.
Q = np.diag([.01, .01])
R = np.array([[.01]])
# Coefficient multiplying `tau` in the reward accumulated by `value_function_mb`.
l = 1.

In [39]:
def array_exp(A):
    """Return the matrix exponential ``exp(A)`` of a square matrix.

    Uses ``scipy.linalg.expm`` rather than an eigendecomposition
    ``P diag(exp(v)) P^-1``: the eigen route needs ``P`` to be invertible (it
    is not for defective matrices such as ``[[0, 1], [0, 0]]``) and yields a
    complex array whenever a real ``A`` has complex eigenvalues.

    Parameters
    ----------
    A : array_like, shape (n, n)
        Square matrix to exponentiate; any ``n`` is accepted.

    Returns
    -------
    numpy.ndarray, shape (n, n)
        ``exp(A)``. Real-valued when ``A`` is real.

    Raises
    ------
    ValueError
        If ``A`` is not a square 2-D array.
    """
    A = np.asarray(A)
    if A.ndim != 2 or A.shape[0] != A.shape[1]:
        raise ValueError(f'array_exp expects a square matrix, got shape {A.shape}')
    # Promote integer/bool input so the result is always floating point.
    if not np.issubdtype(A.dtype, np.inexact):
        A = A.astype(float)
    # `slinalg` is the notebook-level alias for `scipy.linalg`.
    return slinalg.expm(A)


def next_state(state, action, tau, env):
    """Return the state reached after holding `action` constant for `tau`.

    Evaluates the closed form

        s' = exp(A tau) s + A^{-1} (exp(A tau) - I) B u

    with ``A = env.A`` and ``B = env.B``. This is the constant-input solution
    of ``ds/dt = A s + B u``, assuming those are the dynamics `env`
    implements (TODO confirm against the environment).

    Parameters
    ----------
    state : array_like
        Current state vector (length must match ``env.A``).
    action : scalar
        Control input held constant over the interval; it multiplies the
        input-response vector element-wise.
    tau : float
        Length of the interval over which `action` is held.
    env : object
        Must expose the system matrices as ``env.A`` and ``env.B``.

    Returns
    -------
    numpy.ndarray
        The propagated state ``s'``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``env.A`` is singular (the closed form needs ``A^{-1}``).
    """
    A = env.A
    B = env.B
    eAt = array_exp(A * tau)
    A_inv = np.linalg.inv(A)

    # Input response: int_0^tau exp(A (tau - t)) B dt = A^{-1} (exp(A tau) - I) B.
    # Because A^{-1} commutes with exp(A tau), this is algebraically identical to
    #   exp(A tau) A^{-1} B - exp(A tau) A^{-1} exp(-A tau) B
    # but needs no second matrix exponential and never forms exp(-A tau).
    integral = np.dot(A_inv, np.dot(eAt - np.eye(A.shape[0]), B))
    s_prime = np.dot(eAt, state) + integral * action

    return s_prime
    
    
def control_law(state, env, Q=Q, R=R, l=1., next_value_weight=100, taus=None):
    """Pick the (action, hold time) pair with the lowest one-step evaluation.

    For every candidate hold time ``tau`` the action ``u`` is chosen as the
    stationary point of

        c(u) = u**2 - l * tau + next_value_weight * s'^T P s'

    where ``s' = exp(A tau) s + dsdu * u`` is the state after holding ``u``
    for ``tau`` (same closed form as `next_state`) and ``P`` solves the
    continuous-time algebraic Riccati equation for ``(A, B, Q, R)``.
    The stationary point is the minimiser whenever ``dsdu^T P dsdu >= 0``.
    The candidate with the smallest ``c`` wins.

    Parameters
    ----------
    state : array_like
        Current state vector (length must match ``env.A``).
    env : object
        Must expose the system matrices as ``env.A`` and ``env.B``.
        ``env.B`` is reshaped to a column for the Riccati solver.
    Q, R : array_like
        Weights passed to ``scipy.linalg.solve_continuous_are``. The defaults
        are the notebook-level ``Q`` / ``R`` as they were when this function
        was defined.
    l : float
        Coefficient of the ``- l * tau`` term in the evaluation.
    next_value_weight : float, optional
        Weight on the quadratic next-state term (default 100).
    taus : iterable of float, optional
        Candidate hold times. Defaults to ``np.linspace(.01, 1., 10)``.

    Returns
    -------
    numpy.ndarray, shape (2,)
        ``[u, tau]`` of the best candidate.

    Raises
    ------
    ValueError
        If no candidate yields an evaluation below ``inf`` (for example when
        every evaluation is NaN or `taus` is empty).
    numpy.linalg.LinAlgError
        If ``env.A`` is singular.
    """
    A = env.A
    B = env.B
    if taus is None:
        taus = np.linspace(.01, 1., 10)

    # riccati
    P = slinalg.solve_continuous_are(A, B.reshape(B.shape[0], 1), Q, R)
    A_inv = np.linalg.inv(A)
    identity = np.eye(A.shape[0])

    control = None
    evaluation = np.inf

    for tau in taus:
        # calculate optimal action with fixed `tau`
        eAt = array_exp(A * tau)
        free_response = np.dot(eAt, state)

        # ∂s'/∂u = int exp(A (tau - t)) B dt = A^{-1} (exp(A tau) - I) B.
        # Algebraically equal to eAt A^{-1} B - eAt A^{-1} exp(-A tau) B, since
        # A^{-1} commutes with exp(A tau); no second exponential is required.
        dsdu = np.dot(A_inv, np.dot(eAt - identity, B))

        # ∂c/∂u = first + second * u
        dsdu_P = np.dot(dsdu, P)
        first = next_value_weight * 2 * np.dot(dsdu_P, free_response)
        second = next_value_weight * 2 * np.dot(dsdu_P, dsdu) + 2

        # optimal action
        u = - first / second

        # evaluation, using the same closed-form next state as `next_state`
        s_prime = free_response + dsdu * u
        ev = u**2 - l * tau + next_value_weight * np.dot(np.dot(s_prime, P), s_prime)
        if ev < evaluation:
            control = np.array([u, tau])
            evaluation = ev

    if control is None:
        # Previously this surfaced as an UnboundLocalError on `control`.
        raise ValueError('control_law: no candidate tau produced a finite evaluation')

    return control

def value_function_mb(state, env, gamma=.99, step_limit=750):
    """Discounted return obtained by following `control_law` from `state`.

    The environment is reset, forced to `state`, and then driven for
    `step_limit` decisions. Each decision asks `control_law` for an action
    and a hold time, applies the action through ``env.step`` in equal
    sub-steps, and scores the decision as

        dt * sum(sub-step rewards) - 0.01 * action**2 + l * tau

    using the notebook-level constant ``l``. Decision ``k`` is weighted by
    ``gamma ** k``.

    Parameters
    ----------
    state : array_like
        Initial state handed to ``env.set_state``.
    env : object
        Environment providing ``reset``, ``set_state``, ``state`` and
        ``step(action, dt, tau)``, plus whatever `control_law` reads.
    gamma : float
        Per-decision discount factor.
    step_limit : int
        Number of decisions to simulate.

    Returns
    -------
    The accumulated discounted return.
    """
    env.reset()
    env.set_state(state)

    discounted_return = 0
    for k in range(step_limit):
        current = env.state
        u, hold = control_law(current, env, l=l)

        # minimum natural number which makes `dt` smaller than 0.05
        n_substeps = int(np.ceil(20 * hold))
        dt = hold / n_substeps

        # Apply the same action for every sub-step and add up the rewards.
        substep_total = 0
        for _ in range(n_substeps):
            _obs, r, _done, _info = env.step(np.array([u]), dt, hold)
            substep_total += r

        decision_reward = substep_total * dt
        decision_reward = decision_reward + (- 0.01 * u**2 + l * hold)

        discounted_return += gamma ** k * decision_reward

    return discounted_return

In [41]:
%%time
# Evaluate the model-based value function on a 100 x 100 grid over [-7, 7]^2.
s1 = np.linspace(-7, 7, 100)
s2 = np.linspace(-7, 7, 100)
S1, S2 = np.meshgrid(s1, s2)

# Total number of grid points; correct even if `s1` and `s2` differ in length.
n_points = S1.size

# Collected in row-major order so the final reshape lines up with `S1` / `S2`.
value_list = []
for idx, point in enumerate(zip(S1.ravel(), S2.ravel())):
    # '\r' with end='' keeps the progress percentage on a single line.
    print(f'{int(idx*100/n_points)}%\r', end='')
    value_list.append(value_function_mb(np.array(point), env))

values_mb = np.array(value_list).reshape(S1.shape)

CPU times: user 14h 19min 58s, sys: 45.1 s, total: 14h 20min 43s
Wall time: 14h 20min 36s


In [43]:
# Persist the value grid as one CSV row (row-major flattening of `values_mb`).
# newline='' lets the csv module control line endings itself, as its
# documentation requires; without it, platforms that translate '\n' on write
# produce '\r\r\n' row terminators.
with open('./csv/values_mb.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(values_mb.flatten())