In [26]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import sys
sys.path.append('../../../module/')

from keras2.models import Model
from keras2.layers import concatenate, Dense, Input, Flatten
from keras2.optimizers import Adam
import csv
import itertools
from util import *
import gym2
from rl2.agents import selfDDPGAgent, selfDDPGAgent2
from rl2.memory import SequentialMemory

In [27]:
# Quadratic cost weights used by the evaluation cells:
# Q weights the state term (state @ Q @ state), R weights the squared action.
Q = np.eye(2) * .01
R = .01

# Environment under test, created through the project's gym2 registry.
env = gym2.make('Linear-v1')

In [28]:
def critic_net(a_shape, s_shape):
    """Build the critic network.

    The network takes an action tensor and a windowed observation tensor,
    flattens the observation, concatenates both, and passes the result
    through two 16-unit ReLU layers and a single linear output unit.

    Args:
        a_shape: shape tuple handed to the action ``Input``.
        s_shape: shape tuple of one observation; the observation ``Input``
            uses ``(1,) + s_shape`` (a leading window axis of length 1).

    Returns:
        A ``(critic, action_input)`` tuple: the model and its action input
        tensor.
    """
    action_input = Input(a_shape)
    observation_input = Input(shape=(1,) + s_shape)

    features = concatenate([action_input, Flatten()(observation_input)])
    for _ in range(2):
        features = Dense(16, activation="relu")(features)
    q_value = Dense(1, activation="linear")(features)

    critic = Model(inputs=[action_input, observation_input], outputs=q_value)
    return critic, action_input

def branch_actor(a_shape, s_shape):
    """Build the two-headed actor network.

    The flattened observation feeds two independent branches, each made of
    two 16-unit ReLU layers and a single output unit. The first branch ends
    in the ``"multiple_tanh"`` activation (action signal), the second in
    ``"tau_output_large"`` (tau). Both activation names are resolved by the
    project's keras2 package. The two outputs are concatenated.

    Args:
        a_shape: not used by this function; kept for a signature parallel
            to ``critic_net``.
        s_shape: shape tuple of one observation; the ``Input`` uses
            ``(1,) + s_shape``.

    Returns:
        The actor ``Model`` mapping the observation input to the
        concatenated ``[action signal, tau]`` output.
    """
    def _head(features, output_activation):
        # Two hidden ReLU layers followed by a single output unit.
        hidden = Dense(16, activation="relu")(features)
        hidden = Dense(16, activation="relu")(hidden)
        return Dense(1, activation=output_activation)(hidden)

    observation_input = Input(shape=(1,) + s_shape)
    flat_observation = Flatten()(observation_input)  # effectively the input layer

    # Branch order matters: layers are created in the same order as before.
    control_head = _head(flat_observation, "multiple_tanh")  # action signal
    interval_head = _head(flat_observation, "tau_output_large")  # tau

    merged = concatenate([control_head, interval_head])
    return Model(inputs=observation_input, outputs=merged)


def agent2(a_shape, s_shape):
    """Assemble a ``selfDDPGAgent2`` from a fresh actor, critic and memory.

    Args:
        a_shape: action shape tuple; ``a_shape[0]`` is passed to the agent as
            its first positional argument, and the tuple is forwarded to the
            network builders.
        s_shape: observation shape tuple forwarded to the network builders.

    Returns:
        The constructed (not yet compiled) agent.
    """
    actor = branch_actor(a_shape, s_shape)
    critic, critic_action_input = critic_net(a_shape, s_shape)
    replay_memory = SequentialMemory(limit=30000, window_length=1)

    # Keyword options are passed through to the project's agent class as-is;
    # NOTE(review): their exact semantics live in rl2.agents - confirm there.
    agent_options = dict(
        gamma=1.,
        mb_noise=True,
        action_clipper=[-10., 10.],
        tau_clipper=[0.001, 10.],
        params_logging=False,
        gradient_logging=False,
        batch_size=128,
    )
    return selfDDPGAgent2(
        a_shape[0],
        actor,
        critic,
        critic_action_input,
        replay_memory,
        **agent_options
    )

def gain(A, B, Q, R, dt=None):
    """Return the LQR feedback gain for the system ``(A, B)`` with weights ``(Q, R)``.

    Args:
        A, B: system matrices.
        Q, R: cost weights handed to the solver unchanged.
        dt: if given, the system is first discretised with
            ``discretized_system(A, B, dt)`` and ``dlqr`` is used; if ``None``
            the continuous-time ``lqr`` is used.

    Returns:
        The first element of the solver's return value
        (presumably the gain matrix - confirm against ``util``).
    """
    if dt is None:
        return lqr(A, B, Q, R)[0]

    Ad, Bd = discretized_system(A, B, dt)
    return dlqr(Ad, Bd, Q, R)[0]

In [29]:
# Build and compile the agent.
l = 0.1
step = 1_000_000  # number of intervals
episode_step = step

# Both the action shape and the observation shape are (2,).
a = agent2((2,), (2,))

# Author's note: the actor optimizer's settings can be anything
# (presumably because action_lr / tau_lr below drive the actor updates - TODO confirm).
actor_optimizer = Adam(lr=100., clipnorm=1.)
critic_optimizer = Adam(lr=0.001, clipnorm=1.)
optimizer = [actor_optimizer, critic_optimizer]

a.compile(
    optimizer=optimizer,
    metrics=["mse"],
    action_lr=0.0001,
    tau_lr=0.001,
)

In [34]:
# Evaluation settings read by the roll-out cells.

# Simulation grid and horizon.
dt = 0.01          # step passed to env.step and used as the integration width
time_limit = 20.0  # roll-out length; compared against the accumulated tau

# Cost parameters.
alpha = 0.4        # discount rate: each sample is weighted by exp(-alpha * dt * k)
beta = 0.1         # weight multiplying the communication indicator

# Roll-out setup.
n_episodes = 50
initial_state = np.array([3.0, 3.0])

In [31]:
# Restore previously saved weights into the agent `a` before evaluating it.
# NOTE(review): path is relative to the notebook's working directory.
a.load_weights('../saved_agent/adaptive_linear3.h5')

In [35]:
# Evaluate the policy currently held by `a`: average, over `n_episodes`
# roll-outs from `initial_state`, of the discounted cost
#   dt * sum_k exp(-alpha*dt*k) * (x_k' Q x_k + R u_k^2 + beta * c_k)
# where c_k is 1 on the steps at which a new action is requested, else 0.
n_steps = int(round(time_limit / dt))  # samples on the dt grid covering the horizon
ev = 0

# episodes from same initial point
for episode in range(n_episodes):
    env.reset()
    env.set_state(initial_state)
    state_log = []
    action_log = []
    communication_log = []
    acc_time = 0

    while acc_time < time_limit:
        u, tau = a.forward(env.state)
        # NOTE: the 2-decimal rounding and the 0.01 floor snap tau to the
        # grid only for dt = 0.01.
        tau = np.round(tau, decimals=2)
        tau = np.clip(tau, .01, 10.)
        acc_time += tau
        action_repetition = int(np.round(tau / dt))  # number of dt-steps the action is held for
        for p in range(action_repetition):
            # Only the first step of each hold counts as a communication.
            communication_log.append(1 if p == 0 else 0)
            action_log.append(u)
            # Snapshot the state so later in-place updates cannot alias the log.
            state_log.append(np.array(env.state))
            env.step(np.array([u]), dt, tau)

    # The last hold may overshoot the horizon; keep exactly n_steps samples.
    state_log = state_log[:n_steps]
    action_log = action_log[:n_steps]
    communication_log = communication_log[:n_steps]

    assert len(state_log) == n_steps, f'acc_time = {acc_time}, steps = {len(state_log)}'

    integral = 0
    for k in range(n_steps):
        stage_cost = np.dot(np.dot(state_log[k], Q), state_log[k]) + R * action_log[k]**2
        integral += np.exp(-alpha * dt * k) * (stage_cost + beta * communication_log[k])
    ev += integral
ev = dt * ev / n_episodes

In [36]:
# Show `ev`, the episode-averaged discounted cost computed in the preceding cell.
ev

0.13914920016720142

In [37]:
# Replace only the actor's weights (a.actor) with another saved set;
# nothing else on the agent is reloaded by this line.
a.actor.load_weights('../saved_agent/mb_self_extend.h5')

In [38]:
# Evaluate the actor currently loaded in `a`: average, over `n_episodes`
# roll-outs from `initial_state`, of the discounted cost
#   dt * sum_k exp(-alpha*dt*k) * (x_k' Q x_k + R u_k^2 + beta * c_k)
# where c_k is 1 on the steps at which a new action is requested, else 0.
n_steps = int(round(time_limit / dt))  # samples on the dt grid covering the horizon
ev = 0

# episodes from same initial point
for episode in range(n_episodes):
    env.reset()
    env.set_state(initial_state)
    state_log = []
    action_log = []
    communication_log = []
    acc_time = 0

    while acc_time < time_limit:
        u, tau = a.forward(env.state)
        # NOTE: the 2-decimal rounding and the 0.01 floor snap tau to the
        # grid only for dt = 0.01.
        tau = np.round(tau, decimals=2)
        tau = np.clip(tau, .01, 10.)
        acc_time += tau
        action_repetition = int(np.round(tau / dt))  # number of dt-steps the action is held for
        for p in range(action_repetition):
            # Only the first step of each hold counts as a communication.
            communication_log.append(1 if p == 0 else 0)
            action_log.append(u)
            # Snapshot the state so later in-place updates cannot alias the log.
            state_log.append(np.array(env.state))
            env.step(np.array([u]), dt, tau)

    # The last hold may overshoot the horizon; keep exactly n_steps samples.
    state_log = state_log[:n_steps]
    action_log = action_log[:n_steps]
    communication_log = communication_log[:n_steps]

    assert len(state_log) == n_steps, f'acc_time = {acc_time}, steps = {len(state_log)}'

    integral = 0
    for k in range(n_steps):
        stage_cost = np.dot(np.dot(state_log[k], Q), state_log[k]) + R * action_log[k]**2
        integral += np.exp(-alpha * dt * k) * (stage_cost + beta * communication_log[k])
    ev += integral
ev = dt * ev / n_episodes

In [39]:
# Show `ev`, the episode-averaged discounted cost computed in the preceding cell.
ev

0.16843965638649283

In [40]:
# Discrete-time LQR gain for the environment's (A, B), discretised with the
# same step `dt` that the simulation uses (previously a separate literal .01).
K = gain(env.A, env.B.reshape(2, 1), Q, R, dt)

In [41]:
# LQR baseline: the control is recomputed on every dt-step, so every step
# counts as a communication. Same discounted cost as the agent evaluations:
#   dt * sum_k exp(-alpha*dt*k) * (x_k' Q x_k + R u_k^2 + beta * c_k), c_k = 1.
n_steps = int(round(time_limit / dt))  # samples on the dt grid covering the horizon
ev = 0

# episodes from same initial point
for episode in range(n_episodes):
    env.reset()
    env.set_state(initial_state)
    state_log = []
    action_log = []
    communication_log = []

    for _ in range(n_steps):
        # NOTE(review): applied as u = K x; assumes `gain` already returns the
        # gain with the feedback sign included - confirm against util.
        u = np.dot(K, env.state)
        communication_log.append(1)
        action_log.append(u)
        # Snapshot the state so later in-place updates cannot alias the log.
        state_log.append(np.array(env.state))
        # The update interval equals the simulation step for this baseline.
        env.step(np.array([u]), dt, dt)

    integral = 0
    for k in range(n_steps):
        stage_cost = np.dot(np.dot(state_log[k], Q), state_log[k]) + R * action_log[k]**2
        integral += np.exp(-alpha * dt * k) * (stage_cost + beta * communication_log[k])
    ev += integral
ev = dt * ev / n_episodes

In [42]:
# Show `ev`, the episode-averaged discounted cost of the LQR baseline computed in the preceding cell.
ev

0.315196740205263

In [25]:
# Scratch calculation of exp(-0.04 * 5) = exp(-0.2).
# NOTE(review): this cell's execution count (25) predates every other cell in
# the notebook and nothing reads its result - likely a leftover.
print(np.exp(-0.04*5))

0.8187307530779818
