In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import sys
sys.path.append('../../../module/')

from keras2.models import Model
from keras2.layers import concatenate, Dense, Input, Flatten
from keras2.optimizers import Adam
import csv
from util import *
import gym2
from rl2.agents import selfDDPGAgent, selfDDPGAgent2
from rl2.memory import SequentialMemory
from scipy import linalg as slinalg

Using TensorFlow backend.
Using TensorFlow backend.


In [2]:
# Environment instance; `evaluation` below reads it as a module-level global
# and hands it to `interaction`.
env = gym2.make('Pendulum-v2')

In [3]:
def critic_net(a_shape, s_shape):
    """Build the critic network that maps (action, observation) to one scalar.

    Args:
        a_shape: shape tuple passed directly to the action `Input`.
        s_shape: shape tuple of one observation; the observation `Input`
            is declared with shape `(1,) + s_shape` and flattened.

    Returns:
        A tuple `(critic, action_input)`: the `Model` taking
        `[action_input, observation_input]` and the action input tensor itself.
    """
    action_input = Input(a_shape)
    observation_input = Input(shape=(1,)+s_shape)

    # Join the action with the flattened observation into a single feature vector.
    features = concatenate([action_input, Flatten()(observation_input)])

    # Two identical hidden layers followed by a single linear output unit.
    hidden = features
    for _ in range(2):
        hidden = Dense(16, activation="relu")(hidden)
    q_value = Dense(1, activation="linear")(hidden)

    critic = Model(inputs=[action_input, observation_input], outputs=q_value)
    return (critic, action_input)

def branch_actor(a_shape, s_shape):
    """Build the two-headed actor network.

    Both heads read the same flattened observation and have the same layout
    (Dense 8 relu -> Dense 8 relu -> Dense 1); they differ only in the
    activation of the final unit. The model output is the concatenation
    `[action signal, tau]`.

    Args:
        a_shape: not used by this function; kept for the existing call signature.
        s_shape: shape tuple of one observation; the model input is declared
            with shape `(1,) + s_shape`.

    Returns:
        The actor `Model` with a two-element output per sample.
    """
    observation_input = Input(shape=(1,)+s_shape)
    flat = Flatten()(observation_input)  # effectively the input layer

    def _head(final_activation):
        # One branch: two hidden relu layers, then a single output unit.
        h = Dense(8, activation="relu")(flat)
        h = Dense(8, activation="relu")(h)
        return Dense(1, activation=final_activation)(h)

    # NOTE(review): "multiple_tanh" and "tau_output_large" are not stock Keras
    # activation names; presumably they are registered by the keras2 package — confirm.
    action_head = _head("multiple_tanh")   # action signal
    tau_head = _head("tau_output_large")   # tau

    merged = concatenate([action_head, tau_head])
    return Model(inputs=observation_input, outputs=merged)


In [10]:
# Module-level constants read as globals by `interaction` and `evaluation`.
alpha = 0.1  # rate of the exp(-alpha * t) weighting applied to rewards
beta = 1.  # constant subtracted from the integrated reward of each interaction

In [5]:
def interaction(state, u, tau, env, ln=0):
    """Apply one action for a hold interval and return the accumulated reward.

    The environment is reset and forced to `state`, then stepped with a fixed
    step size of 0.01 until the (clipped) interval `tau` is covered. Each step
    reward is weighted by exp(-alpha * elapsed), summed, scaled by the step
    size, and finally reduced by `beta`. `alpha` and `beta` are module-level
    globals.

    Args:
        state: state to start from; converted with `np.array`.
        u: action value, wrapped as a one-element array for `env.step`.
        tau: hold interval; clipped to [0.01, 10] before use.
        env: environment providing `reset`, `set_state`, `step` and `state`.
        ln: forwarded unchanged as the last argument of `env.step`.

    Returns:
        A tuple `(reward, state1)` where `state1` is `env.state` after the
        last step.
    """
    env.reset()
    env.set_state(np.array(state))

    tau = np.clip(tau, 0.01, 10.)
    dt = .01
    # Smallest number of dt-sized steps whose total duration is at least tau.
    n_steps = int(np.ceil(100 * tau))

    reward = 0
    for step in range(n_steps):
        _, r, _, _ = env.step(np.array([u]), dt, tau, ln)
        # Weight by the time elapsed since the start of this interval.
        r *= np.exp(- alpha * step * dt)
        reward += r

    reward *= dt
    reward -= beta
    return reward, env.state

# Evaluation
def evaluation(actor, init_state = np.array([1,2]), horizon = 30.):
    """Roll out `actor` from `init_state` and return the discounted return.

    At every decision point the actor is queried for `(action, tau)`, the pair
    is executed with `interaction`, and the resulting reward is weighted by
    exp(-alpha * episode_time). The loop ends once the accumulated episode
    time reaches `horizon`. Reads the module-level globals `env` and `alpha`.

    Args:
        actor: model whose `predict_on_batch` returns rows of `[action, tau]`.
        init_state: starting state; reshaped to `(1, 1, 2)` for the actor.
        horizon: episode length at which the rollout stops (default 30.).

    Returns:
        The accumulated discounted reward of the episode.
    """
    x = init_state
    episode_time = 0
    episode_reward = 0
    while True:
        a_agent, tau = actor.predict_on_batch(x.reshape(1,1,2))[0]
        reward, x = interaction(x, a_agent, tau, env, ln=0.)
        episode_reward += np.exp(- alpha * episode_time) * reward
        # `interaction` clips tau to [0.01, 10] before simulating, so advance
        # the clock by the same clipped value. Using the raw actor output would
        # mis-account elapsed time and could loop forever if tau <= 0.
        episode_time += np.clip(tau, 0.01, 10.)
        if episode_time >= horizon:
            break
    return episode_reward

In [19]:
# Build two actors with the same architecture and load separately saved weights:
# `p` from learned_self_proposed6_actor.h5, `m` from sample_02_extend_actor.h5.
p = branch_actor((2,),(2,))
m = branch_actor((2,),(2,))
p.load_weights('../saved_agent/learned_self_proposed6_actor.h5')
m.load_weights('../saved_agent/sample_02_extend_actor.h5')

In [20]:
# Grid of initial states: 10 x 10 points over [-pi, pi] x [-2*pi, 2*pi],
# flattened into 100 (S1, S2) pairs.
s1 = np.linspace(-np.pi, np.pi, 10)
s2 = np.linspace(-2*np.pi, 2*np.pi, 10)
S1, S2 = np.meshgrid(s1, s2)
S1, S2 = S1.flatten(), S2.flatten()

# Episode returns of actors `p` and `m`, one entry per grid point.
ev_p = []
ev_m = []

for i, x in enumerate(zip(S1, S2)):
    x = np.array(x)
    ev_p.append(evaluation(p, init_state=x))
    ev_m.append(evaluation(m, init_state=x))
    # Count the point just finished (i + 1) so the final report is 100%, not 99%.
    print(f'{(i + 1) * 100 // len(S1)}%\r',end='')

99%

In [21]:
# Mean, max and min of the episode returns in `ev_p` (actor `p`) over the initial-state grid.
print(np.mean(ev_p), np.max(ev_p), np.min(ev_p))

-30.573164572488757 -26.790293761817463 -37.0812709906237


In [15]:
# Mean, max and min of the episode returns in `ev_m` (actor `m`) over the initial-state grid.
# NOTE(review): this cell's execution count (In [15]) is lower than that of the cells that
# build `m` and `ev_m` (In [19], In [20]), so the recorded output may be stale — re-run to confirm.
print(np.mean(ev_m), np.max(ev_m), np.min(ev_m))

-62.492721990335504 -58.54300170128275 -66.33503723183593
