In [1]:
import numpy as np
import tensorflow as tf
import time
import random

  from ._conv import register_converters as _register_converters


In [2]:
class Episode_experience():
    def __init__(self):
        self.memory = []
        
    def add(self, state, action, reward, next_state, done, goal):
        self.memory += [(state, action, reward, next_state, done, goal)]
        
    def clear(self):
        self.memory = []

class DDPGAgent:
    def __init__(self, state_size, action_size, action_low=-1, action_high=1, gamma=0.98,
                 actor_learning_rate=0.01, critic_learning_rate=0.01, tau=1e-3):
        self.state_size = state_size
        self.action_size = action_size
        self.action_low = action_low
        self.action_high = action_high
        self.gamma = gamma   # discount rate
        self.memory = []
        self.buffer_size = int(5e4)
        self.actor_learning_rate = actor_learning_rate
        self.critic_learning_rate = critic_learning_rate # often larger than actor_learning_rate
        self.tau = tau # soft update
        self.batch_size = 32
        self._construct_nets()
        
    def _construct_nets(self):
        tf.reset_default_graph()
        self.sess = tf.Session()
        
        self.S = tf.placeholder(tf.float32, [None, self.state_size], 'state')
        self.S_ = tf.placeholder(tf.float32, [None, self.state_size], 'next_state')
        self.D = tf.placeholder(tf.float32, [None, ], 'done')
        self.R = tf.placeholder(tf.float32, [None, ], 'r')

        with tf.variable_scope('Actor'):
            self.a = self._build_a(self.S, scope='eval')
            self.a_ = self._build_a(self.S_, scope='target')
        with tf.variable_scope('Critic'):
            q = self._build_c(self.S, self.a, scope='eval')
            q_ = self._build_c(self.S_, self.a_, scope='target')
        
        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
        self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

        self.soft_update_op = [[tf.assign(ta, (1 - self.tau) * ta + self.tau * ea), tf.assign(tc, (1 - self.tau) * tc + self.tau * ec)]
                             for ta, ea, tc, ec in zip(self.at_params, self.ae_params, self.ct_params, self.ce_params)]

        q_target = self.R + self.gamma * (1-self.D) * q_
        # in the feed_dic for the td_error, the self.a should change to actions in memory
        td_error_squared = tf.losses.mean_squared_error(q_target, q)
        self.ctrain = tf.train.AdamOptimizer(self.critic_learning_rate).minimize(td_error_squared, var_list=self.ce_params)
           
        a_loss = - tf.reduce_mean(q)    # maximize the q
        self.atrain = tf.train.AdamOptimizer(self.actor_learning_rate).minimize(a_loss, var_list=self.ae_params)
        
        self.sess.run(tf.global_variables_initializer())
    
    def _build_a(self, s, scope): # policy
        with tf.variable_scope(scope):
            net = tf.layers.dense(s, 64, tf.nn.relu, name='l1')
            a = tf.layers.dense(net, self.action_size, tf.nn.tanh, name='a')
            return a * (self.action_high-self.action_low)/2 + (self.action_high+self.action_low)/2
    
    def _build_c(self, s, a, scope): # Q value
        with tf.variable_scope(scope):
            net = tf.layers.dense(tf.concat([s, a], axis=1), 64, tf.nn.relu)
            return tf.layers.dense(net, 1)  # Q(s,a)
    
    def choose_action(self, state, variance): # normal distribution
        action = self.sess.run(self.a, {self.S: state})[0]
        return np.clip(np.random.normal(action, variance), self.action_low, self.action_high)
    
    def remember(self, ep_experience):
        self.memory += ep_experience.memory
        if len(self.memory) > self.buffer_size:
            self.memory = self.memory[-self.buffer_size:] # empty the first memories
        
    def replay(self, optimization_steps=1):
        if len(self.memory) < self.batch_size: # if there's no enough transitions, do nothing
            return
        
        for _ in range(optimization_steps):
            minibatch = np.vstack(random.sample(self.memory, self.batch_size))
            ss = np.vstack(minibatch[:,0])
            acs = np.vstack(minibatch[:,1])
            rs = minibatch[:,2]
            nss = np.vstack(minibatch[:,3])
            ds = minibatch[:,4]
            gs = np.vstack(minibatch[:,5])
            self.sess.run(self.atrain, {self.S: ss})
            self.sess.run(self.ctrain, {self.S: ss, self.a: acs, self.R: rs,
                                        self.S_: nss, self.D: ds})
    
    def update_target_net(self):
        self.sess.run(self.soft_update_op)

In [3]:
import gym
env = gym.make('MountainCarContinuous-v0')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [4]:
agent = DDPGAgent(2, 1, tau=0.9)

In [6]:
# no param noise
rs = [0.0]

ep_experience = Episode_experience()
state = env.reset()
goal = np.array([0.]) # fixed
variance = 3

start_time = time.clock()
for t in range(int(1e5)):
    action = agent.choose_action([state], variance)
    next_state, reward, done, _ = env.step(action)
    ep_experience.add(state, action, reward, next_state, done, goal)
    agent.remember(ep_experience)  
    ep_experience.clear()
    rs[-1]+=reward
    state = next_state
    if done:
        state = env.reset()
        rs += [0.0]
    if t%50==0:
        variance *= 0.9995
        agent.replay(20)
        agent.update_target_net()
    if done and len(rs)%10==0:
        print("steps", t, "mean r %.2f"%np.mean(rs[-101:-1]), "variance %.2f"%variance)
        
print("Training time : %.2f"%(time.clock()-start_time), "s")

steps 8882 mean r -70.20 variance 2.74
steps 18872 mean r -75.82 variance 2.48
steps 28356 mean r -65.48 variance 2.26
steps 37075 mean r -58.42 variance 2.07
steps 39932 mean r -30.59 variance 2.01
steps 42301 mean r -11.50 variance 1.96
steps 44630 mean r 2.04 variance 1.92
steps 46534 mean r 12.64 variance 1.88
steps 48525 mean r 20.75 variance 1.85
steps 50176 mean r 27.49 variance 1.82
steps 51992 mean r 42.17 variance 1.78
steps 53669 mean r 59.03 variance 1.75
steps 55230 mean r 72.46 variance 1.73
steps 57041 mean r 84.91 variance 1.70
steps 58956 mean r 85.70 variance 1.66
steps 60760 mean r 86.18 variance 1.63
steps 62510 mean r 86.72 variance 1.60
steps 64202 mean r 86.91 variance 1.58
steps 65769 mean r 87.30 variance 1.55
steps 67350 mean r 87.39 variance 1.53
steps 68927 mean r 87.61 variance 1.51
steps 70527 mean r 87.70 variance 1.48
steps 72105 mean r 87.71 variance 1.46
steps 73596 mean r 88.00 variance 1.44
steps 75036 mean r 88.39 variance 1.42
steps 76591 mean r 88

In [9]:
actions = []
for _ in range(5):
    state = env.reset()
    r = 0
    while True:
#         env.render()
        action = agent.choose_action([state], 0)
        next_state, reward, done, _ = env.step(action)
        actions += [action]
        r += reward
        state = next_state
        if done:
            break
    print("reward :", r)

reward : 93.10280089983304
reward : 92.94764781156853
reward : 93.06710769416671
reward : 93.09285169078366
reward : 93.01621074614499
