In [1]:
import numpy as np
import tensorflow as tf
import time
import random

  from ._conv import register_converters as _register_converters


In [2]:
class Episode_experience():
    """Temporary holder for transitions before they are handed to an agent.

    Each stored item is a ``(state, action, reward, next_state, done)`` tuple
    kept in insertion order in ``self.memory``.
    """

    def __init__(self):
        self.memory = list()

    def add(self, state, action, reward, next_state, done):
        """Append one transition tuple to the end of the buffer."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def clear(self):
        """Drop all stored transitions by rebinding ``memory`` to a new empty list.

        The previous list object is not mutated, so any outside reference to it
        keeps its contents.
        """
        self.memory = list()

class DDPGAgent:
    """Deep Deterministic Policy Gradient agent on the TensorFlow 1.x graph API.

    The agent owns four small networks (actor/critic, each with an ``eval`` and
    a ``target`` copy), a replay buffer stored as a plain list of
    ``(state, action, reward, next_state, done)`` tuples, and a ``tf.Session``.

    Args:
        state_size: number of state features (width of the state placeholder).
        action_size: number of action dimensions produced by the actor.
        action_low, action_high: bounds the actor output is scaled to and that
            noisy actions are clipped to.
        gamma: discount rate used in the critic target.
        actor_learning_rate, critic_learning_rate: Adam step sizes.
        tau: soft-update mixing factor for the target networks.
        buffer_size: maximum number of transitions kept in the replay buffer.
        batch_size: number of transitions sampled per optimization step.
    """

    def __init__(self, state_size, action_size, action_low=-1, action_high=1, gamma=0.98,
                 actor_learning_rate=1e-3, critic_learning_rate=1e-3, tau=1e-3,
                 buffer_size=int(5e4), batch_size=64):
        self.state_size = state_size
        self.action_size = action_size
        self.action_low = action_low
        self.action_high = action_high
        self.gamma = gamma   # discount rate
        self.memory = []
        self.buffer_size = int(buffer_size)
        self.actor_learning_rate = actor_learning_rate
        self.critic_learning_rate = critic_learning_rate # often larger than actor_learning_rate
        self.tau = tau # soft update
        self.batch_size = int(batch_size)
        self._construct_nets()

    def _construct_nets(self):
        """Build placeholders, networks, losses and train ops, then initialize variables."""
        # Start from a fresh default graph so the fixed scope names below do not
        # collide with variables left over from an earlier construction.
        tf.reset_default_graph()
        self.sess = tf.Session()

        self.S = tf.placeholder(tf.float32, [None, self.state_size], 'state')
        self.S_ = tf.placeholder(tf.float32, [None, self.state_size], 'next_state')
        self.D = tf.placeholder(tf.float32, [None, 1], 'done')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')

        with tf.variable_scope('Actor'):
            self.a = self._build_a(self.S, scope='eval')
            self.a_ = self._build_a(self.S_, scope='target')
        with tf.variable_scope('Critic'):
            q = self._build_c(self.S, self.a, scope='eval')
            q_ = self._build_c(self.S_, self.a_, scope='target')

        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
        self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

        # Pair target/eval variables per network. Zipping the actor and critic
        # lists together would silently drop variables as soon as the two
        # networks stop having the same number of variables.
        target_params = self.at_params + self.ct_params
        eval_params = self.ae_params + self.ce_params
        self.soft_update_op = [tf.assign(target, (1 - self.tau) * target + self.tau * online)
                               for target, online in zip(target_params, eval_params)]

        # Do not bootstrap from the next state when the transition is flagged done.
        q_target = self.R + self.gamma * (1.0 - self.D) * q_
        td_error_squared = tf.losses.mean_squared_error(q_target, q)
        self.ctrain = tf.train.AdamOptimizer(self.critic_learning_rate).minimize(td_error_squared, var_list=self.ce_params)

        a_loss = - tf.reduce_mean(q)    # maximize the q
        self.atrain = tf.train.AdamOptimizer(self.actor_learning_rate).minimize(a_loss, var_list=self.ae_params)

        self.sess.run(tf.global_variables_initializer())

    def _build_a(self, s, scope): # policy
        """Actor: one 32-unit ReLU layer, then a tanh head rescaled to [action_low, action_high]."""
        with tf.variable_scope(scope):
            net = tf.layers.dense(s, 32, tf.nn.relu)
            a = tf.layers.dense(net, self.action_size, tf.nn.tanh, name='a')
            # tanh lies in [-1, 1]; scale by the half-range and shift by the midpoint.
            return a * (self.action_high-self.action_low)/2 + (self.action_high+self.action_low)/2

    def _build_c(self, s, a, scope): # Q value
        """Critic: concatenates state and action, one 32-unit ReLU layer, scalar output."""
        with tf.variable_scope(scope):
            net = tf.layers.dense(tf.concat([s, a], axis=1), 32, tf.nn.relu)
            return tf.layers.dense(net, 1)  # Q(s,a)

    def choose_action(self, state, variance): # normal distribution
        """Return the actor's action for ``state`` with Gaussian noise, clipped to the bounds.

        Args:
            state: batch of states fed to the state placeholder; only the action
                for the first row is used.
            variance: passed as the ``scale`` argument of ``np.random.normal``,
                i.e. it acts as the standard deviation of the noise despite its
                name. ``0`` yields the deterministic action.
        """
        action = self.sess.run(self.a, {self.S: state})[0]
        return np.clip(np.random.normal(action, variance), self.action_low, self.action_high)

    def remember(self, ep_experience):
        """Append the transitions in ``ep_experience.memory`` and drop the oldest overflow."""
        self.memory += ep_experience.memory
        overflow = len(self.memory) - self.buffer_size
        if overflow > 0:
            del self.memory[:overflow] # empty the first memories

    def _sample_batch(self):
        """Sample ``batch_size`` transitions and return them as 2-D float32 arrays.

        Columns are unpacked with ``zip`` instead of stacking the raw tuples:
        the tuples mix arrays and scalars, and building one array from such
        ragged rows raises ``ValueError`` on NumPy >= 1.24.
        """
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        ss = np.asarray(states, dtype=np.float32).reshape(-1, self.state_size)
        acs = np.asarray(actions, dtype=np.float32).reshape(-1, self.action_size)
        rs = np.asarray(rewards, dtype=np.float32).reshape(-1, 1)
        nss = np.asarray(next_states, dtype=np.float32).reshape(-1, self.state_size)
        ds = np.asarray(dones, dtype=np.float32).reshape(-1, 1)
        return ss, acs, rs, nss, ds

    def replay(self, optimization_steps=1):
        """Run ``optimization_steps`` actor+critic updates on freshly sampled minibatches.

        Does nothing while the buffer holds fewer than ``batch_size`` transitions.
        """
        if len(self.memory) < self.batch_size: # if there's no enough transitions, do nothing
            return

        for _ in range(optimization_steps):
            ss, acs, rs, nss, ds = self._sample_batch()
            self.sess.run(self.atrain, {self.S: ss})
            # Feeding self.a overrides the actor output with the stored actions.
            self.sess.run(self.ctrain, {self.S: ss, self.a: acs, self.R: rs,
                                        self.S_: nss, self.D: ds})

    def update_target_net(self):
        """Move every target variable a fraction ``tau`` toward its eval counterpart."""
        self.sess.run(self.soft_update_op)

In [3]:
import gym
# Create the Gym environment registered under the id 'Pendulum-v0';
# the training and evaluation cells below interact with it through `env`.
env = gym.make('Pendulum-v0')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [4]:
# Agent for a 3-feature state and a 1-dimensional action bounded to [-2, 2];
# tau=0.1 replaces the constructor's default soft-update factor.
agent = DDPGAgent(
    state_size=3,
    action_size=1,
    action_low=-2,
    action_high=2,
    tau=0.1,
)

In [5]:
# no param noise
# Train the agent for 1e5 environment steps with Gaussian action noise that
# shrinks over time.
rs = [0.0]  # per-episode returns; the last entry is the episode still in progress

ep_experience = Episode_experience()
state = env.reset()
variance = 3  # exploration noise level handed to agent.choose_action

# time.clock() was removed in Python 3.8; perf_counter() is the documented
# replacement and reports elapsed time, including time spent sleeping.
start_time = time.perf_counter()
for t in range(int(1e5)):
    action = agent.choose_action([state], variance)
    next_state, reward, done, _ = env.step(action)
    # Push the single transition into the agent's replay buffer right away.
    ep_experience.add(state, action, reward, next_state, done)
    agent.remember(ep_experience)
    ep_experience.clear()
    rs[-1]+=reward
    state = next_state
    if done:
        state = env.reset()
        rs += [0.0]
    if t%50==0:
        # Every 50 steps: decay the noise, run 20 optimization steps, sync targets.
        variance *= 0.9995
        agent.replay(20)
        agent.update_target_net()
    if done and len(rs)%10==0:
        # rs[-101:-1] skips the freshly appended 0.0 of the next episode.
        print("steps", t, "mean r %.2f"%np.mean(rs[-101:-1]), "variance %.2f"%variance)
        
print("Training time : %.2f"%(time.perf_counter()-start_time), "s")

steps 1799 mean r -1345.00 variance 2.95
steps 3799 mean r -1417.73 variance 2.89
steps 5799 mean r -1374.66 variance 2.83
steps 7799 mean r -1327.30 variance 2.77
steps 9799 mean r -1271.10 variance 2.72
steps 11799 mean r -1259.13 variance 2.67
steps 13799 mean r -1251.02 variance 2.61
steps 15799 mean r -1249.86 variance 2.56
steps 17799 mean r -1243.24 variance 2.51
steps 19799 mean r -1205.25 variance 2.46
steps 21799 mean r -1151.89 variance 2.41
steps 23799 mean r -1085.62 variance 2.36
steps 25799 mean r -1057.91 variance 2.32
steps 27799 mean r -1027.57 variance 2.27
steps 29799 mean r -981.96 variance 2.23
steps 31799 mean r -925.45 variance 2.18
steps 33799 mean r -892.51 variance 2.14
steps 35799 mean r -862.26 variance 2.10
steps 37799 mean r -804.31 variance 2.06
steps 39799 mean r -767.44 variance 2.01
steps 41799 mean r -764.63 variance 1.97
steps 43799 mean r -732.08 variance 1.94
steps 45799 mean r -685.02 variance 1.90
steps 47799 mean r -635.23 variance 1.86
steps 4

In [6]:
# Evaluate the trained policy for 5 episodes with zero exploration noise and
# print the undiscounted return of each one.
actions = []
for episode in range(5):
    state = env.reset()
    r = 0
    done = False
    while not done:
        # env.render()
        action = agent.choose_action([state], 0)
        next_state, reward, done, _ = env.step(action)
        r += reward
        state = next_state
    print("reward :", r)

reward : -240.84491210553813
reward : -118.7125619502614
reward : -233.52466355028764
reward : -128.06473817218165
reward : -225.27557628067126
