## Deep Deterministic Policy Gradient
Implementation followed: Continuous control with deep reinforcement learning (arXiv:1509.02971v5)
- Memory Relay
- A3C
- Trained with a target net
- Initial exploration policy is quite important to warm up the net

In [1]:
import datetime
print(datetime.datetime.now().isoformat())

2017-08-26T21:22:27.814572


In [2]:
import numpy as np
import tensorflow as tf
from functools import partial

In [3]:
class Actor(object):
    def __init__(self, n_observation, n_action, name='actor_net'):
        self.n_observation = n_observation
        self.n_action = n_action
        self.name = name
        self.sess = None
        self.build_model()
        self.build_train()
        
    def build_model(self):
        activation = tf.nn.elu
        kernel_initializer = tf.contrib.layers.variance_scaling_initializer()
        kernel_regularizer = tf.contrib.layers.l2_regularizer(0.01)
        default_dense = partial(tf.layers.dense,\
                                activation=activation,\
                                kernel_initializer=kernel_initializer,\
                                kernel_regularizer=kernel_regularizer)
        with tf.variable_scope(self.name) as scope:
            observation = tf.placeholder(tf.float32,shape=[None,self.n_observation])
            hid1 = default_dense(observation,32)
            hid2 = default_dense(hid1,64)
            action = default_dense(hid2,self.n_action,activation=tf.nn.tanh,use_bias=False)
            trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope=self.name)
        self.observation,self.action,self.trainable_vars = observation,action,trainable_vars
        
    def build_train(self,learning_rate = 0.0001):
        with tf.variable_scope(self.name) as scope:
            action_grads = tf.placeholder(tf.float32,[None,self.n_action])
            var_grads = tf.gradients(self.action,self.trainable_vars,-action_grads)
            train_op = tf.train.AdamOptimizer(learning_rate).apply_gradients(zip(var_grads,self.trainable_vars))
        self.action_grads,self.train_op = action_grads,train_op
        
    def predict_action(self,obs_batch):
        return self.action.eval(session=self.sess,feed_dict={self.observation:obs_batch})

    def train(self,obs_batch,action_grads):
        batch_size = len(action_grads)
        self.train_op.run(session=self.sess,feed_dict={self.observation:obs_batch,self.action_grads:action_grads/batch_size})
        
    def set_session(self,sess):
        self.sess = sess
        
    def get_trainable_dict(self):
        return {var.name[len(self.name):]: var for var in self.trainable_vars}

In [4]:
class Critic(object):
    def __init__(self, n_observation, n_action, name='critic_net'):
        self.n_observation = n_observation
        self.n_action = n_action
        self.name = name
        self.sess = None
        self.build_model()
        self.build_train()
        
    def build_model(self):
        activation = tf.nn.elu
        kernel_initializer = tf.contrib.layers.variance_scaling_initializer()
        kernel_regularizer = tf.contrib.layers.l2_regularizer(0.01)
        default_dense = partial(tf.layers.dense,\
                                activation=activation,\
                                kernel_initializer=kernel_initializer,\
                                kernel_regularizer=kernel_regularizer)
        with tf.variable_scope(self.name) as scope:
            observation = tf.placeholder(tf.float32,shape=[None,self.n_observation])
            action = tf.placeholder(tf.float32,shape=[None,self.n_action])
            hid1 = default_dense(observation,32)
            hid2 = default_dense(action,32)
            hid3 = tf.concat([hid1,hid2],axis=1)
            hid4 = default_dense(hid3,128)
            Q = default_dense(hid4,1, activation=None)
            trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope=self.name)
        self.observation,self.action,self.Q,self.trainable_vars= observation,action,Q,trainable_vars
    
    def build_train(self,learning_rate=0.001):
        with tf.variable_scope(self.name) as scope:
            Qexpected = tf.placeholder(tf.float32,shape=[None,1])
            loss = tf.losses.mean_squared_error(Qexpected,self.Q)
            optimizer = tf.train.AdamOptimizer(learning_rate)
            train_op = optimizer.minimize(loss)
        self.Qexpected,self.train_op = Qexpected,train_op
        self.action_grads = tf.gradients(self.Q,self.action)[0]
    
    def predict_Q(self,obs_batch,action_batch):
        return self.Q.eval(session=self.sess,\
                           feed_dict={self.observation:obs_batch,self.action:action_batch})
    
    def compute_action_grads(self,obs_batch,action_batch):
        return self.action_grads.eval(session=self.sess,\
                               feed_dict={self.observation:obs_batch,self.action:action_batch})
    def train(self,obs_batch,action_batch,Qexpected_batch):
        self.train_op.run(session=self.sess,\
                          feed_dict={self.observation:obs_batch,self.action:action_batch,self.Qexpected:Qexpected_batch})
    
    def set_session(self,sess):
        self.sess = sess
        
    def get_trainable_dict(self):
        return {var.name[len(self.name):]: var for var in self.trainable_vars}

In [5]:
class AsyncNets(object):
    def __init__(self,class_name):
        class_ = eval(class_name)
        self.net = class_(3,1,name=class_name)
        self.target_net = class_(3,1,name='{}_target'.format(class_name))
        self.TAU = tf.placeholder(tf.float32,shape=None)
        self.sess = None
        self.__build_async_assign()
    
    def __build_async_assign(self):
        net_dict = self.net.get_trainable_dict()
        target_net_dict = self.target_net.get_trainable_dict()
        keys = net_dict.keys()
        async_update_op = [target_net_dict[key].assign((1-self.TAU)*target_net_dict[key]+self.TAU*net_dict[key]) \
                           for key in keys]
        self.async_update_op = async_update_op
    
    def async_update(self,tau=0.01):
        self.sess.run(self.async_update_op,feed_dict={self.TAU:tau})
    
    def set_session(self,sess):
        self.sess = sess
        self.net.set_session(sess)
        self.target_net.set_session(sess)
    
    def get_subnets(self):
        return self.net, self.target_net
    
        

In [6]:
from collections import deque
class Memory(object):
    def __init__(self,memory_size=10000):
        self.memory = deque(maxlen=memory_size)
        self.memory_size = memory_size
        
    def __len__(self):
        return len(self.memory)
    
    def append(self,item):
        self.memory.append(item)
        
    def sample_batch(self,batch_size=256):
        idx = np.random.permutation(len(self.memory))[:batch_size]
        return [self.memory[i] for i in idx]

In [7]:
def UONoise():
    theta = 0.15
    sigma = 0.2
    state = 0
    while True:
        yield state
        state += -theta*state+sigma*np.random.randn()

In [8]:
import gym
from gym import wrappers
max_episode = 500
gamma = 0.99
tau = 0.001
memory_size = 10000
batch_size = 256
memory_warmup = batch_size*3
max_explore_eps = 100
save_path = 'DDPG_net_Class.ckpt'

tf.reset_default_graph()
actorAsync = AsyncNets('Actor')
actor,actor_target = actorAsync.get_subnets()
criticAsync = AsyncNets('Critic')
critic,critic_target = criticAsync.get_subnets()

init = tf.global_variables_initializer()
saver = tf.train.Saver()
with tf.Session() as sess:
    init.run()
    actorAsync.set_session(sess)
    criticAsync.set_session(sess)
    env = gym.make('Pendulum-v0')
    env = wrappers.Monitor(env,'./tmp/',force=True)
    obs = env.reset()
    iteration = 0
    episode = 0
    episode_score = 0
    episode_steps = 0
    noise = UONoise()
    memory = Memory(memory_size)
    while episode < max_episode:
        print('\riter {}, ep {}'.format(iteration,episode),end='')
        action = actor.predict_action(np.reshape(obs,[1,-1]))[0]
        if episode<max_explore_eps: # exploration policy
            p = episode/max_explore_eps
            action = action*p + (1-p)*next(noise)
        action *= 2 # scale action
        next_obs, reward, done,info = env.step(action)
        memory.append([obs,action,reward,next_obs,done])
        if iteration >= memory_warmup:
            memory_batch = memory.sample_batch(batch_size)
            extract_mem = lambda k : np.array([item[k] for item in memory_batch])
            obs_batch = extract_mem(0)
            action_batch = extract_mem(1)
            reward_batch = extract_mem(2)
            next_obs_batch = extract_mem(3)
            done_batch = extract_mem(4)
            action_next = actor_target.predict_action(next_obs_batch)
            Q_next = critic_target.predict_Q(next_obs_batch,action_next)[:,0]
            Qexpected_batch = reward_batch + gamma*(1-done_batch)*Q_next # target Q value
            Qexpected_batch = np.reshape(Qexpected_batch,[-1,1])
            # train critic
            critic.train(obs_batch,action_batch,Qexpected_batch)
            # train actor
            action_grads = critic.compute_action_grads(obs_batch,action_batch)
            actor.train(obs_batch,action_grads)
            # async update
            actorAsync.async_update(tau)
            criticAsync.async_update(tau)
        episode_score += reward
        episode_steps += 1
        iteration += 1
        if done:
            print(', score {:8f}, steps {}'.format(episode_score,episode_steps))
#             if episode%5 == 0:
                
#                 Q_check = 
            obs = env.reset()
            episode += 1
            episode_score = 0
            episode_steps = 0
            noise = UONoise()
            if episode%100==0:
                saver.save(sess,save_path)
        else:
            obs = next_obs
env.close()

[2017-08-26 21:22:36,925] Making new env: Pendulum-v0
[2017-08-26 21:22:36,973] Clearing 18 monitor files from previous run (because force=True was provided)
[2017-08-26 21:22:36,975] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.0.1408.video000000.mp4


iter 193, ep 0

[2017-08-26 21:22:41,880] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.0.1408.video000001.mp4


iter 199, ep 0, score -1064.337803, steps 200
iter 399, ep 1, score -1452.929239, steps 200
iter 599, ep 2, score -900.730704, steps 200
iter 799, ep 3, score -979.678247, steps 200
iter 999, ep 4, score -1464.379648, steps 200
iter 1199, ep 5, score -964.648093, steps 200
iter 1399, ep 6, score -1625.818235, steps 200
iter 1591, ep 7

[2017-08-26 21:22:51,209] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.0.1408.video000008.mp4


iter 1599, ep 7, score -1263.597151, steps 200
iter 1799, ep 8, score -1451.116410, steps 200
iter 1999, ep 9, score -992.277210, steps 200
iter 2199, ep 10, score -1620.274533, steps 200
iter 2399, ep 11, score -1731.880773, steps 200
iter 2599, ep 12, score -1094.504865, steps 200
iter 2799, ep 13, score -1741.083284, steps 200
iter 2999, ep 14, score -1442.242095, steps 200
iter 3199, ep 15, score -1098.027994, steps 200
iter 3399, ep 16, score -1098.776404, steps 200
iter 3599, ep 17, score -1798.072430, steps 200
iter 3799, ep 18, score -1092.271777, steps 200
iter 3999, ep 19, score -1021.271582, steps 200
iter 4199, ep 20, score -1223.411688, steps 200
iter 4399, ep 21, score -1309.548927, steps 200
iter 4599, ep 22, score -1027.574288, steps 200
iter 4799, ep 23, score -1188.868957, steps 200
iter 4999, ep 24, score -1140.436419, steps 200
iter 5199, ep 25, score -1235.387716, steps 200
iter 5391, ep 26

[2017-08-26 21:23:18,553] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.0.1408.video000027.mp4


iter 5399, ep 26, score -987.548204, steps 200
iter 5599, ep 27, score -1619.530272, steps 200
iter 5799, ep 28, score -968.738765, steps 200
iter 5999, ep 29, score -988.404297, steps 200
iter 6199, ep 30, score -858.700286, steps 200
iter 6399, ep 31, score -1725.719085, steps 200
iter 6599, ep 32, score -1411.769340, steps 200
iter 6799, ep 33, score -1499.526679, steps 200
iter 6999, ep 34, score -1111.330258, steps 200
iter 7199, ep 35, score -1268.784305, steps 200
iter 7399, ep 36, score -1227.741326, steps 200
iter 7599, ep 37, score -1479.133063, steps 200
iter 7799, ep 38, score -1403.646349, steps 200
iter 7999, ep 39, score -1484.729898, steps 200
iter 8199, ep 40, score -1298.216238, steps 200
iter 8399, ep 41, score -1098.002777, steps 200
iter 8599, ep 42, score -1182.987359, steps 200
iter 8799, ep 43, score -1471.492544, steps 200
iter 8999, ep 44, score -1211.134201, steps 200
iter 9199, ep 45, score -1132.366484, steps 200
iter 9399, ep 46, score -1160.883524, steps 

[2017-08-26 21:24:08,642] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.0.1408.video000064.mp4


iter 12799, ep 63, score -514.583627, steps 200
iter 12999, ep 64, score -492.792294, steps 200
iter 13199, ep 65, score -0.809584, steps 200
iter 13399, ep 66, score -128.301171, steps 200
iter 13599, ep 67, score -129.260956, steps 200
iter 13799, ep 68, score -377.287180, steps 200
iter 13999, ep 69, score -368.860775, steps 200
iter 14199, ep 70, score -123.629120, steps 200
iter 14399, ep 71, score -384.817258, steps 200
iter 14599, ep 72, score -0.677299, steps 200
iter 14799, ep 73, score -370.016429, steps 200
iter 14999, ep 74, score -259.441145, steps 200
iter 15199, ep 75, score -237.910846, steps 200
iter 15399, ep 76, score -119.175965, steps 200
iter 15599, ep 77, score -516.978731, steps 200
iter 15799, ep 78, score -1344.161227, steps 200
iter 15999, ep 79, score -0.807787, steps 200
iter 16199, ep 80, score -114.978708, steps 200
iter 16399, ep 81, score -768.592530, steps 200
iter 16599, ep 82, score -128.405298, steps 200
iter 16799, ep 83, score -0.127020, steps 200

[2017-08-26 21:25:33,603] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.0.1408.video000125.mp4


iter 24999, ep 124, score -260.784997, steps 200
iter 25199, ep 125, score -125.729126, steps 200
iter 25399, ep 126, score -260.855328, steps 200
iter 25599, ep 127, score -385.974153, steps 200
iter 25799, ep 128, score -127.646294, steps 200
iter 25999, ep 129, score -250.428526, steps 200
iter 26199, ep 130, score -270.953140, steps 200
iter 26399, ep 131, score -389.787544, steps 200
iter 26599, ep 132, score -250.267412, steps 200
iter 26799, ep 133, score -257.369616, steps 200
iter 26999, ep 134, score -375.464715, steps 200
iter 27199, ep 135, score -335.487404, steps 200
iter 27399, ep 136, score -262.835743, steps 200
iter 27599, ep 137, score -492.912502, steps 200
iter 27799, ep 138, score -259.321548, steps 200
iter 27999, ep 139, score -364.541715, steps 200
iter 28199, ep 140, score -129.515003, steps 200
iter 28399, ep 141, score -240.775872, steps 200
iter 28599, ep 142, score -243.228757, steps 200
iter 28799, ep 143, score -245.208978, steps 200
iter 28999, ep 144, 

[2017-08-26 21:27:37,112] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.0.1408.video000216.mp4


iter 43199, ep 215, score -125.225655, steps 200
iter 43399, ep 216, score -3.873638, steps 200
iter 43599, ep 217, score -122.130962, steps 200
iter 43799, ep 218, score -124.214464, steps 200
iter 43999, ep 219, score -127.464848, steps 200
iter 44199, ep 220, score -360.385836, steps 200
iter 44399, ep 221, score -240.291437, steps 200
iter 44599, ep 222, score -240.390900, steps 200
iter 44799, ep 223, score -124.180430, steps 200
iter 44999, ep 224, score -116.245345, steps 200
iter 45199, ep 225, score -117.838239, steps 200
iter 45399, ep 226, score -124.368810, steps 200
iter 45599, ep 227, score -125.467418, steps 200
iter 45799, ep 228, score -120.172418, steps 200
iter 45999, ep 229, score -118.514655, steps 200
iter 46199, ep 230, score -5.093049, steps 200
iter 46399, ep 231, score -135.675310, steps 200
iter 46599, ep 232, score -258.219327, steps 200
iter 46799, ep 233, score -233.724384, steps 200
iter 46999, ep 234, score -5.685182, steps 200
iter 47199, ep 235, score 

[2017-08-26 21:30:27,232] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.0.1408.video000343.mp4


iter 68599, ep 342, score -366.539098, steps 200
iter 68799, ep 343, score -119.283715, steps 200
iter 68999, ep 344, score -264.026636, steps 200
iter 69199, ep 345, score -127.457260, steps 200
iter 69399, ep 346, score -126.855899, steps 200
iter 69599, ep 347, score -116.423614, steps 200
iter 69799, ep 348, score -365.954037, steps 200
iter 69999, ep 349, score -267.084830, steps 200
iter 70199, ep 350, score -125.671618, steps 200
iter 70399, ep 351, score -368.480243, steps 200
iter 70599, ep 352, score -235.926462, steps 200
iter 70799, ep 353, score -125.897110, steps 200
iter 70999, ep 354, score -298.170883, steps 200
iter 71199, ep 355, score -336.450492, steps 200
iter 71399, ep 356, score -358.646415, steps 200
iter 71599, ep 357, score -129.539452, steps 200
iter 71799, ep 358, score -126.687282, steps 200
iter 71999, ep 359, score -128.661384, steps 200
iter 72199, ep 360, score -124.836461, steps 200
iter 72399, ep 361, score -239.609822, steps 200
iter 72599, ep 362, 

[2017-08-26 21:34:01,853] Finished writing results. You can upload them to the scoreboard via gym.upload('/Users/winter/Google Drive/handson-ml/tmp')


In [9]:
gym.upload('./tmp/', api_key='sk_BlwjttPKR6ZsXVrObENYA')

[2017-08-26 21:34:01,863] [Pendulum-v0] Uploading 500 episodes of training data
[2017-08-26 21:34:03,713] [Pendulum-v0] Uploading videos of 8 training episodes (628501 bytes)
[2017-08-26 21:34:05,040] [Pendulum-v0] Creating evaluation object from ./tmp/ with learning curve and training video
[2017-08-26 21:34:05,260] 
****************************************************
You successfully uploaded your evaluation on Pendulum-v0 to
OpenAI Gym! You can find it at:

    https://gym.openai.com/evaluations/eval_ZVyGQYhVTb67h0Vu6UtOYQ

****************************************************
