## Deep Deterministic Policy Gradient
Implementation followed: Continuous control with deep reinforcement learning (arXiv:1509.02971v5)
- Memory Relay
- A3C
- Trained with a target net
- Initial exploration policy is quite important to warm up the net

In [11]:
import datetime
print(datetime.datetime.now().isoformat())

2017-08-26T20:05:05.197940


In [1]:
import numpy as np
import tensorflow as tf
from functools import partial

In [2]:
class Actor(object):
    def __init__(self, n_observation, n_action, name='actor_net'):
        self.n_observation = n_observation
        self.n_action = n_action
        self.name = name
        self.sess = None
        self.build_model()
        self.build_train()
        
    def build_model(self):
        activation = tf.nn.elu
        kernel_initializer = tf.contrib.layers.variance_scaling_initializer()
        kernel_regularizer = tf.contrib.layers.l2_regularizer(0.1)
        default_dense = partial(tf.layers.dense,\
                                activation=activation,\
                                kernel_initializer=kernel_initializer,\
                                kernel_regularizer=kernel_regularizer)
        with tf.variable_scope(self.name) as scope:
            observation = tf.placeholder(tf.float32,shape=[None,self.n_observation])
            hid1 = default_dense(observation,32)
            hid2 = default_dense(hid1,64)
            action = default_dense(hid2,self.n_action,activation=tf.nn.tanh,use_bias=False)
            trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope=self.name)
        self.observation,self.action,self.trainable_vars = observation,action,trainable_vars
        
    def build_train(self,learning_rate = 0.0001):
        with tf.variable_scope(self.name) as scope:
            action_grads = tf.placeholder(tf.float32,[None,self.n_action])
            var_grads = tf.gradients(self.action,self.trainable_vars,-action_grads)
            train_op = tf.train.AdamOptimizer(learning_rate).apply_gradients(zip(var_grads,self.trainable_vars))
        self.action_grads,self.train_op = action_grads,train_op
        
    def predict_action(self,obs_batch):
        return self.action.eval(session=self.sess,feed_dict={self.observation:obs_batch})

    def train(self,obs_batch,action_grads):
        batch_size = len(action_grads)
        self.train_op.run(session=self.sess,feed_dict={self.observation:obs_batch,self.action_grads:action_grads/batch_size})
        
    def set_session(self,sess):
        self.sess = sess
        
    def get_trainable_dict(self):
        return {var.name[len(self.name):]: var for var in self.trainable_vars}

In [3]:
class Critic(object):
    def __init__(self, n_observation, n_action, name='critic_net'):
        self.n_observation = n_observation
        self.n_action = n_action
        self.name = name
        self.sess = None
        self.build_model()
        self.build_train()
        
    def build_model(self):
        activation = tf.nn.elu
        kernel_initializer = tf.contrib.layers.variance_scaling_initializer()
        kernel_regularizer = tf.contrib.layers.l2_regularizer(0.1)
        default_dense = partial(tf.layers.dense,\
                                activation=activation,\
                                kernel_initializer=kernel_initializer,\
                                kernel_regularizer=kernel_regularizer)
        with tf.variable_scope(self.name) as scope:
            observation = tf.placeholder(tf.float32,shape=[None,self.n_observation])
            action = tf.placeholder(tf.float32,shape=[None,self.n_action])
            hid1 = default_dense(observation,32)
            hid2 = default_dense(action,32)
            hid3 = tf.concat([hid1,hid2],axis=1)
            hid4 = default_dense(hid3,128)
            Q = default_dense(hid4,1, activation=None)
            trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope=self.name)
        self.observation,self.action,self.Q,self.trainable_vars= observation,action,Q,trainable_vars
    
    def build_train(self,learning_rate=0.001):
        with tf.variable_scope(self.name) as scope:
            Qexpected = tf.placeholder(tf.float32,shape=[None,1])
            loss = tf.losses.mean_squared_error(Qexpected,self.Q)
            optimizer = tf.train.AdamOptimizer(learning_rate)
            train_op = optimizer.minimize(loss)
        self.Qexpected,self.train_op = Qexpected,train_op
        self.action_grads = tf.gradients(self.Q,self.action)[0]
    
    def predict_Q(self,obs_batch,action_batch):
        return self.Q.eval(session=self.sess,\
                           feed_dict={self.observation:obs_batch,self.action:action_batch})
    
    def compute_action_grads(self,obs_batch,action_batch):
        return self.action_grads.eval(session=self.sess,\
                               feed_dict={self.observation:obs_batch,self.action:action_batch})
    def train(self,obs_batch,action_batch,Qexpected_batch):
        self.train_op.run(session=self.sess,\
                          feed_dict={self.observation:obs_batch,self.action:action_batch,self.Qexpected:Qexpected_batch})
    
    def set_session(self,sess):
        self.sess = sess
        
    def get_trainable_dict(self):
        return {var.name[len(self.name):]: var for var in self.trainable_vars}

In [4]:
class AsyncNets(object):
    def __init__(self,class_name):
        class_ = eval(class_name)
        self.net = class_(2,1,name=class_name)
        self.target_net = class_(2,1,name='{}_target'.format(class_name))
        self.TAU = tf.placeholder(tf.float32,shape=None)
        self.sess = None
        self.__build_async_assign()
    
    def __build_async_assign(self):
        net_dict = self.net.get_trainable_dict()
        target_net_dict = self.target_net.get_trainable_dict()
        keys = net_dict.keys()
        async_update_op = [target_net_dict[key].assign((1-self.TAU)*target_net_dict[key]+self.TAU*net_dict[key]) \
                           for key in keys]
        self.async_update_op = async_update_op
    
    def async_update(self,tau=0.01):
        self.sess.run(self.async_update_op,feed_dict={self.TAU:tau})
    
    def set_session(self,sess):
        self.sess = sess
        self.net.set_session(sess)
        self.target_net.set_session(sess)
    
    def get_subnets(self):
        return self.net, self.target_net
    
        

In [5]:
from collections import deque
class Memory(object):
    def __init__(self,memory_size=10000):
        self.memory = deque(maxlen=memory_size)
        self.memory_size = memory_size
        
    def __len__(self):
        return len(self.memory)
    
    def append(self,item):
        self.memory.append(item)
        
    def sample_batch(self,batch_size=256):
        idx = np.random.permutation(len(self.memory))[:batch_size]
        return [self.memory[i] for i in idx]

In [6]:
def UONoise():
    theta = 0.15
    sigma = 0.2
    state = 0
    while True:
        yield state
        state += -theta*state+sigma*np.random.randn()

In [9]:
import gym
from gym import wrappers
max_episode = 200
gamma = 0.99
tau = 0.001
memory_size = 10000
batch_size = 256
memory_warmup = batch_size*3
max_explore_eps = 100
save_path = 'DDPG_net_Class.ckpt'

tf.reset_default_graph()
actorAsync = AsyncNets('Actor')
actor,actor_target = actorAsync.get_subnets()
criticAsync = AsyncNets('Critic')
critic,critic_target = criticAsync.get_subnets()

init = tf.global_variables_initializer()
saver = tf.train.Saver()
with tf.Session() as sess:
    init.run()
    actorAsync.set_session(sess)
    criticAsync.set_session(sess)
    env = gym.make('MountainCarContinuous-v0')
    env = wrappers.Monitor(env,'./tmp/',force=True)
    obs = env.reset()
    iteration = 0
    episode = 0
    episode_score = 0
    episode_steps = 0
    noise = UONoise()
    memory = Memory(memory_size)
    while episode < max_episode:
        print('\riter {}, ep {}'.format(iteration,episode),end='')
        action = actor.predict_action(np.reshape(obs,[1,-1]))[0]
        if episode<max_explore_eps: # exploration policy
            p = episode/max_explore_eps
            action = action*p + (1-p)*next(noise)
        next_obs, reward, done,info = env.step(action)
        memory.append([obs,action,reward,next_obs,done])
        if iteration >= memory_warmup:
            memory_batch = memory.sample_batch(batch_size)
            extract_mem = lambda k : np.array([item[k] for item in memory_batch])
            obs_batch = extract_mem(0)
            action_batch = extract_mem(1)
            reward_batch = extract_mem(2)
            next_obs_batch = extract_mem(3)
            done_batch = extract_mem(4)
            action_next = actor_target.predict_action(next_obs_batch)
            Q_next = critic_target.predict_Q(next_obs_batch,action_next)[:,0]
            Qexpected_batch = reward_batch + gamma*(1-done_batch)*Q_next # target Q value
            Qexpected_batch = np.reshape(Qexpected_batch,[-1,1])
            # train critic
            critic.train(obs_batch,action_batch,Qexpected_batch)
            # train actor
            action_grads = critic.compute_action_grads(obs_batch,action_batch)
            actor.train(obs_batch,action_grads)
            # async update
            actorAsync.async_update(tau)
            criticAsync.async_update(tau)
        episode_score += reward
        episode_steps += 1
        iteration += 1
        if done:
            print(', score {:8f}, steps {}'.format(episode_score,episode_steps))
#             if episode%5 == 0:
                
#                 Q_check = 
            obs = env.reset()
            episode += 1
            episode_score = 0
            episode_steps = 0
            noise = UONoise()
            if episode%25==0:
                saver.save(sess,save_path)
        else:
            obs = next_obs
env.close()

[2017-08-26 19:41:54,989] Making new env: MountainCarContinuous-v0
[2017-08-26 19:41:55,000] Finished writing results. You can upload them to the scoreboard via gym.upload('/Users/winter/Google Drive/handson-ml/tmp')
[2017-08-26 19:41:55,008] Clearing 12 monitor files from previous run (because force=True was provided)
[2017-08-26 19:41:55,010] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.2.955.video000000.mp4


iter 993, ep 0

[2017-08-26 19:42:12,100] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.2.955.video000001.mp4


iter 998, ep 0, score -16.821533, steps 999
iter 1575, ep 1, score 90.513038, steps 577
iter 2574, ep 2, score -13.971783, steps 999
iter 3573, ep 3, score -12.553009, steps 999
iter 4338, ep 4, score 88.878217, steps 765
iter 5337, ep 5, score -14.499419, steps 999
iter 6048, ep 6, score 91.099272, steps 711
iter 6676, ep 7

[2017-08-26 19:42:55,373] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.2.955.video000008.mp4


, score 90.996675, steps 628
iter 7675, ep 8, score -10.819755, steps 999
iter 8674, ep 9, score -11.558511, steps 999
iter 8979, ep 10, score 94.314570, steps 305
iter 9978, ep 11, score -10.546130, steps 999
iter 10977, ep 12, score -12.766908, steps 999
iter 11976, ep 13, score -10.189042, steps 999
iter 12975, ep 14, score -11.460311, steps 999
iter 13974, ep 15, score -13.411151, steps 999
iter 14973, ep 16, score -14.304451, steps 999
iter 15676, ep 17, score 88.887782, steps 703
iter 16242, ep 18, score 92.691063, steps 566
iter 17241, ep 19, score -11.693453, steps 999
iter 18023, ep 20, score 90.685539, steps 782
iter 19022, ep 21, score -13.186908, steps 999
iter 19463, ep 22, score 94.026987, steps 441
iter 20462, ep 23, score -13.140392, steps 999
iter 21063, ep 24, score 90.732156, steps 601
iter 22062, ep 25, score -15.018492, steps 999
iter 22621, ep 26

[2017-08-26 19:44:53,473] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.2.955.video000027.mp4


iter 22644, ep 26, score 93.464829, steps 582
iter 23408, ep 27, score 89.554092, steps 764
iter 24091, ep 28, score 87.892311, steps 683
iter 25005, ep 29, score 85.951233, steps 914
iter 26004, ep 30, score -12.744702, steps 999
iter 26531, ep 31, score 94.225969, steps 527
iter 27530, ep 32, score -16.797004, steps 999
iter 28219, ep 33, score 89.064114, steps 689
iter 28892, ep 34, score 85.746313, steps 673
iter 29891, ep 35, score -17.804687, steps 999
iter 30890, ep 36, score -20.038071, steps 999
iter 31889, ep 37, score -21.388418, steps 999
iter 32613, ep 38, score 86.763517, steps 724
iter 32895, ep 39, score 94.907800, steps 282
iter 33183, ep 40, score 96.955224, steps 288
iter 33407, ep 41, score 96.408442, steps 224
iter 33651, ep 42, score 95.072191, steps 244
iter 33817, ep 43, score 97.042655, steps 166
iter 34050, ep 44, score 95.092518, steps 233
iter 34209, ep 45, score 97.400811, steps 159
iter 34365, ep 46, score 97.076128, steps 156
iter 34601, ep 47, score 94.9

[2017-08-26 19:46:36,338] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.2.955.video000064.mp4


iter 37144, ep 63, score 93.996951, steps 152
iter 37291, ep 64, score 94.829838, steps 147
iter 37467, ep 65, score 93.702264, steps 176
iter 37644, ep 66, score 92.224951, steps 177
iter 37794, ep 67, score 93.515279, steps 150
iter 37950, ep 68, score 93.164297, steps 156
iter 38083, ep 69, score 94.774690, steps 133
iter 38235, ep 70, score 93.599692, steps 152
iter 38354, ep 71, score 95.467911, steps 119
iter 38484, ep 72, score 94.388090, steps 130
iter 38558, ep 73, score 95.755785, steps 74
iter 38706, ep 74, score 92.254909, steps 148
iter 38783, ep 75, score 95.610128, steps 77
iter 38864, ep 76, score 95.461576, steps 81
iter 38967, ep 77, score 94.318447, steps 103
iter 39102, ep 78, score 92.680197, steps 135
iter 39179, ep 79, score 95.486308, steps 77
iter 39318, ep 80, score 92.853924, steps 139
iter 39394, ep 81, score 95.659714, steps 76
iter 39465, ep 82, score 95.117314, steps 71
iter 39547, ep 83, score 94.727829, steps 82
iter 39629, ep 84, score 94.514347, steps

[2017-08-26 19:47:15,862] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.2.955.video000125.mp4


iter 42572, ep 124, score 91.147539, steps 94
iter 42668, ep 125, score 90.983828, steps 96
iter 42761, ep 126, score 91.405670, steps 93
iter 42826, ep 127, score 93.698148, steps 65
iter 42897, ep 128, score 93.416037, steps 71
iter 42963, ep 129, score 93.638845, steps 66
iter 43028, ep 130, score 93.683336, steps 65
iter 43094, ep 131, score 93.612474, steps 66
iter 43175, ep 132, score 93.375587, steps 81
iter 43243, ep 133, score 93.515115, steps 68
iter 43309, ep 134, score 93.638845, steps 66
iter 43380, ep 135, score 93.605524, steps 71
iter 43445, ep 136, score 93.657869, steps 65
iter 43531, ep 137, score 92.299140, steps 86
iter 43597, ep 138, score 93.641797, steps 66
iter 43662, ep 139, score 93.683167, steps 65
iter 43743, ep 140, score 93.438447, steps 81
iter 43809, ep 141, score 93.611080, steps 66
iter 43874, ep 142, score 93.664508, steps 65
iter 43942, ep 143, score 93.549058, steps 68
iter 44008, ep 144, score 93.622160, steps 66
iter 44073, ep 145, score 93.65642

[2017-08-26 19:47:55,111] Finished writing results. You can upload them to the scoreboard via gym.upload('/Users/winter/Google Drive/handson-ml/tmp')
