In [1]:
import random 
import gym
import numpy as np 
from collections import deque
import tensorflow as tf 
import tflearn as tl 
from tflearn.activations import softmax

Instructions for updating:
Colocations handled automatically by placer.


In [2]:
episode_infos = {'run' : [], 'step' : [], 'total_reward' : [], 'avg_reward' : []}

In [3]:
def cartpole():
    env = gym.make('CartPole-v1')
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    sess = tf.InteractiveSession()
    a3c_solver = A3CSolver(observation_space, action_space, sess)
    sess.run(tf.global_variables_initializer())
    #a3c_solver.model.load_weights('dqn_weights_%i.h5' % (prev_stop))
    run = 0
    while True:
    #for _ in range(300):
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        total_reward = 0
        while True:
            step += 1
            action = a3c_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            total_reward += reward
            state_next = np.reshape(state_next, [1, observation_space])
            a3c_solver.remember(state, action, reward, state_next, terminal)
            state = state_next    
            if terminal:
                
                episode_infos['run'].append(run)
                episode_infos['step'].append(step)
                episode_infos['total_reward'].append(total_reward)
                episode_infos['avg_reward'].append(total_reward / step)
                
                print('epsilon : ' + str(a3c_solver.exploration_rate))
                print('run : ' + str(run) + ' score : ' + str(total_reward) + ' avg_score : ' + str(total_reward / step))
                
                #Check gradients 
                t_s, t_a, t_r, _, _ = random.sample(a3c_solver.memory, 1)[0]
                t_r = np.array(t_r).reshape([1, 1])
                t_a = np.identity(2)[t_a].reshape([1, a3c_solver.action_space])
                t_loss = a3c_solver.sess.run([a3c_solver.loss], 
                                                      feed_dict = {a3c_solver.state : t_s, 
                                                                   a3c_solver.R : t_r, 
                                                                   a3c_solver.action : t_a})
                print('l : ' + str(t_loss))
                
                break
            a3c_solver.experience_replay()
        #dqn_solver.model.save_weights('dqn_weights_%i.h5' % (run))

### Loss function : 
$$L= L_\pi + \alpha L_Q + \beta L_{reg}$$
### Policy loss :
$$A = (R - Q_{a_i\sim \pi}(a_i,s_i; \theta'_q))$$
$$\frac{\partial\,J(\pi)}{\partial\, \theta'} = \frac{1}{n}\sum_{i=1}^{n}\frac{\partial\,log\,\pi (a_i|s_i;\theta')}{\partial\,\theta'}\; A$$
$$\therefore J(\pi) = \frac{1}{n}\sum_{i=1}^{n}log\,\pi (a_i|s_i;\theta')\; R\;\;\;\;\;\;[\because A\;is\;considered\;constant]$$
∵ We want to maximize 𝐽(𝜋) $$L_\pi = -J(\pi)$$
### Value loss :
$$J(Q) = \sum_{i=1}^{n}(R - Q_{a_i\sim \pi}(a_i,s_i; \theta'_q))$$
$$L_Q = J(Q)$$
### Policy entropy :
$$H(\overrightarrow{\pi(s)})=-\sum_{i=1}^{n}\sum_{k=1}^{m} \pi(s_i)_k\cdot log\, \pi(s_i)_k$$
$$L_{reg}=H(\overrightarrow{\pi(s)})$$

In [4]:
y = 0.95
l_rate = -0.0001

exp_memory_size = 10000
batch_size = 32

exploration_max = 1.0
exploration_min = 0.01
exploration_decay = 0.9995

alpha = 0.5
beta = 0.1

In [5]:
losses = []

In [6]:
class A3CSolver():
    def __init__(self, observation_space, action_space, sess):
        self.sess = sess
        self.exploration_rate = exploration_max
        self.action_space = action_space
        self.memory = deque(maxlen = exp_memory_size)
        
        self.state = tl.input_data(shape = [None, observation_space], name = 'S_i')
        self.action = tl.input_data(shape = [None, action_space], name = 'A_i')
        self.R = tl.input_data(shape = [None, 1], name = 'R_i')
        
        #Actor 𝜋(a_i|s_i;𝜃′)
        self.actor = self.build_actor(self.state, action_space)
        #Critic Q_a_i∼𝜋(a_i,s_i;𝜃′_q)
        self.critic = self.build_critic(self.state, self.action)
        
        #Loss L
        self.loss = self.compute_loss()
        #Trainable vars 
        t_vars = tf.trainable_variables()
        #Optimize ops 
        self.adam = tf.train.AdamOptimizer(learning_rate = l_rate)
        #self.opt = tf.train.AdamOptimizer(learning_rate = l_rate).minimize(self.loss)
        self.grads = self.adam.compute_gradients(self.loss, t_vars)
        self.clipped_grads = [[self.clip_grads(grad), var] for grad, var in self.grads]
        self.opt = self.adam.apply_gradients(self.grads)
        
    def clip_grads(self, values):
        if values is None:
            return values
        return tf.clip_by_value(values, -1.0, 1.0)
        
    def build_actor(self, state_input, action_space):
        with tf.variable_scope('actor', reuse = tf.AUTO_REUSE):
            #State input s_i
            #a_h1 = tl.fully_connected(state_input, 24)        
            #a_h2 = tl.fully_connected(a_h1, 48)        
            #a_logit = tl.fully_connected(a_h2, action_space)
            a_logit = tl.fully_connected(state_input, action_space)
            return softmax(a_logit)
    
    def build_critic(self, state_input, action_input):
        with tf.variable_scope('critic', reuse = tf.AUTO_REUSE):
            #Action input a_i
            as_h1 = tl.fully_connected(action_input, 24)
            as_h2 = tl.fully_connected(as_h1, 48)
            #State input s_i
            ss_h1 = tl.fully_connected(state_input, 24)
            ss_h2 = tl.fully_connected(ss_h1, 48)
            #Combine state action input 
            q_h1 = tl.layers.merge_ops.merge([as_h2, ss_h2], mode = 'elemwise_sum')
            return tl.fully_connected(q_h1, 1)
    
    def remember(self, state, action, reward, state_next, done):
        self.memory.append([state, action, reward, state_next, done])
        
    def act(self, state, infer = False):
        #act
        if not infer and np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        action = self.sess.run(self.actor, feed_dict = {self.state : state})
        return np.argmax(action[0])
    
    def experience_replay(self):
        if len(self.memory) < batch_size:
            return
        batch = random.sample(self.memory, batch_size)
        for state, action, reward, state_next, done in batch:
            R = np.array(reward).reshape([1, 1])
            action = np.identity(2)[action].reshape([1, self.action_space])
            if not done:
                action_next = self.sess.run(self.actor, feed_dict = {self.state : state_next})
                R = reward + y * self.sess.run(self.critic, feed_dict = {self.state : state_next, self.action : action_next})
            else:
                R = np.array(-100).reshape([1, 1]) 
            #Optimize
            loss, _, a, b, c= self.sess.run([self.loss, self.opt, self.L_pi, self.L_v, self.actor], feed_dict = {self.state : state, 
                                                                        self.action : action, 
                                                                        self.R : R})
            #print(loss, a, b, c)
            losses.append(loss)
            self.exploration_rate *= exploration_decay
            self.exploration_rate = max(exploration_min, self.exploration_rate)

    def compute_loss(self):
        #Advantage
        A = self.R #- self.critic
        #Policy loss 
        self.L_pi = - tf.reduce_mean(tf.reduce_sum(tf.log(self.actor + 1e-13) * tf.stop_gradient(A), axis = -1))
        #Value loss
        self.L_v = tf.reduce_mean(tf.math.square(self.R - self.critic))
        #Policy entropy 
        self.L_reg = - tf.reduce_mean(tf.reduce_sum(tf.multiply(self.actor, tf.log(self.actor + 1e-13)), axis = -1))
        #Total loss 
        return self.L_pi + alpha * self.L_v #+ beta * self.L_reg 

In [7]:
cartpole()

Instructions for updating:
Use tf.cast instead.
epsilon : 1.0
run : 1 score : 9.0 avg_score : 0.8181818181818182
l : [1.8862686]
epsilon : 0.9684988314739038
run : 2 score : 21.0 avg_score : 0.9130434782608695
l : [1.889775]
epsilon : 0.6491392424807709
run : 3 score : 24.0 avg_score : 0.9230769230769231
l : [1.8637575]
epsilon : 0.44210668023652117
run : 4 score : 23.0 avg_score : 0.92
l : [1.4494327]
epsilon : 0.3828009720733851
run : 5 score : 8.0 avg_score : 0.8
l : [1.6891031]
epsilon : 0.3159131022678686
run : 6 score : 11.0 avg_score : 0.8461538461538461
l : [1.4278322]
epsilon : 0.26919261643987724
run : 7 score : 9.0 avg_score : 0.8181818181818182
l : [4.381146]
epsilon : 0.22938163762610433
run : 8 score : 9.0 avg_score : 0.8181818181818182
l : [3.9553552]
epsilon : 0.2018157486181985
run : 9 score : 7.0 avg_score : 0.7777777777777778
l : [5.734805]
epsilon : 0.174743491117198
run : 10 score : 8.0 avg_score : 0.8
l : [1.3947928]
epsilon : 0.1489006224876073
run : 11 score : 9

KeyboardInterrupt: 

In [8]:
losses

[1.8866302,
 1.8864217,
 1.8859949,
 1.8856814,
 1.8850236,
 1.8850944,
 1.8842952,
 1.8840072,
 1.8835546,
 1.8834795,
 1.8832691,
 1.8830943,
 1.8830812,
 1.8822014,
 1.8818552,
 1.8807997,
 1.8814535,
 4860.7046,
 1.8804206,
 1.8810799,
 1.8804361,
 1.8805481,
 1.8807585,
 1.8812937,
 1.8817184,
 1.8809578,
 1.8811824,
 1.8819249,
 1.8813486,
 1.881192,
 1.8816605,
 1.8812704,
 1.8823018,
 1.8813815,
 1.8813691,
 1.881762,
 1.8813431,
 1.8815612,
 1.8821712,
 1.8815744,
 1.8815045,
 1.8817811,
 1.8813827,
 1.8813651,
 1.8821554,
 1.8822999,
 1.8813324,
 1.8823237,
 1.882097,
 1.8825989,
 1.881489,
 1.881233,
 1.8812644,
 1.8820438,
 4860.842,
 1.8813679,
 1.8818722,
 1.8824522,
 1.8817766,
 1.8827128,
 1.88201,
 1.8829978,
 1.8830905,
 1.883127,
 4861.015,
 1.8824899,
 1.8835385,
 1.8837781,
 1.8833675,
 1.8831884,
 1.8842168,
 1.8833916,
 1.8834312,
 1.8835642,
 1.8836136,
 1.8845098,
 1.8837028,
 1.8837726,
 1.8838015,
 1.8839769,
 1.8840157,
 1.8846996,
 1.8847659,
 1.8848145,
 1

In [None]:
episode_infos['run'][-1]

In [None]:
from matplotlib import pyplot as plt

In [None]:
plt.xlabel('episode')
plt.ylabel('reward')
plt.plot(episode_infos['run'][:300], episode_infos['total_reward'][:300])