In [2]:
import random 
import gym
import numpy as np 
from collections import deque
import tensorflow as tf 
import tflearn as tl 
from tflearn.activations import sigmoid

Instructions for updating:
Colocations handled automatically by placer.


### Loss function : 
$$L= L_\pi + \alpha L_Q + \beta L_{reg}$$
### Policy loss :
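$$A = (R - Q_{a_i\sim \pi}(a_i,s_i; \theta'_q))$$
where $R = r_i + \gamma\, Q_{a'_i\sim \pi}(a'_i,s'_i; \theta'_q)$ is the bootstrapped return, $s'_i$ is the next state, $a'_i$ is the actor's output for $s'_i$, and $\gamma$ is the discount factor (`y` in the code).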
$$\frac{\partial\,J(\pi)}{\partial\, \theta'} = \frac{1}{n}\sum_{i=1}^{n}\frac{\partial\,\log\,\pi (a_i|s_i;\theta')}{\partial\,\theta'}\; A$$
$$\therefore J(\pi) = \frac{1}{n}\sum_{i=1}^{n}\log\,\pi (a_i|s_i;\theta')\; A\;\;\;\;\;\;[\because A\text{ is considered constant}]$$
Since we want to maximize $J(\pi)$, the policy loss to minimize is its negative:
$$L_\pi = -J(\pi)$$
### Value loss :
$$J(Q) = \sum_{i=1}^{n}(R - Q_{a_i\sim \pi}(a_i,s_i; \theta'_q))^2$$
$$L_Q = J(Q)$$
### Policy entropy :
$$H(\overrightarrow{\pi(s)})=-\sum_{i=1}^{n}\sum_{k=1}^{m} \pi(s_i)_k\cdot \log\, \pi(s_i)_k$$
$$L_{reg}=H(\overrightarrow{\pi(s)})$$

In [1]:
# Discount factor applied to the critic's next-state estimate when forming R.
y = 0.95
# Presumably the optimizer learning rate (not used in the visible code) - confirm.
l_rate = 1e-3

# Capacity of the solver's bounded experience deque.
exp_memory_size = 10 ** 6
# Presumably the number of transitions drawn per update (not used in the visible code) - confirm.
batch_size = 20

# Exploration rate: the solver starts at exploration_max; the floor and the
# decay factor are presumably applied by code outside this cell - confirm.
exploration_max = 1.0
exploration_min = 1e-2
exploration_decay = 0.9995

In [None]:
class A3CSolver:
    """Actor-critic agent whose policy and Q networks are built with tflearn.

    The actor maps the ``state`` placeholder to one sigmoid-activated output
    per action; the critic maps the ``state`` and ``action`` placeholders to a
    single scalar.  Loss helpers evaluate both networks through ``sess``.
    """

    def __init__(self, observation_space, action_space, sess):
        """Create the placeholders, the experience buffer and both networks.

        Args:
            observation_space: Width of the state placeholder (its shape is
                ``[None, observation_space]``).
            action_space: Width of the action placeholder and of the actor
                output.
            sess: Object whose ``run(fetches, feed_dict=...)`` evaluates the
                networks (presumably a ``tf.Session`` - confirm).
        """
        self.sess = sess
        self.exploration_rate = exploration_max
        self.action_space = action_space
        # Bounded buffer: the oldest entries are dropped once it is full.
        self.memory = deque(maxlen=exp_memory_size)

        self.state = tl.input_data(shape=[None, observation_space])
        self.action = tl.input_data(shape=[None, action_space])

        # Actor pi(a_i | s_i; theta')
        self.actor = self.build_actor(self.state, action_space)
        # Critic Q(a_i, s_i; theta'_q)
        self.critic = self.build_critic(self.state, self.action)

    def build_actor(self, state_input, action_space):
        """Build the policy network on top of ``state_input``.

        Args:
            state_input: Tensor carrying the state s_i.
            action_space: Number of output units.

        Returns:
            The sigmoid-activated output of a 24 -> 48 -> ``action_space``
            stack of fully connected layers.
        """
        a_h1 = tl.fully_connected(state_input, 24)
        a_h2 = tl.fully_connected(a_h1, 48)
        # One unit per action: policy_loss feeds this output into the
        # [None, action_space] action placeholder, so the widths must match.
        # NOTE(review): sigmoid outputs do not sum to one across actions;
        # confirm whether a softmax was intended.
        a_logit = tl.fully_connected(a_h2, action_space)
        return sigmoid(a_logit)

    def build_critic(self, state_input, action_input):
        """Build the Q network from a state branch and an action branch.

        Args:
            state_input: Tensor carrying the state s_i.
            action_input: Tensor carrying the action a_i.

        Returns:
            A single-unit fully connected layer applied to the element-wise
            sum of the two 24 -> 48 branches.
        """
        # Action branch a_i
        as_h1 = tl.fully_connected(action_input, 24)
        as_h2 = tl.fully_connected(as_h1, 48)
        # State branch s_i
        ss_h1 = tl.fully_connected(state_input, 24)
        ss_h2 = tl.fully_connected(ss_h1, 48)
        # Combine both branches; they share width 48 so they can be summed.
        q_h1 = tl.layers.merge_ops.merge([as_h2, ss_h2], mode='elemwise_sum')
        return tl.fully_connected(q_h1, 1)

    def network_loss(self, state, action, reward, state_next, done):
        """Compute the three loss components for one batch of transitions.

        Args:
            state: States s_i.
            action: Actions a_i.
            reward: Rewards for the transitions.
            state_next: Successor states.
            done: Episode-termination flags (currently not forwarded).

        Returns:
            Tuple ``(l_policy, l_value, l_p_entropy)``.  The components are
            returned unweighted; combining them is left to the caller.
        """
        l_policy = self.policy_loss(state, action, reward, state_next)
        # NOTE(review): value_loss and policy_entropy are not defined in the
        # visible part of this class; the argument lists below are assumed to
        # mirror policy_loss - confirm against their definitions.
        l_value = self.value_loss(state, action, reward, state_next)
        l_p_entropy = self.policy_entropy(state)
        return l_policy, l_value, l_p_entropy

    def policy_loss(self, state, action, reward, state_next):
        """Build the policy loss ``-mean(log(pi(s)) * A)``.

        The advantage ``A = R - Q(s, a)`` is evaluated immediately through
        ``self.sess`` and enters the returned expression as a constant, with
        ``R = reward + y * Q(s', pi(s'))``.

        Args:
            state: States s_i fed to the critic.
            action: Actions a_i fed to the critic.
            reward: Rewards; assumed to broadcast against the critic output
                - TODO confirm shapes.
            state_next: Successor states fed to the actor and the critic.

        Returns:
            A tensor built on ``self.actor``; it still depends on the
            ``self.state`` placeholder when evaluated.
        """
        # Advantage calculation A = R - Q(a_i, s_i; theta'_q)
        # NOTE(review): no terminal-state masking happens here (done is not
        # passed in), so R bootstraps from state_next unconditionally.
        action_next = self.sess.run(self.actor, feed_dict={self.state: state_next})
        q_next = self.sess.run(
            self.critic,
            feed_dict={self.state: state_next, self.action: action_next},
        )
        R = reward + y * q_next
        Q = self.sess.run(self.critic, feed_dict={self.state: state, self.action: action})
        A = R - Q
        # Policy loss L_pi = -1/n * sum(log pi(a_i | s_i; theta') * A)
        # NOTE(review): every actor output is weighted by A, not only the
        # output of the action that was taken - confirm this is intended.
        return - tf.reduce_mean(tf.reduce_mean(tf.log(self.actor) * A, axis=-1))
