In [1]:
from collections import deque

import gym
import numpy as np
import tensorflow as tf

In [None]:
# Build the CartPole environment. Gym environment IDs are case-sensitive:
# the registered ID is 'CartPole-v0' ('Cartpole-v0' is not registered and
# makes gym.make raise).
env = gym.make('CartPole-v0')
# Work with the raw environment rather than the wrapper returned by gym.make.
env = env.unwrapped
# Fixed seed so environment rollouts are repeatable between runs.
env.seed(1)

In [None]:
# Problem dimensions, read from the environment so the network sizes follow
# the environment instead of a hard-coded constant (CartPole yields 4).
obs_space = env.observation_space.shape[0]
act_space = env.action_space.n

# Training hyperparameters.
max_eps = 300   # episode budget; presumably consumed by a training loop outside this section
l_rate = 0.01   # learning rate passed to the Adam optimizer
gamma = 0.95    # discount factor applied to the bootstrapped next-state value

In [None]:
# Graph inputs:
#   state_in        - batch of observations, shape [batch, obs_space]
#   action_in       - per-sample action weights, shape [batch, act_space]
#                     (assumes one-hot rows for the action taken - TODO confirm against caller)
#   disc_rewards_in - per-sample return targets, shape [batch]
state_in = tf.placeholder(tf.float32, [None, obs_space], name = 'st_in')
action_in = tf.placeholder(tf.float32, [None, act_space], name = 'ac_in')
disc_rewards_in = tf.placeholder(tf.float32, [None, ], name = 'disc_r_in')

#Policy
with tf.name_scope('Policy_stream'):

    p_fc1 = tf.layers.dense(state_in,
                            10,
                            activation = tf.nn.relu,
                            kernel_initializer = tf.contrib.layers.xavier_initializer())
    # NOTE(review): this hidden layer is only act_space units wide - confirm the
    # bottleneck is intended.
    p_fc2 = tf.layers.dense(p_fc1,
                            act_space,
                            activation = tf.nn.relu,
                            kernel_initializer = tf.contrib.layers.xavier_initializer())
    # Logits: one unnormalized score per action.
    p_fc3 = tf.layers.dense(p_fc2,
                            act_space,
                            activation = None,
                            kernel_initializer = tf.contrib.layers.xavier_initializer())

    # Action probabilities, exposed for action selection.
    action_distribution = tf.nn.softmax(p_fc3)

    #Policy loss
    # Take log-probabilities straight from the logits. log(softmax(x)) can
    # underflow to -inf, and 0 * -inf in the masked sum below would turn the
    # whole loss into NaN.
    log_action_probs = tf.nn.log_softmax(p_fc3)
    neg_log_probs = - tf.reduce_sum(tf.math.multiply(action_in, log_action_probs), axis = -1)
    # NOTE(review): the log-probability is weighted by the raw target rather
    # than an advantage (target - V(s)) - confirm this is intended.
    p_loss = tf.reduce_mean(neg_log_probs * disc_rewards_in)

#Value
with tf.name_scope('Value_stream'):

    v_fc1 = tf.layers.dense(state_in,
                            24,
                            activation = tf.nn.relu,
                            kernel_initializer = tf.contrib.layers.xavier_initializer())
    # State-value estimate, shape [batch, 1].
    value = tf.layers.dense(v_fc1,
                            1,
                            activation = None,
                            kernel_initializer = tf.contrib.layers.xavier_initializer())

    #Value loss
    # `value` is [batch, 1] while disc_rewards_in is [batch]; subtracting them
    # directly broadcasts to a [batch, batch] matrix and compares every
    # prediction with every target. Drop the trailing axis first so the
    # squared error is taken element-wise.
    v_loss = tf.reduce_mean(tf.math.square(tf.squeeze(value, axis = 1) - disc_rewards_in))

#Combined loss
# alpha weights the value loss relative to the policy loss.
alpha = 0.8
loss = p_loss + alpha * v_loss

#Optimizer
# A single Adam step updates both streams through the combined loss.
opt = tf.train.AdamOptimizer(l_rate).minimize(loss)

In [None]:
# Replay memory: one bounded FIFO buffer per transition field. Once a buffer
# holds max_memory items, appending discards its oldest entry.
max_memory = 1000000
memory = {column : deque(maxlen = max_memory)
          for column in ('states', 'actions', 'rewards', 'next_states', 'done')}

def memorize(state, action, reward, next_state, done):
    """Store one transition by appending each field to its replay-memory buffer.

    The five buffers are appended to together, so the same index refers to
    the same transition in every buffer.
    """
    transition = (('states', state),
                  ('actions', action),
                  ('rewards', reward),
                  ('next_states', next_state),
                  ('done', done))
    for column, item in transition:
        memory[column].append(item)
    
def sample(sample_size):
    """Draw `sample_size` distinct transitions uniformly at random from replay memory.

    Args:
        sample_size: number of transitions to draw; must not exceed the number
            of transitions currently stored.

    Returns:
        Tuple `(states, actions, rewards, next_states, done)` of arrays built
        with `np.vstack`, one row per sampled transition. For terminal
        transitions the next-state row is all zeros.

    Raises:
        ValueError: if `sample_size` is larger than the number of stored
            transitions (raised by `np.random.choice` with `replace=False`).
    """
    # Sample over what is actually stored, not the buffer capacity: indexing
    # past the filled part of a deque raises IndexError.
    stored = len(memory['states'])
    indexes = np.random.choice(stored, sample_size, replace=False)

    s_s = []
    s_a = []
    s_r = []
    s_s1 = []
    s_d = []

    for index in indexes:
        index = int(index)  # plain int for deque indexing
        terminal = memory['done'][index]

        s_s.append(memory['states'][index])
        s_a.append(memory['actions'][index])
        s_r.append(memory['rewards'][index])
        s_d.append(terminal)

        if terminal:
            # No successor exists after a terminal step; use a zero placeholder
            # so every row keeps the same width.
            s_s1.append(np.zeros(obs_space))
        else:
            s_s1.append(memory['next_states'][index])

    return np.vstack(s_s), np.vstack(s_a), np.vstack(s_r), np.vstack(s_s1), np.vstack(s_d)
        

In [None]:
# Number of transitions drawn from replay memory per training step.
batch_size = 32
# InteractiveSession installs itself as the default session for this notebook.
sess = tf.InteractiveSession()
# Variables (network weights and Adam slots) must be initialized before any
# sess.run that reads them; otherwise TensorFlow raises FailedPreconditionError.
# NOTE(review): drop this if a later cell already runs the initializer.
sess.run(tf.global_variables_initializer())
def exp_replay():
    """Run one training step on a random minibatch from replay memory.

    Does nothing until at least `batch_size` transitions are stored. Otherwise
    it samples a minibatch, builds one-step bootstrapped targets
    R = r + gamma * V(s') (with the bootstrap term dropped for terminal
    transitions) and applies a single optimizer step on the combined loss.

    Returns:
        The combined loss for the minibatch, or None when no update was run.
    """
    # `memory` is a module-level dict; this is a plain function, so there is no `self`.
    if len(memory['states']) < batch_size:
        return None

    states, actions, rewards, next_states, done = sample(batch_size)

    #R = r + gamma * V(s')
    next_values = sess.run(value, feed_dict = {state_in : next_states})
    # Terminal transitions have no future return. sample() fills their next
    # state with zeros, but V(zeros) is not zero, so mask the bootstrap term.
    not_done = 1.0 - np.asarray(done, dtype = np.float32).reshape(-1)
    # Flatten every term: disc_rewards_in is a rank-1 placeholder, while
    # sample() and the value head both produce column vectors.
    disc_rewards = (np.asarray(rewards, dtype = np.float32).reshape(-1)
                    + gamma * not_done * np.asarray(next_values).reshape(-1))

    # NOTE(review): action_in is [batch, act_space]; this assumes the stored
    # actions already have that width (e.g. one-hot) - confirm against the caller.
    _loss, _ = sess.run([loss, opt], feed_dict = {state_in : states,
                                                  action_in : actions,
                                                  disc_rewards_in : disc_rewards})
    return _loss