## Basic Policy Gradient Agent

In [3]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import gym
import matplotlib.pyplot as plt
%matplotlib inline

In [16]:
# import environment from openAI Gym
env = gym.make('CartPole-v0')

# reward function and agent
gamma = 0.9

def discount_rewards(r):
    # compute discounted rewards 
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
        
    return discounted_r

class agent():
    def __init__(self, lr, s_size, a_size, h_size):
        # feedforward of network
        self.state_in = tf.placeholder(shape = [None, s_size], dtype = tf.float32)
        hidden = slim.fully_connected(self.state_in, h_size, biases_initializer = None, activation_fn = tf.nn.relu)
        self.output = slim.fully_connected(hidden, a_size, biases_initializer = None, activation_fn = tf.nn.softmax)
        self.chosen_action = tf.argmax(self.output, 1)
        
        # training process
        # to calculate cost, feed reward and action to network and use it to update
        self.reward_holder = tf.placeholder(shape = [None], dtype = tf.float32)
        self.action_holder = tf.placeholder(shape = [None], dtype = tf.int32)
        
        self.indexes = tf.range(0, tf.shape(self.output)[0]) * tf.shape(self.output)[1] + self.action_holder
        self.responsible_outputs = tf.gather(tf.reshape(self.output, [-1]), self.indexes)
        
        self.loss = -tf.reduce_mean(tf.log(self.responsible_outputs) * self.reward_holder)
        
        tvars = tf.trainable_variables()
        self.gradient_holders = []
        for idx, var in enumerate(tvars):
            placeholder = tf.placeholder(tf.float32, name = str(idx)+'_holder')
            self.gradient_holders.append(placeholder)
            
        self.gradients = tf.gradients(self.loss, tvars)
        
        optimizer = tf.train.AdamOptimizer(learning_rate = lr)
        self.update_batch = optimizer.apply_gradients(zip(self.gradient_holders, tvars))
        

# training agent
tf.reset_default_graph()

myAgent = agent(lr = 1e-2, s_size = 4, a_size = 2, h_size = 8)
total_episodes = 5000
max_ep = 999
update_frequency = 5

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    i = 0
    total_reward = []
    total_lenght = []
    
    gradBuffer = sess.run(tf.trainable_variables())
    for ix, grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0
        
    while i < total_episodes:
        s = env.reset()
        running_reward = 0
        ep_history = []
        for j in range(max_ep):
            # from output of network, choose action with probability
            a_dist = sess.run(myAgent.output, feed_dict = {myAgent.state_in:[s]})
            a = np.random.choice(a_dist[0], p = a_dist[0])
            a = np.argmax(a_dist == a)
            
            # reward about action
            s1, r, d, _ = env.step(a)
            ep_history.append([s, a, r, s1])
            s = s1
            running_reward += r
            if d == True:
                # update network
                ep_history = np.array(ep_history)
                ep_history[:, 2] = discount_rewards(ep_history[:, 2])
                feed_dict = {myAgent.reward_holder:ep_history[:, 2],
                            myAgent.action_holder:ep_history[:, 1],
                            myAgent.state_in:np.vstack(ep_history[:,0])}
                grads = sess.run(myAgent.gradients, feed_dict = feed_dict)
                for idx, grad in enumerate(grads):
                    gradBuffer[idx] += grad
                    
                if i % update_frequency == 0 and i != 0:
                    feed_dict = dictionary = dict(zip(myAgent.gradient_holders, gradBuffer))
                    _ = sess.run(myAgent.update_batch, feed_dict = feed_dict)
                    for ix, grad in enumerate(gradBuffer):
                        gradBuffer[ix] = grad * 0
                        
                total_reward.append(running_reward)
                total_lenght.append(j)
                break
                
                
        # update total reward
        if i % 100 == 0:
            print(np.mean(total_reward[-100:]))
        i += 1
        

36.0
22.43
23.75
24.97
26.64
26.51
28.56
24.72
26.15
27.15
26.11
25.66
27.04
25.21
26.72
24.56
24.17
27.38
26.87
27.4
23.63
25.55
22.35
22.13
24.6
24.45
21.34
21.87
23.15
21.94
21.78
22.85
23.07
26.28
23.76
22.75
23.96
22.32
24.49
24.49
23.47
24.98
25.7
23.45
25.13
24.84
27.75
25.69
25.78
25.29


In [14]:
print(ep_history)
print(env)
print(discount_rewards(ep_history[:, 2]))
a = sess.run(myAgent.indexes, feed_dict = feed_dict)
print(a)

[[array([ 0.01245611, -0.01089109, -0.01127786,  0.04325728]) 1
  6.12579511 array([ 0.01223829,  0.18439075, -0.01041272, -0.25296247])]
 [array([ 0.01223829,  0.18439075, -0.01041272, -0.25296247]) 1
  5.6953279000000006
  array([ 0.0159261 ,  0.37965982, -0.01547197, -0.54891149])]
 [array([ 0.0159261 ,  0.37965982, -0.01547197, -0.54891149]) 1 5.217031
  array([ 0.0235193 ,  0.57499566, -0.0264502 , -0.84642875])]
 [array([ 0.0235193 ,  0.57499566, -0.0264502 , -0.84642875]) 1 4.68559
  array([ 0.03501921,  0.77046828, -0.04337877, -1.14731057])]
 [array([ 0.03501921,  0.77046828, -0.04337877, -1.14731057]) 1 4.0951
  array([ 0.05042858,  0.96612895, -0.06632498, -1.45327485])]
 [array([ 0.05042858,  0.96612895, -0.06632498, -1.45327485]) 1 3.439
  array([ 0.06975116,  1.1619998 , -0.09539048, -1.76592038])]
 [array([ 0.06975116,  1.1619998 , -0.09539048, -1.76592038]) 1 2.71
  array([ 0.09299115,  1.35806158, -0.13070889, -2.08667834])]
 [array([ 0.09299115,  1.35806158, -0.130708

RuntimeError: Attempted to use a closed Session.