# Policy Gradients on CartPole-v0

In [1]:
import pickle, math
import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt


In [2]:
env = gym.make('CartPole-v0')
env.observation_space, env.action_space

(Box(4,), Discrete(2))

### Running the environment on random actions

In [3]:
env.reset()
random_episodes = 0
reward_sum = 0
while random_episodes < 10:
    env.render()
    observation, reward, done, _ = env.step(np.random.randint(0,2))
    reward_sum += reward
    if done: 
        print("Reward for " + str(random_episodes + 1) + "th episode was: ", reward_sum)
        reward_sum= 0
        random_episodes += 1
        env.reset()
env.close()

Reward for 1th episode was:  11.0
Reward for 2th episode was:  16.0
Reward for 3th episode was:  12.0
Reward for 4th episode was:  23.0
Reward for 5th episode was:  25.0
Reward for 6th episode was:  13.0
Reward for 7th episode was:  23.0
Reward for 8th episode was:  11.0
Reward for 9th episode was:  15.0
Reward for 10th episode was:  39.0


In [4]:
H = 10    #number of hidden layer neurons
batch_size = 5    #update parameters after these many episodes
learning_rate = 1e-2
gamma = 0.99    #discount
D = 4   #input states 

In [5]:
tf.reset_default_graph()
#The network gives the probability of taken an action (left or right) and takes state as input
observations = tf.placeholder(tf.float32, [None, D], name="input_x")
W1 = tf.get_variable("W1", shape=[D,H], initializer=tf.contrib.layers.xavier_initializer())
layer1 = tf.nn.relu(tf.matmul(observations, W1))
W2 = tf.get_variable("W2", shape=[H, 1], initializer=tf.contrib.layers.xavier_initializer())
score = tf.matmul(layer1, W2)
probability = tf.nn.sigmoid(score)

tvars = tf.trainable_variables()
input_y = tf.placeholder(tf.float32, [None, 1], name="input_y")
advantages = tf.placeholder(tf.float32, name="reward_signal")

loglikelihood = tf.log(input_y*(input_y - probability) + (1 - input_y)*(input_y + probability))
loss = -tf.reduce_mean(loglikelihood * advantages)
newGrads = tf.gradients(loss, tvars)

#Apply gradients
adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
W1Grad = tf.placeholder(tf.float32, name="batch_grad1")
W2Grad = tf.placeholder(tf.float32, name="batch_grad2")

batchGrad = [W1Grad, W2Grad]
updateGrads = adam.apply_gradients(zip(batchGrad, tvars))



For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


## Advantage Function
We want our agent to give more weight to the good actions. We do this by negatively weighing the actions that come towards the end of the episode as they most likely contributed to the end. Likewise, earlier episodes are seen as more positive

In [6]:
def discount_rewards(r):    #r is an array of rewards
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

## Running the agent in an environment

In [8]:
xs, hs, dlogps, drs, ys, tfps = [], [], [], [], [], []
running_reward = None
reward_sum = 0 
episode_number = 1
total_episodes = 10000
init = tf.global_variables_initializer()

In [None]:
with tf.Session() as sess:
    rendering = False
    sess.run(init)
    observation = env.reset()
    
    gradBuffer = sess.run(tvars)
    for ix, grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad*0    #initially the gradient buffer is zero
    
    while episode_number <= total_episodes:
        #Only render the environment once agent gets better
        if reward_sum/batch_size > 100 or rendering == True:
            env.render()
            rendering = True
        x = np.reshape(observation, [1, D])
        
        tfprob = sess.run(probability, feed_dict={observations:x})
        action = 1 if np.random.uniform() < tfprob else 0
        xs.append(x)
        y = 1 if action == 0 else 0
        ys.append(y)
        
        #take a step
        observation, reward, done, info = env.step(action)
        reward_sum += reward
        drs.append(reward)
        
        if done:
            episode_number += 1
            epx = np.vstack(xs)
            epy = np.vstack(ys)
            epr = np.vstack(drs)
            tfp = tfps
            xs, hs, dlogs, drs, ys, tfps = [], [], [], [], [], []    #reset array memory
            
            discounted_epr = discount_rewards(epr)
            # size the rewards to be unit normal (helps control the gradient estimator variance)
            discounted_epr -= np.mean(discounted_epr)  #subtract mean
            discounted_epr //= np.std(discounted_epr)  #divide by SD
            
            #save gradient for this episode in the gradBuffer
            tGrad = sess.run(newGrads, feed_dict={observations: epx, input_y:epy, advantages:discounted_epr})
            
            for ix, grad in enumerate(tGrad):
                gradBuffer[ix] += grad
                
            # If we have completed enough episodes, then update the policy network with our gradients.
            if episode_number % batch_size == 0: 
                sess.run(updateGrads,feed_dict={W1Grad: gradBuffer[0],W2Grad:gradBuffer[1]})
                for ix,grad in enumerate(gradBuffer):
                    gradBuffer[ix] = grad * 0   #make gradients 0 again as we train only on 5 episodes
                    
                running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                print('Average reward for episodes in this batch: %f\n  Total average reward %f.' % (reward_sum//batch_size, running_reward//batch_size))
                
                if reward_sum//batch_size > 200: 
                    print("Task solved in", episode_number,'episodes!')
                    env.close()
                    break
                    
                reward_sum = 0
            
            observation = env.reset()
        
print(episode_number,'Episodes completed.')
            
            
            
        

Average reward for episodes in this batch: 21.000000
  Total average reward 21.000000.
Average reward for episodes in this batch: 24.000000
  Total average reward 21.000000.
Average reward for episodes in this batch: 21.000000
  Total average reward 21.000000.
Average reward for episodes in this batch: 25.000000
  Total average reward 21.000000.
Average reward for episodes in this batch: 53.000000
  Total average reward 21.000000.
Average reward for episodes in this batch: 41.000000
  Total average reward 21.000000.
Average reward for episodes in this batch: 30.000000
  Total average reward 21.000000.
Average reward for episodes in this batch: 24.000000
  Total average reward 21.000000.
Average reward for episodes in this batch: 16.000000
  Total average reward 21.000000.
Average reward for episodes in this batch: 20.000000
  Total average reward 21.000000.
Average reward for episodes in this batch: 41.000000
  Total average reward 22.000000.
Average reward for episodes in this batch: 

Average reward for episodes in this batch: 86.000000
  Total average reward 39.000000.
Average reward for episodes in this batch: 99.000000
  Total average reward 39.000000.
Average reward for episodes in this batch: 76.000000
  Total average reward 40.000000.
Average reward for episodes in this batch: 63.000000
  Total average reward 40.000000.
Average reward for episodes in this batch: 100.000000
  Total average reward 41.000000.
Average reward for episodes in this batch: 59.000000
  Total average reward 41.000000.
Average reward for episodes in this batch: 90.000000
  Total average reward 41.000000.
Average reward for episodes in this batch: 133.000000
  Total average reward 42.000000.
Average reward for episodes in this batch: 74.000000
  Total average reward 43.000000.
Average reward for episodes in this batch: 81.000000
  Total average reward 43.000000.
Average reward for episodes in this batch: 93.000000
  Total average reward 43.000000.
Average reward for episodes in this batch