# Simple Policy Gradient

This notebook contains the implementation of the Simple Policy Gradient Algorithm using TensorFlow.
<br/>
This notebook was created while working through the official Spinning Up in Deep RL documentation.

In [0]:
# Required modules
!pip install gym
!apt-get install python-opengl
!pip install pyglet==1.2.4

In [0]:
# Import required modules
import numpy as np
import tensorflow as tf
import gym
from gym.spaces import Discrete, Box
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display

In [0]:
# Arguments
# Gym environment id used by this notebook.
env_name = 'CartPole-v0'
# Rendering flag.
# NOTE(review): no cell visible in this notebook reads `render` — confirm whether it is still needed.
render = True

In [4]:
# Create the env
# Use the configured id so that changing `env_name` in the arguments cell
# actually changes the environment being trained on.
env = gym.make(env_name)

  result = entry_point.load(False)


In [5]:
# Get the action space size and observation space size
# The code below reads `action_space.n` and `observation_space.shape[0]`,
# which only exist for Discrete / Box spaces; fail early with a clear message
# instead of an AttributeError if a different kind of env is configured.
assert isinstance(env.observation_space, Box), \
    'This notebook only supports environments with Box observation spaces.'
assert isinstance(env.action_space, Discrete), \
    'This notebook only supports environments with Discrete action spaces.'

act_size = env.action_space.n
obs_size = env.observation_space.shape[0]

print('Action Space Size: {}'.format(act_size),
      '\nObservation Space Size: {}'.format(obs_size))

Action Space Size: 2 
Observation Space Size: 4


In [0]:
# Network Hyperparameters
# Number of dense layers.
# NOTE(review): `layers` is not read by any cell visible here — confirm it is still needed.
layers = 2
# Units per dense layer: hneurons[0] for the hidden layer, hneurons[1] for the output layer.
hneurons = [32, act_size]
# Number of iterations of the training loop.
epochs = 50
# Number of collected environment steps an epoch must exceed before the update is run.
batch_size = 5000
# Learning rate passed to the Adam optimizer.
lr = 1e-2
# Activation applied by the hidden dense layer.
hid_act = tf.tanh
# Output activation setting; None means no activation.
out_act = None

In [0]:
# Build the network
# Batch of observations, one row per environment step.
obs_ph = tf.placeholder(shape=(None, obs_size), dtype=tf.float32, name='input')

a1 = tf.layers.dense(obs_ph, units=hneurons[0], activation=hid_act)
# Use the configured output activation instead of a hard-coded None so the
# `out_act` hyperparameter is honoured. It should stay None (linear): both
# tf.multinomial and tf.nn.log_softmax below expect unnormalized logits.
logits = tf.layers.dense(a1, units=hneurons[1], activation=out_act)

# Select the action
# Draw one action per row from the categorical distribution defined by the logits.
actions = tf.squeeze(tf.multinomial(logits=logits, num_samples=1), axis=1)

# Loss function whose gradient is the policy gradient
# weights_ph: per-step weight; act_ph: index of the action taken at that step.
weights_ph = tf.placeholder(shape=(None,), dtype=tf.float32)
act_ph = tf.placeholder(shape=(None,), dtype=tf.int32)
# The one-hot mask keeps only the log-probability of the action actually taken.
action_masks = tf.one_hot(act_ph, act_size)
log_probs = tf.reduce_sum(action_masks * tf.nn.log_softmax(logits), axis=1)
# Negated because the optimizer minimizes.
loss = -tf.reduce_mean(weights_ph * log_probs)

# Make the train op
train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

In [0]:
# Open a session on the default graph and give every variable its initial value.
init_op = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init_op)

In [0]:
def show_state(env):
    """Draw the environment's current frame inline, replacing the previous output.

    Args:
        env: Environment whose `render(mode='rgb_array')` result is passed
            to `plt.imshow`.
    """
    # Render once and reuse the frame; the debug prints that dumped the whole
    # pixel array into the cell output have been removed.
    frame = env.render(mode='rgb_array')

    # Reuse a fixed figure number and clear it so repeated calls do not
    # accumulate figures.
    plt.figure(3)
    plt.clf()
    plt.imshow(frame)
    plt.axis('off')

    # wait=True delays clearing until new output is available.
    display.clear_output(wait=True)
    display.display(plt.gcf())

In [0]:
def train_one_epoch():
    """Collect one batch of experience with the current policy and run one update.

    Episodes are played back to back. Collection stops at the end of the first
    episode after which more than `batch_size` steps have been gathered, so
    every episode in the batch is complete. Each step is weighted by the total
    return of the episode it belongs to. No rendering is done during
    collection.

    Uses the notebook-level `env`, `sess`, `batch_size` and the graph tensors
    (`obs_ph`, `act_ph`, `weights_ph`, `actions`, `loss`, `train_op`).

    Returns:
        tuple: `(batch_loss, batch_rews, batch_len)` where `batch_loss` is the
        loss fetched in the same `sess.run` call as the update, `batch_rews`
        is the list of episode returns and `batch_len` the list of episode
        lengths.
    """
    # Per-step data fed to the update.
    batch_obs = []
    batch_acts = []
    batch_weights = []
    # Per-episode statistics returned for logging.
    batch_rews = []
    batch_len = []

    obs = env.reset()
    ep_rews = []

    while True:
        # Defensive copy, in case the environment reuses its observation buffer.
        batch_obs.append(obs.copy())

        # The policy expects a batch dimension; take the single sampled action.
        act = sess.run(actions, feed_dict={obs_ph: obs.reshape(1, -1)})[0]

        # Take the action
        obs, reward, done, _ = env.step(act)

        # save action, reward
        batch_acts.append(act)
        ep_rews.append(reward)

        if done:
            # Record info, as episode is complete
            ep_ret, ep_len = sum(ep_rews), len(ep_rews)
            batch_rews.append(ep_ret)
            batch_len.append(ep_len)

            # Every step of the episode gets the episode's total return as weight.
            batch_weights += [ep_ret] * ep_len

            # Reset the environment
            obs, ep_rews = env.reset(), []

            # Only stop on an episode boundary.
            if len(batch_obs) > batch_size:
                break

    batch_loss, _ = sess.run(
        [loss, train_op],
        feed_dict={
            obs_ph: np.array(batch_obs),
            act_ph: np.array(batch_acts),
            weights_ph: np.array(batch_weights),
        },
    )

    return batch_loss, batch_rews, batch_len

In [27]:
# Training loop
# One policy update per epoch; report the loss plus the mean episode return
# and mean episode length of the batch collected in that epoch.
for epoch in range(epochs):
    batch_loss, batch_rets, batch_lens = train_one_epoch()
    # The epoch counter is an integer, so format it as one (it previously
    # printed as "1.000").
    print('Epoch: {:d} Loss: {:.3f} Return: {:.3f} ep_len: {:.3f}'
          .format(epoch + 1, batch_loss, np.mean(batch_rets), np.mean(batch_lens)))

Epoch: 1.000 Loss: 24.082 Return: 26.649 ep_len: 26.649
Epoch: 2.000 Loss: 24.285 Return: 28.441 ep_len: 28.441
Epoch: 3.000 Loss: 28.568 Return: 32.987 ep_len: 32.987
Epoch: 4.000 Loss: 30.247 Return: 35.203 ep_len: 35.203
Epoch: 5.000 Loss: 32.122 Return: 40.715 ep_len: 40.715
Epoch: 6.000 Loss: 35.210 Return: 44.202 ep_len: 44.202
Epoch: 7.000 Loss: 36.932 Return: 48.317 ep_len: 48.317
Epoch: 8.000 Loss: 40.528 Return: 50.970 ep_len: 50.970
Epoch: 9.000 Loss: 38.384 Return: 53.287 ep_len: 53.287
Epoch: 10.000 Loss: 38.973 Return: 53.287 ep_len: 53.287
Epoch: 11.000 Loss: 44.026 Return: 61.914 ep_len: 61.914
Epoch: 12.000 Loss: 45.591 Return: 62.543 ep_len: 62.543
Epoch: 13.000 Loss: 46.638 Return: 69.192 ep_len: 69.192
Epoch: 14.000 Loss: 54.469 Return: 76.121 ep_len: 76.121
Epoch: 15.000 Loss: 53.321 Return: 76.182 ep_len: 76.182
Epoch: 16.000 Loss: 55.920 Return: 76.000 ep_len: 76.000
Epoch: 17.000 Loss: 61.135 Return: 85.627 ep_len: 85.627
Epoch: 18.000 Loss: 60.521 Return: 88.29

In [0]:
# TensorBoard Setup
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip
get_ipython().system_raw('tensorboard --logdir=./tboard/FrozenLake-v0/ &')
get_ipython().system_raw('./ngrok http 6006 &')
!curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

--2018-12-01 19:11:33--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 34.232.181.106, 34.226.180.131, 34.232.40.183, ...
Connecting to bin.equinox.io (bin.equinox.io)|34.232.181.106|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5363700 (5.1M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.zip’


2018-12-01 19:11:33 (10.6 MB/s) - ‘ngrok-stable-linux-amd64.zip’ saved [5363700/5363700]

Archive:  ngrok-stable-linux-amd64.zip
  inflating: ngrok                   
http://d2cfdd99.ngrok.io


In [0]:
# Visualise a random policy for 1000 steps.
env.reset()
for _step in range(1000):
    _obs, _reward, done, _info = env.step(env.action_space.sample())
    show_state(env)
    # Start a new episode once this one ends, rather than continuing to step
    # an environment that has already reported done.
    if done:
        env.reset()