# Simple Policy Gradient

This notebook contains the implementation of the Simple Policy Gradient Algorithm using TensorFlow.
<br/>
This notebook was created while working through the official Spinning Up in Deep RL documentation.

In [1]:
# Required modules
!pip install gym
!apt-get install python-opengl
!pip install pyglet==1.2.4

Reading package lists... Done
Building dependency tree       
Reading state information... Done
python-opengl is already the newest version (3.1.0+dfsg-1).
0 upgraded, 0 newly installed, 0 to remove and 7 not upgraded.


In [0]:
# Import required modules
import numpy as np
import tensorflow as tf
from tensorflow.python.framework import graph_util
from tensorflow.python.platform import gfile
import gym
from gym.spaces import Discrete, Box
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display
from google.colab import files

In [0]:
# Arguments
env_name = 'CartPole-v0'
render = True

In [24]:
# Create the env from the configured name, so `env_name` (which also names
# the exported graph file) is the single place where the environment is chosen.
env = gym.make(env_name)

  result = entry_point.load(False)


In [25]:
# Read the sizes of the observation vector and of the discrete action set
obs_size = env.observation_space.shape[0]
act_size = env.action_space.n

# Both parts are passed to print() as separate arguments (space-separated)
size_report = ('Action Space Size: {}'.format(act_size),
               '\nObservation Space Size: {}'.format(obs_size))
print(*size_report)

Action Space Size: 2 
Observation Space Size: 4


In [0]:
# Network Hyperparameters
# Number of dense layers in the policy network
layers = 2
# Units per dense layer; the last entry is the number of actions (logit count)
hneurons = [32, act_size]
# Number of train_one_epoch() calls made by the training loop
epochs = 50
# An epoch keeps collecting whole episodes until it holds more observations than this
batch_size = 5000
# Learning rate handed to the Adam optimizer
lr = 1e-2
# Activation of the hidden layer
hid_act = tf.tanh
# Intended activation of the output layer (None keeps the logits linear)
out_act = None

In [0]:
# Build the network
# One observation per row. The op names 'input' and 'output' are relied on
# later, when the graph is frozen and re-imported for testing.
obs_ph = tf.placeholder(shape=(None, obs_size), dtype=tf.float32, name='input')

a1 = tf.layers.dense(obs_ph, units=hneurons[0], activation=hid_act)
# Honour the configured output activation instead of a hard-coded None
# (out_act is None by default, which yields raw logits).
logits = tf.layers.dense(a1, units=hneurons[1], activation=out_act)

# Select the action: sample one action index per row from the categorical
# distribution defined by the logits, then drop the sample axis.
actions = tf.squeeze(tf.multinomial(logits=logits, num_samples=1), axis=1, name='output')

# Loss function whose gradient is the policy gradient
weights_ph = tf.placeholder(shape=(None,), dtype=tf.float32)
act_ph = tf.placeholder(shape=(None,), dtype=tf.int32)
# The one-hot mask picks out the log-probability of the action actually taken
action_masks = tf.one_hot(act_ph, act_size)
log_probs = tf.reduce_sum(action_masks * tf.nn.log_softmax(logits), axis=1)
# Negated because the optimizer minimizes while the weighted log-likelihood is to be maximized
loss = -tf.reduce_mean(weights_ph * log_probs)

# Make the train op
train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

In [0]:
# Open the training session, create the checkpoint saver for the variables
# defined so far, and initialise those variables.
sess = tf.Session()
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()
sess.run(init_op)

In [0]:
def show_state(env):
    """Draw the current frame of `env` inline, replacing the previous output.

    The environment is rendered exactly once as an RGB array, shown on
    matplotlib figure 3 with the axes hidden, and the cell output is cleared
    (deferred until the new figure is ready) before the figure is displayed.

    Args:
        env: environment whose ``render(mode='rgb_array')`` returns an image
            that ``plt.imshow`` can draw.
    """
    plt.figure(3)
    plt.clf()
    # Render once and reuse the frame; the earlier debug prints of the frame's
    # type and raw pixel values flooded the cell output and are removed.
    frame = env.render(mode='rgb_array')
    plt.imshow(frame)
    plt.axis('off')

    display.clear_output(wait=True)
    display.display(plt.gcf())

In [0]:
def train_one_epoch():
    """Collect one batch of experience and apply a single policy-gradient update.

    Whole episodes are played with actions sampled from the current policy.
    Collection stops at the first episode boundary where more than
    `batch_size` observations have been gathered. Every step of an episode is
    weighted by that episode's total return.

    Returns:
        tuple: (batch_loss, batch_rews, batch_len), where batch_loss is the
        loss value computed for the batch, batch_rews is the list of episode
        returns and batch_len is the list of episode lengths.
    """
    # Per-step buffers: one entry for every environment step in the batch
    batch_obs, batch_acts, batch_weights = [], [], []
    # Per-episode buffers: one entry for every finished episode
    batch_rews, batch_len = [], []

    obs = env.reset()
    ep_rews = []

    # Rendering is disabled here; show_state(env) (hosted notebooks) or
    # env.render() (local machines) could be called at the top of the loop.
    enough_data = False
    while not enough_data:
        batch_obs.append(obs)

        # Sample a single action for the current observation
        sampled = sess.run([actions], feed_dict={obs_ph: obs.reshape(1, -1)})
        act = sampled[0][0]

        # Take the action and record what happened
        obs, rewards, done, info = env.step(act)
        batch_acts.append(act)
        ep_rews.append(rewards)

        if not done:
            continue

        # Episode finished: store its return and length, and weight each of
        # its steps by the full episode return.
        ep_ret, ep_len = sum(ep_rews), len(ep_rews)
        batch_rews.append(ep_ret)
        batch_len.append(ep_len)
        batch_weights.extend([ep_ret] * ep_len)

        # Start the next episode, then check whether the batch is full
        obs, ep_rews = env.reset(), []
        enough_data = len(batch_obs) > batch_size

    feed = {obs_ph: np.array(batch_obs),
            act_ph: np.array(batch_acts),
            weights_ph: np.array(batch_weights)}
    batch_loss, _ = sess.run([loss, train_op], feed_dict=feed)

    return batch_loss, batch_rews, batch_len

In [0]:
# Training Loop Parameters
# A checkpoint is saved every `ckpt_interval` epochs
ckpt_interval = 5
# Path prefix under which checkpoints are written and from which they are restored
save_path = './ckpt_path/'
# When True (and ckpt_num is set), a checkpoint is restored before training starts
restore = False
# Epoch number of the checkpoint to restore, i.e. file 'spg_ckpt<ckpt_num>.ckpt'
ckpt_num = 45

In [0]:
# Saving the weights along with the graph
def save_graph(sess, graph, graph_name):
    """Freeze `graph` with the current variable values and write it to disk.

    Variables are converted to constants using the values held by `sess`,
    keeping the part of the graph needed to compute the 'output' node, and
    the resulting GraphDef is serialized to `graph_name`.

    Args:
        sess: open session holding the variable values to bake into the graph.
        graph: graph whose definition is frozen.
        graph_name: path of the file the serialized GraphDef is written to.
    """
    output_graph_def = graph_util.convert_variables_to_constants(
        sess, graph.as_graph_def(), ['output'])
    # gfile.FastGFile is deprecated; gfile.GFile is the supported replacement
    with gfile.GFile(graph_name, 'wb') as f:
        f.write(output_graph_def.SerializeToString())

In [21]:
# Training loop

# Optionally resume from a previously saved checkpoint
if restore and ckpt_num is not None:
    saver.restore(sess, save_path + 'spg_ckpt{}.ckpt'.format(ckpt_num))
    print ('[INFO]Model Restored!!')

for epoch in range(epochs):
    batch_loss, batch_rets, batch_lens = train_one_epoch()
    print ('Epoch: {:.3f} Loss: {:.3f} Return: {:.3f} ep_len: {:.3f}'
           .format(epoch+1, batch_loss, np.mean(batch_rets), np.mean(batch_lens)))

    # Periodically persist the session so training can be resumed or exported
    if (epoch+1) % ckpt_interval == 0:
        print ('[INFO]Saving Checkpoint...')
        curr_save_path = saver.save(sess, save_path + 'spg_ckpt{}.ckpt'.format(epoch+1))
        print ('[INFO]Session saved Successfully!!')
        print ('[INFO]Checkpoint saved at: {}'.format(curr_save_path))
        print ('*************************************************')

# The session is deliberately left open: the graph-export cell further down
# passes this same `sess` to save_graph(), which fails on a closed session.

Epoch: 1.000 Loss: 30.518 Return: 34.384 ep_len: 34.384
Epoch: 2.000 Loss: 35.005 Return: 39.786 ep_len: 39.786
Epoch: 3.000 Loss: 33.453 Return: 41.372 ep_len: 41.372
Epoch: 4.000 Loss: 36.811 Return: 47.047 ep_len: 47.047
Epoch: 5.000 Loss: 36.649 Return: 47.255 ep_len: 47.255
[INFO]Saving Checkpoint...
[INFO]Session saved Successfully!!
[INFO]Checkpoint saved at: ./ckpt_path/spg_ckpt5.ckpt
*************************************************
Epoch: 6.000 Loss: 35.193 Return: 49.634 ep_len: 49.634
Epoch: 7.000 Loss: 41.239 Return: 57.159 ep_len: 57.159
Epoch: 8.000 Loss: 41.997 Return: 58.805 ep_len: 58.805
Epoch: 9.000 Loss: 40.687 Return: 60.554 ep_len: 60.554
Epoch: 10.000 Loss: 46.847 Return: 65.013 ep_len: 65.013
[INFO]Saving Checkpoint...
[INFO]Session saved Successfully!!
[INFO]Checkpoint saved at: ./ckpt_path/spg_ckpt10.ckpt
*************************************************
Epoch: 11.000 Loss: 45.727 Return: 67.373 ep_len: 67.373
Epoch: 12.000 Loss: 46.163 Return: 68.176 ep_len:

In [51]:
# Download checkpoints
dwnld_ckpt = 50
ckpt_file_endings = ('meta', 'index', 'data-00000-of-00001')

# Trigger one browser download per checkpoint file ending
for file_ending in ckpt_file_endings:
    ckpt_file = './ckpt_path/spg_ckpt{}.ckpt.{}'.format(dwnld_ckpt, file_ending)
    files.download(ckpt_file)
    sent_msg = '[INFO]Download popup for ckpt file with .{} ending sent successfully!!'.format(file_ending)
    print(sent_msg)

all_sent_msg = '[INFO]All download notifications for the ckpt {} file sent successfully!!'.format(dwnld_ckpt)
print(all_sent_msg)

[INFO]Download popup for ckpt file with .meta ending sent successfully!!
[INFO]Download popup for ckpt file with .index ending sent successfully!!
[INFO]Download popup for ckpt file with .data-00000-of-00001 ending sent successfully!!
All download notifications for the ckpt 50 file sent successfully!!


In [0]:
# Freeze the default graph with the trained weights and write it next to the
# notebook, in a file named after the environment.
graph_file = env_name + '_graph.pb'
print('[INFO]Saving the graph and weights...')
save_graph(sess, tf.get_default_graph(), graph_file)
print('[INFO]Saved Successfully!!')

In [0]:
# Load the graph
model_file = './CartPole-v0_graph.pb'

def load_graph(model_file):
    """Load a serialized GraphDef from disk into a new graph.

    Args:
        model_file: path of the binary GraphDef file to read.

    Returns:
        A fresh tf.Graph into which the parsed GraphDef has been imported.
    """
    print('[INFO]Loading Model...')

    print('[INFO]Reading model file...')
    with open(model_file, 'rb') as model_fh:
        serialized = model_fh.read()

    parsed_def = tf.GraphDef()
    parsed_def.ParseFromString(serialized)

    # Import into a separate graph so the default graph is left untouched
    imported = tf.Graph()
    with imported.as_default():
        tf.import_graph_def(parsed_def)

    print('[INFO]Model Loaded Successfully!!')
    return imported

graph = load_graph(model_file)

In [50]:
# Test the network

# Names of the placeholder and sampling ops inside the imported graph
# (tf.import_graph_def() prefixes imported names with 'import/' by default)
input_layer = 'import/input'
output_layer = 'import/output'

input_op = graph.get_operation_by_name(input_layer)
output_op = graph.get_operation_by_name(output_layer)
input_tensor = input_op.outputs[0]
output_tensor = output_op.outputs[0]

# A dedicated session name keeps the training session `sess` from being
# rebound to a session that is closed when this block exits. The graph was
# frozen (variables converted to constants), so no initializer is needed.
with tf.Session(graph=graph) as test_sess:
    obs = env.reset()
    done = False
    ep_rews = 0

    # Play a single episode with actions sampled from the frozen policy
    while not done:
        act = test_sess.run([output_tensor], feed_dict={input_tensor: obs.reshape(1, -1)})

        obs, rewards, done, info = env.step(act[0][0])

        ep_rews += rewards

    print ('Test Episode Rewards: {}'.format(ep_rews))

Test Episode Rewards: 200.0


In [0]:
# TensorBoard Setup
# Downloads and unpacks the ngrok binary, starts TensorBoard and an ngrok
# HTTP tunnel to port 6006 in the background, then prints the tunnel's public
# URL as reported by ngrok's local API on port 4040.
# NOTE(review): the logdir points at ./tboard/FrozenLake-v0/ although this
# notebook trains CartPole-v0, and no summaries are written by the cells
# visible here -- confirm the intended log directory.
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip
get_ipython().system_raw('tensorboard --logdir=./tboard/FrozenLake-v0/ &')
get_ipython().system_raw('./ngrok http 6006 &')
!curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

--2018-12-01 19:11:33--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 34.232.181.106, 34.226.180.131, 34.232.40.183, ...
Connecting to bin.equinox.io (bin.equinox.io)|34.232.181.106|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5363700 (5.1M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.zip’


2018-12-01 19:11:33 (10.6 MB/s) - ‘ngrok-stable-linux-amd64.zip’ saved [5363700/5363700]

Archive:  ngrok-stable-linux-amd64.zip
  inflating: ngrok                   
http://d2cfdd99.ngrok.io
