# Cart Pendulum balancing using Policy Gradients
This example shows how a policy network can be created, trained with policy gradients, and applied to the cart pendulum problem.

In [1]:
# Set relative path to parent directory
import sys, os
sys.path.insert(0, os.path.abspath('..'))

In [2]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import matplotlib.pyplot as plt
import numpy as np

# Path of the TensorFlow checkpoint used to save and restore the policy network
checkpoint_file = "./policy_net.ckpt"

In [3]:
# Import environment
from environments.cartPendulum import cartPendulum

# Create the cart-pendulum environment with its physical parameters
# (units are presumably SI: kg, m, m/s^2 -- confirm in environments.cartPendulum)
env = cartPendulum(mass_cart=1, mass_pendulum=0.1, length_pendulum=1, gravity=9.81)
# Simulation step size; presumably seconds per env.step call, since the animation
# at the end of the notebook uses step_size*1000 as its frame interval in ms
env.step_size = 0.02

## Defining environment functions
We start by defining some of the functions for the environment.
- The reward is 1 when the agent is within $0.1$ rad $\approx 5.7$ degrees of the upright position, -1 when the cart is out of bounds and 0 otherwise.
- The actions available to the agent are $\pm 10, \pm 1$ and $0$ Newtons of force on the cart
- The terminal conditions are when the cart is $\pm 2$m from the center or the pendulum is $>0.5$ rad from the upright position
- The state available to the agent is $x, \theta, \dot{x}, \dot{\theta}$ where $x$ is the position of the cart and $\theta$ is the angle of the pendulum

In [4]:
# Reward: -1 once the cart is more than 2 from the center, +1 while the
# pendulum is within 0.1 rad of upright, 0 otherwise
def _reward():
    if np.abs(env.x[0]) > 2:
        return -1
    return 1 if np.cos(env.state()[1]) > np.cos(0.1) else 0

env.reward = _reward

# Legal actions: the forces the agent may apply to the cart
env.actions = lambda : np.array([-10, -1, 0, 1, 10])

# Terminal state: cart out of bounds, or pendulum more than 0.5 rad from upright
def _terminal():
    cart_out_of_bounds = np.abs(env.x[0]) > 2
    return cart_out_of_bounds or np.abs(env.state()[1]) > 0.5

env.terminal = _terminal

# State variables: the raw state env.x with the angle wrapped into [-pi, pi)
def r(theta):
    """Wrap the angle theta into the interval [-pi, pi)."""
    return ((theta/np.pi - 1) % 2)*np.pi - np.pi

def _state():
    position, angle = env.x[0], env.x[1]
    velocity, angular_velocity = env.x[2], env.x[3]
    return np.array([position, r(angle), velocity, angular_velocity])

env.state = _state

## Creating the policy based agent
We first create a discounting function which takes in the rewards over time and returns the discounted sequence of rewards given the discount factor $\gamma$. The total discounted reward at time $t$ can be written as: 
$$G_t = R_t + \gamma R_{t+1} + \gamma^2 R_{t+2} + \dots $$ 
Recursively this can be written as:
$$ G_t = R_{t} + \gamma G_{t+1} $$

In [5]:
def discount_rewards(rewards, gamma, normalize = False):
    """Return the discounted return G_t for every time step of one episode.

    Evaluates the recursion G_t = R_t + gamma * G_{t+1} backwards from the
    last step, where the return equals the last reward.

    Args:
        rewards: 1-D sequence (list or array) of per-step rewards.
        gamma: Discount factor applied per time step.
        normalize: If True, scale the returns by their largest absolute
            value, provided that value is positive.

    Returns:
        A float array with one discounted return per entry of `rewards`;
        an empty array when `rewards` is empty.
    """
    # Accept lists and object-dtype arrays as well as float arrays
    rewards = np.asarray(rewards, dtype=float)
    discounted_reward = np.zeros(rewards.shape)

    # Carry G_{t+1} in an accumulator instead of reading it back from the array
    running_return = 0.0
    for t in reversed(range(rewards.shape[0])):
        running_return = running_return * gamma + rewards[t]
        discounted_reward[t] = running_return

    # Only look for the largest magnitude when it is needed; taking the
    # maximum of an empty array would raise
    if normalize and discounted_reward.size > 0:
        largest = np.max(np.abs(discounted_reward))
        if largest > 0:
            return discounted_reward / largest
    return discounted_reward

The function approximator used is a neural network; here we use a network with one hidden layer, where the input is the state and the output is the probability distribution over the available actions. The training consists of simulating the environment by following the current policy until we reach the terminal conditions. We then calculate the gradients of the log-probability of the chosen action for each time step and multiply the gradients by the discounted reward. We then nudge the policy network in this direction in order to promote good behaviour and punish bad behaviour.

In [6]:
class agent():
    """Softmax policy network together with its policy-gradient training ops.

    The graph maps a batch of states to a probability distribution over the
    actions. Training minimises -mean(log pi(a_t | s_t) * reward_t); the
    gradients are exposed separately (`gradients`) from the op that applies
    externally supplied gradients (`update_batch`), so a caller can
    accumulate gradients over several episodes before updating.
    """
    def __init__(self, lr, s_size,a_size,h_size):
        """Build the network and training ops in the default TensorFlow graph.

        Args:
            lr: Learning rate handed to the Adam optimizer.
            s_size: Number of entries in one state vector (network input).
            a_size: Number of actions (network output size).
            h_size: Number of units in the hidden layer.
        """
        # Feed-forward part: state -> ReLU hidden layer -> softmax over the
        # actions. Both layers are built without bias terms.
        self.state_in = tf.placeholder(dtype=tf.float32, shape=[None, s_size])
        hidden_layer = slim.fully_connected(
            self.state_in, h_size,
            activation_fn=tf.nn.relu, biases_initializer=None)
        self.output = slim.fully_connected(
            hidden_layer, a_size,
            activation_fn=tf.nn.softmax, biases_initializer=None)
        self.chosen_action = tf.argmax(self.output, 1)

        # Training inputs: one reward and one chosen action index per row of
        # state_in.
        self.reward_holder = tf.placeholder(dtype=tf.float32, shape=[None])
        self.action_holder = tf.placeholder(dtype=tf.int32, shape=[None])

        # Pick, for every row, the probability of the action that was taken:
        # flatten the output and index it at row * n_actions + action.
        row_offsets = tf.range(0, tf.shape(self.output)[0]) * tf.shape(self.output)[1]
        self.indexes = row_offsets + self.action_holder
        flat_probabilities = tf.reshape(self.output, [-1])
        self.responsible_outputs = tf.gather(flat_probabilities, self.indexes)

        # NOTE(review): tf.log gives -inf if a selected probability is
        # exactly 0 -- confirm whether that can occur in practice.
        log_probabilities = tf.log(self.responsible_outputs)
        self.loss = -tf.reduce_mean(log_probabilities * self.reward_holder)

        # One placeholder per trainable variable, used to feed accumulated
        # gradients back into the optimizer.
        trainable_vars = tf.trainable_variables()
        self.gradient_holders = [
            tf.placeholder(tf.float32, name=str(position) + '_holder')
            for position in range(len(trainable_vars))
        ]

        self.gradients = tf.gradients(self.loss, trainable_vars)

        adam = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_batch = adam.apply_gradients(
            zip(self.gradient_holders, trainable_vars))

In [7]:
tf.reset_default_graph() # Clear the Tensorflow graph.

myAgent = agent(lr = 1e-2, s_size=len(env.state()) ,a_size = len(env.actions()), h_size=12)
saver = tf.train.Saver() # Create a tensorflow saver

# Training schedule: number of episodes, maximum number of steps per episode,
# how many episodes are accumulated per network update, how often progress is
# printed and saved, and the discount factor
total_episodes = 200
max_steps = 1000
update_frequency = 5
save_frequency = 100
gamma = 0.99

init = tf.global_variables_initializer()

# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    # Continue from an earlier run when a checkpoint exists; otherwise train
    # from the freshly initialised weights instead of failing on restore
    if tf.train.checkpoint_exists(checkpoint_file):
        saver.restore(sess, checkpoint_file)
    total_reward = []
    total_lenght = []

    # Gradient accumulator: one zero array per trainable variable
    gradBuffer = [np.zeros_like(value) for value in sess.run(tf.trainable_variables())]

    actions = env.actions()

    for i in range(total_episodes):
        # Start every episode with the cart centered and a random small angle
        theta = (2*np.random.random() - 1)*0.4
        state = env.init([0,theta,0,0])
        running_reward = 0
        # Keep the trajectory in separate homogeneous lists; packing states
        # and scalars into one array would make it ragged
        states, action_indices, rewards = [], [], []
        for j in range(max_steps):
            # Probabilistically pick an action index given our network outputs.
            a_dist = sess.run(myAgent.output,feed_dict={myAgent.state_in:[state]})
            action_idx = np.random.choice(len(actions), p=a_dist[0])

            next_state = env.step(actions[action_idx])
            reward = env.reward()
            states.append(state)
            action_indices.append(action_idx)
            rewards.append(reward)
            state = next_state
            running_reward += reward
            if env.terminal() or (j + 1) == max_steps:
                # Episode finished: compute this episode's gradients, weighted
                # by the discounted returns, and add them to the accumulator.
                returns = discount_rewards(np.array(rewards, dtype=float), gamma, normalize=False)
                feed_dict = {myAgent.reward_holder: returns,
                             myAgent.action_holder: np.array(action_indices, dtype=np.int32),
                             myAgent.state_in: np.vstack(states)}
                grads = sess.run(myAgent.gradients, feed_dict=feed_dict)
                for idx,grad in enumerate(grads):
                    gradBuffer[idx] += grad

                # Apply the accumulated gradients after every
                # `update_frequency` completed episodes, so every batch has
                # the same size and no trailing episodes are discarded.
                if (i + 1) % update_frequency == 0:
                    feed_dict = dict(zip(myAgent.gradient_holders, gradBuffer))
                    _ = sess.run(myAgent.update_batch, feed_dict=feed_dict)
                    # Reset with fill(0): multiplying by 0 would keep NaN/inf
                    for buffered in gradBuffer:
                        buffered.fill(0)

                total_reward.append(running_reward)
                # j is the zero-based index of the last step taken
                total_lenght.append(j + 1)
                break

        # Print training progress and save network
        if i % save_frequency == 0:
            saver.save(sess, checkpoint_file)
            mean_reward = np.mean(total_reward[-save_frequency:])
            print("Num episodes: {}, Mean reward: {}".format(i, mean_reward))

    # Save the final weights so that training done after the last periodic
    # save is not lost when the checkpoint is restored later
    saver.save(sess, checkpoint_file)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Restoring parameters from ./policy_net.ckpt
Num episodes: 0, Mean reward: 930.0
Num episodes: 100, Mean reward: 984.65


## Show training progress

In [8]:
from ipywidgets import widgets
%matplotlib inline

def plot_moving_average(window_size):
    """Plot the moving average of the per-episode rewards in `total_reward`.

    Args:
        window_size: Number of consecutive episodes averaged into each
            plotted point. It is clamped to the range from 1 to the number
            of recorded episodes.
    """
    rewards = np.asarray(total_reward, dtype=float)
    if rewards.size == 0:
        # Nothing recorded yet: show an empty plot rather than raising
        data = rewards
    else:
        # np.convolve in 'valid' mode swaps its operands when the window is
        # longer than the data, which would plot a meaningless curve; keep
        # the window no longer than the data
        width = min(max(int(window_size), 1), rewards.size)
        window = np.ones(width)/float(width)
        data = np.convolve(rewards, window, 'valid')
    plt.plot(data)
    plt.show()

# Interactive slider for the averaging window: integer values from 1 to 1000
slider = widgets.interact(plot_moving_average, window_size=(1, 1000, 1))

In [9]:
from matplotlib import pyplot as plt
from matplotlib.patches import Rectangle
from matplotlib import animation
from IPython.display import HTML

# Roll out the saved policy for n_steps and record the raw environment state
# env.x of every step as a column of X
n_steps = 500
X = np.zeros((len(env.x), n_steps))
with tf.Session() as sess:
    saver.restore(sess, checkpoint_file)
    initial_angle = (2*np.random.random() - 1)*0.4
    state = env.init([0, initial_angle, 0, 0])
    for step in range(n_steps):
        X[:, step] = env.x
        a_dist = sess.run(myAgent.output, feed_dict={myAgent.state_in: [state]})
        # Act greedily: apply the action with the highest probability
        best_action = env.actions()[np.argmax(a_dist)]
        state = env.step(best_action)

# Set up the figure: a line for the pendulum and a rectangle for the cart
fig = plt.figure()
ax = plt.axes(xlim=(-2, 2), ylim=(-1.1, 1.1), aspect='equal')
line, = ax.plot([], [], lw=2, marker='o', markersize=6)
rect = Rectangle([X[0, 0] - 0.2, -0.1], 0.4, 0.2, fill=True, color='red', ec='black')
ax.add_patch(rect)

def animate(i):
    """Move the cart and the pendulum to their recorded pose at frame i."""
    cart_x, angle = X[0, i], X[1, i]
    return (
        rect.set_xy([cart_x - 0.2, -0.1]),
        line.set_data([cart_x, cart_x + np.sin(angle)], [0, np.cos(angle)]),
    )

# Close the static figure, then build the animation and show it as a video
plt.close()
anim = animation.FuncAnimation(fig, animate, interval=env.step_size*1000, frames=n_steps)
HTML(anim.to_html5_video())

INFO:tensorflow:Restoring parameters from ./policy_net.ckpt
