In [5]:
from __future__ import division

import numpy as np
try:
    import cPickle as pickle
except:
    import pickle
import tensorflow as tf
%matplotlib inline
import matplotlib.pyplot as plt
import math

try:
    xrange = xrange
except:
    xrange = range

### Loading the CartPole Environment










In [6]:
import gym
env = gym.make('CartPole-v0')

[2017-08-16 17:13:25,952] Making new env: CartPole-v0


What happens if we try running the environment with random actions? How well do we do? (Hint: not so well.)

In [7]:
env.reset()
random_episodes = 0
reward_sum = 0
while random_episodes < 10:
    env.render()
    observation, reward, done, _ = env.step(np.random.randint(0,2))
    reward_sum += reward
    if done:
        random_episodes += 1
        print("Reward for this episode was:",reward_sum)
        reward_sum = 0
        env.reset()

('Reward for this episode was:', 26.0)
('Reward for this episode was:', 21.0)
('Reward for this episode was:', 57.0)
('Reward for this episode was:', 19.0)
('Reward for this episode was:', 29.0)
('Reward for this episode was:', 14.0)
('Reward for this episode was:', 32.0)
('Reward for this episode was:', 47.0)
('Reward for this episode was:', 18.0)
('Reward for this episode was:', 12.0)


In [8]:
# hyperparameters
H = 10 # number of hidden layer neurons
batch_size = 5 # every how many episodes to do a param update?
learning_rate = 1e-2 # feel free to play with this to train faster or more stably.
gamma = 0.99 # discount factor for reward

D = 4 # input dimensionality

In [9]:
tf.reset_default_graph()

#This defines the network as it goes from taking an observation of the environment to 
#giving a probability of chosing to the action of moving left or right.
observations = tf.placeholder(tf.float32, [None,D] , name="input_x")
W1 = tf.get_variable("W1", shape=[D, H],
           initializer=tf.contrib.layers.xavier_initializer())
layer1 = tf.nn.relu(tf.matmul(observations,W1))
W2 = tf.get_variable("W2", shape=[H, 1],
           initializer=tf.contrib.layers.xavier_initializer())
score = tf.matmul(layer1,W2)
probability = tf.nn.sigmoid(score)

#From here we define the parts of the network needed for learning a good policy.
tvars = tf.trainable_variables()
input_y = tf.placeholder(tf.float32,[None,1], name="input_y")
advantages = tf.placeholder(tf.float32,name="reward_signal")

# The loss function. This sends the weights in the direction of making actions 
# that gave good advantage (reward over time) more likely, and actions that didn't less likely.
loglik = tf.log(input_y*(input_y - probability) + (1 - input_y)*(input_y + probability))
loss = -tf.reduce_mean(loglik * advantages) 
newGrads = tf.gradients(loss,tvars)

# Once we have collected a series of gradients from multiple episodes, we apply them.
# We don't just apply gradeients after every episode in order to account for noise in the reward signal.
adam = tf.train.AdamOptimizer(learning_rate=learning_rate) # Our optimizer
W1Grad = tf.placeholder(tf.float32,name="batch_grad1") # Placeholders to send the final gradients through when we update.
W2Grad = tf.placeholder(tf.float32,name="batch_grad2")
batchGrad = [W1Grad,W2Grad]
updateGrads = adam.apply_gradients(zip(batchGrad,tvars))

In [10]:
def discount_rewards(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(xrange(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

### Running the Agent and Environment

In [11]:
xs,hs,dlogps,drs,ys,tfps = [],[],[],[],[],[]
running_reward = None
reward_sum = 0
episode_number = 1
total_episodes = 10000
init = tf.global_variables_initializer()

# Launch the graph
with tf.Session() as sess:
    rendering = False
    sess.run(init)
    observation = env.reset() # Obtain an initial observation of the environment

    gradBuffer = sess.run(tvars)
    for ix,grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0
    
    while episode_number <= total_episodes:
        
        if reward_sum/batch_size > 100 or rendering == True : 
            env.render()
            rendering = True
            
        x = np.reshape(observation,[1,D])
        
        tfprob = sess.run(probability,feed_dict={observations: x})
        action = 1 if np.random.uniform() < tfprob else 0
        
        xs.append(x) 
        y = 1 if action == 0 else 0 
        ys.append(y)

       
        observation, reward, done, info = env.step(action)
        reward_sum += reward

        drs.append(reward)
        if done: 
            episode_number += 1
            
            epx = np.vstack(xs)
            epy = np.vstack(ys)
            epr = np.vstack(drs)
            tfp = tfps
            xs,hs,dlogps,drs,ys,tfps = [],[],[],[],[],[]

            
            discounted_epr = discount_rewards(epr)
            
            discounted_epr -= np.mean(discounted_epr)
            discounted_epr //= np.std(discounted_epr)
            
            
            tGrad = sess.run(newGrads,feed_dict={observations: epx, input_y: epy, advantages: discounted_epr})
            for ix,grad in enumerate(tGrad):
                gradBuffer[ix] += grad
                
            
            if episode_number % batch_size == 0: 
                sess.run(updateGrads,feed_dict={W1Grad: gradBuffer[0],W2Grad:gradBuffer[1]})
                for ix,grad in enumerate(gradBuffer):
                    gradBuffer[ix] = grad * 0
                
                
                running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                print('Average reward for episode %f.  Total average reward %f.' % (reward_sum//batch_size, running_reward//batch_size))
                
                if reward_sum//batch_size > 200: 
                    print("Task solved in",episode_number,'episodes!')
                    break
                    
                reward_sum = 0
            
            observation = env.reset()
        
print(episode_number,'Episodes completed.')

Average reward for episode 15.000000.  Total average reward 15.000000.
Average reward for episode 18.000000.  Total average reward 15.000000.
Average reward for episode 24.000000.  Total average reward 15.000000.
Average reward for episode 29.000000.  Total average reward 15.000000.
Average reward for episode 34.000000.  Total average reward 15.000000.
Average reward for episode 28.000000.  Total average reward 15.000000.
Average reward for episode 22.000000.  Total average reward 15.000000.
Average reward for episode 24.000000.  Total average reward 15.000000.
Average reward for episode 33.000000.  Total average reward 15.000000.
Average reward for episode 24.000000.  Total average reward 16.000000.
Average reward for episode 38.000000.  Total average reward 16.000000.
Average reward for episode 30.000000.  Total average reward 16.000000.
Average reward for episode 23.000000.  Total average reward 16.000000.
Average reward for episode 56.000000.  Total average reward 16.000000.
Averag

KeyboardInterrupt: 