## Policy gradient using REINFORCE algorithm

In this demo we build a multi-layer perceptron with TensorFlow and train it to estimate the policy for the CartPole environment from OpenAI Gym.

In [1]:
import tensorflow as tf
import numpy as np
import gym

In [2]:
# Single seed shared by every random source used in this notebook.
SEED = 1

env = gym.make('CartPole-v0')
# NOTE(review): `unwrapped` presumably strips gym's wrappers (including any
# episode step limit) -- the logged rewards above 200 are consistent with that.
env = env.unwrapped
# Policy gradient has high variance, so seed everything for reproducibility:
# NumPy drives the action sampling (np.random.choice), TensorFlow drives the
# weight initialisation, and the environment draws the initial states.
np.random.seed(SEED)
tf.set_random_seed(SEED)
# Kept as the last expression so the cell still displays the seed list.
env.seed(SEED)

  result = entry_point.load(False)


[1]

In [3]:
## ENVIRONMENT Hyperparameters
# Read both sizes from the environment (instead of hard-coding the state size)
# so the network shapes cannot drift from the environment being trained on.
state_size = env.observation_space.shape[0]  # 4 for CartPole-v0
action_size = env.action_space.n

## TRAINING Hyperparameters
max_episodes = 300    # Number of training episodes
learning_rate = 0.01  # Step size passed to the optimizer
gamma = 0.95 # Discount rate

In [4]:
def discount_and_normalize_rewards(episode_rewards, discount_rate=None):
    """Turn per-step rewards into normalized discounted returns.

    For every step ``t`` the discounted return is
    ``G_t = r_t + discount_rate * G_{t+1}``. The returns are then shifted to
    zero mean and scaled to unit standard deviation.

    Args:
        episode_rewards: 1-D sequence of per-step rewards for one episode
            (ints or floats).
        discount_rate: Discount factor. Defaults to the notebook-level
            ``gamma`` when omitted.

    Returns:
        A float64 NumPy array with the same length as ``episode_rewards``.
        Empty input returns an empty array; when all returns are equal
        (e.g. a one-step episode) the result is all zeros instead of NaN.
    """
    if discount_rate is None:
        discount_rate = gamma

    # Force a float buffer: zeros_like on integer rewards would produce an
    # integer array and silently truncate the discounted values.
    rewards = np.asarray(episode_rewards, dtype=np.float64)
    discounted_episode_rewards = np.zeros_like(rewards)

    # Accumulate from the last step backwards so each return reuses the next one.
    cumulative = 0.0
    for i in reversed(range(len(rewards))):
        cumulative = cumulative * discount_rate + rewards[i]
        discounted_episode_rewards[i] = cumulative

    if discounted_episode_rewards.size == 0:
        return discounted_episode_rewards

    mean = np.mean(discounted_episode_rewards)
    std = np.std(discounted_episode_rewards)
    if std > 0:
        return (discounted_episode_rewards - mean) / std

    # Zero spread: dividing would give NaN/inf, so only center the values.
    return discounted_episode_rewards - mean

In [5]:
# Build the policy network graph: a state goes in, a probability for each
# action comes out, and the loss / train ops implement the REINFORCE update.
# NOTE(review): every block below is indented under the "inputs" name scope,
# so the fc1/fc2/fc3/softmax/loss/train scopes are nested inside it --
# confirm this nesting is intended before relying on the graph layout.
with tf.name_scope("inputs"):
    # Batch of states, one row per step.
    input_ = tf.placeholder(tf.float32, [None, state_size], name="input_")
    # Actions taken, one row per step; the training cell feeds one-hot rows.
    actions = tf.placeholder(tf.int32, [None, action_size], name="actions")
    # One value per step; the training cell feeds the output of
    # discount_and_normalize_rewards here.
    discounted_episode_rewards_ = tf.placeholder(tf.float32, [None,], name="discounted_episode_rewards")
    
    # Placeholder intended for exposing the mean reward in TensorBoard.
    # It is not fed or read by any cell visible in this notebook.
    mean_reward_ = tf.placeholder(tf.float32 , name="mean_reward")

    # Hidden layer 1: 10 ReLU units on top of the state.
    with tf.name_scope("fc1"):
        fc1 = tf.contrib.layers.fully_connected(inputs = input_,
                                                num_outputs = 10,
                                                activation_fn=tf.nn.relu,
                                                weights_initializer=tf.contrib.layers.xavier_initializer())

    # Hidden layer 2: action_size ReLU units.
    with tf.name_scope("fc2"):
        fc2 = tf.contrib.layers.fully_connected(inputs = fc1,
                                                num_outputs = action_size,
                                                activation_fn= tf.nn.relu,
                                                weights_initializer=tf.contrib.layers.xavier_initializer())
    
    # Output layer: no activation, so fc3 holds the raw logits (one per action).
    with tf.name_scope("fc3"):
        fc3 = tf.contrib.layers.fully_connected(inputs = fc2,
                                                num_outputs = action_size,
                                                activation_fn= None,
                                                weights_initializer=tf.contrib.layers.xavier_initializer())

    # Probabilities the agent samples its action from.
    with tf.name_scope("softmax"):
        action_distribution = tf.nn.softmax(fc3)

    with tf.name_scope("loss"):
        # softmax_cross_entropy_with_logits_v2 applies softmax to the logits and
        # returns the cross entropy against the labels; with one-hot labels this
        # is the negative log-probability of the action that was taken.
        # For single-class labels, tf.nn.sparse_softmax_cross_entropy_with_logits
        # would avoid having to build the dense one-hot array.
        neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits = fc3, labels = actions)
        # Weight each step's negative log-probability by its (normalized)
        # discounted return and average over the episode.
        loss = tf.reduce_mean(neg_log_prob * discounted_episode_rewards_) 
        
    
    # One Adam step on the loss per call to train_opt.
    with tf.name_scope("train"):
        train_opt = tf.train.AdamOptimizer(learning_rate).minimize(loss)

In [6]:
# Train the policy with REINFORCE: play one full episode with the current
# policy, then run a single gradient step on that episode's transitions.
allRewards = []            # Total reward of every finished episode
total_rewards = 0
maximumRewardRecorded = 0
episode = 0
# Transitions of the episode currently being played (cleared after each update)
episode_states, episode_actions, episode_rewards = [],[],[]

saver = tf.train.Saver()

# Make sure the checkpoint directory exists before the first save; a fresh
# checkout of the notebook does not necessarily contain it.
tf.gfile.MakeDirs("./models")

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for episode in range(max_episodes):

        episode_rewards_sum = 0

        # Launch the game
        state = env.reset()

        while True:

            # The policy is stochastic: the network outputs action probabilities
            # and the action is sampled from them.
            action_probability_distribution = sess.run(action_distribution, feed_dict={input_: state.reshape([1, state_size])})

            action = np.random.choice(range(action_probability_distribution.shape[1]), p=action_probability_distribution.ravel())  # select action w.r.t the actions prob

            # Perform a
            new_state, reward, done, info = env.step(action)

            # Store s, a, r
            episode_states.append(state)

            # The actions placeholder expects one row per step with a 1 at the
            # index of the action taken (e.g. [0., 1.]), not the bare index.
            action_ = np.zeros(action_size)
            action_[action] = 1

            episode_actions.append(action_)

            episode_rewards.append(reward)
            if done:
                # Calculate sum reward
                episode_rewards_sum = np.sum(episode_rewards)

                allRewards.append(episode_rewards_sum)

                total_rewards = np.sum(allRewards)

                # Mean reward over all episodes played so far
                mean_reward = np.divide(total_rewards, episode+1)

                maximumRewardRecorded = np.amax(allRewards)

                print("==========================================")
                print("Episode: ", episode)
                print("Reward: ", episode_rewards_sum)
                print("Mean Reward", mean_reward)
                print("Max reward so far: ", maximumRewardRecorded)

                # Calculate discounted reward
                discounted_episode_rewards = discount_and_normalize_rewards(episode_rewards)

                # Feedforward, gradient and backpropagation
                loss_, _ = sess.run([loss, train_opt], feed_dict={input_: np.vstack(np.array(episode_states)),
                                                                 actions: np.vstack(np.array(episode_actions)),
                                                                 discounted_episode_rewards_: discounted_episode_rewards
                                                                })

                # Reset the transition stores
                episode_states, episode_actions, episode_rewards = [],[],[]

                break

            state = new_state

        # Save Model: every 100 episodes, and always after the final episode so
        # the checkpoint restored later holds the fully trained policy rather
        # than the weights from the last multiple of 100.
        if episode % 100 == 0 or episode == max_episodes - 1:
            saver.save(sess, "./models/model.ckpt")
            print("Model saved")

Episode:  0
Reward:  11.0
Mean Reward 11.0
Max reward so far:  11.0
Model saved
Episode:  1
Reward:  11.0
Mean Reward 11.0
Max reward so far:  11.0
Episode:  2
Reward:  17.0
Mean Reward 13.0
Max reward so far:  17.0
Episode:  3
Reward:  20.0
Mean Reward 14.75
Max reward so far:  20.0
Episode:  4
Reward:  16.0
Mean Reward 15.0
Max reward so far:  20.0
Episode:  5
Reward:  11.0
Mean Reward 14.333333333333334
Max reward so far:  20.0
Episode:  6
Reward:  21.0
Mean Reward 15.285714285714286
Max reward so far:  21.0
Episode:  7
Reward:  12.0
Mean Reward 14.875
Max reward so far:  21.0
Episode:  8
Reward:  37.0
Mean Reward 17.333333333333332
Max reward so far:  37.0
Episode:  9
Reward:  18.0
Mean Reward 17.4
Max reward so far:  37.0
Episode:  10
Reward:  20.0
Mean Reward 17.636363636363637
Max reward so far:  37.0
Episode:  11
Reward:  24.0
Mean Reward 18.166666666666668
Max reward so far:  37.0
Episode:  12
Reward:  46.0
Mean Reward 20.307692307692307
Max reward so far:  46.0
Episode:  13
R

Episode:  83
Reward:  36.0
Mean Reward 23.154761904761905
Max reward so far:  74.0
Episode:  84
Reward:  18.0
Mean Reward 23.094117647058823
Max reward so far:  74.0
Episode:  85
Reward:  41.0
Mean Reward 23.302325581395348
Max reward so far:  74.0
Episode:  86
Reward:  37.0
Mean Reward 23.45977011494253
Max reward so far:  74.0
Episode:  87
Reward:  65.0
Mean Reward 23.931818181818183
Max reward so far:  74.0
Episode:  88
Reward:  10.0
Mean Reward 23.775280898876403
Max reward so far:  74.0
Episode:  89
Reward:  22.0
Mean Reward 23.755555555555556
Max reward so far:  74.0
Episode:  90
Reward:  32.0
Mean Reward 23.846153846153847
Max reward so far:  74.0
Episode:  91
Reward:  18.0
Mean Reward 23.782608695652176
Max reward so far:  74.0
Episode:  92
Reward:  27.0
Mean Reward 23.817204301075268
Max reward so far:  74.0
Episode:  93
Reward:  12.0
Mean Reward 23.69148936170213
Max reward so far:  74.0
Episode:  94
Reward:  14.0
Mean Reward 23.589473684210525
Max reward so far:  74.0
Episod

Episode:  150
Reward:  48.0
Mean Reward 24.582781456953644
Max reward so far:  74.0
Episode:  151
Reward:  18.0
Mean Reward 24.539473684210527
Max reward so far:  74.0
Episode:  152
Reward:  43.0
Mean Reward 24.66013071895425
Max reward so far:  74.0
Episode:  153
Reward:  41.0
Mean Reward 24.766233766233768
Max reward so far:  74.0
Episode:  154
Reward:  52.0
Mean Reward 24.941935483870967
Max reward so far:  74.0
Episode:  155
Reward:  22.0
Mean Reward 24.923076923076923
Max reward so far:  74.0
Episode:  156
Reward:  22.0
Mean Reward 24.904458598726116
Max reward so far:  74.0
Episode:  157
Reward:  93.0
Mean Reward 25.335443037974684
Max reward so far:  93.0
Episode:  158
Reward:  42.0
Mean Reward 25.440251572327043
Max reward so far:  93.0
Episode:  159
Reward:  22.0
Mean Reward 25.41875
Max reward so far:  93.0
Episode:  160
Reward:  22.0
Mean Reward 25.39751552795031
Max reward so far:  93.0
Episode:  161
Reward:  22.0
Mean Reward 25.376543209876544
Max reward so far:  93.0
Epis

Episode:  215
Reward:  48.0
Mean Reward 28.09259259259259
Max reward so far:  98.0
Episode:  216
Reward:  72.0
Mean Reward 28.294930875576036
Max reward so far:  98.0
Episode:  217
Reward:  48.0
Mean Reward 28.38532110091743
Max reward so far:  98.0
Episode:  218
Reward:  49.0
Mean Reward 28.47945205479452
Max reward so far:  98.0
Episode:  219
Reward:  39.0
Mean Reward 28.527272727272727
Max reward so far:  98.0
Episode:  220
Reward:  19.0
Mean Reward 28.4841628959276
Max reward so far:  98.0
Episode:  221
Reward:  28.0
Mean Reward 28.48198198198198
Max reward so far:  98.0
Episode:  222
Reward:  48.0
Mean Reward 28.569506726457398
Max reward so far:  98.0
Episode:  223
Reward:  17.0
Mean Reward 28.517857142857142
Max reward so far:  98.0
Episode:  224
Reward:  38.0
Mean Reward 28.56
Max reward so far:  98.0
Episode:  225
Reward:  82.0
Mean Reward 28.79646017699115
Max reward so far:  98.0
Episode:  226
Reward:  72.0
Mean Reward 28.986784140969164
Max reward so far:  98.0
Episode:  22

Episode:  281
Reward:  133.0
Mean Reward 51.45390070921986
Max reward so far:  749.0
Episode:  282
Reward:  431.0
Mean Reward 52.79505300353357
Max reward so far:  749.0
Episode:  283
Reward:  277.0
Mean Reward 53.58450704225352
Max reward so far:  749.0
Episode:  284
Reward:  170.0
Mean Reward 53.99298245614035
Max reward so far:  749.0
Episode:  285
Reward:  99.0
Mean Reward 54.15034965034965
Max reward so far:  749.0
Episode:  286
Reward:  193.0
Mean Reward 54.63414634146341
Max reward so far:  749.0
Episode:  287
Reward:  529.0
Mean Reward 56.28125
Max reward so far:  749.0
Episode:  288
Reward:  153.0
Mean Reward 56.6159169550173
Max reward so far:  749.0
Episode:  289
Reward:  61.0
Mean Reward 56.63103448275862
Max reward so far:  749.0
Episode:  290
Reward:  161.0
Mean Reward 56.98969072164948
Max reward so far:  749.0
Episode:  291
Reward:  13.0
Mean Reward 56.83904109589041
Max reward so far:  749.0
Episode:  292
Reward:  285.0
Mean Reward 57.617747440273035
Max reward so far:

In [7]:
# Evaluate the saved policy: restore the checkpoint, play a fixed number of
# episodes by sampling from the policy, and report the average score.
n_eval_episodes = 10

with tf.Session() as sess:
    rewards = []  # Total reward of each evaluation episode

    # Load the model
    saver.restore(sess, "./models/model.ckpt")

    for episode in range(n_eval_episodes):
        state = env.reset()
        done = False
        total_rewards = 0
        print("****************************************************")
        print("EPISODE ", episode)

        while not done:
            # The policy is stochastic: sample the action from the probabilities
            # the network outputs for the current state.
            action_probability_distribution = sess.run(action_distribution, feed_dict={input_: state.reshape([1, state_size])})
            action = np.random.choice(range(action_probability_distribution.shape[1]), p=action_probability_distribution.ravel())  # select action w.r.t the actions prob

            state, reward, done, info = env.step(action)

            total_rewards += reward

        rewards.append(total_rewards)
        print ("Score", total_rewards)
    env.close()
    # Average over the number of episodes played, tied to the loop bound
    # instead of a separate literal.
    print ("Score over time: " +  str(sum(rewards)/n_eval_episodes))

INFO:tensorflow:Restoring parameters from ./models/model.ckpt
****************************************************
EPISODE  0
Score 77.0
****************************************************
EPISODE  1
Score 101.0
****************************************************
EPISODE  2
Score 92.0
****************************************************
EPISODE  3
Score 43.0
****************************************************
EPISODE  4
Score 73.0
****************************************************
EPISODE  5
Score 44.0
****************************************************
EPISODE  6
Score 63.0
****************************************************
EPISODE  7
Score 47.0
****************************************************
EPISODE  8
Score 16.0
****************************************************
EPISODE  9
Score 32.0
Score over time: 58.8
