In [0]:
## Deep Reinforcement Learning:
## Using Monte Carlo Policy Gradient:
## Playing the CartPole game (an OpenAI Gym classic-control task, not an Atari title):

In [0]:
## importing the libraries:
import tensorflow as tf
import numpy as np
import gym

In [13]:
## Create our Environment:
env = gym.make('CartPole-v1')
# Work with the underlying environment object rather than the wrappers
# returned by gym.make.
env = env.unwrapped

# Policy Gradient has high variance, so seed every source of randomness for
# reproducibility: NumPy (used to sample actions), TensorFlow (random weight
# initialisation; the graph-level seed must be set before the ops are created)
# and the environment itself.
SEED = 1
np.random.seed(SEED)
tf.set_random_seed(SEED)

# Kept as the last expression so the cell still displays the seed list.
env.seed(SEED)

[1]

In [0]:
## Setup Hyper-Parameters:

# Environment Hyper-Parameters. Both sizes are read from the environment so
# the network always matches its observation and action spaces
# (4 state values and 2 actions for CartPole).
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Training Hyper-Parameters:
max_episodes = 300    # number of episodes played during training
learning_rate = 0.01  # step size handed to the optimizer
gamma = 0.95          # Discount rate

In [15]:
# Sanity check: show the number of discrete actions reported by the environment.
print(action_size)

2


In [0]:
## Define Pre-Processing Function:

# This function takes the rewards of one episode, discounts them and
# normalizes the result.

def discount_normalize_rewards(episode_rewards, discount_rate=None):
  """Return the normalized discounted returns of one episode.

  For every step i the discounted return is
  r[i] + d * r[i+1] + d**2 * r[i+2] + ... ; the returns are then shifted to
  zero mean and scaled to unit standard deviation.

  Args:
    episode_rewards: sequence of per-step rewards (ints or floats).
    discount_rate: discount factor d; defaults to the module-level `gamma`.

  Returns:
    A float64 NumPy array with one entry per reward. An empty input gives an
    empty array. When all returns are equal (e.g. a single-step episode) the
    result is all zeros instead of NaN.
  """
  if discount_rate is None:
    discount_rate = gamma

  # Force a float array: np.zeros_like on integer rewards would allocate an
  # integer buffer and silently truncate the discounted values.
  rewards = np.asarray(episode_rewards, dtype=np.float64)
  discounted_episode_rewards = np.zeros_like(rewards)
  if rewards.size == 0:
    return discounted_episode_rewards

  # Walk backwards so each return can reuse the one computed for the next step.
  cumulative = 0.0
  for i in reversed(range(len(rewards))):
    cumulative = cumulative * discount_rate + rewards[i]
    discounted_episode_rewards[i] = cumulative

  discounted_episode_rewards -= np.mean(discounted_episode_rewards)
  std = np.std(discounted_episode_rewards)
  # Skip the scaling when there is no spread, to avoid dividing by zero.
  if std > 0:
    discounted_episode_rewards /= std

  return discounted_episode_rewards

In [0]:
## Build the policy-gradient neural network:

# Input  : a batch of state vectors with `state_size` entries each.
# Network: three fully connected layers (fc1 -> fc2 -> fc3).
# Output : softmax over the fc3 logits, i.e. one probability per action.

def _dense(layer_input, units, activation):
  """Return a fully connected layer over `layer_input` with Xavier-initialised weights."""
  return tf.contrib.layers.fully_connected(
      inputs=layer_input,
      num_outputs=units,
      activation_fn=activation,
      weights_initializer=tf.contrib.layers.xavier_initializer())


with tf.name_scope("inputs"):
  # Values fed at run time: states, one-hot actions and the discounted returns.
  inputs_ = tf.placeholder(tf.float32, shape=[None, state_size], name='inputs_')
  actions = tf.placeholder(tf.float32, shape=[None, action_size], name='actions')
  discounted_episode_rewards = tf.placeholder(
      tf.float32, shape=[None], name='discounted_episode_rewards')

  # Fed only so that the running mean reward can be logged to TensorBoard:
  mean_reward = tf.placeholder(tf.float32, name='mean_reward')

  with tf.name_scope('fc1'):
    fc1 = _dense(inputs_, 10, tf.nn.relu)

  with tf.name_scope('fc2'):
    fc2 = _dense(fc1, action_size, tf.nn.relu)

  with tf.name_scope('fc3'):
    # No activation here: fc3 holds the raw logits.
    fc3 = _dense(fc2, action_size, None)

  with tf.name_scope('softmax'):
    action_distribution = tf.nn.softmax(fc3)

  with tf.name_scope('loss'):
    # Cross entropy against the one-hot action taken, weighted by the
    # discounted return fed for that step and averaged over the batch.
    neg_loss_prob = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=fc3, labels=actions)
    loss = tf.reduce_mean(neg_loss_prob * discounted_episode_rewards)

  with tf.name_scope('train'):
    train_opt = tf.train.AdamOptimizer(learning_rate).minimize(loss)

In [18]:
## Setup tensorboard:

# Writer that stores the event files read by TensorBoard:
writer = tf.summary.FileWriter("/tensorboard/pg/1")

# Loss:
tf.summary.scalar("Loss", loss)

# Reward Mean. Summary names may not contain spaces (TensorFlow renames
# "Reward Mean" to "Reward_Mean" and logs an INFO message), so the legal
# name is used directly; the resulting tag is the same.
tf.summary.scalar("Reward_Mean", mean_reward)

# Single op that evaluates every summary registered above. Bind it to a plain
# variable so it can be passed to sess.run.
write_op = tf.summary.merge_all()

# Earlier versions stored the op only as an attribute on the writer; keep that
# attribute so code reading `writer.op` continues to work.
writer.op = write_op

INFO:tensorflow:Summary name Reward Mean is illegal; using Reward_Mean instead.


In [0]:
## Train our Agent:

# One training step is performed per episode (Monte Carlo policy gradient):
# play an episode with the current policy, compute the normalized discounted
# returns, then run one optimizer step on the whole episode.

allRewards = []            # total reward of every finished episode
total_rewards = 0          # sum of allRewards
maximumRewardRecorded = 0  # best episode reward seen so far
episode = 0
episode_states, episode_actions, episode_rewards = [], [], []

saver = tf.train.Saver()

# Merge the registered summaries here so this cell does not rely on a variable
# created as a side effect elsewhere.
summary_op = tf.summary.merge_all()

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())

  for episode in range(max_episodes):

    episode_rewards_sum = 0

    # launch the game:
    state = env.reset()

    env.render()

    while True:
      # Choose action a. The policy is stochastic: the network outputs a
      # probability for every action. reshape (not resize, which works in
      # place and returns None) turns the state into a batch of one.
      action_probability_distribution = sess.run(
          action_distribution,
          feed_dict={inputs_: state.reshape([1, state_size])})

      # select action w.r.t. the action probabilities:
      action = np.random.choice(
          range(action_probability_distribution.shape[1]),
          p=action_probability_distribution.ravel())

      # perform action:
      new_state, reward, done, info = env.step(action)

      # store state, action, reward:
      episode_states.append(state)

      # The network is trained on one-hot actions, e.g. [0., 1.] for action 1,
      # not on the bare action index:
      action_ = np.zeros(action_size)
      action_[action] = 1

      episode_actions.append(action_)
      episode_rewards.append(reward)

      if done:
        # calculate the sum reward:
        episode_rewards_sum = np.sum(episode_rewards)

        allRewards.append(episode_rewards_sum)

        total_rewards = np.sum(allRewards)

        # Mean reward. Stored under its own name so the `mean_reward`
        # placeholder is not overwritten.
        mean_reward_value = np.divide(total_rewards, episode + 1)

        maximumRewardRecorded = np.amax(allRewards)

        print("=============================")
        print("Episode: ", episode)
        print("Episode Rewards: ", episode_rewards_sum)
        print("Mean Reward: ", mean_reward_value)
        print("Max Reward: ", maximumRewardRecorded)

        # Calculate discounted reward. Again kept under a separate name so the
        # `discounted_episode_rewards` placeholder stays usable as a feed key.
        discounted_rewards = discount_normalize_rewards(episode_rewards)

        # feed-forward, gradient and back-propagation:
        train_feed = {inputs_: np.vstack(episode_states),
                      actions: np.vstack(episode_actions),
                      discounted_episode_rewards: discounted_rewards}
        loss_, _ = sess.run([loss, train_opt], feed_dict=train_feed)

        # write TF summaries (same inputs plus the running mean reward):
        summary_feed = dict(train_feed)
        summary_feed[mean_reward] = mean_reward_value
        summary = sess.run(summary_op, feed_dict=summary_feed)

        writer.add_summary(summary, episode)
        writer.flush()

        # reset transition stores:
        episode_states, episode_actions, episode_rewards = [], [], []

        # The episode is over: leave the step loop and start the next one.
        break

      state = new_state

    # Save model every 100 episodes, and after the last episode so the
    # checkpoint holds the final weights:
    if episode % 100 == 0 or episode == max_episodes - 1:
      saver.save(sess, "C:/timru/model.ckpt")
      print("Model saved")

In [0]:
## Watch the trained agent play:

# Number of evaluation episodes; also used to average the scores below.
n_test_episodes = 10

with tf.Session() as sess:
  rewards = []  # total reward of every evaluation episode

  # Load the Model:
  saver.restore(sess, "C:/timru/model.ckpt")

  for episode in range(n_test_episodes):
    state = env.reset()
    total_rewards = 0
    print("***************************")
    print("Episode : ", episode)

    while True:
      # Choose an action a. `inputs_` is the state placeholder of the network;
      # the state is reshaped into a batch of one.
      action_probability_distribution = sess.run(
          action_distribution,
          feed_dict={inputs_: state.reshape([1, state_size])})

      print(action_probability_distribution)

      # Sample the action from the predicted distribution:
      action = np.random.choice(
          range(action_probability_distribution.shape[1]),
          p=action_probability_distribution.ravel())

      new_state, reward, done, info = env.step(action)

      total_rewards += reward

      if done:
        rewards.append(total_rewards)
        print("Score", total_rewards)

        break

      state = new_state

  env.close()

  # Average over the episodes actually played rather than a hard-coded 10.
  print("Score over time :", str(sum(rewards)/len(rewards)))