<a href="https://colab.research.google.com/github/linjunzh/reinforcement_learning/blob/master/REINFORCE_with_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This code is more like the REINFORCE with baseline, which depends on episode samples and the computed returns for each episode.

In the following code, critic is indeed the baseline that approximates the state value V(s)

In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt

from keras.models import Model
from keras.layers import Dense, Input
from keras.optimizers import Adam
from keras.utils.vis_utils import plot_model
from keras.losses import Huber

import tensorflow as tf


In [2]:
''' Environment '''

seed = 42
gamma = 0.99    # discount factor
max_steps_per_episode = 10000

env = gym.make('CartPole-v1')
env.seed(seed)
eps = np.finfo(np.float32).eps.item()   # smallest number s.t. 1.0+eps != 1.0

nb_states = env.observation_space.shape[0]
nb_actions = env.action_space.n

print('nb_states=%d, nb_actions=%d' % (nb_states, nb_actions))

nb_states=4, nb_actions=2


In [3]:
''' Actor-critic network '''

num_hidden = 128

input = Input(shape=(nb_states,))
x = Dense(num_hidden, activation='relu')(input)
action = Dense(nb_actions, activation='softmax')(x)
critic = Dense(1)(x)

model = Model(inputs=input, outputs=[action, critic])

In [4]:
''' Train '''

optimizer = Adam(learning_rate=0.01)
huber_loss = Huber()

action_probs_history = []
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0

while True:   # run until solved
  state = env.reset()
  episode_reward = 0
  with tf.GradientTape() as tape:
    for timestep in range(1, max_steps_per_episode):
      state = tf.convert_to_tensor(state)
      state = tf.expand_dims(state, 0)

      # predict action probabilities and estimate future rewards from environment state
      action_probs, critic_value = model(state)
      critic_value_history.append(critic_value[0, 0])

      # sample action
      action = np.random.choice(nb_actions, p=np.squeeze(action_probs))
      action_probs_history.append(tf.math.log(action_probs[0, action]))

      # apply the sampled action
      state, reward, done, _ = env.step(action)
      rewards_history.append(reward)
      episode_reward += reward

      if done:
        break

    # update running reward to check condition for solving
    running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

    # calculate expected value from rewards
    returns = []
    discounted_sum = 0
    for r in rewards_history[::-1]:
      discounted_sum = r + gamma * discounted_sum
      returns.insert(0, discounted_sum)

    # normalize returns
    returns = np.array(returns)
    returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
    returns = returns.tolist()

    # calculate loss values to update network
    history = zip(action_probs_history, critic_value_history, returns)
    actor_losses, critic_losses = [], []
    for log_prob, value, ret in history:
      diff = ret - value
      actor_losses.append(-log_prob * diff)   # actor loss

      # the critic must be updated so that it predicts a better estimate of
      # the future rewards
      critic_losses.append(huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0)))

    # backpropagation
    loss_value = sum(actor_losses) + sum(critic_losses)
    grads = tape.gradient(loss_value, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # clear the loss and reward history
    action_probs_history.clear()
    critic_value_history.clear()
    rewards_history.clear()

  # Log details
  episode_count += 1
  if episode_count % 10 == 0:
    template = 'running reward: {:.2f} at episode {}'
    print(template.format(running_reward, episode_count))

  if running_reward > 495:  # condition to consider the task solved
    print('Solved at episode {}!'.format(episode_count))
    break

running reward: 12.88 at episode 10
running reward: 19.26 at episode 20
running reward: 21.33 at episode 30
running reward: 22.27 at episode 40
running reward: 37.67 at episode 50
running reward: 52.37 at episode 60
running reward: 43.74 at episode 70
running reward: 43.73 at episode 80
running reward: 72.29 at episode 90
running reward: 82.41 at episode 100
running reward: 169.29 at episode 110
running reward: 248.35 at episode 120
running reward: 271.23 at episode 130
running reward: 274.06 at episode 140
running reward: 280.45 at episode 150
running reward: 281.87 at episode 160
running reward: 240.50 at episode 170
running reward: 226.25 at episode 180
running reward: 276.56 at episode 190
running reward: 284.49 at episode 200
running reward: 266.11 at episode 210
running reward: 275.18 at episode 220
running reward: 345.58 at episode 230
running reward: 351.65 at episode 240
running reward: 319.74 at episode 250
running reward: 274.91 at episode 260
running reward: 296.38 at episo

In [None]:
max_steps_per_episode

10000.0