In [1]:
# install openai gym
%pip install gym
# enable autoreload
%load_ext autoreload
%autoreload 2

Note: you may need to restart the kernel to use updated packages.


In [2]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Configuration parameters for the whole setup
seed = 42
gamma = 0.99  # Discount factor for past rewards
max_steps_per_episode = 10000
env = gym.make("CartPole-v0")  # Create the environment
env.seed(seed)
eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0

In [3]:
num_inputs = 4
num_actions = 2
num_hidden = 128

inputs = layers.Input(shape=(num_inputs,))
common = layers.Dense(num_hidden, activation="relu")(inputs)
action = layers.Dense(num_actions, activation="softmax")(common)
critic = layers.Dense(1)(common)

model = keras.Model(inputs=inputs, outputs=[action, critic])

In [4]:
optimizer = keras.optimizers.Adam()
huber_loss = keras.losses.Huber()
action_probs_history = []
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0

while True:  # Run until solved
    state = env.reset()
    episode_reward = 0
    with tf.GradientTape() as tape:
        for timestep in range(1, max_steps_per_episode):
            # env.render(); Adding this line would show the attempts
            # of the agent in a pop up window.

            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)

            # Predict action probabilities and estimated future rewards
            # from environment state
            action_probs, critic_value = model(state)
            critic_value_history.append(critic_value[0, 0])

            # Sample action from action probability distribution
            action = np.random.choice(num_actions, p=np.squeeze(action_probs))
            action_probs_history.append(tf.math.log(action_probs[0, action]))

            # Apply the sampled action in our environment
            state, reward, done, _ = env.step(action)
            rewards_history.append(reward)
            episode_reward += reward

            if done:
                break

        # Update running reward to check condition for solving
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

        # Calculate expected value from rewards
        # - At each timestep what was the total reward received after that timestep
        # - Rewards in the past are discounted by multiplying them with gamma
        # - These are the labels for our critic
        returns = []
        discounted_sum = 0
        for r in rewards_history[::-1]:
            discounted_sum = r + gamma * discounted_sum
            returns.insert(0, discounted_sum)

        # Normalize
        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        returns = returns.tolist()

        # Calculating loss values to update our network
        history = zip(action_probs_history, critic_value_history, returns)
        actor_losses = []
        critic_losses = []
        for log_prob, value, ret in history:
            # At this point in history, the critic estimated that we would get a
            # total reward = `value` in the future. We took an action with log probability
            # of `log_prob` and ended up recieving a total reward = `ret`.
            # The actor must be updated so that it predicts an action that leads to
            # high rewards (compared to critic's estimate) with high probability.
            diff = ret - value
            actor_losses.append(-log_prob * diff)  # actor loss

            # The critic must be updated so that it predicts a better estimate of
            # the future rewards.
            critic_losses.append(
                huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
            )

        # Backpropagation
        loss_value = sum(actor_losses) + sum(critic_losses)
        print(loss_value)
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Clear the loss and reward history
        action_probs_history.clear()
        critic_value_history.clear()
        rewards_history.clear()

    # Log details
    episode_count += 1
    if episode_count % 10 == 0:
        template = "running reward: {:.2f} at episode {}"
        print(template.format(running_reward, episode_count))

    if running_reward > 195:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break

tf.Tensor(10.582096, shape=(), dtype=float32)
tf.Tensor(4.657996, shape=(), dtype=float32)
tf.Tensor(8.444636, shape=(), dtype=float32)
tf.Tensor(7.2380557, shape=(), dtype=float32)
tf.Tensor(4.132463, shape=(), dtype=float32)
tf.Tensor(5.1792393, shape=(), dtype=float32)
tf.Tensor(11.741002, shape=(), dtype=float32)
tf.Tensor(4.45949, shape=(), dtype=float32)
tf.Tensor(16.44645, shape=(), dtype=float32)
tf.Tensor(18.660873, shape=(), dtype=float32)
running reward: 9.08 at episode 10
tf.Tensor(12.647433, shape=(), dtype=float32)
tf.Tensor(20.190159, shape=(), dtype=float32)
tf.Tensor(5.3502054, shape=(), dtype=float32)
tf.Tensor(19.79785, shape=(), dtype=float32)
tf.Tensor(7.035434, shape=(), dtype=float32)
tf.Tensor(7.3072653, shape=(), dtype=float32)
tf.Tensor(9.5591345, shape=(), dtype=float32)
tf.Tensor(6.658245, shape=(), dtype=float32)
tf.Tensor(7.114932, shape=(), dtype=float32)
tf.Tensor(5.28491, shape=(), dtype=float32)
running reward: 14.38 at episode 20
tf.Tensor(7.332076, s

tf.Tensor(-2.7820435, shape=(), dtype=float32)
tf.Tensor(-1.4501743, shape=(), dtype=float32)
tf.Tensor(5.5368977, shape=(), dtype=float32)
running reward: 37.00 at episode 170
tf.Tensor(6.849308, shape=(), dtype=float32)
tf.Tensor(2.7149982, shape=(), dtype=float32)
tf.Tensor(-0.3033619, shape=(), dtype=float32)
tf.Tensor(7.529022, shape=(), dtype=float32)
tf.Tensor(2.4393873, shape=(), dtype=float32)
tf.Tensor(1.5988693, shape=(), dtype=float32)
tf.Tensor(2.5081768, shape=(), dtype=float32)
tf.Tensor(3.9832, shape=(), dtype=float32)
tf.Tensor(0.9945164, shape=(), dtype=float32)
tf.Tensor(1.4152622, shape=(), dtype=float32)
running reward: 39.65 at episode 180
tf.Tensor(-0.2270298, shape=(), dtype=float32)
tf.Tensor(1.609848, shape=(), dtype=float32)
tf.Tensor(4.7584534, shape=(), dtype=float32)
tf.Tensor(1.6327438, shape=(), dtype=float32)
tf.Tensor(3.1197433, shape=(), dtype=float32)
tf.Tensor(0.28096557, shape=(), dtype=float32)
tf.Tensor(3.7049942, shape=(), dtype=float32)
tf.Tens

tf.Tensor(1.6253891, shape=(), dtype=float32)
tf.Tensor(1.9309654, shape=(), dtype=float32)
tf.Tensor(-2.760231, shape=(), dtype=float32)
tf.Tensor(18.643661, shape=(), dtype=float32)
tf.Tensor(-20.372704, shape=(), dtype=float32)
tf.Tensor(-18.124634, shape=(), dtype=float32)
tf.Tensor(0.5248985, shape=(), dtype=float32)
tf.Tensor(1.5471802, shape=(), dtype=float32)
tf.Tensor(-17.975594, shape=(), dtype=float32)
running reward: 116.50 at episode 340
tf.Tensor(-17.640417, shape=(), dtype=float32)
tf.Tensor(5.923744, shape=(), dtype=float32)
tf.Tensor(-3.2099915, shape=(), dtype=float32)
tf.Tensor(-14.709808, shape=(), dtype=float32)
tf.Tensor(-22.415361, shape=(), dtype=float32)
tf.Tensor(19.111374, shape=(), dtype=float32)
tf.Tensor(-12.689587, shape=(), dtype=float32)
tf.Tensor(3.3735428, shape=(), dtype=float32)
tf.Tensor(-11.596141, shape=(), dtype=float32)
tf.Tensor(-19.968792, shape=(), dtype=float32)
running reward: 118.17 at episode 350
tf.Tensor(-27.965637, shape=(), dtype=flo

tf.Tensor(-21.482794, shape=(), dtype=float32)
tf.Tensor(-28.118444, shape=(), dtype=float32)
tf.Tensor(22.592575, shape=(), dtype=float32)
tf.Tensor(-0.31706238, shape=(), dtype=float32)
tf.Tensor(-20.34801, shape=(), dtype=float32)
running reward: 173.62 at episode 500
tf.Tensor(2.6043777, shape=(), dtype=float32)
tf.Tensor(0.46365356, shape=(), dtype=float32)
tf.Tensor(-12.56588, shape=(), dtype=float32)
tf.Tensor(-20.621645, shape=(), dtype=float32)
tf.Tensor(-0.3894577, shape=(), dtype=float32)
tf.Tensor(-12.509144, shape=(), dtype=float32)
tf.Tensor(6.7912292, shape=(), dtype=float32)
tf.Tensor(-15.135323, shape=(), dtype=float32)
tf.Tensor(-3.7482529, shape=(), dtype=float32)
tf.Tensor(11.43573, shape=(), dtype=float32)
running reward: 171.14 at episode 510
tf.Tensor(-15.929939, shape=(), dtype=float32)
tf.Tensor(-30.343967, shape=(), dtype=float32)
tf.Tensor(-21.172161, shape=(), dtype=float32)
tf.Tensor(0.15975189, shape=(), dtype=float32)
tf.Tensor(1.8125153, shape=(), dtype=

tf.Tensor(-35.077255, shape=(), dtype=float32)
running reward: 174.15 at episode 660
tf.Tensor(-23.628637, shape=(), dtype=float32)
tf.Tensor(-26.320986, shape=(), dtype=float32)
tf.Tensor(-24.058975, shape=(), dtype=float32)
tf.Tensor(-26.53183, shape=(), dtype=float32)
tf.Tensor(-18.02774, shape=(), dtype=float32)
tf.Tensor(6.8862, shape=(), dtype=float32)
tf.Tensor(12.931305, shape=(), dtype=float32)
tf.Tensor(9.937027, shape=(), dtype=float32)
tf.Tensor(-19.435766, shape=(), dtype=float32)
tf.Tensor(22.207382, shape=(), dtype=float32)
running reward: 179.41 at episode 670
tf.Tensor(-14.761055, shape=(), dtype=float32)
tf.Tensor(-22.45958, shape=(), dtype=float32)
tf.Tensor(-32.197136, shape=(), dtype=float32)
tf.Tensor(-24.822699, shape=(), dtype=float32)
tf.Tensor(17.420242, shape=(), dtype=float32)
tf.Tensor(15.540222, shape=(), dtype=float32)
tf.Tensor(27.049942, shape=(), dtype=float32)
tf.Tensor(6.5434113, shape=(), dtype=float32)
tf.Tensor(13.8943405, shape=(), dtype=float32)

tf.Tensor(-22.51684, shape=(), dtype=float32)
tf.Tensor(-29.420784, shape=(), dtype=float32)
tf.Tensor(-19.261974, shape=(), dtype=float32)
tf.Tensor(-25.912586, shape=(), dtype=float32)
tf.Tensor(-18.044422, shape=(), dtype=float32)
tf.Tensor(15.065689, shape=(), dtype=float32)
tf.Tensor(19.209808, shape=(), dtype=float32)
tf.Tensor(-26.37791, shape=(), dtype=float32)
running reward: 169.81 at episode 830
tf.Tensor(-25.982515, shape=(), dtype=float32)
tf.Tensor(-31.239155, shape=(), dtype=float32)
tf.Tensor(0.27638245, shape=(), dtype=float32)
tf.Tensor(-19.054611, shape=(), dtype=float32)
tf.Tensor(-26.288044, shape=(), dtype=float32)
tf.Tensor(-16.927727, shape=(), dtype=float32)
tf.Tensor(-20.452068, shape=(), dtype=float32)
tf.Tensor(-23.992794, shape=(), dtype=float32)
tf.Tensor(-27.6884, shape=(), dtype=float32)
tf.Tensor(-16.47567, shape=(), dtype=float32)
running reward: 172.78 at episode 840
tf.Tensor(-17.043114, shape=(), dtype=float32)
tf.Tensor(-35.630795, shape=(), dtype=

In [5]:
grads

[<tf.Tensor: shape=(4, 128), dtype=float32, numpy=
 array([[ 4.15279007e+00,  4.74436951e+00,  2.64721560e+00,
          1.58712435e+00, -1.99492633e+00,  9.20157671e-01,
          2.60678434e+00,  4.27246189e+00, -2.62073636e+00,
          2.95595193e+00,  1.21674621e+00, -2.28490043e+00,
          1.98226023e+00,  2.21497372e-01,  2.52447009e-01,
          2.55069923e+00, -5.19203603e-01, -7.64860570e-01,
          1.52627692e-01,  2.29184413e+00, -6.43552899e-01,
         -1.60653305e+00,  2.58055806e-01, -2.43941092e+00,
          1.21243799e+00,  4.17794752e+00, -2.58732843e+00,
         -1.19673109e+00, -4.52645350e+00,  6.78121328e-01,
          1.86519456e+00,  2.51261282e+00,  8.45785677e-01,
          3.81118447e-01, -2.05498958e+00,  2.52297831e+00,
         -3.21869302e+00,  3.61408889e-01,  3.10604787e+00,
         -2.72877264e+00,  5.09427214e+00,  1.40827775e+00,
          3.51196432e+00,  3.29740691e+00, -5.56953096e+00,
          2.20332399e-01, -2.37178588e+00, -4.281

In [6]:
sum(critic_losses)

<tf.Tensor: shape=(), dtype=float32, numpy=85.4467>