# A2C - CartPole
Implementation of the A2C RL Algorithm for the OpenAI's Gym environment CartPole-V1 (not in parallel).

In [1]:
%load_ext tensorboard
# Import libraries
import numpy as np
import matplotlib.pyplot as pd
from datetime import datetime

import tensorflow as tf

import gym

In [2]:
# Action Value Fuction Estimator (q-network)
class Critic(tf.keras.layers.Layer):
    
    def __init__(self):
        super(Critic, self).__init__()
        self.fc1 = tf.keras.layers.Dense(units=16, input_shape=[4,], activation='relu')
        self.fc2 = tf.keras.layers.Dense(units=8, activation='relu')
        self.out = tf.keras.layers.Dense(units=1, activation='sigmoid')
    
    def call(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.out(x)
        return x

In [3]:
# Action Value Fuction Estimator (q-network)
class Actor(tf.keras.layers.Layer):
    
    def __init__(self):
        super(Actor, self).__init__()
        self.fc1 = tf.keras.layers.Dense(units=16, input_shape=[4,], activation='relu')
        self.fc2 = tf.keras.layers.Dense(units=8, activation='relu')
        self.out = tf.keras.layers.Dense(units=1, activation='sigmoid')
    
    def call(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.out(x)
        return x

In [8]:
# Prepare Tensorboard
!rm -rf ./logs/
current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
train_summary_writer = tf.summary.create_file_writer(train_log_dir)
%tensorboard --logdir logs/
#tf.keras.backend.clear_session()
# Initialize cart pole environment
env = gym.make('CartPole-v1')
# Initialize model, loss and optimizer
actor = Actor()
critic = Critic()
actor_optimizer = tf.keras.optimizers.Adam()
critic_optimizer = tf.keras.optimizers.Adam()
mse = tf.keras.losses.MSE
# Initialize replay memory
observations = []
# Set hyperparameters
discount = 0.9
max_time_steps = 500
num_episodes = 40

step = 0
# Run for agent and environment for num_episodes
for i_episode in range(num_episodes):
    state = env.reset()
    observations.append(state)
    
    # Agent has 500 trials at max, if it does not fail beforehand
    for t in range(max_time_steps):
        env.render()
        # Compute action
        state = np.reshape(state, [1,4])
        action = int(actor(state))
        # Execute action and store action, state and reward
        next_state, reward, done, info = env.step(action)
        observations.append((state, action, reward))
        state = next_state
        
        # Interrupt the trial if the agent fails
        if done:
            break
        step += 1
        
    print(f"Episode {i_episode + 1} of {num_episodes} findished after {t+1} timesteps")
        
    # Store losses temporary
    losses = []

    # Initialize variable for the estimated return
    estimated_reward = 0 if done else critic(next_state)
    
    # Iterate over taken actions and observed states and rewards
    observations.reverse()
    for state, action, reward in observations:
        
        # Compute estimated return
        estimated_return = discount * estimated_reward + reward
        # Compute state value
        state_v = critic(state)

        # Compute gradients for the actor (policy gradient)
        # Maximize the estimated return
        policy_gradient = 0
        with tf.GradientTape() as actor_tape:
            # Compute natural logarithm of the action determined by the actor/policy
            log_action = np.log(action)
            # Compute gradient with respect to the parameters of the actor
            gradient = actor_tape.gradient(log_action, actor.trainable_variables)
            # Accumulate gradients
            policy_gradient = policy_gradient + gradient * (estimated_return - state_v)
        
        # Compute gradients for the critic
        # minimize MSE for the state value function
        critic_gradient = 0
        with tf.GradientTape() as critic_tape:
            # Compute the loss
            loss = mse(estimated_reward, state_v)
            # Compute the gradient
            gradient = critic_tape.gradient(loss, critic.trainable_variables)
            # Accumulate gradients
            critic_gradient += gradient
            
        # Apply gradients.
        actor_optimizer.apply_gradients(policy_gradient, actor.trainable_variables)
        critic_optimizer.apply_gradients(critic_gradient, critic.trainable_variables)
        losses.append(loss)

    observations = []

    # Store summary statistics
    with train_summary_writer.as_default():
        tf.summary.scalar('td_error', tf.reduce_mean(losses), step=step)

env.close()

Reusing TensorBoard on port 6006 (pid 6328), started 0:59:35 ago. (Use '!kill 6328' to kill it.)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Episode 1 of 40 findished after 10 timesteps


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.





AttributeError: 'numpy.dtype' object has no attribute 'is_floating'

In [None]:
# Set up the environment
env = gym.make("CartPole-V1")