# A2C - CartPole
Implementation of the A2C (Advantage Actor-Critic) reinforcement-learning algorithm for OpenAI Gym's CartPole-v1 environment, using a single environment instance (no parallel workers).

In [1]:
%load_ext tensorboard
# Import libraries
import numpy as np
import matplotlib.pyplot as pd
from datetime import datetime

import tensorflow as tf

import gym

In [2]:
# State-value function estimator V(s) (the critic / baseline of the actor-critic pair)
class Critic(tf.keras.layers.Layer):
    """Small fully connected network mapping a state to one scalar value estimate.

    Architecture: Dense(16, relu) -> Dense(8, relu) -> Dense(1, linear).

    The output layer is linear on purpose: the critic is regressed onto
    discounted returns, which are not confined to (0, 1). A sigmoid head
    would cap the prediction at 1 and make those targets unreachable.
    """

    def __init__(self):
        super(Critic, self).__init__()
        # `input_shape=[4,]` records the expected feature size of one state.
        self.fc1 = tf.keras.layers.Dense(units=16, input_shape=[4,], activation='relu')
        self.fc2 = tf.keras.layers.Dense(units=8, activation='relu')
        # No activation -> unbounded real-valued value estimate.
        self.out = tf.keras.layers.Dense(units=1, activation=None)

    def call(self, x):
        """Return the value estimate for a batch of states.

        Args:
            x: batch of states; the training loop passes arrays reshaped to
                [1, 4]. Dense layers need at least rank-2 input.

        Returns:
            Tensor whose last dimension is 1 (one value per state).
        """
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.out(x)
        return x

In [3]:
# Policy network (the actor): maps a state to a probability for each of the two actions
class Actor(tf.keras.layers.Layer):
    """Small fully connected policy network.

    Architecture: Dense(16, relu) -> Dense(8, relu) -> Dense(2, softmax),
    so every output row is a probability distribution over two actions.
    """

    def __init__(self):
        super().__init__()
        # `input_shape=[4,]` records the expected feature size of one state.
        self.fc1 = tf.keras.layers.Dense(units=16, input_shape=[4,], activation='relu')
        self.fc2 = tf.keras.layers.Dense(units=8, activation='relu')
        self.out = tf.keras.layers.Dense(units=2, activation='softmax')

    def call(self, x):
        """Return action probabilities for a batch of states.

        Args:
            x: batch of states (array-like or tensor); it is converted to a
                tensor before being fed through the layers.

        Returns:
            Tensor whose last dimension is 2, holding softmax probabilities.
        """
        hidden = self.fc1(tf.convert_to_tensor(x))
        hidden = self.fc2(hidden)
        return self.out(hidden)

In [None]:
# Prepare Tensorboard
!rm -rf ./logs/
current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
train_summary_writer = tf.summary.create_file_writer(train_log_dir)
%tensorboard --logdir logs/
#tf.keras.backend.clear_session()
# Initialize cart pole environment
env = gym.make('CartPole-v1')
# Initialize model, loss and optimizer
actor = Actor()
critic = Critic()
actor_optimizer = tf.keras.optimizers.Adam()
critic_optimizer = tf.keras.optimizers.Adam()
mse = tf.keras.losses.MSE
cce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Initialize replay memory
observations = []
# Set hyperparameters
discount = 0.9
max_time_steps = 500
num_episodes = 40

step = 0
# Run for agent and environment for num_episodes
for i_episode in range(num_episodes):
    state = env.reset()
    observations.append(state)
    #breakpoint()
    
    # Agent has 500 trials at max, if it does not fail beforehand
    for t in range(max_time_steps):
        env.render()
        # Compute action
        state = np.reshape(state, [1,4])
        probabilities = actor(state)
        action = tf.random.categorical(tf.math.log(probabilities), 1)
        #breakpoint()
        # Execute action and store action, state and reward
        next_state, reward, done, info = env.step(int(action))
        observations.append((state, action, reward))
        state = next_state
        
        # Interrupt the trial if the agent fails
        if done:
            break
        step += 1
        
    print(f"Episode {i_episode + 1} of {num_episodes} findished after {t+1} timesteps")
        
    # Store losses temporary
    losses = []

    # Initialize variable for the estimated return
    estimated_reward = 0 if done else critic(next_state)
    
    # Iterate over taken actions and observed states and rewards
    observations.reverse()
    for state, action, reward in observations:
        
        state = np.reshape(state, [1,4])
        # Compute estimated return
        estimated_return = discount * estimated_reward + reward
        # Compute state value
        state_v = critic(state)
        
        # Compute gradients for the actor (policy gradient)
        # Maximize the estimated return
        policy_gradients = 0
        with tf.GradientTape() as actor_tape:
            #
            logits = tf.math.log(actor(state))
            
            # Compute the actor loss (log part of the policy gradient)
            #actor_loss = cce(tf.cast(action, tf.int32), logits[0])
            advantages = estimated_return - int(state_v)
            
            actor_loss = _logits_loss(action, logits, advantages)
            # Compute gradient with respect to the parameters of the actor            
            
            gradients = actor_tape.gradient(actor_loss, actor.trainable_variables)
            # Accumulate gradients
            #policy_gradients = policy_gradients + gradients * (estimated_return - state_v)
        #breakpoint()
        # Compute gradients for the critic
        # minimize MSE for the state value function
        #critic_gradients =
        with tf.GradientTape() as critic_tape:
            # Compute the loss
            critic_loss = mse(estimated_reward, state_v)
            # Compute the gradient
            gradients = critic_tape.gradient(critic_loss, critic.trainable_variables)
            breakpoint()
            # Accumulate gradients
            critic_gradients += gradients
            
        # Apply gradients.
        actor_optimizer.apply_gradients(zip(policy_gradients, actor.trainable_variables))
        critic_optimizer.apply_gradients(zip(critic_gradients, critic.trainable_variables))
        losses.append(loss)

    observations = []

    # Store summary statistics
    with train_summary_writer.as_default():
        tf.summary.scalar('?b', tf.reduce_mean(losses), step=step)

env.close()

Reusing TensorBoard on port 6006 (pid 13371), started 3:42:13 ago. (Use '!kill 13371' to kill it.)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.





Episode 1 of 40 findished after 15 timesteps


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

> <ipython-input-14-c19c9c0f3f49>(95)<module>()
-> critic_gradients += gradients
(Pdb) gradients
[None, None, None, None, None, None]
(Pdb) critic_loss
<tf.Tensor: id=4174, shape=(1,), dtype=float32, numpy=array([0.24073102], dtype=float32)>


In [11]:
import tensorflow.keras.losses as kls
import tensorflow.keras.optimizers as ko
  
def _logits_loss(actions, logits, advantages):
    """Advantage-weighted sparse categorical cross-entropy (policy loss).

    Only the entry of `logits` selected by the taken action contributes to
    the loss, and that contribution is scaled by the advantage.

    Args:
        actions: index/indices of the action(s) actually taken; cast to int32.
        logits: unnormalized scores per action. The loss is built with
            `from_logits=True`, so it normalizes them itself.
        advantages: weight for the loss. It is wrapped in a doubly nested
            list before casting, so a scalar becomes a (1, 1) float32 tensor.

    Returns:
        The weighted loss produced by `SparseCategoricalCrossentropy`.
    """
    # The advantage enters through the `sample_weight` argument of the loss call.
    sample_weights = tf.cast([[advantages]], tf.float32)
    taken_actions = tf.cast(actions, tf.int32)

    cross_entropy = kls.SparseCategoricalCrossentropy(from_logits=True)
    return cross_entropy(taken_actions, logits, sample_weight=sample_weights)

In [8]:
# Scratch vectors: four ones and four zeros, used to try out tf.split.
a, b = np.ones(4), np.zeros(4)

In [13]:
# Put `a` and `b` side by side as the two columns of a (4, 2) array, then
# split it back into two (4, 1) tensors along the last axis.
tf.split(np.stack([a, b], axis=-1), 2, axis=-1)


[<tf.Tensor: id=12, shape=(4, 1), dtype=float64, numpy=
 array([[1.],
        [1.],
        [1.],
        [1.]])>, <tf.Tensor: id=13, shape=(4, 1), dtype=float64, numpy=
 array([[0.],
        [0.],
        [0.],
        [0.]])>]

In [16]:
# A doubly nested scalar casts to a tensor of shape (1, 1).
tf.cast([[1]], dtype=tf.int32)

<tf.Tensor: id=16, shape=(1, 1), dtype=int32, numpy=array([[1]], dtype=int32)>