# A2C - CartPole
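Implementation of the A2C RL algorithm for OpenAI Gym's CartPole-v1 environment (single environment, not run in parallel). Note: the training cell below currently creates the `LunarLanderContinuous-v2` environment (8-dimensional state, 2-dimensional continuous action) rather than CartPole.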

In [4]:
%load_ext tensorboard
# Import libraries
import numpy as np
import matplotlib.pyplot as pd
from datetime import datetime

import tensorflow as tf

import gym

In [5]:
# Value Function Estimator
class Critic(tf.keras.layers.Layer):
    """State-value estimator V(s).

    A small fully connected network (32 -> 8 -> 1) that maps a batch of
    states to one scalar value estimate per state.
    """

    def __init__(self):
        super(Critic, self).__init__()
        self.fc1 = tf.keras.layers.Dense(units=32, input_shape=[8,], activation='relu')
        self.fc2 = tf.keras.layers.Dense(units=8, activation='relu')
        # Linear output: a ReLU here would clamp every estimate to >= 0, so the
        # critic could never represent a negative return, and the unit would
        # stop receiving gradient whenever its pre-activation went negative.
        self.out = tf.keras.layers.Dense(units=1, activation=None)

    def call(self, x):
        """Return the value estimate for each state in `x`.

        Args:
            x: batch of states; the first layer declares input_shape=[8,],
               so a (batch, 8) float input is expected.

        Returns:
            Tensor of shape (batch, 1) with unbounded value estimates.
        """
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.out(x)
        return x

In [13]:
# Policy network: outputs the mean and standard deviation of a Gaussian per action dimension
class Actor(tf.keras.layers.Layer):
    """Gaussian policy head for a 2-dimensional continuous action.

    Two dense layers are shared by both heads; each head then has its own
    two hidden layers and a 2-unit output:
        shared 64 -> shared 64 -> 32 -> 32 -> mu    (tanh,    range [-1, 1])
        shared 64 -> shared 64 -> 32 -> 32 -> sigma (sigmoid, range [0, 1])
    """

    def __init__(self):
        super().__init__()
        dense = tf.keras.layers.Dense

        # Trunk shared by the mean head and the standard-deviation head.
        self.sharedFC1 = dense(units=64, input_shape=[8,], activation='relu')
        self.sharedFC2 = dense(units=64, activation='relu')

        # Hidden layers of the mean head.
        self.muFC1 = dense(units=32, activation='relu')
        self.muFC2 = dense(units=32, activation='relu')

        # Hidden layers of the standard-deviation head.
        self.sigmaFC1 = dense(units=32, activation='relu')
        self.sigmaFC2 = dense(units=32, activation='relu')

        # Output layers: one unit per action dimension.
        self.mu_out = dense(units=2, activation='tanh')
        self.sigma_out = dense(units=2, activation='sigmoid')

    def call(self, x):
        """Compute the Gaussian parameters for each state in `x`.

        Args:
            x: batch of states (anything `tf.convert_to_tensor` accepts).

        Returns:
            Tuple `(mu, sigma)`, each with 2 values per input state.
        """
        features = self.sharedFC2(self.sharedFC1(tf.convert_to_tensor(x)))

        mu = self.mu_out(self.muFC2(self.muFC1(features)))
        sigma = self.sigma_out(self.sigmaFC2(self.sigmaFC1(features)))

        return mu, sigma

In [26]:
# Prepare Tensorboard
!rm -rf ./logs/
current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
train_summary_writer = tf.summary.create_file_writer(train_log_dir)
#%tensorboard --logdir logs/
#tf.keras.backend.clear_session()
# Initialize cart pole environment
env = gym.make('LunarLanderContinuous-v2')
# Initialize model, loss and optimizer
actor = Actor()
critic = Critic()
actor_optimizer = tf.keras.optimizers.Adam()
critic_optimizer = tf.keras.optimizers.Adam()
mse = tf.keras.losses.MSE
weighted_sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Initialize replay memory
observations = []
# Set hyperparameters
discount = 0.95
max_time_steps = 500
num_episodes = 100

step = 0
# Run for agent and environment for num_episodes
for i_episode in range(num_episodes):
    state = env.reset()
    #breakpoint()
    
    # Agent has 500 trials at max, if it does not fail beforehand
    for t in range(max_time_steps):
#        env.render()
        # Compute action
        state = np.reshape(state, [1,8])
        mu, sigma = actor(state)
        
        # sample two values from normal distribution
        mainEngineAction = tf.random.normal((1,), mean=mu[0,0], stddev=sigma[0,0])
        sideEngineAction = tf.random.normal((1,), mean=mu[0,1], stddev=sigma[0,1])
        action = tf.concat(mainEngineAction, sideEngineAction, 0)
        #        mainEngineAction = np.reshape(action, (2,))
        # Execute action and store action, state and reward
        print(mainEngineAction)
        next_state, reward, done, info = env.step(action)
        observations.append((state, action, reward))
        state = next_state
        
        # Interrupt the trial if the agent fails
        if done:
            break
        step += 1
        
    print(f"Episode {i_episode + 1} of {num_episodes} findished after {t+1} timesteps")
        
    # Store losses temporary
    losses = []
    # Initialize variable for the estimated return
    estimated_return = 0 if done else critic(next_state)
    
    # Iterate over taken actions and observed states and rewards
    observations.reverse()
    for state, action, reward in observations:
        # Compute estimated return
        estimated_return = discount * estimated_return + reward
        # Compute state value
        state_v = critic(state)
    
        # Compute gradients for the actor (policy gradient)
        # Maximize the estimated return
        with tf.GradientTape() as actor_tape:
            logits = tf.math.log(actor(state))
            advantages = estimated_return - int(state_v)
            advantages = tf.cast([[advantages]], tf.float32)
            action = tf.cast(action, tf.int32)
            # Compute the actor loss (log part of the policy gradient)
            actor_loss = weighted_sparse_ce(action, logits, sample_weight=advantages)
            # Compute gradient with respect to the parameters of the actor            
            policy_gradients = actor_tape.gradient(actor_loss, actor.trainable_variables)

        # Compute gradients for the critic
        # minimize MSE for the state value function
        with tf.GradientTape() as critic_tape:
            state_v = critic(state)
            # Compute the loss
            critic_loss = mse(estimated_return, state_v)
            # Compute the gradient
            critic_gradients = critic_tape.gradient(critic_loss, critic.trainable_variables)
            #breakpoint()
            # Accumulate gradients
            #critic_gradients.append(gradients)
            
        # Apply gradients.
        actor_optimizer.apply_gradients(zip(policy_gradients, actor.trainable_variables))
        critic_optimizer.apply_gradients(zip(critic_gradients, critic.trainable_variables))
        losses.append(actor_loss)

    observations = []

    # Store summary statistics
    with train_summary_writer.as_default():
        tf.summary.scalar('?b', tf.reduce_mean(losses), step=step)

env.close()

ValueError: Tensor conversion requested dtype int32 for Tensor with dtype float32: <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-0.0586404], dtype=float32)>

Box(2,)
