# A2C - LunarLander (continuous)
Implementation of the A2C RL algorithm for OpenAI Gym's LunarLanderContinuous-v2 environment (single environment, not run in parallel).

In [1]:
%load_ext tensorboard
# Import libraries
import numpy as np
import matplotlib.pyplot as pd
from datetime import datetime

import tensorflow as tf

import gym

In [2]:
# Value Function Estimator
class Critic(tf.keras.layers.Layer):
    """State-value network V(s).

    Two ReLU hidden layers (32 and 8 units) followed by a single linear
    output unit. The first layer is declared with ``input_shape=[8,]``,
    i.e. it expects 8 state features per sample.
    """

    def __init__(self):
        super(Critic, self).__init__()
        self.fc1 = tf.keras.layers.Dense(units=32, input_shape=[8,], activation='relu')
        self.fc2 = tf.keras.layers.Dense(units=8, activation='relu')
        # Linear head: a ReLU here would clamp every value estimate to >= 0,
        # so negative returns could never be represented, and the unit would
        # stop receiving gradient once its pre-activation became negative.
        self.out = tf.keras.layers.Dense(units=1, activation=None)

    def call(self, x):
        """Estimate the state value.

        Args:
            x: Batch of states with shape (batch, features), or a single
                un-batched state with shape (features,).

        Returns:
            Tensor of shape (batch, 1) holding one value estimate per state
            (batch is 1 for an un-batched input).
        """
        x = tf.convert_to_tensor(x)
        # Dense layers need a batch axis; accept a single raw state as well.
        if x.shape.rank == 1:
            x = tf.expand_dims(x, axis=0)
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.out(x)
        return x

In [3]:
# Policy network (actor): parameters of a Gaussian policy
class Actor(tf.keras.layers.Layer):
    """Gaussian policy network.

    A shared trunk feeds two separate heads that produce the mean and the
    standard deviation of a normal distribution over actions:

        64(shared) -> 64(shared) -> 32 -> 32 -> mu    (tanh,    range [-1, 1])
        64(shared) -> 64(shared) -> 32 -> 32 -> sigma (sigmoid, range [0, 1])

    Args:
        action_dim: Number of units in each output head, i.e. how many
            independent (mu, sigma) pairs are produced per state. Defaults
            to 1, in which case callers broadcast the single pair over all
            action components.

    Raises:
        ValueError: If ``action_dim`` is smaller than 1.
    """

    def __init__(self, action_dim=1):
        super(Actor, self).__init__()
        if action_dim < 1:
            raise ValueError(f"action_dim must be >= 1, got {action_dim}")
        self.action_dim = action_dim

        # Trunk shared by both heads
        self.sharedFC1 = tf.keras.layers.Dense(units=64, input_shape=[8,], activation='relu')
        self.sharedFC2 = tf.keras.layers.Dense(units=64, activation='relu')

        # Hidden layers of the mean head
        self.muFC1 = tf.keras.layers.Dense(units=32, activation='relu')
        self.muFC2 = tf.keras.layers.Dense(units=32, activation='relu')

        # Hidden layers of the standard-deviation head
        self.sigmaFC1 = tf.keras.layers.Dense(units=32, activation='relu')
        self.sigmaFC2 = tf.keras.layers.Dense(units=32, activation='relu')

        # Output layers: tanh bounds the mean, sigmoid keeps sigma positive
        self.mu_out = tf.keras.layers.Dense(units=action_dim, activation='tanh')
        self.sigma_out = tf.keras.layers.Dense(units=action_dim, activation='sigmoid')

    def call(self, x):
        """Compute the policy parameters for a batch of states.

        Args:
            x: Batch of states with shape (batch, features).

        Returns:
            Tuple ``(mu, sigma)``, each of shape (batch, action_dim).
        """
        x = tf.convert_to_tensor(x)
        x = self.sharedFC1(x)
        x = self.sharedFC2(x)

        mu = self.muFC1(x)
        mu = self.muFC2(mu)
        mu = self.mu_out(mu)

        sigma = self.sigmaFC1(x)
        sigma = self.sigmaFC2(sigma)
        sigma = self.sigma_out(sigma)

        return (mu, sigma)

In [11]:
# Prepare Tensorboard
!rm -rf ./logs/
current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
train_summary_writer = tf.summary.create_file_writer(train_log_dir)
#%tensorboard --logdir logs/
#tf.keras.backend.clear_session()
# Initialize cart pole environment
env = gym.make('LunarLanderContinuous-v2')
# Initialize model, loss and optimizer
actor = Actor()
critic = Critic()
actor_optimizer = tf.keras.optimizers.Adam()
critic_optimizer = tf.keras.optimizers.Adam()
mse = tf.keras.losses.MSE
weighted_sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Initialize replay memory
observations = []
# Set hyperparameters
discount = 0.95
max_time_steps = 500
num_episodes = 100

step = 0
# Run for agent and environment for num_episodes
for i_episode in range(num_episodes):
    state = env.reset()
    #breakpoint()
    
    # Agent has 500 trials at max, if it does not fail beforehand
    for t in range(max_time_steps):
#        env.render()
        # Compute action
        state = np.reshape(state, [1,8])
        mu, sigma = actor(state)
        
        # sample two values from normal distribution
        action = tf.random.normal((2,), mean=mu, stddev=sigma)
        action = np.reshape(action, (2,))
        # Execute action and store action, state and reward
        print(action)
        next_state, reward, done, info = env.step(action)
        observations.append((state, action, reward))
        state = next_state
        
        # Interrupt the trial if the agent fails
        if done:
            break
        step += 1
        
    print(f"Episode {i_episode + 1} of {num_episodes} findished after {t+1} timesteps")
        
    # Store losses temporary
    losses = []
    # Initialize variable for the estimated return
    estimated_return = 0 if done else critic(next_state)
    
    # Iterate over taken actions and observed states and rewards
    observations.reverse()
    for state, action, reward in observations:
        # Compute estimated return
        estimated_return = discount * estimated_return + reward
        # Compute state value
        state_v = critic(state)
    
        # Compute gradients for the actor (policy gradient)
        # Maximize the estimated return
        with tf.GradientTape() as actor_tape:
            logits = tf.math.log(actor(state))
            advantages = estimated_return - int(state_v)
            advantages = tf.cast([[advantages]], tf.float32)
            action = tf.cast(action, tf.int32)
            # Compute the actor loss (log part of the policy gradient)
            actor_loss = weighted_sparse_ce(action, logits, sample_weight=advantages)
            # Compute gradient with respect to the parameters of the actor            
            policy_gradients = actor_tape.gradient(actor_loss, actor.trainable_variables)

        # Compute gradients for the critic
        # minimize MSE for the state value function
        with tf.GradientTape() as critic_tape:
            state_v = critic(state)
            # Compute the loss
            critic_loss = mse(estimated_return, state_v)
            # Compute the gradient
            critic_gradients = critic_tape.gradient(critic_loss, critic.trainable_variables)
            #breakpoint()
            # Accumulate gradients
            #critic_gradients.append(gradients)
            
        # Apply gradients.
        actor_optimizer.apply_gradients(zip(policy_gradients, actor.trainable_variables))
        critic_optimizer.apply_gradients(zip(critic_gradients, critic.trainable_variables))
        losses.append(actor_loss)

    observations = []

    # Store summary statistics
    with train_summary_writer.as_default():
        tf.summary.scalar('?b', tf.reduce_mean(losses), step=step)

env.close()



[0.28530455 1.1542523 ]
[ 0.245451   -0.44724196]
[-0.38074675  0.08604362]
[-0.14134149 -0.60266405]
[0.27019036 0.1077247 ]
[-1.1040916   0.05945595]
[-0.09902799 -0.3725792 ]
[-0.36259574 -0.75470775]
[-0.24704966  0.9929735 ]
[-0.35394356 -0.48451877]
[0.0257036  0.54721546]
[-0.06890254  0.32277626]
[-0.4002181   0.30130306]
[ 0.43381265 -0.23089729]
[0.46184132 0.23931234]
[0.1635864  0.35165933]
[0.54420996 0.5298963 ]
[-0.04836908 -0.12592994]
[-0.03511038  0.1852528 ]
[-1.1173435  -0.24602829]
[-0.01030482  0.40038556]
[ 0.42590514 -0.63675827]
[0.35714528 0.154502  ]
[0.81446916 0.42151278]
[0.18502414 0.9310327 ]
[-0.5011111   0.10578859]
[-0.03380073 -0.40437925]
[-0.56764567 -0.10875219]
[ 0.1087542  -0.66132253]
[0.29736346 0.12485707]
[0.24273135 0.0116913 ]
[-0.31810665 -0.33983552]
[0.05975203 1.4467914 ]
[0.31143478 0.9642538 ]
[0.5935407  0.62621677]
[ 0.25494638 -0.18613185]
[-0.00828307 -0.24017277]
[0.3883799 0.3455398]
[-0.98170143 -0.5537721 ]
[-0.29493386  0.34

InvalidArgumentError: Received a label value of -1 which is outside the valid range of [0, 1).  Label values: 0 -1 [Op:SparseSoftmaxCrossEntropyWithLogits]

Box(2,)
