# A2C - LunarLander (continuous)
Implementation of the A2C RL algorithm for OpenAI Gym's LunarLanderContinuous-v2 environment (single environment, not run in parallel).

In [1]:
%load_ext tensorboard
# Import libraries
import numpy as np
import matplotlib.pyplot as pd
from datetime import datetime

import tensorflow as tf
import tensorflow_probability as tfp

import gym

In [2]:
# Value Function Estimator (critic)
class Critic(tf.keras.layers.Layer):
    """State-value estimator V(s).

    Architecture: Dense(128, relu) -> Dropout(0.2) -> Dense(64, relu) -> Dense(1).
    The first layer is declared with ``input_shape=[8,]``, i.e. it is meant to
    be fed states with 8 features.
    """

    def __init__(self):
        super(Critic, self).__init__()
        self.fc1 = tf.keras.layers.Dense(units=128, input_shape=[8,], activation='relu')

        self.Dropout = tf.keras.layers.Dropout(rate=0.2)

        self.fc2 = tf.keras.layers.Dense(units=64, activation='relu')
        self.out = tf.keras.layers.Dense(units=1, activation=None)

    def call(self, x, training=True):
        """Return the estimated value for each state in ``x``.

        Args:
            x: Batch of states; the last dimension is expected to be 8.
            training: Whether dropout is active. Defaults to ``True`` so that
                callers which do not pass the argument keep the previous
                behaviour (dropout was always on). Pass ``training=False`` to
                obtain a deterministic value estimate, e.g. when the value is
                used as a baseline or bootstrap target.

        Returns:
            Tensor with one value per input state (last dimension 1).
        """
        x = self.fc1(x)
        # Dropout used to be hard-wired to training mode; it is now controllable.
        x = self.Dropout(x, training=training)
        x = self.fc2(x)
        x = self.out(x)
        return x

In [3]:
# Policy network (actor): outputs the mean and standard deviation of a Gaussian policy
class Actor(tf.keras.layers.Layer):
    """Gaussian policy network producing ``(mu, sigma)`` for a 2-dimensional action.

    Two shared dense layers followed by batch normalisation feed two separate
    heads: one for the mean (tanh) and one for the standard deviation
    (sigmoid, plus a small positive floor).
    """

    # Floor added to sigma. A sigmoid can underflow to exactly 0 in float32,
    # and a zero scale makes the Normal log-probability infinite / NaN.
    MIN_SIGMA = 1e-5

    def __init__(self):
        super(Actor, self).__init__()

        # 64(share) -> 64(share) -> 32 -> 32 -> mu(tanh) [-1,1]
        # 64(share) -> 64(share) -> 32 -> 32 -> sigma(sigmoid) (0,1]
        self.sharedFC1 = tf.keras.layers.Dense(units=64, input_shape=[8,], activation='relu')
        self.sharedFC2 = tf.keras.layers.Dense(units=64, activation='relu')

        self.sharedBatchNorm = tf.keras.layers.BatchNormalization()

        self.muFC1 = tf.keras.layers.Dense(units=32, activation='relu')
        self.muFC2 = tf.keras.layers.Dense(units=32, activation='relu')

        self.sigmaFC1 = tf.keras.layers.Dense(units=32, activation='relu')
        self.sigmaFC2 = tf.keras.layers.Dense(units=32, activation='relu')

        self.mu_out = tf.keras.layers.Dense(units=2, activation='tanh')
        self.sigma_out = tf.keras.layers.Dense(units=2, activation='sigmoid')

    def call(self, x, training=False):
        """Compute the parameters of the action distribution.

        Args:
            x: Batch of states (anything ``tf.convert_to_tensor`` accepts);
                the last dimension is expected to be 8.
            training: Forwarded to the batch-normalisation layer. Defaults to
                ``False`` (use the moving statistics). Previously the layer
                was forced into training mode; with a batch of a single state
                the batch mean equals the sample and the batch variance is 0,
                so the normalised activations collapse to the layer's
                ``beta`` and the policy no longer depends on the state.
                Only pass ``training=True`` with batches of more than one
                state.

        Returns:
            Tuple ``(mu, sigma)``, each with last dimension 2. ``mu`` lies in
            [-1, 1]; ``sigma`` lies in [MIN_SIGMA, 1 + MIN_SIGMA].
        """
        x = tf.convert_to_tensor(x)
        x = self.sharedFC1(x)
        x = self.sharedFC2(x)

        x = self.sharedBatchNorm(x, training=training)

        mu = self.muFC1(x)
        mu = self.muFC2(mu)
        mu = self.mu_out(mu)

        sigma = self.sigmaFC1(x)
        sigma = self.sigmaFC2(sigma)
        # Keep the standard deviation strictly positive.
        sigma = self.sigma_out(sigma) + self.MIN_SIGMA

        return mu, sigma

In [4]:
# Prepare Tensorboard
!rm -rf ./logs/
current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
train_summary_writer = tf.summary.create_file_writer(train_log_dir)
#%tensorboard --logdir logs/
tf.keras.backend.clear_session()
# Initialize cart pole environment
env = gym.make('LunarLanderContinuous-v2')
# Initialize model, loss and optimizer
actor = Actor()
critic = Critic()
actor_optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
critic_optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
mse = tf.keras.losses.MSE
weighted_sparse_ce = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
# Initialize replay memory
observations = []
# Set hyperparameters
discount = 0.95
max_time_steps = 500
num_episodes = 10000

# Store losses temporary
actor_losses = []
critic_losses = []
accum_reward = 0.

step = 0
# Run for agent and environment for num_episodes
for i_episode in range(num_episodes):
    state = env.reset()
    #breakpoint()
    
    # Agent has 500 trials at max, if it does not fail beforehand
    for t in range(max_time_steps):
        env.render()
        # Compute action
        state = np.reshape(state, [1,8])
        mu, sigma = actor(state)
        
        # sample two values from normal distribution
        mainEngineAction = tf.random.normal((1,), mean=mu[0,0], stddev=sigma[0,0])
        sideEngineAction = tf.random.normal((1,), mean=mu[0,1], stddev=sigma[0,1])
        action = tf.concat([mainEngineAction, sideEngineAction], 0)
        #        mainEngineAction = np.reshape(action, (2,))
        # Execute action and store action, state and reward
        next_state, reward, done, info = env.step(action)
        observations.append((state, action, reward))
        state = next_state
        accum_reward += reward
        # Interrupt the trial if the agent fails
        if done:
            break
        step += 1
        
    print(f"Episode {i_episode + 1} of {num_episodes} finished after {t+1} timesteps")
        
    # Initialize variable for the estimated return
    estimated_return = 0 if done else critic(next_state)
    
    # Iterate over taken actions and observed states and rewards
    observations.reverse()
    for state, action, reward in observations:
        # Compute estimated return
        estimated_return = discount * estimated_return + reward
        # Compute state value
        state_v = critic(state)
    
        # Compute gradients for the actor (policy gradient)
        # Maximize the estimated return
        with tf.GradientTape() as actor_tape:
            mu, sigma = actor(state)
            advantages = estimated_return - int(state_v)
            advantages = tf.cast([[advantages]], tf.float32)
            action_distribution = tfp.distributions.Normal(loc=mu, scale=sigma)
            logprob = action_distribution.log_prob(action)
            #breakpoint()
            actor_loss = logprob * advantages
            #breakpoint()
            # Compute the actor loss (log part of the policy gradient)
            # Compute gradient with respect to the parameters of the actor            
            policy_gradients = actor_tape.gradient(actor_loss, actor.trainable_variables)

        # Compute gradients for the critic
        # minimize MSE for the state value function
        with tf.GradientTape() as critic_tape:
            state_v = critic(state)
            # Compute the loss
            critic_loss = mse(estimated_return, state_v)
            # Compute the gradient
            critic_gradients = critic_tape.gradient(critic_loss, critic.trainable_variables)
            #breakpoint()
            # Accumulate gradients
            #critic_gradients.append(gradients)
            
        # Apply gradients.
        actor_optimizer.apply_gradients(zip(policy_gradients, actor.trainable_variables))
        critic_optimizer.apply_gradients(zip(critic_gradients, critic.trainable_variables))
        actor_losses.append(actor_loss)
        critic_losses.append(critic_loss)

    observations = []

    # Store summary statistics
    with train_summary_writer.as_default():
        tf.summary.scalar('policy loss', tf.reduce_mean(actor_losses), step=step)
        
        # Store summary statistics
        tf.summary.scalar('critic loss', tf.reduce_mean(critic_losses), step=step)
        
        # Critic
        #tf.summary.scalar('V(s)', state_v[0,0], step=step)
        
        # Actor
        tf.summary.scalar('mu0', mu[0,0], step=step)
        tf.summary.scalar('sigma0', sigma[0,0], step=step)
        tf.summary.scalar('mu1', mu[0,1], step=step)
        tf.summary.scalar('sigma1', sigma[0,1], step=step)
        
        # Accumulative reward
        tf.summary.scalar("accumulative reward", accum_reward, step=step)
    
    accum_reward = 0.

env.close()



Episode 1 of 10000 finished after 166 timesteps
Episode 2 of 10000 finished after 150 timesteps
Episode 3 of 10000 finished after 165 timesteps
Episode 4 of 10000 finished after 65 timesteps
Episode 5 of 10000 finished after 71 timesteps
Episode 6 of 10000 finished after 66 timesteps
Episode 7 of 10000 finished after 80 timesteps
Episode 8 of 10000 finished after 64 timesteps
Episode 9 of 10000 finished after 79 timesteps
Episode 10 of 10000 finished after 83 timesteps
Episode 11 of 10000 finished after 51 timesteps
Episode 12 of 10000 finished after 71 timesteps
Episode 13 of 10000 finished after 50 timesteps
Episode 14 of 10000 finished after 66 timesteps
Episode 15 of 10000 finished after 57 timesteps
Episode 16 of 10000 finished after 62 timesteps
Episode 17 of 10000 finished after 69 timesteps
Episode 18 of 10000 finished after 51 timesteps
Episode 19 of 10000 finished after 51 timesteps
Episode 20 of 10000 finished after 77 timesteps
Episode 21 of 10000 finished after 73 timestep

Episode 171 of 10000 finished after 60 timesteps
Episode 172 of 10000 finished after 81 timesteps
Episode 173 of 10000 finished after 86 timesteps
Episode 174 of 10000 finished after 78 timesteps
Episode 175 of 10000 finished after 66 timesteps
Episode 176 of 10000 finished after 80 timesteps
Episode 177 of 10000 finished after 62 timesteps
Episode 178 of 10000 finished after 57 timesteps
Episode 179 of 10000 finished after 51 timesteps
Episode 180 of 10000 finished after 70 timesteps
Episode 181 of 10000 finished after 85 timesteps
Episode 182 of 10000 finished after 84 timesteps
Episode 183 of 10000 finished after 48 timesteps
Episode 184 of 10000 finished after 72 timesteps
Episode 185 of 10000 finished after 83 timesteps
Episode 186 of 10000 finished after 65 timesteps
Episode 187 of 10000 finished after 60 timesteps
Episode 188 of 10000 finished after 74 timesteps
Episode 189 of 10000 finished after 71 timesteps
Episode 190 of 10000 finished after 89 timesteps
Episode 191 of 10000

KeyboardInterrupt: 

In [None]:
# Show the observation space of the environment created above.
env.observation_space