## Imports

In [None]:
import base64
import os
import time

import gym
import imageio
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from IPython import display as ipythondisplay
from IPython.display import clear_output

tfkl = tf.keras.layers
plt.style.use('ggplot') 

## Task 01: Implementing the Network

In [None]:
class CNN(tf.keras.Model):
    """Convolutional Gaussian policy network

    Two strided convolutions followed by a dense layer produce the mean of a
    diagonal Gaussian over the actions. The first mean component is squashed
    with tanh, the remaining ones with sigmoid. The standard deviations are
    fixed at (0.5, 0.2, 0.2), so the network assumes a 3-dimensional action.
    """

    def __init__(self, env):
        """Build the layers of the policy network

        Arguments:
            env (gym): Environment; ``env.action_space.shape[0]`` determines the number of action components
        """

        super(CNN, self).__init__()

        self.conv2D1 = tfkl.Conv2D(filters=16, kernel_size=3, strides=(2,2), padding="valid", activation="relu")
        self.conv2D2 = tfkl.Conv2D(filters=32, kernel_size=3, strides=(2,2), padding="valid", activation="relu")
        self.flatten = tfkl.Flatten()
        self.dense = tfkl.Dense(units=64, activation="relu")
        # First action component: mean bounded by tanh
        self.out1 = tfkl.Dense(units=1, activation = "tanh")
        # Remaining action components: means bounded by sigmoid (alternative: rescale)
        self.out2 = tfkl.Dense(units=env.action_space.shape[0]-1, activation = "sigmoid")

    @tf.function()
    def call(self, states, actions=None):
        """Sample actions for the given states or score given actions

        Arguments:
            states (tf.Tensor): Batch of observations fed to the first convolution
            actions (tf.Tensor): Optional batch of actions whose log-probabilities are wanted

        Returns:
            (tuple): ``(actions, log_probs)`` of freshly sampled actions if ``actions`` is None
            (tf.Tensor): ``log_probs`` of the given ``actions`` otherwise
        """

        x = self.conv2D1(states)
        x = self.conv2D2(x)
        x = self.flatten(x)
        x = self.dense(x)
        mus = tf.concat((self.out1(x), self.out2(x)), axis=1)

        # Fixed per-component standard deviations, broadcast over the batch
        multivariate = tfp.distributions.MultivariateNormalDiag(loc=mus, scale_diag=[[0.5, 0.2, 0.2]])

        # Identity check: "no actions given" must not go through tensor equality
        if actions is None:
            actions = multivariate.sample()
            return actions, multivariate.log_prob(actions)

        return multivariate.log_prob(actions)

## Task 02: Creating Trajectories

In [None]:
def create_trajectory(env, model, discount_factor, return_avg_return: bool=False, steps=None, num_trajectories: int=1):
    """Function to roll out the model in the environment and collect the transitions

    Arguments:
        env (gym): Environment the agent interacts with
        model (tf.keras.Model): Policy; called on a batch of one 0-1 scaled observation, expected to return (actions, log_probs)
        discount_factor (float): Discount applied when accumulating the returns
        return_avg_return (bool): Whether to also return the average start-of-episode return
        steps (int): Maximum number of steps per trajectory, None for no limit
        num_trajectories (int): Number of episodes to collect

    Returns:
        (tf.data.Dataset): Slices of (state, action, reward, return, next state, time step)
        (float): Average discounted return from the first step, only if return_avg_return is True
    """

    s = []
    a = []
    r = []
    g = []
    s_prime = []
    time_steps = []

    avg_return = 0.0

    for _ in range(num_trajectories):

        obs = env.reset()
        state = tf.convert_to_tensor(obs/255, dtype=tf.float32)
        step_count = 0
        done = False

        r_cache = []

        while not done and step_count != steps:
            # Actions keep the leading batch axis of size one returned by the model
            actions, _ = model(tf.expand_dims(state, axis=0))
            obs_prime, reward, done, _ = env.step(tf.squeeze(actions, axis=0).numpy())
            next_state = tf.convert_to_tensor(obs_prime/255, dtype=tf.float32)

            s.append(state)
            a.append(actions)
            r.append(reward)
            r_cache.append(reward)
            s_prime.append(next_state)
            time_steps.append(step_count)

            # Reuse the converted next state instead of converting it a second time
            state = next_state
            step_count += 1

        # Accumulate the discounted returns backwards through the episode
        cumul_return = 0.0
        g_cache = []
        for reward in reversed(r_cache):
            cumul_return = discount_factor*cumul_return + reward
            g_cache.append(cumul_return)

        # The last accumulated value is the return from the first step;
        # an empty trajectory (e.g. steps=0) contributes nothing
        if g_cache:
            avg_return += g_cache[-1]

        g.extend(reversed(g_cache))

    dataset = tf.data.Dataset.from_tensor_slices((s, a, r, g, s_prime, time_steps))

    if return_avg_return:
        return dataset, avg_return/num_trajectories
    return dataset

## Task 03: Vanilla Policy Gradient

In [None]:
@tf.function()
def gradient_step(model, s, a, g, time_steps, optimizer):
    """Function to perform one vanilla policy gradient update

    The loss is the negative mean of log-probability * return * discount_factor**time_step.
    ``discount_factor`` is read from the notebook's global scope.

    NOTE: a later cell defines another ``gradient_step`` with a different
    signature, which replaces this one in the kernel once that cell has run.

    Arguments:
        model (tf.keras.Model): Policy returning log-probabilities for given states and actions
        s (tf.Tensor): Batch of states
        a (tf.Tensor): Batch of actions with an extra axis of size one at position 1
        g (tf.Tensor): Batch of discounted returns
        time_steps (tf.Tensor): Batch of step indices within the respective trajectory
        optimizer (tf.keras.optimizers.Optimizer): Optimizer applying the gradients

    Returns:
        loss (tf.Tensor): Scalar loss of this update
    """

    with tf.GradientTape() as tape:
        log_probs = model(states=s, actions=tf.squeeze(a, axis=1))

        # The step indices are collected as integers; a float base cannot be raised
        # to an integer tensor, so bring every factor to the log-prob dtype first
        exponents = tf.cast(time_steps, log_probs.dtype)
        discounts = tf.pow(tf.cast(discount_factor, log_probs.dtype), exponents)
        weights = tf.cast(g, log_probs.dtype) * discounts

        loss = - tf.math.reduce_mean(log_probs * weights)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return loss

In [None]:
def embed_mp4(path):
    """Function to display an mp4 file in the current notebook
    
     Arguments:
            path (str): Path of the video file that should be loaded
            
    Returns:
            (ipythondisplay.HTML): Display object embedding the base64 encoded video
    """
    
    # Context manager so the file handle is closed right after reading
    with open(path, 'rb') as video_file:
        video = video_file.read()

    b64 = base64.b64encode(video)
    tag = '''<video width="640" height="480" controls>
    <source src="data:video/mp4;base64,{0}" type="video/mp4">
    Your browser does not support the video tag.
    </video>'''.format(b64.decode())

    return ipythondisplay.HTML(tag)

In [None]:
def create_policy_eval_video(env, model, path: str, num_episodes: int=1, fps: int=30):
    """Function to save an mp4 file of the given models performance
    
     Arguments:
            env (gym): Environment the agent interacts with
            model (tf.keras.Model): Policy; called on a batch of one 0-1 scaled observation, expected to return (actions, log_probs)
            path (str): Target file path without extension, ".mp4" is appended
            num_episodes (int): Number of to be saved episodes 
            fps (int): Frames per second of the to be saved video
    """
    
    path += ".mp4"

    # Create the target folder so a fresh run does not depend on it already existing
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with imageio.get_writer(path, fps=fps) as video:
        for _ in range(num_episodes):
            obs = env.reset()
            # First frame shows the environment right after the reset
            video.append_data(env.render(mode='rgb_array'))
            done = False
            while not done:
                actions, _ = model(tf.expand_dims(tf.convert_to_tensor(obs/255, dtype=tf.float32), axis=0))
                obs, _, done, _  = env.step(tf.squeeze(actions, axis=0).numpy())
                video.append_data(env.render(mode='rgb_array'))

In [None]:
def timing(start):
    """Function to measure the seconds elapsed since a given start time

    Arguments:
        start (float): Reference timestamp as returned by time.time()
    
    Returns:
        (float): Elapsed seconds, rounded to four decimal places
    """

    elapsed = time.time() - start
    return round(elapsed, 4)

In [None]:
def train_CNN(env, model, steps, num_trajectories, num_epochs: int, batch_size: int, learning_rate: float, discount_factor: float):
    """Function to train the policy with vanilla policy gradient and plot the progress

    Every epoch collects fresh trajectories, performs one gradient step on the
    first shuffled batch and redraws the loss/return curves. An evaluation video
    and the weights are saved before training and every 10th epoch.

    NOTE: ``gradient_step`` is looked up by name at call time; the notebook
    defines a second function with that name further down, so this function
    only works as long as the vanilla version is the one currently defined.

    Arguments:
        env (gym): Environment the agent interacts with, closed at the end
        model (tf.keras.Model): Policy network to be trained
        steps (int): Maximum number of steps per trajectory, None for no limit
        num_trajectories (int): Number of trajectories collected per epoch
        num_epochs (int): Number of training epochs
        batch_size (int): Number of transitions per gradient step
        learning_rate (float): Learning rate of the Adam optimizer
        discount_factor (float): Discount used for the returns of the trajectories
    """

    tf.keras.backend.clear_session()
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    train_losses = []
    episode_rewards = []

    create_policy_eval_video(env, model, "Videos/eval_vid_epoch_pretraining")
    model.save_weights("CarRacingCNNWeights_epoch_pretraining")

    for epoch in range(num_epochs):

        running_average = 0
        start = time.time()

        dataset, avg_return = create_trajectory(env, model, discount_factor=discount_factor, return_avg_return=True, steps=steps, num_trajectories=num_trajectories)

        dataset = dataset.cache().shuffle(buffer_size=len(dataset)).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

        episode_rewards.append(avg_return)

        # NOTE(review): only the first shuffled batch of each epoch is trained on - confirm this is intended
        for s, a, _, g, _, time_steps in dataset.take(1):
            loss = gradient_step(model, s, a, g, time_steps, optimizer)
            # Running average to smoothen the loss plot (restarts at 0 every epoch)
            running_average = 0.95 * running_average + (1 - 0.95) * loss

        train_losses.append(float(running_average))

        # Create videos of performance and save them
        if epoch%10 == 0:
            create_policy_eval_video(env, model, f"Videos/eval_vid_epoch_{epoch}")
            model.save_weights(f"CarRacingCNNWeights_epoch_{epoch}")

        clear_output()
        print(f"The last epoch took: {timing(start)} seconds")
        print()
        fig1, ax1 = plt.subplots(nrows=1, ncols=2, figsize = (20, 6))
        ax1[0].plot(train_losses)
        # epoch is zero-based, so epoch + 1 epochs have been completed at this point
        ax1[0].set(ylabel='Loss', xlabel='Epoch', title=f'Average loss over {epoch + 1} epochs')
        ax1[1].plot(episode_rewards)
        ax1[1].set(ylabel='Return', xlabel='Epoch', title=f'Average return over {epoch + 1} epochs')
        plt.show()
        # Release the figure so that figures do not pile up over the epochs
        plt.close(fig1)

    env.close()

In [None]:
# Set hyperparameters
# NOTE: discount_factor is also read as a global inside the vanilla gradient_step,
# so it has to be defined at notebook level before training starts
discount_factor = 0.99
batch_size = 32
learning_rate = 0.0001
num_epochs = 500
# None disables the per-trajectory step limit in create_trajectory
steps = None
num_trajectories = 3

env = gym.make("CarRacing-v1")

model = CNN(env)

# Runs the full vanilla policy gradient training; train_CNN closes env when it finishes
train_CNN(env=env, model=model, steps=steps, num_trajectories=num_trajectories, num_epochs=num_epochs, batch_size=batch_size, learning_rate=learning_rate, discount_factor=discount_factor)

# A2C - PPO

In [None]:
class DVN(tf.keras.Model):
    """Deep value network mapping a batch of states to one scalar value each

    Two strided convolutions, a flatten step and a dense layer feed a single
    linear output unit. Only states are consumed, no actions.
    """

    def __init__(self, env):
        """Build the layers of the value network

        Arguments:
            env (gym): Environment; accepted for symmetry with the policy network but not used here
        """
        super().__init__()

        conv_settings = dict(kernel_size=3, strides=(2,2), padding="valid", activation="relu")
        self.conv2D1 = tfkl.Conv2D(filters=16, **conv_settings)
        self.conv2D2 = tfkl.Conv2D(filters=32, **conv_settings)
        self.flatten = tfkl.Flatten()
        self.dense = tfkl.Dense(units=64, activation="relu")
        self.out = tfkl.Dense(units=1)

    @tf.function()
    def call(self, states):
        """Compute the value estimates

        Arguments:
            states (tf.Tensor): Batch of observations fed to the first convolution

        Returns:
            (tf.Tensor): Output of the final single-unit dense layer
        """
        hidden = states
        # Apply the layers strictly in the order they were defined
        for layer in (self.conv2D1, self.conv2D2, self.flatten, self.dense):
            hidden = layer(hidden)

        return self.out(hidden)

In [None]:
@tf.function()
def gradient_step(actor, critic, s, a, r, g, s_prime, ppo_steps: int, discount_factor: float, eps: float, actor_optimizer, critic_optimizer):
    """Function to perform one PPO update of the actor and one regression step of the critic

    The advantages are one-step TD errors r + discount_factor * V(s') - V(s),
    computed once before the actor is updated ``ppo_steps`` times with the
    clipped surrogate objective. The critic is regressed onto the returns ``g``.

    NOTE(review): no terminal flag is passed in, so V(s') is also bootstrapped
    for the last transition of an episode - confirm this is acceptable.

    Arguments:
        actor (tf.keras.Model): Policy returning log-probabilities for given states and actions
        critic (tf.keras.Model): Value network returning one value per state
        s (tf.Tensor): Batch of states
        a (tf.Tensor): Batch of actions with an extra axis of size one at position 1
        r (tf.Tensor): Batch of rewards
        g (tf.Tensor): Batch of discounted returns
        s_prime (tf.Tensor): Batch of successor states
        ppo_steps (int): Number of actor updates on this batch
        discount_factor (float): Discount used in the TD error
        eps (float): Clipping range of the probability ratio
        actor_optimizer (tf.keras.optimizers.Optimizer): Optimizer of the actor
        critic_optimizer (tf.keras.optimizers.Optimizer): Optimizer of the critic

    Returns:
        actor_loss (tf.Tensor): Surrogate loss of the last actor update (0 if ppo_steps is 0)
        critic_loss (tf.Tensor): Mean squared error between the value estimates and the returns
    """

    taken_actions = tf.squeeze(a, axis=1)
    log_old = actor(states = s, actions = taken_actions)

    # The critic ends in a single-unit dense layer, i.e. one column per state.
    # Drop that axis so rewards and values line up element-wise instead of
    # broadcasting to a (batch, batch) matrix that mixes different samples.
    values = tf.squeeze(critic(states=s), axis=-1)
    next_values = tf.squeeze(critic(states=s_prime), axis=-1)
    advantages = (tf.cast(r, values.dtype) + discount_factor*next_values) - values

    # Defined up front so the return value exists even without any actor update
    actor_loss = tf.constant(0.0)

    for _ in range(ppo_steps):

        with tf.GradientTape() as actor_tape:
            log_new = actor(states = s, actions = taken_actions)
            # exp of the difference equals exp(new)/exp(old) but avoids dividing two tiny numbers
            ratios = tf.math.exp(log_new - log_old)
            clipped_ratios = tf.clip_by_value(ratios, 1.0-eps, 1.0+eps)
            actor_loss = - tf.reduce_mean(tf.math.minimum(x = ratios*advantages, y = clipped_ratios*advantages))

        gradients = actor_tape.gradient(actor_loss, actor.trainable_variables)
        actor_optimizer.apply_gradients(zip(gradients, actor.trainable_variables))

    with tf.GradientTape() as critic_tape:
        predictions = tf.squeeze(critic(states=s), axis=-1)
        critic_loss = tf.reduce_mean(tf.square(predictions - tf.cast(g, predictions.dtype)))

    gradients = critic_tape.gradient(critic_loss, critic.trainable_variables)
    critic_optimizer.apply_gradients(zip(gradients, critic.trainable_variables))

    return actor_loss, critic_loss

In [None]:
def train_PPO(env, actor, critic, steps, num_trajectories, num_epochs: int, ppo_steps: int, batch_size: int, learning_rate: float, discount_factor: float, eps: float):
    """Function to train actor and critic with PPO and plot the progress

    Every epoch collects fresh trajectories with the actor, performs one PPO
    update on the first shuffled batch and redraws the loss/return curves.
    An evaluation video and both sets of weights are saved every 20th epoch.

    Arguments:
        env (gym): Environment the agent interacts with, closed at the end
        actor (tf.keras.Model): Policy network to be trained
        critic (tf.keras.Model): Value network to be trained
        steps (int): Maximum number of steps per trajectory, None for no limit
        num_trajectories (int): Number of trajectories collected per epoch
        num_epochs (int): Number of training epochs
        ppo_steps (int): Number of actor updates per batch
        batch_size (int): Number of transitions per update
        learning_rate (float): Learning rate of both Adam optimizers
        discount_factor (float): Discount used for the returns and the TD error
        eps (float): Clipping range of the probability ratio
    """

    tf.keras.backend.clear_session()
    actor_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    critic_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    actor_train_losses = []
    critic_train_losses = []
    episode_rewards = []

    for epoch in range(num_epochs):

        actor_running_average = 0
        critic_running_average = 0
        start = time.time()

        dataset, avg_return = create_trajectory(env, actor, discount_factor=discount_factor, return_avg_return=True, steps=steps, num_trajectories=num_trajectories)

        dataset = dataset.cache().shuffle(buffer_size=len(dataset)).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

        episode_rewards.append(avg_return)

        # NOTE(review): only the first shuffled batch of each epoch is trained on - confirm this is intended
        for s, a, r, g, s_prime, _ in dataset.take(1):
            actor_loss, critic_loss = gradient_step(actor=actor, critic=critic, s=s, a=a, r=r, g=g, s_prime=s_prime, ppo_steps=ppo_steps, 
                                 discount_factor=discount_factor, eps=eps, actor_optimizer=actor_optimizer, critic_optimizer=critic_optimizer)
            # Calculate running average to smoothen loss plot
            actor_running_average = 0.95 * actor_running_average + (1 - 0.95) * actor_loss
            critic_running_average = 0.95 * critic_running_average + (1 - 0.95) * critic_loss

        actor_train_losses.append(float(actor_running_average))
        critic_train_losses.append(float(critic_running_average))

        # Create videos of performance and save them
        if epoch%20 == 0:
            create_policy_eval_video(env, actor, f"Videos/PPO_eval_vid_epoch_{epoch}")
            actor.save_weights(f"CarRacingPPOWeights_epoch_{epoch}")
            # Separate path for the critic, otherwise it overwrites the actor checkpoint
            critic.save_weights(f"CarRacingPPOCriticWeights_epoch_{epoch}")

        clear_output()
        print(f"The last epoch took: {timing(start)} seconds")
        print()
        fig1, ax1 = plt.subplots(nrows=1, ncols=2, figsize = (20, 6))
        ax1[0].plot(actor_train_losses, label="actor")
        ax1[0].plot(critic_train_losses, label="critic")
        # epoch is zero-based, so epoch + 1 epochs have been completed at this point
        ax1[0].set(ylabel='Loss', xlabel='Epoch', title=f'Average loss over {epoch + 1} epochs')
        ax1[1].plot(episode_rewards)
        ax1[1].set(ylabel='Return', xlabel='Epoch', title=f'Average return over {epoch + 1} epochs')
        ax1[0].legend()
        plt.show()
        # Release the figure so that figures do not pile up over the epochs
        plt.close(fig1)

    env.close()

In [None]:
# Set hyperparameters
# NOTE: these assignments replace the notebook-level values set for the vanilla policy gradient run
discount_factor = 0.999
batch_size = 128
learning_rate = 0.001
num_epochs = 500
# None disables the per-trajectory step limit in create_trajectory
steps = None
num_trajectories = 3
ppo_steps = 10
# Clipping range of the PPO probability ratio
eps = 0.1

env = gym.make("CarRacing-v1")

actor = CNN(env)
critic = DVN(env)

# Pass the eps hyperparameter itself so that changing it above takes effect
train_PPO(env=env, actor=actor, critic=critic, steps=steps, num_trajectories=num_trajectories, num_epochs=num_epochs, 
          ppo_steps=ppo_steps, batch_size=batch_size, learning_rate=learning_rate, discount_factor=discount_factor, eps=eps)