In [None]:
import gym
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from IPython.display import clear_output
import matplotlib.pyplot as plt


# Enable memory growth on every GPU TensorFlow can see, so that memory is
# requested as needed rather than configured up front. When no GPU is listed
# the loop body never runs.
physical_devices = tf.config.list_physical_devices('GPU') 
for device in physical_devices:
    tf.config.experimental.set_memory_growth(device, True)
    

# Proximal Policy Optimization

PPO is a policy-gradient actor-critic algorithm. The policy model, the **actor** network, produces a stochastic policy: it maps the state to a probability distribution over the set of possible actions. The **critic** network approximates the value function, from which the advantage is calculated:

$$
A_\Phi (s_t, a_t) = q_\Phi (s_t,a_t) - v_\Phi (s_t) = R_t + \gamma v_{\Phi'} (s_{t+1}) - v_\Phi (s_t)
$$

The critic $v_\Phi$ is trained in the same manner as the DQN model and the critic of DDPG: with TD-learning and a "frozen", periodically updated target critic network $v_{\Phi'}$. Instead of approximating a q-value, it approximates the state value.

To train the actor, PPO uses the ratio of two policies:
- a current policy $\pi_\Theta$, that is learned currently
- a baseline policy $\pi_{\Theta'}$, an earlier version of the policy

$$
r^t (\Theta)=r_\Theta (s_t,a_t) = \frac{\pi_\Theta (a_t | s_t)}{\pi_{\Theta'} (a_t | s_t)}
$$

It is the ratio of the probability of selecting $a_t$ under $\pi_\Theta$ to the probability of selecting the same action under $\pi_{\Theta'}$.

When multiplied with the approximated advantage, calculated using the critic network, it can be used as the objective function (maximized with SGA; equivalently, its negative is minimized as a loss):

$$
loss_{actor} = - r_\Theta (s_t, a_t) A_\Phi (s_t, a_t)
$$

as when
- the advantage is positive, meaning, that selecting the action would increase the value, the probability of selecting this action would increase
- the advantage is negative, meaning, that selecting the action would decrease the value, the probability of selecting this action would decrease

Instead of using this directly as loss function, to stabilize the implementation by adjusting the policy optimization step size, the loss is extended in a pessimistic way:

$$
loss_{actor} = - \min [r_\Theta (s_t, a_t) A_\Phi (s_t, a_t), \operatorname{clip}(r_\Theta (s_t, a_t), 1-\epsilon, 1+\epsilon) A_\Phi (s_t, a_t)]
$$

PPO uses 2 main models. The actor network learns the stochastic policy. It maps the state to a probability distribution over the set of possible actions. The critic network learns the value function. It maps the state to a scalar.

The critic $v_\Phi$ is trained in the same manner as the DQN model and the critic of DDPG: with TD-learning and a "frozen", periodically updated target critic network $v_{\Phi'}$. Instead of approximating a q-value, it approximates the state value.

To train the actor, PPO uses the ratio of two policies:
- a current policy $\pi_\Theta$, that is learned currently
- a baseline policy $\pi_{\Theta'}$, an earlier version of the policy

$$
r^t (\Theta)=r_\Theta (s_t,a_t) = \frac{\pi_\Theta (a_t | s_t)}{\pi_{\Theta'} (a_t | s_t)}
$$

It is the ratio of the probability of selecting $a_t$ under $\pi_\Theta$ to the probability of selecting the same action under $\pi_{\Theta'}$.

When multiplied with the approximated advantage, calculated using the critic network, it can be used as the objective function (maximized with SGA; equivalently, its negative is minimized as a loss):

$$
loss_{actor} = - r_\Theta (s_t, a_t) A_\Phi (s_t, a_t)
$$

as when
- the advantage is positive, meaning, that selecting the action would increase the value, the probability of selecting this action would increase
- the advantage is negative, meaning, that selecting the action would decrease the value, the probability of selecting this action would decrease

Instead of using this directly as loss function, to stabilize the implementation by adjusting the policy optimization step size, the loss is extended in a pessimistic way:

$$
loss_{actor} = - \min [r_\Theta (s_t, a_t) A_\Phi (s_t, a_t), \operatorname{clip}(r_\Theta (s_t, a_t), 1-\epsilon, 1+\epsilon) A_\Phi (s_t, a_t)]
$$

In [None]:
class Actor(tf.keras.Model):
    """Policy network: a stack of ReLU dense layers topped by a softmax head.

    Args:
        units: Width of each hidden dense layer, in order.
        n_actions: Number of outputs of the final softmax layer.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, units=(400, 300), n_actions=2, **kwargs):
        super().__init__(**kwargs)
        hidden_stack = [
            tf.keras.layers.Dense(width, activation='relu') for width in units
        ]
        policy_head = tf.keras.layers.Dense(n_actions, activation='softmax')
        self._layers = hidden_stack + [policy_head]

    def call(self, inputs, **kwargs):
        """Feed ``inputs`` through every layer in order and return the softmax output."""
        x = inputs
        for layer in self._layers:
            x = layer(x)
        return x
    
class Critic(tf.keras.Model):
    """Value network: a stack of ReLU dense layers topped by a single linear unit.

    Args:
        units: Width of each hidden dense layer, in order.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, units=(400, 300), **kwargs):
        super().__init__(**kwargs)
        hidden_stack = [
            tf.keras.layers.Dense(width, activation='relu') for width in units
        ]
        value_head = tf.keras.layers.Dense(1)
        self._layers = hidden_stack + [value_head]

    def call(self, inputs, **kwargs):
        """Feed ``inputs`` through every layer in order and return the scalar head output."""
        x = inputs
        for layer in self._layers:
            x = layer(x)
        return x
    
class PPOAgent:
    """PPO agent for a discrete action space, built from an actor and a critic.

    Four networks are kept: the trained ``actor`` and ``critic``, plus frozen
    copies (``actor_old`` and ``target_critic``) that are only refreshed by
    ``update_frozen_nets``.

    Args:
        action_space: Object exposing ``n``, the number of discrete actions.
        observation_shape: Object exposing ``shape``, the shape of a single
            observation (the notebook passes ``env.observation_space``).
        gamma: Discount factor used in the TD target.
        epsilon: Clipping range of the probability ratio (``1 +/- epsilon``).
    """

    def __init__(self, action_space, observation_shape, gamma=0.99, epsilon = 0.1):
        self.gamma = gamma
        self.epsilon = epsilon
        # Remember the action count so that `act` does not have to reach for
        # a notebook-level `env` variable.
        self.n_actions = action_space.n
        self.actor = Actor(n_actions=self.n_actions)
        self.actor_old = Actor(n_actions=self.n_actions)
        self.critic = Critic()
        self.target_critic = Critic()

        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=0.00025)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
        self._init_networks(observation_shape)

    def _init_networks(self, observation_shape):
        """Build all four networks with a dummy batch, then sync the frozen copies."""
        # Calling each model once on a zero batch of one observation creates
        # its weights, which `update_frozen_nets` needs in order to copy them.
        initializer = np.zeros([1, *observation_shape.shape])
        self.actor(initializer)
        self.actor_old(initializer)

        self.critic(initializer)
        self.target_critic(initializer)

        self.update_frozen_nets()

    def act(self, observation):
        """Sample an action index from the current policy.

        Args:
            observation: Batch holding a single observation (callers in this
                notebook pass ``np.array([obs])``).

        Returns:
            The sampled action index.
        """
        probs = self.actor(observation).numpy()
        probs = np.squeeze(probs)
        action = np.random.choice(self.n_actions, p=probs)
        return action

    def get_critic_grads(self, states, rewards, next_states, dones):
        """Compute the critic's TD loss, its gradients and the advantage.

        Args:
            states: Batch of observations.
            rewards: Batch of rewards, one column per sample.
            next_states: Batch of successor observations.
            dones: Batch of end-of-episode flags (bool or 0/1), one column per
                sample; a set flag disables bootstrapping for that sample.

        Returns:
            Tuple ``(gradients, loss, advantage)`` where ``advantage`` is the
            TD error ``r + gamma * v_target(s') - v(s)`` per sample.
        """
        with tf.GradientTape() as tape:
            next_value = self.target_critic(next_states)
            # Cast explicitly instead of relying on implicit NumPy/TensorFlow
            # conversion of float64 rewards and boolean done flags.
            rewards = tf.cast(rewards, next_value.dtype)
            not_done = 1.0 - tf.cast(dones, next_value.dtype)
            q_value = rewards + not_done * self.gamma * next_value
            value = self.critic(states)

            advantage = q_value - value
            loss = tf.reduce_mean(tf.square(advantage))
        gradients = tape.gradient(loss, self.critic.trainable_variables)
        return gradients, loss, advantage

    def get_actor_grads(self, states, actions, advantage):
        """Compute the clipped-surrogate actor loss and its gradients.

        Args:
            states: Batch of observations.
            actions: Integer action indices, one column per sample.
            advantage: Per-sample advantage, one column per sample.

        Returns:
            Tuple ``(gradients, loss)``.
        """
        with tf.GradientTape() as tape:
            # batch_dims=1 picks, for every row b, the probability of the
            # action taken in that same row. Without it tf.gather pairs every
            # state with every sampled action and returns a [B, B, 1] tensor.
            p_current = tf.gather(self.actor(states), actions, axis=1, batch_dims=1)
            p_old = tf.gather(self.actor_old(states), actions, axis=1, batch_dims=1)
            ratio = p_current / p_old
            clip_ratio = tf.clip_by_value(ratio, 1-self.epsilon, 1+self.epsilon)
            # standardize advantage
            advantage = (advantage - tf.reduce_mean(advantage)) / (tf.keras.backend.std(advantage) + 1e-8)
            objective = ratio * advantage
            clip_objective = clip_ratio * advantage
            # Pessimistic bound: element-wise minimum of the raw and the
            # clipped objective, negated because the optimizer minimizes.
            loss = -tf.reduce_mean(tf.minimum(objective, clip_objective))
        gradients = tape.gradient(loss, self.actor.trainable_variables)
        return gradients, loss

    def learn(self, states, actions, rewards, next_states, dones):
        """Run one gradient step on the critic and one on the actor.

        Both gradient sets are computed before either optimizer is applied, so
        the actor update uses the advantage of the not-yet-updated critic.

        Returns:
            Tuple ``(actor_loss, critic_loss)``.
        """
        critic_grads, critic_loss, advantage = self.get_critic_grads(states, rewards, next_states, dones)
        actor_grads, actor_loss = self.get_actor_grads(states, actions, advantage)

        self.critic_optimizer.apply_gradients(zip(critic_grads, self.critic.trainable_variables))
        self.actor_optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_variables))
        return actor_loss, critic_loss

    def update_frozen_nets(self):
        """Copy the trained weights into ``actor_old`` and ``target_critic``."""
        # TODO: set discount factor
        # NOTE(review): the TODO above presumably refers to a soft-update
        # factor; the copy below is a full overwrite — confirm intent.
        weights = self.actor.get_weights()
        self.actor_old.set_weights(weights)

        weights = self.critic.get_weights()
        self.target_critic.set_weights(weights)

## Training

PPO is an on-policy method. We always run complete episodes, record the trajectories and the rewards, and then use these to update our networks.

In [None]:
def compute_avg_return(env, agent, num_episodes=1, max_steps=200, render=False):
    """Run evaluation episodes and return the mean undiscounted return.

    Args:
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns
            ``(obs, reward, terminated, truncated, info)``.
        agent: Object with an ``act(observation_batch)`` method.
        num_episodes: Number of episodes to average over; must be at least 1.
        max_steps: Upper bound on the number of steps taken per episode.
        render: If true, show ``env.render()`` with matplotlib before each step.

    Returns:
        Sum of the episode returns divided by ``num_episodes``.

    Raises:
        ValueError: If ``num_episodes`` is smaller than 1.
    """
    if num_episodes < 1:
        raise ValueError(f"num_episodes must be at least 1, got {num_episodes}.")

    total_return = 0.0
    for _ in range(num_episodes):
        obs, _ = env.reset()
        episode_return = 0.0
        done = False
        steps = 0
        # `steps < max_steps` caps the episode at exactly max_steps steps.
        while not done and steps < max_steps:
            if render:
                clear_output(wait=True)
                plt.axis('off')
                plt.imshow(env.render())
                plt.show()
            action = agent.act(np.array([obs]))
            obs, r, terminated, truncated, _ = env.step(action)
            # An episode is over when it terminates or when it is truncated.
            done = terminated or truncated
            episode_return += r
            steps += 1
        total_return += episode_return
    return total_return / num_episodes

In [None]:
# Create the CartPole environment; 'rgb_array' makes env.render() return a
# frame, which compute_avg_return displays with matplotlib.
env = gym.make('CartPole-v1', render_mode='rgb_array')

# Build the agent from the environment's action and observation spaces.
agent = PPOAgent(env.action_space, env.observation_space)

In [None]:
# Training hyperparameters
n_rollouts = 5    # episodes collected per epoch before the networks are updated
batch_size = 8    # number of transitions per gradient step
learn_steps = 16  # passes over the collected transitions per epoch

for i in range(150):
    states = []
    rewards = []
    actions = []
    next_states = []
    dones = []

    for _ in range(n_rollouts):
        # Every rollout starts a fresh episode. Resetting here (rather than
        # once per epoch) is what makes all n_rollouts episodes get collected.
        obs, _ = env.reset()
        done = False
        while not done:
            # No env.render() here: the frame would be computed and thrown away.
            action = agent.act(np.array([obs]))
            new_obs, r, terminated, truncated, _ = env.step(action)

            states.append(obs)
            rewards.append([r])
            actions.append([action])
            next_states.append(new_obs)
            # Only a true termination stops bootstrapping in the TD target;
            # a time-limit truncation ends the rollout but the successor
            # state still has value.
            dones.append([terminated])

            obs = new_obs
            done = terminated or truncated
    states, actions, rewards, next_states, dones = map(np.array, [states, actions, rewards, next_states, dones])

    for _ in range(learn_steps):
        # Reshuffle the transitions on every pass so minibatches differ.
        indices = np.arange(states.shape[0])
        np.random.shuffle(indices)

        shuffled_states = states[indices]
        shuffled_actions = actions[indices]
        shuffled_rewards = rewards[indices]
        shuffled_next_states = next_states[indices]
        shuffled_dones = dones[indices]
        for j in range(0, states.shape[0], batch_size):
            states_batch = shuffled_states[j:j + batch_size]
            actions_batch = shuffled_actions[j:j + batch_size]
            rewards_batch = shuffled_rewards[j:j + batch_size]
            next_states_batch = shuffled_next_states[j:j + batch_size]
            dones_batch = shuffled_dones[j:j + batch_size]
            actor_loss, critic_loss = agent.learn(states_batch,
                                                  actions_batch,
                                                  rewards_batch,
                                                  next_states_batch,
                                                  dones_batch)
    # The frozen copies are refreshed once per epoch, after all updates.
    agent.update_frozen_nets()

    if (i + 1) % 10 == 0:
        avg_return = compute_avg_return(env, agent, num_episodes=2)
        print(f'epoch {i + 1}, actor loss {actor_loss}, critic loss {critic_loss}, avg_return {avg_return}')
    

# Storing and loading models

In [None]:
import os

def store_models(agent, path='./model'):
    """Save the agent's actor and critic weights below ``path``.

    The directory is created when missing. The weights are written under the
    prefixes ``<path>/actor`` and ``<path>/critic``.

    Args:
        agent: Object whose ``actor`` and ``critic`` provide ``save_weights``.
        path: Target directory.
    """
    os.makedirs(path, exist_ok=True)
    agent.actor.save_weights(f'{path}/actor')
    agent.critic.save_weights(f'{path}/critic')


def load_models(agent, path='./model'):
    """Restore actor and critic weights from ``path`` and sync the frozen nets.

    Both weight prefixes are verified before anything is loaded, so a missing
    file leaves the agent untouched.

    Args:
        agent: Object whose ``actor`` and ``critic`` provide ``load_weights``
            and which provides ``update_frozen_nets``.
        path: Directory holding the ``actor`` and ``critic`` weight prefixes.

    Raises:
        FileNotFoundError: If ``<prefix>.index`` is missing for the actor or
            the critic (the actor is checked first).
    """
    # NOTE(review): the '.index' suffix presumably matches the files that
    # save_weights writes for a prefix without extension — confirm.
    index_suffix = '.index'
    actor_path = f'{path}/actor'
    critic_path = f'{path}/critic'

    actor_present = os.path.exists(actor_path + index_suffix)
    if not actor_present:
        raise FileNotFoundError(f"Actor model not found at {actor_path}.")

    critic_present = os.path.exists(critic_path + index_suffix)
    if not critic_present:
        raise FileNotFoundError(f"Critic model not found at {critic_path}.")

    agent.actor.load_weights(actor_path)
    agent.critic.load_weights(critic_path)
    # Bring the frozen copies in line with the freshly loaded weights.
    agent.update_frozen_nets()

In [None]:
# Persist the trained actor and critic weights to the default './model' directory.
store_models(agent)

In [None]:
# To restore a trained agent, first build a fresh agent with the same
# action and observation spaces ...
agent = PPOAgent(env.action_space, env.observation_space)
# ... then overwrite its weights with the stored ones (default './model').
load_models(agent)

In [None]:
# Play two rendered evaluation episodes with the restored agent, then close
# the environment.
compute_avg_return(env, agent, num_episodes=2, render=True)
env.close()