### References

https://spinningup.openai.com/en/latest/algorithms/ppo.html

https://huggingface.co/learn/deep-rl-course/en/unit8/hands-on-cleanrl

In [2]:
import gymnasium as gym
from gymnasium.wrappers import RecordVideo
import os
import shutil
import torch
import time
from collections import deque
import random
import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt
import seaborn as sns
import torch.nn as nn

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(DEVICE)


cuda


In [4]:
def build_env(name = 'LunarLander-v3', record_name = 'lunar', max_record_steps = int(1e3)):
    """Create a Gymnasium environment, optionally wrapped to record videos.

    Args:
        name: Environment id passed to ``gym.make``.
        record_name: Sub-folder of ``output/`` that receives the videos.
            Pass ``None`` or ``""`` to disable recording.
        max_record_steps: Forwarded to ``RecordVideo`` as ``video_length``.

    Returns:
        The environment, wrapped in ``RecordVideo`` when recording is enabled.
    """
    # rgb_array rendering is what lets RecordVideo capture frames.
    env = gym.make(name, render_mode="rgb_array")

    if not record_name:
        return env

    # Wipe the target folder so videos from earlier runs do not mix with new ones.
    video_dir = os.path.join('output', record_name)
    if os.path.exists(video_dir):
        shutil.rmtree(video_dir)

    return RecordVideo(
        env,
        video_folder=video_dir,
        episode_trigger=lambda episode_id: True,  # record every episode
        name_prefix="training",
        video_length=max_record_steps,
    )

env = build_env()

try:
    # Reset the environment to generate the first observation
    observation, info = env.reset(seed=42)
    print('state =', observation.shape ,' type =', type(observation))

    # Draw a single sample and reuse it for both the shape and the type.
    sample_action = env.action_space.sample()
    print("action shape = ", sample_action.shape, ' type =', type(sample_action))

    for _ in range(1000):
        # Random policy: this is where a trained policy would choose the action.
        action = env.action_space.sample()

        # Advance the environment by one transition.
        observation, reward, terminated, truncated, info = env.step(action)

        # Start a new episode as soon as the current one ends.
        if terminated or truncated:
            observation, info = env.reset()
finally:
    # Close even if the rollout raises or is interrupted.
    env.close()


state = (8,)  type = <class 'numpy.ndarray'>
action shape =  ()  type = <class 'numpy.int64'>


### PPO
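- Improves training stability by limiting how much the policy can change at each update, which avoids destructively large policy updates.
- Smaller updates are more likely to converge to an optimal solution.
- The size of the change is measured by the probability ratio between the current and the former policy: $r_t(\theta) = \dfrac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{old}}(a_t \mid s_t)}$.
- The ratio is clipped to the range $[1 - \epsilon,\ 1 + \epsilon]$.
- Clipped surrogate objective function: $L^{CLIP}(\theta) = \mathbb{E}_t\left[\min\left(r_t(\theta) A_t,\ \text{clip}(r_t(\theta), 1 - \epsilon, 1 + \epsilon) A_t\right)\right]$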

In [31]:
# Agent

class Agent(nn.Module):
    """Actor-critic network with separate MLPs for the policy and the value.

    Only discrete action spaces are supported: the actor outputs one logit
    per action.

    Args:
        state_size: Number of features in one observation.
        action_size: Number of discrete actions.
        hidden_size: Width of both hidden layers in each network.
    """

    def __init__(self, state_size, action_size, hidden_size = 64):

        super().__init__()
        # The first layer must map to hidden_size (not a literal 64), otherwise
        # any non-default hidden_size breaks the following layer.
        self.critic = nn.Sequential(
            nn.Linear(state_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, 1),
        )

        self.actor = nn.Sequential(
            nn.Linear(state_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, action_size),
        )

    def value(self, state):
        """Return the critic's estimate with the trailing size-1 dim squeezed."""
        return self.critic(state).squeeze(-1)

    def action_and_value(self,x, action = None):
        """Sample (or score) an action and evaluate the state.

        Args:
            x: Observation tensor whose last dimension is ``state_size``.
            action: Optional action indices to score. When ``None`` a new
                action is sampled from the current policy.

        Returns:
            Tuple ``(action, log_prob, entropy, value)``.
        """
        # Unnormalised per-action scores; Categorical(logits=...) normalises
        # them into a probability distribution.
        logits = self.actor(x)
        probs = torch.distributions.Categorical(logits=logits)
        if action is None:
            action = probs.sample()

        value = self.value(x)

        # Entropy is returned so the caller can add an entropy bonus to the loss.
        return action, probs.log_prob(action), probs.entropy(), value

# ====================================
# Smoke-test the Agent class on a random batch

env = build_env()

try:
    # The environment is only needed for the observation/action dimensions.
    state_size = env.observation_space.shape[0]
    agent = Agent(state_size=state_size, action_size=env.action_space.n)

    print(agent)
    state = torch.rand(10, state_size)
    print('state shape = ', state.shape)
    # Discrete actions are indices, so build them as an integer tensor.
    actions = torch.ones(10, dtype=torch.long)
    print('actions shape = ', actions.shape)

    action, log_prob, entropy, value =  agent.action_and_value(state, actions)
    print(action.shape)
    print(log_prob.shape)
    print(entropy.shape)
    print(value.shape)

    # Every output must carry exactly one entry per batch element.
    for name, out in (('action', action), ('log_prob', log_prob),
                      ('entropy', entropy), ('value', value)):
        assert out.shape == (10,), f"{name} has unexpected shape {tuple(out.shape)}"
finally:
    env.close()

Agent(
  (critic): Sequential(
    (0): Linear(in_features=8, out_features=64, bias=True)
    (1): Tanh()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): Tanh()
    (4): Linear(in_features=64, out_features=1, bias=True)
  )
  (actor): Sequential(
    (0): Linear(in_features=8, out_features=64, bias=True)
    (1): Tanh()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): Tanh()
    (4): Linear(in_features=64, out_features=4, bias=True)
  )
)
state shape =  torch.Size([10, 8])
actions shape =  torch.Size([10])
torch.Size([10])
torch.Size([10])
torch.Size([10])
torch.Size([10])


In [65]:
# Hyperparameters
lr = 1e-3
gamma = 0.99           # discount factor
gae_lambda = 0.98      # GAE(lambda) bias/variance trade-off
collect_steps = 2048   # environment steps gathered before each policy update
total_episodes = 1000  # number of collect/update iterations (not environment episodes)
clip_coef = 0.2        # epsilon of the clipped surrogate objective
vf_coef = 0.5          # weight of the value loss
ent_coef = 0.01        # weight of the entropy bonus
update_epochs = 4      # passes over each collected rollout
minibatch_size = 64    # samples per gradient step

# debug = True
debug = False


@torch.no_grad()
def compute_gae(rewards, values, dones, next_value, next_done, gamma, gae_lambda):
    """Compute GAE(lambda) advantages and the matching value targets.

    Args:
        rewards: 1-D tensor, reward received after acting at step t.
        values: 1-D tensor, critic estimate for the state at step t.
        dones: 1-D tensor; ``dones[t]`` is 1.0 when the state at step t starts
            a new episode, i.e. the transition at step t-1 ended one.
        next_value: Critic estimate for the state that follows the last step.
        next_done: 1.0 if the last collected transition ended an episode.
        gamma: Discount factor.
        gae_lambda: GAE smoothing factor.

    Returns:
        Tuple ``(advantages, returns)`` with the same shape as ``rewards``.
    """
    horizon = len(rewards)
    advantages = torch.zeros_like(rewards)
    last_gae = 0.0

    for t in reversed(range(horizon)):
        if t == horizon - 1:
            # Whether step T-1 ended an episode is stored in next_done;
            # dones[-1] describes the transition before it.
            next_non_terminal = 1.0 - next_done
            next_val = next_value
        else:
            next_non_terminal = 1.0 - dones[t + 1]
            next_val = values[t + 1]

        delta = rewards[t] + gamma * next_val * next_non_terminal - values[t]
        last_gae = delta + gamma * gae_lambda * next_non_terminal * last_gae
        advantages[t] = last_gae

    return advantages, advantages + values


# init policy theta param, init value function param
env = build_env()

try:
    state, info = env.reset()
    # Plain float so every entry of `dones` has the same (scalar) shape.
    done = 0.0

    agent = Agent(state_size=state.shape[0], action_size=env.action_space.n)
    optim = torch.optim.Adam(agent.parameters(), lr=lr)

    loop = tqdm(range(total_episodes))

    avg_rewards = deque(maxlen=100)
    current_rewards = 0

    # for k = 0, 1, 2 ... M do
    for k in loop:
        states, actions, log_probs, values = [], [], [], []
        rewards, dones = [], []

        # ---- collect trajectory D by running policy pi in the environment ----
        for step in range(collect_steps if not debug else 100):
            state_t = torch.as_tensor(state, dtype=torch.float32)

            # Rollout statistics are targets for the update, so keep them grad-free.
            with torch.no_grad():
                action, log_prob, _, value = agent.action_and_value(state_t)

            states.append(state_t)
            dones.append(done)
            actions.append(action)
            log_probs.append(log_prob)
            values.append(value)

            # .item() converts the 0-dim action tensor to the int the env expects
            state, reward, terminated, truncated, info = env.step(action.item())
            current_rewards += reward

            # Truncation is treated like termination (no bootstrapping past it).
            done = float(terminated or truncated)
            rewards.append(float(reward))

            if done:
                # collect stats
                avg_rewards.append(current_rewards)
                loop.set_description(f"Episode reward: {current_rewards}, avg reward: {np.mean(avg_rewards)}")
                current_rewards = 0
                state, info = env.reset()

        # Convert collected lists to tensors
        states = torch.stack(states)
        actions = torch.stack(actions)
        log_probs = torch.stack(log_probs)
        values = torch.stack(values)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        dones = torch.tensor(dones, dtype=torch.float32)

        # ---- compute rewards-to-go (returns) and advantages ----
        with torch.no_grad():
            next_value = agent.value(torch.as_tensor(state, dtype=torch.float32))
        advantages, returns = compute_gae(
            rewards, values, dones, next_value, done, gamma, gae_lambda
        )

        # Normalize advantages once over the whole rollout.
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        # ---- update the policy and value function ----
        # Several passes over shuffled minibatches: after the first gradient
        # step the ratio leaves 1 and the clipping starts to matter.
        batch_size = len(rewards)
        for epoch in range(update_epochs):
            permutation = torch.randperm(batch_size)
            for start in range(0, batch_size, minibatch_size):
                idx = permutation[start:start + minibatch_size]

                _, new_log_prob, entropy, new_value = agent.action_and_value(states[idx], actions[idx])

                log_ratio = new_log_prob - log_probs[idx]
                ratio = log_ratio.exp()

                # PPO clipped objective. The surrogate is maximized, so its
                # negative is minimized; with negated terms the pessimistic
                # bound is the element-wise maximum, not the minimum.
                mb_advantages = advantages[idx]
                pg_loss1 = -mb_advantages * ratio
                pg_loss2 = -mb_advantages * ratio.clamp(1 - clip_coef, 1 + clip_coef)
                pg_loss = torch.max(pg_loss1, pg_loss2).mean()

                # Value function loss
                value_loss = (returns[idx] - new_value.view(-1)).pow(2).mean()

                entropy_mean = entropy.mean()

                # Combined loss: policy loss + weighted value loss - entropy bonus
                loss = pg_loss + vf_coef * value_loss - ent_coef * entropy_mean

                optim.zero_grad()
                loss.backward()
                optim.step()

        if debug:
            # One short iteration is enough to inspect shapes.
            print('states = ', states.shape)
            print('dones = ', dones.shape)
            print('actions = ', actions.shape)
            print('rewards = ', rewards.shape)
            print('log_probs = ', log_probs.shape)
            print('values = ', values.shape)
            print('advantages ', advantages.shape)
            print('returns ', returns.shape)
            print('loss = ', loss.item())
            break
finally:
    # Clean up even when training is interrupted (e.g. KeyboardInterrupt).
    env.close()


  0%|          | 0/1000 [00:00<?, ?it/s]

Episode reward: -158.32285640307734, avg reward: -159.26805912297067:   1%|▏         | 14/1000 [01:14<1:26:53,  5.29s/it]


KeyboardInterrupt: 