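### References

- OpenAI Spinning Up, Proximal Policy Optimization: https://spinningup.openai.com/en/latest/algorithms/ppo.html

- Hugging Face Deep RL Course, Unit 8 (hands-on with CleanRL): https://huggingface.co/learn/deep-rl-course/en/unit8/hands-on-cleanrl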

In [3]:
import gymnasium as gym
from gymnasium.wrappers import RecordVideo
import os
import shutil
import torch
import time
from collections import deque
import random
import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt
import seaborn as sns
import torch.nn as nn

In [4]:
# Select the compute device: 'cuda' when PyTorch reports CUDA as available,
# otherwise fall back to 'cpu'.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(DEVICE)


cuda


In [5]:
def build_env(name = 'LunarLander-v3', record_name = 'lunar', max_record_steps = int(1e3)):
    """Create a Gymnasium environment, optionally wrapped to record videos.

    Args:
        name: Environment id passed to ``gym.make``.
        record_name: Sub-folder of ``output/`` that receives the videos.
            Pass ``None`` or ``""`` to disable recording.
        max_record_steps: Maximum number of steps to record per episode;
            forwarded to ``RecordVideo`` as ``video_length``.

    Returns:
        The environment created by ``gym.make``, wrapped in ``RecordVideo``
        when recording is enabled.

    Note:
        When recording is enabled, an existing ``output/<record_name>``
        folder is deleted first, so recordings from an earlier call with the
        same ``record_name`` are lost.
    """
    # Render to RGB arrays so frames are available for video recording.
    env = gym.make(name, render_mode="rgb_array")

    # Truthiness covers both None and "" (replaces the `!= None` comparison).
    if not record_name:
        return env

    # Start from an empty video folder on every call.
    video_dir = os.path.join('output', record_name)
    if os.path.exists(video_dir):
        shutil.rmtree(video_dir)

    return RecordVideo(
        env,
        video_folder=video_dir,
        episode_trigger=lambda episode_id: True,  # Record every episode
        name_prefix="training",
        video_length=max_record_steps,  # Maximum number of steps to record per episode
    )

env = build_env()

try:
    # Reset the environment (seeded) to generate the first observation
    observation, info = env.reset(seed=42)
    print('state =', observation.shape ,' type =', type(observation))

    # Draw a single sample and report both its shape and its type.
    sample_action = env.action_space.sample()
    print("action shape = ", sample_action.shape, ' type =', type(sample_action))

    for _ in range(1000):
        # Placeholder policy: a uniformly sampled random action.
        action = env.action_space.sample()

        # Step (transition) through the environment with the action, receiving
        # the next observation, the reward, and the terminated/truncated flags.
        observation, reward, terminated, truncated, info = env.step(action)

        # If the episode has ended, reset to start a new one.
        if terminated or truncated:
            observation, info = env.reset()
finally:
    # Close the environment even if the loop above raises or is interrupted.
    env.close()


state = (8,)  type = <class 'numpy.ndarray'>
action shape =  ()  type = <class 'numpy.int64'>


### PPO
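- PPO improves training stability by limiting how much the policy can change at each update, which avoids destructively large policy updates.
- Smaller, more conservative updates make it more likely that training converges to a good solution.
- The size of a policy change is measured by the probability ratio between the current and the previous policy: $r_t(\theta) = \dfrac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_\text{old}}(a_t \mid s_t)}$.
- The ratio is clipped to the range $[1 - \epsilon,\ 1 + \epsilon]$.
- This gives the clipped surrogate objective: $L^{CLIP}(\theta) = \mathbb{E}_t\left[\min\left(r_t(\theta)\,\hat{A}_t,\ \operatorname{clip}\left(r_t(\theta),\, 1 - \epsilon,\, 1 + \epsilon\right)\hat{A}_t\right)\right]$.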

In [None]:
# Agent

class Agent(nn.Module):
    """Actor and critic MLPs for a discrete action space.

    Both networks are separate two-hidden-layer Tanh MLPs over the same input:
    the critic outputs one value per state, the actor outputs one logit per
    action.
    """

    def __init__(self, state_size, action_size, hidden_size = 64):
        """Build the actor and critic networks.

        Args:
            state_size: Number of features in an observation.
            action_size: Number of discrete actions (actor output width).
            hidden_size: Width of both hidden layers in each network.
        """
        super().__init__()
        # The first layer's output width follows hidden_size. It used to be
        # hard-coded to 64, which broke the forward pass for any other
        # hidden_size (shape mismatch with the second layer).
        self.critic = nn.Sequential(
            nn.Linear(state_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, 1),
        )

        self.actor = nn.Sequential(
            nn.Linear(state_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, action_size),
        )

    def value(self, state):
        """Return the critic's output for ``state`` (last dimension has size 1)."""
        return self.critic(state)

    def action_and_value(self,x, action = None):
        """Sample (or score) an action and evaluate the state.

        Only discrete action spaces are supported.

        Args:
            x: Observation tensor whose last dimension is ``state_size``.
            action: Optional action(s) to score. When ``None``, an action is
                sampled from the current policy.

        Returns:
            Tuple ``(action, log_prob, entropy, value)``: the action used, its
            log-probability under the current policy, the entropy of the
            action distribution, and the critic's output for ``x``.
        """
        # The actor's outputs are unnormalised scores (logits); Categorical
        # turns them into a probability distribution over the actions.
        logits = self.actor(x)
        probs = torch.distributions.Categorical(logits=logits)
        if action is None:
            action = probs.sample()

        value = self.value(x)

        # Entropy is returned for the caller; in PPO it is typically added to
        # the loss as a bonus that encourages exploration. How it is weighted
        # is up to the training loop.
        return action, probs.log_prob(action), probs.entropy(), value

# ====================================
# Test the agent class

env = build_env()

try:
    state, info = env.reset()
    print('state shape = ', state.shape)

    agent = Agent(state_size=env.observation_space.shape[0], action_size=env.action_space.n)
    print(agent)

    # One forward pass on a single (unbatched) observation. The random action
    # that used to be sampled here was never used, so it has been dropped.
    action, log_prob, entropy, value = agent.action_and_value(torch.tensor(state).float())
    print(action)
    print(log_prob)
    print(entropy)
    print(value)
finally:
    # Close the environment even if the checks above raise.
    env.close()

state shape =  (8,)
Agent(
  (critic): Sequential(
    (0): Linear(in_features=8, out_features=64, bias=True)
    (1): Tanh()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): Tanh()
    (4): Linear(in_features=64, out_features=1, bias=True)
  )
  (actor): Sequential(
    (0): Linear(in_features=8, out_features=64, bias=True)
    (1): Tanh()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): Tanh()
    (4): Linear(in_features=64, out_features=4, bias=True)
  )
)
tensor(1)
tensor(-1.2665, grad_fn=<SqueezeBackward1>)
tensor(1.3814, grad_fn=<NegBackward0>)
tensor([0.1223], grad_fn=<ViewBackward0>)


In [None]:
# Hyperparameters

lr = 1e-3

# init policy theta param, init value function param

env = build_env()

try:
    state, info = env.reset()

    agent = Agent(state_size=state.shape[0], action_size=env.action_space.n)
    optim = torch.optim.Adam(agent.parameters(), lr=lr)

    loop = tqdm(range(1000))

    # Named `iteration` rather than `iter`: the old name shadowed the builtin
    # iter() for every cell run afterwards in the same kernel.
    for iteration in loop:

        # for k = 0, 1, 2 ... M do
        for k in range(2048):

            # collect trajectory D by running policy pi in the environment
            # NOTE(review): work in progress. These buffers are re-created on
            # every pass of the k loop and nothing steps the environment yet;
            # decide whether k indexes PPO iterations or rollout steps before
            # filling this in.
            states = [state]
            actions = []
            rewards = []
            log_probs = []
            values = []

        # TODO: compute rewards to go (Gt)

        # TODO: compute advantage estimates A_t

        # TODO: update policy by maximizing the PPO-clip objective with Adam

        # TODO: fit value function by regression on mean squared error with gradient descent
finally:
    # Clean up: close the environment even if training raises or is interrupted.
    env.close()



