<a href="https://colab.research.google.com/github/kimsooyoung/rl_oc_python/blob/main/rl_oc_python%20/oc_lec3_actor_critic/PPO_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Requirements

In [None]:
# System packages for the notebook environment: xvfb (virtual display),
# python-opengl and ffmpeg (video encoding). Output is discarded on purpose.
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
# swig is installed first; presumably it is needed to build box2d-py from
# source when installing gymnasium[box2d] below (verify if the build fails).
!pip install swig
# renderlab provides the RenderFrame wrapper used by the render/play cells.
!pip install renderlab
!pip install gymnasium
!pip install gymnasium[box2d]

Collecting swig
  Downloading swig-4.2.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.2.1
Collecting renderlab
  Downloading renderlab-0.1.20230421184216-py3-none-any.whl (4.0 kB)
Collecting gymnasium (from renderlab)
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium->renderlab)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium, renderlab
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1 renderlab-0.1.20230421184216
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (

### Import the Necessary Packages

In [None]:
import gymnasium as gym
import collections
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

## Render Test

In [None]:
import renderlab as rl

# Smoke test: run one CartPole episode with random actions, record it to
# ./output through renderlab, and replay the video inline.
env = gym.make("CartPole-v1", render_mode = "rgb_array")
env = rl.RenderFrame(env, "./output")

observation, info = env.reset()
score = 0

while True:
  action = env.action_space.sample()
  observation, reward, terminated, truncated, info = env.step(action)
  score += reward

  # An episode ends either because the pole fell (terminated) or because the
  # time limit was hit (truncated); stepping past either one is invalid.
  if terminated or truncated:
    print("Score : ", score)
    break

env.play()

## Initialize Hyper Params

In [None]:
#Hyperparameters
learning_rate = 0.0002  # Adam step size used by PPO.__init__
gamma         = 0.98    # discount factor for the TD target
n_rollout     = 10      # kept for compatibility; not referenced by the cells shown in this notebook
lmbda         = 0.95    # GAE decay factor read by PPO.train_net
eps_clip      = 0.1     # clipping range of the probability ratio (1 - eps, 1 + eps)
K_epoch       = 3       # number of optimisation passes over each collected rollout

## Define Actor-Critic Class

- input: a length-4 state tensor
- Actor layer structure: Linear(4, 256) > ReLU > Linear(256, 2) > softmax
- Critic layer structure: Linear(4, 256) > ReLU > Linear(256, 1) (the first layer is shared with the actor)

- `put_data` method: append one transition ($S, A, R, S', \pi(a|s), Done$) to the rollout buffer.
- `make_batch` method: convert the buffered transitions into torch tensors and clear the buffer.
- `train_net` method: optimize the network with the clipped PPO surrogate loss plus a TD value loss.

$for \ 2-3 \ epochs$

$\quad \quad R = \frac{P_{\theta_{new}}(a|s)}{P_{\theta_{old}}(a|s)}$

$\quad \quad S_1 = R \cdot A, \quad (A = advantage)$

$\quad \quad S_2 = clamp(R, 1-\epsilon, 1+\epsilon) \cdot A$

$\quad \quad loss = -min(S_1, S_2) + E[(r + \gamma V(s') - V(s))^2], \quad (r = reward, \ \gamma = discount)$

$end$


In [None]:
class PPO(nn.Module):
  """Actor-critic network trained with the clipped PPO surrogate objective.

  The actor head and the critic head share the first linear layer.
  Transitions are buffered with `put_data` and consumed by `train_net`.

  Reads the module-level hyperparameters `learning_rate`, `gamma`, `lmbda`,
  `eps_clip` and `K_epoch`; they must be defined before the class is used.
  """

  def __init__(self):
    super(PPO, self).__init__()
    # Rollout buffer: transitions collected since the last update.
    self.data = []

    self.fc1 = nn.Linear(4, 256)     # shared feature layer
    self.fc_pi = nn.Linear(256, 2)   # actor head: one logit per action
    self.fc_v = nn.Linear(256, 1)    # critic head: scalar state value

    self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

  def pi(self, state, softmax_dim=0):
    """Return action probabilities for `state`.

    Use `softmax_dim=0` for a single 1-D state and `softmax_dim=1` for a
    batch of states stacked along dim 0, so that the softmax always runs
    over the action dimension.
    """
    hidden = F.relu(self.fc1(state))
    return F.softmax(self.fc_pi(hidden), dim=softmax_dim)

  def v(self, state):
    """Return the critic's value estimate for `state`."""
    hidden = F.relu(self.fc1(state))
    return self.fc_v(hidden)

  def put_data(self, transition):
    """Append one `(s, a, r, sp, prob_a, done)` transition to the buffer."""
    self.data.append(transition)

  def make_batch(self):
    """Convert the buffered transitions to tensors and clear the buffer.

    Returns:
      `(s, a, r, sp, prob_a, mask)` where every tensor has one row per
      transition. Rewards are scaled by 1/100, and `mask` is 0.0 for
      transitions whose `done` flag is set and 1.0 otherwise.
    """
    s_list, a_list, r_list, sp_list, pa_list, mask_list = [], [], [], [], [], []

    for s, a, r, sp, pa, done in self.data:
      # Convert each state individually and stack afterwards; this avoids
      # building a tensor from a Python list of arrays element by element.
      s_list.append(torch.as_tensor(s, dtype=torch.float))
      a_list.append([a])
      r_list.append([r / 100.0])
      sp_list.append(torch.as_tensor(sp, dtype=torch.float))
      pa_list.append([pa])
      mask_list.append([0.0 if done else 1.0])

    s_batch = torch.stack(s_list)
    a_batch = torch.tensor(a_list, dtype=torch.long)  # gather() needs int64 indices
    r_batch = torch.tensor(r_list, dtype=torch.float)
    sp_batch = torch.stack(sp_list)
    pa_batch = torch.tensor(pa_list, dtype=torch.float)
    mask_batch = torch.tensor(mask_list, dtype=torch.float)

    self.data = []

    return s_batch, a_batch, r_batch, sp_batch, pa_batch, mask_batch

  @staticmethod
  def _gae(delta, mask, discount):
    """Accumulate TD errors backwards into generalized advantage estimates.

    Computes `A_t = delta_t + discount * mask_t * A_{t+1}` with `A_T = 0`,
    so the running sum is cut wherever `mask_t` is 0.

    Args:
      delta: (N, 1) tensor of TD errors in time order.
      mask: (N, 1) tensor, 0.0 where the transition ended the episode.
      discount: scalar multiplier applied per step (gamma * lambda).

    Returns:
      (N, 1) float tensor of advantages in time order.
    """
    deltas = delta.flatten().tolist()
    masks = mask.flatten().tolist()

    running = 0.0
    advantages = []
    # Walk both sequences backwards together so each mask entry is paired
    # with the TD error of the same time step.
    for step_delta, step_mask in zip(reversed(deltas), reversed(masks)):
      running = step_delta + discount * step_mask * running
      advantages.append([running])
    advantages.reverse()
    return torch.tensor(advantages, dtype=torch.float)

  def train_net(self):
    """Run `K_epoch` PPO updates on the buffered rollout, then drop it.

    Does nothing when the buffer is empty.
    """
    if not self.data:
      return

    s, a, r, sp, prob_a, mask = self.make_batch()

    for _ in range(K_epoch):
      td_target = r + gamma * self.v(sp) * mask
      delta = (td_target - self.v(s)).detach()
      advantage = self._gae(delta, mask, gamma * lmbda)

      # `s` is a batch, so the softmax must run over dim 1 (the actions).
      pi_new = self.pi(s, softmax_dim=1).gather(1, a)
      ratio = torch.exp(torch.log(pi_new) - torch.log(prob_a))

      sur1 = ratio * advantage
      sur2 = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * advantage
      loss = -torch.min(sur1, sur2) + F.smooth_l1_loss(self.v(s), td_target.detach())

      self.optimizer.zero_grad()
      loss.mean().backward()
      self.optimizer.step()

## Main loop

- create environments, PPO Model, print_interval, and reset score value
- for each training episode
  - reset environment
  - until the episode ends
    - roll out up to $T_{horizon}$ steps
      - obtain policy action probability and actual action value
      - step environment
      - gather transitions $(S, A, R, S', \Pi(a, s), Done)$
      - put transition into dataset
      - update state, update score
    - train network
  - print progress
- close env

In [None]:
# Train PPO on CartPole: collect a short rollout, update the network, and
# repeat until the episode ends; report the average score periodically.
env = gym.make("CartPole-v1")
ppo = PPO()

score = 0.0
print_interval = 500
rollout_len = 100  # maximum number of env steps collected between two updates

for epi in range(10000):
  s, _ = env.reset()
  episode_over = False

  while not episode_over:
    for _ in range(rollout_len):
      prob = ppo.pi( torch.from_numpy(s).float() )
      m = Categorical(prob)
      a = m.sample().item()
      sp, r, done, truncated, info = env.step(a)
      # Store the raw reward: PPO.make_batch already scales it by 1/100.
      # Only `done` (true termination) is stored, so a time-limit cut-off
      # still bootstraps from the value of the next state.
      ppo.put_data( (s, a, r, sp, prob[a].item(), done) )

      s = sp
      score += r

      # Stop on termination or truncation so the env is never stepped
      # again before the next reset.
      episode_over = done or truncated
      if episode_over:
        break

    ppo.train_net()

  # Report after every `print_interval` completed episodes so the average
  # divides by exactly the number of episodes accumulated in `score`.
  if (epi + 1) % print_interval == 0:
    print(f"epi: {epi + 1} / avg_score: {score / print_interval}")
    score = 0.0

env.close()

epi: 100 / avg_score: 26.76
epi: 200 / avg_score: 37.2
epi: 300 / avg_score: 51.88
epi: 400 / avg_score: 95.49
epi: 500 / avg_score: 170.55


## Play with the result

In [None]:
# Roll out one episode with the trained policy, record it to ./output
# through renderlab, and replay the video inline.
env = gym.make("CartPole-v1", render_mode = "rgb_array")
env = rl.RenderFrame(env, "./output")
s, info = env.reset()
score = 0.0  # start from zero so the training-loop total does not leak in
done = False

while not done:
  # `ppo` is the model trained in the main loop above.
  prob = ppo.pi( torch.from_numpy(s).float() )
  m = Categorical(prob)
  a = m.sample().item()
  sp, r, done, truncated, info = env.step(a)

  s = sp
  score += r

  if done or truncated:
    print("Score : ", score)
    break

env.play()