<a href="https://colab.research.google.com/github/kimsooyoung/rl_oc_python/blob/main/oc_lec3_actor_critic/TDAC_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Requirements

In [1]:
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!pip install swig
!pip install renderlab
!pip install gymnasium
!pip install gymnasium[box2d]

Collecting swig
  Downloading swig-4.2.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.2.1
Collecting renderlab
  Downloading renderlab-0.1.20230421184216-py3-none-any.whl (4.0 kB)
Collecting gymnasium (from renderlab)
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium->renderlab)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium, renderlab
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1 renderlab-0.1.20230421184216
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (3

### Import the Necessary Packages

In [2]:
import gymnasium as gym
import collections
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

## Render Test

In [14]:
import renderlab as rl

env = gym.make("CartPole-v1", render_mode = "rgb_array")
env = rl.RenderFrame(env, "./output")

observation, info = env.reset()
score = 0

while True:
  action = env.action_space.sample()
  observation, reward, terminated, truncated, info = env.step(action)
  score += reward

  if terminated:
    print("Score : ", score)
    break

env.play()

Score :  9.0
Moviepy - Building video temp-{start}.mp4.
Moviepy - Writing video temp-{start}.mp4



                                                   

Moviepy - Done !
Moviepy - video ready temp-{start}.mp4




## `torch.gather()` example

In [4]:
matrix = torch.range(0, 99).reshape(10,10)
indices = torch.tensor([0,1,2,3,4,5,6,7,8,9]).unsqueeze(axis=-1)
matrix, indices

  matrix = torch.range(0, 99).reshape(10,10)



(tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9.],
         [10., 11., 12., 13., 14., 15., 16., 17., 18., 19.],
         [20., 21., 22., 23., 24., 25., 26., 27., 28., 29.],
         [30., 31., 32., 33., 34., 35., 36., 37., 38., 39.],
         [40., 41., 42., 43., 44., 45., 46., 47., 48., 49.],
         [50., 51., 52., 53., 54., 55., 56., 57., 58., 59.],
         [60., 61., 62., 63., 64., 65., 66., 67., 68., 69.],
         [70., 71., 72., 73., 74., 75., 76., 77., 78., 79.],
         [80., 81., 82., 83., 84., 85., 86., 87., 88., 89.],
         [90., 91., 92., 93., 94., 95., 96., 97., 98., 99.]]),
 tensor([[0],
         [1],
         [2],
         [3],
         [4],
         [5],
         [6],
         [7],
         [8],
         [9]]))

In [5]:
torch.gather(matrix, 1, indices)

tensor([[ 0.],
        [11.],
        [22.],
        [33.],
        [44.],
        [55.],
        [66.],
        [77.],
        [88.],
        [99.]])

## Initialize Hyper Params

In [10]:
learning_rate = 0.0002
gamma         = 0.98
n_rollout     = 10

## Define Actor-Critic Class

- input: 4 length tensor
- Actor layer structure: (4 * 256) = ReLU > (256 * 2) = softmax>
- Critic layer structure: (4 * 256) = ReLU > (256 * 1)

- `put_data` method: append episodes ($S, A, R, S', Done$) into class variable.
- `make_batch` method : return torch type transitions and clear buffer
- `train_net` method : optimize network with policy gradient loss + Q-learning loss

$\quad \quad J = - G * log \pi(s, a) + E[(R + rV(s) - V(s'))^2]$


In [11]:
class ActorCritic(nn.Module):

  def __init__(self):
    super(ActorCritic, self).__init__()
    self.data = []

    self.fc1 = nn.Linear(4, 256)
    self.fc_pi = nn.Linear(256, 2)
    self.fc_v = nn.Linear(256, 1)

    self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

  def pi(self, state, softmax_dim=0):
    x1 = F.relu(self.fc1(state))
    x2 = F.softmax(self.fc_pi(x1), dim=softmax_dim)
    return x2

  def v(self, state):
    x1 = F.relu(self.fc1(state))
    x2 = self.fc_v(x1)
    return x2

  def put_data(self, transition):
    self.data.append(transition)

  def make_batch(self):
    s_list, a_list, r_list, sp_list, done_list = [],[],[],[],[]

    for transition in self.data:
      s, a, r, sp, done = transition
      s_list.append(s)
      a_list.append([a])
      r_list.append([r/100.0])
      sp_list.append(sp)
      done = 0.0 if done else 1.0
      done_list.append([done])

    s_batch, a_batch, r_batch, sp_batch, done_batch = \
      torch.tensor(s_list, dtype=torch.float), torch.tensor(a_list), \
      torch.tensor(r_list,  dtype=torch.float), torch.tensor(sp_list, dtype=torch.float), \
      torch.tensor(done_list,  dtype=torch.float)

    self.data = []

    return s_batch, a_batch, r_batch, sp_batch, done_batch

  def train_net(self):
    s, a, r, sp, done = self.make_batch()

    td_target = r + gamma * self.v(sp) * done
    delta = td_target - self.v(s)

    prob_a = self.pi(s, softmax_dim=1).gather(1, a)

    loss = -torch.log(prob_a) * delta.detach() + F.smooth_l1_loss(self.v(s), td_target.detach())

    self.optimizer.zero_grad()
    loss.mean().backward()
    self.optimizer.step()

## Main loop

- create environments, Actor-Critic Model, print_interval, and reset score value
- for each train epochs
  - reset environment
  - for each episodes loop
    - rollout 10 times
      - obtain policy action probability and actual action value
      - step environment
      - gather transitions $(S, A, R, S', Done)$
      - put transition into dataset
      - update state, update score
    - train network
  - print progress
- close env

In [12]:
env = gym.make("CartPole-v1")
ac_nn = ActorCritic()

score = 0.0
print_interval = 200

for epi in range(2000):
  s, _ = env.reset()
  done = False

  while not done:
    for i in range(n_rollout):
      prob = ac_nn.pi( torch.from_numpy(s).float() )
      m = Categorical(prob)
      a = m.sample().item()
      sp, r, done, truncated, info = env.step(a)
      ac_nn.put_data( (s, a, r, sp, done) )

      s = sp
      score += r

      if done or truncated:
        break

    ac_nn.train_net()

  if (epi % print_interval == 0) and (epi != 0):
    print(f"epi: {epi} / avg_score: {score / print_interval}")
    score = 0.0

env.close()

epi: 200 / avg_score: 19.5
epi: 400 / avg_score: 62.315
epi: 600 / avg_score: 246.89
epi: 800 / avg_score: 262.445
epi: 1000 / avg_score: 154.57
epi: 1200 / avg_score: 165.89
epi: 1400 / avg_score: 170.94
epi: 1600 / avg_score: 116.515
epi: 1800 / avg_score: 297.21


In [15]:
env = gym.make("CartPole-v1", render_mode = "rgb_array")
env = rl.RenderFrame(env, "./output")
s, info = env.reset()
done = False

while not done:
  prob = ac_nn.pi( torch.from_numpy(s).float() )
  m = Categorical(prob)
  a = m.sample().item()
  sp, r, done, truncated, info = env.step(a)

  s = sp
  score += r

  if done or truncated:
    print("Score : ", score)
    break

env.play()

Score :  156.0
Moviepy - Building video temp-{start}.mp4.
Moviepy - Writing video temp-{start}.mp4





Moviepy - Done !
Moviepy - video ready temp-{start}.mp4
