<a href="https://colab.research.google.com/github/kimsooyoung/rl_oc_python/blob/main/oc_lec1_dqn/DQN_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Requirements

In [None]:
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!pip install swig
!pip install renderlab
!pip install gymnasium
!pip install gymnasium[box2d]

Collecting swig
  Downloading swig-4.2.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.2.1
Collecting renderlab
  Downloading renderlab-0.1.20230421184216-py3-none-any.whl (4.0 kB)
Collecting gymnasium (from renderlab)
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium->renderlab)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium, renderlab
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1 renderlab-0.1.20230421184216
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (

### Import the Necessary Packages

In [None]:
import gymnasium as gym
import collections
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## Initialize Hyper Params

In [None]:
# Hyperparameters read by the replay buffer, the training step and the optimizer.
learning_rate = 5e-4   # step size handed to the Adam optimizer
gamma = 0.98           # discount factor applied to the bootstrapped target value
buffer_limit = 50_000  # maximum number of transitions kept in the replay buffer
batch_size = 32        # number of transitions drawn per gradient update

## Render Test

In [None]:
import renderlab as rl

# Roll out a single episode with uniformly random actions, only to check that
# the rendering pipeline works (no learning happens here).
env = gym.make("CartPole-v1", render_mode = "rgb_array")
env = rl.RenderFrame(env, "./output")

observation, info = env.reset()
score = 0

while True:
  action = env.action_space.sample()
  observation, reward, terminated, truncated, info = env.step(action)
  score += reward

  # An episode can end in two ways: `terminated` (failure state reached) or
  # `truncated` (time limit hit). Stop on either one so the environment is
  # never stepped past the end of the episode.
  if terminated or truncated:
    print("Score : ", score)
    break

env.play()

Score :  13.0
Moviepy - Building video temp-{start}.mp4.
Moviepy - Writing video temp-{start}.mp4




                                                   

Moviepy - Done !
Moviepy - video ready temp-{start}.mp4




## Define Classes

- `ReplayBuffer`: transition buffer for experience replay
- `Qnet`: DQN Network Structure


## ReplayBuffer

Sample random `n` transitions from the buffer. Transition here means $(s, a, r, s')$.

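Each stored transition also carries a `done_mask`: it is `0.0` if the episode terminated on that step and `1.0` otherwise, so the bootstrapped term of the TD target can be switched off for terminal states.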

Methods

- `init` / `put` / `sample` / `size`

In [None]:
class ReplayBuffer():
  """Fixed-capacity FIFO store of transitions for experience replay.

  Every transition is a tuple ``(s, a, r, s_prime, done_mask)``.
  """

  def __init__(self, capacity=None):
    """Create an empty buffer.

    Args:
      capacity: maximum number of stored transitions. Once the buffer is
        full, each `put` silently drops the oldest transition. When omitted,
        the module-level `buffer_limit` is used.
    """
    if capacity is None:
      capacity = buffer_limit
    self.buffer = collections.deque(maxlen=capacity)

  def put(self, transition):
    """Append one ``(s, a, r, s_prime, done_mask)`` tuple to the buffer."""
    self.buffer.append(transition)

  @staticmethod
  def _stack_states(states):
    """Combine per-transition states into one float32 tensor."""
    if not states:
      return torch.tensor(states, dtype=torch.float)
    # Converting each state on its own and stacking avoids handing a Python
    # list of numpy arrays to `torch.tensor`, which PyTorch warns is very slow.
    return torch.stack([torch.as_tensor(s, dtype=torch.float) for s in states])

  def sample(self, n):
    """Draw `n` distinct transitions uniformly at random.

    Args:
      n: number of transitions to draw.

    Returns:
      A tuple ``(s, a, r, s_prime, done_mask)`` of tensors. `s` and `s_prime`
      stack the stored states along a new first dimension; `a`, `r` and
      `done_mask` have shape ``(n, 1)``. Actions are int64 (as required by
      `Tensor.gather`), everything else is float32.

    Raises:
      ValueError: if `n` is larger than the number of stored transitions
        (raised by `random.sample`).
    """
    mini_batch = random.sample(self.buffer, n)

    states = [transition[0] for transition in mini_batch]
    # action, reward and done_mask are scalars, so wrap each one in a list to
    # obtain column tensors of shape (n, 1).
    actions = [[transition[1]] for transition in mini_batch]
    rewards = [[transition[2]] for transition in mini_batch]
    next_states = [transition[3] for transition in mini_batch]
    done_masks = [[transition[4]] for transition in mini_batch]

    # dtypes are pinned so that e.g. float64 rewards cannot leak into the loss
    # computation next to float32 network outputs.
    return self._stack_states(states), \
           torch.tensor(actions, dtype=torch.int64), \
           torch.tensor(rewards, dtype=torch.float), \
           self._stack_states(next_states), \
           torch.tensor(done_masks, dtype=torch.float)

  def size(self):
    """Return the number of transitions currently stored."""
    return len(self.buffer)

## Qnet

- input: 4 length tensor
- layer structure: (4 * 128) (128 * 128) (128 * 2)
- **Caution** final layer doesn't have activation func
- `sample_action` method: epsilon-greedy policy according to the network output

In [None]:
class Qnet(nn.Module):
  """Three-layer fully connected network mapping a state to one Q-value per action.

  Args:
    state_dim: size of the input state vector (default 4).
    n_actions: number of discrete actions, i.e. size of the output (default 2).
    hidden_dim: width of the two hidden layers (default 128).
  """

  def __init__(self, state_dim=4, n_actions=2, hidden_dim=128):
    super().__init__()
    self.fc1 = nn.Linear(state_dim, hidden_dim)
    self.fc2 = nn.Linear(hidden_dim, hidden_dim)
    self.fc3 = nn.Linear(hidden_dim, n_actions)

  def forward(self, x):
    """Return the Q-values for `x`; the last layer has no activation."""
    hidden = F.relu(self.fc1(x))
    hidden = F.relu(self.fc2(hidden))
    return self.fc3(hidden)

  def sample_action(self, obs, epsilon):
    """Pick an action with an epsilon-greedy policy.

    Args:
      obs: tensor holding a single observation. The greedy branch takes the
        argmax over the whole network output, so do not pass a batch.
      epsilon: probability of choosing a uniformly random action.

    Returns:
      The chosen action index as a Python int.
    """
    if random.random() < epsilon:
      # Exploring: no forward pass is needed. The action count comes from the
      # output layer so it always matches the network.
      return random.randrange(self.fc3.out_features)

    # Exploiting: action selection is never back-propagated through, so skip
    # building an autograd graph.
    with torch.no_grad():
      return self(obs).argmax().item()

## Training Helper Func

Below is the Q-learning update rule derived from the Bellman equation: $Q(S,A) \leftarrow Q(S,A) + \alpha \left[ R_{t+1} + \gamma \max_{A'} Q_{target}(S', A') - Q(S,A) \right]$

➾ The error is the difference between the TD target $R_{t+1} + \gamma \max_{A'} Q_{target}(S', A')$ and $Q(S,A)$

### Pseudo Code

1. sample from `ReplayBuffer`
2. Pass S into Qnet and get Qnet Value
3. Pass S' into Qtarget and get max Qtarget Value
4. Calculate TD target
5. Get error between TD Target and Qnet Value (use `F.smooth_l1_loss`)
6. run `zero_grad` and `step`

In [None]:
def train(q, q_target, memory, optimizer, n_updates=10, n_batch=None, discount=None):
  """Run several Q-learning gradient steps on mini-batches from `memory`.

  Args:
    q: online network being optimised; called as ``q(states)``.
    q_target: target network used only to compute the TD target.
    memory: object whose ``sample(n)`` returns ``(s, a, r, s_prime, done_mask)``
      tensors, with `a` usable as an index for ``Tensor.gather`` along dim 1.
    optimizer: optimizer that updates the parameters of `q`.
    n_updates: number of mini-batch updates to perform (default 10).
    n_batch: mini-batch size; defaults to the module-level `batch_size`.
    discount: discount factor; defaults to the module-level `gamma`.
  """
  if n_batch is None:
    n_batch = batch_size
  if discount is None:
    discount = gamma

  for _ in range(n_updates):
    s, a, r, s_p, dm = memory.sample(n_batch)

    # Q(s, a) for the actions that were actually taken.
    q_a = q(s).gather(1, a)

    # The TD target is a fixed regression label: computing it without autograd
    # keeps `backward()` from pushing gradients into the target network.
    with torch.no_grad():
      max_q_p = q_target(s_p).max(dim=1, keepdim=True).values
      # `dm` is multiplied into the bootstrap term, so a mask of 0 removes it.
      target = r + discount * max_q_p * dm

    loss = F.smooth_l1_loss(q_a, target)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

## Write Main Function

- define variables

In [None]:
# Schedule constants for the training loop.
epoch = 10000                 # number of episodes to run
print_interval = 1000         # report progress every this many episodes
target_update_interval = 20   # copy q -> q_target every this many episodes

# Environment, online network, target network and replay memory.
# `q` is built before `q_target`, as in the original cell.
env = gym.make('CartPole-v1')
q = Qnet()
q_target = Qnet()
memory = ReplayBuffer()

# Only the online network's parameters are handed to the optimizer.
optimizer = optim.Adam(q.parameters(), lr=learning_rate)

* $Q_{target}$ must stay fixed during the learning step. Keep it as a separate network and sync its weights from `q` with `load_state_dict`.

In [None]:
# Start the target network from exactly the same weights as the online network.
q_target.load_state_dict(q.state_dict())

<All keys matched successfully>

## Run train loop

- get decayed epsilon every time
- run episode
- during episode, acquire action according to Qnet forwarding (`sample_action` method)
- memory update
- if enough transitions are stored in memory, train Qnet
- print short-term results

In [None]:
for n_epi in range(epoch):
  # Exploration rate: decreases linearly with the episode index, floored at 1%.
  epsilon = max(0.01, 0.08 - 0.01*(n_epi/200)) #Linear annealing from 8% to 1%
  s, _ = env.reset()
  score = 0.0

  # Play one episode, pushing every transition into the replay memory.
  while True:
    a = q.sample_action(torch.from_numpy(s).float(), epsilon)
    s_prime, r, done, truncated, info = env.step(a)

    # The mask is 0.0 only when the step reported `done`; an episode that ends
    # through `truncated` alone keeps a mask of 1.0.
    done_mask = 1.0 if not done else 0.0
    # The reward is divided by 100 before it is stored.
    memory.put((s, a, r/100.0, s_prime, done_mask))

    s = s_prime
    score += r

    if done or truncated:
      break

  # Learn only once the memory holds more than 2000 transitions.
  if memory.size() > 2000:
    train(q, q_target, memory, optimizer)

  # Periodically refresh the target network with the online weights.
  if n_epi % target_update_interval == 0:
    q_target.load_state_dict(q.state_dict())

  # Report the score of the most recent episode (episode 0 is skipped).
  if n_epi != 0 and n_epi % print_interval == 0:
    report = "n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format(
        n_epi, score, memory.size(), epsilon*100)
    print(report)

env.close()

  return torch.tensor(s_list, dtype=torch.float), torch.tensor(a_list), \



n_episode :1000, score : 135.0, n_buffer : 50000, eps : 3.0%
n_episode :2000, score : 500.0, n_buffer : 50000, eps : 1.0%
n_episode :3000, score : 500.0, n_buffer : 50000, eps : 1.0%
n_episode :4000, score : 500.0, n_buffer : 50000, eps : 1.0%
n_episode :5000, score : 500.0, n_buffer : 50000, eps : 1.0%
n_episode :6000, score : 500.0, n_buffer : 50000, eps : 1.0%
n_episode :7000, score : 500.0, n_buffer : 50000, eps : 1.0%
n_episode :8000, score : 500.0, n_buffer : 50000, eps : 1.0%
n_episode :9000, score : 500.0, n_buffer : 50000, eps : 1.0%


##  Test Qnet with Rendered Animation

[test video](https://github.com/kimsooyoung/robotics_python/assets/12381733/653d107f-cbdd-4538-bac9-e3c625676a43)

In [None]:
import renderlab as rl

# Roll out one recorded episode with the trained network, keeping a 1% chance
# of a random action at each step.
env = rl.RenderFrame(gym.make("CartPole-v1", render_mode = "rgb_array"), "./output")
s, info = env.reset()

episode_over = False
while not episode_over:
  a = q.sample_action(torch.from_numpy(s).float(), 0.01)
  s_prime, r, done, truncated, info = env.step(a)
  s = s_prime
  # Either end-of-episode signal stops the rollout.
  episode_over = done or truncated

env.play()

Moviepy - Building video temp-{start}.mp4.
Moviepy - Writing video temp-{start}.mp4





Moviepy - Done !
Moviepy - video ready temp-{start}.mp4


![](https://github.com/kimsooyoung/robotics_python/assets/12381733/653d107f-cbdd-4538-bac9-e3c625676a43)