In [95]:
import gymnasium as gym
from gymnasium.wrappers import RecordVideo
import os
import shutil
import torch
import time
from torch import functional as F
from collections import deque
import random
import numpy as np
from tqdm import tqdm

In [24]:
def build_env(name = 'LunarLander-v3', record_name = 'lunar_agent'):
    # Delete all contents in lunar-agent folder
    if os.path.exists(record_name):
        shutil.rmtree(record_name)

    # Initialise the environment
    env = gym.make(name, render_mode="rgb_array")

    env = RecordVideo(
        env,
        video_folder=record_name,
        episode_trigger=lambda x: True,  # Record every episode
        name_prefix="training",
        video_length=3000,  # Maximum number of steps to record per episode
    )

    return env

env = build_env()

# Reset the environment to generate the first observation
observation, info = env.reset(seed=42)

for _ in range(1000):
    # this is where you would insert your policy
    action = env.action_space.sample()

    # step (transition) through the environment with the action
    # receiving the next observation, reward and if the episode has terminated or truncated
    observation, reward, terminated, truncated, info = env.step(action)

    # If the episode has ended then we can reset to start a new episode
    if terminated or truncated:
        observation, info = env.reset()

env.close()


# DQN

- ReplayBuffer
- NN
- training formula: Q(s,a) = Q(s,a) + alpha * (R_t+1 + gamma * max Q(s_t+1 ,a) - Q(s_t, a))
https://huggingface.co/learn/deep-rl-course/en/unit3/deep-q-algorithm
- Q-target:

known problems:
- forgetting: agent forgot best behavior when new experience comes in


### Replay Buffer

In [70]:
class ReplayBuffer():
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = deque()
    
    def push(self, state, action, reward, next_state, terminated):
        if len(self.buffer) >= self.capacity:
            self.buffer.popleft()
        self.buffer.append((state, action, reward, next_state, terminated))
    
    def sample(self, batch_size):
        '''state, action, reward, next_state, terminated'''
        state, action, reward, next_state, terminated = zip(*random.sample(self.buffer, batch_size))
        return torch.vstack([torch.tensor(s) for s in state]), torch.tensor(action), torch.tensor(reward), torch.vstack([torch.tensor(ns) for ns in next_state]), torch.tensor(terminated)


In [71]:
env = build_env()
env.reset()
rb = ReplayBuffer(capacity=50)
#warmup
for _ in range(100):
    action = env.action_space.sample()
    next_state, reward, terminated, truncated, info = env.step(action)
    rb.push(state, action, reward, next_state, terminated)
    state = next_state

state, action, reward, next_state, terminated = rb.sample(10)
print(state.shape)
print( action.shape)
print( reward.shape)
print( next_state.shape)
print( terminated)

env.close()


torch.Size([10, 8])
torch.Size([10])
torch.Size([10])
torch.Size([10, 8])
tensor([ True,  True, False, False, False, False, False,  True,  True,  True])


### Neural network

In [75]:
def build_qnet(state_dim, action_dim):
    state_dim = state_dim
    action_dim = action_dim
    model = torch.nn.Sequential(
        torch.nn.Linear(state_dim, 120),
        torch.nn.ReLU(),
        torch.nn.Linear(120, 84),
        torch.nn.ReLU(),
        torch.nn.Linear(84, action_dim)
    )
    return model


In [82]:
testnet = build_qnet(8, 4)
y = testnet(torch.randn(1, 8))
print(y)

tensor([[ 0.2086, -0.0825,  0.1554, -0.1457]], grad_fn=<AddmmBackward0>)


### Greedy policy (DQN is off-policy, use greedy policy for training, but use only model for reference)



In [15]:
def linear_schedule(start, end, duration, t):
    slope = (end - start) / duration
    return max(slope * t + start, end)


### Optimizer
Adam()

In [77]:
qnet = build_qnet(8, 4)
target_qnet = build_qnet(8, 4)
target_qnet.load_state_dict(qnet.state_dict())
replay_buffer = ReplayBuffer(1000)
batch_size = 32
gamma = 0.99
tau = 0.01 #to sync target qnet and training qnet
total_steps = 10000
sync_steps = 100
warm_up = 100
trainng_freq = 10 # traing every 10 steps


optimizer = torch.optim.Adam(qnet.parameters(), lr=0.001)

### Training

1. given a state, take an action
2. add that action, reward, state to buffer
3. if global step > learning starts (pass warmup) and global step % training freq == 0: 
    - get data from buffer
    - Q_max = target_nn(data.obs).max(1)
    - td_target = data.rewards + gamma * Q_max * (1 - dones)
    - old val = q_net(data.obs)

    loss = mse(td_target, old_val)

    optim.zero_grad
    loss.backward()
    optim.step()

    sync q_net and target q_net 




In [92]:
a = torch.tensor([[True, False, False, True]])
print(a.shape)
print(a.flatten().int())
print(1 - a.flatten().int())

torch.Size([1, 4])
tensor([1, 0, 0, 1], dtype=torch.int32)
tensor([0, 1, 1, 0], dtype=torch.int32)


In [None]:
env = build_env()
env.reset()

#warmup
for _ in range(warm_up):
    next_state, reward, terminated, truncated, info = env.step(env.action_space.sample())
    replay_buffer.push(state, action, reward, next_state, terminated)
    state = next_state

for step in tqdm(range(10)):

    epsilon = linear_schedule(1, 0.01, total_steps, step)
    if random.random() < epsilon:
        action = env.action_space.sample()
    else:
        action = qnet(torch.tensor(state).float()).argmax().item()

    # pretraining
    next_state, reward, terminated, truncated, info = env.step(action)
    replay_buffer.push(state, action, reward, next_state, terminated)

    # training
    if step % trainng_freq == 0:
        # greedy epsilon
        
        data = replay_buffer.sample(batch_size)
        rb_state, rb_action, rb_reward, rb_nextstate, rb_done = data

        with torch.no_grad():
            target_max = target_qnet(rb_nextstate).float().max(dim=1).values
            td_target = torch.tensor(rb_reward) + gamma * target_max * (1 - rb_done.flatten().int())
        
        
        old_value = qnet(rb_state).gather(1, rb_action.unsqueeze(1)).squeeze()

        loss = torch.nn.functional.mse_loss(old_value, td_target.float())
        print(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # this is where you would insert your policy
        action = env.action_space.sample()
        
    # sync every sync_steps
    if step % sync_steps == 0:
        target_qnet.load_state_dict(qnet.state_dict())

    #post training
    state = next_state

    # If the episode has ended then we can reset to start a new episode
    if terminated or truncated:
        observation, info = env.reset()

env.close()


  logger.warn("Unable to save last video! Did you call close()?")
  td_target = torch.tensor(rb_reward) + gamma * target_max * (1 - rb_done.flatten().int())
  0%|          | 0/10 [00:00<?, ?it/s]

1565.35595703125



