* Source

https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/dqn.py

In [6]:
import gymnasium as gym
import torch.nn as nn
from stable_baselines3.common.buffers import ReplayBuffer
import wandb
import time
import torch
import torch.optim as optim
import random
import numpy as np
from torch.utils.tensorboard import SummaryWriter

In [9]:
args = {
    'env_id': 'CartPole-v1',
    'seed': 42,
    'cuda': True,
    'learning_rate' : 0.0003,
    'buffer_size' : 10000,
    'total_timesteps' : 500000,
    'start_e' : 1, 
    'end_e' : 0.05, 
    'exploration_fraction' : 0.5,
    
    }

device = torch.device("cuda" if torch.cuda.is_available() and args["cuda"] else "cpu")
run_name=f"{args['env_id']}_{args['seed']}_{device}_{int(time.time())}",

In [4]:
env = gym.make(args['env_id'])

state, info = env.reset()

print(state)
print(info)

[-0.0166534   0.0090017  -0.00060145 -0.02125515]
{}


In [24]:
class DQN(nn.Module):
    def __init__(self, env):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(env.observation_space.shape[0], 120),
            nn.Relu(),
            nn.Linear(120, 84),
            nn.ReLU(),
            nn.Linear(84, env.action_space.n)
        )
    
    def forward(self, x):
        return self.network(x)

In [25]:
def linear_schedule(start_e: float, end_e:float, duration: int, t: int):
    slope = (end_e - start_e) / duration
    return max(slope * t + start_e, end_e)

In [None]:
wandb.init(
    # set the wandb project where this run will be logged
    name=run_name,
    project="dqn-Cartpole",
    sync_tensorboard=True,
    # track hyperparameters and run metadata
    config=args,
    monitor_gym=True,
    save_code=True
)

writer = SummaryWriter(f'runs/{run_name}')
writer.add_text(
    "hyperparameters",
    "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in args.items()])),
)

In [None]:
q_network = DQN(env).to(device)
optimizer = optim.Adam(q_network.parameters(), lr=args['learning_rate'])
target_network = DQN(env).to(device)
target_network.load_state_dict(q_network.state_dict())

rb = ReplayBuffer(
    args['buffer_size'],
    env.observation_space.shape[0],
    env.action_space.n,
    device,
    handle_timeout_termination=True
)

start_time = time.time()

obs, _ = env.reset()

for global_step in range(args['total_timesteps']):
    epsilon = linear_schedule(args['start_e'], 
                              args['end_e'], 
                              args['exploration_fraction'] * args['total_timesteps'], 
                              global_step)
    if random.random() < epsilon:
        actions = env.action_space.sample()
    else:
        q_values = q_network(torch.Tensor(obs).to(device))
        action = torch.argmax(q_values, dim=1).item()
    
    next_obs, reward, terminate, truncate, info = env.step(action)
    

TypeError: vars() argument must have __dict__ attribute

In [32]:
import random

# simulate training
epochs = 10
offset = random.random() / 5
for epoch in range(2, epochs):
    acc = 1 - 2 ** -epoch - random.random() / epoch - offset
    loss = 2 ** -epoch + random.random() / epoch + offset
    
    # log metrics to wandb
    wandb.log({"acc": acc, "loss": loss})
    time.sleep(5)
# [optional] finish the wandb run, necessary in notebooks
wandb.finish()

0,1
acc,▂▁▂▄▇▇█▇
loss,█▄▃▃▁▁▂▂

0,1
acc,0.91261
loss,0.11162
