In [1]:
from tqdm import tqdm
import gymnasium as gym
from gymnasium import spaces
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random
import time

In [3]:
# Experiment configuration: every tunable setting lives in this one dict.
# The values are only defined here; they are consumed by code outside this cell.
args = {
    # Environment / reproducibility / hardware
    "env_id": "CartPole-v1",
    "seed": 42,
    "cuda": True,
    # Optimisation and replay settings
    "learning_rate": 0.0003,
    "buffer_size": 10000,
    "total_timesteps": 300000,
    # Exploration settings (presumably an epsilon schedule -- confirm at the use site)
    "start_e": 1,
    "end_e": 0.05,
    "exploration_fraction": 0.5,
    # Experiment tracking
    "wandb_project_name": "dqn-Cartpole",
    "wandb_entity": None,
    # Training loop cadence
    "learning_starts": 10000,
    "train_frequency": 1,
    "batch_size": 256,
    "target_network_frequency": 500,
    "gamma": 0.95,
    "capture_video": False,
}

# Use the GPU only when one is available *and* the config asks for it.
device = torch.device(
    "cuda" if torch.cuda.is_available() and args["cuda"] else "cpu"
)

# An optional 'env_id_short' entry overrides the full env id in the run name.
env_id = args.get("env_id_short", args["env_id"])

# Run name layout: <env id>_<seed>_<unix timestamp in whole seconds>.
run_name = "_".join(str(part) for part in (env_id, args["seed"], int(time.time())))

print(f'device : {device}, run_name : {run_name}')

device : cpu, run_name : CartPole-v1_42_1673699250


In [None]:
class QNetwork(nn.Module):
    """Dueling Q-network: a shared MLP trunk feeding separate value and advantage heads.

    Args:
        env: Object exposing ``observation_space.shape`` (the first entry is
            used as the input feature count) and ``action_space.n`` (the
            number of outputs of the advantage head).
    """

    def __init__(self, env):
        super().__init__()
        # Shared feature extractor: obs_dim -> 120 -> 84, ReLU after each layer.
        self.network = nn.Sequential(
            nn.Linear(env.observation_space.shape[0], 120),
            nn.ReLU(),
            nn.Linear(120, 84),
            nn.ReLU(),
        )
        # Both heads are plain linear layers with no output activation, so the
        # state value and the advantages may take any sign.
        self.value = nn.Linear(84, 1)
        self.adv = nn.Linear(84, env.action_space.n)

    def forward(self, x):
        """Compute Q-values for every action.

        Args:
            x: Float tensor whose last dimension is the observation size;
               either a batch ``(batch, obs_dim)`` or a single ``(obs_dim,)``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size ``env.action_space.n``.
        """
        features = self.network(x)

        # No ReLU here: clamping the heads at zero would make negative state
        # values and below-zero advantages unrepresentable and would zero the
        # gradient whenever a head's pre-activation is negative.
        value = self.value(features)
        adv = self.adv(features)

        # Average over the action axis (always the last one) so that both
        # batched and unbatched inputs work.
        adv_mean = adv.mean(dim=-1, keepdim=True)

        # Dueling aggregation: Q = V + (A - mean(A)).
        return value + adv - adv_mean