In [1]:
from exercise03 import QNet, MC_func_approx, evaluate_greedy_policy, gym_video, DQN
import gymnasium as gym
from tqdm.notebook import trange
import torch
import datetime


## DQN

### Cartpole DQN

In [82]:
# Hyperparameters
# CartPole settings; forwarded to DQN(...) as keyword arguments via **hyperparameters.
hyperparameters = {
    "epsilon": 0.7,  # presumably the epsilon-greedy exploration rate — confirm in exercise03.DQN
    "nr_episodes": 10_000,  # number of training episodes
    "max_t": 1000,  # maximum number of steps per episode
    "gamma": 0.99,
    # Each key appears exactly once: a repeated key in a dict literal silently
    # overrides the earlier entry, which hides typos when the values diverge.
    "replay_buffer_size": 1000,
    "warm_start_steps": 500,  # steps used to fill the replay buffer before training
    "sync_rate": 128,
    "train_frequency": 8,
    "batch_size": 128,
}

In [83]:
# Preparations
# Create the CartPole environment with array-based rendering.
cartpole_env = gym.make(
    "CartPole-v1",
    render_mode="rgb_array",
)

# The network's input/output sizes are read from the environment's spaces.
cartpole_observation_space_size = cartpole_env.observation_space.shape[0]
cartpole_nr_actions = cartpole_env.action_space.n

# NOTE(review): the trailing 8 and 2 are presumably the hidden width and the
# number of hidden layers — confirm against exercise03.QNet.
cartpole_qnet = QNet(
    cartpole_observation_space_size,
    cartpole_nr_actions,
    8,
    2,
)

# RMSprop over the Q-network's parameters.
cartpole_optimizer = torch.optim.RMSprop(
    cartpole_qnet.parameters(),
    lr=1e-2,
)


In [84]:
# Train
# Run DQN on the CartPole environment and keep the `act_greedy` attribute of
# the returned object as the policy used by the cells below.
# `hyperparameters` is re-bound in the Mountain Car section, so the CartPole
# hyperparameter cell must be the most recently executed one before this runs.
# NOTE(review): the name is spelled "carpole" (not "cartpole"); the test and
# visualize cells reference this exact spelling.
DQN_carpole_policy = DQN(
    cartpole_qnet,
    cartpole_env,
    cartpole_optimizer,
    **hyperparameters,
    output_path="output/",
).act_greedy

Train policy with DQN for 10000 episodes using at most 1000 steps, gamma = 0.99, epsilon = 0.7, replay buffer size = 1000, sync rate = 128, warm starting steps for filling the replay buffer = 500


DQN Training: 100%|██████████| 10000/10000 [07:20<00:00, 22.68episodes/s, e return=20.30, e length=20.30]


In [85]:
# Test
# Print the value returned by `evaluate_greedy_policy` for the CartPole DQN policy.
# NOTE(review): 100 and 4_000 are presumably the number of evaluation episodes
# and the per-episode step cap — confirm against exercise03.
print(
    "Mean episode reward from DQN on cartpole policy: ",
    evaluate_greedy_policy(cartpole_env, DQN_carpole_policy, 100, 4_000),
)

Mean episode reward from DQN on cartpole policy:  9.44


In [86]:
# Visualize
# Timestamped base name for the recording; the display cell below builds the
# video path from this same `video_name`.
video_name = f"{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}-DQN-{cartpole_env.spec.id}"
# NOTE(review): 5000 is presumably the maximum number of steps to record —
# confirm against exercise03.gym_video.
gym_video(DQN_carpole_policy, cartpole_env, video_name, 5000, output_path="output/")


In [87]:
# `HTML` is imported here and is also used by the Mountain Car display cell
# further down, so this cell has to run before that one.
from IPython.display import HTML

# Embed the recording named "<video_name>-episode-0.mp4" from the output/ folder.
HTML(f"""
    <video width="420" height="320" controls>
        <source src="output/{video_name}-episode-0.mp4" type="video/mp4">
    </video>
""")

### Mountain Car DQN

In [60]:
# Hyperparameters
# Mountain Car settings. This re-binds the `hyperparameters` name used by the
# CartPole section; the mapping is forwarded to DQN(...) as keyword arguments,
# and `max_t` is also used as the environment's `max_episode_steps`.
hyperparameters = dict(
    epsilon=0.05,
    nr_episodes=400,
    max_t=4000,
    gamma=0.99,
    replay_buffer_size=50_000,
    warm_start_steps=500,
    sync_rate=128,
    train_frequency=8,
    batch_size=128,
)

In [61]:
# Prepare
# Create the Mountain Car environment; the episode step limit is taken from
# the `max_t` hyperparameter so both use the same cap.
mountaincar_env = gym.make(
    "MountainCar-v0",
    render_mode="rgb_array",
    max_episode_steps=hyperparameters["max_t"],
)

# The network's output/input sizes are read from the environment's spaces.
mountaincar_nr_actions = mountaincar_env.action_space.n
mountaincar_observation_space_size = mountaincar_env.observation_space.shape[0]

# NOTE(review): the trailing 8 and 2 are presumably the hidden width and the
# number of hidden layers — confirm against exercise03.QNet.
mountaincar_qnet = QNet(mountaincar_observation_space_size, mountaincar_nr_actions, 8, 2)

# RMSprop over the Q-network's parameters.
mountaincar_optimizer = torch.optim.RMSprop(
    mountaincar_qnet.parameters(),
    lr=1e-2,
)

In [62]:
# Train
# Run DQN on the Mountain Car environment and keep the `act_greedy` attribute
# of the returned object as the policy used by the cells below.
# NOTE(review): the saved output of this cell reports "replay buffer size = 5000"
# while the hyperparameter cell sets 50_000; the output is presumably from an
# earlier run — re-run the notebook top to bottom to refresh it.
DQN_mountaincar_policy = DQN(
    mountaincar_qnet,
    mountaincar_env,
    mountaincar_optimizer,
    **hyperparameters,
    output_path="output/"
).act_greedy

Train policy with DQN for 400 episodes using at most 4000 steps, gamma = 0.99, epsilon = 0.05, replay buffer size = 5000, sync rate = 128, warm starting steps for filling the replay buffer = 500


DQN Training: 100%|██████████| 400/400 [16:53<00:00,  2.53s/episodes, e return=-560.90, e length=560.90]   


In [63]:
# Test
# Print the value returned by `evaluate_greedy_policy` for the Mountain Car
# policy. The label names DQN because the policy under test is
# DQN_mountaincar_policy, matching the wording of the CartPole test cell.
# NOTE(review): 10 and 4_000 are presumably the number of evaluation episodes
# and the per-episode step cap — confirm against exercise03.
print(
    "Mean episode reward from DQN on mountaincar policy: ",
    evaluate_greedy_policy(mountaincar_env, DQN_mountaincar_policy, 10, 4_000),
)


Mean episode reward from MC_func_approx on mountaincar policy:  -279.3


In [64]:
# Visualize
# Build the timestamped base name once and reuse it for the recording: the
# display cell looks for "output/{video_name}-episode-0.mp4", so the video must
# be recorded under exactly this name. Re-evaluating the timestamp inside the
# call could land on a different second and leave the display cell pointing at
# a file that does not exist.
video_name = f"{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}-DQN-{mountaincar_env.spec.id}"
# NOTE(review): 5000 is presumably the maximum number of steps to record —
# confirm against exercise03.gym_video.
gym_video(
    DQN_mountaincar_policy,
    mountaincar_env,
    video_name,
    5000,
    output_path="output/",
)


  logger.warn(


In [65]:
# Embed the recording named "<video_name>-episode-0.mp4" from the output/ folder.
# `HTML` comes from the import in the CartPole display cell, and `video_name`
# from the Mountain Car visualize cell, so both must have run first.
HTML(f"""
    <video width="420" height="320" controls>
        <source src="output/{video_name}-episode-0.mp4" type="video/mp4">
    </video>
""")