In [2]:
from stable_baselines3 import PPO, DQN, A2C
import gymnasium as gym
import ale_py

gym.register_envs(ale_py)  # unnecessary but helpful for IDEs

from stable_baselines3.common.logger import configure
from stable_baselines3.common.evaluation import evaluate_policy

tmp_path = "./results/a2c-breakout"
new_logger = configure(tmp_path, ["stdout", "csv", "tensorboard"])

env = gym.make("ALE/Breakout-v5")
model = A2C(policy = "MlpPolicy", env = env)

model.set_logger(new_logger)
model.learn(total_timesteps=100_000)

mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print(f'Mean reward: {mean_reward} +/- {std_reward:.2f}')

Logging to ./results/a2c-breakout




-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 164       |
|    ep_rew_mean        | 0.667     |
| time/                 |           |
|    fps                | 129       |
|    iterations         | 100       |
|    time_elapsed       | 3         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.37     |
|    explained_variance | 0         |
|    learning_rate      | 0.0007    |
|    n_updates          | 99        |
|    policy_loss        | -5.08e-07 |
|    value_loss         | 2.95e-13  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 194      |
|    ep_rew_mean        | 1.4      |
| time/                 |          |
|    fps                | 138      |
|    iterations         | 200      |
|    time_elapsed       | 7        |
|    total_timesteps    | 1000     |
| train/             

In [3]:
model.save("./saved_models/a2c-breakout")

In [6]:
tmp_path = "./results/dqn-breakout"
new_logger = configure(tmp_path, ["stdout", "csv", "tensorboard"])

model = DQN(policy = "MlpPolicy", 
            env = env,
            learning_rate=4e-4,
            gamma=0.99,
            exploration_initial_eps=1.0,
            exploration_final_eps=0.01,
            exploration_fraction=0.999,
            buffer_size=10_000,)

model.set_logger(new_logger)
model.learn(total_timesteps=100_000)

mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print(f'Mean reward: {mean_reward} +/- {std_reward:.2f}')

Logging to ./results/dqn-breakout
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 252      |
|    ep_rew_mean      | 3        |
|    exploration_rate | 0.99     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 323      |
|    time_elapsed     | 3        |
|    total_timesteps  | 1008     |
| train/              |          |
|    learning_rate    | 0.0004   |
|    loss             | 0.002    |
|    n_updates        | 226      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 205      |
|    ep_rew_mean      | 1.88     |
|    exploration_rate | 0.984    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 326      |
|    time_elapsed     | 5        |
|    total_timesteps  | 1642     |
| train/              |          |
|    learning_rate    | 0.0004   |
|    loss            

In [7]:
model.save("./saved_models/dqn-breakout")