In [2]:
from stable_baselines3 import PPO, DQN, A2C
import gymnasium as gym
from stable_baselines3.common.logger import configure
from stable_baselines3.common.evaluation import evaluate_policy

# Train an A2C agent on LunarLander-v3 and report its evaluation reward.
# Metrics are written to stdout, a CSV file and TensorBoard under `tmp_path`.
tmp_path = "./results/a2c-lunarlander"
new_logger = configure(tmp_path, ["stdout", "csv", "tensorboard"])

env = gym.make("LunarLander-v3")
# Fixed seed for the algorithm's pseudo-random generators so that re-running
# the notebook from a fresh kernel gives a repeatable training run.
model = A2C(policy="MlpPolicy", env=env, seed=0)

# Route training metrics through the logger configured above.
model.set_logger(new_logger)
model.learn(total_timesteps=200_000)

# Evaluate over 10 episodes on the environment held by the model.
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
# Both statistics use the same two-decimal formatting.
print(f'Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')

Logging to ./results/a2c-lunarlander




------------------------------------
| rollout/              |          |
|    ep_len_mean        | 92       |
|    ep_rew_mean        | -346     |
| time/                 |          |
|    fps                | 209      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.16    |
|    explained_variance | -0.181   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -4.89    |
|    value_loss         | 24.7     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 98.8     |
|    ep_rew_mean        | -384     |
| time/                 |          |
|    fps                | 202      |
|    iterations         | 200      |
|    time_elapsed       | 4        |
|    total_timesteps    | 1000     |
| train/                |          |
|

In [3]:
# Persist the trained A2C agent to disk.
# NOTE: `model` is rebound by every training cell in this notebook, so run
# this cell right after the A2C training cell (before the DQN one).
model.save(path="./saved_models/a2c-lunarlander")

In [4]:
# Train a DQN agent on LunarLander-v3 and report its evaluation reward.
# Metrics are written to stdout, a CSV file and TensorBoard under `tmp_path`.
tmp_path = "./results/dqn-lunarlander"
new_logger = configure(tmp_path, ["stdout", "csv", "tensorboard"])

env = gym.make("LunarLander-v3")
model = DQN(
    policy="MlpPolicy",
    env=env,
    learning_rate=4e-4,
    gamma=0.99,
    # Epsilon-greedy schedule: start fully random, end at 1% random actions.
    exploration_initial_eps=1.0,
    exploration_final_eps=0.01,
    # NOTE(review): epsilon is annealed over 99.9% of the training budget, so
    # exploration stays high for most of the run (the SB3 default is 0.1) —
    # confirm this is intended.
    exploration_fraction=0.999,
    # Fixed seed for the algorithm's pseudo-random generators so that
    # re-running the notebook from a fresh kernel gives a repeatable run.
    seed=0,
)

# Route training metrics through the logger configured above.
model.set_logger(new_logger)
model.learn(total_timesteps=200_000)

# Evaluate over 10 episodes on the environment held by the model.
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
# Both statistics use the same two-decimal formatting.
print(f'Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')

Logging to ./results/dqn-lunarlander
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 85.2     |
|    ep_rew_mean      | -229     |
|    exploration_rate | 0.998    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1087     |
|    time_elapsed     | 0        |
|    total_timesteps  | 341      |
| train/              |          |
|    learning_rate    | 0.0004   |
|    loss             | 4.79     |
|    n_updates        | 60       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 88.9     |
|    ep_rew_mean      | -171     |
|    exploration_rate | 0.996    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 932      |
|    time_elapsed     | 0        |
|    total_timesteps  | 711      |
| train/              |          |
|    learning_rate    | 0.0004   |
|    loss         

In [5]:
# Persist the trained DQN agent to disk.
# NOTE: `model` is rebound by every training cell in this notebook, so run
# this cell right after the DQN training cell.
model.save(path="./saved_models/dqn-lunarlander")