In [5]:
from stable_baselines3 import PPO, DQN, A2C
import gymnasium as gym
from stable_baselines3.common.logger import configure
from stable_baselines3.common.evaluation import evaluate_policy

# --- A2C on CartPole-v1: train, then report the average evaluation return ---

# Training metrics are written under this directory in three formats:
# stdout tables, a CSV file, and TensorBoard event files.
tmp_path = "./results/a2c-cartpole"
new_logger = configure(tmp_path, ["stdout", "csv", "tensorboard"])

env = gym.make("CartPole-v1")
# A fixed seed makes this stochastic run repeatable after a kernel restart
# (same machine and library versions); without it every re-run yields a
# different learning curve and a different final score.
model = A2C(policy="MlpPolicy", env=env, seed=0)

# Attach the custom logger before learn() so the whole run is recorded in tmp_path.
model.set_logger(new_logger)
model.learn(total_timesteps=100_000)

# Score the trained policy over 10 episodes on the environment held by the model.
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print(f'Mean reward: {mean_reward} +/- {std_reward:.2f}')

Logging to ./results/a2c-cartpole




------------------------------------
| rollout/              |          |
|    ep_len_mean        | 30.6     |
|    ep_rew_mean        | 30.6     |
| time/                 |          |
|    fps                | 168      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.687   |
|    explained_variance | -0.0903  |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 1.88     |
|    value_loss         | 10.1     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 32.1     |
|    ep_rew_mean        | 32.1     |
| time/                 |          |
|    fps                | 155      |
|    iterations         | 200      |
|    time_elapsed       | 6        |
|    total_timesteps    | 1000     |
| train/                |          |
|

In [6]:
# Persist the trained agent currently bound to `model`.
# NOTE: the DQN training cell rebinds `model`, so run this cell right after
# the A2C training cell — otherwise the wrong agent is written to this path.
model.save("./saved_models/a2c_cartpole")

In [3]:
# --- DQN on CartPole-v1: train, then report the average evaluation return ---
# NOTE: this cell rebinds `tmp_path`, `new_logger`, `env` and `model`, which the
# A2C cell also assigns; save the A2C agent before running it.

# Training metrics are written under this directory in three formats:
# stdout tables, a CSV file, and TensorBoard event files.
tmp_path = "./results/dqn-cartpole"
new_logger = configure(tmp_path, ["stdout", "csv", "tensorboard"])

env = gym.make("CartPole-v1")
model = DQN(
    policy="MlpPolicy",
    env=env,
    learning_rate=4e-4,
    gamma=0.99,
    # Epsilon-greedy schedule: start fully random, end at 1% random actions.
    exploration_initial_eps=1.0,
    exploration_final_eps=0.01,
    # NOTE(review): 0.999 spreads the epsilon decay over ~99.9% of training, so
    # the agent only becomes near-greedy in the last ~100 steps — confirm this
    # is intended and not meant as a per-step decay rate.
    exploration_fraction=0.999,
    # A fixed seed makes this stochastic run repeatable after a kernel restart
    # (same machine and library versions).
    seed=0,
)

# Attach the custom logger before learn() so the whole run is recorded in tmp_path.
model.set_logger(new_logger)
model.learn(total_timesteps=100_000)

# Score the trained policy over 10 episodes on the environment held by the model.
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print(f'Mean reward: {mean_reward} +/- {std_reward:.2f}')

Logging to ./results/dqn-cartpole
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 20.2     |
|    ep_rew_mean      | 20.2     |
|    exploration_rate | 0.999    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 6721     |
|    time_elapsed     | 0        |
|    total_timesteps  | 81       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 20.1     |
|    ep_rew_mean      | 20.1     |
|    exploration_rate | 0.998    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1214     |
|    time_elapsed     | 0        |
|    total_timesteps  | 161      |
| train/              |          |
|    learning_rate    | 0.0004   |
|    loss             | 0.443    |
|    n_updates        | 15       |
----------------------------------
----------------------------------
| rollout/           

In [4]:
# Persist the trained agent currently bound to `model`.
# NOTE: the A2C training cell rebinds `model`, so run this cell right after
# the DQN training cell — otherwise the wrong agent is written to this path.
model.save("./saved_models/dqn-cartpole")