In [2]:
from stable_baselines3 import PPO, DQN, A2C
import gymnasium as gym
from stable_baselines3.common.logger import configure
from stable_baselines3.common.evaluation import evaluate_policy

# Baseline experiment: PPO on the stock LunarLander-v3 environment. Only the
# policy, the environment and the seed are passed to PPO, so every other
# hyperparameter is the library default.

# Fixed seed so that Restart & Run All repeats the same training run instead of
# a fresh random one each time. Change the value to sample a different run.
SEED = 0

# Training metrics are written to stdout, a CSV file and TensorBoard event
# files under this directory.
tmp_path = "./results/ppo-baseline-lunarlander"
new_logger = configure(tmp_path, ["stdout", "csv", "tensorboard"])

env = gym.make("LunarLander-v3")
model = PPO(policy="MlpPolicy", env=env, seed=SEED)

# Swap in the logger configured above before training starts so that every
# iteration is recorded in all three formats.
model.set_logger(new_logger)
model.learn(total_timesteps=400_000)

# Score the trained policy over 10 episodes on the environment held by the
# model, i.e. the same environment it was trained on.
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print(f'Mean reward: {mean_reward} +/- {std_reward:.2f}')

Logging to ./results/ppo-baseline-lunarlander
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 94.9     |
|    ep_rew_mean     | -158     |
| time/              |          |
|    fps             | 565      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 94.4        |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 433         |
|    iterations           | 2           |
|    time_elapsed         | 9           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.013171007 |
|    clip_fraction        | 0.0303      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.38       |
|    explained_variance   

In [3]:
from stable_baselines3 import PPO, DQN, A2C
import gymnasium as gym
from stable_baselines3.common.logger import configure
from stable_baselines3.common.evaluation import evaluate_policy

# Longer experiment: PPO on LunarLander-v3 with wind enabled, trained for
# 2,000,000 timesteps. Only the policy, the environment and the seed are passed
# to PPO, so every other hyperparameter is the library default.

# Fixed seed so that Restart & Run All repeats the same training run instead of
# a fresh random one each time. Change the value to sample a different run.
SEED = 0

# Training metrics are written to stdout, a CSV file and TensorBoard event
# files under this directory.
tmp_path = "./results/ppo-baseline-lunarlander-long-1"
new_logger = configure(tmp_path, ["stdout", "csv", "tensorboard"])

# Environment options are spelled out explicitly so the setup is visible in the
# notebook. NOTE(review): per the Gymnasium LunarLander docs, every value here
# except enable_wind=True appears to equal the environment default -- confirm
# against the installed Gymnasium version.
env = gym.make(
    "LunarLander-v3",
    continuous=False,
    gravity=-10.0,
    enable_wind=True,
    wind_power=15.0,
    turbulence_power=1.5,
)

model = PPO(policy="MlpPolicy", env=env, seed=SEED)

# Swap in the logger configured above before training starts so that every
# iteration is recorded in all three formats.
model.set_logger(new_logger)
model.learn(total_timesteps=2_000_000)

# Score the trained policy over 10 episodes on the environment held by the
# model, i.e. the same wind-enabled environment it was trained on.
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print(f'Mean reward: {mean_reward} +/- {std_reward:.2f}')

Logging to ./results/ppo-baseline-lunarlander-long-1




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 85.8     |
|    ep_rew_mean     | -283     |
| time/              |          |
|    fps             | 485      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 86.4         |
|    ep_rew_mean          | -262         |
| time/                   |              |
|    fps                  | 382          |
|    iterations           | 2            |
|    time_elapsed         | 10           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0063920487 |
|    clip_fraction        | 0.0335       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.38        |
|    explained_variance   | 0.00228      |
|    learning_r

In [4]:
# Persist whatever is currently bound to `model`. When the cells are run in
# order this is the PPO agent trained in the preceding cell.
model.save("./saved_models/ppo_baseline_lunar_long-1")