In [1]:
from stable_baselines3 import PPO, DQN, A2C
import gymnasium as gym
from stable_baselines3.common.logger import configure
from stable_baselines3.common.evaluation import evaluate_policy

# Baseline experiment: train PPO (MlpPolicy, library-default hyperparameters)
# on BipedalWalker-v3 and report the mean evaluation reward.
SEED = 0  # fixed seed so repeated runs of this cell are comparable

# Training metrics go to stdout, a CSV file and TensorBoard under tmp_path.
tmp_path = "./results/ppo-baseline-bipedalwalker"
new_logger = configure(tmp_path, ["stdout", "csv", "tensorboard"])

env = gym.make("BipedalWalker-v3")
model = PPO(policy="MlpPolicy", env=env, seed=SEED)

# The custom logger must be attached before learn() so training is recorded.
model.set_logger(new_logger)
model.learn(total_timesteps=500_000)

# Evaluate on the environment held by the model, averaged over 10 episodes.
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print(f'Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')

Logging to ./results/ppo-baseline-bipedalwalker




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 461      |
|    ep_rew_mean     | -101     |
| time/              |          |
|    fps             | 577      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 417         |
|    ep_rew_mean          | -105        |
| time/                   |             |
|    fps                  | 440         |
|    iterations           | 2           |
|    time_elapsed         | 9           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008537619 |
|    clip_fraction        | 0.104       |
|    clip_range           | 0.2         |
|    entropy_loss         | -5.68       |
|    explained_variance   | -0.000576   |
|    learning_rate        | 0.

In [1]:
from stable_baselines3 import PPO, DQN, A2C
import gymnasium as gym
from stable_baselines3.common.logger import configure
from stable_baselines3.common.evaluation import evaluate_policy

# Hardcore experiment: train PPO (MlpPolicy, library-default hyperparameters)
# on BipedalWalker-v3 with hardcore=True and report the mean evaluation reward.
SEED = 0  # fixed seed so repeated runs of this cell are comparable

# Training metrics go to stdout, a CSV file and TensorBoard under tmp_path.
tmp_path = "./results/ppo-baseline-bipedalwalker-hardcore"
new_logger = configure(tmp_path, ["stdout", "csv", "tensorboard"])

env = gym.make("BipedalWalker-v3",
               hardcore=True)
model = PPO(policy="MlpPolicy", env=env, seed=SEED)

# The custom logger must be attached before learn() so training is recorded.
model.set_logger(new_logger)
model.learn(total_timesteps=500_000)

# Evaluate on the environment held by the model, averaged over 10 episodes.
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print(f'Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')

Logging to ./results/ppo-baseline-bipedalwalker-hardcore




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 69.6     |
|    ep_rew_mean     | -108     |
| time/              |          |
|    fps             | 630      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 308          |
|    ep_rew_mean          | -107         |
| time/                   |              |
|    fps                  | 479          |
|    iterations           | 2            |
|    time_elapsed         | 8            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0062741237 |
|    clip_fraction        | 0.0456       |
|    clip_range           | 0.2          |
|    entropy_loss         | -5.68        |
|    explained_variance   | -0.0012      |
|    learning_r

In [2]:
# Write the trained hardcore model to disk under ./saved_models.
model.save(path="./saved_models/ppo-baseline-bipedalwalker-hardcore")

## Inference

In [3]:
# Visual check: run the trained model for one episode in a rendered
# hardcore environment and report the reward it obtains.
env_eval = gym.make("BipedalWalker-v3",
                    hardcore=True,
                    render_mode="human")
try:
    mean_reward, std_reward = evaluate_policy(model,
                                              env_eval,
                                              n_eval_episodes=1,
                                              render=True)
finally:
    # Always release the environment (and its render window), even if the
    # evaluation raises or is interrupted.
    env_eval.close()

# With a single episode the std is 0; it is printed for a consistent format.
print(f'Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')



## Long training run (hardcore, 2M timesteps)

In [1]:
from stable_baselines3 import PPO, DQN, A2C
import gymnasium as gym
from stable_baselines3.common.logger import configure
from stable_baselines3.common.evaluation import evaluate_policy

# Long hardcore experiment: train PPO (MlpPolicy, library-default
# hyperparameters) on BipedalWalker-v3 with hardcore=True for 2M timesteps
# and report the mean evaluation reward.
SEED = 0  # fixed seed so repeated runs of this cell are comparable

# Training metrics go to stdout, a CSV file and TensorBoard under tmp_path.
tmp_path = "./results/ppo-baseline-bipedalwalker-hardcore-long"
new_logger = configure(tmp_path, ["stdout", "csv", "tensorboard"])

env = gym.make("BipedalWalker-v3",
               hardcore=True)
model = PPO(policy="MlpPolicy", env=env, seed=SEED)

# The custom logger must be attached before learn() so training is recorded.
model.set_logger(new_logger)
model.learn(total_timesteps=2_000_000)

# Evaluate on the environment held by the model, averaged over 10 episodes.
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print(f'Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')

Logging to ./results/ppo-baseline-bipedalwalker-hardcore-long




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 461      |
|    ep_rew_mean     | -116     |
| time/              |          |
|    fps             | 385      |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 512         |
|    ep_rew_mean          | -111        |
| time/                   |             |
|    fps                  | 318         |
|    iterations           | 2           |
|    time_elapsed         | 12          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008919507 |
|    clip_fraction        | 0.0672      |
|    clip_range           | 0.2         |
|    entropy_loss         | -5.68       |
|    explained_variance   | 0.00983     |
|    learning_rate        | 0.

In [2]:
# Write the long-run hardcore model to disk under ./saved_models.
model.save(path="./saved_models/ppo-baseline-bipedalwalker-hardcore-long")