## Import Dependencies

In [1]:
from stable_baselines3 import PPO, DQN, A2C
import gymnasium as gym
from stable_baselines3.common.logger import configure
from stable_baselines3.common.evaluation import evaluate_policy

## Define Env Function

In [2]:
def make_bipedal_env(stage, render_mode=None):
    """Create the BipedalWalker-v3 environment for one curriculum stage.

    Args:
        stage: Mapping describing the stage. Only the optional "hardcore"
            key is read here; it defaults to False when absent.
        render_mode: Optional render mode forwarded to ``gym.make`` (for
            example "human"). When None, the argument is not passed at all,
            so the call is identical to the two-argument form used for
            training.

    Returns:
        Whatever ``gym.make`` returns for "BipedalWalker-v3".
    """
    # Imported locally so the factory stays self-contained.
    import gymnasium as gym

    make_kwargs = {"hardcore": stage.get("hardcore", False)}
    if render_mode is not None:
        # Only forward render_mode when the caller asked for one.
        make_kwargs["render_mode"] = render_mode

    return gym.make("BipedalWalker-v3", **make_kwargs)

# Curriculum for the short run: the standard environment first
# (hardcore=False), then the hardcore variant. "parcial_timesteps" is the
# number of timesteps handed to model.learn() for that stage.
bipedal_stages = [
    dict(name="easy", hardcore=False, parcial_timesteps=250_000),
    dict(name="hardcore", hardcore=True, parcial_timesteps=250_000),
]


In [8]:
# Curriculum training (short run): one PPO model is trained through every
# entry of `bipedal_stages` in order, keeping its weights between stages.
tmp_path = "./results/ppo-curriculum-bipedalwalker-2"
new_logger = configure(tmp_path, ["stdout", "csv", "tensorboard"])

model = None

for i, stage in enumerate(bipedal_stages):
    print(f"🚶‍♂️ Treinando estágio {i+1}: {stage['name']}")
    env = make_bipedal_env(stage)

    try:
        if model is None:
            # First stage: build the model and attach the shared logger.
            model = PPO("MlpPolicy", env, verbose=1)
            model.set_logger(new_logger)
        else:
            # Later stages: keep the trained weights, swap the environment.
            model.set_env(env)

        # reset_num_timesteps=False keeps the timestep counter (and the log
        # curves) continuous across stages.
        model.learn(stage.get("parcial_timesteps", 100_000),
                    reset_num_timesteps=False)
        print(f"🏆 Avaliando o modelo no estágio {i+1}: {stage['name']}")
        # Evaluate on the env held by the model: it is the Monitor-wrapped,
        # vectorized version of `env`, whereas the raw `env` is unwrapped.
        mean_reward, std_reward = evaluate_policy(model, model.get_env(),
                                                  n_eval_episodes=10)
        print(f"🏆 Avaliação média: {mean_reward:.2f} +/- {std_reward:.2f}")
        model.save(f"./saved_models/ppo_bipedal_stage_{i+1}")
    finally:
        # Release the stage's environment even if training is interrupted.
        env.close()


Logging to ./results/ppo-curriculum-bipedalwalker-2
🚶‍♂️ Treinando estágio 1: easy
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 387      |
|    ep_rew_mean     | -112     |
| time/              |          |
|    fps             | 328      |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 520          |
|    ep_rew_mean          | -113         |
| time/                   |              |
|    fps                  | 256          |
|    iterations           | 2            |
|    time_elapsed         | 15           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0061618695 |
|    clip

In [10]:
# Save the model currently bound to `model` (left by the training cell above)
# under a run-level name, in addition to the per-stage checkpoints.
model.save("./saved_models/ppo-curriculum-bipedalwalker-2")

## Inference

In [None]:
# Watch one rendered episode of the trained model on the standard
# (hardcore=False) environment.
env_eval = gym.make("BipedalWalker-v3",
                    hardcore=False,
                    render_mode="human")
try:
    mean_reward, std_reward = evaluate_policy(model,
                                              env_eval,
                                              n_eval_episodes=1,
                                              render=True)
finally:
    # Always close the env so the render window does not stay open when the
    # episode is interrupted or evaluation raises.
    env_eval.close()

In [18]:
# Watch one rendered episode of the trained model on the hardcore
# (hardcore=True) environment.
env_eval = gym.make("BipedalWalker-v3",
                    hardcore=True,
                    render_mode="human")
try:
    mean_reward, std_reward = evaluate_policy(model,
                                              env_eval,
                                              n_eval_episodes=1,
                                              render=True)
finally:
    # Always close the env so the render window does not stay open when the
    # episode is interrupted or evaluation raises.
    env_eval.close()



In [16]:
# Manual cleanup: closes whichever environment is currently bound to
# `env_eval`.
# NOTE(review): the execution counts suggest this cell was run out of order to
# close a leftover render window; it looks redundant when the inference cells
# above run to completion — confirm and consider deleting.
env_eval.close()

## Long Training Run (1M timesteps per stage)

In [3]:
# Curriculum for the long run: same two stages as the short run, but each
# stage hands 1,000,000 timesteps to model.learn(). Rebinding
# `bipedal_stages` replaces the earlier short-run definition.
bipedal_stages = [
    dict(name="easy", hardcore=False, parcial_timesteps=1_000_000),
    dict(name="hardcore", hardcore=True, parcial_timesteps=1_000_000),
]

In [None]:
# Curriculum training (long run): one PPO model is trained through every
# entry of `bipedal_stages` in order, keeping its weights between stages.
tmp_path = "./results/ppo-curriculum-bipedalwalker-long"
new_logger = configure(tmp_path, ["stdout", "csv", "tensorboard"])

model = None

for i, stage in enumerate(bipedal_stages):
    print(f"🚶‍♂️ Treinando estágio {i+1}: {stage['name']}")
    env = make_bipedal_env(stage)

    try:
        if model is None:
            # First stage: build the model and attach the shared logger.
            model = PPO("MlpPolicy", env, verbose=1)
            model.set_logger(new_logger)
        else:
            # Later stages: keep the trained weights, swap the environment.
            model.set_env(env)

        # reset_num_timesteps=False keeps the timestep counter (and the log
        # curves) continuous across stages.
        model.learn(stage.get("parcial_timesteps", 100_000),
                    reset_num_timesteps=False)
        print(f"🏆 Avaliando o modelo no estágio {i+1}: {stage['name']}")
        # Evaluate on the env held by the model: it is the Monitor-wrapped,
        # vectorized version of `env`, whereas the raw `env` is unwrapped.
        mean_reward, std_reward = evaluate_policy(model, model.get_env(),
                                                  n_eval_episodes=10)
        print(f"🏆 Avaliação média: {mean_reward:.2f} +/- {std_reward:.2f}")
        model.save(f"./saved_models/ppo_bipedal_long_stage_{i+1}")
    finally:
        # Release the stage's environment even if training is interrupted.
        env.close()


Logging to ./results/ppo-curriculum-bipedalwalker-long
🚶‍♂️ Treinando estágio 1: easy
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.6e+03  |
|    ep_rew_mean     | -106     |
| time/              |          |
|    fps             | 197      |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 828        |
|    ep_rew_mean          | -106       |
| time/                   |            |
|    fps                  | 159        |
|    iterations           | 2          |
|    time_elapsed         | 25         |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.01150492 |
|    clip_fraction        | 0.163      |
|    clip_range           | 0.2        |
|    entropy_loss         | -5.66      |
|    explained_variance   | -0.194     |
|    learning_rate        | 0.0003     |
|   

In [None]:
# Save the model currently bound to `model` (left by the long training cell
# above) under a run-level name. The path has no placeholders, so it is a
# plain string literal rather than an f-string.
model.save("./saved_models/ppo_curriculum_bipedal_walker_long")