<a href="https://colab.research.google.com/github/juliakahan/MountainCarContinuous-with-PPO/blob/main/Mountain_Car_Continuous.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install gymnasium

In [None]:
!pip install "stable-baselines3[extra]>=2.0.0a4"

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.env_util import make_vec_env
import gymnasium as gym
import os
import time
import numpy as np
import matplotlib.pyplot as plt

In [7]:
# Saving logs to visulise in Tensorboard, saving models
models_dir = f"models/Mountain-{time.time()}"
logdir = f"logs/Mountain-{time.time()}"

if not os.path.exists(models_dir):
    os.makedirs(models_dir)

if not os.path.exists(logdir):
    os.makedirs(logdir)

In [8]:
# Parallel environments
env = make_vec_env("MountainCarContinuous-v0", n_envs=1)

In [9]:
# Define multiple sets of hyperparameters
hyperparams = [
    {"ent_coef": 0.001, "learning_rate": 0.0001, "n_epochs": 10},
    {"ent_coef": 0.005, "learning_rate": 0.0005, "n_epochs": 20},
    {"ent_coef": 0.01, "learning_rate": 0.001, "n_epochs": 30}
]
results = []

In [10]:
for hyperparam in hyperparams:
    model = PPO(
        policy=MlpPolicy,
        env=env,
        seed=0,
        batch_size=256,
        ent_coef=hyperparam["ent_coef"],
        learning_rate=hyperparam["learning_rate"],
        n_epochs=hyperparam["n_epochs"],
        n_steps=8,
        gae_lambda=0.9,
        gamma=0.9999,
        clip_range=0.1,
        max_grad_norm=5,
        vf_coef=0.19,
        use_sde=True,
        policy_kwargs=dict(log_std_init=-3.29, ortho_init=False),
        verbose=1,
        tensorboard_log=logdir
    )

    rewards_per_run = []

    TIMESTEPS = 50000
    for _ in range(10):
        model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False, tb_log_name="PPO")
        ep_info = model.ep_info_buffer
        mean_reward = np.mean([info['r'] for info in ep_info])
        rewards_per_run.append(mean_reward)

    results.append({
        "hyperparams": hyperparam,
        "mean_reward": np.mean(rewards_per_run),
        "std_reward": np.std(rewards_per_run)
    })

Using cpu device
Logging to logs/Mountain-1687120523.2720852/PPO_0
----------------------------
| time/              |     |
|    fps             | 148 |
|    iterations      | 1   |
|    time_elapsed    | 0   |
|    total_timesteps | 8   |
----------------------------


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=8 and n_envs=1)


[1;30;43mStrumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.[0m
|    loss                 | -0.147        |
|    n_updates            | 1601790       |
|    policy_gradient_loss | 1.15e-07      |
|    std                  | 1.76e+16      |
|    value_loss           | 20.5          |
-------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 540          |
|    ep_rew_mean          | 17           |
| time/                   |              |
|    fps                  | 50           |
|    iterations           | 3395         |
|    time_elapsed         | 533          |
|    total_timesteps      | 427160       |
| train/                  |              |
|    approx_kl            | 8.940697e-07 |
|    clip_fraction        | 0            |
|    clip_range           | 0.1          |
|    entropy_loss         | -42.9        |
|    explained_variance   | -28.3        |
|    learnin

ValueError: ignored

In [None]:
# Plotting the learning curves
x = range(1, len(hyperparams) + 1)
y_mean = [result["mean_reward"] for result in results]
y_std = [result["std_reward"] for result in results]

plt.errorbar(x, y_mean, y_std, linestyle="-", marker="o")
plt.xlabel("Zestaw hiperparametrów")
plt.ylabel("Nagroda")
plt.xticks(x, [str(hyperparam) for hyperparam in hyperparams])
plt.title("Krzywe uczenia dla różnych zestawów hiperparametrów")
plt.grid(True)
plt.show()