In [9]:
import gymnasium
from stable_baselines3 import A2C
import matplotlib.pyplot as plt
import pandas as pd

In [10]:
%matplotlib inline
plt.rcParams['figure.figsize'] = (10, 10)
plt.rcParams['image.interpolation'] = 'nearest'

In [11]:
from typing import Final, Any, Dict, List

# Seed handed to every A2C instance created by this notebook.
SEED: Final[int] = 42

# Candidate A2C settings; each dict is unpacked into the A2C constructor
# as keyword arguments.
HYPERPARAMETERS: List[Dict[str, Any]] = [
    dict(learning_rate=0.0005, n_steps=15, gamma=0.99),
    dict(learning_rate=0.0003, n_steps=10, gamma=0.999),
    dict(learning_rate=0.0008, n_steps=25, gamma=0.97),
]

# Training schedule: EPOCHS rounds of STEPS_PER_EPOCH timesteps each.
EPOCHS: Final[int] = 10
STEPS_PER_EPOCH: Final[int] = 50_000
TOTAL_TIME_STEPS: Final[int] = EPOCHS * STEPS_PER_EPOCH

# Result accumulators. generate_with_hyperparameters() appends one inner
# list to each of these per call and fills it as the run progresses.
steps: List = []          # global step index of every recorded reward
rewards: List = []        # every recorded per-step reward
means: List = []          # per-epoch reward mean
s_devs: List = []         # per-epoch reward spread
last_in_epoch: List = []  # last reward recorded in each epoch

In [12]:
def generate_with_hyperparameters(hyperparams: Dict[str, Any]) -> None:
    """Train A2C on LunarLander-v2 and record one evaluation episode per epoch.

    A single A2C model is trained for ``EPOCHS`` rounds of ``STEPS_PER_EPOCH``
    timesteps each, continuing from the previous round rather than starting
    over. After every round the current policy plays one deterministic
    episode on a separate evaluation environment, and the per-step rewards
    of that episode are recorded.

    Side effects: appends one new inner list to each of the module-level
    accumulators ``steps``, ``rewards``, ``means``, ``s_devs`` and
    ``last_in_epoch`` and fills it in place:

    * ``steps`` / ``rewards`` -- one entry per evaluation step,
    * ``means`` / ``s_devs`` -- mean and population standard deviation of the
      rewards of that epoch's evaluation episode,
    * ``last_in_epoch`` -- the final reward of that epoch's evaluation episode.

    Args:
        hyperparams: Keyword arguments forwarded to the ``A2C`` constructor
            (e.g. ``learning_rate``, ``n_steps``, ``gamma``).
    """
    run_steps: List[int] = []
    run_rewards: List[float] = []
    run_means: List[float] = []
    run_s_devs: List[float] = []
    run_last: List[float] = []

    # Register the per-run lists up front so partial results survive an
    # interrupted run. Only mutation is needed, hence no `global` statement.
    steps.append(run_steps)
    rewards.append(run_rewards)
    means.append(run_means)
    s_devs.append(run_s_devs)
    last_in_epoch.append(run_last)

    # Separate environments: stepping the training env by hand between
    # learn() calls would desynchronise it from the model's cached state.
    train_env = gymnasium.make("LunarLander-v2")
    eval_env = gymnasium.make("LunarLander-v2")

    try:
        # Seed the evaluation env once; later resets continue its RNG stream.
        eval_env.reset(seed=SEED)

        # Build the model once: re-creating it every epoch with the same seed
        # would retrain an identical model from scratch each time.
        model = A2C("MlpPolicy", train_env, verbose=1, seed=SEED, **hyperparams)

        for epoch in range(EPOCHS):
            # reset_num_timesteps=False keeps the timestep counter (and thus
            # the training progress) cumulative across epochs.
            model.learn(total_timesteps=STEPS_PER_EPOCH, reset_num_timesteps=False)

            obs, _ = eval_env.reset()
            epoch_rewards: List[float] = []

            for i in range(STEPS_PER_EPOCH):
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, terminated, truncated, _ = eval_env.step(action.item())

                run_steps.append(epoch * STEPS_PER_EPOCH + i + 1)
                run_rewards.append(reward)
                epoch_rewards.append(reward)

                # Evaluate a single episode per epoch.
                if terminated or truncated:
                    break

            # Statistics are taken over the steps actually played this epoch,
            # not over a fixed STEPS_PER_EPOCH window.
            count = len(epoch_rewards)
            mean = sum(epoch_rewards) / count
            variance = sum((x - mean) ** 2 for x in epoch_rewards) / count

            run_last.append(epoch_rewards[-1])
            run_means.append(mean)
            run_s_devs.append(variance ** 0.5)  # std dev = sqrt(variance)
    finally:
        # Release both environments even if training is interrupted.
        eval_env.close()
        train_env.close()

# Run the experiment for the first hyperparameter set only.
generate_with_hyperparameters(HYPERPARAMETERS[0])



Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 96.1     |
|    ep_rew_mean        | -206     |
| time/                 |          |
|    fps                | 978      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 1500     |
| train/                |          |
|    entropy_loss       | -1.22    |
|    explained_variance | -0.0372  |
|    learning_rate      | 0.0005   |
|    n_updates          | 99       |
|    policy_loss        | 7.84     |
|    value_loss         | 68.3     |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 115       |
|    ep_rew_mean        | -305      |
| time/                 |           |
|    fps                | 1005      |
|    iterations         | 200       |
|    time_e

KeyboardInterrupt: 