In [8]:
import gymnasium as gym
import numpy as np

from stable_baselines3 import PPO
from stable_baselines3.common.base_class import BaseAlgorithm
from stable_baselines3.ppo.policies import MlpPolicy

### Define custom evaluation function

In [4]:
def evaluate(
    model: BaseAlgorithm,
    num_episodes: int = 100,
    deterministic: bool = True,
) -> float:
    """
    Evaluate an RL agent averaged across 'num_episodes'.

    :param model: the RL agent
    :param env: the gymnasium Environment
    :param num_episodes: number of episodes to evaluate
    :param deterministic: choice of deterministic or stochastic actions
    :return: mean reward over 'num_episodes'
    """

    vec_env = model.get_env()
    obs = vec_env.reset()
    all_episode_rewards = []

    for _ in range(num_episodes):
        episode_rewards = []
        done = False

        while not done:
            action, _states = model.predict(obs, deterministic=deterministic)
            obs, reward, done, _info = vec_env.step(action)
            episode_rewards.append(reward)

        all_episode_rewards.append(sum(episode_rewards))

    mean_episode_reward = np.mean(all_episode_rewards)
    print(f"Mean reward: {mean_episode_reward:.2f} - Num episodes: {num_episodes}")

    return mean_episode_reward

### Evaluate untrained agent

In [9]:
# setup env and initialize untrained agent
env = gym.make("CartPole-v1")
model = PPO(MlpPolicy, env, verbose=0)

In [10]:
# evaluate untrained agent with defined evoluation function
mean_reward_untrained = evaluate(model)

Mean reward: 9.02 - Num episodes: 100


In [11]:
# compare with built-in sb3 helper
from stable_baselines3.common.evaluation import evaluate_policy

In [13]:
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100, warn=False)
print(f"Mean reward: {mean_reward:.2f}, std reward: {std_reward:.2f}")

Mean reward: 9.05, std reward: 0.70


### Train and evaluate agent

In [14]:
model.learn(total_timesteps=10_000)

<stable_baselines3.ppo.ppo.PPO at 0x15afaefd0>

In [16]:
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100, warn=False)

print(f"Mean reward: {mean_reward:.2f}, std reward: {std_reward:.2f}")

Mean reward: 399.94, std reward: 111.18


### Prepare video recording

In [17]:
# setup fake display
import os 
os.system("Xvfb :1 -screen 0 1024x768x24 &")
os.environ['DISPLAY'] = ':1'

sh: Xvfb: command not found


In [23]:
import base64
from pathlib import Path

from IPython import display as ipythondisplay


def show_videos(video_path="", prefix=""):
    """
    Taken from https://github.com/eleurent/highway-env

    :param video_path: (str) Path to the folder containing videos
    :param prefix: (str) Filter the video, showing only the only starting with this prefix
    """
    html = []
    for mp4 in Path(video_path).glob("{}*.mp4".format(prefix)):
        video_b64 = base64.b64encode(mp4.read_bytes())
        html.append(
            """<video alt="{}" autoplay 
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>""".format(
                mp4, video_b64.decode("ascii")
            )
        )
    ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(html)))

In [28]:
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv


def record_video(env_id, model, video_length=500, prefix="", video_folder="videos/"):
    """
    :param env_id: (str)
    :param model: (RL model)
    :param video_length: (int)
    :param prefix: (str)
    :param video_folder: (str)
    """
    eval_env = DummyVecEnv([lambda: gym.make(env_id, render_mode="rgb_array")])
    # Start the video at step=0 and record 500 steps
    eval_env = VecVideoRecorder(
        eval_env,
        video_folder=video_folder,
        record_video_trigger=lambda step: step == 0,
        video_length=video_length,
        name_prefix=prefix,
    )


In [33]:
eval_env = DummyVecEnv([lambda: gym.make("CartPole-v1", render_mode="rgb_array")])
obs = eval_env.reset()
action, _  = model.predict(obs)
obs, _, _, _ = eval_env.step(action)

### Visualize trained agent

In [26]:
record_video("CartPole-v1", model, video_length=500, prefix="ppo-cartpole")

Saving video to /Users/ben/code/sb3/videos/ppo-cartpole-step-0-to-step-500.mp4
Moviepy - Building video /Users/ben/code/sb3/videos/ppo-cartpole-step-0-to-step-500.mp4.
Moviepy - Writing video /Users/ben/code/sb3/videos/ppo-cartpole-step-0-to-step-500.mp4



TypeError: must be real number, not NoneType