# 安装环境

In [5]:
!pip install gymnasium stable-baselines3

Collecting gymnasium
  Using cached gymnasium-0.29.1-py3-none-any.whl (953 kB)
Collecting stable-baselines3
  Downloading stable_baselines3-2.2.1-py3-none-any.whl (181 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.7/181.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium, stable-baselines3
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1 stable-baselines3-2.2.1


# 导入gym库和numpy库

In [6]:
import gymnasium as gym
import numpy as np

# 从stable_baselines3中导入PPO库

In [8]:
from stable_baselines3 import PPO

# 从stable_baselines3.ppo.policies中导入MplPolicy库

In [9]:
from stable_baselines3.ppo.policies import MlpPolicy

# 创建环境和agent

In [10]:
env = gym.make("CartPole-v1")

model = PPO(MlpPolicy, env, verbose=0)

# 创建一个助手函数来进化agent

In [15]:
from stable_baselines3.common.base_class import BaseAlgorithm

def evaluate(
    model : BaseAlgorithm,
    num_episodes : int = 100,
    deterministic : bool = True,
) -> float:
  """
  Evaluate an RL agent for `num_episodes`.

  :param model: the RL Agent
  :param env: the gym Environment
  :param num_episodes: number of episodes to evaluate it
  :param deterministic: Whether to use deterministic or stochastic actions
  :return: Mean reward for the last `num_episodes`
  """

  vec_env = model.get_env()
  obs = vec_env.reset()
  all_episode_rewards = []
  for _ in range(num_episodes):
    episode_rewards = []
    done = False
    while not done:
      action, _states = model.predict(obs, deterministic=deterministic)
      obs, reward, done, _info = vec_env.step(action)
      episode_rewards.append(reward)
    all_episode_rewards.append(sum(episode_rewards))
  mean_episode_reward = np.mean(all_episode_rewards)
  print(f"Mean reward: {mean_episode_reward:.2f} - Num episodes: {num_episodes}")

  return mean_episode_reward

# 评估一个未训练的agent

In [16]:
mean_reward_before_train = evaluate(model, num_episodes=100, deterministic=True)

Mean reward: 9.29 - Num episodes: 100


# 在Stable_Baselines3中已经写好了这样的助手函数

In [20]:
from stable_baselines3.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100, warn = False)
print(f"mean_reward: {mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward: 9.36 +/- 0.70


# 训练agent

In [32]:
model.learn(total_timesteps=100_000)

<stable_baselines3.ppo.ppo.PPO at 0x7a392c877280>

# 评估训练后的agent

In [33]:
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100, warn = False)
print(f"mean_reward: {mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward: 500.00 +/- 0.00


# 准备屏幕录制

In [23]:
import os
os.system("Xvfb :1 -screen 0 1024*768*24 &")
os.environ['DISPLAY'] = ':1'

In [38]:
import base64
from pathlib import Path

from IPython import display as ipythondisplay

def show_videos(video_path="", prefix=""):

  """
  Taken from https://github.com/eleurent/highway-env

  :param video_path: (str) Path to the folder containing videos
  :param prefix: (str) Filter the video, showing only the only starting with this prefix
  """
  html = []
  for mp4 in Path(video_path).glob("{}*.mp4".format(prefix)):
    video_b64 = base64.b64decode(mp4.read_bytes())
    html.append(
      """<video alt="{}" autoplay
          loop controls style="height: 400px;">
          <source src="data:video/mp4;base64,{}" type="video/mp4" />
        </video>""".format(
        mp4, video_b64.decode("ascii")
      )
    )
  ipythondisplay.display(ipythondisplay.HTML(data=",<br>".join(html)))

# 使用VecVideoRecorder包装器录制视频

In [27]:
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv

def record_video(env_id, model, video_length=500, prefix="", video_folder="videos/"):
    """
    :param env_id: (str)
    :param model: (RL model)
    :param video_length: (int)
    :param prefix: (str)
    :param video_folder: (str)
    """
    eval_env = DummyVecEnv([lambda: gym.make(env_id, render_mode="rgb_array")])
    # Start the video at step=0 and record 500 steps
    eval_env = VecVideoRecorder(
        eval_env,
        video_folder=video_folder,
        record_video_trigger=lambda step: step == 0,
        video_length=video_length,
        name_prefix=prefix,
    )

    obs = eval_env.reset()
    for _ in range(video_length):
        action, _ = model.predict(obs)
        obs, _, _, _ = eval_env.step(action)

    # Close the video recorder
    eval_env.close()

# 可视化训练后的agent

In [34]:
record_video("CartPole-v1", model, video_length=500, prefix="ppo-carpole")

Saving video to /content/videos/ppo-carpole-step-0-to-step-500.mp4
Moviepy - Building video /content/videos/ppo-carpole-step-0-to-step-500.mp4.
Moviepy - Writing video /content/videos/ppo-carpole-step-0-to-step-500.mp4





Moviepy - Done !
Moviepy - video ready /content/videos/ppo-carpole-step-0-to-step-500.mp4


In [37]:
show_videos("videos", prefix="ppo")

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8a in position 3: invalid start byte