# Gym Wrappers
了解如何使用 Gym Wrappers：它们可以实现监控、归一化、限制步数、特征增强等功能。

同时介绍模型的加载和保存，以及如何读取输出的文件以便导出。

In [1]:
!pip install swig
!pip install stable-baselines3



# 导入gym和RL算法库

In [2]:
import gymnasium as gym
from stable_baselines3 import A2C, SAC, PPO, TD3

# 保存和加载模型

In [3]:
import os

# Directory that receives the saved model; created on demand.
save_dir = "/tmp/gym/"
os.makedirs(save_dir, exist_ok=True)

# Build the checkpoint path once so that save() and load() always agree;
# os.path.join also avoids the doubled slash of f"{save_dir}/PPO_tutorial".
model_path = os.path.join(save_dir, "PPO_tutorial")

# Train briefly, then persist the model.
model = PPO("MlpPolicy", "Pendulum-v1", verbose=0).learn(8_000)
model.save(model_path)

# A random observation used to compare predictions before and after reloading.
obs = model.env.observation_space.sample()

print("pre saved", model.predict(obs, deterministic=True))

# Drop the trained model so the next prediction can only come from the saved file.
del model

loaded_model = PPO.load(model_path)
print("loaded", loaded_model.predict(obs, deterministic=True))

pre saved (array([0.00035608], dtype=float32), None)
loaded (array([0.00035608], dtype=float32), None)


# Gym 和 VecEnv 包装器

## 自定义一个简单的包装器

In [4]:
class CustomWrapper(gym.Wrapper):
    """
    Pass-through wrapper: reset and step are forwarded to the wrapped environment unchanged.

    :param env: (gym.Env) Gym environment that will be wrapped
    """

    def __init__(self, env):
        # The parent constructor is what makes self.env available later on
        super().__init__(env)

    def reset(self, **kwargs):
        """
        Reset the wrapped environment.

        :param kwargs: keyword arguments forwarded to the wrapped environment's reset
        :return: (np.ndarray, dict) first observation, additional information
        """
        first_obs, reset_info = self.env.reset(**kwargs)
        return first_obs, reset_info

    def step(self, action):
        """
        Forward one action to the wrapped environment.

        :param action: ([float] or int) Action taken by the agent
        :return: (np.ndarray, float, bool, bool, dict) observation, reward, is this a final state (episode finished),
        is the max number of steps reached (episode finished artificially), additional information
        """
        transition = self.env.step(action)
        next_obs, reward, terminated, truncated, step_info = transition
        return next_obs, reward, terminated, truncated, step_info

## 限制episode长度
包装器的一个实际用例是限制每个回合（episode）的步数：达到上限时，需要覆盖截断（truncated）信号。在 info 字典中传递该信息也是一个很好的做法（下面的示例只覆盖了 truncated 信号，没有修改 info 字典）。

In [5]:
class TimeLimitWrapper(gym.Wrapper):
    """
    Flag an episode as truncated once a fixed number of steps has been taken.

    :param env: (gym.Env) Gym environment that will be wrapped
    :param max_steps: (int) Max number of steps per episode
    """

    def __init__(self, env, max_steps=100):
        super().__init__(env)
        # Steps taken since the last reset
        self.current_step = 0
        self.max_steps = max_steps

    def reset(self, **kwargs):
        """
        Restart the step counter, then reset the wrapped environment.

        :param kwargs: keyword arguments forwarded to the wrapped environment's reset
        :return: whatever the wrapped environment's reset returns
        """
        self.current_step = 0
        return self.env.reset(**kwargs)

    def step(self, action):
        """
        Step the wrapped environment and enforce the step limit.

        :param action: Action taken by the agent
        :return: (np.ndarray, float, bool, bool, dict) observation, reward, terminated, truncated, additional information
        """
        # Count the step before delegating, so the counter advances even if the inner step raises
        self.current_step += 1
        obs, reward, terminated, truncated, info = self.env.step(action)
        limit_reached = self.current_step >= self.max_steps
        # Only override the wrapped environment's own flag when the limit is hit
        truncated = True if limit_reached else truncated
        return obs, reward, terminated, truncated, info

测试这个wrapper

In [9]:
from gymnasium.envs.classic_control.pendulum import PendulumEnv

# Raw Pendulum environment, limited to 100 steps per episode by the wrapper.
env = TimeLimitWrapper(PendulumEnv(), max_steps=100)

In [10]:
# Play one episode with random actions and count how many steps it lasts.
obs, _ = env.reset()
n_steps, done = 0, False
while not done:
    random_action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(random_action)
    n_steps += 1
    # The episode is over when it either terminates or gets truncated
    done = terminated or truncated
print(n_steps, info)

100 {}


## 归一化actions

In [11]:
import numpy as np


class NormalizeActionWrapper(gym.Wrapper):
    """
    Expose a [-1, 1] action space and rescale incoming actions to the wrapped
    environment's own [low, high] bounds before stepping it.

    The wrapped environment is left untouched: the normalized action space is
    set on this wrapper only.

    :param env: (gym.Env) Gym environment that will be wrapped
    """

    def __init__(self, env):
        action_space = env.action_space
        assert isinstance(
            action_space, gym.spaces.Box
        ), "This wrapper only works with continuous action space (spaces.Box)"

        super().__init__(env)

        # Keep the original bounds: they are needed to map actions back.
        self.low, self.high = action_space.low, action_space.high

        # Override the action space on the wrapper itself instead of assigning
        # to env.action_space, which would silently modify the caller's env.
        self.action_space = gym.spaces.Box(
            low=-1, high=1, shape=action_space.shape, dtype=np.float32
        )

    def rescale_action(self, scaled_action):
        """
        Rescale the action from [-1, 1] to [low, high]
        (no need for symmetric action space)

        NOTE: the mapping assumes finite bounds; with an infinite ``low`` or
        ``high`` the result is not meaningful.

        :param scaled_action: (np.ndarray or sequence of float) action in [-1, 1]
        :return: (np.ndarray) action expressed in the wrapped environment's bounds
        """
        # Accept plain lists/tuples as well as arrays
        scaled_action = np.asarray(scaled_action)
        return self.low + (0.5 * (scaled_action + 1.0) * (self.high - self.low))

    def reset(self, **kwargs):
        """
        Reset the environment

        :param kwargs: keyword arguments forwarded to the wrapped environment's reset
        :return: whatever the wrapped environment's reset returns
        """
        return self.env.reset(**kwargs)

    def step(self, action):
        """
        :param action: ([float] or int) Action taken by the agent, expressed in [-1, 1]
        :return: (np.ndarray, float, bool, bool, dict) observation, reward, final state? truncated?, additional information
        """
        rescaled_action = self.rescale_action(action)
        obs, reward, terminated, truncated, info = self.env.step(rescaled_action)
        return obs, reward, terminated, truncated, info

在重新缩放动作之前进行测试

In [12]:
original_env = gym.make("Pendulum-v1")

# Lower bound of the environment's own action space, followed by ten random samples from it.
original_action_space = original_env.action_space
print(original_action_space.low)
for _ in range(10):
    sampled_action = original_action_space.sample()
    print(sampled_action)

[-2.]
[0.64706296]
[0.9676171]
[-0.06857944]
[-0.1365258]
[-1.8450676]
[-1.32972]
[0.20542334]
[1.1188657]
[-0.3101325]
[0.6103838]


测试 NormalizeAction 包装器

In [13]:
env = NormalizeActionWrapper(gym.make("Pendulum-v1"))

# Lower bound of the wrapped action space, followed by ten random samples from it.
normalized_action_space = env.action_space
print(normalized_action_space.low)

for _ in range(10):
    sampled_action = normalized_action_space.sample()
    print(sampled_action)

[-1.]
[-0.45077792]
[0.89661294]
[-0.78943926]
[0.2788606]
[0.2453965]
[0.82822716]
[-0.9136288]
[-0.0630197]
[-0.5492888]
[-0.45697314]


## 用一个RL算法测试
使用 Stable-Baselines3 的 Monitor 包装器，它可以监控训练统计数据（平均回合奖励、平均回合长度）。

In [14]:
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

In [16]:
# Vectorized environment holding a single monitored Pendulum instance.
# The environment is created inside the factory: a `lambda: env` that closes
# over the very name being rebound to the VecEnv only works as long as the
# factory is called before that rebinding takes place.
env = DummyVecEnv([lambda: Monitor(gym.make("Pendulum-v1"))])

In [18]:
# Train A2C on the monitored environment for 1000 timesteps.
model = A2C(policy="MlpPolicy", env=env, verbose=1)
model.learn(total_timesteps=1000)

Using cpu device
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 200      |
|    ep_rew_mean        | -1.6e+03 |
| time/                 |          |
|    fps                | 921      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.4     |
|    explained_variance | 0.00807  |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -30.5    |
|    std                | 0.985    |
|    value_loss         | 425      |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 200       |
|    ep_rew_mean        | -1.69e+03 |
| time/                 |           |
|    fps                | 920       |
|    iterations         | 200       |
|    time_elapsed       | 1         |
|    total_ti

使用action wrapper

In [19]:
# Vectorized environment whose single instance is Pendulum -> Monitor -> NormalizeActionWrapper.
# The wrapped environment is created inside the factory: a `lambda: normalized_env`
# that closes over the very name being rebound to the VecEnv only works as long
# as the factory is called before that rebinding takes place.
normalized_env = DummyVecEnv(
    [lambda: NormalizeActionWrapper(Monitor(gym.make("Pendulum-v1")))]
)

In [20]:
# Train A2C on the action-normalized environment for 1000 timesteps.
model_2 = A2C(policy="MlpPolicy", env=normalized_env, verbose=1)
model_2.learn(total_timesteps=1000)

Using cpu device
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 200      |
|    ep_rew_mean        | -978     |
| time/                 |          |
|    fps                | 942      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.43    |
|    explained_variance | 0.247    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -4.99    |
|    std                | 1.01     |
|    value_loss         | 20       |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 200       |
|    ep_rew_mean        | -1.02e+03 |
| time/                 |           |
|    fps                | 1007      |
|    iterations         | 200       |
|    time_elapsed       | 0         |
|    total_ti

# VecEnvWrappers

In [21]:
from stable_baselines3.common.vec_env import VecNormalize, VecFrameStack

# Single-environment VecEnv, then wrapped in VecNormalize.
env = DummyVecEnv(env_fns=[lambda: gym.make("Pendulum-v1")])
normalized_vec_env = VecNormalize(venv=env)

In [22]:
# Take ten random steps and print the observation and reward returned by the VecNormalize wrapper.
obs = normalized_vec_env.reset()
for _ in range(10):
    # The action is wrapped in a list: the VecEnv here holds a single environment
    action = [normalized_vec_env.action_space.sample()]
    # Only the first two entries of the step result (observation, reward) are used
    obs, reward = normalized_vec_env.step(action)[:2]
    print(obs, reward)

[[-0.4231898   0.04533154  0.9997644 ]] [-10.]
[[-1.3616405   0.37256944  1.2525516 ]] [-2.0198185]
[[-1.6208427  0.5309697  1.4246747]] [-1.3024957]
[[-1.7093897 -0.6508579  1.3418101]] [-1.0785282]
[[-1.7995896 -2.059525   1.4887828]] [-0.9382913]
[[-1.8205574 -2.2691145  1.416078 ]] [-0.90341836]
[[-1.8002975 -2.317051   1.3381401]] [-0.8397551]
[[-1.7582386 -2.3461304  1.3607904]] [-0.77818394]
[[-1.6702554 -2.3396945  1.2964357]] [-0.7442872]
[[-1.5345291 -2.3299363  1.3419609]] [-0.6995449]


# 练习：编写您自己的监视器包装器的代码

In [23]:
class MyMonitorWrapper(gym.Wrapper):
    """
    Exercise skeleton: a wrapper meant to record the length and reward of each episode.

    :param env: (gym.Env) Gym environment that will be wrapped
    """

    def __init__(self, env):
        # The parent constructor is what makes self.env available later on
        super().__init__(env)
        # === YOUR CODE HERE ===#
        # Initialize the variables that will be used
        # to store the episode length and episode reward

        # ====================== #

    def reset(self, **kwargs):
        """
        Reset the wrapped environment and hand back whatever it returns.

        :param kwargs: keyword arguments forwarded to the wrapped environment's reset
        """
        reset_result = self.env.reset(**kwargs)
        # === YOUR CODE HERE ===#
        # Reset the variables

        # ====================== #
        return reset_result

    def step(self, action):
        """
        Step the wrapped environment and return its transition unchanged.

        :param action: ([float] or int) Action taken by the agent
        :return: (np.ndarray, float, bool, bool, dict)
            observation, reward, is the episode over?, is the episode truncated?, additional information
        """
        obs, reward, terminated, truncated, info = self.env.step(action)
        # === YOUR CODE HERE ===#
        # Update the current episode reward and episode length

        # ====================== #

        episode_finished = terminated or truncated
        if episode_finished:
            # === YOUR CODE HERE ===#
            # Store the episode length and episode reward in the info dict
            pass

            # ====================== #
        return obs, reward, terminated, truncated, info

测试你的wrapper

In [26]:
!pip install box2d-py

Collecting box2d-py
  Using cached box2d-py-2.3.8.tar.gz (374 kB)
  Installing build dependencies ... [?25l

done
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: box2d-py
  Building wheel for box2d-py (pyproject.toml) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mBuilding wheel for box2d-py [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[20 lines of output][0m
  [31m   [0m Using setuptools (version 69.2.0).
  [31m   [0m running bdist_wheel
  [31m   [0m running build
  [31m   [0m running build_py
  [31m   [0m creating build
  [31m   [0m creating build/lib.linux-x86_64-cpython-310
  [31m   [0m creating build/lib.linux-x86_64-cpython-310/Box2D
  [31m   [0m copying library/Box2D/__init__.py -> build/lib.linux-x86_64-cpython-310/Box2D
  [31m   [0m copying library/Box2D/Box2D.py

In [25]:
# Exercise cell: environment on which the monitor wrapper written above is tried out.
# NOTE(review): creating LunarLander requires Box2D; the recorded run failed with
# DependencyNotInstalled and suggested `pip install gymnasium[box2d]`.
env = gym.make("LunarLander-v2")
# === YOUR CODE HERE ===#
# Wrap the environment

# Reset the environment

# Take random actions in the environment and check
# that it returns the correct values after the end of each episode

# ====================== #

DependencyNotInstalled: Box2D is not installed, run `pip install gymnasium[box2d]`