In [None]:
# # 准备环境 pip install ale_py gymnasium[accept-rom-license,atari]==1.0.0
# !wget https://raw.githubusercontent.com/lhiqwj173/dl_helper/master/envs/rl.py > /dev/null 2>&1
# !python rl.py not_install_dl_helper > /dev/null 2>&1
# !pip install /kaggle/working/3rd/dl_helper > /dev/null 2>&1

In [1]:
import numpy as np
import gymnasium as gym
import ale_py
# Register the ALE (Atari) environments with gymnasium; a later cell creates
# "ale_py:ALE/Pong-v5" through gym.make.
gym.register_envs(ale_py)
import ray
# Print the installed Ray version (the recorded output of this notebook shows 2.40.0).
print("ray 版本:", ray.__version__)

ray 版本: 2.40.0


## RLlib 中的模型的生成逻辑
1. 生成配置对象 config 
2. 根据配置生成模型 config.build()

## 模型类型
- PPO（同样适用于 IMPALA/APPO）
- DQN(以及变种)

In [None]:
# 相关类
from ray.rllib.core.rl_module.rl_module import RLModule
from ray.rllib.core.rl_module.torch import TorchRLModule
from ray.rllib.algorithms.ppo.ppo_rl_module import PPORLModule
from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import PPOTorchRLModule
# API 类
from ray.rllib.core.rl_module.apis import InferenceOnlyAPI, ValueFunctionAPI
# 配置类，用于生成模型
from ray.rllib.algorithms.ppo.ppo_catalog import PPOCatalog
from ray.rllib.core.models.catalog import Catalog

In [33]:
from ray.rllib.algorithms.ppo.ppo_catalog import PPOCatalog
from ray.rllib.core.models.catalog import Catalog

# Catalog
配置的基类，适用于所有的强化学习算法  
主要负责通用的编码器生成
- _get_encoder_config方法生成编码器配置
    会自动针对输入的维度选择编码器
    - use_lstm > RecurrentEncoderConfig
    - 1D-Box > MLPEncoderConfig
    - 3D-Box > CNNEncoderConfig
- build_encoder方法生成编码器

自动生成的编码器涵盖 MLP/CNN/LSTM  
基本上都可以通过配置调整满足需求,如下示意  
详细的配置参考 Catalog._get_encoder_config 方法中对配置字段的使用  
（或官方文档 https://docs.ray.io/en/latest/rllib/rllib-catalogs.html MODEL_DEFAULTS）

```
config.rl_module(
    model_config={
        # MLPEncoderConfig
        "fcnet_hiddens": [5, 3, 3],
        "fcnet_kernel_initializer": None,
        "fcnet_kernel_initializer_kwargs": {},
        "fcnet_bias_initializer": None,
        "fcnet_bias_initializer_kwargs": {},

        # CNNEncoderConfig
        "conv_filters": [
            [32, [8, 8], 4],  # [输出通道数, [kernel_size_h, kernel_size_w], stride]
            [64, [4, 4], 2],  # [64个通道, 4x4卷积核, stride=2]
            [64, [3, 3], 1],  # [64个通道, 3x3卷积核, stride=1] 
        ],

        # RecurrentEncoderConfig
        ...
    },
)

```



In [None]:
# Relevant RLlib source code (excerpt)
class Catalog:
    """Describes the sub-module architectures to be used in RL Modules.

    RLlib's native RL Modules get their models from a Catalog object.
    By default, that Catalog builds the configs it has as attributes.
    This component is built to be hackable and extensible. You can inject custom
    components into RL Modules by overriding the `build_xxx` methods of this class.
    Note that it is recommended to write a custom RL Module for a single use-case.
    Modifications to Catalogs mostly make sense if you want to reuse the same
    Catalog for different RL Modules, for example if you have written a custom
    encoder and want to inject it into different RL Modules (e.g. PPO, DQN, etc.).
    You can influence the decision tree that determines the sub-components by
    modifying `Catalog._determine_components_hook`.

    Usage example:

    # Define a custom catalog

    .. testcode::

        import torch
        import gymnasium as gym
        from ray.rllib.core.models.configs import MLPHeadConfig
        from ray.rllib.core.models.catalog import Catalog

        class MyCatalog(Catalog):
            def __init__(
                self,
                observation_space: gym.Space,
                action_space: gym.Space,
                model_config_dict: dict,
            ):
                super().__init__(observation_space, action_space, model_config_dict)
                self.my_model_config = MLPHeadConfig(
                    hidden_layer_dims=[64, 32],
                    input_dims=[self.observation_space.shape[0]],
                )

            def build_my_head(self, framework: str):
                return self.my_model_config.build(framework=framework)

        # With that, RLlib can build and use models from this catalog like this:
        catalog = MyCatalog(gym.spaces.Box(0, 1), gym.spaces.Box(0, 1), {})
        my_head = catalog.build_my_head(framework="torch")

        # Make a call to the built model.
        out = my_head(torch.Tensor([[1]]))
    """
    @OverrideToImplementCustomLogic_CallToSuperRecommended
    def _determine_components_hook(self):
        """Decision tree hook for subclasses to override.

        By default, this method executes the decision tree that determines the
        components that a Catalog builds. You can extend the components by overriding
        this or by adding to the constructor of your subclass.

        Override this method if you don't want to use the default components
        determined here. If you want to use them but add additional components, you
        should call `super()._determine_components_hook()` at the beginning of your
        implementation.

        This makes it so that subclasses are not forced to create an encoder config
        if the rest of their catalog is not dependent on it or if it breaks.
        At the end of this method, an attribute `Catalog.latent_dims`
        should be set so that heads can be built using that information.

        Sets `self._encoder_config`, `self._action_dist_class_fn` and
        `self.latent_dims`.
        """
        self._encoder_config = self._get_encoder_config(
            observation_space=self.observation_space,
            action_space=self.action_space,
            model_config_dict=self._model_config_dict,
        )

        # Create a function that can be called when framework is known to retrieve the
        # class type for action distributions
        self._action_dist_class_fn = functools.partial(
            self._get_dist_cls_from_action_space, action_space=self.action_space
        )

        # The dimensions of the latent vector that is output by the encoder and fed
        # to the heads.
        self.latent_dims = self._encoder_config.output_dims

    @OverrideToImplementCustomLogic
    def build_encoder(self, framework: str) -> Encoder:
        """Builds the encoder.

        By default, this method builds an encoder instance from Catalog._encoder_config.

        You should override this if you want to use RLlib's default RL Modules but
        only want to change the encoder. For example, if you want to use a custom
        encoder, but want to use RLlib's default heads, action distribution and how
        tensors are routed between them. If you want to have full control over the
        RL Module, we recommend writing your own RL Module by inheriting from one of
        RLlib's RL Modules instead.

        Args:
            framework: The framework to use. Either "torch" or "tf2".

        Returns:
            The encoder.

        Raises:
            AssertionError: If no `_encoder_config` attribute has been set.
        """
        # NOTE: the string literal below is a no-op expression statement holding a
        # Chinese translation of the docstring above (added for this notebook); only
        # the first string is the method's real docstring.
        """构建编码器。

        默认情况下，此方法从 Catalog._encoder_config 构建一个编码器实例。

        如果您想使用 RLlib 的默认 RL 模块，但只想更改编码器，则应重写此方法。
        例如，如果您想使用自定义编码器，但想使用 RLlib 的默认头、动作分布以及张量如何在它们之间路由。
        如果您想完全控制 RL 模块，
        我们建议您通过继承 RLlib 的 RL 模块之一来编写自己的 RL 模块。

        Args:
        framework: 要使用的框架。可以是 "torch" 或 "tf2"。

        Returns:
        编码器。
        """
        assert hasattr(self, "_encoder_config"), (
            "You must define a `Catalog._encoder_config` attribute in your Catalog "
            "subclass or override the `Catalog.build_encoder` method. By default, "
            "an encoder_config is created in the __post_init__ method."
        )
        return self._encoder_config.build(framework=framework)

    @classmethod
    def _get_encoder_config(
        cls,
        observation_space: gym.Space,
        model_config_dict: dict,
        action_space: gym.Space = None,
    ) -> ModelConfig:
        """Returns an EncoderConfig for the given input_space and model_config_dict.

        Encoders are usually used in RLModules to transform the input space into a
        latent space that is then fed to the heads. The returned EncoderConfig
        objects correspond to the built-in Encoder classes in RLlib.
        For example, for a simple 1D-Box input_space, RLlib offers an
        MLPEncoder, hence this method returns the MLPEncoderConfig. You can overwrite
        this method to produce specific EncoderConfigs for your custom Models.

        The following settings / input spaces lead to the following configs:
        - `use_lstm` set in the model config: RecurrentEncoderConfig (checked first,
          regardless of the observation space)
        - 1D-Box: MLPEncoderConfig
        - 3D-Box: CNNEncoderConfig
        # TODO (Artur): Support more spaces here
        # ...

        Args:
            observation_space: The observation space to use.
            model_config_dict: The model config to use.
            action_space: The action space to use if actions are to be encoded. This
                is commonly the case for LSTM models.

        Returns:
            The encoder config.

        Raises:
            ValueError: If `use_lstm` is falsy and the observation space is a 2D Box
                or anything other than a 1D/3D Box.
        """
        # NOTE: the string literal below is a no-op expression statement holding a
        # Chinese translation of the docstring above (added for this notebook); only
        # the first string is the method's real docstring.
        """返回给定 input_space 和 model_config_dict 的 EncoderConfig。

        编码器通常在 RLModules 中使用，用于将输入空间转换为一个潜在空间，然后将其传递给头部。
        返回的 EncoderConfig 对象对应于 RLlib 中的内置编码器类。
        例如，对于一个简单的 1D-Box 输入空间，RLlib 提供了 MLPEncoder
        ，因此此方法返回 MLPEncoderConfig。
        您可以重写此方法以生成特定的 EncoderConfig 以用于您的自定义模型。

        以下输入空间会导致以下配置：

        1D-Box: MLPEncoderConfig
        3D-Box: CNNEncoderConfig
        TODO (Artur): 在此处支持更多空间
        ...
        Args:
        observation_space: 要使用的观测空间。
        model_config_dict: 要使用的模型配置。
        action_space: 如果动作需要编码，则要使用的动作空间。这在 LSTM 模型的情况下通常是这样。

        Returns:
        编码器配置。
        """

        # Hidden-layer and output-layer activations are both read from the same
        # "fcnet_activation" key.
        activation = model_config_dict["fcnet_activation"]
        output_activation = model_config_dict["fcnet_activation"]
        use_lstm = model_config_dict["use_lstm"]

        if use_lstm:
            # The LSTM branch takes precedence over the shape-based branches below.
            encoder_config = RecurrentEncoderConfig(
                input_dims=observation_space.shape,
                recurrent_layer_type="lstm",
                hidden_dim=model_config_dict["lstm_cell_size"],
                hidden_weights_initializer=model_config_dict["lstm_kernel_initializer"],
                hidden_weights_initializer_config=model_config_dict[
                    "lstm_kernel_initializer_kwargs"
                ],
                hidden_bias_initializer=model_config_dict["lstm_bias_initializer"],
                hidden_bias_initializer_config=model_config_dict[
                    "lstm_bias_initializer_kwargs"
                ],
                batch_major=True,
                num_layers=1,
                tokenizer_config=cls.get_tokenizer_config(
                    observation_space,
                    model_config_dict,
                ),
            )
        else:
            # TODO (Artur): Maybe check for original spaces here
            # input_space is a 1D Box
            if isinstance(observation_space, Box) and len(observation_space.shape) == 1:
                # In order to guarantee backward compatability with old configs,
                # we need to check if no latent dim was set and simply reuse the last
                # fcnet hidden dim for that purpose.
                # All but the last "fcnet_hiddens" entry become hidden layers; the
                # last entry becomes the encoder's output (latent) dimension.
                hidden_layer_dims = model_config_dict["fcnet_hiddens"][:-1]
                encoder_latent_dim = model_config_dict["fcnet_hiddens"][-1]
                encoder_config = MLPEncoderConfig(
                    input_dims=observation_space.shape,
                    hidden_layer_dims=hidden_layer_dims,
                    hidden_layer_activation=activation,
                    hidden_layer_weights_initializer=model_config_dict[
                        "fcnet_kernel_initializer"
                    ],
                    hidden_layer_weights_initializer_config=model_config_dict[
                        "fcnet_kernel_initializer_kwargs"
                    ],
                    hidden_layer_bias_initializer=model_config_dict[
                        "fcnet_bias_initializer"
                    ],
                    hidden_layer_bias_initializer_config=model_config_dict[
                        "fcnet_bias_initializer_kwargs"
                    ],
                    output_layer_dim=encoder_latent_dim,
                    output_layer_activation=output_activation,
                    output_layer_weights_initializer=model_config_dict[
                        "fcnet_kernel_initializer"
                    ],
                    output_layer_weights_initializer_config=model_config_dict[
                        "fcnet_kernel_initializer_kwargs"
                    ],
                    output_layer_bias_initializer=model_config_dict[
                        "fcnet_bias_initializer"
                    ],
                    output_layer_bias_initializer_config=model_config_dict[
                        "fcnet_bias_initializer_kwargs"
                    ],
                )

            # input_space is a 3D Box
            elif (
                isinstance(observation_space, Box) and len(observation_space.shape) == 3
            ):
                # If no filters were configured, derive them from the observation
                # shape and store them back into the passed-in dict (mutates it).
                if not model_config_dict.get("conv_filters"):
                    model_config_dict["conv_filters"] = get_filter_config(
                        observation_space.shape
                    )

                encoder_config = CNNEncoderConfig(
                    input_dims=observation_space.shape,
                    cnn_filter_specifiers=model_config_dict["conv_filters"],
                    cnn_activation=model_config_dict["conv_activation"],
                    cnn_kernel_initializer=model_config_dict["conv_kernel_initializer"],
                    cnn_kernel_initializer_config=model_config_dict[
                        "conv_kernel_initializer_kwargs"
                    ],
                    cnn_bias_initializer=model_config_dict["conv_bias_initializer"],
                    cnn_bias_initializer_config=model_config_dict[
                        "conv_bias_initializer_kwargs"
                    ],
                )
            # input_space is a 2D Box
            elif (
                isinstance(observation_space, Box) and len(observation_space.shape) == 2
            ):
                # RLlib used to support 2D Box spaces by silently flattening them
                raise ValueError(
                    f"No default encoder config for obs space={observation_space},"
                    f" lstm={use_lstm} found. 2D Box "
                    f"spaces are not supported. They should be either flattened to a "
                    f"1D Box space or enhanced to be a 3D box space."
                )
            # input_space is a possibly nested structure of spaces.
            else:
                # NestedModelConfig
                raise ValueError(
                    f"No default encoder config for obs space={observation_space},"
                    f" lstm={use_lstm} found."
                )

        return encoder_config


# 默认编码器使用

In [34]:
import torch
from ray.rllib.core.models.configs import (
    CNNEncoderConfig,
    MLPEncoderConfig,
    RecurrentEncoderConfig,
)

# MLP encoder: two hidden layers of 8 units and a 4-unit output layer.
config = MLPEncoderConfig(
    input_dims=[2],
    hidden_layer_dims=[8, 8],
    hidden_layer_activation="silu",
    hidden_layer_use_layernorm=True,
    hidden_layer_use_bias=False,
    output_layer_dim=4,
    output_layer_activation="tanh",
    output_layer_use_bias=False,
)
model = config.build(framework="torch")
print(model)

# CNN encoder (reuses the names `config` / `model` from the MLP example above).
config = CNNEncoderConfig(
    input_dims=[84, 84, 3],  # must be 3D tensor (image: w x h x C)
    cnn_filter_specifiers=[
        [16, [8, 8], 4],
        [32, [4, 4], 2],
    ],
    cnn_activation="relu",
    cnn_use_layernorm=False,
    cnn_use_bias=True,
)
model = config.build(framework="torch")
print(model)

# Create one batch of input data.
batch_size = 2
# Channels-last layout (batch, 84, 84, 3), matching `input_dims` above.
# NOTE(review): this is not PyTorch's usual channels-first (N, C, H, W) layout.
input_tensor = torch.randn(batch_size, 84, 84, 3)
print(input_tensor.shape)

# Forward pass: the encoder is called with a dict keyed by 'obs'; the recorded
# output shows a dict with the key 'encoder_out'.
output = model({'obs': input_tensor})
print(output)


TorchMLPEncoder(
  (net): TorchMLP(
    (mlp): Sequential(
      (0): Linear(in_features=2, out_features=8, bias=False)
      (1): LayerNorm((8,), eps=0.001, elementwise_affine=True)
      (2): SiLU()
      (3): Linear(in_features=8, out_features=8, bias=False)
      (4): LayerNorm((8,), eps=0.001, elementwise_affine=True)
      (5): SiLU()
      (6): Linear(in_features=8, out_features=4, bias=False)
      (7): Tanh()
    )
  )
)
TorchCNNEncoder(
  (net): Sequential(
    (0): TorchCNN(
      (cnn): Sequential(
        (0): ZeroPad2d((2, 2, 2, 2))
        (1): Conv2d(3, 16, kernel_size=(8, 8), stride=(4, 4))
        (2): ReLU()
        (3): ZeroPad2d((1, 2, 1, 2))
        (4): Conv2d(16, 32, kernel_size=(4, 4), stride=(2, 2))
        (5): ReLU()
      )
    )
    (1): Flatten(start_dim=1, end_dim=-1)
  )
)
torch.Size([2, 84, 84, 3])
{'encoder_out': tensor([[0.1949, 0.0000, 0.0301,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.3109, 0.0576,  ..., 0.0000, 0.0000, 0.0000]],
       grad

In [4]:
from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .api_stack(  # opt in to the new API stack
        enable_rl_module_and_learner=True,
        enable_env_runner_and_connector_v2=True,
    )
    # Keep exactly ONE `.environment(...)` call active: chaining two of them makes
    # the later call replace the earlier one. CartPole is the active choice because
    # it is what the recorded output below shows (MLP encoder, in_features=4), and
    # because "pong" is only registered by `register_env` in a later cell, so it
    # is not available when the notebook is run top to bottom.
    .environment("CartPole-v1")  # 1-D observations -> MLP encoder is picked automatically
    # .environment("pong")  # 3-D image observations -> CNN encoder is picked automatically
    .rl_module(
        model_config={
            # MLP parameters: hidden sizes 5 and 3, then a 3-unit encoder output
            # (see the Linear layers in the printed module).
            "fcnet_hiddens": [5, 3, 3],

            # CNN parameters (only relevant for image observations); the output
            # dimension is computed automatically.
            # "conv_filters": [
            #     [32, [8, 8], 4],  # [out_channels, [kernel_h, kernel_w], stride]
            #     [64, [4, 4], 2],  # 64 channels, 4x4 kernel, stride 2
            #     [64, [3, 3], 1],  # 64 channels, 3x3 kernel, stride 1
            # ],
        },
    )
)

# Build the algorithm.
algo = config.build()

# Inspect the resulting RLModule (last expression of the cell -> rich display).
algo.get_module()

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
2025-01-08 14:24:25,358	INFO worker.py:1821 -- Started a local Ray instance.
2025-01-08 14:24:45,387	INFO trainable.py:161 -- Trainable.setup took 28.843 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


PPOTorchRLModule(
  (encoder): TorchActorCriticEncoder(
    (actor_encoder): TorchMLPEncoder(
      (net): TorchMLP(
        (mlp): Sequential(
          (0): Linear(in_features=4, out_features=5, bias=True)
          (1): Tanh()
          (2): Linear(in_features=5, out_features=3, bias=True)
          (3): Tanh()
          (4): Linear(in_features=3, out_features=3, bias=True)
          (5): Tanh()
        )
      )
    )
  )
  (pi): TorchMLPHead(
    (net): TorchMLP(
      (mlp): Sequential(
        (0): Linear(in_features=3, out_features=2, bias=True)
      )
    )
  )
  (vf): TorchMLPHead(
    (net): TorchMLP(
      (mlp): Sequential(
        (0): Linear(in_features=3, out_features=1, bias=True)
      )
    )
  )
)

# 自定义编码器


In [35]:
# 自定义 ModelConfig
from ray.rllib.core.models.configs import ModelConfig
from ray.rllib.core.models.torch.encoder import TorchModel, Encoder
from dataclasses import dataclass
from ray.rllib.core.models.base import ENCODER_OUT
from ray.rllib.core.columns import Columns
import torch
import torch.nn as nn
import torch.nn.functional as F

class ConvFCNetEncoder(TorchModel, Encoder):
    """Custom encoder: one 3x3 convolution followed by one linear layer.

    How it is wired up:
    - It inherits from both ``TorchModel`` and ``Encoder`` and initializes each
      base explicitly with the same ``config`` object.
    - ``config`` is the config instance that builds this encoder. The constructor
      reads ``config.input_dims`` (indices 0, 1 and 2) and ``config.out_dim``.
    - ``_forward`` takes a dict holding the observations under ``Columns.OBS`` and
      returns a dict holding the encoding under ``ENCODER_OUT``.
    """

    def __init__(self, config) -> None:
        TorchModel.__init__(self, config)
        Encoder.__init__(self, config)

        dims = config.input_dims
        n_filters = 16

        # kernel 3 / stride 1 / padding 1 keeps the two trailing input dimensions
        # unchanged.
        self.conv1 = nn.Conv2d(
            in_channels=dims[0],
            out_channels=n_filters,
            kernel_size=3,
            stride=1,
            padding=1,
        )

        # Number of features after flattening the conv output; the `// 1` mirrors
        # the stride of 1 (e.g. 64 * 64 * 16 for a [3, 64, 64] input).
        flat_features = (dims[1] // 1) * (dims[2] // 1) * n_filters
        self.fc = nn.Linear(flat_features, config.out_dim)

    def _forward(self, inputs: dict, **kwargs) -> dict:
        """Encode ``inputs[Columns.OBS]`` and return it under ``ENCODER_OUT``."""
        feature_map = self.conv1(inputs[Columns.OBS])
        activated = F.relu(feature_map)
        # Flatten everything except the batch dimension before the linear layer.
        flattened = activated.view(activated.size(0), -1)
        return {ENCODER_OUT: self.fc(flattened)}

@dataclass(init=False)
class test_EncoderConfig(ModelConfig):
    """Config object that builds a ``ConvFCNetEncoder``.

    Attributes:
        input_dims: Input dimensions, forwarded to ``ModelConfig``. The encoder
            reads indices 0, 1 and 2 of it.
        out_dim: Size of the encoder's output vector (default 10).

    The ``output_dims`` property reports the encoder output shape so that the head
    models can be built with a matching input size.
    """

    # The annotation is required: `@dataclass` only registers ANNOTATED class
    # attributes as fields. Without it, `out_dim` was a plain class attribute and
    # could never be set through the constructor.
    out_dim: int = 10

    def __init__(self, input_dims=None, out_dim=10, **kwargs):
        """Create the config.

        Args:
            input_dims: Input dimensions of the encoder.
            out_dim: Output dimension of the encoder.
            **kwargs: Any further keyword arguments accepted by ``ModelConfig``.
        """
        # Hand-written so that the positional call `test_EncoderConfig(dims, out_dim)`
        # used in this notebook really sets `out_dim`. The generated dataclass
        # constructor would hand the second positional value to the base class's
        # second field instead.
        super().__init__(input_dims, **kwargs)
        self.out_dim = out_dim

    def build(self, framework: str = "torch"):
        """Build the encoder (one conv layer + one linear layer).

        Args:
            framework: Must be "torch".

        Returns:
            A ``ConvFCNetEncoder`` configured by this object.

        Raises:
            ValueError: If ``framework`` is anything other than "torch".
        """
        if framework != "torch":
            raise ValueError(f'only torch ModelConfig')
        return ConvFCNetEncoder(self)

    @property
    def output_dims(self):
        """Read-only `output_dims` are inferred automatically from other settings."""
        # Must be a shape tuple, not a bare int.
        return (int(self.out_dim),)
    
# Build the custom encoder from its config and print its layers.
net = test_EncoderConfig([3, 64, 64], 10).build()
print(net)

batch_size = 2
input_tensor = torch.randn(batch_size, 3, 64, 64)  # channels-first (batch, 3, 64, 64)
print(input_tensor.shape)

# Forward pass
output = net({Columns.OBS: input_tensor})# same input/output dict convention as the default encoders
print(output)

ConvFCNetEncoder(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc): Linear(in_features=65536, out_features=10, bias=True)
)
torch.Size([2, 3, 64, 64])
{'encoder_out': tensor([[ 0.0140,  0.0622,  0.1498, -0.0791, -0.1749, -0.1062,  0.1390,  0.1778,
          0.1347, -0.2741],
        [-0.2605,  0.0714, -0.2527, -0.2216, -0.1182, -0.3389,  0.1329,  0.6081,
         -0.1720,  0.1413]], grad_fn=<AddmmBackward0>)}


In [24]:
import functools
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
from ray.tune.registry import get_trainable_cls, register_env
from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack

from ray.rllib.algorithms.ppo.ppo_catalog import PPOCatalog
class custom_PPOCatalog(PPOCatalog):
    """PPO catalog that uses ``test_EncoderConfig`` as its encoder config.

    It overrides ``_determine_components_hook`` so that the encoder config comes
    from the user's model config instead of the default decision tree.

    Model-config keys read:
        "input_dims": input dimensions handed to ``test_EncoderConfig``.
        "out_dim": output (latent) dimension of the encoder.
    """

    def _determine_components_hook(self):
        """Set ``_encoder_config``, ``_action_dist_class_fn`` and ``latent_dims``.

        Raises:
            KeyError: If "input_dims" or "out_dim" is missing from the model config.
        """
        # User-supplied encoder parameters.
        input_dims = self._model_config_dict["input_dims"]
        out_dim = self._model_config_dict["out_dim"]

        # Assign `out_dim` explicitly on the instance so the configured value is
        # guaranteed to be the one `output_dims` and the encoder read, independent
        # of the positional parameter order of the config class's constructor.
        encoder_config = test_EncoderConfig(input_dims)
        encoder_config.out_dim = out_dim
        self._encoder_config = encoder_config

        # Unchanged from the base implementation:
        # Create a function that can be called when framework is known to retrieve the
        # class type for action distributions
        self._action_dist_class_fn = functools.partial(
            self._get_dist_cls_from_action_space, action_space=self.action_space
        )

        # Unchanged from the base implementation:
        # The dimensions of the latent vector that is output by the encoder and fed
        # to the heads.
        self.latent_dims = self._encoder_config.output_dims


# Register a down-scaled, frame-stacked Pong under the name "pong" so that
# `.environment("pong")` below can resolve it.
register_env(
    "pong",
    lambda cfg: wrap_atari_for_new_api_stack(
        gym.make("ale_py:ALE/Pong-v5", **cfg),
        dim=42,  # <- need images to be "tiny" for our custom model
        framestack=4,
    ),
)

config = (
    PPOConfig()
    .api_stack(# opt in to the new API stack
        enable_rl_module_and_learner=True,
        enable_env_runner_and_connector_v2=True,
    )
    .environment("pong")# 3-D image observations (42x42 frames, 4 stacked) — NOTE(review): confirm the axis order the wrapper produces
    .rl_module(
        rl_module_spec=RLModuleSpec(catalog_class=custom_PPOCatalog),# use the custom catalog
        model_config={
            # # MLP parameters
            # "fcnet_hiddens": [5, 3, 3],

            # CNN parameters
            # "conv_filters": [
            #     [32, [8, 8], 4],  # [out_channels, [kernel_h, kernel_w], stride]
            #     [64, [4, 4], 2],  # 64 channels, 4x4 kernel, stride 2
            #     [64, [3, 3], 1],  # 64 channels, 3x3 kernel, stride 1
            # ],

            # Parameters of the custom encoder (read by custom_PPOCatalog).
            # NOTE(review): [3, 210, 160] does not match the 42x42, 4-frame
            # observations configured in register_env above. Building and printing
            # the module works (see the recorded output), but confirm these dims
            # against the real observation shape before training.
            'input_dims' : [3, 210, 160],
            'out_dim' : 10,
        },
    )
)

# Build the algorithm.
algo = config.build()

# Inspect the resulting RLModule.
algo.get_module()



PPOTorchRLModule(
  (encoder): TorchActorCriticEncoder(
    (actor_encoder): ConvFCNetEncoder(
      (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (fc): Linear(in_features=537600, out_features=10, bias=True)
    )
  )
  (pi): TorchMLPHead(
    (net): TorchMLP(
      (mlp): Sequential(
        (0): Linear(in_features=10, out_features=6, bias=True)
      )
    )
  )
  (vf): TorchMLPHead(
    (net): TorchMLP(
      (mlp): Sequential(
        (0): Linear(in_features=10, out_features=1, bias=True)
      )
    )
  )
)