In [None]:
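# NOTE: this cell is reference-only and never executed — every line is commented out.
# It appears to be lazy-import boilerplate from a package `__init__` (TODO confirm the origin).
# from typing import TYPE_CHECKING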
# from ...utils import _LazyModule
# from ...utils.import_utils import define_import_structure
# if TYPE_CHECKING:
#     from .configuration_t5 import *
#     from .modeling_flax_t5 import *
#     from .modeling_t5 import *
#     from .modeling_tf_t5 import *
#     from .tokenization_t5 import *
#     from .tokenization_t5_fast import *
# else:
#     import sys
#     _file = globals()["__file__"]
#     sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

In [1]:
from typing import Mapping
from transformers.configuration_utils import PretrainedConfig
from transformers.onnx import OnnxSeq2SeqConfigWithPast
from transformers.utils import logging

In [2]:
# Module-level logger obtained through the `logging` helper imported from `transformers.utils`.
logger = logging.get_logger(__name__)

In [3]:
class T5Config(PretrainedConfig):
    """Hyper-parameter container for a T5 model.

    Args:
        vocab_size: Size of the vocabulary.
        d_model: Hidden size of the model.
        d_kv: Size of each attention head.
        d_ff: Size of the intermediate feed-forward layer.
        num_layers: Number of hidden layers in the Transformer encoder.
        num_decoder_layers: Number of hidden layers in the Transformer decoder.
            When ``None``, the value of ``num_layers`` is used.
        num_heads: Number of heads in multi-head attention.
        relative_attention_num_buckets: Number of relative-position buckets used
            by each attention layer.
        relative_attention_max_distance: Maximum sequence distance used when
            separating buckets.
        dropout_rate: Ratio applied to all dropout layers.
        layer_norm_epsilon: Epsilon used by the LayerNorm layers.
        initializer_factor: Factor applied when initializing all weight matrices.
        feed_forward_proj: Feed-forward activation, written either as ``{ACT_FN}``
            or as ``gated-{ACT_FN}`` (for example ``"relu"`` or ``"gated-gelu"``).
        is_encoder_decoder: Whether the architecture is an encoder-decoder.
        use_cache: Whether caching is enabled (used to speed up generation).
        pad_token_id: Forwarded unchanged to ``PretrainedConfig.__init__``.
        eos_token_id: Forwarded unchanged to ``PretrainedConfig.__init__``.
        classifier_dropout: Dropout ratio for the classifier.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.

    Raises:
        ValueError: If ``feed_forward_proj`` is not of the form ``{ACT_FN}`` or
            ``gated-{ACT_FN}`` with a non-empty activation name.
    """

    model_type = "t5"  # model type identifier
    # Output keys flagged as ignorable at inference time. How (and by which utilities) this list is
    # consumed is decided by library code outside this class — TODO confirm the exact consumers.
    keys_to_ignore_at_inference = ["past_key_values"]
    # Generic attribute name -> T5-specific attribute name set in __init__.
    # Presumably resolved by PretrainedConfig's attribute access — confirm against the base class.
    attribute_map = {
        "hidden_size": "d_model",
        "num_attention_heads": "num_heads",
        "num_hidden_layers": "num_layers",
        "head_dim": "d_kv",
    }

    def __init__(
        self,
        vocab_size=32128,  # vocabulary size
        d_model=512,  # model hidden size
        d_kv=64,  # size of each attention head
        d_ff=2048,  # size of the intermediate feed-forward layer
        num_layers=6,  # number of encoder layers
        num_decoder_layers=None,  # number of decoder layers; falls back to `num_layers` when None
        num_heads=8,  # number of attention heads
        relative_attention_num_buckets=32,  # relative-position buckets per attention layer
        relative_attention_max_distance=128,  # maximum sequence distance for bucket separation
        dropout_rate=0.1,  # ratio for all dropout layers
        layer_norm_epsilon=1e-6,  # epsilon used by LayerNorm
        initializer_factor=1.0,  # factor for weight-matrix initialization
        feed_forward_proj="relu",  # feed-forward activation: "{ACT_FN}" or "gated-{ACT_FN}"
        is_encoder_decoder=True,  # encoder-decoder architecture flag
        use_cache=True,  # enable caching to speed up generation
        pad_token_id=0,
        eos_token_id=1,
        classifier_dropout=0.0,  # classifier dropout ratio
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.d_kv = d_kv
        self.d_ff = d_ff
        self.num_layers = num_layers
        # Default to a symmetric encoder/decoder when no decoder depth is given.
        self.num_decoder_layers = (
            num_decoder_layers if num_decoder_layers is not None else self.num_layers
        )
        self.num_heads = num_heads
        self.relative_attention_num_buckets = relative_attention_num_buckets
        self.relative_attention_max_distance = relative_attention_max_distance
        self.dropout_rate = dropout_rate
        self.classifier_dropout = classifier_dropout
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_factor = initializer_factor
        self.feed_forward_proj = feed_forward_proj
        self.use_cache = use_cache

        # Split "ACT_FN" / "gated-ACT_FN" into its parts and validate the exact shape.
        # Besides the malformed cases rejected before ("foo-bar", "gated-a-b"), this also rejects
        # a bare "gated" (which used to yield is_gated_act=True with dense_act_fn="gated") and
        # an empty activation name ("" or "gated-").
        act_info = self.feed_forward_proj.split("-")
        is_gated = act_info[0] == "gated"
        has_valid_shape = (len(act_info) == 1 and not is_gated) or (len(act_info) == 2 and is_gated)
        if not has_valid_shape or act_info[-1] == "":
            raise ValueError(
                f"`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer. "
                "Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. "
                "'gated-gelu' or 'relu'"
            )
        self.dense_act_fn = act_info[-1]  # activation of the feed-forward intermediate layer
        self.is_gated_act = is_gated  # whether the feed-forward layer is gated

        # Backward compatibility: "gated-gelu" maps to the "gelu_new" activation.
        if feed_forward_proj == "gated-gelu":
            self.dense_act_fn = "gelu_new"

        # The base-class initializer runs last, after all T5-specific attributes are set.
        super().__init__(
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            **kwargs,
        )

In [4]:
# Build a configuration with every constructor argument left at its default value.
config = T5Config()

In [5]:
# Show the configuration's repr (the recorded output lists the resolved attributes).
config

T5Config {
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "transformers_version": "4.51.3",
  "use_cache": true,
  "vocab_size": 32128
}

In [7]:
class T5OnnxConfig(OnnxSeq2SeqConfigWithPast):
    """ONNX export settings for T5: dynamic input axes and the default opset."""

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Return the dynamic-axes mapping for the model inputs.

        Each key is an input name; each value maps an axis index to the symbolic
        name of that (variable-sized) axis.

        Returns:
            A dict that always contains ``input_ids`` and ``attention_mask``, plus
            ``decoder_input_ids`` and ``decoder_attention_mask``. When
            ``self.use_past`` is true, the decoder entries use the "past" axis
            names and ``fill_with_past_key_values_`` is called to add the
            past key/value entries to the same dict.
        """
        # Encoder-side inputs: axis 0 is the batch, axis 1 the encoder sequence.
        axes = {
            "input_ids": {0: "batch", 1: "encoder_sequence"},
            "attention_mask": {0: "batch", 1: "encoder_sequence"},
        }

        if not self.use_past:
            # Without past key/values the decoder inputs expose a full sequence axis.
            axes["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
            axes["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}
            return axes

        # With past key/values:
        #   - the encoder attention mask axis is renamed to cover past + current positions;
        #   - decoder_input_ids declares only the batch axis as dynamic;
        #   - the decoder attention mask axis covers past + current decoder positions.
        axes["attention_mask"][1] = "past_encoder_sequence + sequence"
        axes["decoder_input_ids"] = {0: "batch"}
        axes["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
        # Let the base-class helper append the past_key_values entries in place.
        self.fill_with_past_key_values_(axes, direction="inputs")
        return axes

    @property
    def default_onnx_opset(self) -> int:
        """Return the default ONNX opset version used for export (13)."""
        return 13


# Public names exported by this module.
__all__ = ["T5Config", "T5OnnxConfig"]

In [8]:
# Wrap the default T5 config in an ONNX export config; `use_past` is not passed here.
onnxConfig=T5OnnxConfig(config)

In [9]:
# Layer counts exposed by the ONNX config; the recorded output is a 2-tuple,
# presumably (encoder, decoder) — confirm against the base class.
onnxConfig.num_layers

(6, 6)

In [10]:
# Dynamic-axes mapping for the config built without `use_past` (no past key/value entries).
onnxConfig.inputs

{'input_ids': {0: 'batch', 1: 'encoder_sequence'},
 'attention_mask': {0: 'batch', 1: 'encoder_sequence'},
 'decoder_input_ids': {0: 'batch', 1: 'decoder_sequence'},
 'decoder_attention_mask': {0: 'batch', 1: 'decoder_sequence'}}

In [13]:
# Rebuild the ONNX config with use_past=True (note: this rebinds `onnxConfig`).
onnxConfig=T5OnnxConfig(config,use_past=True)

In [14]:
# Dynamic-axes mapping with use_past=True: the recorded output adds `past_key_values.*` entries.
onnxConfig.inputs

{'input_ids': {0: 'batch', 1: 'encoder_sequence'},
 'attention_mask': {0: 'batch', 1: 'past_encoder_sequence + sequence'},
 'decoder_input_ids': {0: 'batch'},
 'decoder_attention_mask': {0: 'batch', 1: 'past_decoder_sequence + sequence'},
 'past_key_values.0.decoder.key': {0: 'batch', 2: 'past_decoder_sequence'},
 'past_key_values.0.decoder.value': {0: 'batch', 2: 'past_decoder_sequence'},
 'past_key_values.0.encoder.key': {0: 'batch', 2: 'past_encoder_sequence'},
 'past_key_values.0.encoder.value': {0: 'batch', 2: 'past_encoder_sequence'},
 'past_key_values.1.decoder.key': {0: 'batch', 2: 'past_decoder_sequence'},
 'past_key_values.1.decoder.value': {0: 'batch', 2: 'past_decoder_sequence'},
 'past_key_values.1.encoder.key': {0: 'batch', 2: 'past_encoder_sequence'},
 'past_key_values.1.encoder.value': {0: 'batch', 2: 'past_encoder_sequence'},
 'past_key_values.2.decoder.key': {0: 'batch', 2: 'past_decoder_sequence'},
 'past_key_values.2.decoder.value': {0: 'batch', 2: 'past_decoder_seq

In [15]:
# Attention-head counts exposed by the ONNX config; the recorded output is a 2-tuple,
# presumably (encoder, decoder) — confirm against the base class.
onnxConfig.num_attention_heads

(8, 8)

In [16]:
# Default ONNX opset declared by T5OnnxConfig.default_onnx_opset.
onnxConfig.default_onnx_opset

13