In [None]:
from typing import TYPE_CHECKING  # flag separating type-checker-only imports from runtime code

from transformers.utils import _LazyModule  # lazy module wrapper installed into sys.modules below
from transformers.utils.import_utils import define_import_structure  # builds the import structure handed to _LazyModule

# NOTE(review): this cell mirrors a package-level __init__.py and is shown unexecuted.
# It reads globals()["__file__"] and __spec__; a later cell in this notebook prints
# __spec__ as None, and __file__ is presumably not defined in a notebook kernel either,
# so running this cell here would likely fail -- confirm before executing it.

# TYPE_CHECKING is assumed to be True by static type checkers (mypy, pyright, pyre, pylance)
# and is False at runtime.
if TYPE_CHECKING:
    from transformers.models.align.configuration_align import *  # ALIGN configuration module
    from transformers.models.align.modeling_align import *  # ALIGN modeling module
    from transformers.models.align.processing_align import *  # ALIGN processing module
else:  # runtime path
    import sys
    # Replace this module's entry in sys.modules with a _LazyModule built from the import
    # structure derived from this file. Per the original author's note, submodules are then
    # imported only when they are actually accessed.
    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(
        __name__, _file, define_import_structure(_file), module_spec=__spec__)  # forwards this module's spec

In [3]:
# Inspect the import spec of the current module; in the recorded run this printed None.
print(__spec__)

None


In [5]:
# ALIGN model configuration
from typing import TYPE_CHECKING, List  # TYPE_CHECKING flag and the List generic used in annotations below

In [6]:
# Placeholder for type-checker-only imports; the branch body is empty, so this is a no-op.
if TYPE_CHECKING:
    pass

In [7]:
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

In [8]:
logger = logging.get_logger(__name__)  # logger object keyed by the current module name

In [11]:
# ALIGN text configuration.
class AlignTextConfig(PretrainedConfig):
    r"""
    Configuration class for the ALIGN text model; it holds the parameters that define its structure.

    Per the original author's notes, the defaults follow BERT and yield a structure similar to
    [kakaobrain/align-base]. Every argument is stored unchanged as an attribute of the same name;
    any extra keyword arguments are forwarded to `PretrainedConfig.__init__`.

    Args:
        vocab_size (`int`, defaults to 30522):
            Vocabulary size, i.e. the number of distinct tokens that can be handled.
        hidden_size (`int`, defaults to 768):
            Dimension of the Transformer hidden layers, also the embedding dimension.
        num_hidden_layers (`int`, defaults to 12):
            Number of Transformer (encoder) layers.
        num_attention_heads (`int`, defaults to 12):
            Number of attention heads in each layer.
        intermediate_size (`int`, defaults to 3072):
            Dimension of the feed-forward (FFN) intermediate layer.
        hidden_act (`str`, defaults to `"gelu"`):
            Activation function name; the original notes list gelu, relu, silu and gelu_new.
        hidden_dropout_prob (`float`, defaults to 0.1):
            Dropout ratio for embeddings / FFN.
        attention_probs_dropout_prob (`float`, defaults to 0.1):
            Dropout ratio inside multi-head attention.
        max_position_embeddings (`int`, defaults to 512):
            Maximum position-embedding length, which bounds the sequence length.
        type_vocab_size (`int`, defaults to 2):
            Vocabulary size of `token_type_ids` (used to tell sentences apart).
        initializer_range (`float`, defaults to 0.02):
            Standard deviation for weight initialisation (truncated normal, per the original notes).
        layer_norm_eps (`float`, defaults to 1e-12):
            Epsilon used in LayerNorm to avoid division by zero.
        pad_token_id (`int`, defaults to 0):
            Token id used for padding.
        position_embedding_type (`str`, defaults to `"absolute"`):
            Position-embedding type; the original notes mention absolute and relative.
        use_cache (`bool`, defaults to `True`):
            Whether to cache key/values (per the original notes, only relevant for decoder models).

    Example:

    ```python
    >>> from transformers import AlignTextConfig, AlignTextModel

    >>> # Initializing a AlignTextConfig with kakaobrain/align-base style configuration
    >>> configuration = AlignTextConfig()

    >>> # Initializing a AlignTextModel (with random weights) from the kakaobrain/align-base style configuration
    >>> model = AlignTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    # String identifying the model type.
    model_type = "align_text_model"
    # Key under which this sub-configuration lives inside a multimodal (composite) configuration.
    base_config_key = "text_config"

    def __init__(
        self,
        vocab_size: int = 30522,
        hidden_size: int = 768,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 12,
        intermediate_size: int = 3072,
        hidden_act: str = "gelu",
        hidden_dropout_prob: float = 0.1,
        attention_probs_dropout_prob: float = 0.1,
        max_position_embeddings: int = 512,
        type_vocab_size: int = 2,
        initializer_range: float = 0.02,
        layer_norm_eps: float = 1e-12,
        pad_token_id: int = 0,
        position_embedding_type: str = "absolute",
        use_cache: bool = True,
        **kwargs,
    ):
        # Let the base class consume the generic keyword arguments first.
        super().__init__(**kwargs)

        # --- encoder geometry ---
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size

        # --- regularisation ---
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob

        # --- embeddings, initialisation and normalisation ---
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type

        # --- runtime / tokenisation ---
        self.use_cache = use_cache
        self.pad_token_id = pad_token_id

In [13]:
from transformers import AlignTextModel

2025-05-24 11:29:23.201647: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748086163.517040      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748086163.605276      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [14]:
# Build the notebook-defined AlignTextConfig with its default values ...
configuration = AlignTextConfig()
# ... and construct an AlignTextModel from it (random weights, per the class docstring example).
model = AlignTextModel(configuration)

In [15]:
# Display the configuration attached to the model (the cell's last expression is rendered as output).
model.config

AlignTextConfig {
  "_attn_implementation_autoset": true,
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "align_text_model",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.51.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [16]:
# ALIGN vision configuration.
class AlignVisionConfig(PretrainedConfig):
    r"""
    Configuration class for the ALIGN vision model; it holds the overall architecture parameters.

    Per the original author's notes, the design is modelled on EfficientNet (efficientnet-b7 in
    particular), instances are meant for initialising [`AlignVisionModel`], and the defaults
    approximate the vision encoder of [kakaobrain/align-base].

    Every argument is stored as an attribute of the same name. List arguments are shallow-copied
    before being stored, so the shared default lists in the signature (and lists owned by the
    caller) are never aliased by a configuration instance.

    Args:
        num_channels (`int`, defaults to 3):
            Number of input image channels (3 for RGB).
        image_size (`int`, defaults to 600):
            Side length of the input image (square images assumed).
        width_coefficient (`float`, defaults to 2.0):
            Scaling factor for network width.
        depth_coefficient (`float`, defaults to 3.1):
            Scaling factor for network depth.
        depth_divisor (`int`, defaults to 8):
            Channel counts are kept divisible by this value.
        kernel_sizes (`List[int]`):
            Convolution kernel size for each block.
        in_channels (`List[int]`):
            Number of input channels for each block.
        out_channels (`List[int]`):
            Number of output channels for each block.
        depthwise_padding (`List[int]`, defaults to `[]`):
            Blocks that should use square padding; may be empty.
        strides (`List[int]`):
            Stride for each block.
        num_block_repeats (`List[int]`):
            How many times each block is repeated per stage.
        expand_ratios (`List[int]`):
            Expansion ratio for each block (MBConv structure).
        squeeze_expansion_ratio (`float`, defaults to 0.25):
            Squeeze ratio of the Squeeze-and-Excitation module.
        hidden_act (`str`, defaults to `"swish"`):
            Activation function name; the original notes mention "silu", "relu", "swish".
        hidden_dim (`int`, defaults to 2560):
            Hidden dimension before the final fully connected layer.
        pooling_type (`str`, defaults to `"mean"`):
            Feature pooling mode, `"mean"` or `"max"`.
        initializer_range (`float`, defaults to 0.02):
            Standard deviation for weight initialisation (truncated normal, per the original notes).
        batch_norm_eps (`float`, defaults to 0.001):
            Epsilon used in BatchNorm.
        batch_norm_momentum (`float`, defaults to 0.99):
            Momentum term used in BatchNorm.
        drop_connect_rate (`float`, defaults to 0.2):
            Random drop rate applied to skip connections.

    Example:

    ```python
    >>> from transformers import AlignVisionConfig, AlignVisionModel

    >>> # Initializing a AlignVisionConfig with kakaobrain/align-base style configuration
    >>> configuration = AlignVisionConfig()

    >>> # Initializing a AlignVisionModel (with random weights) from the kakaobrain/align-base style configuration
    >>> model = AlignVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "align_vision_model"
    base_config_key = "vision_config"

    def __init__(
        self,
        num_channels: int = 3,
        image_size: int = 600,
        width_coefficient: float = 2.0,
        depth_coefficient: float = 3.1,
        depth_divisor: int = 8,
        kernel_sizes: List[int] = [3, 3, 5, 3, 5, 5, 3],
        in_channels: List[int] = [32, 16, 24, 40, 80, 112, 192],
        out_channels: List[int] = [16, 24, 40, 80, 112, 192, 320],
        depthwise_padding: List[int] = [],
        strides: List[int] = [1, 2, 2, 2, 1, 2, 1],
        num_block_repeats: List[int] = [1, 2, 2, 3, 3, 4, 1],
        expand_ratios: List[int] = [1, 6, 6, 6, 6, 6, 6],
        squeeze_expansion_ratio: float = 0.25,
        hidden_act: str = "swish",
        hidden_dim: int = 2560,
        pooling_type: str = "mean",
        initializer_range: float = 0.02,
        batch_norm_eps: float = 0.001,
        batch_norm_momentum: float = 0.99,
        drop_connect_rate: float = 0.2,
        **kwargs,
    ):
        super().__init__(**kwargs)

        def _own(value):
            # The list defaults above are created once and shared by every call. Storing them
            # directly would let `config.kernel_sizes.append(...)` silently change the defaults
            # of all later instances, so lists are shallow-copied. Any other value is passed
            # through untouched to stay backward-compatible.
            return list(value) if isinstance(value, list) else value

        # Scalar hyper-parameters are stored as-is.
        self.num_channels = num_channels
        self.image_size = image_size
        self.width_coefficient = width_coefficient
        self.depth_coefficient = depth_coefficient
        self.depth_divisor = depth_divisor
        # Per-block list hyper-parameters are stored as private copies.
        self.kernel_sizes = _own(kernel_sizes)
        self.in_channels = _own(in_channels)
        self.out_channels = _own(out_channels)
        self.depthwise_padding = _own(depthwise_padding)
        self.strides = _own(strides)
        self.num_block_repeats = _own(num_block_repeats)
        self.expand_ratios = _own(expand_ratios)
        self.squeeze_expansion_ratio = squeeze_expansion_ratio
        self.hidden_act = hidden_act
        self.hidden_dim = hidden_dim
        self.pooling_type = pooling_type
        self.initializer_range = initializer_range
        self.batch_norm_eps = batch_norm_eps
        self.batch_norm_momentum = batch_norm_momentum
        self.drop_connect_rate = drop_connect_rate
        # Hidden-layer estimate: total block repeats times 4. Per the original author's note,
        # each MBConv block is counted as four sub-layers (depthwise + SE + projection +
        # residual); this is not an exact count.
        self.num_hidden_layers = sum(num_block_repeats) * 4

In [17]:
from transformers import AlignVisionConfig, AlignVisionModel

In [18]:
# NOTE(review): the import in the preceding cell rebinds the name AlignVisionConfig to the
# class shipped with transformers, so this instantiates the library class rather than the
# one defined earlier in this notebook -- confirm that this is intended.
configuration = AlignVisionConfig()
# Construct an AlignVisionModel from the default configuration (random weights, per the class docstring example).
model = AlignVisionModel(configuration)

In [19]:
# Display the configuration attached to the vision model.
model.config

AlignVisionConfig {
  "_attn_implementation_autoset": true,
  "batch_norm_eps": 0.001,
  "batch_norm_momentum": 0.99,
  "depth_coefficient": 3.1,
  "depth_divisor": 8,
  "depthwise_padding": [],
  "drop_connect_rate": 0.2,
  "expand_ratios": [
    1,
    6,
    6,
    6,
    6,
    6,
    6
  ],
  "hidden_act": "swish",
  "hidden_dim": 2560,
  "image_size": 600,
  "in_channels": [
    32,
    16,
    24,
    40,
    80,
    112,
    192
  ],
  "initializer_range": 0.02,
  "kernel_sizes": [
    3,
    3,
    5,
    3,
    5,
    5,
    3
  ],
  "model_type": "align_vision_model",
  "num_block_repeats": [
    1,
    2,
    2,
    3,
    3,
    4,
    1
  ],
  "num_channels": 3,
  "num_hidden_layers": 64,
  "out_channels": [
    16,
    24,
    40,
    80,
    112,
    192,
    320
  ],
  "pooling_type": "mean",
  "squeeze_expansion_ratio": 0.25,
  "strides": [
    1,
    2,
    2,
    2,
    1,
    2,
    1
  ],
  "transformers_version": "4.51.3",
  "width_coefficient": 2.0
}

In [20]:
# Composite ALIGN configuration (text + vision).
class AlignConfig(PretrainedConfig):
    r"""
    Overall configuration class for the ALIGN model, combining an `AlignTextConfig` and an
    `AlignVisionConfig` behind a single interface.

    Args:
        text_config (`dict` or config object, *optional*):
            Text-encoder configuration. `None` selects the `AlignTextConfig` defaults. A
            configuration object is converted with `to_dict()` before use.
        vision_config (`dict` or config object, *optional*):
            Vision-encoder configuration. `None` selects the `AlignVisionConfig` defaults. A
            configuration object is converted with `to_dict()` before use.
        projection_dim (`int`, defaults to 640):
            Dimension of the shared projection space for text and image features.
        temperature_init_value (`float`, defaults to 1.0):
            Initial value of the temperature parameter (used in the contrastive loss, per the
            original notes).
        initializer_range (`float`, defaults to 0.02):
            Weight initialisation range (standard deviation of a normal distribution, per the
            original notes).
        kwargs:
            Forwarded to `PretrainedConfig.__init__`.

    Example:

    ```python
    >>> from transformers import AlignConfig, AlignModel

    >>> # Initializing a AlignConfig with kakaobrain/align-base style configuration
    >>> configuration = AlignConfig()

    >>> # Initializing a AlignModel (with random weights) from the kakaobrain/align-base style configuration
    >>> model = AlignModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a AlignConfig from a AlignTextConfig and a AlignVisionConfig
    >>> from transformers import AlignTextConfig, AlignVisionConfig

    >>> # Initializing ALIGN Text and Vision configurations
    >>> config_text = AlignTextConfig()
    >>> config_vision = AlignVisionConfig()

    >>> config = AlignConfig.from_text_vision_configs(config_text, config_vision)
    ```"""

    model_type = "align"
    # Declares the sub-configurations so the framework can recognise and manage
    # text_config and vision_config.
    sub_configs = {"text_config": AlignTextConfig, "vision_config": AlignVisionConfig}

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        projection_dim=640,
        temperature_init_value=1.0,
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Fall back to the default text configuration when none is supplied.
        if text_config is None:
            text_config = {}
            logger.info("text_config is None. Initializing the AlignTextConfig with default values.")
        elif isinstance(text_config, PretrainedConfig):
            # A config object cannot be unpacked with `**`; go through its dict form,
            # exactly as from_text_vision_configs does.
            text_config = text_config.to_dict()

        # Fall back to the default vision configuration when none is supplied.
        if vision_config is None:
            vision_config = {}
            logger.info("vision_config is None. Initializing the AlignVisionConfig with default values.")
        elif isinstance(vision_config, PretrainedConfig):
            vision_config = vision_config.to_dict()

        # Instantiate the text and vision sub-configurations.
        self.text_config = AlignTextConfig(**text_config)
        self.vision_config = AlignVisionConfig(**vision_config)

        # Parameters for the projection layers and the contrastive objective.
        self.projection_dim = projection_dim
        self.temperature_init_value = temperature_init_value
        self.initializer_range = initializer_range

    @classmethod
    def from_text_vision_configs(
        cls, text_config: AlignTextConfig, vision_config: AlignVisionConfig, **kwargs):
        """
        Build an `AlignConfig` from an `AlignTextConfig` and an `AlignVisionConfig` object.

        Args:
            text_config: Text configuration object; passed on in its `to_dict()` form.
            vision_config: Vision configuration object; passed on in its `to_dict()` form.
            kwargs: Extra keyword arguments forwarded to the `AlignConfig` constructor.

        Returns:
            A new instance of `cls` combining both sub-configurations.
        """
        return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)

In [21]:
# Public names exported by this configuration module.
__all__ = ["AlignTextConfig", "AlignVisionConfig", "AlignConfig"]

In [22]:
from transformers import AlignConfig, AlignModel

In [23]:
# NOTE(review): the import in the preceding cell rebinds AlignConfig to the class shipped with
# transformers, so this no longer uses the AlignConfig defined earlier in this notebook.
configuration = AlignConfig()

In [25]:
# Construct the full AlignModel from the default configuration (random weights, per the class docstring example).
model = AlignModel(configuration)

In [None]:
# Read the configuration back from the model and display it (this cell has no recorded execution).
configuration = model.config
configuration

In [27]:
# Default text configuration (AlignTextConfig still refers to the class defined in this notebook).
config_text = AlignTextConfig()
# Default vision configuration (AlignVisionConfig was rebound to the transformers class by an earlier import cell).
config_vision = AlignVisionConfig()

In [28]:
# Combine the two sub-configurations into one AlignConfig via the classmethod constructor.
config = AlignConfig.from_text_vision_configs(config_text, config_vision)