In [1]:
import math  # 数学函数库，如sqrt、log等
import os # 操作系统接口，用于路径等操作
from dataclasses import dataclass  # 简化数据类定义
from typing import Dict, List, Optional, Tuple, Union,Callable # 类型注解支持
import torch  # PyTorch 主库
from torch import nn # 神经网络模块，如Linear, Conv, etc.
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss  # 常用损失函数
from transformers.activations import ACT2FN # 激活函数映射，如gelu、relu等
from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa # 构建SDPA注意力mask
from transformers.modeling_outputs import (  # HF定义的标准模型输出格式
    BaseModelOutput,
    BaseModelOutputWithPooling,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from transformers.modeling_utils import PreTrainedModel  # HF所有模型的基类
from transformers.pytorch_utils import ( # PyTorch相关实用函数
    apply_chunking_to_forward,  # 前向分块处理
    find_pruneable_heads_and_indices, # 找到可剪枝的头
    is_torch_greater_or_equal_than_2_2, # 检查torch版本
    prune_linear_layer,  # 剪枝线性层
)
from transformers.utils import ( # HF通用工具
    ModelOutput,  # 所有模型输出的基类
    add_code_sample_docstrings,  # 添加样例代码
    add_start_docstrings, # 添加类或方法文档开头
    add_start_docstrings_to_model_forward, # 给forward方法加文档注释
    logging,  # 日志工具
    replace_return_docstrings,  # 替换返回值文档说明
)
from transformers.models.albert.configuration_albert import AlbertConfig  # ALBERT模型的配置类

2025-05-23 07:57:39.303812: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747987059.335592      79 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747987059.344674      79 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
logger = logging.get_logger(__name__) # Module-level logger obtained from the transformers logging utilities
# Checkpoint identifier; this notebook later passes it to AlbertConfig.from_pretrained.
_CHECKPOINT_FOR_DOC = "albert/albert-base-v2"
# Config class name as a string (presumably consumed by docstring decorators — TODO confirm).
_CONFIG_FOR_DOC = "AlbertConfig"

In [3]:
# Load weights stored in a TensorFlow checkpoint into a PyTorch ALBERT model.
def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
    """Load a TensorFlow checkpoint into a PyTorch model.

    Every variable in the checkpoint is read, its TF name is rewritten by a fixed, ordered
    sequence of string replacements into a "/"-separated attribute path, and that path is
    walked with ``getattr`` starting at ``model`` to locate the destination tensor.

    Args:
        model: Root object against which the rewritten attribute paths are resolved.
        config: Not referenced anywhere in this function body.
        tf_checkpoint_path: Path to the TensorFlow checkpoint; converted to an absolute path.

    Returns:
        The same ``model`` object, after ``.data`` of the matched tensors has been replaced.

    Raises:
        ImportError: If ``re``, ``numpy`` or ``tensorflow`` cannot be imported (logged, then re-raised).
        ValueError: If a checkpoint array's shape differs from the destination tensor's shape.
    """
    try:
        import re
        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Read every variable stored in the TensorFlow checkpoint.
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)
    # Print all variable names (this loop can be commented out).
    for name, array in zip(names, arrays):
        print(name)

    for name, array in zip(names, arrays):
        original_name = name

        # Strip the TF Hub prefix.
        name = name.replace("module/", "")

        # Renames that map TF variable names onto the PyTorch module structure.
        # The replacements are applied in order, and later ones see the result of earlier ones.
        name = name.replace("ffn_1", "ffn")
        name = name.replace("bert/", "albert/")
        name = name.replace("attention_1", "attention")
        name = name.replace("transform/", "")
        name = name.replace("LayerNorm_1", "full_layer_layer_norm")
        name = name.replace("LayerNorm", "attention/LayerNorm")
        name = name.replace("transformer/", "")

        # The feed forward layer had an 'intermediate' step which has been abstracted away
        name = name.replace("intermediate/dense/", "")
        name = name.replace("ffn/intermediate/output/dense/", "ffn_output/")

        # ALBERT attention was split between self and output which have been abstracted away
        name = name.replace("/output/", "/")
        name = name.replace("/self/", "/")

        # The pooler is a linear layer
        name = name.replace("pooler/dense", "pooler")

        # The classifier was simplified to predictions from cls/predictions
        name = name.replace("cls/predictions", "predictions")
        name = name.replace("predictions/attention", "predictions")

        # Naming was changed to be more explicit
        name = name.replace("embeddings/attention", "embeddings")
        name = name.replace("inner_group_", "albert_layers/")
        name = name.replace("group_", "albert_layer_groups/")

        # Classifier weights: a single-segment name containing output_bias/output_weights
        # is placed under the "classifier/" prefix.
        if len(name.split("/")) == 1 and ("output_bias" in name or "output_weights" in name):
            name = "classifier/" + name

        # No ALBERT model currently handles the next sentence prediction task
        # seq_relationship variables are redirected to sop_classifier/classifier (the SOP head).
        if "seq_relationship" in name:
            name = name.replace("seq_relationship/output_", "sop_classifier/classifier/")
            name = name.replace("weights", "weight")

        name = name.split("/")

        # Skip optimizer state and the step counter. `name` is a list at this point, so each
        # `in` test matches a whole path segment, not a substring.
        if (
            "adam_m" in name
            or "adam_v" in name
            or "AdamWeightDecayOptimizer" in name
            or "AdamWeightDecayOptimizer_1" in name
            or "global_step" in name
        ):
            logger.info(f"Skipping {'/'.join(name)}")
            continue
        # Walk the path segment by segment to reach the matching PyTorch parameter.
        pointer = model
        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name): # segments shaped like layer_11
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]

            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "output_weights":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "squad":
                pointer = getattr(pointer, "classifier")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    # NOTE(review): this `continue` belongs to the inner `for m_name` loop, so the
                    # variable is NOT skipped despite the log message; the remaining segments are
                    # resolved against the unchanged `pointer`. Confirm this fall-through is intended.
                    logger.info(f"Skipping {'/'.join(name)}")
                    continue
            if len(scope_names) >= 2:  # index into a specific layer
                num = int(scope_names[1])
                pointer = pointer[num]

        # `m_name` is the last path segment left over from the loop above.
        # Embedding modules need one more hop to `.weight`; kernels are transposed before copying.
        if m_name[-11:] == "_embeddings":
            pointer = getattr(pointer, "weight")
        elif m_name == "kernel":
            array = np.transpose(array)
        # Verify that source and destination shapes agree; the shapes are appended to the error args.
        try:
            if pointer.shape != array.shape:
                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
        except ValueError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print(f"Initialize PyTorch weight {name} from {original_name}")
        pointer.data = torch.from_numpy(array)

    return model

In [4]:
# .expand((1, -1)) 会返回一个新的 view（共享内存，不复制数据），将原始 shape (8,) 扩展为 shape (1, 8)。
# 创建了一个新的 Tensor 对象
# 该对象的 .data 部分指向相同的底层内存
torch.arange(8).expand((1, -1))

tensor([[0, 1, 2, 3, 4, 5, 6, 7]])

In [5]:
a = torch.arange(8)
b = a.view(2, 4)  # 或 b = a.expand(1, -1)（如果合法）
b[0][0] = 999
print(a)  # 原始 a 也会变，说明共享内存
print(b) 

tensor([999,   1,   2,   3,   4,   5,   6,   7])
tensor([[999,   1,   2,   3],
        [  4,   5,   6,   7]])


In [6]:
torch.zeros(8, dtype=torch.long)

tensor([0, 0, 0, 0, 0, 0, 0, 0])

In [7]:
from transformers import AlbertConfig

In [8]:
# 获取 config（自动识别为 AlbertConfig）
config = AlbertConfig.from_pretrained(_CHECKPOINT_FOR_DOC)

In [9]:
config

AlbertConfig {
  "architectures": [
    "AlbertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 12,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.51.3",
  "type_vocab_size": 2,
  "vocab_size": 30000
}

In [None]:
# 获取 model（一般是 Bert 类的基础模型结构）
# model = AutoModel.from_pretrained(checkpoint, config=config)

In [10]:
# Embedding layer that combines word, position and token-type embeddings.
class AlbertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings.

    The three lookups are summed (the position term only when
    ``position_embedding_type == "absolute"``), then passed through LayerNorm and dropout.
    All embedding tables use ``config.embedding_size`` as their width.
    """

    def __init__(self, config: AlbertConfig):
        super().__init__()
        # Word embeddings; the row at `pad_token_id` is the padding index.
        self.word_embeddings = nn.Embedding(
            config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id
        )
        # Absolute position embeddings.
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
        # Token-type (segment) embeddings, e.g. to tell sentence A from sentence B.
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)
        # The attribute keeps the TensorFlow-style name `LayerNorm` so that checkpoint
        # variable names can be mapped onto it.
        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Buffers are not returned by `parameters()` but move with the module (`.to(device)`).
        # `persistent=False` keeps them out of the state dict: both tensors below can be
        # rebuilt from the config, so there is no reason to serialize them.
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        # Default token-type ids: all zeros, same shape as `position_ids`.
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values_length: int = 0,
    ) -> torch.Tensor:
        """Compute the embedding output for a batch.

        Args:
            input_ids: Token ids; takes precedence over ``inputs_embeds`` for shape inference.
            token_type_ids: Segment ids; defaults to the all-zero buffer sliced to the sequence length.
            position_ids: Position ids; defaults to a slice of the ``position_ids`` buffer.
            inputs_embeds: Pre-computed word embeddings used instead of looking up ``input_ids``.
            past_key_values_length: Offset of the first position when earlier tokens are cached.

        Returns:
            The normalized, dropout-regularized sum of the embeddings.

        Raises:
            ValueError: If neither ``input_ids`` nor ``inputs_embeds`` is provided.
        """
        if input_ids is not None:
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]  # drop the embedding dimension
        else:
            # Fail with a clear message instead of an AttributeError on `None.size()`.
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        seq_length = input_shape[1]

        # Default positions continue after the cached prefix so they stay contiguous.
        # Example: 5 tokens already cached (positions 0-4) and seq_length == 1 gives
        # position_ids[:, 5:6], i.e. position 5 for the new token.
        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        # Default token types come from the registered all-zero buffer, expanded to the batch.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                token_type_ids = buffered_token_type_ids.expand(input_shape[0], seq_length)
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        # The position term is added only for absolute position embeddings.
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


In [11]:
albertEmbeddings=AlbertEmbeddings(config)

In [12]:
albertEmbeddings

AlbertEmbeddings(
  (word_embeddings): Embedding(30000, 128, padding_idx=0)
  (position_embeddings): Embedding(512, 128)
  (token_type_embeddings): Embedding(2, 128)
  (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0, inplace=False)
)

In [13]:
config.type_vocab_size

2

In [None]:
# 词嵌入是为了让token具有数学表示,以用数学方法来建模

In [14]:
# dim=0 prunes the layer's output features (weight rows); dim=1 prunes its input features (weight columns).
def prune_linear_layer(layer: nn.Linear, index: torch.LongTensor, dim: int = 0) -> nn.Linear:
    """Build a new linear layer that keeps only the entries of ``layer`` selected by ``index``.

    Args:
        layer: The linear layer to prune. It is not modified.
        index: Indices to keep along ``dim`` of ``layer.weight``.
        dim: Weight dimension to prune: 0 (output features) or 1 (input features).
            Negative values are normalized, so ``-1`` behaves like ``1``.

    Returns:
        A new ``nn.Linear`` on the same device and with the same dtype as ``layer``, holding
        copies of the kept weights (and bias, if any), with ``requires_grad=True``.
    """
    weight = layer.weight
    # Normalize negative dims so the bias handling below sees the canonical 0/1 value;
    # otherwise dim=-1 would wrongly slice the bias with input-feature indices.
    if dim < 0:
        dim += weight.dim()
    index = index.to(weight.device)

    kept_weight = weight.index_select(dim, index).clone().detach()
    has_bias = layer.bias is not None
    if has_bias:
        if dim == 1:
            # Pruning inputs leaves the number of outputs, and therefore the bias, unchanged.
            kept_bias = layer.bias.clone().detach()
        else:
            kept_bias = layer.bias[index].clone().detach()

    # Weight shape is [out_features, in_features]; only the pruned axis changes size.
    new_size = list(weight.size())
    new_size[dim] = len(index)

    # Match both device and dtype of the source layer so half/double weights are not
    # silently cast to float32 when copied in.
    new_layer = nn.Linear(new_size[1], new_size[0], bias=has_bias).to(device=weight.device, dtype=weight.dtype)
    # Copy under no_grad so the in-place writes to leaf parameters are allowed and untracked.
    with torch.no_grad():
        new_layer.weight.copy_(kept_weight.contiguous())
        if has_bias:
            new_layer.bias.copy_(kept_bias.contiguous())
    return new_layer

In [15]:
aa=nn.Linear(512,384)
print(aa.weight.shape,aa.bias.shape)

torch.Size([384, 512]) torch.Size([384])


In [16]:
list(aa.weight.size())

[384, 512]

In [17]:
def find_pruneable_heads_and_indices(
    heads: list[int], n_heads: int, head_size: int, already_pruned_heads: set[int]
) -> tuple[set[int], torch.LongTensor]:
    """Work out which heads still need pruning and which flat indices survive.

    Args:
        heads: Head numbers requested for pruning, in the original (unpruned) numbering.
        n_heads: Number of heads currently present in the layer.
        head_size: Width of each head.
        already_pruned_heads: Heads removed earlier, in the original numbering.

    Returns:
        A tuple ``(heads, index)``: the set of requested heads that were not already pruned,
        and a 1-D long tensor with the positions to keep in the flattened
        ``n_heads * head_size`` dimension.
    """
    remaining_to_prune = set(heads) - already_pruned_heads
    keep_mask = torch.ones(n_heads, head_size)
    for original_head in remaining_to_prune:
        # Heads pruned earlier no longer occupy a row, so shift the row index left by
        # the number of already-pruned heads that came before this one.
        shift = sum(1 if pruned < original_head else 0 for pruned in already_pruned_heads)
        keep_mask[original_head - shift] = 0
    flat_keep = keep_mask.view(-1).contiguous().eq(1)
    index: torch.LongTensor = torch.arange(len(flat_keep))[flat_keep].long()
    return remaining_to_prune, index

In [18]:
class AlbertAttention(nn.Module):
    """Multi-head self-attention followed by an output projection, residual connection and LayerNorm.

    With ``position_embedding_type`` set to ``"relative_key"`` or ``"relative_key_query"`` a learned
    distance embedding contributes extra terms to the attention scores; any other value
    (``"absolute"`` by default) adds nothing here.
    """

    def __init__(self, config: AlbertConfig):
        super().__init__()
        # The hidden size must be divisible by the number of heads. Note that the check is
        # bypassed whenever the config defines `embedding_size`.
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.attention_head_size = config.hidden_size // config.num_attention_heads  # width of one head
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Query / key / value projections.
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        # Dropout on the attention probabilities and on the projected attention output.
        self.attention_dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.output_dropout = nn.Dropout(config.hidden_dropout_prob)
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)  # output projection
        # LayerNorm applied after the residual connection.
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pruned_heads = set()  # heads removed so far, in the original numbering

        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            # Relative offsets span [-(max-1), max-1], i.e. 2*max - 1 distinct values.
            self.distance_embedding = nn.Embedding(
                2 * config.max_position_embeddings - 1, self.attention_head_size
            )

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        """Split the last dimension into heads and move the head axis forward.

        Reshapes ``(..., all_head_size)`` to ``(..., num_heads, head_size)`` and then permutes a
        4-D result from ``(batch, seq_len, num_heads, head_size)`` to
        ``(batch, num_heads, seq_len, head_size)``.
        """
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def prune_heads(self, heads: List[int]) -> None:
        """Remove the given attention heads from the projection layers.

        Args:
            heads: Head numbers to remove, in the original numbering. Heads that were
                already pruned are ignored. An empty list is a no-op.
        """
        if len(heads) == 0:
            return
        # `heads` becomes the set of heads actually removed now; `index` holds the flat
        # positions that survive in the head dimension.
        heads, index = find_pruneable_heads_and_indices(
            heads, self.num_attention_heads, self.attention_head_size, self.pruned_heads
        )

        # Q/K/V lose output features; the output projection loses the matching input features.
        self.query = prune_linear_layer(self.query, index)
        self.key = prune_linear_layer(self.key, index)
        self.value = prune_linear_layer(self.value, index)
        self.dense = prune_linear_layer(self.dense, index, dim=1)

        # Update the bookkeeping so later reshapes use the reduced head count.
        self.num_attention_heads = self.num_attention_heads - len(heads)
        self.all_head_size = self.attention_head_size * self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        """Run self-attention over ``hidden_states``.

        Args:
            hidden_states: Input of shape ``(batch, seq_len, hidden_size)``.
            attention_mask: Optional additive mask, added to the raw attention scores.
            head_mask: Optional multiplicative mask applied to the attention probabilities.
            output_attentions: Whether to also return the attention probabilities.

        Returns:
            ``(output,)`` or ``(output, attention_probs)`` where ``output`` is
            ``LayerNorm(hidden_states + dropout(dense(context)))``.
        """
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        # (batch, heads, seq_len, head_size)
        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        # Raw scores: Q @ K^T / sqrt(head_size).
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        if attention_mask is not None:
            # The mask is additive and is expected to be prepared by the caller.
            attention_scores = attention_scores + attention_mask

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            seq_length = hidden_states.size()[1]
            # Column vector minus row vector gives the [seq_len, seq_len] matrix of offsets i - j.
            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r
            # Shift offsets by max_position_embeddings - 1 so they are valid, non-negative table indices.
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                # query [B, H, L, D] x distance embedding [L, R, D] -> [B, H, L, R]
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                # Both the query and the key are scored against the distance embedding.
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.attention_dropout(attention_probs)

        # Mask heads if requested.
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        # Weighted sum of values, then merge heads back: (B, H, L, D) -> (B, L, H * D).
        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.transpose(2, 1).flatten(2)

        # Output projection -> dropout -> residual connection -> LayerNorm.
        projected_context_layer = self.dense(context_layer)
        projected_context_layer_dropout = self.output_dropout(projected_context_layer)
        layernormed_context_layer = self.LayerNorm(hidden_states + projected_context_layer_dropout)
        return (layernormed_context_layer, attention_probs) if output_attentions else (layernormed_context_layer,)

In [19]:
position_ids_l =torch.arange(8, dtype=torch.long).view(-1, 1) # (8,1) 8:seq_length
position_ids_r = torch.arange(8, dtype=torch.long).view(1, -1) # (1,8)

In [20]:
position_ids_r

tensor([[0, 1, 2, 3, 4, 5, 6, 7]])

In [21]:
distance = position_ids_l - position_ids_r

In [23]:
attention=AlbertAttention(config)

In [24]:
distance
# 每一行 i 表示：第 i 个 token 和所有其他 token 的相对距离。
# distance[2][0] = 2 - 0 = 2，表示第3个 token 距离第1个 token 向右偏移2个位置
# distance[2][3] = 2 - 3 = -1，表示第3个 token 距离第4个 token 向左偏移1个位置
# 这段代码是为了计算序列中所有 token 对所有其他 token 的相对位置，也就是一个完整的 [L, L] 相对位置矩阵，
# 用于构建 attention 中的相对位置打分。
# 不是仅限相邻 token，恰恰相反，是对所有 token 两两之间都建模。

tensor([[ 0, -1, -2, -3, -4, -5, -6, -7],
        [ 1,  0, -1, -2, -3, -4, -5, -6],
        [ 2,  1,  0, -1, -2, -3, -4, -5],
        [ 3,  2,  1,  0, -1, -2, -3, -4],
        [ 4,  3,  2,  1,  0, -1, -2, -3],
        [ 5,  4,  3,  2,  1,  0, -1, -2],
        [ 6,  5,  4,  3,  2,  1,  0, -1],
        [ 7,  6,  5,  4,  3,  2,  1,  0]])

In [25]:
distance.shape

torch.Size([8, 8])

In [26]:
config.position_embedding_type ="relative_key"

In [27]:
attention=AlbertAttention(config)

In [28]:
attention.max_position_embeddings

512

In [29]:
distance + attention.max_position_embeddings - 1

tensor([[511, 510, 509, 508, 507, 506, 505, 504],
        [512, 511, 510, 509, 508, 507, 506, 505],
        [513, 512, 511, 510, 509, 508, 507, 506],
        [514, 513, 512, 511, 510, 509, 508, 507],
        [515, 514, 513, 512, 511, 510, 509, 508],
        [516, 515, 514, 513, 512, 511, 510, 509],
        [517, 516, 515, 514, 513, 512, 511, 510],
        [518, 517, 516, 515, 514, 513, 512, 511]])

In [30]:
attention.distance_embedding

Embedding(1023, 64)

In [31]:
positional_embedding = attention.distance_embedding(distance + attention.max_position_embeddings - 1)

In [32]:
positional_embedding.shape

torch.Size([8, 8, 64])

In [33]:
heads=[1]

In [34]:
heads, index=find_pruneable_heads_and_indices(heads,attention.num_attention_heads,attention.attention_head_size,attention.pruned_heads)

In [35]:
query = prune_linear_layer(attention.query, index)

In [36]:
query

Linear(in_features=768, out_features=704, bias=True)

In [37]:
is_torch_greater_or_equal_than_2_2

True

In [38]:
torch.__version__

'2.6.0+cu124'

In [39]:
class AlbertSdpaAttention(AlbertAttention):
    """Attention variant that uses ``torch.nn.functional.scaled_dot_product_attention``.

    The fused kernel is used only for absolute position embeddings, without a head mask and
    without returning attention weights; every other case is delegated to the parent class.
    """

    def __init__(self, config):
        super().__init__(config)
        self.dropout_prob = config.attention_probs_dropout_prob
        # torch < 2.2 needs contiguous q/k/v when a custom mask is passed (see forward).
        self.require_contiguous_qkv = not is_torch_greater_or_equal_than_2_2

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        """Run attention through SDPA, or through the parent implementation when SDPA cannot be used.

        Returns:
            A 1-tuple with the attention block output on the SDPA path; on the fallback path,
            whatever ``AlbertAttention.forward`` returns.
        """
        sdpa_supported = (
            self.position_embedding_type == "absolute" and not output_attentions and head_mask is None
        )
        if not sdpa_supported:
            logger.warning(
                "AlbertSdpaAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support "
                "non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to "
                "the eager attention implementation, but specifying the eager implementation will be required from "
                "Transformers version v5.0.0 onwards. This warning can be removed using the argument "
                '`attn_implementation="eager"` when loading the model.'
            )
            return super().forward(hidden_states, attention_mask, head_mask, output_attentions)

        batch_size, seq_len, _ = hidden_states.size()

        # Project and split into heads: each tensor is [B, H, L, D].
        query_layer = self.transpose_for_scores(self.query(hidden_states))
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

        # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom
        # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0.
        # Reference: https://github.com/pytorch/pytorch/issues/112577
        needs_contiguous = (
            attention_mask is not None and self.require_contiguous_qkv and query_layer.device.type == "cuda"
        )
        if needs_contiguous:
            query_layer, key_layer, value_layer = (
                query_layer.contiguous(),
                key_layer.contiguous(),
                value_layer.contiguous(),
            )

        # Dropout inside the fused kernel is active only in training mode.
        dropout_p = self.dropout_prob if self.training else 0.0
        attention_output = torch.nn.functional.scaled_dot_product_attention(
            query_layer,
            key_layer,
            value_layer,
            attn_mask=attention_mask,
            dropout_p=dropout_p,
            is_causal=False,  # bidirectional attention, no causal masking
        )

        # [B, H, L, D] -> [B, L, H, D] -> [B, L, H * D]
        merged_heads = attention_output.transpose(1, 2).reshape(batch_size, seq_len, self.all_head_size)

        # Output projection -> dropout -> residual connection -> LayerNorm.
        projected = self.output_dropout(self.dense(merged_heads))
        return (self.LayerNorm(hidden_states + projected),)

In [None]:
# AlbertSdpaAttention 是为了在支持条件下替换掉原本的手写 attention 逻辑，使用 PyTorch 官方 fused 实现 
# scaled_dot_product_attention，带来性能优势，同时保留对旧特性的 fallback 兼容处理。
# 是否采用 SDPA 主要受以下三者限制：
# 必须是 absolute 位置编码
# 不要求 output_attentions
# 不应用 head_mask
# 这是 HuggingFace 正在逐步推动的优化方向，未来（如 v5.0.0）将需要显式指定是否使用 eager 实现。

In [None]:
# is_causal 参数控制 注意力是否为因果注意力（causal attention），即是否阻止当前 token 看到未来 token。
# is_causal = True 的情况（自回归模式）：
# 用于 语言模型推理、生成任务、训练 autoregressive 模型，如：
# GPT 系列（GPT-2, GPT-3, GPT-4 等）
# Transformer Decoder
# LLM 推理或训练阶段
# 目的是确保：
# 第 i 个 token 只能看到前 i 个位置，不能“偷看”未来 token。
# 🔁 等效于注意力 mask 为下三角矩阵。
# ✅ is_causal = False 的情况（非自回归，双向注意力）：
# 用于 编码器模型或非生成任务，如：
# BERT、RoBERTa、ALBERT
# 任何 encoder-only 架构
# classification / embedding / span prediction 等任务
# 这种情况可以自由看到整个输入序列，不加因果遮挡。

In [40]:
# Lookup table from an attention-implementation name to the class implementing it;
# AlbertLayer indexes it with `config._attn_implementation`.
ALBERT_ATTENTION_CLASSES = {
    "eager": AlbertAttention,
    "sdpa": AlbertSdpaAttention,
}

In [41]:
config._attn_implementation

'eager'

In [42]:
config.hidden_act

'gelu_new'

In [43]:
import inspect

In [225]:
# Splits the inputs into chunks of `chunk_size` along `chunk_dim`, applies `forward_fn` to each
# chunk and concatenates the results. This lowers peak memory for functions that treat every
# position independently (e.g. a feed-forward block); for such functions the result equals
# that of a single un-chunked call.
def apply_chunking_to_forward(
    forward_fn: Callable[..., torch.Tensor],
    chunk_size: int,
    chunk_dim: int,
    *input_tensors,
) -> torch.Tensor:
    """Apply ``forward_fn`` chunk by chunk along ``chunk_dim`` and concatenate the outputs.

    Args:
        forward_fn: Function to call; must take exactly as many parameters as there are input tensors.
        chunk_size: Size of each chunk along ``chunk_dim``. If it is not positive, ``forward_fn``
            is called once on the full inputs.
        chunk_dim: Dimension along which the inputs are split and the outputs concatenated.
        *input_tensors: One or more tensors that all share the same size along ``chunk_dim``.

    Returns:
        The concatenated outputs, or the direct result of ``forward_fn`` when no chunking is done.

    Raises:
        AssertionError: If no input tensor is given.
        ValueError: If the number of tensors does not match the signature of ``forward_fn``, if the
            tensors differ in size along ``chunk_dim``, or if that size is not a multiple of
            ``chunk_size``.

    Examples:

    ```python
    # rename the usual forward() fn to forward_chunk()
    def forward_chunk(self, hidden_states):
        hidden_states = self.decoder(hidden_states)
        return hidden_states


    # implement a chunked forward function
    def forward(self, hidden_states):
        return apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states)
    ```"""
    assert len(input_tensors) > 0, f"{input_tensors} has to be a tuple/list of tensors"

    # The number of parameters of forward_fn must match the number of tensors passed in.
    num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters)
    if num_args_in_forward_chunk_fn != len(input_tensors):
        raise ValueError(
            f"forward_chunk_fn expects {num_args_in_forward_chunk_fn} arguments, but {len(input_tensors)} input "
            "tensors are given"
        )

    if chunk_size > 0:
        # Every input must have the same length along the chunked dimension.
        tensor_shape = input_tensors[0].shape[chunk_dim]
        for input_tensor in input_tensors:
            if input_tensor.shape[chunk_dim] != tensor_shape:
                raise ValueError(
                    f"All input tensors have to be of the same shape: {tensor_shape}, "
                    f"found shape {input_tensor.shape[chunk_dim]}"
                )

        # Chunking is only allowed when the dimension splits into equally sized pieces.
        if tensor_shape % chunk_size != 0:
            raise ValueError(
                f"The dimension to be chunked {tensor_shape} has to be a multiple of the chunk "
                f"size {chunk_size}"
            )

        num_chunks = tensor_shape // chunk_size

        # Split every input tensor into the same number of chunks ...
        input_tensors_chunks = tuple(input_tensor.chunk(num_chunks, dim=chunk_dim) for input_tensor in input_tensors)
        # ... call forward_fn on the chunks that share a position ...
        output_chunks = tuple(forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks))
        # ... and stitch the outputs back together along the same dimension.
        return torch.cat(output_chunks, dim=chunk_dim)

    # Non-positive chunk_size: chunking is disabled, call forward_fn once on the full inputs.
    return forward_fn(*input_tensors)

In [227]:
# One ALBERT encoder layer: attention sub-layer, then a feed-forward network whose output is
# added back to the attention output and layer-normalised. Parameter sharing is not
# implemented in this class.
class AlbertLayer(nn.Module):
    """Attention + position-wise feed-forward block used inside an ALBERT layer group."""

    def __init__(self, config: AlbertConfig):
        super().__init__()
        self.config = config
        # Chunk size forwarded to `apply_chunking_to_forward`; a value of 0 means "no chunking".
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # Axis along which the feed-forward input is chunked (the sequence axis of (B, L, D)).
        self.seq_len_dim = 1
        # LayerNorm applied to (feed-forward output + attention output).
        self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Attention implementation is looked up by the name stored in the config.
        self.attention = ALBERT_ATTENTION_CLASSES[config._attn_implementation](config)
        # Feed-forward expansion: hidden_size -> intermediate_size.
        self.ffn = nn.Linear(config.hidden_size, config.intermediate_size)
        # Feed-forward projection: intermediate_size -> hidden_size.
        self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size)
        self.activation = ACT2FN[config.hidden_act]
        # NOTE(review): this dropout module is created but not called anywhere in this class.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run attention followed by the (optionally chunked) feed-forward block.

        Args:
            hidden_states: Input to the attention sub-layer.
            attention_mask: Passed through to the attention module.
            head_mask: Passed through to the attention module.
            output_attentions: Passed through to the attention module.
            output_hidden_states: Accepted for signature parity; not used here.

        Returns:
            A tuple whose first element is the layer output, followed by whatever
            extra elements the attention module returned after its first one.
        """
        attention_outputs = self.attention(hidden_states, attention_mask, head_mask, output_attentions)
        context = attention_outputs[0]
        extras = attention_outputs[1:]

        # Feed-forward pass, chunked along the sequence axis when chunk_size_feed_forward > 0.
        feed_forward = apply_chunking_to_forward(
            self.ff_chunk,
            self.chunk_size_feed_forward,
            self.seq_len_dim,
            context,
        )

        # Residual connection around the feed-forward block, then LayerNorm.
        layer_output = self.full_layer_layer_norm(feed_forward + context)
        return (layer_output,) + extras

    def ff_chunk(self, attention_output: torch.Tensor) -> torch.Tensor:
        """Feed-forward network: expand, apply the activation, project back to hidden_size."""
        return self.ffn_output(self.activation(self.ffn(attention_output)))

In [153]:
albertLayer=AlbertLayer(config)

In [154]:
albertLayer

AlbertLayer(
  (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (attention): AlbertAttention(
    (query): Linear(in_features=768, out_features=768, bias=True)
    (key): Linear(in_features=768, out_features=768, bias=True)
    (value): Linear(in_features=768, out_features=768, bias=True)
    (attention_dropout): Dropout(p=0, inplace=False)
    (output_dropout): Dropout(p=0, inplace=False)
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (distance_embedding): Embedding(1023, 64)
  )
  (ffn): Linear(in_features=768, out_features=3072, bias=True)
  (ffn_output): Linear(in_features=3072, out_features=768, bias=True)
  (activation): NewGELUActivation()
  (dropout): Dropout(p=0, inplace=False)
)

In [155]:
albertLayer.chunk_size_feed_forward

0

In [156]:
# Random dummy input of shape (2, 10, 768) used to exercise the layer in the cells below.
aa=torch.randn((2,10,768))

In [157]:
aa[0].shape

torch.Size([10, 768])

In [158]:
# Iterating `aa` yields one (10, 768) tensor per batch element; each is split into two
# (5, 768) chunks along dim 0, so `aav` is a tuple of two 2-tuples.
aav=tuple(cc.chunk(2, dim=0) for cc in aa)

In [159]:
print(type(albertLayer(aa)),len(albertLayer(aa)))

5 torch.Size([2, 2, 768])
5 torch.Size([2, 2, 768])
<class 'tuple'> 1


In [162]:
aav[0][0].shape

torch.Size([5, 768])

In [163]:
print(len(aav),type(aav[0]),len(aav[0]))

2 <class 'tuple'> 2


In [164]:
albertLayer(aa)[0].shape

5 torch.Size([2, 2, 768])


torch.Size([2, 10, 768])

In [166]:
albertLayer(aa)[0].shape

5 torch.Size([2, 2, 768])


torch.Size([2, 10, 768])

In [172]:
[_ for _ in range(config.inner_group_num)]

[0]

In [173]:
nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)])

ModuleList(
  (0): AlbertLayer(
    (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (attention): AlbertAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=768, out_features=768, bias=True)
      (value): Linear(in_features=768, out_features=768, bias=True)
      (attention_dropout): Dropout(p=0, inplace=False)
      (output_dropout): Dropout(p=0, inplace=False)
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (distance_embedding): Embedding(1023, 64)
    )
    (ffn): Linear(in_features=768, out_features=3072, bias=True)
    (ffn_output): Linear(in_features=3072, out_features=768, bias=True)
    (activation): NewGELUActivation()
    (dropout): Dropout(p=0, inplace=False)
  )
)

In [174]:
class AlbertLayerGroup(nn.Module):
    """A stack of `config.inner_group_num` AlbertLayer modules that are applied in sequence.

    Each layer inside a group has its own parameters. Cross-layer parameter sharing
    comes from the caller invoking the same group instance at several layer positions.
    """

    def __init__(self, config: AlbertConfig):
        super().__init__()
        # One independent AlbertLayer per inner position of the group.
        self.albert_layers = nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)])

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
        """Apply every layer of the group in order.

        Args:
            hidden_states: Input to the first layer of the group.
            attention_mask: Forwarded unchanged to every layer.
            head_mask: Optional indexable holding one head mask per layer of this
                group. When it is None, every layer receives None as its head mask.
            output_attentions: When True, collect element 1 of each layer's output.
            output_hidden_states: When True, collect each layer's output hidden state.

        Returns:
            A tuple `(last_hidden_state, [layer_hidden_states], [layer_attentions])`;
            the bracketed entries are present only when the matching flag is True.
        """
        layer_hidden_states = ()
        layer_attentions = ()
        for layer_index, albert_layer in enumerate(self.albert_layers):
            # The signature allows head_mask=None, so guard the per-layer lookup:
            # indexing None would raise TypeError.
            layer_head_mask = head_mask[layer_index] if head_mask is not None else None
            layer_output = albert_layer(hidden_states, attention_mask, layer_head_mask, output_attentions)
            # Feed this layer's output to the next layer.
            hidden_states = layer_output[0]

            if output_attentions:
                layer_attentions = layer_attentions + (layer_output[1],)

            if output_hidden_states:
                layer_hidden_states = layer_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if output_hidden_states:
            outputs = outputs + (layer_hidden_states,)
        if output_attentions:
            outputs = outputs + (layer_attentions,)
        return outputs  # last-layer hidden state, (layer hidden states), (layer attentions)

In [175]:
layerGroup=AlbertLayerGroup(config)

In [178]:
layerGroup.albert_layers[0]

AlbertLayer(
  (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (attention): AlbertAttention(
    (query): Linear(in_features=768, out_features=768, bias=True)
    (key): Linear(in_features=768, out_features=768, bias=True)
    (value): Linear(in_features=768, out_features=768, bias=True)
    (attention_dropout): Dropout(p=0, inplace=False)
    (output_dropout): Dropout(p=0, inplace=False)
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (distance_embedding): Embedding(1023, 64)
  )
  (ffn): Linear(in_features=768, out_features=3072, bias=True)
  (ffn_output): Linear(in_features=3072, out_features=768, bias=True)
  (activation): NewGELUActivation()
  (dropout): Dropout(p=0, inplace=False)
)

In [None]:
# ALBERT 的参数共享结构：AlbertTransformer 在多个层位置上重复调用同一个 AlbertLayerGroup（共 num_hidden_groups 个组），从而实现跨层参数共享。每组内部层数由 inner_group_num 控制。
# 模块化设计：AlbertLayerGroup 封装了一个参数共享单元，为整体模型提供层级复用结构。
# 可选调试功能：支持按需输出注意力权重和中间层隐藏状态，便于调试和分析模型行为。

In [180]:
print(config.embedding_size, config.hidden_size,config.num_hidden_groups)

128 768 1


In [182]:
[None] *config.num_hidden_layers

[None, None, None, None, None, None, None, None, None, None, None, None]

In [184]:
class AlbertTransformer(nn.Module):
    """ALBERT encoder body: projects embeddings to hidden_size and runs the layer groups.

    The loop executes `config.num_hidden_layers` steps, but only
    `config.num_hidden_groups` group modules exist; each step picks a group by index,
    so one group is reused for several consecutive steps.
    """

    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.config = config
        # Linear map from embedding_size to hidden_size applied before the first layer.
        self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size)
        self.albert_layer_groups = nn.ModuleList(
            [AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)])

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[BaseModelOutput, Tuple]:
        """Encode the embedded input.

        Args:
            hidden_states: Embedding output whose last dimension is embedding_size.
            attention_mask: Forwarded unchanged to every layer group.
            head_mask: Indexable with one entry per hidden layer, or None
                (replaced by a list of None of length num_hidden_layers).
            output_attentions: Accumulate the attentions returned by each group call.
            output_hidden_states: Accumulate the hidden state after each step,
                starting with the projected embedding output.
            return_dict: Return a BaseModelOutput instead of a plain tuple.

        Returns:
            BaseModelOutput, or a tuple of the non-None values among
            (last hidden state, all hidden states, all attentions).
        """
        hidden_states = self.embedding_hidden_mapping_in(hidden_states)

        all_hidden_states = (hidden_states,) if output_hidden_states else None
        all_attentions = () if output_attentions else None

        if head_mask is None:
            head_mask = [None] * self.config.num_hidden_layers

        for layer_idx in range(self.config.num_hidden_layers):
            # Number of consecutive steps served by one group.
            layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups)
            # Index of the group that serves this step (float division, truncated).
            group_idx = int(layer_idx / (self.config.num_hidden_layers / self.config.num_hidden_groups))

            group = self.albert_layer_groups[group_idx]
            # Slice of head masks that belongs to the selected group.
            group_head_mask = head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group]
            group_outputs = group(
                hidden_states,
                attention_mask,
                group_head_mask,
                output_attentions,
                output_hidden_states,
            )
            hidden_states = group_outputs[0]

            if output_attentions:
                # The last element of the group output is its tuple of attentions.
                all_attentions = all_attentions + group_outputs[-1]

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

        if return_dict:
            return BaseModelOutput(
                last_hidden_state=hidden_states,
                hidden_states=all_hidden_states,
                attentions=all_attentions
            )
        return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)

In [None]:
#  设计意图说明：
# 参数共享：ALBERT 的核心思想是通过分组 LayerGroup 来共享参数，减少参数规模，提高效率。
# 层数控制：num_hidden_layers 决定总层数，num_hidden_groups 控制参数共享的粒度。
# 调试支持：通过 output_attentions 与 output_hidden_states，可以在训练或分析过程中提取中间结果。
# 兼容性输出：支持 return_dict=False 以返回 tuple，兼容老式调用方式。
# 这是 ALBERT 模型结构中的 Transformer 主体模块，核心是对 embedding 后的表示进行多层共享 Transformer 编码。

In [None]:
# 模型版本	Embedding Size (embedding_size)	Hidden Size (hidden_size)	Layers (num_hidden_layers)	Parameters (M)
# ALBERT-base	128	768	12	~12M
# ALBERT-large	128	1024	24	~18M
# ALBERT-xlarge	128	2048	24	~60M
# ALBERT-xxlarge	128	4096	12	~235M

In [185]:
config.initializer_range

0.02

In [None]:
# The original cell held an unfinished `from` statement, which is a syntax error.
# NOTE(review): completed as the import of the TF-checkpoint loader that the next cell
# assigns to `AlbertPreTrainedModel.load_tf_weights` — confirm this was the intent.
from transformers.models.albert.modeling_albert import load_tf_weights_in_albert

In [186]:
# Base class for the ALBERT models in this notebook: it fixes the config class, the
# checkpoint prefix and the weight-initialisation rules that subclasses inherit.
class AlbertPreTrainedModel(PreTrainedModel):
    """Common PreTrainedModel settings and weight initialisation for ALBERT models."""

    config_class = AlbertConfig  # configuration class associated with these models
    load_tf_weights = load_tf_weights_in_albert  # loader for TensorFlow checkpoints
    base_model_prefix = "albert"  # attribute name / key prefix of the base model
    _supports_sdpa = True  # declares support for the SDPA attention implementation

    def _init_weights(self, module):
        """Initialise the parameters of a single sub-module according to its type."""
        if isinstance(module, nn.Linear):
            # Plain normal(0, initializer_range); the original comment notes that this
            # differs from TensorFlow's truncated_normal. Bias, when present, is zeroed.
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # Zero the row of the padding index, when one is configured.
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            # Bias 0 and weight 1.
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return

        if isinstance(module, AlbertMLMHead):
            # The MLM head owns a standalone bias parameter; zero it.
            module.bias.data.zero_()

In [188]:
# Output container returned by AlbertForPreTraining. It subclasses Hugging Face's
# ModelOutput and bundles everything the pre-training model can return: the loss,
# the MLM logits, the SOP logits, the hidden states and the attention weights.
@dataclass
class AlbertForPreTrainingOutput(ModelOutput):
    # Total loss: masked-language-modeling loss plus sentence-order-prediction (SOP) loss.
    # None when the labels needed to compute it were not supplied.
    loss: Optional[torch.FloatTensor] = None
    # MLM logits of shape (batch_size, sequence_length, vocab_size):
    # one unnormalised score per vocabulary entry at every position.
    prediction_logits: Optional[torch.FloatTensor] = None
    # SOP classification logits of shape (batch_size, 2):
    # unnormalised scores for the two sentence-order classes.
    sop_logits: Optional[torch.FloatTensor] = None
    # Hidden states collected by the encoder, starting with the projected embedding
    # output; each element has shape (batch_size, sequence_length, hidden_size).
    # Populated when output_hidden_states=True.
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Attention weights, each of shape (batch_size, num_heads, seq_len, seq_len).
    # Populated when output_attentions=True.
    attentions: Optional[Tuple[torch.FloatTensor]] = None
# This structure gives the pre-training tasks (MLM + SOP) a single output type whose
# fields can be accessed individually, in line with the other transformers model outputs.

In [189]:
# Shared class-level documentation; passed to `add_start_docstrings` when decorating model classes.
ALBERT_START_DOCSTRING = r"""

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Args:
        config ([`AlbertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Shared documentation of the forward() inputs. The `{0}` placeholders are filled with the
# input shape via `.format(...)` where the string is used (e.g. "batch_size, sequence_length").
ALBERT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

In [191]:
config.inner_group_num

1

In [192]:
@add_start_docstrings(
    "The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.",
    ALBERT_START_DOCSTRING,
)
class AlbertModel(AlbertPreTrainedModel):
    # Backbone model: embeddings -> AlbertTransformer encoder -> optional pooler.
    # Weight loading and initialisation come from AlbertPreTrainedModel.
    config_class = AlbertConfig
    base_model_prefix = "albert"

    def __init__(self, config: AlbertConfig, add_pooling_layer: bool = True):
        super().__init__(config)
        self.config = config
        # Embedding module (defined elsewhere in the notebook).
        self.embeddings = AlbertEmbeddings(config)
        # Encoder that processes the embedded sequence.
        self.encoder = AlbertTransformer(config)
        # Optional pooler: a Linear + Tanh applied to the first token's final hidden state.
        if not add_pooling_layer:
            self.pooler = None
            self.pooler_activation = None
        else:
            self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
            self.pooler_activation = nn.Tanh()
        # Cached config values consulted in forward() when choosing the mask format.
        self.attn_implementation = config._attn_implementation
        self.position_embedding_type = config.position_embedding_type
        # Weight initialisation and final processing provided by PreTrainedModel.
        self.post_init()

    def get_input_embeddings(self) -> nn.Embedding:
        # Word-embedding table currently used by the model.
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value: nn.Embedding) -> None:
        # Replace the word-embedding table with a caller-supplied one.
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prune attention heads of the model.

        Args:
            heads_to_prune: dict of the form {layer index: list of head indices to prune}.

        The encoder stores its layers as groups (`num_hidden_groups` of them), each
        holding `inner_group_num` layers, so the number of distinct layer modules is
        num_hidden_groups * inner_group_num. The layer index used here is the
        flattened position over those modules: with inner_group_num=2, indices 0 and 1
        address the two layers of group 0 and indices 2 and 3 those of group 1.
        An index outside that range fails when the group or layer is looked up.
        The pruning itself is delegated to the attention module's `prune_heads`.
        """
        for layer, heads in heads_to_prune.items():
            inner_group_num = self.config.inner_group_num
            # Group that owns this flattened layer index ...
            group_idx = int(layer / inner_group_num)
            # ... and the layer's position inside that group.
            inner_group_idx = int(layer - group_idx * inner_group_num)
            target_group = self.encoder.albert_layer_groups[group_idx]
            target_layer = target_group.albert_layers[inner_group_idx]
            target_layer.attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPooling,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[BaseModelOutputWithPooling, Tuple]:
        # Fall back to the config defaults for flags that were not passed explicitly.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Exactly one of input_ids / inputs_embeds must be supplied.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if input_ids is not None:
            # Warn when padding tokens appear but no attention mask was given.
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()  # (batch, seq)
        else:
            input_shape = inputs_embeds.size()[:-1]

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # Default attention mask: all ones, i.e. every position is attended to.
        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)

        # Default token type ids: the embeddings' `token_type_ids` buffer when it exists
        # (sliced to the sequence length and expanded over the batch), otherwise zeros.
        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                token_type_ids = self.embeddings.token_type_ids[:, :seq_length].expand(batch_size, seq_length)
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        embedding_output = self.embeddings(
            input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
        )

        # The SDPA-specific mask is only built when all four conditions hold.
        use_sdpa_attention_mask = (
            self.attn_implementation == "sdpa"
            and self.position_embedding_type == "absolute"
            and head_mask is None
            and not output_attentions
        )

        if use_sdpa_attention_mask:
            # 4D mask in the format expected by the SDPA attention path.
            extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                attention_mask, embedding_output.dtype, tgt_len=seq_length
            )
        else:
            # Additive mask: insert two singleton axes, cast to the model dtype (fp16
            # compatibility), then map 1 -> 0 and 0 -> the most negative finite value.
            extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2).to(dtype=self.dtype)
            extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(self.dtype).min

        # Normalise the head mask to one entry per hidden layer.
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        encoder_outputs = self.encoder(
            embedding_output,
            extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Final hidden states of every token, (batch, seq, hidden).
        sequence_output = encoder_outputs[0]

        # Pooled output: Linear + Tanh on the first token, or None without a pooler.
        if self.pooler is not None:
            pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0]))
        else:
            pooled_output = None

        if return_dict:
            return BaseModelOutputWithPooling(
                last_hidden_state=sequence_output,
                pooler_output=pooled_output,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )
        return (sequence_output, pooled_output) + encoder_outputs[1:]


In [194]:
from transformers.models.albert.modeling_albert import AlbertMLMHead

In [195]:
albertModel=AlbertModel(config)

In [None]:
# 设计意图总结：
# 兼容多种输入形式（id 或嵌入）；
# 自动补齐缺省字段（如 attention_mask）；
# 支持高效 attention 实现（SDPA）；
# 统一返回结构支持后续任务（如分类、QA）；
# 灵活开启中间层输出与注意力权重输出。

In [197]:
class AlbertMLMHead(nn.Module):
    """Masked-language-modeling head: hidden_size -> embedding_size -> vocab_size logits."""

    def __init__(self, config: AlbertConfig):
        super().__init__()
        # LayerNorm over the embedding_size-dimensional intermediate representation.
        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        # Standalone output bias, later bound to the decoder's bias.
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        # Projects encoder output from hidden_size down to embedding_size.
        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
        # Produces one logit per vocabulary entry.
        self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
        self.activation = ACT2FN[config.hidden_act]
        # Make the decoder use the standalone bias parameter defined above.
        self.decoder.bias = self.bias

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return vocabulary logits for every position of `hidden_states`."""
        projected = self.activation(self.dense(hidden_states))
        normalized = self.LayerNorm(projected)  # layer normalisation
        return self.decoder(normalized)

    def _tie_weights(self) -> None:
        """Re-link `self.bias` and `self.decoder.bias` so both refer to the same parameter."""
        if self.decoder.bias.device.type != "meta":
            # Original note: ties the two if they were disconnected (on TPU or when the
            # bias is resized); the decoder's bias is the one that is kept.
            self.bias = self.decoder.bias
            return
        # Decoder bias lives on the "meta" device: point it at the standalone bias instead.
        self.decoder.bias = self.bias

In [198]:
# Head for ALBERT's Sentence Order Prediction (SOP) pre-training task.
class AlbertSOPHead(nn.Module):
    """Dropout followed by a linear classifier over the pooled sentence representation."""

    def __init__(self, config: AlbertConfig):
        super().__init__()
        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        # Maps hidden_size to config.num_labels logits.
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, pooled_output: torch.Tensor) -> torch.Tensor:
        """Return the SOP classification logits for `pooled_output`."""
        return self.classifier(self.dropout(pooled_output))

In [200]:
# @add_start_docstrings(
#     """
#     Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
#     `sentence order prediction (classification)` head.
#     """,
#     ALBERT_START_DOCSTRING,
# )
class AlbertForPreTraining(AlbertPreTrainedModel):
    # Parameter keys declared as tied weights (the MLM decoder's bias and weight).
    _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]

    def __init__(self, config: AlbertConfig):
        super().__init__(config)
        # Backbone producing the per-token and pooled representations.
        self.albert = AlbertModel(config)
        # Masked-language-modeling head.
        self.predictions = AlbertMLMHead(config)
        # Sentence-order-prediction head.
        self.sop_classifier = AlbertSOPHead(config)

        # Weight initialisation and final processing provided by PreTrainedModel.
        self.post_init()

    def get_output_embeddings(self) -> nn.Linear:
        # The MLM decoder layer acts as the output embedding.
        return self.predictions.decoder

    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
        # Swap in a new decoder layer.
        self.predictions.decoder = new_embeddings

    def get_input_embeddings(self) -> nn.Embedding:
        # Word-embedding table of the backbone.
        return self.albert.embeddings.word_embeddings

    # @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    # @replace_return_docstrings(output_type=AlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        sentence_order_label: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[AlbertForPreTrainingOutput, Tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        sentence_order_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`. `0` indicates original order (sequence A, then
            sequence B), `1` indicates switched order (sequence B, then sequence A).

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, AlbertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
        >>> model = AlbertForPreTraining.from_pretrained("albert/albert-base-v2")

        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
        >>> # Batch size 1
        >>> outputs = model(input_ids)

        >>> prediction_logits = outputs.prediction_logits
        >>> sop_logits = outputs.sop_logits
        ```"""
        # Use the config default when return_dict was not given.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Run the backbone; its first two outputs are the per-token states (for MLM)
        # and the pooled representation (for SOP).
        outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output, pooled_output = outputs[:2]

        prediction_scores = self.predictions(sequence_output)  # MLM logits per token
        sop_scores = self.sop_classifier(pooled_output)  # SOP logits per sequence

        # The loss is only computed when BOTH label tensors are supplied.
        total_loss = None
        if labels is not None and sentence_order_label is not None:
            criterion = CrossEntropyLoss()
            # MLM cross-entropy over all positions, flattened to (N, vocab_size).
            mlm_logits = prediction_scores.view(-1, self.config.vocab_size)
            masked_lm_loss = criterion(mlm_logits, labels.view(-1))
            # SOP cross-entropy over the two order classes.
            sentence_order_loss = criterion(sop_scores.view(-1, 2), sentence_order_label.view(-1))
            total_loss = masked_lm_loss + sentence_order_loss

        if return_dict:
            return AlbertForPreTrainingOutput(
                loss=total_loss,
                prediction_logits=prediction_scores,
                sop_logits=sop_scores,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple form: the loss is prepended only when it was computed.
        output = (prediction_scores, sop_scores) + outputs[2:]
        if total_loss is None:
            return output
        return (total_loss,) + output

In [202]:
# NOTE(review): `__all__` at the end of this notebook lists AlbertForPreTraining as a local name.
# If that class is defined in an earlier cell, this import rebinds the name to the installed
# library class, so the cells below would exercise the library implementation — confirm intended.
from transformers import AutoTokenizer, AlbertForPreTraining

In [203]:
# Load the tokenizer that belongs to the albert/albert-base-v2 checkpoint.
tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

In [204]:
# Load the pre-training model. The warning printed below reports that the `sop_classifier`
# weights are absent from the checkpoint and are newly initialized, so the SOP logits
# produced further down come from an untrained classifier.
model = AlbertForPreTraining.from_pretrained("albert/albert-base-v2")

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForPreTraining were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['sop_classifier.classifier.bias', 'sop_classifier.classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [206]:
# Encode with default options; the ids shown below are identical to the ones obtained with an
# explicit add_special_tokens=True in the next cell.
tokenizer.encode("Hello, my dog is cute")

[2, 10975, 15, 51, 1952, 25, 10901, 3]

In [207]:
# Encode the sentence and add a leading batch dimension -> shape (1, sequence_length).
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)

In [208]:
input_ids

tensor([[    2, 10975,    15,    51,  1952,    25, 10901,     3]])

In [209]:
# Forward pass with token ids only; no labels are passed, so only the logits are inspected below.
outputs = model(input_ids)

In [212]:
# MLM logits: one score per vocabulary entry for every token (see the shape printed below).
prediction_logits = outputs.prediction_logits
prediction_logits.shape

torch.Size([1, 8, 30000])

In [213]:
# SOP logits: two scores per input sequence (see the shape printed below).
sop_logits = outputs.sop_logits
sop_logits.shape

torch.Size([1, 2])

In [230]:
# @add_start_docstrings(
#     "Albert Model with a `language modeling` head on top.",
#     ALBERT_START_DOCSTRING,
# )
class AlbertForMaskedLM(AlbertPreTrainedModel):
    """ALBERT encoder with a masked-language-modeling head on top."""

    # Parameter names of the MLM decoder that are declared as tied weights.
    _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]

    def __init__(self, config):
        """Build the encoder (without a pooling layer) and the MLM head from `config`."""
        super().__init__(config)
        # The pooling layer is skipped: this head only consumes per-token hidden states.
        self.albert = AlbertModel(config, add_pooling_layer=False)
        # Head that turns hidden states into vocabulary scores.
        self.predictions = AlbertMLMHead(config)

        # Weight initialization and final processing.
        self.post_init()

    def get_output_embeddings(self) -> nn.Linear:
        """Return the decoder layer of the MLM head."""
        return self.predictions.decoder

    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
        """Install `new_embeddings` as the decoder and point the head's bias at its bias."""
        mlm_head = self.predictions
        mlm_head.decoder = new_embeddings
        mlm_head.bias = new_embeddings.bias

    def get_input_embeddings(self) -> nn.Embedding:
        """Return the word-embedding table of the encoder."""
        return self.albert.embeddings.word_embeddings

    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MaskedLMOutput, Tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Returns:

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, AlbertForMaskedLM

        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
        >>> model = AlbertForMaskedLM.from_pretrained("albert/albert-base-v2")

        >>> # add mask_token
        >>> inputs = tokenizer("The capital of [MASK] is Paris.", return_tensors="pt")
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> # retrieve index of [MASK]
        >>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
        >>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
        >>> tokenizer.decode(predicted_token_id)
        'france'
        ```

        ```python
        >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
        >>> labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)
        >>> outputs = model(**inputs, labels=labels)
        >>> round(outputs.loss.item(), 2)
        0.81
        ```
        """
        # Fall back to the config default when the caller does not choose an output format.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Run the encoder; its first output holds the per-token hidden states.
        encoder_outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        token_states = encoder_outputs[0]

        # Vocabulary scores for every token position.
        mlm_logits = self.predictions(token_states)

        mlm_loss = None
        if labels is not None:
            # Cross-entropy over the flattened positions (labels of -100 are ignored, see docstring).
            vocab_size = self.config.vocab_size
            criterion = CrossEntropyLoss()
            mlm_loss = criterion(mlm_logits.view(-1, vocab_size), labels.view(-1))

        if return_dict:
            # Structured output.
            return MaskedLMOutput(
                loss=mlm_loss,
                logits=mlm_logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple output: logits followed by whatever the encoder returned from index 2 onward,
        # with the loss placed first when it was computed.
        tuple_output = (mlm_logits, *encoder_outputs[2:])
        if mlm_loss is None:
            return tuple_output
        return (mlm_loss, *tuple_output)

In [231]:
# Rebinds `model` to the masked-LM variant. The warning below about unused `albert.pooler.*`
# weights is expected: AlbertForMaskedLM builds its encoder with add_pooling_layer=False.
model = AlbertForMaskedLM.from_pretrained("albert/albert-base-v2")

Some weights of the model checkpoint at albert/albert-base-v2 were not used when initializing AlbertForMaskedLM: ['albert.pooler.bias', 'albert.pooler.weight']
- This IS expected if you are initializing AlbertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [222]:
model

AlbertForMaskedLM(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertSdpaAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True

In [232]:
# Tokenize a sentence containing the mask token, returning PyTorch tensors.
inputs = tokenizer("The capital of [MASK] is Paris.", return_tensors="pt")

In [233]:
# Inference only: run the forward pass without gradient tracking and keep the logits.
with torch.no_grad():
    logits = model(**inputs).logits

In [235]:
logits.shape

torch.Size([1, 9, 30000])

In [236]:
# Position(s) of the mask token within the first sequence of the batch.
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

In [237]:
mask_token_index

tensor([4])

In [238]:
# Highest-scoring vocabulary id at each masked position of the first sequence.
predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)

In [239]:
# Convert the predicted id back to text.
tokenizer.decode(predicted_token_id)

'france'

In [240]:
# Token ids of the unmasked sentence; they supply the target id for the masked position.
labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]

In [241]:
labels

tensor([[   2,   14, 1057,   16,  714,   25, 1162,    9,    3]])

In [242]:
# Keep the target id only where the input holds the mask token; every other position is set
# to -100, which the masked-LM loss ignores (see the forward docstring above).
labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)

In [243]:
labels

tensor([[-100, -100, -100, -100,  714, -100, -100, -100, -100]])

In [248]:
# Forward pass with labels, so the output also carries the masked-LM loss.
outputs = model(**inputs, labels=labels)
print(outputs.logits.shape,outputs.loss)

torch.Size([1, 9, 30000]) tensor(0.8129, grad_fn=<NllLossBackward0>)


In [249]:
# Loss as a Python float rounded to two decimals; matches the value quoted in the docstring example.
round(outputs.loss.item(), 2)

0.81

In [250]:
config.num_labels 

2

In [252]:
# NOTE(review): `config` is not created in this part of the notebook — presumably an earlier cell.
# It prints None here; in that case AlbertForSequenceClassification.forward (next cell) infers
# the problem type from `num_labels` and the label dtype when labels are passed.
print(config.problem_type)

None


In [None]:
@add_start_docstrings(
    """
    Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """,
    ALBERT_START_DOCSTRING,
)
class AlbertForSequenceClassification(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)
        self.config = config  # kept on the instance; forward reads and updates problem_type
        self.num_labels = config.num_labels  # width of the classifier output

        # Encoder with pooler, followed by dropout and a linear layer on the pooled output.
        self.albert = AlbertModel(config)
        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="textattack/albert-base-v2-imdb",
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output="'LABEL_1'",
        expected_loss=0.12,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[SequenceClassifierOutput, Tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels used to compute the loss. With a single label (`num_labels == 1`) a regression loss is used;
            otherwise a cross-entropy or BCE loss is used.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Encode the inputs.
        encoder_outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Pooled (sequence-level) output -> dropout -> linear classifier.
        pooled = self.dropout(encoder_outputs[1])
        logits = self.classifier(pooled)

        loss = None
        if labels is not None:
            problem_type = self.config.problem_type
            if problem_type is None:
                # Not configured: infer the task from the label count and label dtype,
                # then store the result on the config so later calls reuse it.
                if self.num_labels == 1:
                    problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    problem_type = "single_label_classification"
                else:
                    problem_type = "multi_label_classification"
                self.config.problem_type = problem_type

            # Pick the loss that matches the task; an unrecognized value leaves the loss as None.
            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    loss = criterion(logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(logits, labels)
            elif problem_type == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(logits, labels)

        if return_dict:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple output: logits plus the encoder extras from index 2 onward, loss first when present.
        tuple_output = (logits, *encoder_outputs[2:])
        if loss is None:
            return tuple_output
        return (loss, *tuple_output)


In [None]:
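# Design summary for AlbertForSequenceClassification:
# - Built on AlbertModel, with one linear layer applied to pooled_output (after dropout) to perform classification or regression.
# - `config.problem_type` selects the loss (regression, single-label or multi-label classification); when it is unset, forward infers it from `num_labels` and the label dtype.
# - Accepts either `input_ids` or `inputs_embeds` and returns either a tuple or a SequenceClassifierOutput, so it fits the Hugging Face training and inference workflow.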

In [None]:




@add_start_docstrings(
    """
    Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    ALBERT_START_DOCSTRING,
)
class AlbertForTokenClassification(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        # Encoder without pooling layer: the head works on per-token hidden states.
        self.albert = AlbertModel(config, add_pooling_layer=False)

        # Use the classifier-specific dropout when configured, else the hidden dropout.
        dropout_prob = config.classifier_dropout_prob
        if dropout_prob is None:
            dropout_prob = config.hidden_dropout_prob
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[TokenClassifierOutput, Tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Per-token hidden states -> dropout -> per-token label scores.
        token_states = self.dropout(encoder_outputs[0])
        logits = self.classifier(token_states)

        loss = None
        if labels is not None:
            # Cross-entropy over all token positions, flattened.
            criterion = CrossEntropyLoss()
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        if return_dict:
            return TokenClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple output: logits plus the encoder extras from index 2 onward, loss first when present.
        tuple_output = (logits, *encoder_outputs[2:])
        if loss is None:
            return tuple_output
        return (loss, *tuple_output)


@add_start_docstrings(
    """
    Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    ALBERT_START_DOCSTRING,
)
class AlbertForQuestionAnswering(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        # Encoder without pooling layer: span prediction works on per-token hidden states.
        self.albert = AlbertModel(config, add_pooling_layer=False)
        # One linear layer producing `config.num_labels` scores per token; forward splits them
        # into a start score and an end score.
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="twmkn9/albert-base-v2-squad2",
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
        qa_target_start_index=12,
        qa_target_end_index=13,
        expected_output="'a nice puppet'",
        expected_loss=7.36,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[QuestionAnsweringModelOutput, Tuple]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits: torch.Tensor = self.qa_outputs(sequence_output)
        # Split the last dimension into width-1 slices; unpacking into exactly two tensors
        # requires the head to emit two scores per token (i.e. config.num_labels == 2).
        start_logits, end_logits = logits.split(1, dim=-1)
        # Drop the trailing singleton dimension so each tensor holds one score per token.
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        # The loss is only computed when both span boundaries are supplied.
        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            # Out-of-range targets are clamped to `sequence_length`, which is one past the last valid
            # index; that same value is passed as `ignore_index` so those targets add nothing to the loss.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            # Average of the start and end losses.
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            # Tuple output: both logits plus the encoder extras from index 2 onward, loss first when present.
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    ALBERT_START_DOCSTRING,
)
class AlbertForMultipleChoice(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)

        # Encoder with pooler; each (example, choice) pair is scored from its pooled output.
        self.albert = AlbertModel(config)
        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        # A single score per (example, choice) pair.
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MultipleChoiceModelOutput, Tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see
            *input_ids* above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # The number of choices is the second dimension of whichever input was provided.
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        # Fold the choice dimension into the batch dimension so the encoder sees one
        # sequence per (example, choice) pair; inputs that were not provided stay None.
        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )
        outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits: torch.Tensor = self.classifier(pooled_output)
        # Regroup the per-pair scores so each row holds the scores of one example's choices.
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            # Cross-entropy over the choices of each example.
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            # Tuple output: logits plus the encoder extras from index 2 onward, loss first when present.
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


In [187]:
# Names intended as the public API of this code (what a star-import would export if it
# were saved as a module).
__all__ = [
    "load_tf_weights_in_albert",
    "AlbertPreTrainedModel",
    "AlbertModel",
    "AlbertForPreTraining",
    "AlbertForMaskedLM",
    "AlbertForSequenceClassification",
    "AlbertForTokenClassification",
    "AlbertForQuestionAnswering",
    "AlbertForMultipleChoice",
]