In [1]:
import math
import os
import warnings
from dataclasses import dataclass
from typing import Callable, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers.activations import ACT2FN
from transformers.generation import GenerationMixin
from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask_for_sdpa
from transformers.modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel, SequenceSummary
from transformers.pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer
from transformers.utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
from transformers.models.gpt2.configuration_gpt2 import GPT2Config

2025-05-31 02:43:05.397065: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748659385.854641      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748659385.974600      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Module-level logger obtained from the transformers logging helper imported above.
logger = logging.get_logger(__name__)
# Checkpoint id and config class name kept as module constants; they are not referenced by
# the cells visible here (presumably consumed by docstring decorators -- confirm).
_CHECKPOINT_FOR_DOC = "openai-community/gpt2"
_CONFIG_FOR_DOC = "GPT2Config"

In [3]:
# Copy the variables of a TensorFlow GPT-2 checkpoint into a PyTorch model.
def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
    """Load a local TensorFlow checkpoint into ``model`` and return the same model.

    Args:
        model: PyTorch model instance whose attributes mirror the checkpoint's variable scopes.
        config: Accepted for interface compatibility; not read by this function.
        gpt2_checkpoint_path: Path to the TensorFlow checkpoint on disk.

    Returns:
        ``model``, with the ``.data`` of every matched parameter replaced by the checkpoint values.

    Raises:
        ImportError: If ``re`` or ``tensorflow`` cannot be imported (logged, then re-raised).
        ValueError: If a checkpoint array and its target parameter differ in shape; the two
            shapes are carried as extra entries in ``args``.
    """
    try:
        import re
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise

    checkpoint_path = os.path.abspath(gpt2_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {checkpoint_path}")

    # Pass 1: read every variable (squeezed) before the model is touched, so a read failure
    # cannot leave the model half-updated.
    loaded = []
    for var_name, var_shape in tf.train.list_variables(checkpoint_path):
        logger.info(f"Loading TF weight {var_name} with shape {var_shape}")
        values = tf.train.load_variable(checkpoint_path, var_name)
        loaded.append((var_name, values.squeeze()))

    # Pass 2: walk each variable's scope path down the module tree and overwrite the leaf.
    for var_name, values in loaded:
        # Drop the first six characters (the "model/" prefix) and split into scope segments.
        scope_path = var_name[6:].split("/")
        target = model
        for segment in scope_path:
            # A segment such as "h0" is split into its alphabetic part and its numeric index.
            if re.fullmatch(r"[A-Za-z]+\d+", segment):
                parts = re.split(r"(\d+)", segment)
            else:
                parts = [segment]
            head = parts[0]
            if head in ("w", "g"):
                target = getattr(target, "weight")
            elif head == "b":
                target = getattr(target, "bias")
            elif head in ("wpe", "wte"):
                # Embedding tables: descend into the module, then take its weight.
                target = getattr(getattr(target, head), "weight")
            else:
                target = getattr(target, head)
            if len(parts) >= 2:
                # Numeric suffix selects an entry of the indexable container just reached.
                target = target[int(parts[1])]

        if target.shape != values.shape:
            raise ValueError(
                f"Pointer shape {target.shape} and array shape {values.shape} mismatched",
                target.shape,
                values.shape,
            )
        logger.info(f"Initialize PyTorch weight {scope_path}")
        target.data = torch.from_numpy(values)
    return model

In [4]:
# 0-dim tensor holding sqrt(64); same construction as the score divisor in eager_attention_forward.
torch.full( [], 64** 0.5)

tensor(8.)

In [5]:
# Most negative finite float32; eager_attention_forward fills masked scores with this kind of value.
torch.finfo(torch.float32).min

-3.4028234663852886e+38

In [6]:
def eager_attention_forward(module, query, key, value, attention_mask, head_mask=None, **kwargs):
    """Plain matmul/softmax attention driven by the flags stored on ``module``.

    Args:
        module: Object exposing ``scale_attn_weights``, ``scale_attn_by_inverse_layer_idx``,
            ``layer_idx``, ``is_cross_attention``, ``bias`` (boolean causal mask, only read for
            self-attention) and ``attn_dropout``.
        query, key, value: Tensors whose last two dims are (length, head_dim).
        attention_mask: Optional additive mask, added to the raw scores before the softmax.
        head_mask: Optional multiplicative mask applied to the probabilities after dropout.
        **kwargs: Accepted and ignored.

    Returns:
        ``(attn_output, attn_weights)``; ``attn_output`` has dims 1 and 2 swapped relative to
        the ``weights @ value`` product.
    """
    scores = torch.matmul(query, key.transpose(-1, -2))

    if module.scale_attn_weights:
        # Divide by sqrt(head_dim), materialised as a 0-dim tensor of matching dtype/device.
        denom = torch.full([], value.size(-1) ** 0.5, dtype=scores.dtype, device=scores.device)
        scores = scores / denom

    if module.scale_attn_by_inverse_layer_idx:
        # Optional extra damping: divide by the 1-based layer index.
        scores = scores / float(module.layer_idx + 1)

    if not module.is_cross_attention:
        # Self-attention only: keep entries allowed by the causal mask, fill the rest with
        # the dtype's most negative finite value.
        q_len = query.size(-2)
        k_len = key.size(-2)
        visible = module.bias[:, :, k_len - q_len : k_len, :k_len]
        fill = torch.full([], torch.finfo(scores.dtype).min, dtype=scores.dtype, device=scores.device)
        scores = torch.where(visible, scores, fill)

    if attention_mask is not None:
        scores = scores + attention_mask

    # Normalise over the key axis, then cast to value's dtype (no-op when they already match).
    probs = nn.functional.softmax(scores, dim=-1).type(value.dtype)
    probs = module.attn_dropout(probs)

    if head_mask is not None:
        probs = probs * head_mask

    context = torch.matmul(probs, value).transpose(1, 2)
    return context, probs

In [7]:
from transformers import GPT2Config

In [8]:
# Default GPT-2 configuration, inspected by the cells that follow.
config=GPT2Config()

In [9]:
# Side length of the square causal-mask buffer registered in GPT2Attention.
config.max_position_embeddings 

1024

In [10]:
# Lower-triangular boolean matrix as used for a causal mask: position i may attend only to
# positions <= i, so the model cannot look at future tokens. 8x8 toy version of the buffer
# GPT2Attention registers as `bias`.
torch.tril(torch.ones((8,8), dtype=torch.bool)).view(
                1, 1,8, 8
            )

tensor([[[[ True, False, False, False, False, False, False, False],
          [ True,  True, False, False, False, False, False, False],
          [ True,  True,  True, False, False, False, False, False],
          [ True,  True,  True,  True, False, False, False, False],
          [ True,  True,  True,  True,  True, False, False, False],
          [ True,  True,  True,  True,  True,  True, False, False],
          [ True,  True,  True,  True,  True,  True,  True, False],
          [ True,  True,  True,  True,  True,  True,  True,  True]]]])

In [11]:
# Same scalar that GPT2Attention registers as its `masked_bias` buffer.
torch.tensor(-1e4)

tensor(-10000.)

In [12]:
# When True, attention scores are additionally divided by (layer_idx + 1).
config.scale_attn_by_inverse_layer_idx

False

In [13]:
# When True (and eager attention is in use), GPT2Attention takes its `_upcast_and_reordered_attn` path.
config.reorder_and_upcast_attn

False

In [None]:
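# In GPT-2 (see `GPT2Attention._upcast_and_reordered_attn` below), `reorder_and_upcast_attn` works roughly as follows (illustrative pseudo-code, not the literal implementation):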
# if self.reorder_and_upcast_attn:
#     attn_weights = torch.matmul(query, key.transpose(-1, -2)).float()  # upcast to float32
#     ...
#     attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1).type_as(query)  # softmax in float32
# else:
#     attn_weights = torch.matmul(query, key.transpose(-1, -2))
#     attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1)

In [14]:
# Registry that GPT2Attention.forward indexes by `config._attn_implementation` for non-eager backends.
ALL_ATTENTION_FUNCTIONS

<transformers.modeling_utils.AttentionInterface at 0x7ee98aab6b50>

In [15]:
class GPT2Attention(nn.Module): 
    """Multi-head attention layer for GPT-2, usable as causal self-attention or as cross-attention.

    In self-attention mode a single `c_attn` projection produces query, key and value; in
    cross-attention mode `q_attn` projects the decoder states to queries and `c_attn` projects
    the encoder states to keys and values. `c_proj` is the output projection.
    """
    def __init__(self, config, is_cross_attention=False, layer_idx=None):
        """Build the projections, dropouts and mask buffers.

        Args:
            config: Model configuration; the attributes read are visible in the body below.
            is_cross_attention: Build the cross-attention variant (separate `q_attn`).
            layer_idx: Index of the owning layer; used only for inverse-layer-index scaling.

        Raises:
            ValueError: If `config.hidden_size` is not divisible by `config.num_attention_heads`.
        """
        super().__init__() 
        self.config = config
        max_positions = config.max_position_embeddings # maximum number of positions
        self.register_buffer( # full causal mask; sliced to the needed window at use time
            "bias",
            torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
                1, 1, max_positions, max_positions
            ),
            persistent=False, # non-persistent: not written to the state_dict / checkpoint
        )
        # masked_bias holds a large negative constant. It is also non-persistent, so it is not
        # saved as a checkpoint key. It is not read anywhere in this class.
        self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads # per-head width
        self.split_size = self.embed_dim 
        # The embedding width must be divisible by the number of heads.
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        # Whether to divide the attention scores by sqrt(head_dim).
        self.scale_attn_weights = config.scale_attn_weights
        self.is_cross_attention = is_cross_attention # whether this layer is cross-attention
        # Whether to additionally divide the attention scores by (layer_idx + 1).
        self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx
        self.layer_idx = layer_idx # layer index
        # Whether eager attention should go through `_upcast_and_reordered_attn`, which
        # computes the scores and the softmax in float32 (intended for numerical stability
        # under float16/bfloat16).
        self.reorder_and_upcast_attn = config.reorder_and_upcast_attn 
        if self.is_cross_attention: # cross-attention
            # c_attn projects the encoder states to key and value; q_attn projects the query.
            self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim)
            self.q_attn = Conv1D(self.embed_dim, self.embed_dim)
        else: # self-attention: one fused projection for query, key and value
            self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
        self.c_proj = Conv1D(self.embed_dim, self.embed_dim) # output projection
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        self.is_causal = True # autoregressive
        self.pruned_heads = set() # indices of heads that have already been pruned
    def prune_heads(self, heads):
        """Remove the given attention heads from `c_attn` / `c_proj` and update the head bookkeeping.

        Args:
            heads: Collection of head indices to prune; an empty collection is a no-op.
        """
        if len(heads) == 0: # nothing to prune
            return
        # Helper from transformers.pytorch_utils; presumably returns the heads still to be
        # pruned (excluding already-pruned ones) and the flat indices to keep -- confirm.
        heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads)
        # Repeat the index at offsets 0, split_size and 2 * split_size, i.e. once per
        # query/key/value section of the fused c_attn output.
        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
        # Prune the Conv1D layers (replaces their weights and biases).
        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
        # Update hyper params
        self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads))
        self.num_heads = self.num_heads - len(heads) # head count after pruning
        self.pruned_heads = self.pruned_heads.union(heads) # record the newly pruned heads

    def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None):
        """Eager attention variant that computes scores and softmax in float32.

        All scaling (1/sqrt(head_dim) and, optionally, 1/(layer_idx + 1)) is folded into the
        `alpha` argument of a single `torch.baddbmm` call executed with autocast disabled.

        Args:
            query, key, value: 4-D tensors unpacked as (batch, heads, length, head_dim).
            attention_mask: Optional additive mask applied to the scores before the softmax.
            head_mask: Optional multiplicative mask applied to the probabilities after dropout.

        Returns:
            `(attn_output, attn_weights)`; `attn_output` has dims 1 and 2 swapped relative to
            the `weights @ value` product, and `attn_weights` is cast to `value.dtype`.

        Raises:
            RuntimeError: If the softmax output is not float32.
        """
        # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM)
        bsz, num_heads, q_seq_len, dk = query.size()
        _, _, k_seq_len, _ = key.size()
        # Preallocate attn_weights for `baddbmm`
        attn_weights = torch.empty(
            bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device)
        # Compute Scale Factor
        scale_factor = 1.0
        if self.scale_attn_weights:
            scale_factor /= float(value.size(-1)) ** 0.5

        if self.scale_attn_by_inverse_layer_idx:
            scale_factor /= float(self.layer_idx + 1)
        # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk))
        with torch.amp.autocast(query.device.type, enabled=False):
            q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
            # beta=0 means the preallocated buffer's contents are not used; only alpha * (q @ k) remains.
            attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
        if not self.is_cross_attention:
            # if only "normal" attention layer implements causal mask
            query_length, key_length = query.size(-2), key.size(-2)
            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
            mask_value = torch.finfo(attn_weights.dtype).min
            # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
            # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
            mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
            attn_weights = torch.where(causal_mask, attn_weights, mask_value)
        if attention_mask is not None:
            # Apply the attention mask
            attn_weights = attn_weights + attention_mask
        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
        # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise
        if attn_weights.dtype != torch.float32:
            raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32")
        attn_weights = attn_weights.type(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)
        # Mask heads if we want to
        if head_mask is not None:
            attn_weights = attn_weights * head_mask
        attn_output = torch.matmul(attn_weights, value)
        attn_output = attn_output.transpose(1, 2)
        return attn_output, attn_weights

    def forward(
        self,
        hidden_states: Optional[Tuple[torch.FloatTensor]],
        layer_past: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        **kwargs,
    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
        """Run self- or cross-attention and project the result back to the model width.

        Args:
            hidden_states: Input states; source of the query (and of key/value in self-attention).
            layer_past: Optional cached `(key, value)` pair, concatenated in front of the new
                key/value along the sequence axis.
            attention_mask: Optional additive mask for self-attention. Replaced by
                `encoder_attention_mask` when `encoder_hidden_states` is given.
            head_mask: Optional per-head multiplicative mask.
            encoder_hidden_states: If given, the layer runs as cross-attention over these states.
            encoder_attention_mask: Mask used in place of `attention_mask` for cross-attention.
            use_cache: When exactly `True`, return the current `(key, value)` pair as `present`.
            output_attentions: When truthy, append the attention weights to the result.
            **kwargs: Forwarded to the selected attention function (not to the upcast path).

        Returns:
            `(attn_output, present)` plus `attn_weights` when `output_attentions` is set;
            `present` is `None` unless `use_cache is True`.

        Raises:
            ValueError: If `encoder_hidden_states` is passed but the layer has no `q_attn`.
        """
        # Passing encoder_hidden_states (the encoder output) selects cross-attention, which
        # requires the q_attn projection to exist.
        if encoder_hidden_states is not None:
            if not hasattr(self, "q_attn"):
                raise ValueError(
                    "If class is used as cross attention, the weights `q_attn` have to be defined. "
                    "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`."
                )
            # Cross-attention: query from the decoder states, key/value from the encoder
            # states; the mask becomes the encoder attention mask.
            query_states = self.q_attn(hidden_states)
            key_states, value_states = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2)
            attention_mask = encoder_attention_mask
        else: # self-attention: one fused projection, then split into query/key/value
            query_states, key_states, value_states = self.c_attn(hidden_states).split(self.split_size, dim=2)
        # Split the last dim into (heads, head_dim), then move heads in front of the sequence axis.
        shape_q = (*query_states.shape[:-1], -1, self.head_dim)
        shape_kv = (*key_states.shape[:-1], -1, self.head_dim)
        query_states = query_states.view(shape_q).transpose(1, 2)
        key_states = key_states.view(shape_kv).transpose(1, 2)
        value_states = value_states.view(shape_kv).transpose(1, 2)
        if layer_past is not None: # a cache was supplied
            past_key, past_value = layer_past # cached key/value states
            key_states = torch.cat((past_key, key_states), dim=-2) # concatenate along the sequence axis
            value_states = torch.cat((past_value, value_states), dim=-2)
        if use_cache is True: # caching requested
            present = (key_states, value_states) # current key/value states
        else: 
            present = None
        is_cross_attention = encoder_hidden_states is not None # whether this call is cross-attention
        # is_causal is True only when no mask was given, more than one query position is
        # present (a single position means only the current token was passed) and this is
        # not cross-attention. It is forwarded to the attention function below.
        is_causal = attention_mask is None and query_states.shape[-2] > 1 and not is_cross_attention
        # Whether the plain eager attention implementation is in use.
        using_eager = self.config._attn_implementation == "eager"
        attention_interface: Callable = eager_attention_forward # attention function to call
        if self.config._attn_implementation != "eager":
            # Fall back to eager when sdpa is configured but attention weights or a head mask
            # are requested.
            # NOTE(review): the warning text mentions only `output_attentions=True`, although
            # a non-None head_mask triggers the same fallback.
            if self.config._attn_implementation == "sdpa" and (output_attentions or head_mask is not None):
                using_eager = True
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
                )
            else: 
                # Attention functions are consistent with previous equivalent attention classes, however they do not support some options
                # (e.g. layer scaling, head mask) that eager supports. These implementations are thus equivalent to previous code, but
                # not necessarily to eager (if mentionned options are provided).
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
        # Eager attention with reorder_and_upcast_attn enabled uses the float32 path.
        if using_eager and self.reorder_and_upcast_attn:
            attn_output, attn_weights = self._upcast_and_reordered_attn(
                query_states, key_states, value_states, attention_mask, head_mask
            )
        else: # otherwise call the selected attention function
            attn_output, attn_weights = attention_interface(
                self,
                query_states,
                key_states,
                value_states,
                attention_mask,
                head_mask=head_mask,
                dropout=self.attn_dropout.p if self.training else 0.0,
                is_causal=is_causal,
                **kwargs,
            )
        # Merge the last two dims (heads, head_dim) back into a single feature dim.
        attn_output = attn_output.reshape(*attn_output.shape[:-2], -1).contiguous()
        attn_output = self.c_proj(attn_output) 
        attn_output = self.resid_dropout(attn_output) 
        outputs = (attn_output, present) # attention output + cache entry
        if output_attentions:
            outputs += (attn_weights,)
        return outputs  # a, present, (attentions)

In [16]:
# Attention backend name checked in GPT2Attention.forward.
config._attn_implementation

'eager'

In [17]:
# Key used to look up the MLP activation in ACT2FN.
config.activation_function

'gelu_new'

In [18]:
# Activation module that GPT2MLP stores as `self.act` for this config.
ACT2FN[config.activation_function]

NewGELUActivation()

In [19]:
# Position-wise feed-forward sub-layer.
class GPT2MLP(nn.Module):
    """Two-projection feed-forward block: expand, activate, project back, dropout."""

    def __init__(self, intermediate_size, config):
        """Create the projections, activation and dropout.

        Args:
            intermediate_size: Width of the hidden expansion between the two projections.
            config: Supplies `hidden_size`, `activation_function` and `resid_pdrop`.
        """
        super().__init__()
        model_width = config.hidden_size
        # Sub-modules are created in the order c_fc, c_proj, act, dropout.
        self.c_fc = Conv1D(intermediate_size, model_width)
        self.c_proj = Conv1D(model_width, intermediate_size)
        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
        """Apply c_fc -> activation -> c_proj -> dropout and return the result."""
        expanded = self.act(self.c_fc(hidden_states))
        return self.dropout(self.c_proj(expanded))

In [20]:
# print() is deliberate: a bare `None` would render no output.
# When n_inner is None, GPT2Block falls back to 4 * hidden_size for the MLP width.
print(config.n_inner)

None


In [21]:
# Controls whether GPT2Block builds its `crossattention` / `ln_cross_attn` sub-modules.
config.add_cross_attention

False

In [24]:
class GPT2Block(nn.Module):
    """One pre-LayerNorm GPT-2 layer: self-attention, optional cross-attention, then MLP.

    Each sub-layer is wrapped as `x = x + sublayer(layer_norm(x))`.
    """

    def __init__(self, config, layer_idx=None):
        """Build the layer norms, attention module(s) and MLP.

        Args:
            config: Supplies `hidden_size`, `n_inner`, `layer_norm_epsilon` and
                `add_cross_attention`, and is forwarded to the sub-modules.
            layer_idx: Index of this layer, forwarded to the attention modules.
        """
        super().__init__()
        width = config.hidden_size
        norm_eps = config.layer_norm_epsilon
        # MLP width defaults to four times the model width when n_inner is unset.
        ffn_width = 4 * width if config.n_inner is None else config.n_inner

        self.ln_1 = nn.LayerNorm(width, eps=norm_eps)
        self.attn = GPT2Attention(config=config, layer_idx=layer_idx)
        self.ln_2 = nn.LayerNorm(width, eps=norm_eps)
        if config.add_cross_attention:
            # Cross-attention sub-layer exists only when the config asks for it.
            self.crossattention = GPT2Attention(config=config, is_cross_attention=True, layer_idx=layer_idx)
            self.ln_cross_attn = nn.LayerNorm(width, eps=norm_eps)
        self.mlp = GPT2MLP(ffn_width, config)

    def forward(
        self,
        hidden_states: Optional[Tuple[torch.FloatTensor]],
        layer_past: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
        """Run the layer.

        Returns:
            `(hidden_states, present, (attentions, cross_attentions))` when `use_cache` is
            truthy; otherwise the same tuple without the `present` entry.

        Raises:
            ValueError: If `encoder_hidden_states` is given but the block was built without
                cross-attention.
        """
        # --- self-attention sub-layer ---
        self_attn_out = self.attn(
            self.ln_1(hidden_states),
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        extras = self_attn_out[1:]  # present, (attentions)
        hidden_states = self_attn_out[0] + hidden_states

        # --- optional cross-attention sub-layer ---
        if encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
                    "cross-attention layers by setting `config.add_cross_attention=True`"
                )
            cross_out = self.crossattention(
                self.ln_cross_attn(hidden_states),
                attention_mask=attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                output_attentions=output_attentions,
            )
            hidden_states = hidden_states + cross_out[0]
            # Skip the output and the (unused) cache slot; keep cross-attention weights if present.
            extras = extras + cross_out[2:]

        # --- feed-forward sub-layer ---
        hidden_states = hidden_states + self.mlp(self.ln_2(hidden_states))

        if not use_cache:
            # Drop the leading `present` entry when no cache was requested.
            extras = extras[1:]
        return (hidden_states,) + extras

In [25]:
class GPT2PreTrainedModel(PreTrainedModel):
    """
    Abstract base for the GPT-2 models: declares the class-level settings read by
    `PreTrainedModel` and provides the weight-initialisation routine.
    """

    # Configuration class and TensorFlow checkpoint loader associated with this family.
    config_class = GPT2Config
    load_tf_weights = load_tf_weights_in_gpt2
    # Attribute name under which the base model is stored.
    base_model_prefix = "transformer"
    # Capability flags.
    is_parallelizable = True
    supports_gradient_checkpointing = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    # Module class names that must not be split.
    _no_split_modules = ["GPT2Block"]
    # Key skipped during device placement.
    _skip_keys_device_placement = "past_key_values"

    def __init__(self, *inputs, **kwargs):
        """Forward all arguments to `PreTrainedModel.__init__`."""
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module):
        """Initialise the parameters of `module` in place.

        Linear/Conv1D and Embedding weights are drawn from N(0, initializer_range); biases
        and the embedding row at `padding_idx` are zeroed; LayerNorm gets weight 1 and
        bias 0. Any parameter of `module` named exactly `c_proj.weight` is then re-drawn
        with std `initializer_range / sqrt(2 * n_layer)`.
        """

        def _draw_normal(param, std):
            # In-place draw from N(0, std) on the raw tensor data.
            param.data.normal_(mean=0.0, std=std)

        if isinstance(module, (nn.Linear, Conv1D)):
            # Differs slightly from the TF version, which uses truncated_normal.
            _draw_normal(module.weight, self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            _draw_normal(module.weight, self.config.initializer_range)
            if module.padding_idx is not None:
                # The padding token's embedding is all zeros.
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        # Scaled init for the output projection: the 2 * n_layer factor counts two
        # LayerNorm-wrapped sub-layers per transformer block.
        for param_name, param in module.named_parameters():
            if param_name != "c_proj.weight":
                continue
            _draw_normal(param, self.config.initializer_range / math.sqrt(2 * self.config.n_layer))

In [26]:
# Number of layers; enters the c_proj init std in GPT2PreTrainedModel._init_weights.
config.n_layer

12

In [27]:
# Divisor applied to initializer_range for `c_proj.weight` in GPT2PreTrainedModel._init_weights.
math.sqrt(2 * config.n_layer)

4.898979485566356

In [44]:
# Output container for the double-heads model (language modeling + multiple choice).
@dataclass
class GPT2DoubleHeadsModelOutput(ModelOutput):
    """
    Output type for models that carry both a language modeling head and a multiple-choice classification head.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss.
        mc_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mc_labels` is provided):
            Multiple choice classification loss.
        logits (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        mc_logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
            Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
        past_key_values (`Tuple[Tuple[torch.Tensor]]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of length `config.n_layer`, containing tuples of tensors of shape `(batch_size, num_heads,
            sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            GPT2Attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """
    # Every field defaults to None, so a head that was not evaluated simply leaves its slot unset.
    loss: Optional[torch.FloatTensor] = None 
    mc_loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    mc_logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None  # cached key/value states
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None

In [29]:
# Class-level docstring fragment shared by the GPT-2 model classes; it is passed to `add_start_docstrings`
# on each class definition in this notebook.
GPT2_START_DOCSTRING = r"""

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`GPT2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Argument documentation shared by the `forward` methods; it is attached through
# `add_start_docstrings_to_model_forward`.
GPT2_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        past_key_values (`Tuple[Tuple[torch.Tensor]]` of length `config.n_layers`):
            Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
            `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have
            their past given to this model should not be passed as `input_ids` as they have already been computed.
        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            If `past_key_values` is used, `attention_mask` needs to contain the masking strategy that was used for
            `past_key_values`. In other words, the `attention_mask` always has to have the length:
            `len(past_key_values) + len(input_ids)`

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.

            If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
            `past_key_values`).
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Docstring attached to every `parallelize` method through `add_start_docstrings`.
PARALLELIZE_DOCSTRING = r"""
    This is an experimental feature and is a subject to change at a moment's notice.

    Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
    it will evenly distribute blocks across all devices.

    Args:
        device_map (`Dict[int, list]`, *optional*):
            A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
            automatically mapped to the first device (for esoteric reasons). That means that the first device should
            have fewer attention modules mapped to it than other devices. For reference, the gpt2 models have the
            following number of attention modules:

                - openai-community/gpt2: 12
                - openai-community/gpt2-medium: 24
                - openai-community/gpt2-large: 36
                - openai-community/gpt2-xl: 48

    Example:

    ```python
    # Here is an example of a device map on a machine with 4 GPUs using gpt2-xl, which has a total of 48 attention modules:
    model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-xl")
    device_map = {
        0: [0, 1, 2, 3, 4, 5, 6, 7, 8],
        1: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],
        2: [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34],
        3: [35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
    }
    model.parallelize(device_map)
    ```
"""
# Docstring attached to every `deparallelize` method through `add_start_docstrings`.
DEPARALLELIZE_DOCSTRING = r"""
    Moves the model to cpu from a model parallel state.

    Example:

    ```python
    # On a 4 GPU machine with openai-community/gpt2-large:
    model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-large")
    device_map = {
        0: [0, 1, 2, 3, 4, 5, 6, 7],
        1: [8, 9, 10, 11, 12, 13, 14, 15],
        2: [16, 17, 18, 19, 20, 21, 22, 23],
        3: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35],
    }
    model.parallelize(device_map)  # 将模型拆分到多个设备上
    model.deparallelize()  # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
    ```
"""

In [None]:
# model.parallelize(device_map) 的作用是：
# 将 模型结构（Module） 拆分到多个设备上运行
# 同时自动将每个层的 参数（权重） 移动到对应设备上
# 所以权重也是跟着结构被分布的
# 注意
# 嵌入层（embedding）和输出层（LMHead）始终在第一个设备（GPU 0）
# 所以为了负载均衡，第0号设备应少分配几层（比如这里只分了9层，其他是13层）
# 当你使用：
# model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-xl")
# model.parallelize(device_map)
# 即使 from_pretrained 加载的是 单个文件（未分片）的权重，parallelize() 也会：
# 自动将权重按层分发到对应设备
# 具体过程如下：
# from_pretrained 会在 主设备（通常是 CPU 或 GPU:0） 上加载完整模型权重
# 调用 model.parallelize(device_map) 时：
# 会将每一层的模块（nn.Module）迁移到 device_map 指定的设备上
# 同时这些模块内部的权重（即 .weight, .bias）也随模块一起移动到对应设备

In [None]:
# ✅ model = GPT2LMHeadModel.from_pretrained(...) 发生了什么？
# 这一步确实：
# 在 CPU 上构建了完整的模型结构
# 并加载了完整的所有权重（即使你有多个 GPU，默认仍是在 CPU 上完成）
# ✅ model.parallelize(device_map) 做了什么？
# 这一步会：
# 将 模型中的每一层（包括其权重） 移动到你提供的 device_map 指定的 GPU 上
# 同时会把这些层从 CPU 移除（即调用 .to(device)），不再占用 CPU 内存
# ❓ 那 CPU 上还保留原始结构或权重吗？
# 不会保留。
# 一旦你调用 parallelize()，以下都会发生：
# 模型各层被转移到目标 GPU（包括结构和权重）
# CPU 上的模型层和权重会 释放掉，不再占用内存
# 这是通过模块内部的 .to(device) 逐层调用完成的，PyTorch 本身会将原先设备上的数据清除。
# 调用 from_pretrained() 后，模型和权重最初在 CPU；调用 parallelize() 后，结构和权重都被分布到多个 GPU，
# CPU 上不会再保留原始结构和参数。这是一个转移，而不是复制。

In [None]:
# ✅ 在 PyTorch 中使用 from_pretrained(..., device_map=...) 加载 sharded checkpoint：
# 🧩 权重文件已分片（sharded checkpoint）：
# HuggingFace 的大模型（如 GPT2-XL、OPT、BLOOM 等）通常提供由多个分片文件组成的权重，文件名形如 pytorch_model-00001-of-00002.bin、pytorch_model-00002-of-00002.bin。
# 配套的 pytorch_model.bin.index.json 文件记录了每个权重张量在哪个文件。
# ⚙️ 配置文件中或代码中指定 device_map：
# device_map 决定了 哪些模块（如某些层）分布在哪个 GPU 上。
# 你可以手动指定，也可以自动推理（device_map="auto"）。
# ✅ 模型结构本身并没有显式分片逻辑：
# 结构是完整统一的（例如一个 GPT2LMHeadModel 对象），只是 每个子模块的 .to(device) 被设置为不同的 GPU。
# 换句话说：结构是被动地“被搬运”到各 GPU 的，而不是像 Flax 那样在结构中硬编码分片规则。
# ✅ from_pretrained(...) 做了哪些事：
# 1. 实例化模型结构（Module 树）
# 是的，会实例化模型结构，即构建一个完整的 nn.Module 对象。

# 这一步使用的是模型的 config 信息，如 hidden size、层数等。

# 此时会触发默认的 PyTorch 初始化逻辑（如 nn.Linear 默认使用 Kaiming 均匀初始化，而不是 Xavier），随后 post_init() 会再按模型自己的 _init_weights 规则重新初始化。

# 2. 初始化的权重在哪里？
# 默认初始化是在 CPU 上完成。

# 结构定义 + 初始化权重 → 初始模型对象。

# 如果你用 device_map="auto"，HuggingFace 会临时构造空权重模型并跳过初始化（详见下文）。

# 3. 什么时候加载权重？
# 紧接着，from_pretrained() 会加载检查点权重（可能是 sharded 的），用检查点中的值覆盖初始化的权重。

# 🧠 更高效的情况（device_map="auto"）：
# 当你指定 device_map="auto" 并使用支持的模型时（如 bloom-7b1），会走 内存优化路径：

# 利用 init_empty_weights() 创建 结构但无参数值的模型（即参数都放在 meta 设备上，只有形状和 dtype，不占用实际内存）。

# 然后用 load_checkpoint_and_dispatch() 直接将 checkpoint 中的分片权重加载到指定的 GPU 上。

# ✅ 此时 不会先初始化全模型的权重再替换，而是直接将 checkpoint 的值加载进空结构里。

# ✅ 总结回答
# 问题	回答
# from_pretrained(...) 会实例化模型吗？	✅ 会实例化 nn.Module 结构
# 实例化时的权重会初始化吗？	✅ 会，除非用了 memory-efficient 路径（如 init_empty_weights()）
# 初始化权重是在 CPU 还是 GPU？	默认是在 CPU（后续 .to() 分配）；memory-efficient 情况会直接指定 GPU
# 使用 sharded 时，加载的检查点权重是怎么处理的？	✅ 覆盖/替代初始化权重，或直接加载进 empty 权重中（更高效)

In [30]:
# Layer count read through the num_hidden_layers name, as GPT2Model.__init__ does when building its blocks.
config.num_hidden_layers

12

In [31]:
# Device indices that parallelize() hands to get_device_map when no device_map is given.
range(torch.cuda.device_count())

range(0, 2)

In [33]:
# Default device map for 12 blocks across the visible GPUs (in the recorded run: 6 blocks per device).
get_device_map(12, range(torch.cuda.device_count()))

{0: [0, 1, 2, 3, 4, 5], 1: [6, 7, 8, 9, 10, 11]}

In [None]:
# del some_tensor
# torch.cuda.empty_cache()
# 用于清理不再需要的中间变量之后，释放显存，防止 OOM。

In [34]:
# Placeholder cache built by GPT2Model.forward when past_key_values is None: one None per block.
tuple([None] * 12)

(None, None, None, None, None, None, None, None, None, None, None, None)

In [37]:
# Default position ids for one new token after 2 cached tokens: arange(past_length, seq_len + past_length).
torch.arange(2,1 + 2, dtype=torch.long)

tensor([2])

In [38]:
# Cross-attention flag; GPT2Model.forward only prepares an encoder mask when this is True and
# encoder_hidden_states is passed.
config.add_cross_attention

False

In [39]:
@add_start_docstrings(
    "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
    GPT2_START_DOCSTRING,
)
class GPT2Model(GPT2PreTrainedModel):
    _supports_param_buffer_assignment = False

    def __init__(self, config):
        super().__init__(config)

        hidden_dim = config.hidden_size
        self.embed_dim = hidden_dim
        self.wte = nn.Embedding(config.vocab_size, hidden_dim)  # token embeddings
        self.wpe = nn.Embedding(config.max_position_embeddings, hidden_dim)  # position embeddings
        self.drop = nn.Dropout(config.embd_pdrop)
        # Stack of transformer blocks, one per hidden layer.
        self.h = nn.ModuleList(
            [GPT2Block(config, layer_idx=layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.ln_f = nn.LayerNorm(hidden_dim, eps=config.layer_norm_epsilon)  # final layer norm

        # Legacy model-parallel state; populated by `parallelize`.
        self.model_parallel = False
        self.device_map = None
        self.gradient_checkpointing = False
        self._attn_implementation = config._attn_implementation

        # Initialize weights and apply final processing.
        self.post_init()

    @add_start_docstrings(PARALLELIZE_DOCSTRING)
    def parallelize(self, device_map=None):
        warnings.warn(
            "`GPT2Model.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your"
            " model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
            " `device_map` but it needs to be a dictionary module_name to device, so for instance {'h.0': 0, 'h.1': 1,"
            " ...}",
            FutureWarning,
        )
        # Use the caller's map, or spread the blocks evenly over the visible GPUs, then validate it.
        if device_map is None:
            device_map = get_device_map(len(self.h), range(torch.cuda.device_count()))
        self.device_map = device_map
        assert_device_map(self.device_map, len(self.h))
        self.model_parallel = True

        device_ids = self.device_map.keys()
        self.first_device = "cpu" if "cpu" in device_ids else "cuda:" + str(min(device_ids))
        self.last_device = "cuda:" + str(max(device_ids))

        # Both embedding tables live on the first device.
        self.wte = self.wte.to(self.first_device)
        self.wpe = self.wpe.to(self.first_device)
        # Move every block to the device its index is mapped to.
        for device_id, layer_ids in self.device_map.items():
            target_device = "cuda:" + str(device_id)
            for layer_id in layer_ids:
                self.h[layer_id] = self.h[layer_id].to(target_device)
        # The final layer norm goes to the last device.
        self.ln_f = self.ln_f.to(self.last_device)

    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
    def deparallelize(self):
        warnings.warn(
            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
            FutureWarning,
        )
        # Reset the parallel bookkeeping; everything is moved back to the CPU below.
        self.model_parallel = False
        self.device_map = None
        self.first_device = "cpu"
        self.last_device = "cpu"

        self.wte = self.wte.to("cpu")
        self.wpe = self.wpe.to("cpu")
        for layer_id in range(len(self.h)):
            self.h[layer_id] = self.h[layer_id].to("cpu")
        self.ln_f = self.ln_f.to("cpu")
        # Hand the now-unused cached GPU memory blocks back to the driver.
        torch.cuda.empty_cache()

    def get_input_embeddings(self):
        """Return the token embedding module (`wte`)."""
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        """Replace the token embedding module (`wte`) with `new_embeddings`."""
        self.wte = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """Prune attention heads; `heads_to_prune` maps a layer index to the head indices to drop in that layer."""
        for layer_id, head_ids in heads_to_prune.items():
            self.h[layer_id].attn.prune_heads(head_ids)

    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPastAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        # Any flag left unset falls back to the value stored in the config.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if use_cache is None:
            use_cache = self.config.use_cache
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Exactly one of input_ids / inputs_embeds must be supplied.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])  # flatten leading dims into one batch dim
            batch_size = input_ids.shape[0]
            device = input_ids.device
        else:
            input_shape = inputs_embeds.size()[:-1]  # drop the hidden dimension
            batch_size = inputs_embeds.shape[0]
            device = inputs_embeds.device

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])

        # Without a cache every layer gets a None placeholder; otherwise the cached key length is the offset.
        if past_key_values is None:
            past_length = 0
            past_key_values = tuple([None] * len(self.h))
        else:
            past_length = past_key_values[0][0].size(-2)

        # Default positions continue right after the cached tokens.
        if position_ids is None:
            position_ids = torch.arange(
                past_length, input_shape[-1] + past_length, dtype=torch.long, device=device
            ).unsqueeze(0)

        # Look up token embeddings unless the caller passed embeddings directly.
        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device)

        # Attention mask preparation depends on the attention implementation in use.
        _use_sdpa = self._attn_implementation == "sdpa" and output_attentions is False and head_mask is None
        if attention_mask is not None:
            attention_mask = attention_mask.view(batch_size, -1)
        if self._attn_implementation == "flash_attention_2":
            # Keep the 2D mask only when it actually masks something.
            if attention_mask is not None and 0 not in attention_mask:
                attention_mask = None
        elif _use_sdpa:
            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
                attention_mask=attention_mask,
                input_shape=(batch_size, input_shape[-1]),
                inputs_embeds=inputs_embeds,
                past_key_values_length=past_length,
            )
        elif attention_mask is not None:
            # Eager path: turn the 2D mask into an additive bias of shape [batch_size, 1, 1, to_seq_length]
            # that broadcasts to [batch_size, num_heads, from_seq_length, to_seq_length]. Kept positions
            # become 0.0 and masked positions the dtype's most negative value, so adding the bias to the
            # raw scores before the softmax effectively removes the masked positions.
            attention_mask = attention_mask[:, None, None, :]
            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min

        # The encoder mask is only needed when cross-attention is enabled and encoder states are given.
        if self.config.add_cross_attention and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                # Default: attend to every encoder position.
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            if _use_sdpa:
                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    mask=encoder_attention_mask, dtype=inputs_embeds.dtype, tgt_len=input_shape[-1]
                )
            elif self._attn_implementation != "flash_attention_2":
                encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_attention_mask = None

        # Prepare head mask if needed: 1.0 in head_mask means the head is kept.
        # attention_probs has shape bsz x n_heads x N x N
        # head_mask has shape n_layer x batch x n_heads x N x N
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        # Token-type embeddings are looked up in the same table as the token embeddings.
        if token_type_ids is not None:
            hidden_states = hidden_states + self.wte(token_type_ids)
        hidden_states = self.drop(hidden_states)

        output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),)

        # Gradient checkpointing during training cannot be combined with the key/value cache.
        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        # Accumulators for the optional outputs.
        presents = () if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
        all_hidden_states = () if output_hidden_states else None

        for layer_id, block in enumerate(self.h):
            layer_past = past_key_values[layer_id]

            if self.model_parallel:
                # Make the device holding hidden_states current and bring the per-layer inputs onto it.
                torch.cuda.set_device(hidden_states.device)
                if layer_past is not None:
                    layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past)
                if attention_mask is not None:
                    attention_mask = attention_mask.to(hidden_states.device)
                if isinstance(head_mask, torch.Tensor):
                    head_mask = head_mask.to(hidden_states.device)

            # Record the input of this layer.
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                outputs = self._gradient_checkpointing_func(
                    block.__call__,
                    hidden_states,
                    None,  # no cache while checkpointing
                    attention_mask,
                    head_mask[layer_id],
                    encoder_hidden_states,
                    encoder_attention_mask,
                    use_cache,
                    output_attentions,
                )
            else:
                outputs = block(
                    hidden_states,
                    layer_past=layer_past,
                    attention_mask=attention_mask,
                    head_mask=head_mask[layer_id],
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                )

            hidden_states = outputs[0]
            if use_cache is True:
                presents = presents + (outputs[1],)

            if output_attentions:
                # The attention entries shift by one position when the cache entry is present.
                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache else 2],)

            if self.model_parallel:
                # After the last block of a device (other than the last device), hand the hidden states
                # over to the next device, where the following block lives.
                for device_id, layer_ids in self.device_map.items():
                    if layer_id == layer_ids[-1] and "cuda:" + str(device_id) != self.last_device:
                        hidden_states = hidden_states.to("cuda:" + str(device_id + 1))

        # Final layer norm, then restore the caller's leading dimensions.
        hidden_states = self.ln_f(hidden_states)
        hidden_states = hidden_states.view(output_shape)

        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            candidates = [hidden_states, presents, all_hidden_states, all_self_attentions, all_cross_attentions]
            return tuple(item for item in candidates if item is not None)

        # Structured output carrying the cache and the (cross-)attention weights.
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )

In [40]:
@add_start_docstrings(
    """
    The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """,
    GPT2_START_DOCSTRING,
)
class GPT2LMHeadModel(GPT2PreTrainedModel, GenerationMixin):
    # Parameter keys declared as tied (shared) weights.
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.transformer = GPT2Model(config)  # GPT-2 backbone
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)  # hidden states -> vocab logits

        # Legacy model-parallel state; populated by `parallelize`.
        self.model_parallel = False
        self.device_map = None

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings(PARALLELIZE_DOCSTRING)
    def parallelize(self, device_map=None):
        warnings.warn(
            "`GPT2LMHeadModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should load"
            " your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
            " `device_map` but it needs to be a dictionary module_name to device, so for instance {'transformer.h.0':"
            " 0, 'transformer.h.1': 1, ...}",
            FutureWarning,
        )
        # Use the caller's map, or spread the blocks evenly over the visible GPUs, then validate it.
        if device_map is None:
            device_map = get_device_map(len(self.transformer.h), range(torch.cuda.device_count()))
        self.device_map = device_map
        assert_device_map(self.device_map, len(self.transformer.h))

        # Distribute the backbone, then keep the LM head on the backbone's first device.
        self.transformer.parallelize(self.device_map)
        self.lm_head = self.lm_head.to(self.transformer.first_device)
        self.model_parallel = True

    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
    def deparallelize(self):
        warnings.warn(
            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
            FutureWarning,
        )
        # Undo the backbone's parallel placement, then make sure backbone and head are on the CPU.
        self.transformer.deparallelize()
        self.transformer = self.transformer.to("cpu")
        self.lm_head = self.lm_head.to("cpu")
        self.model_parallel = False
        torch.cuda.empty_cache()

    def get_output_embeddings(self):
        """Return the language modeling head module (`lm_head`)."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language modeling head module (`lm_head`) with `new_embeddings`."""
        self.lm_head = new_embeddings

    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=CausalLMOutputWithCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Run the backbone; every argument is forwarded unchanged.
        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]

        if self.model_parallel:
            # The LM head sits on the first device, so bring the final hidden states back to it.
            torch.cuda.set_device(self.transformer.first_device)
            hidden_states = hidden_states.to(self.lm_head.weight.device)

        lm_logits = self.lm_head(hidden_states)

        # The loss is only computed when labels are supplied.
        loss = None
        if labels is not None:
            loss = self.loss_function(
                lm_logits,
                labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        if return_dict:
            return CausalLMOutputWithCrossAttentions(
                loss=loss,
                logits=lm_logits,
                past_key_values=transformer_outputs.past_key_values,
                hidden_states=transformer_outputs.hidden_states,
                attentions=transformer_outputs.attentions,
                cross_attentions=transformer_outputs.cross_attentions,
            )

        # Tuple output: logits followed by the remaining backbone outputs, prefixed by the loss if any.
        output = (lm_logits,) + transformer_outputs[1:]
        if loss is None:
            return output
        return (loss,) + output

    @staticmethod
    def _reorder_cache(
        past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
    ) -> Tuple[Tuple[torch.Tensor]]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        """
        reordered_layers = []
        for layer_past in past_key_values:
            # Select along dim 0 with beam_idx (moved to each tensor's device) so the cached states follow
            # the beam order. NOTE(review): dim 0 is presumably batch_size * num_beams -- confirm with caller.
            reordered_layers.append(
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            )
        return tuple(reordered_layers)

In [45]:
@add_start_docstrings(
    """
The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for
RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the
input embeddings, the classification head takes as input the input of a specified classification token index in the
input sequence).
""",
    GPT2_START_DOCSTRING,
)
class GPT2DoubleHeadsModel(GPT2PreTrainedModel, GenerationMixin):
    # GPT-2 backbone with two output heads: `lm_head` (language modeling) and
    # `multiple_choice_head` (one score per choice, see `forward`).
    _tied_weights_keys = ["lm_head.weight"]  # state-dict keys whose weights are tied (shared)

    def __init__(self, config):
        super().__init__(config)
        # NOTE: this overwrites `num_labels` on the config object handed in by the caller.
        config.num_labels = 1
        self.transformer = GPT2Model(config)  # backbone
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.multiple_choice_head = SequenceSummary(config)  # multiple-choice head
        # Model parallel
        self.model_parallel = False
        self.device_map = None
        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings(PARALLELIZE_DOCSTRING)
    def parallelize(self, device_map=None):
        # Deprecated entry point: spreads the transformer blocks over several CUDA devices and
        # places both heads on the transformer's first device.
        warnings.warn(
            "`GPT2DoubleHeadsModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should"
            " load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your"
            " own `device_map` but it needs to be a dictionary module_name to device, so for instance"
            " {'transformer.h.0': 0, 'transformer.h.1': 1, ...}",
            FutureWarning,
        )
        # Without an explicit map, build one over all visible CUDA devices.
        self.device_map = (
            get_device_map(len(self.transformer.h), range(torch.cuda.device_count()))
            if device_map is None
            else device_map
        )
        assert_device_map(self.device_map, len(self.transformer.h))
        self.transformer.parallelize(self.device_map)
        self.lm_head = self.lm_head.to(self.transformer.first_device)
        self.multiple_choice_head = self.multiple_choice_head.to(self.transformer.first_device)
        self.model_parallel = True

    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
    def deparallelize(self):
        # Deprecated entry point: moves the backbone and both heads back to the CPU and
        # releases cached CUDA memory.
        warnings.warn(
            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
            FutureWarning,
        )
        self.transformer.deparallelize()
        self.transformer = self.transformer.to("cpu")
        self.lm_head = self.lm_head.to("cpu")
        self.multiple_choice_head = self.multiple_choice_head.to("cpu")
        self.model_parallel = False
        torch.cuda.empty_cache()

    def get_output_embeddings(self):
        """Return the language-modeling head (`lm_head`)."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language-modeling head (`lm_head`) with `new_embeddings`."""
        self.lm_head = new_embeddings

    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=GPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        mc_token_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        mc_labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, GPT2DoubleHeadsModelOutput]:
        r"""
        mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input):
            Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) -
            1]`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to
            `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`
        mc_labels (`torch.LongTensor` of shape `(batch_size)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices - 1]` where *num_choices* is the size of the second dimension of the input tensors. (see
            *input_ids* above)

        Return:

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, GPT2DoubleHeadsModel

        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
        >>> model = GPT2DoubleHeadsModel.from_pretrained("openai-community/gpt2")

        >>> # Add a [CLS] to the vocabulary (we should train it also!)
        >>> num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"})
        >>> # Update the model embeddings with the new vocabulary size
        >>> embedding_layer = model.resize_token_embeddings(len(tokenizer))

        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
        >>> encoded_choices = [tokenizer.encode(s) for s in choices]
        >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]

        >>> input_ids = torch.tensor(encoded_choices).unsqueeze(0)  # Batch size: 1, number of choices: 2
        >>> mc_token_ids = torch.tensor([cls_token_location])  # Batch size: 1

        >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
        >>> lm_logits = outputs.logits
        >>> mc_logits = outputs.mc_logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # NOTE: extra `**kwargs` are accepted but not forwarded to the backbone.
        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        # With model parallelism enabled, bring the hidden states onto the device that holds `lm_head`.
        if self.model_parallel:
            torch.cuda.set_device(self.transformer.first_device)
            hidden_states = hidden_states.to(self.lm_head.weight.device)
        lm_logits = self.lm_head(hidden_states)
        # Multiple-choice logits: the head's output at `mc_token_ids`, with the trailing size-1 dim removed.
        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)
        mc_loss = None
        if mc_labels is not None:  # multiple-choice targets were provided
            # Keep the targets on the same device as the logits (same treatment as `labels` below);
            # otherwise the loss fails with a device mismatch when the model is spread over devices.
            mc_labels = mc_labels.to(mc_logits.device)
            loss_fct = CrossEntropyLoss()
            mc_loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
        lm_loss = None
        if labels is not None:  # language-modeling targets were provided
            labels = labels.to(lm_logits.device)
            # Shift so that the logits at position t are scored against the token at position t + 1.
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = CrossEntropyLoss()
            lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        if not return_dict:
            # Tuple layout: ([lm_loss], [mc_loss], lm_logits, mc_logits, *remaining backbone outputs)
            output = (lm_logits, mc_logits) + transformer_outputs[1:]
            if mc_loss is not None:
                output = (mc_loss,) + output
            return ((lm_loss,) + output) if lm_loss is not None else output

        return GPT2DoubleHeadsModelOutput(
            loss=lm_loss,
            mc_loss=mc_loss,
            logits=lm_logits,
            mc_logits=mc_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    @staticmethod
    def _reorder_cache(
        past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
    ) -> Tuple[Tuple[torch.Tensor]]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        """
        return tuple(
            # Re-index dimension 0 of every cached tensor with `beam_idx` (moved to that tensor's device).
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past_key_values  # one entry per layer
        )

In [None]:
# Print the assembled docstring of `forward` (inspection aid only; safe to drop from a final notebook).
help(GPT2DoubleHeadsModel.forward)

In [48]:
# NOTE(review): this import rebinds `GPT2DoubleHeadsModel`, so the cells below use the class shipped
# with `transformers`, not the class defined earlier in this notebook. Import only `AutoTokenizer`
# if the intent is to exercise the notebook's own implementation.
from transformers import AutoTokenizer, GPT2DoubleHeadsModel

In [49]:
# Load the pretrained GPT-2 tokenizer and weights. The multiple-choice head has no pretrained
# weights in this checkpoint and is newly initialized (see the warning printed below).
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = GPT2DoubleHeadsModel.from_pretrained("openai-community/gpt2")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2DoubleHeadsModel were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['multiple_choice_head.summary.bias', 'multiple_choice_head.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [50]:
# Distribute the model over the available CUDA devices (requires at least one GPU).
# The class definition earlier in this notebook marks `parallelize` as deprecated.
model.parallelize()



In [51]:
# Add a [CLS] token to the vocabulary; its embedding is new and still has to be trained.
# `add_special_tokens` returns the number of tokens that were actually added.
num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"})

In [52]:
# Display how many tokens were added (1: the new [CLS] token).
num_added_tokens

1

In [53]:
# Grow the model's token embeddings to the new vocabulary size (the tokenizer now includes [CLS]).
embedding_layer = model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [54]:
# Display the resized embedding module (50258 rows after adding [CLS]).
embedding_layer

Embedding(50258, 768)

In [55]:
# Two candidate sentences, each ending with the [CLS] token that the multiple-choice head reads.
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
# Token ids for every choice.
encoded_choices = [tokenizer.encode(s) for s in choices]
# Position of the [CLS] token inside each encoded choice.
cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]

In [56]:
# Display the token ids; both choices encode to 8 tokens and end with id 50257 ([CLS]).
encoded_choices

[[15496, 11, 616, 3290, 318, 13779, 220, 50257],
 [15496, 11, 616, 3797, 318, 13779, 220, 50257]]

In [57]:
# Display the [CLS] positions (index 7, the last token, for both choices).
cls_token_location

[7, 7]

In [58]:
# Add a leading batch dimension: input_ids becomes (1, 2, 8) and mc_token_ids (1, 2).
input_ids = torch.tensor(encoded_choices).unsqueeze(0)  # Batch size: 1, number of choices: 2
mc_token_ids = torch.tensor([cls_token_location])  # Batch size: 1

In [60]:
# Forward pass with the inputs moved to "cuda:0".
# NOTE(review): the device is hard-coded — presumably the first device used by `parallelize()`; confirm.
outputs = model(input_ids.to("cuda:0"), mc_token_ids=mc_token_ids.to("cuda:0"))

In [61]:
# Language-modeling logits and multiple-choice logits from the model output.
lm_logits = outputs.logits
mc_logits = outputs.mc_logits

In [64]:
# lm_logits: (batch, num_choices, seq_len, vocab_size); mc_logits: (batch, num_choices).
print(lm_logits.shape,mc_logits.shape)

torch.Size([1, 2, 8, 50258]) torch.Size([1, 2])


In [65]:
# NOTE(review): `config` is not created in the surrounding cells — presumably it comes from an
# earlier cell; confirm it is the object you mean to inspect.
config.num_labels

2

In [None]:
@add_start_docstrings(
    """
    The GPT2 Model transformer with a sequence classification head on top (linear layer).

    [`GPT2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    GPT2_START_DOCSTRING,
)
class GPT2ForSequenceClassification(GPT2PreTrainedModel):  # sequence classification
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = GPT2Model(config)  # backbone
        # Bias-free linear scorer: hidden size -> num_labels
        self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)
        # Model parallel
        self.model_parallel = False
        self.device_map = None
        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint="microsoft/DialogRPT-updown",
        output_type=SequenceClassifierOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)  # per-token scores: (batch, seq_len, num_labels)
        # Only the batch size is needed below; the sequence length is unpacked and discarded.
        if input_ids is not None:
            batch_size, _ = input_ids.shape[:2]
        else:
            batch_size, _ = inputs_embeds.shape[:2]
        # Batches larger than one need a padding token id in the config to locate each row's last token.
        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:  # a pad_token_id is configured and token ids are available
            # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            # Padded positions are zeroed, so argmax yields the largest non-pad index in every row.
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            # Only `inputs_embeds` was given: the token ids are gone, so padding cannot be detected here.
            # Fall back to the last position of every row and warn, because the pooled result may be
            # taken from a padding position if the embeddings were padded.
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )
        # For each row, pick the score vector (size num_labels) at its last non-pad position.
        # `last_non_pad_token` is either -1 or a tensor with one index per row.
        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
        loss = None
        if labels is not None:
            # Keep the targets on the same device as the logits so the loss also works when the
            # inputs and the model live on different devices.
            labels = labels.to(pooled_logits.device)
            # Infer the problem type once and cache it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            # Tuple layout: ([loss], pooled_logits, *remaining backbone outputs)
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

In [None]:
# Toy example for the pooled-logits indexing used in GPT2ForSequenceClassification.forward:
# 3 sequences, 5 tokens per sequence, 4 scores per token.
logits = torch.tensor([
    [  # batch 0
        [0.1, 0.2, 0.3, 0.4],  # token 0
        [0.5, 0.6, 0.7, 0.8],  # token 1
        [0.9, 1.0, 1.1, 1.2],  # token 2
        [1.3, 1.4, 1.5, 1.6],  # token 3
        [1.7, 1.8, 1.9, 2.0],  # token 4
    ],
    [  # batch 1
        [2.1, 2.2, 2.3, 2.4],
        [2.5, 2.6, 2.7, 2.8],
        [2.9, 3.0, 3.1, 3.2],
        [3.3, 3.4, 3.5, 3.6],
        [3.7, 3.8, 3.9, 4.0],
    ],
    [  # batch 2
        [4.1, 4.2, 4.3, 4.4],
        [4.5, 4.6, 4.7, 4.8],
        [4.9, 5.0, 5.1, 5.2],
        [5.3, 5.4, 5.5, 5.6],
        [5.7, 5.8, 5.9, 6.0],
    ],
])  # shape: (3, 5, 4)

In [None]:
last_non_pad_token = torch.tensor([2, 4, 1])  # index of the last valid (non-pad) token in each sequence

In [None]:
# Pair every row index with that row's last valid token index (advanced indexing),
# which picks one score vector per sequence.
row_indices = torch.arange(logits.size(0))
pooled_logits = logits[row_indices, last_non_pad_token]

In [None]:
# Equivalent to:
# pooled_logits[0] = logits[0, 2] = [0.9, 1.0, 1.1, 1.2]
# pooled_logits[1] = logits[1, 4] = [3.7, 3.8, 3.9, 4.0]
# pooled_logits[2] = logits[2, 1] = [4.5, 4.6, 4.7, 4.8]
# Resulting value of pooled_logits:
# tensor([
#     [0.9, 1.0, 1.1, 1.2],
#     [3.7, 3.8, 3.9, 4.0],
#     [4.5, 4.6, 4.7, 4.8],
# ])  # shape: (3, 4)

In [None]:
# A bare `help()` starts the interactive help prompt and blocks "Run All"; show the docstring of the
# forward method defined above instead.
help(GPT2ForSequenceClassification.forward)

In [None]:
@add_start_docstrings(
    """
    GPT2 Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    GPT2_START_DOCSTRING,
)
class GPT2ForTokenClassification(GPT2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.transformer = GPT2Model(config)
        # Dropout probability: the first value that is set among `classifier_dropout` and
        # `hidden_dropout` on the config, otherwise 0.1.
        dropout_prob = getattr(config, "classifier_dropout", None)
        if dropout_prob is None:
            dropout_prob = getattr(config, "hidden_dropout", None)
        if dropout_prob is None:
            dropout_prob = 0.1
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Model parallel
        self.model_parallel = False
        self.device_map = None

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
    # fmt: off
    @add_code_sample_docstrings(
        checkpoint="brad1141/gpt2-finetuned-comp2",
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_loss=0.25,
        expected_output=[
            "Lead",
            "Lead",
            "Lead",
            "Position",
            "Lead",
            "Lead",
            "Lead",
            "Lead",
            "Lead",
            "Lead",
            "Lead",
            "Lead",
        ],
    )
    # fmt: on
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss (Cross-Entropy). Indices should be in `[0, ...,
            config.num_labels - 1]`.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        backbone_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Dropout on the final hidden states, then one score vector per token.
        token_logits = self.classifier(self.dropout(backbone_outputs[0]))

        loss = None
        if labels is not None:
            # Targets must sit on the same device as the logits.
            labels = labels.to(token_logits.device)
            criterion = CrossEntropyLoss()
            # Flatten to (tokens, num_labels) vs. (tokens,) for the loss.
            loss = criterion(token_logits.view(-1, self.num_labels), labels.view(-1))

        if return_dict:
            return TokenClassifierOutput(
                loss=loss,
                logits=token_logits,
                hidden_states=backbone_outputs.hidden_states,
                attentions=backbone_outputs.attentions,
            )

        # Tuple layout: ([loss], logits, *backbone outputs from index 2 on).
        # NOTE(review): index 1 is skipped on the assumption that it holds the cache — confirm this
        # still holds when the backbone returns no cache entry.
        output = (token_logits,) + backbone_outputs[2:]
        return output if loss is None else (loss,) + output

In [None]:
@add_start_docstrings(
    """
    The GPT-2 Model transformer with a span classification head on top for extractive question-answering tasks like
    SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    GPT2_START_DOCSTRING,
)
class GPT2ForQuestionAnswering(GPT2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = GPT2Model(config)
        # Two outputs per token: span-start score and span-end score.
        self.qa_outputs = nn.Linear(config.hidden_size, 2)

        # Model parallel
        self.model_parallel = False
        self.device_map = None

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
        real_checkpoint=_CHECKPOINT_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        # Split the two per-token scores into separate start / end logits and drop the size-1 last dim.
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        # The loss is only computed when both span boundaries are provided.
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Always align the targets with the logits' device, not only when an extra dimension had
            # to be squeezed; otherwise 1-D positions on another device make the loss fail.
            start_positions = start_positions.to(start_logits.device)
            end_positions = end_positions.to(end_logits.device)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            # (they are clamped to `sequence_length`, which is then used as the ignored class index)
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            # Tuple layout: ([total_loss], start_logits, end_logits, *backbone outputs from index 2 on)
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [None]:
# Names exported by a star-import of this module.
__all__ = [
    "GPT2DoubleHeadsModel",
    "GPT2ForQuestionAnswering",
    "GPT2ForSequenceClassification",
    "GPT2ForTokenClassification",
    "GPT2LMHeadModel",
    "GPT2Model",
    "GPT2PreTrainedModel",
    "load_tf_weights_in_gpt2",
]