In [15]:
import torch
from torch import nn
import math
from bertviz.transformers_neuron_view import BertModel,BertConfig
from transformers import BertTokenizer

In [16]:
max_length=256
model_name='bert-base-uncased'
config=BertConfig.from_pretrained(model_name,output_attentions=True,
                                 output_hidden_states=True,
                                 return_dict=True)
tokenizer=BertTokenizer.from_pretrained(model_name)
config.max_position_embeddings=max_length
model=BertModel(config).from_pretrained(model_name)
model=model.eval()

In [17]:
model.config

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": true,
  "output_hidden_states": false,
  "pad_token_id": 0,
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

这份配置是针对一个标准的 bert-base 模型，具体参数含义如下：

"architectures": ["BertForMaskedLM"]:

指定了模型的具体架构。BertForMaskedLM 表示这是一个用于“遮蔽语言模型”任务的BERT模型。通俗讲，就是我们常说的“完形填空”任务，这是BERT预训练的核心任务之一。

"hidden_size": 768:

隐藏层维度。在模型内部，每个词或“词元 (token)”都会被转换成一个包含768个数字的向量（vector）。这是衡量模型复杂度的核心指标之一。

"num_hidden_layers": 12:

Transformer层数。这个模型由12层Transformer编码器堆叠而成。这决定了模型的“深度”。

"num_attention_heads": 12:

注意力头数量。在每一层Transformer中，都有12个“注意力头”。这让模型在处理一个词时，能够同时关注输入文本的不同部分，从而捕捉更丰富的语法和语义关系。

"max_position_embeddings": 512:

最大位置编码。定义了模型单次能处理的文本最大长度为512个词元。

"vocab_size": 30522:

词汇表大小。表示这个模型总共认识30,522个不同的词元（包括完整的单词、子词以及 [CLS], [SEP] 等特殊符号）。

"hidden_dropout_prob": 0.1:

Dropout概率。这是一种在训练时使用的技术，以10%的概率随机“丢弃”一些神经元，用来防止模型过拟合，增强其泛化能力。

"model_type": "bert":

明确指出模型类型是 bert。

In [18]:
att_head_size=int(model.config.hidden_size/model.config.num_attention_heads)# 多头

In [19]:
model.encoder.layer[0].attention.self.query.weight.T[:,:64]

tensor([[-0.0164, -0.0326,  0.0105,  ..., -0.0186, -0.0095,  0.0112],
        [ 0.0261,  0.0346,  0.0334,  ...,  0.0482, -0.0285, -0.0349],
        [-0.0263, -0.0423,  0.0109,  ..., -0.0724, -0.0453, -0.0304],
        ...,
        [ 0.0154, -0.0527, -0.0279,  ..., -0.0434,  0.0170,  0.0217],
        [ 0.0768,  0.1393,  0.0258,  ...,  0.0385,  0.0357, -0.0631],
        [ 0.0548,  0.0078, -0.0468,  ...,  0.0423, -0.0408,  0.0212]],
       grad_fn=<SliceBackward0>)

# data

In [20]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')
inputs_tests=tokenizer(newsgroups_train['data'][:1],
                      truncation=True,padding=True,max_length=max_length,
                      return_tensors='pt')


In [23]:
inputs_tests['input_ids'].shape

torch.Size([1, 201])

# model output

In [24]:
model_output=model(**inputs_tests)

- last_hidden_state(batch_size,sequence_length,hidden_size)：last hidden state which is outputted from the last BertLayer
- pooler_output(batch_size,hidden_size):output of the Powoler layer
- hidden_states(batch_size,sequence_length,hidden_size):hidden-states of the model at the output of each BertLayer plus the initial embedding
- attentions(batch_size,num_heads,sequence_length,sequence_length):one for each BertLayer .Attentions weights after the attention SoftMax

In [28]:
model_output[-1][0].keys()

dict_keys(['attn', 'queries', 'keys'])

In [33]:
 model_output[-1][0]

{'attn': tensor([[[[5.3477e-03, 1.0850e-02, 5.1914e-03,  ..., 3.9360e-03,
            3.6427e-03, 1.4398e-02],
           [8.5758e-03, 4.0597e-03, 1.2523e-02,  ..., 4.5496e-03,
            4.1374e-03, 7.1238e-03],
           [5.0512e-03, 4.2847e-03, 4.5569e-03,  ..., 4.2947e-03,
            4.5143e-03, 3.1244e-03],
           ...,
           [1.0410e-03, 2.3348e-03, 5.5335e-03,  ..., 1.2235e-03,
            1.7816e-03, 1.1450e-03],
           [1.0483e-03, 2.2994e-03, 5.7017e-03,  ..., 1.2290e-03,
            1.6825e-03, 6.6760e-04],
           [2.1551e-03, 5.6242e-03, 6.2811e-03,  ..., 4.5159e-03,
            4.7504e-03, 1.4801e-03]],
 
          [[1.8761e-02, 8.6471e-03, 1.3796e-03,  ..., 2.3716e-02,
            2.3086e-02, 7.7072e-05],
           [1.4939e-03, 1.2985e-03, 6.6249e-03,  ..., 6.5390e-04,
            7.7479e-04, 1.7725e-03],
           [5.2377e-04, 2.2026e-03, 2.6308e-02,  ..., 2.6647e-04,
            2.9575e-04, 1.5150e-03],
           ...,
           [5.2149e-04, 2.6457

In [30]:
model_output[-1][0]['attn'][0,0,:,:]

tensor([[0.0053, 0.0109, 0.0052,  ..., 0.0039, 0.0036, 0.0144],
        [0.0086, 0.0041, 0.0125,  ..., 0.0045, 0.0041, 0.0071],
        [0.0051, 0.0043, 0.0046,  ..., 0.0043, 0.0045, 0.0031],
        ...,
        [0.0010, 0.0023, 0.0055,  ..., 0.0012, 0.0018, 0.0011],
        [0.0010, 0.0023, 0.0057,  ..., 0.0012, 0.0017, 0.0007],
        [0.0022, 0.0056, 0.0063,  ..., 0.0045, 0.0048, 0.0015]],
       grad_fn=<SliceBackward0>)

In [36]:
emb_output=model.embeddings(inputs_tests['input_ids'],inputs_tests['token_type_ids'])

In [37]:
emb_output.shape

torch.Size([1, 201, 768])

In [43]:
Q_first_head_first_layer=emb_output[0] @ model.encoder.layer[0].attention.self.query.weight.T[:,:att_head_size]\
+ model.encoder.layer[0].attention.self.query.bias[:att_head_size]
#201*64

In [44]:
K_first_head_first_layer=emb_output[0] @ model.encoder.layer[0].attention.self.key.weight.T[:,:att_head_size]\
+ model.encoder.layer[0].attention.self.key.bias[:att_head_size]
# 201*64

In [46]:
attn_scores=torch.nn.Softmax(dim=-1)(Q_first_head_first_layer@K_first_head_first_layer.T / math.sqrt(att_head_size))

In [47]:
attn_scores

tensor([[0.0053, 0.0109, 0.0052,  ..., 0.0039, 0.0036, 0.0144],
        [0.0086, 0.0041, 0.0125,  ..., 0.0045, 0.0041, 0.0071],
        [0.0051, 0.0043, 0.0046,  ..., 0.0043, 0.0045, 0.0031],
        ...,
        [0.0010, 0.0023, 0.0055,  ..., 0.0012, 0.0018, 0.0011],
        [0.0010, 0.0023, 0.0057,  ..., 0.0012, 0.0017, 0.0007],
        [0.0022, 0.0056, 0.0063,  ..., 0.0045, 0.0048, 0.0015]],
       grad_fn=<SoftmaxBackward0>)