In [1]:
import torch
from transformers import BertModel,BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# One checkpoint name drives both the tokenizer and the encoder so they stay in sync.
model_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(model_name)

# output_hidden_states=True makes the forward pass also return the hidden state
# after the embedding layer and after every encoder layer.
model = BertModel.from_pretrained(model_name, output_hidden_states=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Inspect the checkpoint's hyper-parameters (hidden size, number of layers/heads, vocabulary size, ...).
model.config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.30.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [4]:
# Print the module tree to see how the embeddings and the encoder layers are nested.
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

![11](https://heidloff.net/assets/img/2023/02/transformers.png)

- BertLayer
    - attention: BertAttention
        - self: BertSelfAttention
        - output: BertSelfOutput
    - intermediate: BertIntermediate 768 => 4*768
    - output: BertOutput 4*768 => 768

In [12]:
# Sample sentence; the long digit string is there to show the tokenizer
# breaking one "word" into several sub-word tokens.
input_tests = 'this is a 12212121 test sentence'

# return_tensors='pt' yields PyTorch tensors with a leading batch dimension.
model_input = tokenizer(input_tests, return_tensors='pt')
model_input

{'input_ids': tensor([[  101,  2023,  2003,  1037, 13092, 12521, 12521,  2487,  3231,  6251,
           102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

## 2. model output

In [14]:
# Evaluation mode: dropout is disabled, so the forward pass is deterministic.
model.eval()

# Reference forward pass; gradients are not needed for inspection.
with torch.no_grad():
    output = model(**model_input)

In [15]:
# Number of entries in the model output; index 2 is the tuple of per-layer
# hidden states requested via output_hidden_states=True.
len(output)

3

In [21]:
# Embedding output: hidden_states[0] is the input to the first encoder layer.
# Access it by name rather than by the positional index 2, which shifts when
# the set of optional outputs changes.
output.hidden_states[0].shape

torch.Size([1, 11, 768])

In [23]:
# First encoder layer output: hidden_states[1] (index 0 is the embedding output).
# Accessed by name instead of the positional index 2.
output.hidden_states[1], output.hidden_states[1].shape

(tensor([[[ 0.1744,  0.0174, -0.0389,  ...,  0.0573,  0.0755,  0.0823],
          [-0.8607,  0.4032,  0.0201,  ...,  0.0300,  0.5575, -0.0560],
          [-1.3562, -0.4334, -0.5909,  ...,  0.3519,  0.3770,  0.4681],
          ...,
          [ 0.7914, -0.6698, -0.4684,  ..., -0.1181,  0.1001, -2.2668],
          [ 0.8059,  0.5017,  0.0224,  ...,  0.5648, -1.0590, -1.9305],
          [-0.3559,  0.1290,  0.1229,  ..., -0.0796,  0.7220,  0.0411]]]),
 torch.Size([1, 11, 768]))

## 3. from scratch
- BertLayer
    - attention: BertAttention
        - self: BertSelfAttention
        - output: BertSelfOutput
    - intermediate: BertIntermediate 768 => 4*768
    - output: BertOutput 4*768 => 768

In [26]:
# Input to the first encoder layer = output of the embedding block.
# Use the named field instead of the positional index 2.
embeddings = output.hidden_states[0]

# The first of the encoder's BertLayer modules; it is replayed step by step below.
layer = model.encoder.layer[0]
layer

BertLayer(
  (attention): BertAttention(
    (self): BertSelfAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=768, out_features=768, bias=True)
      (value): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (output): BertSelfOutput(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (intermediate): BertIntermediate(
    (dense): Linear(in_features=768, out_features=3072, bias=True)
    (intermediate_act_fn): GELUActivation()
  )
  (output): BertOutput(
    (dense): Linear(in_features=3072, out_features=768, bias=True)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

#### 3.1 First add & norm (in multi-head attention)

In [33]:
# Self-attention sub-module (query/key/value projections); input is 11 tokens x 768 features.
# Run under no_grad like the reference forward pass so no autograd graph is built.
with torch.no_grad():
    mha_output = layer.attention.self(embeddings)

# The module returns a tuple; element 0 is the attention result.
mha_output[0].shape

torch.Size([1, 11, 768])

In [35]:
# add & norm: BertSelfOutput applies its dense layer to the attention result,
# then combines it with the block input (`embeddings`) and applies LayerNorm.
# Run under no_grad like the reference forward pass so no autograd graph is built.
with torch.no_grad():
    attn_output = layer.attention.output(mha_output[0], embeddings)
attn_output.shape

torch.Size([1, 11, 768])

In [38]:
# Feed-forward expansion: BertIntermediate maps 768 -> 3072 (4 * 768) and applies GELU.
# Run under no_grad like the reference forward pass so no autograd graph is built.
with torch.no_grad():
    mlp1 = layer.intermediate(attn_output)
mlp1.shape

torch.Size([1, 11, 3072])

In [40]:
# Second add & norm: BertOutput projects 3072 -> 768, combines the result with
# the attention-block output (`attn_output`) and applies LayerNorm.
# Run under no_grad so the result matches the reference forward pass
# (no autograd graph / grad_fn attached).
with torch.no_grad():
    layer_output = layer.output(mlp1, attn_output)

# Check numerically instead of eyeballing truncated print-outs: the manual
# replay must reproduce the first encoder layer's hidden state from the full model.
assert torch.allclose(layer_output, output.hidden_states[1], atol=1e-5), \
    "manual BertLayer forward does not match output.hidden_states[1]"
layer_output

tensor([[[ 0.1744,  0.0174, -0.0389,  ...,  0.0573,  0.0755,  0.0823],
         [-0.8607,  0.4032,  0.0201,  ...,  0.0300,  0.5575, -0.0560],
         [-1.3562, -0.4334, -0.5909,  ...,  0.3519,  0.3770,  0.4681],
         ...,
         [ 0.7914, -0.6698, -0.4684,  ..., -0.1181,  0.1001, -2.2668],
         [ 0.8059,  0.5017,  0.0224,  ...,  0.5648, -1.0590, -1.9305],
         [-0.3559,  0.1290,  0.1229,  ..., -0.0796,  0.7220,  0.0411]]],
       grad_fn=<NativeLayerNormBackward0>)

In [41]:
# Reference value for comparison: first encoder layer output from the full
# forward pass (hidden_states[1]; index 0 is the embedding output).
# Accessed by name instead of the positional index 2.
output.hidden_states[1], output.hidden_states[1].shape

(tensor([[[ 0.1744,  0.0174, -0.0389,  ...,  0.0573,  0.0755,  0.0823],
          [-0.8607,  0.4032,  0.0201,  ...,  0.0300,  0.5575, -0.0560],
          [-1.3562, -0.4334, -0.5909,  ...,  0.3519,  0.3770,  0.4681],
          ...,
          [ 0.7914, -0.6698, -0.4684,  ..., -0.1181,  0.1001, -2.2668],
          [ 0.8059,  0.5017,  0.0224,  ...,  0.5648, -1.0590, -1.9305],
          [-0.3559,  0.1290,  0.1229,  ..., -0.0796,  0.7220,  0.0411]]]),
 torch.Size([1, 11, 768]))