In [3]:
!pip install bertviz

Collecting bertviz
  Downloading bertviz-1.4.0-py3-none-any.whl (157 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.6/157.6 kB[0m [31m554.4 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting boto3
  Downloading boto3-1.26.155-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.6/135.6 kB[0m [31m661.1 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-macosx_10_9_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
Collecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.1-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.8/79.8 kB[0m [31m298.1 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting

$$
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{Q K^{\top}}{\sqrt{d_k}}\right) V
$$

In [4]:
import math

import torch
import torch.nn.functional as F
from torch import nn
from bertviz.transformers_neuron_view import BertModel,BertConfig
from sklearn.datasets import fetch_20newsgroups
from transformers import BertTokenizer

## 1. model config and load

In [7]:
# Maximum number of tokens kept per document by the tokenizer call further down.
max_len = 256
model_name = 'bert-base-uncased'

# NOTE: `config` is kept for inspection only. The overrides set on it do not
# reach the loaded model: `model.config` reports max_position_embeddings=512
# and output_hidden_states=false after loading.
config = BertConfig.from_pretrained(model_name,output_attentions=True,output_hidden_states=True,return_dict=True)
tokenizer = BertTokenizer.from_pretrained(model_name)

config.max_position_embeddings =  max_len

# `from_pretrained` is invoked on the class itself. Calling it on a freshly
# constructed `BertModel(config)` would first build a randomly initialised
# network only to discard it, and would wrongly suggest that `config` is used.
model = BertModel.from_pretrained(model_name)
# Switch to inference mode (disables dropout) so attention values are deterministic.
model.eval()

100%|██████████| 440473133/440473133 [02:28<00:00, 2973348.97B/s]


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (den

In [11]:
# Inspect the configuration actually carried by the loaded model.
model.config

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": true,
  "output_hidden_states": false,
  "pad_token_id": 0,
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

In [10]:
# Width of a single attention head: the hidden size split evenly across heads.
att_head_size = model.config.hidden_size // model.config.num_attention_heads
att_head_size

64

![11](https://heidloff.net/assets/img/2023/02/transformers.png)

In [20]:
# Columns of the transposed query weight that belong to the first head;
# sliced with `att_head_size` instead of a hard-coded 64.
model.encoder.layer[0].attention.self.query.weight.T[:, :att_head_size].shape

torch.Size([768, 64])

## 2. data

In [22]:
# Fetch the 20 Newsgroups training split and tokenize only its first document,
# truncated to at most `max_len` tokens and returned as PyTorch tensors.
newsgroups_trains = fetch_20newsgroups(subset='train')
input_tests = tokenizer(
    newsgroups_trains['data'][:1],
    truncation=True,
    max_length=max_len,
    return_tensors='pt',
)

In [24]:
# Only the last expression of a cell is rendered: the keys() call is evaluated
# but not shown, and the shape of the token-id tensor is what gets displayed.
input_tests.keys()
input_tests['input_ids'].shape

torch.Size([1, 201])

## 3. model output

In [25]:
# Run the model on the tokenized sample, unpacking the tokenizer output as keyword arguments.
model_output = model(**input_tests)

In [35]:
# The two len() calls are evaluated but not displayed (only the last expression is).
# Displayed: the 'attn' entry of the first element of the model's last output,
# taken at index 0 on its two leading axes -- presumably (batch, head); TODO confirm.
len(model_output)
len(model_output[-1])
model_output[-1][0]['attn'][0,0,:,:]

tensor([[0.0053, 0.0109, 0.0052,  ..., 0.0039, 0.0036, 0.0144],
        [0.0086, 0.0041, 0.0125,  ..., 0.0045, 0.0041, 0.0071],
        [0.0051, 0.0043, 0.0046,  ..., 0.0043, 0.0045, 0.0031],
        ...,
        [0.0010, 0.0023, 0.0055,  ..., 0.0012, 0.0018, 0.0011],
        [0.0010, 0.0023, 0.0057,  ..., 0.0012, 0.0017, 0.0007],
        [0.0022, 0.0056, 0.0063,  ..., 0.0045, 0.0048, 0.0015]],
       grad_fn=<SliceBackward0>)

## 4. from scratch

In [61]:
# Run only the embedding module on the token ids and segment ids; its output
# is the input to the first encoder layer used in the manual computation below.
emb_output = model.embeddings(
    input_tests['input_ids'],
    input_tests['token_type_ids'],
)

# Show the tensor together with its shape.
emb_output, emb_output.shape

(tensor([[[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
          [-0.1172,  0.6055,  0.0487,  ...,  0.5867,  0.8167,  0.4067],
          [-0.7412,  0.3854, -0.7550,  ...,  0.5425,  0.5629,  0.6106],
          ...,
          [ 0.0679,  0.2560,  0.3443,  ...,  0.5042,  0.4860,  0.3145],
          [ 0.1079,  0.0740,  0.4233,  ...,  0.2864,  0.5379,  0.1220],
          [-0.0594, -0.0563,  0.2673,  ..., -0.7952, -0.0813, -0.6690]]],
        grad_fn=<AddBackward0>),
 torch.Size([1, 201, 768]))

In [40]:
# Show the sub-modules that make up the first encoder layer.
model.encoder.layer[0]

BertLayer(
  (attention): BertAttention(
    (self): BertSelfAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=768, out_features=768, bias=True)
      (value): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (output): BertSelfOutput(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (intermediate): BertIntermediate(
    (dense): Linear(in_features=768, out_features=3072, bias=True)
  )
  (output): BertOutput(
    (dense): Linear(in_features=3072, out_features=768, bias=True)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [62]:
# The shape tuple on the first line is evaluated but not displayed; only the
# last expression is rendered: the first `att_head_size` entries of the
# query bias of encoder layer 0.
model.encoder.layer[0].attention.self.query.weight.shape,model.encoder.layer[0].attention.self.query.bias.shape

model.encoder.layer[0].attention.self.query.bias[:att_head_size]

tensor([ 0.5849, -0.3312, -0.4301,  0.3745, -0.2981,  0.4103,  0.0136,  0.2938,
         0.2338, -0.1294,  0.1367,  0.4521, -0.1008,  0.1104,  0.4317,  0.5654,
         0.0308, -0.0466, -0.3148, -0.1194,  0.0061,  0.0062,  0.0023,  0.4704,
        -0.0229, -0.0624, -0.0711,  0.5856, -0.4203, -0.0035,  0.3211, -0.0183,
        -0.1371, -0.2399,  0.0593, -0.0781, -0.0723, -0.1255,  0.5402, -0.0784,
        -0.2501, -0.4059, -0.3739, -0.1389, -0.7171, -0.4686, -0.1296, -0.3605,
         0.0325,  0.2098,  0.4282, -0.0019,  0.6725, -0.1765,  0.2999, -0.2933,
         0.4123,  0.0808, -0.1765, -0.2740,  0.6475,  0.0608, -0.3303,  0.1725],
       grad_fn=<SliceBackward0>)

In [46]:
# Query projection restricted to the first attention head of the first layer.
#   emb_output[0]                       -> (201, 768)
#   weight.T[:, :att_head_size]         -> (768, 64)
#   product + bias[:att_head_size]      -> (201, 64), bias broadcast over rows
query_linear = model.encoder.layer[0].attention.self.query
Q_fst_head_fst_layer = (
    emb_output[0] @ query_linear.weight.T[:, :att_head_size]
    + query_linear.bias[:att_head_size]
)

In [49]:
# Key projection for the same head, built like the query: (201, 768) @ (768, 64) + (64,) -> (201, 64).
key_linear = model.encoder.layer[0].attention.self.key
K_fst_head_fst_layer = (
    emb_output[0] @ key_linear.weight.T[:, :att_head_size]
    + key_linear.bias[:att_head_size]
)

$$
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{Q K^{\top}}{\sqrt{d_k}}\right) V
$$

In [64]:
# Scaled dot-product attention weights for the first head of the first layer:
# softmax over the key axis of (Q @ K^T) / sqrt(head size). Result is 201 x 201.
# Despite the name, `attn_scores` holds the post-softmax weights.
# `F` (torch.nn.functional) comes from the notebook's top-level import cell.
attn_scores = F.softmax(
    (Q_fst_head_fst_layer @ K_fst_head_fst_layer.T) / math.sqrt(att_head_size),
    dim=1,
)
attn_scores,attn_scores.shape

(tensor([[0.0053, 0.0109, 0.0052,  ..., 0.0039, 0.0036, 0.0144],
         [0.0086, 0.0041, 0.0125,  ..., 0.0045, 0.0041, 0.0071],
         [0.0051, 0.0043, 0.0046,  ..., 0.0043, 0.0045, 0.0031],
         ...,
         [0.0010, 0.0023, 0.0055,  ..., 0.0012, 0.0018, 0.0011],
         [0.0010, 0.0023, 0.0057,  ..., 0.0012, 0.0017, 0.0007],
         [0.0022, 0.0056, 0.0063,  ..., 0.0045, 0.0048, 0.0015]],
        grad_fn=<SoftmaxBackward0>),
 torch.Size([201, 201]))

In [63]:
# Reference value taken from the model's own forward pass, shown with its shape
# for side-by-side comparison with the hand-computed `attn_scores`.
model_output[-1][0]['attn'][0,0,:,:],model_output[-1][0]['attn'][0,0,:,:].shape

(tensor([[0.0053, 0.0109, 0.0052,  ..., 0.0039, 0.0036, 0.0144],
         [0.0086, 0.0041, 0.0125,  ..., 0.0045, 0.0041, 0.0071],
         [0.0051, 0.0043, 0.0046,  ..., 0.0043, 0.0045, 0.0031],
         ...,
         [0.0010, 0.0023, 0.0055,  ..., 0.0012, 0.0018, 0.0011],
         [0.0010, 0.0023, 0.0057,  ..., 0.0012, 0.0017, 0.0007],
         [0.0022, 0.0056, 0.0063,  ..., 0.0045, 0.0048, 0.0015]],
        grad_fn=<SliceBackward0>),
 torch.Size([201, 201]))

In [58]:
# Value projection for the first head of the first layer, sliced to the same
# `att_head_size` columns as the query and key projections.
value_linear = model.encoder.layer[0].attention.self.value
V_fst_head_fst_layer = (
    emb_output[0] @ value_linear.weight.T[:, :att_head_size]
    + value_linear.bias[:att_head_size]
)
V_fst_head_fst_layer.shape

torch.Size([201, 64])

In [60]:
# Weight the per-token value vectors by the attention weights to get the
# head's output representation for every token.
attn_emb = torch.matmul(attn_scores, V_fst_head_fst_layer)
attn_emb.shape

torch.Size([201, 64])