In [1]:
import torch
from torch import nn
from transformers import BertModel, BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint identifier shared by the tokenizer and the model.
model_name = "bert-base-uncased"

# Tokenizer and encoder are loaded from the same checkpoint.
# output_hidden_states=True is passed so the model is configured to return
# its hidden states in addition to the default outputs.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(
    model_name,
    output_hidden_states=True,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## 1. input

In [10]:
# Example sentence that uses the word "bank" several times.
text = (
    "After stealing money from the bank vault, the bank robber was seen "
    "fishing on the Mississippi river bank."
)

# Tokenize into PyTorch tensors ('pt'), then show the token ids and their shape.
token_input = tokenizer(text, return_tensors='pt')
(token_input['input_ids'], token_input['input_ids'].shape)

(tensor([[  101,  2044, 11065,  2769,  2013,  1996,  2924, 11632,  1010,  1996,
           2924, 27307,  2001,  2464,  5645,  2006,  1996,  5900,  2314,  2924,
           1012,   102]]),
 torch.Size([1, 22]))

## 2. model forward
- forward
    - embedding => encoder => pooler

In [8]:
# Load the model from `model_name`, the same checkpoint identifier the
# tokenizer was built from, so the two always refer to the same checkpoint.
model = BertModel.from_pretrained(model_name, output_hidden_states=True)

# Switch to evaluation mode (e.g. disables dropout) and run a single forward
# pass without autograd tracking, since this is inference only.
model.eval()
with torch.no_grad():
    outputs = model(**token_input)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## 3. output
- len(outputs) == 3
- outputs[0]
    - last_hidden_state, shape: `batch_size * seq_len * hidden_size (1 * 22 * 768)`
- outputs[1]
    - pooler_output, shape: `batch_size * hidden_size(1*768)`
    - last-layer hidden state of the first token of the sequence (the classification token, [CLS]), further processed by the pooler (a linear layer followed by tanh)
- outputs[2](model.config.output_hidden_states=True)
    - type: tuple
    - one for the output of the embeddings (1), if the model has an embedding layer, plus one for the output of each encoder layer (12)
    - `(1+12) * (batch_size*seq_len*hidden_size) = 13*1*22*768`

- outputs[0] == outputs[2][-1]

- outputs[1] == model.pooler(outputs[2][-1])

- outputs[2][0] == model.embeddings(token_input['input_ids'], token_input['token_type_ids'])

In [9]:
# Number of entries in the model output.
len(outputs)

3

In [11]:
# Shape of the first output entry (the last hidden state).
outputs[0].shape

torch.Size([1, 22, 768])

In [13]:
# Second output entry (the pooler output); displays the full tensor.
outputs[1]

tensor([[-0.6031, -0.3342, -0.7174,  0.3347,  0.5145, -0.1722,  0.4502,  0.2768,
         -0.3769, -0.9998, -0.3657,  0.7535,  0.9817, -0.0192,  0.7959, -0.3459,
         -0.1338, -0.3026,  0.1097,  0.5836,  0.5736,  0.9999,  0.1798,  0.1845,
          0.2250,  0.9109, -0.5653,  0.8616,  0.8994,  0.7423, -0.2525,  0.0394,
         -0.9894, -0.1331, -0.7763, -0.9826,  0.2223, -0.6115,  0.1941,  0.0177,
         -0.7634,  0.2312,  0.9999, -0.7000,  0.4623, -0.2202, -1.0000,  0.1908,
         -0.8150,  0.6483,  0.5878,  0.8198,  0.1014,  0.3185,  0.3963, -0.3216,
         -0.1701,  0.0588, -0.1544, -0.4987, -0.5284,  0.1228, -0.4823, -0.7788,
          0.6954,  0.0891, -0.0855, -0.1500,  0.0390, -0.0760,  0.6154,  0.2662,
         -0.0129, -0.7253,  0.1352,  0.2921, -0.5613,  1.0000,  0.1536, -0.9681,
          0.7166,  0.2600,  0.4519,  0.5470, -0.2798, -1.0000,  0.3419, -0.2645,
         -0.9863,  0.1263,  0.5249, -0.2000,  0.5980,  0.4752, -0.2355, -0.4808,
         -0.3786, -0.7284, -

In [20]:
# outputs[2] is the tuple of hidden states returned because the model was
# loaded with output_hidden_states=True.
hidden_states = outputs[2]

# Every hidden state has the same shape as the last hidden state (outputs[0]).
assert all(h.shape == outputs[0].shape for h in hidden_states)

# outputs[1] == model.pooler(outputs[2][-1])
with torch.no_grad():
    assert torch.allclose(outputs[1], model.pooler(hidden_states[-1]))

# Number of hidden states in the tuple.
len(hidden_states)

13

In [21]:
# First entry of the hidden-states tuple.
outputs[2][0]

tensor([[[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
         [ 0.2329,  0.1390,  0.2979,  ..., -0.0655,  0.8885,  0.5109],
         [ 0.2257, -0.7165, -0.7255,  ...,  0.4844,  0.6030, -0.0957],
         ...,
         [-0.0374, -0.6155, -1.4419,  ...,  0.0793, -0.0811, -0.3802],
         [-0.0228,  0.4207, -0.3288,  ...,  0.4464,  0.5178,  0.5501],
         [-0.2350,  0.1566, -0.0462,  ..., -0.4206,  0.3074, -0.2288]]])

In [22]:
# Run only the embedding module on the token ids, for comparison with
# outputs[2][0]. Called outside torch.no_grad(), so the result is tracked
# by autograd.
model.embeddings(input_ids=token_input['input_ids'])

tensor([[[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
         [ 0.2329,  0.1390,  0.2979,  ..., -0.0655,  0.8885,  0.5109],
         [ 0.2257, -0.7165, -0.7255,  ...,  0.4844,  0.6030, -0.0957],
         ...,
         [-0.0374, -0.6155, -1.4419,  ...,  0.0793, -0.0811, -0.3802],
         [-0.0228,  0.4207, -0.3288,  ...,  0.4464,  0.5178,  0.5501],
         [-0.2350,  0.1566, -0.0462,  ..., -0.4206,  0.3074, -0.2288]]],
       grad_fn=<NativeLayerNormBackward0>)