In [1]:
import torch
from transformers import BertTokenizer,BertModel,BertForMaskedLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Identifier of the pretrained checkpoint; both the tokenizer and the encoder
# below are loaded from it.
model_name = 'bert-base-uncased'
# Example sentence fed to the tokenizer.
text = 'this is a text sentence.'

tokenizer = BertTokenizer.from_pretrained(model_name)
# Loading the bare encoder reports the checkpoint's `cls.*` head weights as unused.
model = BertModel.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Encode the example sentence; the resulting mapping holds `input_ids`,
# `token_type_ids` and `attention_mask`.
inputs = tokenizer(
    text,
    return_tensors='pt',  # 'pt' -> values come back as PyTorch tensors
)
# Bare last expression so the notebook renders the encoding.
inputs

{'input_ids': tensor([[ 101, 2023, 2003, 1037, 3793, 6251, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [5]:
# Take the ids of the first (and only) encoded sequence and map them back to
# their token strings, special tokens included.
first_sequence_ids = inputs['input_ids'][0]
tokenizer.convert_ids_to_tokens(first_sequence_ids)

['[CLS]', 'this', 'is', 'a', 'text', 'sentence', '.', '[SEP]']

## 1. Forward pass and pooler output

In [7]:
# Switch the model to evaluation mode before running the forward pass. Per the
# PyTorch docs this makes the Dropout(p=0.1) layers visible in the printed
# module tree act as no-ops. The call returns the module itself, which is why
# the cell output is the model's repr.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [11]:
# Inference-only forward pass: gradient tracking is disabled while the model
# runs, so the returned tensors carry no autograd state.
with torch.no_grad():
    outputs = model(**inputs)

# Show which fields the model returned. This must be the cell's last top-level
# expression to be rendered; an expression nested inside the `with` block
# would not be displayed.
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [12]:
# Shapes of the two returned tensors, in order: the hidden states for every
# token, then the pooled output.
tuple(outputs[field].shape for field in ('last_hidden_state', 'pooler_output'))

(torch.Size([1, 8, 768]), torch.Size([1, 768]))

In [13]:
# Display the pooled output exactly as returned by the model's forward pass
# (one row of 768 values, per the shape check above).
outputs['pooler_output']

tensor([[-0.9320, -0.4660, -0.7054,  0.8013,  0.5395, -0.2326,  0.8985,  0.3100,
         -0.5941, -1.0000, -0.1055,  0.8209,  0.9858,  0.2604,  0.9526, -0.6847,
         -0.2648, -0.6366,  0.3471, -0.7114,  0.6518,  0.9998,  0.4752,  0.3604,
          0.5276,  0.9401, -0.6821,  0.9463,  0.9627,  0.7596, -0.7972,  0.2211,
         -0.9909, -0.2564, -0.7488, -0.9928,  0.4347, -0.8085, -0.0717, -0.0137,
         -0.9286,  0.3592,  1.0000, -0.4739,  0.2884, -0.4205, -1.0000,  0.3234,
         -0.9190,  0.7232,  0.6715,  0.5167,  0.2301,  0.5236,  0.5461, -0.0622,
         -0.0630,  0.1883, -0.2802, -0.6607, -0.6670,  0.3900, -0.6133, -0.9413,
          0.5821,  0.5472, -0.1604, -0.3769, -0.1576, -0.0250,  0.9041,  0.2955,
         -0.0418, -0.8321,  0.3900,  0.3109, -0.6546,  1.0000, -0.5625, -0.9819,
          0.6171,  0.5162,  0.6011,  0.0482,  0.2289, -1.0000,  0.5958, -0.2103,
         -0.9917,  0.1602,  0.5617, -0.2821,  0.4634,  0.6287, -0.4933, -0.3350,
         -0.4028, -0.5878, -

## 2. Pooler output from scratch
![BERT padding and attention mask](https://mccormickml.com/assets/BERT/padding_and_mask.png)

In [17]:
# Rebuild the pooled output by hand: pass the final hidden state of the first
# token ('[CLS]') of the first sequence through the pooler's dense layer and
# then its activation. Run it under no_grad(), like the reference forward pass,
# so the result carries no autograd state.
with torch.no_grad():
    cls_hidden = outputs['last_hidden_state'][0, 0]
    my_output = model.pooler.activation(model.pooler.dense(cls_hidden))

# Check the claim instead of comparing printed numbers by eye. The small
# absolute tolerance absorbs float32 differences between the batched and the
# single-vector matmul.
assert torch.allclose(my_output, outputs['pooler_output'][0], atol=1e-5), \
    "manual pooler output does not match outputs['pooler_output']"
my_output

tensor([-0.9320, -0.4660, -0.7054,  0.8013,  0.5395, -0.2326,  0.8985,  0.3100,
        -0.5941, -1.0000, -0.1055,  0.8209,  0.9858,  0.2604,  0.9526, -0.6847,
        -0.2648, -0.6366,  0.3471, -0.7114,  0.6518,  0.9998,  0.4752,  0.3604,
         0.5276,  0.9401, -0.6821,  0.9463,  0.9627,  0.7596, -0.7972,  0.2211,
        -0.9909, -0.2564, -0.7488, -0.9928,  0.4347, -0.8085, -0.0717, -0.0137,
        -0.9286,  0.3592,  1.0000, -0.4739,  0.2884, -0.4205, -1.0000,  0.3234,
        -0.9190,  0.7232,  0.6715,  0.5167,  0.2301,  0.5236,  0.5461, -0.0622,
        -0.0630,  0.1883, -0.2802, -0.6607, -0.6670,  0.3900, -0.6133, -0.9413,
         0.5821,  0.5472, -0.1604, -0.3769, -0.1576, -0.0250,  0.9041,  0.2955,
        -0.0418, -0.8321,  0.3900,  0.3109, -0.6546,  1.0000, -0.5625, -0.9819,
         0.6171,  0.5162,  0.6011,  0.0482,  0.2289, -1.0000,  0.5958, -0.2103,
        -0.9917,  0.1602,  0.5617, -0.2821,  0.4634,  0.6287, -0.4933, -0.3350,
        -0.4028, -0.5878, -0.2872, -0.26