In [1]:
import torch
import torch.nn as nn
import transformers

from transformers import BertTokenizer, BertModel, AutoConfig

## BERT Model

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
config = AutoConfig.from_pretrained('bert-base-uncased')

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Load the pretrained checkpoint. Constructing `BertModel(config)` directly
# only builds the architecture with randomly initialised weights, so every
# embedding computed further down would be noise. `add_pooling_layer=False`
# is forwarded to the model constructor, so the pooler head is still omitted.
model = BertModel.from_pretrained(
    'bert-base-uncased',
    config=config,
    add_pooling_layer=False,
).to(device)

# Evaluation mode: turns off the dropout layers for the forward passes below.
model.eval();

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [3]:
# View BERT model architecture: the bare expression below prints the nested
# module tree (embeddings followed by the encoder layers).

model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [4]:
# Toy batch of four sentences: two about a dancing man, two about a child
# riding a horse. Used as input for every pooling experiment below.
data = [
    "A man with a hard hat is dancing.",
    "A man wearing a hard hat is dancing.",
    "A young child is riding a horse.",
    "A child is riding a horse.",
]

In [5]:
# Number of stacked encoder layers; `len()` is the idiomatic way to ask for it
# instead of calling the `__len__` dunder directly.
len(model.encoder.layer)

12

In [6]:
# Tokenise the whole batch in one call, pad every sequence to the longest one
# and move the resulting tensors to the model's device. Calling the tokenizer
# directly replaces `batch_encode_plus`, which the library deprecates in
# favour of `__call__`.
inputs = tokenizer(
    data,
    return_tensors='pt',
    padding=True,
).to(device)

In [7]:
# One row of token ids per sentence, all padded to a common length.
inputs['input_ids'].shape  # (bs, seq_len)

torch.Size([4, 11])

In [8]:
# Raw token ids; shorter sentences are right-padded with id 0.
inputs['input_ids']

tensor([[ 101, 1037, 2158, 2007, 1037, 2524, 6045, 2003, 5613, 1012,  102],
        [ 101, 1037, 2158, 4147, 1037, 2524, 6045, 2003, 5613, 1012,  102],
        [ 101, 1037, 2402, 2775, 2003, 5559, 1037, 3586, 1012,  102,    0],
        [ 101, 1037, 2775, 2003, 5559, 1037, 3586, 1012,  102,    0,    0]],
       device='cuda:0')

In [9]:
# 1 marks a real token, 0 marks a padded position (compare with input_ids).
inputs['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]], device='cuda:0')

In [10]:
# Pure inference: run the forward pass without autograd so no computation
# graph is recorded or kept alive for these activations.
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True, return_dict=True)
outputs.keys()

odict_keys(['last_hidden_state', 'hidden_states'])

In [11]:
# Number of hidden-state tensors returned by the model; `len()` is the
# idiomatic replacement for the explicit `.__len__()` call.
len(outputs['hidden_states'])

13

In [58]:
# Sanity check: the final entry of `hidden_states` holds the same values as
# `last_hidden_state`.
(outputs['last_hidden_state'] == outputs['hidden_states'][-1]).all()

tensor(True, device='cuda:0')

In [59]:
# One vector per token position for every sentence in the batch.
outputs.last_hidden_state.shape  # (bs, seq_len, hidden_size)

torch.Size([4, 11, 768])

In [68]:
# cls, cls_before_pooler
# Pooling by slicing sequence position 0 of the final hidden states, i.e. the
# vector of the first token of every sentence.
# NOTE(review): despite its name, `pooler_output` is not produced by a pooler
# head — the model above is created with add_pooling_layer=False.

pooler_output = outputs.last_hidden_state[:, 0]
pooler_output.shape

torch.Size([4, 768])

In [69]:
# avg — gather the two tensors needed for mask-aware mean pooling

attention_mask = inputs.attention_mask        # (bs, seq_len)
last_hidden = outputs['last_hidden_state']    # (bs, seq_len, hidden_size)

In [73]:
# Token embeddings that will be averaged: (bs, seq_len, hidden_size).
last_hidden.shape

torch.Size([4, 11, 768])

In [70]:
# A trailing singleton axis lets the mask broadcast across the hidden dimension.
attention_mask.unsqueeze(-1).shape

torch.Size([4, 11, 1])

In [72]:
# Multiplying by the mask zeroes the embeddings at padded positions (mask == 0).
(last_hidden * attention_mask.unsqueeze(-1)).shape

torch.Size([4, 11, 768])

In [76]:
# Summing over the sequence axis (dim 1) leaves one vector per sentence.
(last_hidden * attention_mask.unsqueeze(-1)).sum(1).shape

torch.Size([4, 768])

In [78]:
 # Per-sentence count of unmasked tokens, kept as a column so it broadcasts.
 attention_mask.sum(-1).unsqueeze(-1).shape

torch.Size([4, 1])

In [83]:
# Masked mean: summed token embeddings divided by the per-sentence token count.
((last_hidden * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(-1).unsqueeze(-1)).shape

torch.Size([4, 768])

In [92]:
# avg_first_last
# Masked mean over tokens of the element-wise average of two hidden states.
# NOTE(review): `hidden_states` has one more entry (13) than the encoder has
# layers (12), so index 0 is presumably the embedding output rather than the
# first encoder layer — confirm which one "first" is meant to be.

first_hidden, last_hidden = outputs.hidden_states[0], outputs.hidden_states[-1]
pooled_result = (
    ((first_hidden + last_hidden) / 2.0 * attention_mask.unsqueeze(-1)).sum(dim=1)
    / attention_mask.sum(dim=-1, keepdim=True)
)
pooled_result.shape

torch.Size([4, 768])

## DistilBERT Model

In [3]:
import torch

from transformers import DistilBertTokenizer, DistilBertModel, AutoConfig

In [5]:
# Config and tokenizer for the DistilBERT checkpoint.
config = AutoConfig.from_pretrained("distilbert-base-uncased")
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Use the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained weights, move them to the device and switch to eval mode
# (`eval()` returns the module itself, so it can be chained).
model = DistilBertModel.from_pretrained("distilbert-base-uncased").to(device).eval()

In [6]:
# Print the nested module tree of the DistilBERT model.
model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(i

In [11]:
# Number of stacked transformer blocks; `len()` is the idiomatic way to ask
# for it instead of calling the `__len__` dunder directly.
len(model.transformer.layer)

6

In [7]:
# Same four toy sentences as in the BERT section: two about a dancing man,
# two about a child riding a horse.
data = [
    "A man with a hard hat is dancing.",
    "A man wearing a hard hat is dancing.",
    "A young child is riding a horse.",
    "A child is riding a horse.",
]

In [8]:
# Tokenise the whole batch in one call, pad every sequence to the longest one
# and move the resulting tensors to the model's device. Calling the tokenizer
# directly replaces `batch_encode_plus`, which the library deprecates in
# favour of `__call__`.
inputs = tokenizer(
    data,
    return_tensors='pt',
    padding=True,
).to(device)

In [9]:
# List which tensors the tokenizer produced for this model.
inputs.keys()

dict_keys(['input_ids', 'attention_mask'])

In [10]:
# Pure inference: run the forward pass without autograd so no computation
# graph is recorded or kept alive for these activations.
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True, return_dict=True)
outputs.keys()

odict_keys(['last_hidden_state', 'hidden_states'])

In [12]:
# Number of hidden-state tensors returned by the model; `len()` is the
# idiomatic replacement for the explicit `.__len__()` call.
len(outputs.hidden_states)

7

In [14]:
# avg_first_last
# Masked mean over tokens of the element-wise average of two hidden states.
# NOTE(review): `hidden_states` has one more entry (7) than the transformer has
# layers (6), so index 0 is presumably the embedding output rather than the
# first transformer block — confirm which one "first" is meant to be.

attention_mask = inputs['attention_mask']

first_hidden, last_hidden = outputs.hidden_states[0], outputs.hidden_states[-1]
pooled_result = (
    ((first_hidden + last_hidden) / 2.0 * attention_mask.unsqueeze(-1)).sum(dim=1)
    / attention_mask.sum(dim=-1, keepdim=True)
)
pooled_result.shape

torch.Size([4, 768])

In [15]:
# Show the pooled sentence embeddings, one row per input sentence.
pooled_result

tensor([[ 0.0244, -0.1374, -0.0628,  ..., -0.0035,  0.0239,  0.0379],
        [ 0.0257, -0.1583, -0.0503,  ..., -0.0099, -0.0145,  0.0198],
        [ 0.0419, -0.0668, -0.1307,  ..., -0.1258, -0.1022, -0.1649],
        [ 0.0570, -0.0996, -0.1695,  ..., -0.1387, -0.0736, -0.0526]],
       device='cuda:0', grad_fn=<DivBackward0>)