In [1]:
import torch
from transformers import BertTokenizer, BertModel
import logging
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## defines tokenizer to tokenize input.
## All words unknown to the vocabulary will be split into subwords all the way down to invidual characters
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
text = "Sometimes I am so sick of being unproductive that I have dreams where I am being chased by deadlines"

In [4]:
## adding BERT defined separators in front

markedUpText = "[CLS]" + text + "[SEP]"

tokenized_text = tokenizer.tokenize(markedUpText)

print(text.split())
print(tokenized_text)

['Sometimes', 'I', 'am', 'so', 'sick', 'of', 'being', 'unproductive', 'that', 'I', 'have', 'dreams', 'where', 'I', 'am', 'being', 'chased', 'by', 'deadlines']
['[CLS]', 'sometimes', 'i', 'am', 'so', 'sick', 'of', 'being', 'un', '##pro', '##ductive', 'that', 'i', 'have', 'dreams', 'where', 'i', 'am', 'being', 'chased', 'by', 'deadline', '##s', '[SEP]']


In [5]:
## exploring the vocabulary of the BERT tokenizer

## [0] - [PAD]
## [100] - [UNK]
## [101] - [CLS]
## [102] - [SEP]
## [104] - [MASK]
## [999:1996] are starting characters ! ... ~
## [1996:29612] are words
## [29612:] are subwords


list(tokenizer.vocab.keys())[29612:]

['##!',
 '##"',
 '###',
 '##$',
 '##%',
 '##&',
 "##'",
 '##(',
 '##)',
 '##*',
 '##+',
 '##,',
 '##-',
 '##.',
 '##/',
 '##:',
 '##;',
 '##<',
 '##=',
 '##>',
 '##?',
 '##@',
 '##[',
 '##\\',
 '##]',
 '##^',
 '##_',
 '##`',
 '##{',
 '##|',
 '##}',
 '##~',
 '##¡',
 '##¢',
 '##£',
 '##¤',
 '##¥',
 '##¦',
 '##§',
 '##¨',
 '##©',
 '##ª',
 '##«',
 '##¬',
 '##®',
 '##±',
 '##´',
 '##µ',
 '##¶',
 '##·',
 '##º',
 '##»',
 '##¼',
 '##¾',
 '##¿',
 '##æ',
 '##ð',
 '##÷',
 '##þ',
 '##đ',
 '##ħ',
 '##ŋ',
 '##œ',
 '##ƒ',
 '##ɐ',
 '##ɑ',
 '##ɒ',
 '##ɔ',
 '##ɕ',
 '##ə',
 '##ɡ',
 '##ɣ',
 '##ɨ',
 '##ɪ',
 '##ɫ',
 '##ɬ',
 '##ɯ',
 '##ɲ',
 '##ɴ',
 '##ɹ',
 '##ɾ',
 '##ʀ',
 '##ʁ',
 '##ʂ',
 '##ʃ',
 '##ʉ',
 '##ʊ',
 '##ʋ',
 '##ʌ',
 '##ʎ',
 '##ʐ',
 '##ʑ',
 '##ʒ',
 '##ʔ',
 '##ʰ',
 '##ʲ',
 '##ʳ',
 '##ʷ',
 '##ʸ',
 '##ʻ',
 '##ʼ',
 '##ʾ',
 '##ʿ',
 '##ˈ',
 '##ˡ',
 '##ˢ',
 '##ˣ',
 '##ˤ',
 '##β',
 '##γ',
 '##δ',
 '##ε',
 '##ζ',
 '##θ',
 '##κ',
 '##λ',
 '##μ',
 '##ξ',
 '##ο',
 '##π',
 '##ρ',
 '##σ',
 '##τ',
 '##υ',
 '##φ',

In [6]:
## map words to indices

token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)

for token, id in zip(tokenized_text, token_ids):
    print(token, id)

[CLS] 101
sometimes 2823
i 1045
am 2572
so 2061
sick 5305
of 1997
being 2108
un 4895
##pro 21572
##ductive 26638
that 2008
i 1045
have 2031
dreams 5544
where 2073
i 1045
am 2572
being 2108
chased 13303
by 2011
deadline 15117
##s 2015
[SEP] 102


In [7]:
## add sentence ids to each token

sentence_ids = [1] * len(token_ids)

for token, token_id, sentence_id in zip(tokenized_text, token_ids, sentence_ids):
    print(token, token_id, sentence_id)

[CLS] 101 1
sometimes 2823 1
i 1045 1
am 2572 1
so 2061 1
sick 5305 1
of 1997 1
being 2108 1
un 4895 1
##pro 21572 1
##ductive 26638 1
that 2008 1
i 1045 1
have 2031 1
dreams 5544 1
where 2073 1
i 1045 1
am 2572 1
being 2108 1
chased 13303 1
by 2011 1
deadline 15117 1
##s 2015 1
[SEP] 102 1


In [8]:
## We need to turn token id and segment id lists into tensors

tokens_tensor = torch.tensor([token_ids])

segment_tensor = torch.tensor([sentence_ids])

print(tokens_tensor)

print(segment_tensor)

tensor([[  101,  2823,  1045,  2572,  2061,  5305,  1997,  2108,  4895, 21572,
         26638,  2008,  1045,  2031,  5544,  2073,  1045,  2572,  2108, 13303,
          2011, 15117,  2015,   102]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [9]:
## load the BERT pretrained model

model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)

model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [10]:
with torch.no_grad():
    outputs = model(tokens_tensor, segment_tensor)

    print(tokens_tensor.shape)
    print(segment_tensor.shape)

    ## output = last layer hidden state, pooler output, hidden state output by layer
    hidden_states = outputs[2] # last layer

    layer_id = 0

    for layer in hidden_states:

        print(f"layer: {layer_id} shape: {layer.shape}")

        print(f"\t length of a sentence is {len(layer)}")

        sentence_hidden_layer = layer[0]

        for word, output in zip(tokenized_text, sentence_hidden_layer):
            print(f"\t\t{word}: {output[:3]}")

        layer_id += 1

torch.Size([1, 24])
torch.Size([1, 24])
layer: 0 shape: torch.Size([1, 24, 768])
	 length of a sentence is 1
		[CLS]: tensor([ 0.1686, -0.2858, -0.3261])
		sometimes: tensor([-0.8025, -0.1504, -0.4737])
		i: tensor([-0.3319,  0.4860, -0.1578])
		am: tensor([-0.5092,  0.0347,  0.0977])
		so: tensor([-0.3429,  0.2499,  0.3117])
		sick: tensor([-0.3673,  0.5709,  0.4238])
		of: tensor([-0.3686,  0.1742,  0.0343])
		being: tensor([-0.0210,  0.3527, -0.6497])
		un: tensor([-0.6634, -0.5497,  0.1337])
		##pro: tensor([ 0.3372,  0.9769, -0.8495])
		##ductive: tensor([ 0.8530, -0.5424, -0.2855])
		that: tensor([-0.9356,  0.6592, -0.4697])
		i: tensor([-0.3938,  0.6048, -0.0081])
		have: tensor([-0.5228,  0.5126,  0.4224])
		dreams: tensor([-0.8008,  0.4791, -0.7845])
		where: tensor([-1.0717,  0.7141, -1.0482])
		i: tensor([-0.4299,  0.7476, -0.1223])
		am: tensor([-0.5885,  0.3569,  0.3604])
		being: tensor([-0.1835,  0.6141, -0.5630])
		chased: tensor([-0.0855, -0.2292,  0.2734])
		by: tenso

In [11]:
with torch.no_grad():
    outputs = model(tokens_tensor, segment_tensor)

    hidden_states = outputs[2][-1]

    for word, output in zip(tokenized_text, sentence_hidden_layer):
        print(f"\t\t{word}: {output[:3]}")

		[CLS]: tensor([ 0.5613,  0.0896, -0.1570])
		sometimes: tensor([-0.0022,  0.2364,  0.5325])
		i: tensor([0.4311, 0.1228, 0.0790])
		am: tensor([-0.0116,  0.2984,  0.6138])
		so: tensor([-0.6140, -0.3001,  0.0847])
		sick: tensor([ 0.4870, -0.3558,  0.1976])
		of: tensor([-0.5893,  0.5575,  0.0885])
		being: tensor([ 0.1253,  0.0403, -0.0894])
		un: tensor([ 0.0331, -0.3374,  0.2331])
		##pro: tensor([ 0.1838, -0.0970, -0.2647])
		##ductive: tensor([ 0.6815,  0.0218, -0.2953])
		that: tensor([-0.3935,  0.8944,  0.1987])
		i: tensor([0.3077, 0.2916, 0.2182])
		have: tensor([0.1284, 0.8554, 0.9328])
		dreams: tensor([0.4492, 0.3384, 0.6839])
		where: tensor([-0.3373,  0.1329,  0.5526])
		i: tensor([0.2139, 0.3042, 0.2200])
		am: tensor([ 0.1149,  0.6153, -0.0596])
		being: tensor([ 0.2230, -0.2829, -0.0083])
		chased: tensor([ 0.8882, -0.4940,  0.5868])
		by: tensor([ 0.1810, -0.1551,  0.3991])
		deadline: tensor([ 0.4241, -0.3513, -0.1401])
		##s: tensor([-0.0337, -0.6354, -0.3106])
		

In [12]:
## use BERT to evaluate our input

## [POINT] torch.no_grad will tell pytorch to not make the computing graph on the forward pass.
## the forward pass is used during backprop, but since we only need the encoder's output states, we don't need the graph

## [POINT] BaseModelOutput (base model's output) returns [0] Last layer's hidden layer [1] hidden layer [2] attentions
with torch.no_grad():
    outputs = model(tokens_tensor, segment_tensor)

    hidden_states = outputs[2][-1]

    print(f"layer: {layer_id} shape: {layer.shape}")

layer: 13 shape: torch.Size([1, 24, 768])
