In [1]:
import torch
from transformers import BertTokenizer, BertModel
import logging
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## defines tokenizer to tokenize input.
## All words unknown to the vocabulary will be split into subwords all the way down to invidual characters
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
## input sentence
text = "I wish I can be more productive and be less tempted to play video games"

In [4]:
## adding BERT defined separators in front

markedUpText = "[CLS]" + text + "[SEP]"

tokenized_text = tokenizer.tokenize(markedUpText)

print(text.split())
print(tokenized_text)

['I', 'wish', 'I', 'can', 'be', 'more', 'productive', 'and', 'be', 'less', 'tempted', 'to', 'play', 'video', 'games']
['[CLS]', 'i', 'wish', 'i', 'can', 'be', 'more', 'productive', 'and', 'be', 'less', 'tempted', 'to', 'play', 'video', 'games', '[SEP]']


In [5]:
## exploring the vocabulary of the BERT tokenizer

## [0] - [PAD]
## [100] - [UNK]
## [101] - [CLS]
## [102] - [SEP]
## [104] - [MASK]
## [999:1996] are starting characters ! ... ~
## [1996:29612] are words
## [29612:] are subwords


list(tokenizer.vocab.keys())[29612:]

['##!',
 '##"',
 '###',
 '##$',
 '##%',
 '##&',
 "##'",
 '##(',
 '##)',
 '##*',
 '##+',
 '##,',
 '##-',
 '##.',
 '##/',
 '##:',
 '##;',
 '##<',
 '##=',
 '##>',
 '##?',
 '##@',
 '##[',
 '##\\',
 '##]',
 '##^',
 '##_',
 '##`',
 '##{',
 '##|',
 '##}',
 '##~',
 '##¡',
 '##¢',
 '##£',
 '##¤',
 '##¥',
 '##¦',
 '##§',
 '##¨',
 '##©',
 '##ª',
 '##«',
 '##¬',
 '##®',
 '##±',
 '##´',
 '##µ',
 '##¶',
 '##·',
 '##º',
 '##»',
 '##¼',
 '##¾',
 '##¿',
 '##æ',
 '##ð',
 '##÷',
 '##þ',
 '##đ',
 '##ħ',
 '##ŋ',
 '##œ',
 '##ƒ',
 '##ɐ',
 '##ɑ',
 '##ɒ',
 '##ɔ',
 '##ɕ',
 '##ə',
 '##ɡ',
 '##ɣ',
 '##ɨ',
 '##ɪ',
 '##ɫ',
 '##ɬ',
 '##ɯ',
 '##ɲ',
 '##ɴ',
 '##ɹ',
 '##ɾ',
 '##ʀ',
 '##ʁ',
 '##ʂ',
 '##ʃ',
 '##ʉ',
 '##ʊ',
 '##ʋ',
 '##ʌ',
 '##ʎ',
 '##ʐ',
 '##ʑ',
 '##ʒ',
 '##ʔ',
 '##ʰ',
 '##ʲ',
 '##ʳ',
 '##ʷ',
 '##ʸ',
 '##ʻ',
 '##ʼ',
 '##ʾ',
 '##ʿ',
 '##ˈ',
 '##ˡ',
 '##ˢ',
 '##ˣ',
 '##ˤ',
 '##β',
 '##γ',
 '##δ',
 '##ε',
 '##ζ',
 '##θ',
 '##κ',
 '##λ',
 '##μ',
 '##ξ',
 '##ο',
 '##π',
 '##ρ',
 '##σ',
 '##τ',
 '##υ',
 '##φ',

In [6]:
## map words to indices

token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)

for token, id in zip(tokenized_text, token_ids):
    print(token, id)

[CLS] 101
i 1045
wish 4299
i 1045
can 2064
be 2022
more 2062
productive 13318
and 1998
be 2022
less 2625
tempted 16312
to 2000
play 2377
video 2678
games 2399
[SEP] 102


In [7]:
## add sentence ids to each token

sentence_ids = [1] * len(token_ids)

for token, token_id, sentence_id in zip(tokenized_text, token_ids, sentence_ids):
    print(token, token_id, sentence_id)

[CLS] 101 1
i 1045 1
wish 4299 1
i 1045 1
can 2064 1
be 2022 1
more 2062 1
productive 13318 1
and 1998 1
be 2022 1
less 2625 1
tempted 16312 1
to 2000 1
play 2377 1
video 2678 1
games 2399 1
[SEP] 102 1


In [8]:
## We need to turn token id and segment id lists into tensors

tokens_tensor = torch.tensor([token_ids])

segment_tensor = torch.tensor([sentence_ids])

print(tokens_tensor)

print(segment_tensor)

tensor([[  101,  1045,  4299,  1045,  2064,  2022,  2062, 13318,  1998,  2022,
          2625, 16312,  2000,  2377,  2678,  2399,   102]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [9]:
## load the BERT pretrained model

model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)

model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [25]:
with torch.no_grad():
    outputs = model(tokens_tensor, segment_tensor)

    ## output = last layer hidden state, pooler output, hidden state output by layer
    hidden_states = outputs[2] # last layer

    layer_id = 0

    for layer in hidden_states:

        print(f"layer: {layer_id} shape: {layer.shape}")

        print(f"\t length of a sentence is {len(layer)}")

        sentence_hidden_layer = layer[0]

        for word, output in zip(tokenized_text, sentence_hidden_layer):
            print(f"\t\t{word}: {output[:3]}")

        layer_id += 1

layer: 0 shape: torch.Size([1, 17, 768])
	 length of a sentence is 1
		[CLS]: tensor([ 0.1686, -0.2858, -0.3261])
		i: tensor([-3.4026e-04,  5.3974e-01, -2.8805e-01])
		wish: tensor([-0.4765, -0.4256, -0.0395])
		i: tensor([-0.2035,  0.3146, -0.3127])
		can: tensor([ 1.0617, -0.8491,  0.7407])
		be: tensor([ 0.5315, -0.0140, -0.7170])
		more: tensor([ 0.8776,  0.2554, -0.4276])
		productive: tensor([-0.1674,  1.1317, -0.7485])
		and: tensor([-0.1952,  0.1577, -0.1134])
		be: tensor([ 0.7034,  0.2527, -0.6757])
		less: tensor([ 0.2373,  0.7896, -0.0711])
		tempted: tensor([0.2990, 0.1633, 0.6750])
		to: tensor([0.1360, 0.5616, 0.0765])
		play: tensor([-1.0072,  0.5862, -0.2050])
		video: tensor([ 0.3764,  0.8183, -0.1226])
		games: tensor([-0.1782,  1.0912,  0.5143])
		[SEP]: tensor([-0.5870,  0.2658,  0.0439])
layer: 1 shape: torch.Size([1, 17, 768])
	 length of a sentence is 1
		[CLS]: tensor([ 0.0586,  0.0456, -0.0594])
		i: tensor([ 0.4496,  0.4641, -0.3497])
		wish: tensor([-0.4856

In [26]:
with torch.no_grad():
    outputs = model(tokens_tensor, segment_tensor)

    hidden_states = outputs[2][-1]

    for word, output in zip(tokenized_text, sentence_hidden_layer):
        print(f"\t\t{word}: {output[:3]}")

		[CLS]: tensor([ 0.3246,  0.3165, -0.1085])
		i: tensor([ 0.2805,  0.3541, -0.3777])
		wish: tensor([0.2140, 0.5532, 0.2470])
		i: tensor([0.3993, 0.1602, 0.0859])
		can: tensor([ 0.3594, -0.2082,  0.2150])
		be: tensor([ 0.0823, -0.2367, -0.3097])
		more: tensor([-0.2828, -0.6498, -0.3860])
		productive: tensor([ 0.2899,  0.1550, -0.3456])
		and: tensor([-0.3852, -0.0556, -0.4581])
		be: tensor([ 0.0265, -0.1905, -0.2290])
		less: tensor([-0.4236, -0.4347, -0.4813])
		tempted: tensor([ 0.2982, -0.3796,  0.1680])
		to: tensor([ 0.4241,  0.2798, -0.2184])
		play: tensor([1.0557, 0.8643, 0.2405])
		video: tensor([ 0.5695,  0.4635, -0.0652])
		games: tensor([ 0.6511,  0.3822, -0.2510])
		[SEP]: tensor([ 0.6148,  0.2484, -0.2419])


In [None]:
## use BERT to evaluate our input

## [POINT] torch.no_grad will tell pytorch to not make the computing graph on the forward pass.
## the forward pass is used during backprop, but since we only need the encoder's output states, we don't need the graph

## [POINT] BaseModelOutput (base model's output) returns [0] Last layer's hidden layer [1] hidden layer [2] attentions
with torch.no_grad():
    outputs = model(tokens_tensor, segment_tensor)

    hidden_states = outputs[2][-1]

    print(f"layer: {layer_id} shape: {layer.shape}")

layer: 13 shape: torch.Size([1, 26, 768])
