In [1]:
import torch
from transformers import BertTokenizer, BertModel
import logging
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## Define the tokenizer used to split the input into tokens.
## Words that are not in the vocabulary are split into subwords, all the way down to
## individual characters if necessary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
## Input sentence. It is informal text: abbreviations such as "tht" are not in the
## vocabulary, so the tokenizer splits them into subword pieces ('th', '##t').
text = "Im cool with my teacher so i was gonna ask for tht persons name but im too shy to do tht too "

In [4]:
## Wrap the sentence in BERT's special tokens: [CLS] goes in front, [SEP] at the end.
markedUpText = "".join(("[CLS]", text, "[SEP]"))

## Split the marked-up string into tokens and show the result.
tokenized_text = tokenizer.tokenize(markedUpText)
print(tokenized_text)

['[CLS]', 'im', 'cool', 'with', 'my', 'teacher', 'so', 'i', 'was', 'gonna', 'ask', 'for', 'th', '##t', 'persons', 'name', 'but', 'im', 'too', 'shy', 'to', 'do', 'th', '##t', 'too', '[SEP]']


In [5]:
## Exploring the vocabulary of the BERT tokenizer (index -> entry)
##
## [0]          - [PAD]
## [100]        - [UNK]
## [101]        - [CLS]
## [102]        - [SEP]
## [103]        - [MASK]   (double-check with tokenizer.mask_token_id)
## [999:1996]   - single characters, starting with ! ... ~
## [1996:29612] - words, plus frequent word-piece suffixes (e.g. '##t' has id 2102)
## [29612:]     - '##'-prefixed single-character / symbol pieces

## Iterating the vocabulary mapping yields its keys (the token strings) in index order.
list(tokenizer.vocab)[29612:]

['##!',
 '##"',
 '###',
 '##$',
 '##%',
 '##&',
 "##'",
 '##(',
 '##)',
 '##*',
 '##+',
 '##,',
 '##-',
 '##.',
 '##/',
 '##:',
 '##;',
 '##<',
 '##=',
 '##>',
 '##?',
 '##@',
 '##[',
 '##\\',
 '##]',
 '##^',
 '##_',
 '##`',
 '##{',
 '##|',
 '##}',
 '##~',
 '##¡',
 '##¢',
 '##£',
 '##¤',
 '##¥',
 '##¦',
 '##§',
 '##¨',
 '##©',
 '##ª',
 '##«',
 '##¬',
 '##®',
 '##±',
 '##´',
 '##µ',
 '##¶',
 '##·',
 '##º',
 '##»',
 '##¼',
 '##¾',
 '##¿',
 '##æ',
 '##ð',
 '##÷',
 '##þ',
 '##đ',
 '##ħ',
 '##ŋ',
 '##œ',
 '##ƒ',
 '##ɐ',
 '##ɑ',
 '##ɒ',
 '##ɔ',
 '##ɕ',
 '##ə',
 '##ɡ',
 '##ɣ',
 '##ɨ',
 '##ɪ',
 '##ɫ',
 '##ɬ',
 '##ɯ',
 '##ɲ',
 '##ɴ',
 '##ɹ',
 '##ɾ',
 '##ʀ',
 '##ʁ',
 '##ʂ',
 '##ʃ',
 '##ʉ',
 '##ʊ',
 '##ʋ',
 '##ʌ',
 '##ʎ',
 '##ʐ',
 '##ʑ',
 '##ʒ',
 '##ʔ',
 '##ʰ',
 '##ʲ',
 '##ʳ',
 '##ʷ',
 '##ʸ',
 '##ʻ',
 '##ʼ',
 '##ʾ',
 '##ʿ',
 '##ˈ',
 '##ˡ',
 '##ˢ',
 '##ˣ',
 '##ˤ',
 '##β',
 '##γ',
 '##δ',
 '##ε',
 '##ζ',
 '##θ',
 '##κ',
 '##λ',
 '##μ',
 '##ξ',
 '##ο',
 '##π',
 '##ρ',
 '##σ',
 '##τ',
 '##υ',
 '##φ',

In [7]:
## Map each token to its index in the tokenizer vocabulary.
token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)

## Print the (token, id) pairs side by side. The loop variable is called `token_id`
## rather than `id` so that the built-in id() is not shadowed in the kernel namespace.
for token, token_id in zip(tokenized_text, token_ids):
    print(token, token_id)

[CLS] 101
im 10047
cool 4658
with 2007
my 2026
teacher 3836
so 2061
i 1045
was 2001
gonna 6069
ask 3198
for 2005
th 16215
##t 2102
persons 5381
name 2171
but 2021
im 10047
too 2205
shy 11004
to 2000
do 2079
th 16215
##t 2102
too 2205
[SEP] 102


In [13]:
## Segment (sentence) ids: one entry per token, all set to 1 because the input is a
## single sentence.
## NOTE(review): Hugging Face tokenizers label a lone sentence with segment id 0 and
## use 1 for a second sentence -- confirm that 1 is intended here.
sentence_ids = [1 for _ in token_ids]

## Print token, vocabulary id and segment id, one token per line.
for token, token_id, sentence_id in zip(tokenized_text, token_ids, sentence_ids):
    print(token, token_id, sentence_id)

[CLS] 101 1
im 10047 1
cool 4658 1
with 2007 1
my 2026 1
teacher 3836 1
so 2061 1
i 1045 1
was 2001 1
gonna 6069 1
ask 3198 1
for 2005 1
th 16215 1
##t 2102 1
persons 5381 1
name 2171 1
but 2021 1
im 10047 1
too 2205 1
shy 11004 1
to 2000 1
do 2079 1
th 16215 1
##t 2102 1
too 2205 1
[SEP] 102 1


In [15]:
## Turn the token-id and segment-id lists into tensors. unsqueeze(0) adds a leading
## dimension of size 1, giving each tensor the shape (1, number_of_tokens).
tokens_tensor = torch.tensor(token_ids).unsqueeze(0)
segment_tensor = torch.tensor(sentence_ids).unsqueeze(0)

## Show both tensors, one after the other.
print(tokens_tensor, segment_tensor, sep="\n")

tensor([[  101, 10047,  4658,  2007,  2026,  3836,  2061,  1045,  2001,  6069,
          3198,  2005, 16215,  2102,  5381,  2171,  2021, 10047,  2205, 11004,
          2000,  2079, 16215,  2102,  2205,   102]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]])


In [17]:
## Load the pretrained BERT model. output_hidden_states=True asks the model to return
## the hidden states of every layer in addition to the final one.
model = BertModel.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=True,
)

## Switch to evaluation mode (turns off dropout). Kept as the last expression of the
## cell so the notebook displays the module structure.
model.eval()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [40]:
## Use BERT to evaluate our input.

## [POINT] torch.no_grad() tells PyTorch not to build the computation graph during the
## forward pass. The graph is only needed for backprop; since we just read the encoder's
## hidden states, we can skip it.

## [POINT] For BertModel the output holds [0] last_hidden_state, [1] pooler_output and,
## because the model was loaded with output_hidden_states=True, [2] hidden_states
## (a tuple with one tensor per layer; per the Transformers docs the first entry is the
## embedding output).
with torch.no_grad():
    ## Arguments are passed by keyword because the positional order of BertModel.forward
    ## is (input_ids, attention_mask, token_type_ids, ...): a second positional argument
    ## is the attention mask, NOT the segment ids. segment_tensor is all ones, so as a
    ## mask it lets every token attend to every other token; token_type_ids is left at
    ## its default.
    ## NOTE(review): to feed the segment ids to the model instead, pass
    ## token_type_ids=segment_tensor -- be aware that this changes the printed values.
    outputs = model(input_ids=tokens_tensor, attention_mask=segment_tensor)

    ## Access the per-layer hidden states by name rather than by the magic index 2.
    hidden_states = outputs.hidden_states

    for layer_id, layer in enumerate(hidden_states):
        print(f"layer: {layer_id} shape: {layer.shape}")

        print(f"\t length of a sentence is {len(layer[0])}")

        ## The batch holds a single sentence, so take entry 0 of the batch dimension.
        sentence_hidden_layer = layer[0]

        ## Show the first three hidden units of every token in this layer.
        for word, output in zip(tokenized_text, sentence_hidden_layer):
            print(f"\t\t{word}: {output[:3]}")

layer: 0 shape: torch.Size([1, 26, 768])
	 length of a sentence is 26
		[CLS]: tensor([ 0.1686, -0.2858, -0.3261])
		im: tensor([-0.1992, -0.2818, -0.4229])
		cool: tensor([0.2039, 0.0116, 0.3376])
		with: tensor([-0.5715,  0.1363,  0.2100])
		my: tensor([ 0.4374,  0.5032, -0.5967])
		teacher: tensor([-0.1058, -0.3168, -0.1371])
		so: tensor([-0.1433,  0.1276,  0.0729])
		i: tensor([-0.1160,  0.2462, -0.0892])
		was: tensor([-0.2475, -0.7300,  0.2477])
		gonna: tensor([ 0.8068,  0.0414, -0.0809])
		ask: tensor([-1.9051, -1.2284, -0.0197])
		for: tensor([-0.1977,  0.5124, -0.7430])
		th: tensor([ 0.0146, -0.9605,  1.1715])
		##t: tensor([-1.3675,  0.3061,  0.9769])
		persons: tensor([-0.8862,  0.7206, -0.3123])
		name: tensor([ 0.2807,  0.8009, -0.3979])
		but: tensor([-0.0458,  0.7050,  0.1048])
		im: tensor([-0.4673, -0.1840, -0.1914])
		too: tensor([-0.5517,  0.5391, -0.3514])
		shy: tensor([-1.0193, -0.0746, -0.0489])
		to: tensor([ 0.5639,  0.5797, -0.2717])
		do: tensor([ 0.4234, 

: 