In [2]:
!pip3 install pytorch_pretrained_bert

Collecting pytorch_pretrained_bert
  Using cached https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl
Collecting torch>=0.4.1 (from pytorch_pretrained_bert)
  Using cached https://files.pythonhosted.org/packages/24/19/4804aea17cd136f1705a5e98a00618cb8f6ccc375ad8bfa437408e09d058/torch-1.4.0-cp36-cp36m-manylinux1_x86_64.whl
Collecting numpy (from pytorch_pretrained_bert)
  Using cached https://files.pythonhosted.org/packages/62/20/4d43e141b5bc426ba38274933ef8e76e85c7adea2c321ecf9ebf7421cedf/numpy-1.18.1-cp36-cp36m-manylinux1_x86_64.whl
Collecting requests (from pytorch_pretrained_bert)
  Using cached https://files.pythonhosted.org/packages/1a/70/1935c770cb3be6e3a8b78ced23d7e0f3b187f5cbfab4749523ed65d7c9b1/requests-2.23.0-py2.py3-none-any.whl
Collecting tqdm (from pytorch_pretrained_bert)
  Using cached https://files.pythonhosted.org/packages/47/55/fd9170ba08a1a64a18a7f8a18f088037316f2a41

In [3]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
import matplotlib.pyplot as plt
% matplotlib inline

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
# Example sentence, assembled from adjacent string literals; the word "fly"
# occurs three times in it.
text = (
    "I saw the fly on the newspaper and "
    "I grabbed the fly swatter before I was able to get the bug I saw it "
    "fly."
)

# Surround the sentence with the [CLS] and [SEP] marker tokens.
marked_text = "[CLS] " + text + " [SEP]"

# Split the marked sentence into tokens with the BERT tokenizer.
tokenized_text = tokenizer.tokenize(marked_text)

print(tokenized_text)

['[CLS]', 'i', 'saw', 'the', 'fly', 'on', 'the', 'newspaper', 'and', 'i', 'grabbed', 'the', 'fly', 'swat', '##ter', 'before', 'i', 'was', 'able', 'to', 'get', 'the', 'bug', 'i', 'saw', 'it', 'fly', '.', '[SEP]']


In [5]:
# Look up the vocabulary index of every token string.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Print each token beside its index: token left-aligned in 12 columns,
# index right-aligned in 6 columns with a thousands separator.
for tok, tok_id in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:>6,}'.format(tok, tok_id))

[CLS]           101
i             1,045
saw           2,387
the           1,996
fly           4,875
on            2,006
the           1,996
newspaper     3,780
and           1,998
i             1,045
grabbed       4,046
the           1,996
fly           4,875
swat         25,414
##ter         3,334
before        2,077
i             1,045
was           2,001
able          2,583
to            2,000
get           2,131
the           1,996
bug          11,829
i             1,045
saw           2,387
it            2,009
fly           4,875
.             1,012
[SEP]           102


In [6]:
# Single-sentence input: every token gets the same segment id, 1,
# so the list is as long as `tokenized_text`.
segments_ids = [1 for _ in tokenized_text]

print(segments_ids)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [7]:
# Turn the id lists into PyTorch tensors with a leading dimension of
# size 1 (a batch containing this one sentence).
tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)
segments_tensors = torch.tensor(segments_ids).unsqueeze(0)

# Fetch the pre-trained weights for 'bert-base-uncased'.
model = BertModel.from_pretrained('bert-base-uncased')

# Switch to evaluation (inference) mode. Left as the trailing expression so
# the notebook displays the module summary.
model.eval()


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

In [8]:
# Forward pass with gradient tracking switched off. The first return value
# holds the hidden states of every layer; the second one is discarded.
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, token_type_ids=segments_tensors)

In [9]:
# Probe each nesting level of `encoded_layers` using the first layer,
# first batch entry and first token.
layer_i = 0
batch_i = 0
token_i = 0

print("Number of layers:", len(encoded_layers))
print("Number of batches:", encoded_layers[layer_i].size(0))
print("Number of tokens:", encoded_layers[layer_i][batch_i].size(0))
print("Number of hidden units:",
      encoded_layers[layer_i][batch_i][token_i].size(0))

Number of layers: 12
Number of batches: 1
Number of tokens: 29
Number of hidden units: 768


In [10]:
# Join the per-layer tensors into one tensor. `stack` inserts a new
# leading "layers" dimension (its default is dim 0).
token_embeddings = torch.stack(encoded_layers)

token_embeddings.shape

torch.Size([12, 1, 29, 768])

In [11]:
# Drop dimension 1, the size-1 "batches" axis.
token_embeddings = token_embeddings.squeeze(dim=1)

token_embeddings.shape

torch.Size([12, 29, 768])

In [12]:
# Put "tokens" first: [layers, tokens, hidden] -> [tokens, layers, hidden].
#
# The tensor is rebuilt from `encoded_layers` instead of permuting the
# existing `token_embeddings` in place. A self-referential permute is not
# idempotent: running the cell a second time would swap the two axes back
# and silently hand the wrong layout to every later cell.
token_embeddings = (
    torch.stack(encoded_layers, dim=0)  # [layers, batch, tokens, hidden]
    .squeeze(dim=1)                     # drop the size-1 batch axis
    .permute(1, 0, 2)                   # swap "layers" and "tokens"
)

token_embeddings.size()

torch.Size([29, 12, 768])

In [13]:
# Build one vector per token by summing that token's hidden states from the
# last four layers.
#
# `token_embeddings` is a [29 x 12 x 768] tensor, so iterating over it yields
# one [12 x 768] tensor per token; summing its last four rows over dim 0
# leaves a 768-element vector. The result is a list of 29 such vectors.
token_vecs_sum = [layers[-4:].sum(dim=0) for layers in token_embeddings]

print('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))

Shape is: 29 x 768


In [14]:
# `encoded_layers` is indexed as [12 x 1 x 29 x 768]; picking layer index 11
# and batch entry 0 leaves the [29 x 768] matrix of token vectors.
token_vecs = encoded_layers[11][0]

# The sentence vector is the mean of the 29 token vectors (over dim 0).
sentence_embedding = token_vecs.mean(dim=0)

In [15]:
# Report the size of the averaged sentence vector.
print("Our final sentence embedding vector of shape:", sentence_embedding.shape)

Our final sentence embedding vector of shape: torch.Size([768])


In [16]:
# List every token with its position in `tokenized_text`.
for i, token_str in enumerate(tokenized_text):
    print('{} {}'.format(i, token_str))

0 [CLS]
1 i
2 saw
3 the
4 fly
5 on
6 the
7 newspaper
8 and
9 i
10 grabbed
11 the
12 fly
13 swat
14 ##ter
15 before
16 i
17 was
18 able
19 to
20 get
21 the
22 bug
23 i
24 saw
25 it
26 fly
27 .
28 [SEP]


In [17]:
# Show the first five components of the summed vector for each occurrence
# of "fly" (token positions 4, 12 and 26 in `tokenized_text`).
# The header names only "fly": no vector for "bug" is printed here, so the
# earlier wording that also promised "bug" was misleading.
print('First 5 vector values for each instance of "fly".')
print('')
print("the fly          ", token_vecs_sum[4][:5])
print("the fly swatter  ", token_vecs_sum[12][:5])
print("it fly           ", token_vecs_sum[26][:5])

First 5 vector values for each instance of "fly" and "bug".

the fly           tensor([2.5765, 0.4508, 0.1037, 0.9915, 5.4395])
the fly swatter   tensor([3.4694, 0.1672, 1.0029, 1.6941, 4.0429])
it fly            tensor([1.6850, 2.4804, 1.6526, 1.3139, 1.5256])


In [18]:
from scipy.spatial.distance import cosine

# scipy's `cosine` is a distance, so each similarity below is 1 minus it.
# All comparisons are made against the "fly" in "the fly" (position 4).
fly_reference = token_vecs_sum[4]

# "the fly" vs "fly swatter" (same meaning).
same_fly = 1 - cosine(fly_reference, token_vecs_sum[12])

# "the fly" vs "it fly" (different meanings).
diff_fly = 1 - cosine(fly_reference, token_vecs_sum[26])

print('Vector similarity for  *similar*  meanings:  %.2f' % same_fly)
print('Vector similarity for *different* meanings:  %.2f' % diff_fly)

Vector similarity for  *similar*  meanings:  0.73
Vector similarity for *different* meanings:  0.66
