In [1]:
!pip install transformers



In [2]:
import torch
from transformers import BertTokenizer, BertModel

import logging
# WordPiece tokenizer (and its vocabulary) matching the uncased base BERT
# checkpoint that is loaded as the model further down.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [19]:
# The two sentences whose embeddings are compared later in the notebook.
text1 = 'cup on table'
text2 = 'blue cup on desk'


def prepare_bert_input(text):
    """Turn a raw sentence into the pieces needed to feed BERT.

    Args:
        text: The sentence to encode.

    Returns:
        A 4-tuple ``(marked, tokens, token_ids, segment_ids)``:
        the sentence wrapped in the ``[CLS]`` / ``[SEP]`` markers, its
        tokens, the vocabulary indices of those tokens, and a list of
        zeros (one per token) marking everything as a single segment.
    """
    marked = "[CLS] " + text + " [SEP]"
    # Split the marked sentence with the BERT tokenizer.
    tokens = tokenizer.tokenize(marked)
    # Map the token strings to their vocabulary indices.
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    # Single-sentence input: every token belongs to segment 0.
    segment_ids = [0] * len(tokens)
    return marked, tokens, token_ids, segment_ids


marked_text, tokenized_text, indexed_tokens, segments_ids = prepare_bert_input(text1)
marked_text2, tokenized_text2, indexed_tokens2, segments_ids2 = prepare_bert_input(text2)

In [20]:
# Wrap each id list in a leading batch dimension of size 1 so that the
# resulting PyTorch tensors have shape [1, num_tokens].
tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)
tokens_tensor2 = torch.tensor(indexed_tokens2).unsqueeze(0)

segments_tensors = torch.tensor(segments_ids).unsqueeze(0)
segments_tensors2 = torch.tensor(segments_ids2).unsqueeze(0)

In [21]:
# Instantiate the pre-trained BERT weights and ask the model to return the
# hidden states of every layer, not only the final one.
model = BertModel.from_pretrained(
    "bert-base-uncased",
    output_hidden_states=True,
)
# Put the model in evaluation mode; as the last expression of the cell this
# also displays the module structure.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [22]:
# Run both sentences through BERT. Gradients are not needed for feature
# extraction, so autograd tracking is switched off.
with torch.no_grad():

    # The segment ids are passed by keyword: in `transformers` the second
    # positional parameter of `BertModel.forward` is `attention_mask`, not
    # `token_type_ids`, so a positional call would treat the all-zero segment
    # ids as an attention mask that hides every token.
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    # With `output_hidden_states=True` the third output holds the hidden
    # states of every layer.
    hidden_states = outputs[2]

    outputs2 = model(tokens_tensor2, token_type_ids=segments_tensors2)
    hidden_states2 = outputs2[2]

In [23]:
# Combine the per-layer hidden states into one tensor per sentence, with the
# layers along a new leading axis.
token_embeddings, token_embeddings2 = (
    torch.stack(layers, dim=0) for layers in (hidden_states, hidden_states2)
)

# Display the resulting shape for the first sentence.
token_embeddings.size()

torch.Size([13, 1, 5, 768])

In [24]:
# Remove the batch axis (dimension 1); it has size 1 because each sentence is
# processed on its own.
token_embeddings = token_embeddings.squeeze(dim=1)
token_embeddings2 = token_embeddings2.squeeze(dim=1)

# Display the resulting shape for the first sentence.
token_embeddings.size()

torch.Size([13, 5, 768])

In [26]:
# Reorder the axes from [layers, tokens, features] to
# [tokens, layers, features].
#
# The tensors are rebuilt from `hidden_states` / `hidden_states2` instead of
# re-assigning a permuted view of `token_embeddings` to itself: that form
# swaps the axes back again every second time the cell is executed, which
# silently changes what the following cells iterate over.
token_embeddings = torch.stack(hidden_states, dim=0).squeeze(dim=1).permute(1, 0, 2)
token_embeddings2 = torch.stack(hidden_states2, dim=0).squeeze(dim=1).permute(1, 0, 2)

# Display the resulting shape for the first sentence.
token_embeddings.size()

torch.Size([5, 13, 768])

In [27]:
def sum_last_layers(embeddings, n_layers=4):
    """Build one vector per token by summing its last `n_layers` layer vectors.

    Args:
        embeddings: Tensor that is iterated along its first axis; each item is
            expected to be a [num_layers x hidden_size] tensor for one token
            (i.e. the input is [num_tokens x num_layers x hidden_size]).
        n_layers: How many of the final layers to add together. Must be at
            least 1.

    Returns:
        A list with one [hidden_size] tensor per token.

    Raises:
        ValueError: If `n_layers` is smaller than 1 (a slice of `[-0:]` would
            silently sum over *all* layers).
    """
    if n_layers < 1:
        raise ValueError("n_layers must be at least 1")
    # `token[-n_layers:]` keeps the last layers; summing over dim 0 collapses
    # them into a single vector for that token.
    return [torch.sum(token[-n_layers:], dim=0) for token in embeddings]


# `token_embeddings` / `token_embeddings2` are expected to be
# [num_tokens x num_layers x 768] tensors here (tokens first, as produced by
# the "swap dimensions" step; 13 layers and 5 tokens for the first sentence).
# Each result is a list holding one 768-dimensional vector per token.
token_vecs_sum = sum_last_layers(token_embeddings)
token_vecs_sum2 = sum_last_layers(token_embeddings2)

In [16]:
# Sanity check: the summed vector of the first token should be one-dimensional.
token_vecs_sum[0].size()

torch.Size([768])

In [28]:
# Take the second-to-last layer of each sentence and drop the batch axis,
# leaving one 768-dimensional vector per token ([num_tokens x 768]).
token_vecs = hidden_states[-2][0]
token_vecs2 = hidden_states2[-2][0]

# A sentence embedding is the average of its token vectors.
sentence_embedding1 = token_vecs.mean(dim=0)
sentence_embedding2 = token_vecs2.mean(dim=0)

In [18]:
# Sanity check on the first sentence embedding. The notebook only defines
# `sentence_embedding1` and `sentence_embedding2`, so the un-suffixed name
# would raise a NameError on a fresh kernel.
sentence_embedding1.shape

torch.Size([768])

In [31]:
from scipy.spatial.distance import cosine

# SciPy's `cosine` returns a *distance*; subtracting it from 1 turns it into
# the cosine similarity of the two sentence embeddings.
cosine_distance = cosine(sentence_embedding1, sentence_embedding2)
diff_trip = 1 - cosine_distance
print(diff_trip)

0.8773030042648315
