In [None]:
# Notebook demonstrating approaches to generating word embeddings and sentence embeddings with BERT pretrained models.
# Token embeddings with BERT are generated based on context, so the embeddings are unique to the sentence in which
# the word occurs. These models should also be fine-tuned on domain sentences if possible in order to more
# accurately model words that weren't frequent in the general corpus. Code for the first section is from:
# mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#33-creating-word-and-sentence-vectors-from-hidden-states

import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
import logging
import matplotlib.pyplot as plt

# Example input sentences.
text1 = "white embryo and seedling (albino), lethal hypocotyl"
text2 = "Increased abundance of miRNA precursors."
text3 = "incomplete penetrance; increased aluminum resistance; accumulates lower levels of Al in the root tips."

# Tokenizer matching the 'bert-base-uncased' pretrained model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap the sentence in the [CLS] / [SEP] markers, then split it into tokens and map those to vocabulary ids.
marked_text = " ".join(("[CLS]", text1, "[SEP]"))
tokenized_text = tokenizer.tokenize(marked_text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# One segment id per token: all 1's for a single sentence; a sentence pair would use 0's for the
# first sentence followed by 1's for the second.
segments_ids = [1 for _ in tokenized_text]

for sequence in (tokenized_text, indexed_tokens, segments_ids):
    print(sequence)

In [None]:
# Turn the token-id and segment-id lists into tensors with a leading batch axis of size one.
tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)
segments_tensor = torch.tensor(segments_ids).unsqueeze(0)

In [None]:
# Load the pretrained BERT model and put it in evaluation mode.
# `model.eval()` is the last expression of the cell, so the notebook also displays its return
# value, which serves as the description of the loaded model.
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

In [None]:
# Run the sentence through the model once. Gradients are not tracked because nothing is being trained.
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, segments_tensor)

# encoded_layers is indexed as [Layer][Batch][Token][Hidden Unit]; read the sizes off the first layer.
n_batches, n_tokens, n_hidden_units = encoded_layers[0].shape
print ("Number of layers:", len(encoded_layers))
print ("Number of batches:", n_batches)
print ("Number of tokens:", n_tokens)
print ("Number of hidden units:", n_hidden_units)

In [None]:
# Stack the per-layer tensors into a single tensor laid out as [Layer, Batch, Token, Hidden Unit],
# then reorder the axes to [Batch, Token, Layer, Hidden Unit] so each token's layers sit together.
stacked_layers = torch.stack(encoded_layers, dim=0)
print(stacked_layers.size())
token_embeddings = stacked_layers.permute(1, 2, 0, 3)
print(token_embeddings.size())

In [None]:
# Which layers to turn into an embedding is a modeling choice that depends on the task. Two variants
# are built here from the last four layers of every token: one concatenates them, one sums them.
batch = 0
sentence_tokens = token_embeddings[batch]

# Concatenation order is last layer first, matching (-1, -2, -3, -4).
token_vecs_cat = [
    torch.cat([token[layer] for layer in (-1, -2, -3, -4)], dim=0)
    for token in sentence_tokens
]
token_vecs_sum = [token[-4:].sum(dim=0) for token in sentence_tokens]

# token_vecs_cat is [Tokens, Length of 4 Layers] and token_vecs_sum is [Tokens, Length of 1 Layer]
# for this one input sentence.
for vectors_per_token in (token_vecs_cat, token_vecs_sum):
    print ('Shape is: %d x %d' % (len(vectors_per_token), len(vectors_per_token[0])))

In [None]:
# Represent the whole sentence with one vector: take a single layer's hidden states and
# average them over all tokens.
batch = 0
layer_to_use = 10  # zero-based index into encoded_layers
token_vectors = encoded_layers[layer_to_use][batch]
sentence_embedding = token_vectors.mean(dim=0)
print(sentence_embedding.size())

In [None]:
# Combining the previous steps into a single function for generating sentence embeddings.
def embed(text, model, bert_tokenizer=None):
    """Return one sentence embedding for `text` as a 1-D NumPy array.

    The text is wrapped in "[CLS]" / "[SEP]" markers, tokenized, and sent through
    `model` in its own forward pass (without gradient tracking). For every token the
    hidden states of the last four returned layers are summed, and those per-token
    vectors are then averaged over all tokens, the two markers included.

    Args:
        text: The sentence to embed.
        model: Callable invoked as `model(tokens_tensor, segments_tensor)` that returns
            a pair whose first item is a sequence of per-layer tensors laid out as
            [Batch, Token, Hidden Unit].
        bert_tokenizer: Optional object providing `tokenize` and
            `convert_tokens_to_ids`. When omitted, the notebook-level `tokenizer`
            is used, so existing two-argument calls keep working.

    Returns:
        numpy.ndarray of shape [Hidden Unit].
    """
    active_tokenizer = tokenizer if bert_tokenizer is None else bert_tokenizer

    marked_text = "{} {} {}".format("[CLS]", text, "[SEP]")
    tokenized_text = active_tokenizer.tokenize(marked_text)
    indexed_tokens = active_tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensor = torch.tensor([segments_ids])

    # Encode THIS text. Reusing hidden states computed in an earlier cell would give
    # every input the same embedding.
    with torch.no_grad():
        layer_outputs, _ = model(tokens_tensor, segments_tensor)

    batch = 0
    # [4, Batch, Token, Hidden] -> sum over layers -> pick the single batch entry -> [Token, Hidden].
    summed_last_four = torch.stack(list(layer_outputs[-4:]), dim=0).sum(dim=0)[batch]
    sentence_vector = summed_last_four.mean(dim=0)
    return sentence_vector.detach().cpu().numpy()
        
# Embed the same sentence 1000 times to get a feel for how long the function takes.
n_repeats = 1000
texts = [text1 for _ in range(n_repeats)]
vectors = []
for text in texts:
    vectors.append(embed(text, model))