In [2]:
import csv
import transformers
import numpy as np
from scipy.spatial.distance import cosine
import torch
from transformers import BertTokenizer, BertModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
#% matplotlib inline

# Load the pre-trained 'bert-base-uncased' tokenizer (vocabulary).
# This module-level `tokenizer` is referenced directly by the BertEncoding methods
# defined in a later cell, so this cell must be run before that class is used.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


  from .autonotebook import tqdm as notebook_tqdm


In [73]:
class BertEncoding():
	#https://colab.research.google.com/drive/1yFphU6PW9Uo6lmDly_ud9a6c4RCYlwdX#scrollTo=Zn0n2S-FWZih
	"""
	Scores sentence pairs by the cosine similarity of their BERT sentence embeddings.

	Reads a CSV of format:
		word1 word2, word3 word4, label
	Where word1&2 make sentence 1 and 3&4 make sentence2.
	Only the first two columns are read by this class; the label column is loaded but not used.

	Tokenization relies on the module-level `tokenizer` object (a BertTokenizer).
	"""
	def __init__(self, csv_dir):
		"""
		Load the sentence pairs and the pre-trained BERT model.

		csv_dir: path of the CSV file to read.
		"""
		self.data = self.listOfRowsFromSCV(csv_dir)
		# output_hidden_states=True makes the model return the hidden states of all layers.
		# .eval() puts the model in "evaluation" mode, meaning feed-forward operation.
		self.model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True).eval()

	def listOfRowsFromSCV(self, csv_dir):
		"""
		Return the non-empty rows of the CSV file at `csv_dir` as a list of lists of strings.
		"""
		# newline="" is what the csv module asks for: without it, line breaks embedded
		# in quoted fields are translated before the reader sees them.
		with open(csv_dir, "r", newline="") as csvfile:
			# csv.reader yields [] for a blank line; such rows have no sentence columns
			# and would raise IndexError in evaluateSentPair, so they are dropped here.
			return [row for row in csv.reader(csvfile, delimiter=",") if row]

	def getSentEmbedding(self, sent):
		"""
		Return the list of BERT tokens for `sent`, wrapped in [CLS] ... [SEP].

		NOTE: despite the name, this returns tokens, not an embedding. The name is kept
		so existing callers keep working.
		"""
		return tokenizer.tokenize("[CLS] " + sent + " [SEP]")

	def preprocessSentence(self, sent, printing=None):
		"""
		Turn a raw sentence into the two tensors fed to the model.

		sent: the sentence text (without special tokens).
		printing: when truthy, print each token with its vocabulary index and the all-ones vector.

		Returns (tokens_tensor, segments_tensors), both of shape [1 x n_tokens]:
			tokens_tensor: vocabulary indices of the tokens, including [CLS] and [SEP]
			segments_tensors: a 1 for every token
		"""
		# Add the special tokens, then split the sentence into tokens.
		tokenized_text = tokenizer.tokenize("[CLS] " + sent + " [SEP]")
		# Map the token strings to their vocabulary indices.
		indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
		# One 1 per token. hiddenLayersBERT hands this vector to the model as the
		# attention mask, i.e. every token is attended to (there is no padding).
		segments_ids = [1] * len(tokenized_text)

		if printing:
			# Display the words with their indices.
			for token, index in zip(tokenized_text, indexed_tokens):
				print('{:<12} {:>6,}'.format(token, index))
			print(segments_ids)

		# Convert inputs to PyTorch tensors; the extra brackets add the batch dimension.
		return torch.tensor([indexed_tokens]), torch.tensor([segments_ids])

	def hiddenLayersBERT(self, sentences):
		"""
		Run each preprocessed sentence through BERT and collect the hidden states of all layers.

		sentences: sequence of (tokens_tensor, segments_tensors) pairs as returned by preprocessSentence.

		Returns a list with one entry per sentence; each entry is the third item of the
		model output, which holds the hidden states because the model was loaded with
		`output_hidden_states = True`. See the documentation for more details:
		https://huggingface.co/transformers/model_doc/bert.html#bertmodel
		"""
		hidden_states = []
		# Inference only, so gradient tracking is switched off for the whole loop.
		with torch.no_grad():
			for sentence in sentences:
				# The second positional parameter of BertModel.forward is `attention_mask`,
				# so the original positional call already used the all-ones vector as a mask.
				# Keywords make that explicit without changing the result.
				outputs = self.model(input_ids=sentence[0], attention_mask=sentence[1])
				hidden_states.append(outputs[2])
		return hidden_states

	def GetTokenVecSum(self, hidden_state):
		"""
		Build token-level and sentence-level vectors from one sentence's hidden states.

		hidden_state: sequence of per-layer tensors, each indexed [batch x tokens x features]
			with a batch of size 1 (for bert-base with hidden states enabled this is
			13 tensors of shape [1 x n_tokens x 768]: the embedding output plus 12 layers).

		Returns (token_vecs_sum, sentence_embedding):
			token_vecs_sum: list with one vector per token, the sum of that token's
				vectors over the last four layers
			sentence_embedding: the mean over all tokens of the second-to-last layer
		"""
		# stack   -> [layers x 1 x tokens x features]  (new leading dimension)
		# squeeze -> [layers x tokens x features]      (drop the batch dimension)
		# permute -> [tokens x layers x features]      (swap dimensions 0 and 1)
		token_embeddings = torch.squeeze(torch.stack(hidden_state, dim=0), dim=1).permute(1,0,2)

		# Each `token` is a [layers x features] tensor; sum its last four layers.
		token_vecs_sum = [torch.sum(token[-4:], dim=0) for token in token_embeddings]

		# hidden_state[-2][0] is the [tokens x features] output of the second-to-last layer;
		# averaging over the token dimension gives a single vector for the sentence.
		sentence_embedding = torch.mean(hidden_state[-2][0], dim=0)

		return token_vecs_sum, sentence_embedding

	def GetEmbeddingsForBothSentences(self, sentences):
		"""
		Apply GetTokenVecSum to every entry of `sentences`.

		sentences: list of per-sentence hidden states as returned by hiddenLayersBERT.

		Returns (token_embedding_list, sentence_embedding_list), each with one entry per sentence.
		"""
		token_embedding_list = []
		sentence_embedding_list = []
		for hidden_state in sentences:
			# Compute once and unpack; the two results come from the same call.
			token_vecs, sentence_vec = self.GetTokenVecSum(hidden_state)
			token_embedding_list.append(token_vecs)
			sentence_embedding_list.append(sentence_vec)
		return token_embedding_list, sentence_embedding_list

	def CosineSimilarity(self, sent0, sent1):
		"""
		Return the cosine similarity of two vectors.

		scipy's `cosine` is a distance, so the similarity is one minus that value.
		"""
		return 1-cosine(sent0, sent1)

	def evaluateSentPair(self, row_number):
		"""
		Return the cosine similarity between the sentence embeddings of the two
		sentences stored in columns 0 and 1 of row `row_number` of the loaded CSV.
		"""
		row = self.data[row_number]
		sents_to_evaluate = [self.preprocessSentence(row[0]), self.preprocessSentence(row[1])]

		hidden = self.hiddenLayersBERT(sents_to_evaluate)
		# Index 1 of the result holds the sentence-level embeddings (index 0 is per-token).
		sentence_embeddings = self.GetEmbeddingsForBothSentences(hidden)[1]

		return self.CosineSimilarity(sentence_embeddings[0], sentence_embeddings[1])

	def evaluateAllSentPairs(self):
		"""
		Return a list with the similarity score of every row of the loaded CSV, in row order.
		"""
		return [self.evaluateSentPair(i) for i in range(len(self.data))]

In [75]:
# Score every row of the loaded CSV: the result is one cosine similarity per sentence pair.
# NOTE(review): `cls` is not defined in any cell shown here (In [74] is missing) —
# presumably `cls = BertEncoding(<path to csv>)`; confirm and restore that cell so the
# notebook survives Restart Kernel -> Run All.
cls.evaluateAllSentPairs()

[0.9686732292175293,
 0.7988609075546265,
 0.8739149570465088,
 0.8251831531524658,
 0.813016951084137,
 0.8406080007553101,
 0.8638482093811035,
 0.9042112827301025,
 0.8447734713554382,
 0.6192495226860046,
 0.6420484781265259,
 0.6192495226860046,
 0.5932442545890808,
 0.7548890709877014,
 0.8230564594268799,
 0.7330701947212219,
 0.817246675491333,
 0.7026510238647461,
 0.743051290512085,
 0.8083339333534241,
 0.7476651668548584,
 0.8074460625648499,
 0.7038283944129944,
 0.7743031978607178,
 0.7347809672355652,
 0.6991939544677734,
 0.6020883917808533,
 0.7560901641845703,
 0.6456645131111145,
 0.8126022219657898,
 0.6224542856216431,
 0.609408974647522,
 0.6590306758880615,
 0.7127388119697571,
 0.6719609498977661,
 0.653028130531311,
 0.6624095439910889]