## This notebook contains BERT Embedding Procedures on Pre-Trained and Custom Corpora and Tests Performed on BERT to Determine How It Performs on Text Similarity Tasks

In [None]:
import torch
from transformers import BertTokenizer, BertModel

In [None]:
import pickle
from operator import itemgetter
from scipy.spatial.distance import cosine

In [None]:
# Load the WordPiece tokenizer that matches the pre-trained, lower-cased
# BERT checkpoint ('bert-base-uncased') used for the embedding model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Alternative (disabled): the case-sensitive checkpoint 'bert-base-cased'.

In [None]:
# Load the pre-trained BERT encoder. `output_hidden_states=True` makes the
# forward pass return the hidden states of every layer, which
# getBertEmbedding reads to build sentence embeddings.
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode: inference only (feed-forward), with
# training-only behaviour such as dropout switched off.
model.eval()

# Fine tune model
from simpletransformers.language_modeling import LanguageModelingModel
import logging

# Append every mission statement to text.txt with each sentence terminated
# by a [SEP] marker. The file is opened exactly once and closed by the
# context manager, so no handle is leaked and an empty `missionStatements`
# no longer fails on an undefined file object.
# NOTE(review): mode "a" means re-running this cell duplicates the corpus
# in text.txt — confirm whether "w" was intended.
with open("text.txt", "a", encoding="utf-8") as f:
    for k, v in missionStatements.items():
        # v[3] holds the mission statement text; mark sentence boundaries.
        sent = v[3].replace(".", " [SEP]")
        # Make sure the statement ends with a separator even when the
        # original text had no trailing period.
        if sent[-5:] != "[SEP]":
            sent = sent + "[SEP]"
        f.write(sent)


logging.basicConfig(level=logging.INFO)
# Keep the transformers library quiet apart from warnings and errors.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
}

# The fine-tuning wrapper is bound to its own name. Rebinding `model` here
# would replace the BertModel that getBertEmbedding calls as `model(...)`,
# and a LanguageModelingModel cannot be called that way.
# NOTE(review): this fine-tunes 'bert-base-cased' while the tokenizer and
# embedding model above are 'bert-base-uncased' — confirm this is intended.
lm_model = LanguageModelingModel('bert', 'bert-base-cased', args=train_args, use_cuda=False)
# NOTE(review): this cell only writes text.txt; train.txt and test.txt must
# already exist. Confirm which file is meant to be the training corpus.
lm_model.train_model("train.txt", eval_file="text.txt")
lm_model.eval_model("test.txt")

In [None]:
# Function to get Bert Word Embeddings

def getBertEmbedding(text1, max_tokens=512):
    """Return a single BERT sentence embedding for ``text1``.

    The whole text is treated as one sentence: it is wrapped in
    ``[CLS] ... [SEP]``, tokenized with the module-level ``tokenizer`` and
    run through the module-level ``model``. The embedding is the mean of
    the token vectors taken from the second-to-last hidden layer.

    Args:
        text1: The text to embed.
        max_tokens: Maximum number of tokens (including ``[CLS]`` and
            ``[SEP]``) fed to the model. Longer inputs are truncated and
            re-terminated with ``[SEP]``. The default of 512 is the
            position limit of the BERT base checkpoints; without
            truncation, longer inputs make the forward pass fail.

    Returns:
        A 1-D ``torch.Tensor`` with one value per hidden unit.
    """
    # BERT expects text input in this format; the entire statement is
    # handled as a single sentence.
    marked_text1 = "[CLS] " + text1 + " [SEP]"

    # Split the sentence into tokens.
    tokenized_text1 = tokenizer.tokenize(marked_text1)

    # Truncate over-long inputs, keeping the closing [SEP] token.
    if len(tokenized_text1) > max_tokens:
        tokenized_text1 = tokenized_text1[:max_tokens - 1] + ["[SEP]"]

    # Map the token strings to their vocabulary indices.
    indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)

    # Convert inputs to PyTorch tensors (batch of size one).
    tokens_tensor1 = torch.tensor([indexed_tokens1])

    # The original code passed an all-ones tensor as the second positional
    # argument. In `transformers` that position is `attention_mask`, not
    # `token_type_ids`, so it acted as "attend to every token". It is passed
    # by keyword here to make that explicit; the numerical result is the same.
    attention_mask1 = torch.tensor([[1] * len(tokenized_text1)])

    # Run the text through BERT without tracking gradients and collect the
    # hidden states of all layers.
    with torch.no_grad():
        outputs1 = model(tokens_tensor1, attention_mask=attention_mask1)

        # Because the model was loaded with `output_hidden_states = True`,
        # the third item holds the hidden states from all layers. See:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        hidden_states1 = outputs1[2]

    # hidden_states1 is indexed [layer][batch][token][hidden unit].
    # Take the second-to-last layer of the only batch item, which gives
    # one vector per token.
    token_vecs1 = hidden_states1[-2][0]

    # Average the token vectors into one sentence vector.
    sentence_embedding1 = torch.mean(token_vecs1, dim=0)

    return sentence_embedding1

### 1. Create BERT Embeddings Dictionaries for Mission Statements

In [None]:
# Map every organization ID to the BERT sentence embedding of its mission
# statement (stored at index 3 of each record).
bert_embeddings = {
    org_key: getBertEmbedding(record[3])
    for org_key, record in missionStatements.items()
}

## 2. Testing Section: This section presents several tests. In order to run on different models, substitute in the appropriate dictionaries from Section 1. 

### Specifically, what the tests do is to take a mission statement from a charity, manipulate the mission statement, e.g. delete parts, add noise, etc., and see if the altered mission statement is successfully matched with the organization's original/unaltered mission statement

In [None]:
import os
import random
from nltk.corpus import wordnet
from randomwordgenerator import randomwordgenerator

### 2.1.1. This is just a check to see that everything works. The mission statement is not altered. The unaltered statement is matched with the closest mission statement (which should be itself), so a one hundred percent match rate should be attained.

In [None]:
# Sanity check: embed each unaltered mission statement and verify that its
# nearest neighbour (by cosine similarity) among the stored embeddings is
# the organization it came from.
correct_count = 0
incorrect_count = 0


for k, v in missionStatements.items():

    orgID = k
    statement = v[3]

    # generate BERT embedding of input mission statement
    bert_statement = getBertEmbedding(statement)

    # Cosine similarity (1 - cosine distance) against every stored embedding
    collect_Cosine = [
        [1 - cosine(value, bert_statement), key]
        for key, value in bert_embeddings.items()
    ]

    # Ascending sort: the best match is the last entry
    value_list_sorted = sorted(collect_Cosine, key=itemgetter(0))

    if value_list_sorted[-1][1] == orgID:
        correct_count += 1
    else:
        incorrect_count += 1

# Percentage of statements matched back to their own organization
perc_correct = (correct_count / (correct_count + incorrect_count)) * 100
print(perc_correct)

### 2.1.2. This is the first manipulation. A fraction of the mission statement is selected (either 1/2, 1/5, or 1/10) and then attempted to match with the closest unaltered mission statement. If it is matched with its unaltered version, it's recorded as a correct match. 

In [None]:
# Truncation test: keep only the leading fraction of each mission statement
# and check whether it is still matched to its own organization.
correct_count = 0
incorrect_count = 0


for k, v in missionStatements.items():

    orgID = k
    statement = v[3]

    # Keep the first half of the statement, measured in characters. The
    # slice must be applied to the statement text itself; the original code
    # sliced the previous embedding, so the test never altered its input.
    lenStatement = len(statement)
    statement = statement[:int(0.5 * lenStatement)]

    # generate BERT embedding of the truncated mission statement
    bert_statement = getBertEmbedding(statement)

    # Cosine similarity (1 - cosine distance) against every stored embedding
    collect_Cosine = [
        [1 - cosine(value, bert_statement), key]
        for key, value in bert_embeddings.items()
    ]

    # Ascending sort: the best match is the last entry
    value_list_sorted = sorted(collect_Cosine, key=itemgetter(0))

    if value_list_sorted[-1][1] == orgID:
        correct_count += 1
    else:
        incorrect_count += 1

# Percentage of statements matched back to their own organization
perc_correct = (correct_count / (correct_count + incorrect_count)) * 100
print(perc_correct)

### 2.1.3. This test selects a random subset of words from the mission statement and only uses these words in its attempt to match with the correct un-altered mission statement. 

In [None]:
# Random-subset test: rebuild each mission statement from words drawn at
# random (with replacement) from the original, using half as many words,
# and check whether it is still matched to its own organization.
correct_count = 0
incorrect_count = 0


for k, v in missionStatements.items():

    orgID = k
    statement = v[3]

    # convert mission statement into list of words
    missionStatementWords_asList = statement.split()

    # number of words to draw: half the length of the original statement
    no_words = len(missionStatementWords_asList)
    half_no_words = int(no_words * 0.5)

    # Non-unique version: the same word may be drawn more than once.
    # half_no_words is 0 for statements with fewer than two words, so
    # random.choice is never called on an empty list.
    select_Words_atRandom = [
        random.choice(missionStatementWords_asList)
        for _ in range(half_no_words)
    ]

    constructed_missionStatement = " ".join(select_Words_atRandom)
    statement = constructed_missionStatement

    # generate BERT embedding of the constructed mission statement
    bert_statement = getBertEmbedding(statement)

    # Cosine similarity (1 - cosine distance) against every stored embedding
    collect_Cosine = [
        [1 - cosine(value, bert_statement), key]
        for key, value in bert_embeddings.items()
    ]

    # Ascending sort: the best match is the last entry
    value_list_sorted = sorted(collect_Cosine, key=itemgetter(0))

    if value_list_sorted[-1][1] == orgID:
        correct_count += 1
    else:
        incorrect_count += 1

# Percentage of statements matched back to their own organization
perc_correct = (correct_count / (correct_count + incorrect_count)) * 100
print(perc_correct)

### 2.2.1. This test attempts to measure how noise affects the algorithm's performance. A fraction of the original mission statement is replaced with random words (noise). 

In [None]:
# Noise test: keep the leading part of each mission statement, replace the
# rest with randomly generated words, and check whether the result is still
# matched to its own organization.
correct_count = 0
incorrect_count = 0


for k, v in missionStatements.items():

    orgID = k
    statement = v[3]

    # substitute fraction of mission statement with random words approximating noise
    missionStatementWords_asList = statement.split()

    no_words = len(missionStatementWords_asList)
    fraction_of_words = int(no_words * (1/2))

    # NOTE(review): the slice keeps fraction_of_words + 1 original words, so
    # the constructed statement is one word longer than the original split
    # would suggest. Confirm whether the "+ 1" is intended.
    firstPart = missionStatementWords_asList[:fraction_of_words + 1]
    secondPart = randomwordgenerator.generate_random_words(n = fraction_of_words)

    # Default Version: original words first, noise words after
    constructed_missionStatement1 = " ".join(firstPart)
    constructed_missionStatement2 = " ".join(secondPart)
    missionStatement = constructed_missionStatement1 + " " + constructed_missionStatement2
    statement = missionStatement

    # generate BERT embedding of the noisy mission statement
    bert_statement = getBertEmbedding(statement)

    # Cosine similarity (1 - cosine distance) against every stored embedding
    collect_Cosine = [
        [1 - cosine(value, bert_statement), key]
        for key, value in bert_embeddings.items()
    ]

    # Ascending sort: the best match is the last entry
    value_list_sorted = sorted(collect_Cosine, key=itemgetter(0))

    if value_list_sorted[-1][1] == orgID:
        correct_count += 1
    else:
        incorrect_count += 1

# Percentage of statements matched back to their own organization
perc_correct = (correct_count / (correct_count + incorrect_count)) * 100
print(perc_correct)

### 2.2.2. This test is the same as in 2.2.1., except that the words are shuffled, so that strings from the original mission statement are not retained.  

In [None]:
# Shuffled-noise test: mix the leading part of each mission statement with
# randomly generated words, shuffle the combined word list so no original
# word order survives, and check whether the result is still matched to
# its own organization.
correct_count = 0
incorrect_count = 0


for k, v in missionStatements.items():

    orgID = k
    statement = v[3]

    # substitute fraction of mission statement with random words approximating noise
    missionStatementWords_asList = statement.split()

    no_words = len(missionStatementWords_asList)
    fraction_of_words = int(no_words * (1/2))

    # NOTE(review): the slice keeps fraction_of_words + 1 original words.
    # Confirm whether the "+ 1" is intended.
    firstPart = missionStatementWords_asList[:fraction_of_words + 1]
    secondPart = randomwordgenerator.generate_random_words(n = fraction_of_words)

    # Version to shuffle the selected words
    combinedParts = firstPart + secondPart
    random.shuffle(combinedParts)
    missionStatement = " ".join(combinedParts)
    statement = missionStatement

    # generate BERT embedding of the shuffled, noisy mission statement
    bert_statement = getBertEmbedding(statement)

    # Cosine similarity (1 - cosine distance) against every stored embedding
    collect_Cosine = [
        [1 - cosine(value, bert_statement), key]
        for key, value in bert_embeddings.items()
    ]

    # Ascending sort: the best match is the last entry
    value_list_sorted = sorted(collect_Cosine, key=itemgetter(0))

    if value_list_sorted[-1][1] == orgID:
        correct_count += 1
    else:
        incorrect_count += 1

# Percentage of statements matched back to their own organization
perc_correct = (correct_count / (correct_count + incorrect_count)) * 100
print(perc_correct)

### 2.3.1. This test replaces the words in the original mission statement with synonyms. This test approximates how the algorithm handles cases where the meaning is retained but the actual words are different. 

In [None]:
# Synonym test: replace every word of each mission statement with a WordNet
# lemma of that word and check whether the rewritten statement is still
# matched to its own organization.
correct_count = 0
incorrect_count = 0


for k, v in missionStatements.items():

    orgID = k
    statement = v[3]

    # generate synonyms: start of block
    missionStatementWords_asList = statement.split()
    constructedMissionList = []

    for word in missionStatementWords_asList:

        # Collect the distinct lemma names over all synsets of the word.
        # NOTE(review): a set has no defined order, so the "first" synonym
        # below is an arbitrary one and may differ between runs.
        a = list({
            lemma.name()
            for syn in wordnet.synsets(word)
            for lemma in syn.lemmas()
        })

        # Replace the word with the first synonym, or with an empty string
        # when WordNet has no entry for it. (The stricter variant that
        # rejects synonyms already present in the statement is test 2.3.2.)
        if a:
            constructedMissionList.append(a[0])
        else:
            constructedMissionList.append("")

    constructed_missionStatement = " ".join(constructedMissionList)
    statement = constructed_missionStatement

    # generate BERT embedding of the rewritten mission statement
    bert_statement = getBertEmbedding(statement)

    # Cosine similarity (1 - cosine distance) against every stored embedding
    collect_Cosine = [
        [1 - cosine(value, bert_statement), key]
        for key, value in bert_embeddings.items()
    ]

    # Ascending sort: the best match is the last entry
    value_list_sorted = sorted(collect_Cosine, key=itemgetter(0))

    if value_list_sorted[-1][1] == orgID:
        correct_count += 1
    else:
        incorrect_count += 1

# Percentage of statements matched back to their own organization
perc_correct = (correct_count / (correct_count + incorrect_count)) * 100
print(perc_correct)

### 2.3.2. This is the same test as 2.3.1, except that no words that already exist in the mission statement are accepted. That is, if a synonym appears somewhere else in the mission statement, then this will not be considered an acceptable synonym. The procedure instead attempts to replace the word with the second or third synonym. The purpose is to attempt to retain the meaning of the mission statement without retaining any of the word tokens that were present in the original mission statement. 

In [None]:
# Strict synonym test: replace every word of each mission statement with a
# WordNet lemma that does not occur anywhere in the original statement, and
# check whether the rewritten statement is still matched to its own
# organization.
correct_count = 0
incorrect_count = 0


for k, v in missionStatements.items():

    orgID = k
    statement = v[3]

    # generate synonyms: start of block
    missionStatementWords_asList = statement.split()
    # Set of the original words for constant-time membership tests
    original_words = set(missionStatementWords_asList)
    constructedMissionList = []

    for word in missionStatementWords_asList:

        # Collect the distinct lemma names over all synsets of the word.
        # NOTE(review): a set has no defined order, so which candidates are
        # the "first three" is arbitrary and may differ between runs.
        a = list({
            lemma.name()
            for syn in wordnet.synsets(word)
            for lemma in syn.lemmas()
        })

        # Try the first three candidates in order and take the first one
        # that does not already occur in the mission statement. This also
        # rules out the word itself, since it is part of the statement.
        # If none qualifies, or there are no candidates, leave the
        # replacement blank.
        replacement = next(
            (candidate for candidate in a[:3] if candidate not in original_words),
            "",
        )
        constructedMissionList.append(replacement)

    constructed_missionStatement = " ".join(constructedMissionList)
    statement = constructed_missionStatement

    # generate BERT embedding of the rewritten mission statement
    bert_statement = getBertEmbedding(statement)

    # Cosine similarity (1 - cosine distance) against every stored embedding
    collect_Cosine = [
        [1 - cosine(value, bert_statement), key]
        for key, value in bert_embeddings.items()
    ]

    # Ascending sort: the best match is the last entry
    value_list_sorted = sorted(collect_Cosine, key=itemgetter(0))

    if value_list_sorted[-1][1] == orgID:
        correct_count += 1
    else:
        incorrect_count += 1

# Percentage of statements matched back to their own organization
perc_correct = (correct_count / (correct_count + incorrect_count)) * 100
print(perc_correct)