## This notebook contains GloVe Embeddings Procedures on Pre-Trained and Custom Corpora and Tests Performed on GloVe to Determine How it Performs on Text Similarity Tasks

In [None]:
import random
import pickle

import numpy as np
import matplotlib.pyplot as plt
from operator import itemgetter

from scipy import spatial
from scipy.spatial.distance import cosine

from nltk.corpus import wordnet
from sklearn.manifold import TSNE
from randomwordgenerator import randomwordgenerator

### 1. Create Glove Embeddings Dictionaries to Call Word Vectors

#### 1a. GloVe Pre-trained Models in 50 and 300 Dimension Vectors

In [None]:
# Map every token of the pre-trained 50-dimensional GloVe file to its vector.
embeddings_dict_50d = {}
with open("glove.6B.50d.txt", 'r', encoding="utf-8") as glove_file:  # leave out encoding for older versions
    for row in glove_file:
        # A row is the token followed by its whitespace-separated coefficients.
        fields = row.split()
        embeddings_dict_50d[fields[0]] = np.asarray(fields[1:], dtype="float32")

In [None]:
# Map every token of the pre-trained 300-dimensional GloVe file to its vector.
embeddings_dict_300d = {}
with open("glove.6B.300d.txt", 'r', encoding="utf-8") as glove_file:
    for row in glove_file:
        # A row is the token followed by its whitespace-separated coefficients.
        fields = row.split()
        embeddings_dict_300d[fields[0]] = np.asarray(fields[1:], dtype="float32")

#### 1b. GloVe Custom-Trained Models in 50 and 300 Dimension Vectors

In [None]:
# Load the mission statements and write them to a text file, one document per line

filename = "missionStatements.pickle"

# NOTE: pickle.load can execute arbitrary code, so only load a pickle file that
# comes from a trusted source.
with open(filename, "rb") as handle:
    missionStatements = pickle.load(handle)

# The corpus file is opened once instead of once per statement, and it is written
# as UTF-8 to match the encoding used when the vector files are read in this notebook.
# NOTE: append mode is kept, so re-running this cell adds every statement to
# corpus.txt a second time; delete the file first when a fresh corpus is wanted.
with open("corpus.txt", "a", encoding="utf-8") as f:
    for v in missionStatements.values():
        # Element 3 of each record holds the text. Newlines are removed so that a
        # document stays on a single line.
        # NOTE(review): removing "\n" without inserting a space joins the words on
        # either side of a line break - confirm this is intended.
        text = v[3].replace("\n", "")
        f.write(text + "\n")

In [None]:
# Map every token of the custom-trained 50-dimensional vector file to its vector.
embeddings_dict_tuned = {}
with open("vectors_50d.txt", 'r', encoding="utf-8") as vectors_file:
    for row in vectors_file:
        # A row is the token followed by its whitespace-separated coefficients.
        fields = row.split()
        embeddings_dict_tuned[fields[0]] = np.asarray(fields[1:], dtype="float32")

In [None]:
# Map every token of the custom-trained 300-dimensional vector file to its vector.
embeddings_dict_tuned_300d = {}
with open("vectors_300d.txt", 'r', encoding="utf-8") as vectors_file:
    for row in vectors_file:
        # A row is the token followed by its whitespace-separated coefficients.
        fields = row.split()
        embeddings_dict_tuned_300d[fields[0]] = np.asarray(fields[1:], dtype="float32")

### 2. Embedding Section

#### 2a. Create embedding for each mission statement for later use in testing: Pre-Trained Model 

In [None]:
# Create a function to retrieve embedding vectors
def get_Text2Vec(text, embeddings=None):
    """Return the average word vector of the whitespace-separated words in *text*.

    Words that are missing from the embedding dictionary are ignored.

    Args:
        text: String to embed; it is tokenized with ``str.split()``.
        embeddings: Optional mapping from word to vector. When omitted, the
            module-level ``embeddings_dict_300d`` is used, so existing calls
            behave as before. Pass another dictionary to embed the text with
            a different model.

    Returns:
        The element-wise mean of the vectors of all known words, or ``None``
        when no word of *text* is in the dictionary.
    """
    # An explicit None check (rather than "or") lets callers pass an empty dictionary.
    if embeddings is None:
        embeddings = embeddings_dict_300d

    known_vectors = [embeddings[word] for word in text.split() if word in embeddings]
    if not known_vectors:
        return None

    # Sum the vectors one after another, in word order, then divide by their number.
    # "total + vector" builds a new array, so the dictionary entries are never modified.
    total = known_vectors[0]
    for vector in known_vectors[1:]:
        total = total + vector

    return total / len(known_vectors)

In [None]:
# Build a dictionary that maps each organization ID to the embedding of its
# mission statement (element 3 of the record), as computed by get_Text2Vec

text2vec_Embeddings = {
    orgID: get_Text2Vec(record[3])
    for orgID, record in missionStatements.items()
}

In [None]:
# Persist the embeddings dictionary as a pickle file so it can be reloaded later
with open("text2vec_Embeddings.pickle", "wb") as out_file:
    pickle.dump(
        text2vec_Embeddings,
        out_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

#### 2b. Create embedding for each mission statement for later use in testing: Tuned Model 

In [None]:
# Create a function to retrieve tuned embedding vectors
def get_Text2Vec_Tuned_300d_Combined(text, tuned_embeddings=None, pretrained_embeddings=None):
    """Return the average of the combined tuned and pre-trained word vectors of *text*.

    For every whitespace-separated word that occurs in BOTH dictionaries, the
    tuned vector and the pre-trained vector are added together. The result is
    the mean of these per-word sums. Words missing from either dictionary are
    ignored.

    Args:
        text: String to embed; it is tokenized with ``str.split()``.
        tuned_embeddings: Optional mapping from word to custom-trained vector.
            Defaults to the module-level ``embeddings_dict_tuned_300d``.
        pretrained_embeddings: Optional mapping from word to pre-trained
            vector. Defaults to the module-level ``embeddings_dict_300d``.

    Returns:
        The element-wise mean of the per-word sums, or ``None`` when no word of
        *text* occurs in both dictionaries.
    """
    # Explicit None checks (rather than "or") let callers pass empty dictionaries.
    if tuned_embeddings is None:
        tuned_embeddings = embeddings_dict_tuned_300d
    if pretrained_embeddings is None:
        pretrained_embeddings = embeddings_dict_300d

    # The two vectors of a word are summed; dividing each sum by 2 is a variation
    # that can be tested here.
    combined_vectors = [
        tuned_embeddings[word] + pretrained_embeddings[word]
        for word in text.split()
        if word in tuned_embeddings and word in pretrained_embeddings
    ]
    if not combined_vectors:
        return None

    # Add the per-word sums one after another, in word order, then divide by their number.
    total = combined_vectors[0]
    for vector in combined_vectors[1:]:
        total = total + vector

    return total / len(combined_vectors)

In [None]:
# Build a dictionary that maps each organization ID to the embedding of its
# mission statement (element 3 of the record). The embedding comes from
# get_Text2Vec_Tuned_300d_Combined, which combines the pre-trained model with
# the model trained on the custom corpus

text2vec_Embeddings_Tuned_300d_Combined = {
    orgID: get_Text2Vec_Tuned_300d_Combined(record[3])
    for orgID, record in missionStatements.items()
}

In [None]:
# Persist the combined embeddings dictionary as a pickle file so it can be reloaded later
with open("text2vec_Embeddings_Tuned_300d_Combined.pickle", "wb") as out_file:
    pickle.dump(
        text2vec_Embeddings_Tuned_300d_Combined,
        out_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

## 3. Testing Section: This section presents several tests. In order to run on different models, substitute in the appropriate dictionaries from Section 2. 

### Specifically, what the tests do is to take a mission statement from a charity, manipulate the mission statement, e.g. delete parts, add noise, etc., and see if the altered mission statement is successfully matched with the organization's original/unaltered mission statement

### 3.1.1. This is just a check to see that everything works. The mission statement is not altered. The unaltered statement is matched with the closest mission statement (which should be itself), so a 100% match rate should be attained.

In [None]:
# keep track of correct and incorrect matches
correct_count = 0
incorrect_count = 0

# loop through mission statements
for orgID, v in missionStatements.items():

    missionStatement = v[3]

    # generate embedding for mission statement
    t2v_missionStatement = get_Text2Vec(missionStatement)

    # get_Text2Vec returns None when none of the words has a vector. Passing None
    # to cosine() would raise, so such a statement is counted as an incorrect match,
    # and stored statements without an embedding are left out of the candidates.
    collect_Cosine_Values = []
    if t2v_missionStatement is not None:
        collect_Cosine_Values = [
            [1 - cosine(value, t2v_missionStatement), key]
            for key, value in text2vec_Embeddings.items()
            if value is not None
        ]

    # sort ascending so that the closest statement (highest cosine similarity) is last
    value_list_sorted = sorted(collect_Cosine_Values, key=itemgetter(0))
    if value_list_sorted and value_list_sorted[-1][1] == orgID:
        correct_count = correct_count + 1
    else:
        incorrect_count = incorrect_count + 1

# percentage of correct matches; an empty data set gives 0 instead of a division by zero
total_count = correct_count + incorrect_count
perc_correct = (correct_count / total_count) * 100 if total_count else 0.0
print(perc_correct)

### 3.1.2. This is the first manipulation. A fraction of the mission statement is selected (either 1/2, 1/5, or 1/10) and then attempted to match with the closest unaltered mission statement. If it is matched with its unaltered version, it's recorded as a correct match. 

In [None]:
# keep track of correct and incorrect matches
correct_count = 0
incorrect_count = 0

# share of the statement that is kept, e.g. 0.5, 0.2 or 0.1
kept_fraction = 0.2

# loop through mission statements
for orgID, v in missionStatements.items():

    missionStatement = v[3]

    # keep only the leading part of the statement; the length is counted in
    # characters, so the cut can fall in the middle of a word
    # (change the slice to test other parts, e.g. the second half)
    missionStatement_length = len(missionStatement)
    missionStatement = missionStatement[:int(kept_fraction * missionStatement_length)]

    # generate embedding for mission statement
    t2v_missionStatement = get_Text2Vec(missionStatement)

    # get_Text2Vec returns None when none of the words has a vector, which can
    # happen for a short prefix. Passing None to cosine() would raise, so such a
    # statement is counted as an incorrect match, and stored statements without an
    # embedding are left out of the candidates.
    collect_Cosine_Values = []
    if t2v_missionStatement is not None:
        collect_Cosine_Values = [
            [1 - cosine(value, t2v_missionStatement), key]
            for key, value in text2vec_Embeddings.items()
            if value is not None
        ]

    # sort ascending so that the closest statement (highest cosine similarity) is last
    value_list_sorted = sorted(collect_Cosine_Values, key=itemgetter(0))
    if value_list_sorted and value_list_sorted[-1][1] == orgID:
        correct_count = correct_count + 1
    else:
        incorrect_count = incorrect_count + 1

# percentage of correct matches; an empty data set gives 0 instead of a division by zero
total_count = correct_count + incorrect_count
perc_correct = (correct_count / total_count) * 100 if total_count else 0.0
print(perc_correct)

### 3.1.3. This test selects a random subset of words from the mission statement and only uses these words in its attempt to match with the correct un-altered mission statement. 

In [None]:
# keep track of correct and incorrect matches
correct_count = 0
incorrect_count = 0

# loop through mission statements
for orgID, v in missionStatements.items():

    missionStatement = v[3]

    missionStatementWords_asList = missionStatement.split()
    # get number of words in mission statement
    no_words = len(missionStatementWords_asList)
    half_no_words = int(no_words*0.5) # the new statement gets half as many words as the original

    # Draw the words at random WITH replacement, so a word can be picked several times.
    # For a version without repeated positions, random.sample(missionStatementWords_asList,
    # half_no_words) can be used instead.
    select_Words_atRandom = [
        random.choice(missionStatementWords_asList) for _ in range(half_no_words)
    ]

    missionStatement = " ".join(select_Words_atRandom)

    # generate embedding for mission statement
    t2v_missionStatement = get_Text2Vec(missionStatement)

    # get_Text2Vec returns None when none of the words has a vector; this always
    # happens for a one-word statement, because half of one word is zero words.
    # Passing None to cosine() would raise, so such a statement is counted as an
    # incorrect match, and stored statements without an embedding are left out of
    # the candidates.
    collect_Cosine_Values = []
    if t2v_missionStatement is not None:
        collect_Cosine_Values = [
            [1 - cosine(value, t2v_missionStatement), key]
            for key, value in text2vec_Embeddings.items()
            if value is not None
        ]

    # sort ascending so that the closest statement (highest cosine similarity) is last
    value_list_sorted = sorted(collect_Cosine_Values, key=itemgetter(0))
    if value_list_sorted and value_list_sorted[-1][1] == orgID:
        correct_count = correct_count + 1
    else:
        incorrect_count = incorrect_count + 1

# percentage of correct matches; an empty data set gives 0 instead of a division by zero
total_count = correct_count + incorrect_count
perc_correct = (correct_count / total_count) * 100 if total_count else 0.0
print(perc_correct)

### 3.2.1. This test attempts to measure how noise affects the algorithm's performance. A fraction of the original mission statement is replaced with random words (noise). 

In [None]:
# keep track of correct and incorrect matches
correct_count = 0
incorrect_count = 0

# loop through mission statements
for orgID, v in missionStatements.items():

    missionStatement = v[3]

    missionStatementWords_asList = missionStatement.split()
    no_words = len(missionStatementWords_asList)
    fraction_of_words = int(no_words * (1/4))

    # keep the leading quarter of the words plus one
    # NOTE(review): because of the "+ 1", one word more than a quarter is kept, so
    # the constructed statement can be longer than the original - confirm this is intended.
    firstPart = missionStatementWords_asList[:fraction_of_words + 1]
    # substitute fraction of mission statement with random words to approximate noise
    secondPart = randomwordgenerator.generate_random_words(n = int(no_words * (3/4)))

    # original words first, followed by the noise words
    missionStatement = " ".join(firstPart) + " " + " ".join(secondPart)

    # generate embedding for mission statement
    t2v_missionStatement = get_Text2Vec(missionStatement)

    # get_Text2Vec returns None when none of the words has a vector. Passing None
    # to cosine() would raise, so such a statement is counted as an incorrect match,
    # and stored statements without an embedding are left out of the candidates.
    collect_Cosine_Values = []
    if t2v_missionStatement is not None:
        collect_Cosine_Values = [
            [1 - cosine(value, t2v_missionStatement), key]
            for key, value in text2vec_Embeddings.items()
            if value is not None
        ]

    # sort ascending so that the closest statement (highest cosine similarity) is last
    value_list_sorted = sorted(collect_Cosine_Values, key=itemgetter(0))
    if value_list_sorted and value_list_sorted[-1][1] == orgID:
        correct_count = correct_count + 1
    else:
        incorrect_count = incorrect_count + 1

# percentage of correct matches; an empty data set gives 0 instead of a division by zero
total_count = correct_count + incorrect_count
perc_correct = (correct_count / total_count) * 100 if total_count else 0.0
print(perc_correct)

### 3.2.2. This test is the same as in 3.2.1., except that the words are shuffled, so that strings from the original mission statement are not retained.  

In [None]:
# keep track of correct and incorrect matches
correct_count = 0
incorrect_count = 0

# loop through mission statements
for orgID, v in missionStatements.items():

    missionStatement = v[3]

    missionStatementWords_asList = missionStatement.split()
    no_words = len(missionStatementWords_asList)
    fraction_of_words = int(no_words * (1/4))

    # keep the leading quarter of the words plus one
    # NOTE(review): because of the "+ 1", one word more than a quarter is kept, so
    # the constructed statement can be longer than the original - confirm this is intended.
    firstPart = missionStatementWords_asList[:fraction_of_words + 1]
    # substitute fraction of mission statement with random words to approximate noise
    secondPart = randomwordgenerator.generate_random_words(n = int(no_words * (3/4)))

    # shuffle the selected words so that no word sequence of the original is retained
    combinedParts = firstPart + secondPart
    random.shuffle(combinedParts)
    missionStatement = " ".join(combinedParts)

    # generate embedding for mission statement
    t2v_missionStatement = get_Text2Vec(missionStatement)

    # get_Text2Vec returns None when none of the words has a vector. Passing None
    # to cosine() would raise, so such a statement is counted as an incorrect match,
    # and stored statements without an embedding are left out of the candidates.
    collect_Cosine_Values = []
    if t2v_missionStatement is not None:
        collect_Cosine_Values = [
            [1 - cosine(value, t2v_missionStatement), key]
            for key, value in text2vec_Embeddings.items()
            if value is not None
        ]

    # sort ascending so that the closest statement (highest cosine similarity) is last
    value_list_sorted = sorted(collect_Cosine_Values, key=itemgetter(0))
    if value_list_sorted and value_list_sorted[-1][1] == orgID:
        correct_count = correct_count + 1
    else:
        incorrect_count = incorrect_count + 1

# percentage of correct matches; an empty data set gives 0 instead of a division by zero
total_count = correct_count + incorrect_count
perc_correct = (correct_count / total_count) * 100 if total_count else 0.0
print(perc_correct)

### 3.3.1. This test replaces the words in the original mission statement with synonyms. This test approximates how the algorithm handles cases where the meaning is retained but the actual words are different. 

In [None]:
# keep track of correct and incorrect matches
correct_count = 0
incorrect_count = 0

# loop through mission statements
for orgID, v in missionStatements.items():

    missionStatement = v[3]

    # procedures to replace words with synonyms start here
    missionStatementWords_asList = missionStatement.split()
    constructedMissionList = []

    for word in missionStatementWords_asList:

        # unique lemma names over all WordNet synsets of the word
        # NOTE(review): a set has no defined order, so the "first" synonym taken
        # below is an arbitrary one and can differ between runs; it can also be
        # the word itself.
        a = list({l.name() for syn in wordnet.synsets(word) for l in syn.lemmas()})

        # replace with first synonym, or with a blank when WordNet has no entry
        # (explicit check instead of a bare "except:" that would hide any error)
        constructedMissionList.append(a[0] if a else "")

    missionStatement = " ".join(constructedMissionList)

    # generate embedding for mission statement
    t2v_missionStatement = get_Text2Vec(missionStatement)

    # get_Text2Vec returns None when none of the words has a vector, e.g. when
    # every replacement is blank. Passing None to cosine() would raise, so such a
    # statement is counted as an incorrect match, and stored statements without an
    # embedding are left out of the candidates.
    collect_Cosine_Values = []
    if t2v_missionStatement is not None:
        collect_Cosine_Values = [
            [1 - cosine(value, t2v_missionStatement), key]
            for key, value in text2vec_Embeddings.items()
            if value is not None
        ]

    # sort ascending so that the closest statement (highest cosine similarity) is last
    value_list_sorted = sorted(collect_Cosine_Values, key=itemgetter(0))
    if value_list_sorted and value_list_sorted[-1][1] == orgID:
        correct_count = correct_count + 1
    else:
        incorrect_count = incorrect_count + 1

# percentage of correct matches; an empty data set gives 0 instead of a division by zero
total_count = correct_count + incorrect_count
perc_correct = (correct_count / total_count) * 100 if total_count else 0.0
print(perc_correct)

### 3.3.2. This is the same test as 3.3.1, except that no words that already exist in the mission statement are accepted. That is, if a synonym appears somewhere else in the mission statement, then this will not be considered an acceptable synonym. The procedure instead attempts to replace the word with the second or third synonym. The purpose is to attempt to retain the meaning of the mission statement without retaining any of the word tokens that were present in the original mission statement. 

In [None]:
# keep track of correct and incorrect matches
correct_count = 0
incorrect_count = 0

# loop through mission statements
for orgID, v in missionStatements.items():

    missionStatement = v[3]

    # procedures to replace words with synonyms start here
    missionStatementWords_asList = missionStatement.split()
    # set of the original words for fast membership tests; the word being
    # replaced is in it as well, so a synonym equal to that word is rejected too
    original_words = set(missionStatementWords_asList)
    constructedMissionList = []

    for word in missionStatementWords_asList:

        # unique lemma names over all WordNet synsets of the word
        # NOTE(review): a set has no defined order, so which synonyms count as the
        # "first three" is arbitrary and can differ between runs.
        a = list({l.name() for syn in wordnet.synsets(word) for l in syn.lemmas()})

        # Take the first of the first three suggestions that does not occur in the
        # original mission statement. If none of them qualifies, or if WordNet has
        # no suggestion, leave a blank replacement. Slicing handles fewer than three
        # suggestions, so no exception handling is needed.
        replacement = next(
            (candidate for candidate in a[:3] if candidate not in original_words),
            "",
        )
        constructedMissionList.append(replacement)

    missionStatement = " ".join(constructedMissionList)

    # generate embedding for mission statement
    t2v_missionStatement = get_Text2Vec(missionStatement)

    # get_Text2Vec returns None when none of the words has a vector, e.g. when
    # every replacement is blank. Passing None to cosine() would raise, so such a
    # statement is counted as an incorrect match, and stored statements without an
    # embedding are left out of the candidates.
    collect_Cosine_Values = []
    if t2v_missionStatement is not None:
        collect_Cosine_Values = [
            [1 - cosine(value, t2v_missionStatement), key]
            for key, value in text2vec_Embeddings.items()
            if value is not None
        ]

    # sort ascending so that the closest statement (highest cosine similarity) is last
    value_list_sorted = sorted(collect_Cosine_Values, key=itemgetter(0))
    if value_list_sorted and value_list_sorted[-1][1] == orgID:
        correct_count = correct_count + 1
    else:
        incorrect_count = incorrect_count + 1

# percentage of correct matches; an empty data set gives 0 instead of a division by zero
total_count = correct_count + incorrect_count
perc_correct = (correct_count / total_count) * 100 if total_count else 0.0
print(perc_correct)