## This notebook contains Word2Vec Embedding Procedures on Pre-Trained and Custom Corpora and Tests Performed on Word2Vec to Determine How It Performs on Text Similarity Tasks

In [None]:
import os
import pickle

import pandas as pd

In [None]:
# Location of the pickled mission-statement dictionary
filename = "missionStatements.pickle"

# Read the pickle in binary mode; the context manager closes the file afterwards.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open(filename, mode="rb") as pickle_file:
    missionStatements = pickle.load(pickle_file)

In [None]:
# One [orgID, missionStatement] row per dictionary entry; the statement text is
# the element at index 3 of each value.
df_format = [[org_id, fields[3]] for org_id, fields in missionStatements.items()]

df = pd.DataFrame(df_format, columns = ['orgID', 'missionStatement'])

### Text Preprocessing

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [None]:
def text_preprocessing(text):
    """Lowercase ``text``, drop English stop words and strip punctuation.

    Parameters
    ----------
    text : str
        Raw text, e.g. a mission statement.

    Returns
    -------
    str
        The remaining lowercase word tokens joined by single spaces.
    """
    stops = set(stopwords.words("english"))
    tokenizer = RegexpTokenizer(r'\w+')

    # Lowercase BEFORE filtering: the stop-word list is lowercase, so a
    # capitalised stop word ("The", "And") would otherwise slip through the
    # filter and only be lowercased afterwards.
    words = [w for w in text.lower().split() if w not in stops]

    # remove punctuation: keep only runs of word characters
    tokens = tokenizer.tokenize(" ".join(words))

    return " ".join(tokens)

## 1. Train Word2Vec Model and Create Dictionary for Word Embeddings

In [None]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [None]:
# Training corpus: one list of preprocessed word tokens per mission statement
corpus = [text_preprocessing(statement).split() for statement in df['missionStatement']]

In [None]:
EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin.gz'
# Stand-alone copy of the pretrained vectors.
# NOTE(review): no other cell visible here reads google_word2vec - confirm it is still needed.
google_word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

# Training the corpus with Google Pretrained Model
# `workers` must be a positive thread count: with workers = -1 gensim starts no
# worker threads, so train() returns without updating a single vector.
google_model = Word2Vec(size = 300, window = 5, min_count = 2, workers = os.cpu_count() or 1)
google_model.build_vocab(corpus)

# Overwrite the initial vectors of vocabulary words that also occur in the
# pretrained file; lockf=1.0 leaves those vectors trainable.
google_model.intersect_word2vec_format(EMBEDDING_FILE, lockf=1.0, binary=True)
google_model.train(corpus, total_examples = google_model.corpus_count, epochs = 5)

In [None]:
# Create function to retrieve Word2Vec Embeddings

def getWord2Vec_Embedding(text):
    """Return the average Word2Vec vector of the in-vocabulary words of ``text``.

    ``text`` is split on whitespace and every word that is not a key of
    ``google_model.wv.vocab`` is ignored. Words are looked up exactly as
    written; no lowercasing or punctuation stripping happens here.

    Parameters
    ----------
    text : str
        Text to embed.

    Returns
    -------
    numpy.ndarray or None
        Element-wise mean of the vectors of the matching words, or ``None``
        when no word of ``text`` is in the vocabulary.
    """
    # Index through `.wv`; indexing the model object itself is deprecated.
    keyed_vectors = google_model.wv
    vectors = [keyed_vectors[word] for word in text.split()
               if word in keyed_vectors.vocab]

    if not vectors:
        return None

    # Add the vectors one after another, starting from the first; sum() builds
    # new arrays, so the vectors stored in the model are not modified.
    return sum(vectors[1:], vectors[0]) / len(vectors)

In [None]:
# Create a dictionary for word embeddings

# orgID -> embedding of the mission statement exactly as stored
word2Vec_Embeddings = {}
# orgID -> embedding of the preprocessed (lowercased, stop words and punctuation
# removed) mission statement; this is the dictionary the tests in Section 2 read
word2Vec_Embeddings_cleanedStatements = {}

for orgID, missionStatement in zip(df['orgID'], df['missionStatement']):

    # A value is None when no word of the statement is in the model vocabulary
    word2Vec_Embeddings[orgID] = getWord2Vec_Embedding(missionStatement)
    word2Vec_Embeddings_cleanedStatements[orgID] = getWord2Vec_Embedding(
        text_preprocessing(missionStatement))

## 2. Testing Section: This section presents several tests. In order to run on different models, substitute in the appropriate dictionaries from Section 1. 

### Specifically, what the tests do is to take a mission statement from a charity, manipulate the mission statement, e.g. delete parts, add noise, etc., and see if the altered mission statement is successfully matched with the organization's original/unaltered mission statement

In [None]:
import random
from operator import itemgetter
from nltk.corpus import wordnet
from scipy.spatial.distance import cosine
from randomwordgenerator import randomwordgenerator

### 2.1.1. This is just a check to see that everything works. The mission statement is not altered. The unaltered statement is matched with the closest mission statement (it should be itself)  - hundred percent match should be attained.

In [None]:
# keep track of in/correct matches
correct_count = 0
incorrect_count = 0

# loop over mission statements
for k, v in missionStatements.items():

    missionStatement = v[3]
    orgID = k

    # generate w2v word embedding for the (here unaltered) mission statement
    w2v_missionStatement = getWord2Vec_Embedding(missionStatement)

    # getWord2Vec_Embedding returns None when no word of the statement is in the
    # model vocabulary. Such a statement cannot be matched, so record a miss
    # instead of letting cosine() fail on None.
    if w2v_missionStatement is None:
        incorrect_count += 1
        continue

    collect_Cosine_Values = []

    for key, value in word2Vec_Embeddings_cleanedStatements.items():

        # reference statements without an embedding cannot be compared
        if value is None:
            continue

        cosim = 1 - cosine(value, w2v_missionStatement)
        collect_Cosine_Values.append([cosim, key])

    # ascending by similarity, so the closest statement is the last element
    value_list_sorted = sorted(collect_Cosine_Values, key=itemgetter(0))

    if value_list_sorted and value_list_sorted[-1][1] == orgID:
        correct_count += 1
    else:
        incorrect_count += 1

# share of statements matched to their own organisation, in percent
percent = correct_count/(correct_count + incorrect_count) *100
print(percent)

### 2.1.2. This is the first manipulation. A fraction of the mission statement is selected (either 1/2, 1/5, or 1/10) and an attempt is made to match it with the closest unaltered mission statement. If it is matched with its unaltered version, it's recorded as a correct match. 

In [None]:
# keep track of in/correct matches
correct_count = 0
incorrect_count = 0

# loop over mission statements
for k, v in missionStatements.items():

    missionStatement = v[3]
    orgID = k

    # vary length of statement e.g. first half, first tenth, second half, etc
    # The cut is made by character count, so the last word kept may be truncated.
    missionStatement_length = len(missionStatement)
    missionStatement = missionStatement[:int(0.5*missionStatement_length)]

    # generate w2v word embedding for altered mission statement
    w2v_missionStatement = getWord2Vec_Embedding(missionStatement)

    # getWord2Vec_Embedding returns None when no word of the shortened statement
    # is in the model vocabulary. Such a statement cannot be matched, so record
    # a miss instead of letting cosine() fail on None.
    if w2v_missionStatement is None:
        incorrect_count += 1
        continue

    collect_Cosine_Values = []

    for key, value in word2Vec_Embeddings_cleanedStatements.items():

        # reference statements without an embedding cannot be compared
        if value is None:
            continue

        cosim = 1 - cosine(value, w2v_missionStatement)
        collect_Cosine_Values.append([cosim, key])

    # ascending by similarity, so the closest statement is the last element
    value_list_sorted = sorted(collect_Cosine_Values, key=itemgetter(0))

    if value_list_sorted and value_list_sorted[-1][1] == orgID:
        correct_count += 1
    else:
        incorrect_count += 1

# share of statements matched to their own organisation, in percent
percent = correct_count/(correct_count + incorrect_count) *100
print(percent)

### 2.1.3. This test selects a random subset of words from the mission statement and only uses these words in its attempt to match with the correct un-altered mission statement. 

In [None]:
# keep track of in/correct matches
correct_count = 0
incorrect_count = 0

# loop over mission statements
for k, v in missionStatements.items():

    missionStatement = v[3]
    orgID = k

    # select random subset of words from mission statement
    # convert mission statement into list of words
    missionStatementWords_asList = missionStatement.split()

    # get number of words in mission statement
    no_words = len(missionStatementWords_asList)
    half_no_words = int(no_words*0.5)

    # list to populate with random words from mission statement
    select_Words_atRandom = []

    while len(select_Words_atRandom) < half_no_words:

        randno = random.randint(0, no_words-1)
        word = missionStatementWords_asList[randno]

        # Non-Unique Version: the same position may be drawn more than once
        select_Words_atRandom.append(word)

        # Unique Version: this version prevents duplicates to be in new statement
        #if word not in select_Words_atRandom:
        #    select_Words_atRandom.append(word)
        #else:
        #    pass

    constructed_missionStatement = " ".join(select_Words_atRandom)
    missionStatement = constructed_missionStatement

    # generate w2v word embedding for altered mission statement
    w2v_missionStatement = getWord2Vec_Embedding(missionStatement)

    # getWord2Vec_Embedding returns None when none of the drawn words is in the
    # model vocabulary (always the case for statements of fewer than two words,
    # where nothing is drawn). Record a miss instead of letting cosine() fail.
    if w2v_missionStatement is None:
        incorrect_count += 1
        continue

    collect_Cosine_Values = []

    for key, value in word2Vec_Embeddings_cleanedStatements.items():

        # reference statements without an embedding cannot be compared
        if value is None:
            continue

        cosim = 1 - cosine(value, w2v_missionStatement)
        collect_Cosine_Values.append([cosim, key])

    # ascending by similarity, so the closest statement is the last element
    value_list_sorted = sorted(collect_Cosine_Values, key=itemgetter(0))

    if value_list_sorted and value_list_sorted[-1][1] == orgID:
        correct_count += 1
    else:
        incorrect_count += 1

# share of statements matched to their own organisation, in percent
percent = correct_count/(correct_count + incorrect_count) *100
print(percent)

### 2.2.1. This test attempts to measure how noise affects the algorithm's performance. A fraction of the original mission statement is replaced with random words (noise). 

In [None]:
# keep track of in/correct matches
correct_count = 0
incorrect_count = 0

# loop over mission statements
for k, v in missionStatements.items():

    missionStatement = v[3]
    orgID = k

    # Substitute fraction of mission statement with random words approximating noise
    missionStatementWords_asList = missionStatement.split()

    no_words = len(missionStatementWords_asList)
    # number of words that are replaced by noise
    fraction_of_words = int(no_words * (1/2))

    # Keep exactly the words that are not replaced, so the altered statement is
    # as long as the original. (Slicing to fraction_of_words + 1 kept one word
    # too many whenever no_words was even.)
    firstPart = missionStatementWords_asList[:no_words - fraction_of_words]

    # only call the generator when there is something to generate
    if fraction_of_words > 0:
        secondPart = randomwordgenerator.generate_random_words(n = fraction_of_words)
        # NOTE(review): guards against the generator handing back a bare string
        # for a single word - confirm against the randomwordgenerator package.
        if isinstance(secondPart, str):
            secondPart = [secondPart]
    else:
        secondPart = []

    constructed_missionStatement1 = " ".join(firstPart)
    constructed_missionStatement2 = " ".join(secondPart)
    missionStatement = constructed_missionStatement1 + " " + constructed_missionStatement2

    # generate w2v word embedding for altered mission statement
    w2v_missionStatement = getWord2Vec_Embedding(missionStatement)

    # getWord2Vec_Embedding returns None when no word of the altered statement
    # is in the model vocabulary. Such a statement cannot be matched, so record
    # a miss instead of letting cosine() fail on None.
    if w2v_missionStatement is None:
        incorrect_count += 1
        continue

    collect_Cosine_Values = []

    for key, value in word2Vec_Embeddings_cleanedStatements.items():

        # reference statements without an embedding cannot be compared
        if value is None:
            continue

        cosim = 1 - cosine(value, w2v_missionStatement)
        collect_Cosine_Values.append([cosim, key])

    # ascending by similarity, so the closest statement is the last element
    value_list_sorted = sorted(collect_Cosine_Values, key=itemgetter(0))

    if value_list_sorted and value_list_sorted[-1][1] == orgID:
        correct_count += 1
    else:
        incorrect_count += 1

# share of statements matched to their own organisation, in percent
percent = correct_count/(correct_count + incorrect_count) *100
print(percent)

### 2.2.2. This test is the same as in 2.2.1., except that the words are shuffled, so that strings from the original mission statement are not retained.  

In [None]:
# keep track of in/correct matches
correct_count = 0
incorrect_count = 0

# loop over mission statements
for k, v in missionStatements.items():

    missionStatement = v[3]
    orgID = k

    # Substitute fraction of mission statement with random words approximating noise
    missionStatementWords_asList = missionStatement.split()

    no_words = len(missionStatementWords_asList)
    # number of words that are replaced by noise
    fraction_of_words = int(no_words * (1/2))

    # Keep exactly the words that are not replaced, so the altered statement is
    # as long as the original. (Slicing to fraction_of_words + 1 kept one word
    # too many whenever no_words was even.)
    firstPart = missionStatementWords_asList[:no_words - fraction_of_words]

    # only call the generator when there is something to generate
    if fraction_of_words > 0:
        secondPart = randomwordgenerator.generate_random_words(n = fraction_of_words)
        # NOTE(review): guards against the generator handing back a bare string
        # for a single word - confirm against the randomwordgenerator package.
        if isinstance(secondPart, str):
            secondPart = [secondPart]
    else:
        secondPart = []

    # Version to shuffle the selected words
    combinedParts = firstPart + secondPart
    random.shuffle(combinedParts)
    missionStatement = " ".join(combinedParts)

    # generate w2v word embedding for altered mission statement
    w2v_missionStatement = getWord2Vec_Embedding(missionStatement)

    # getWord2Vec_Embedding returns None when no word of the altered statement
    # is in the model vocabulary. Such a statement cannot be matched, so record
    # a miss instead of letting cosine() fail on None.
    if w2v_missionStatement is None:
        incorrect_count += 1
        continue

    collect_Cosine_Values = []

    for key, value in word2Vec_Embeddings_cleanedStatements.items():

        # reference statements without an embedding cannot be compared
        if value is None:
            continue

        cosim = 1 - cosine(value, w2v_missionStatement)
        collect_Cosine_Values.append([cosim, key])

    # ascending by similarity, so the closest statement is the last element
    value_list_sorted = sorted(collect_Cosine_Values, key=itemgetter(0))

    if value_list_sorted and value_list_sorted[-1][1] == orgID:
        correct_count += 1
    else:
        incorrect_count += 1

# share of statements matched to their own organisation, in percent
percent = correct_count/(correct_count + incorrect_count) *100
print(percent)

### 2.3.1. This test replaces the words in the original mission statement with synonyms. This test approximates how the algorithm handles cases where the meaning is retained but the actual words are different. 

In [None]:
# keep track of in/correct matches
correct_count = 0
incorrect_count = 0

# loop over mission statements
for k, v in missionStatements.items():

    missionStatement = v[3]
    orgID = k

    # block to generate synonyms
    missionStatementWords_asList = missionStatement.split()
    constructedMissionList = []

    for word in missionStatementWords_asList:

        synonyms = []

        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonyms.append(lemma.name())

        # De-duplicate while keeping the order in which WordNet returned the
        # lemmas. Going through set() would make "the first synonym" arbitrary
        # and different from one kernel session to the next.
        a = list(dict.fromkeys(synonyms))

        # replace with first synonym in wordnet; blank when WordNet has none
        constructedMissionList.append(a[0] if a else "")

    constructed_missionStatement = " ".join(constructedMissionList)
    missionStatement = constructed_missionStatement

    # generate w2v word embedding for altered mission statement
    w2v_missionStatement = getWord2Vec_Embedding(missionStatement)

    # getWord2Vec_Embedding returns None when no replacement word is in the
    # model vocabulary. Such a statement cannot be matched, so record a miss
    # instead of letting cosine() fail on None.
    if w2v_missionStatement is None:
        incorrect_count += 1
        continue

    collect_Cosine_Values = []

    for key, value in word2Vec_Embeddings_cleanedStatements.items():

        # reference statements without an embedding cannot be compared
        if value is None:
            continue

        cosim = 1 - cosine(value, w2v_missionStatement)
        collect_Cosine_Values.append([cosim, key])

    # ascending by similarity, so the closest statement is the last element
    value_list_sorted = sorted(collect_Cosine_Values, key=itemgetter(0))

    if value_list_sorted and value_list_sorted[-1][1] == orgID:
        correct_count += 1
    else:
        incorrect_count += 1

# share of statements matched to their own organisation, in percent
percent = correct_count/(correct_count + incorrect_count) *100
print(percent)

### 2.3.2. This is the same test as 2.3.1, except that no words that already exist in the mission statement are accepted. That is, if a synonym appears somewhere else in the mission statement, then this will not be considered an acceptable synonym. The procedure instead attempts to replace the word with the second or third synonym. The purpose is to attempt to retain the meaning of the mission statement without retaining any of the word tokens that were present in the original mission statement. 

In [None]:
# keep track of in/correct matches
correct_count = 0
incorrect_count = 0

# loop over mission statements
for k, v in missionStatements.items():

    missionStatement = v[3]
    orgID = k

    # retrieve synonym block
    missionStatementWords_asList = missionStatement.split()
    # set of the original tokens for fast membership checks
    original_words = set(missionStatementWords_asList)
    constructedMissionList = []

    for word in missionStatementWords_asList:

        synonyms = []

        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonyms.append(lemma.name())

        # De-duplicate while keeping the order in which WordNet returned the
        # lemmas. Going through set() would make "the first three synonyms"
        # arbitrary and different from one kernel session to the next.
        a = list(dict.fromkeys(synonyms))

        # Attempt to replace the word with the first three suggested synonyms by wordnet
        # If the first one equals the word to be replaced or already occurs in mission statement
        # then proceed to the second suggestion. If the first three suggestions do not meet the conditions
        # or if there are no suggestions, then leave blank replacement
        replacement = ""
        for candidate in a[:3]:
            if candidate != word and candidate not in original_words:
                replacement = candidate
                break

        constructedMissionList.append(replacement)

    constructed_missionStatement = " ".join(constructedMissionList)
    missionStatement = constructed_missionStatement

    # generate w2v word embedding for altered mission statement
    w2v_missionStatement = getWord2Vec_Embedding(missionStatement)

    # getWord2Vec_Embedding returns None when no replacement word is in the
    # model vocabulary. Such a statement cannot be matched, so record a miss
    # instead of letting cosine() fail on None.
    if w2v_missionStatement is None:
        incorrect_count += 1
        continue

    collect_Cosine_Values = []

    for key, value in word2Vec_Embeddings_cleanedStatements.items():

        # reference statements without an embedding cannot be compared
        if value is None:
            continue

        cosim = 1 - cosine(value, w2v_missionStatement)
        collect_Cosine_Values.append([cosim, key])

    # ascending by similarity, so the closest statement is the last element
    value_list_sorted = sorted(collect_Cosine_Values, key=itemgetter(0))

    if value_list_sorted and value_list_sorted[-1][1] == orgID:
        correct_count += 1
    else:
        incorrect_count += 1

# share of statements matched to their own organisation, in percent
percent = correct_count/(correct_count + incorrect_count) *100
print(percent)