## This notebook contains Hybrid TFIDF and Word2Vec Embedding Procedures on Pre-Trained and Custom Corpora, and Tests Performed on TFIDF-Word2Vec to Determine How It Performs on Text Similarity Tasks

### 1. Train the Model

In [None]:
# Location of the pickled mission-statement data (later cells iterate the
# loaded object with .items() and read the statement text from v[3]).
filename = "/Users/standard/Documents/Working/all_Combined_Cleaned_Ratings_Selection.pickle"

# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
with open(filename, mode="rb") as pickle_file:
    missionStatements = pickle.load(pickle_file)

In [None]:
# Collect the mission-statement strings and whitespace-tokenise them so they
# can be fed to the Gensim TFIDF model.
documents = [v[3] for k, v in missionStatements.items()]  # v[3] holds the mission statement

texts = [document.split() for document in documents]

# Token/id mapping built from all tokenised statements, followed by the
# bag-of-words representation of every statement.
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

In [None]:
# Create a dictionary to store tfidf values of mission statement words
tfidf = models.tfidfmodel.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# word -> tfidf weight.
# Caveat: the weight is written once per (document, word) pair, so a word that
# occurs in several documents ends up with the weight from the LAST document
# that contains it.
tfidf_values_missionStatements_dict = {}
d = {}  # NOTE(review): not used in the visible cells -- candidate for removal
for doc in corpus_tfidf:
    # `token_id` rather than `id`: at notebook scope a loop variable named
    # `id` would replace the builtin id() for the rest of the session.
    for token_id, value in doc:
        word = dictionary.get(token_id)
        tfidf_values_missionStatements_dict[word] = value

In [None]:
# Arrange the mission statements as [orgID, statement] rows and wrap them in
# a dataframe.
df_format = [[k, v[3]] for k, v in missionStatements.items()]

df = pd.DataFrame(data=df_format, columns=['orgID', 'missionStatement'])

In [None]:
# Rebind `corpus` to a list of whitespace-tokenised mission statements
# (this replaces the bag-of-words `corpus` built in an earlier cell).
corpus = [words.split() for words in df['missionStatement']]

In [None]:
# train the model
EMBEDDING_FILE = '/Users/standard/Documents/Working/EDA2/GoogleNews-vectors-negative300.bin.gz'
# NOTE(review): `google_word2vec` is not referenced in the visible cells, and
# the same file is read again by intersect_word2vec_format below -- confirm
# whether this separate load is still needed.
google_word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

# Training the corpus with Google Pretrained Model
# `workers` is the number of training threads and must be positive. Unlike
# scikit-learn's n_jobs, -1 does NOT mean "all cores": with a non-positive
# value no worker thread is started and train() returns without updating
# any vector.
N_WORKERS = 4
google_model = Word2Vec(size = 300, window = 5, min_count = 2, workers = N_WORKERS)
google_model.build_vocab(corpus)

# Overwrite the initial vectors of vocabulary words that also exist in the
# pretrained file; lockf=1.0 keeps those vectors trainable.
google_model.intersect_word2vec_format(EMBEDDING_FILE, lockf=1.0, binary=True)

google_model.train(corpus, total_examples = google_model.corpus_count, epochs = 5)

### 2. Create a Dictionary to Hold the Embeddings for each Mission Statement

In [None]:
# Create a function to retrieve the hybrid tfidf-word2vec embedding
def get_hybrid_tfidf_word2Vec_Embedding(text):
    """Return the TF-IDF-weighted average Word2Vec vector of `text`.

    `text` is split on whitespace. Every token found in the vocabulary of the
    global `google_model` contributes its word vector multiplied by the
    token's weight in the global `tfidf_values_missionStatements_dict`. The
    weighted vectors are summed and divided by the number of in-vocabulary
    tokens (a plain count, not the sum of the weights).

    Parameters
    ----------
    text : str
        Whitespace-separated text to embed.

    Returns
    -------
    numpy.ndarray or None
        The averaged weighted vector, or None when no token of `text` is in
        the model vocabulary (including empty text). Callers must handle None.
    """
    weighted_sum = None
    count = 0

    for word in text.split():

        if word not in google_model.wv.vocab:
            continue

        count += 1

        # A vocabulary word may have no stored weight (presumably because
        # gensim drops near-zero TF-IDF entries, e.g. for words present in
        # every document); treat that as weight 0 instead of raising KeyError.
        weight = tfidf_values_missionStatements_dict.get(word, 0.0)

        # Index through `.wv`, consistent with the vocabulary check above;
        # indexing the model object directly is deprecated.
        weighted_vector = np.dot(google_model.wv[word], weight)

        if weighted_sum is None:
            weighted_sum = weighted_vector
        else:
            weighted_sum = weighted_sum + weighted_vector

    if weighted_sum is None:
        return None

    return weighted_sum / count

In [None]:
# Create a dictionary to hold the embeddings for each mission statement
# (orgID -> hybrid tfidf-word2vec vector of that organisation's statement)
hybrid_tfidf_word2Vec_Embeddings = {}

for index, row in df.iterrows():

    missionStatement = row['missionStatement']
    orgID = row['orgID']

    # Call the embedding function on the statement; storing the bare function
    # object would make every later cosine comparison fail.
    # The result is None when the statement has no in-vocabulary word.
    vectors = get_hybrid_tfidf_word2Vec_Embedding(missionStatement)
    hybrid_tfidf_word2Vec_Embeddings[orgID] = vectors

## 3. Testing Section: This section presents several tests.

### Specifically, what the tests do is to take a mission statement from a charity, manipulate the mission statement, e.g. delete parts, add noise, etc., and see if the altered mission statement is successfully matched with the organization's original/unaltered mission statement

### 3.1.1. This is just a check to see that everything works. The mission statement is not altered. The unaltered statement is matched with the closest mission statement (which should be itself), so a 100 percent match rate should be attained.

In [None]:
# Sanity check: match every unaltered mission statement against the stored
# embeddings and report the percentage whose best match is its own orgID.

# keep track of correct and incorrect matches
correct_count = 0
incorrect_count = 0

# loop through mission statements 
for k, v in missionStatements.items():

    missionStatement = v[3]
    orgID = k

    # generate embedding for mission statement
    tfidf_w2v_missionStatement = get_hybrid_tfidf_word2Vec_Embedding(missionStatement)

    # The embedding is None when the statement has no in-vocabulary word.
    # cosine() cannot handle None, so record a failed match and move on
    # instead of aborting the whole run.
    if tfidf_w2v_missionStatement is None:
        incorrect_count = incorrect_count + 1
        continue

    collect_Cosine_Values = []

    for key, value in hybrid_tfidf_word2Vec_Embeddings.items():

        # stored embeddings can be None for the same reason; skip them
        if value is None:
            continue

        cosim = 1 - cosine(value, tfidf_w2v_missionStatement)
        collect_Cosine_Values.append([cosim, key])

    # sort to attain highest cosine values (best match is the last element)
    value_list_sorted = sorted(collect_Cosine_Values, key=itemgetter(0))
    if value_list_sorted and value_list_sorted[-1][1] == orgID:
        correct_count = correct_count + 1
    else:
        incorrect_count = incorrect_count + 1

# guard against an empty corpus (would otherwise divide by zero)
total_count = correct_count + incorrect_count
perc_correct = (correct_count / total_count) * 100 if total_count else 0.0
print(perc_correct)

### 3.1.2. This is the first manipulation. A fraction of the mission statement is selected (either 1/2, 1/5, or 1/10) and then attempted to match with the closest unaltered mission statement. If it is matched with its unaltered version, it's recorded as a correct match. 

In [None]:
# Truncation test: keep only a leading fraction of each mission statement and
# report the percentage still matched to its own orgID.

# fraction of the statement that is kept (e.g. 0.5, 0.2, 0.1)
FRACTION_KEPT = 0.2

# keep track of correct and incorrect matches
correct_count = 0
incorrect_count = 0

# loop through mission statements 
for k, v in missionStatements.items():

    missionStatement = v[3]
    orgID = k

    # vary length of statement e.g. first half, first tenth, second half, etc
    # NOTE(review): the length is measured in characters, not words, so the
    # cut can fall inside a word and leave a fragment as the last token.
    missionStatement_length = len(missionStatement)
    missionStatement = missionStatement[:int(FRACTION_KEPT*missionStatement_length)]

    # generate embedding for mission statement
    tfidf_w2v_missionStatement = get_hybrid_tfidf_word2Vec_Embedding(missionStatement)

    # A short truncated statement may contain no in-vocabulary word, in which
    # case the embedding is None. cosine() cannot handle None, so record a
    # failed match instead of aborting the whole run.
    if tfidf_w2v_missionStatement is None:
        incorrect_count = incorrect_count + 1
        continue

    collect_Cosine_Values = []

    for key, value in hybrid_tfidf_word2Vec_Embeddings.items():

        # stored embeddings can be None for the same reason; skip them
        if value is None:
            continue

        cosim = 1 - cosine(value, tfidf_w2v_missionStatement)
        collect_Cosine_Values.append([cosim, key])

    # sort to attain highest cosine values (best match is the last element)
    value_list_sorted = sorted(collect_Cosine_Values, key=itemgetter(0))
    if value_list_sorted and value_list_sorted[-1][1] == orgID:
        correct_count = correct_count + 1
    else:
        incorrect_count = incorrect_count + 1

# guard against an empty corpus (would otherwise divide by zero)
total_count = correct_count + incorrect_count
perc_correct = (correct_count / total_count) * 100 if total_count else 0.0
print(perc_correct)

### 3.1.3. This test selects a random subset of words from the mission statement and only uses these words in its attempt to match with the correct un-altered mission statement. 

In [None]:
# Random-subset test: rebuild each mission statement from words drawn at
# random from it and report the percentage still matched to its own orgID.

# keep track of correct and incorrect matches
correct_count = 0
incorrect_count = 0

# loop through mission statements 
for k, v in missionStatements.items():

    missionStatement = v[3]
    orgID = k

    # block to select random subset of words from mission statement
    missionStatementWords_asList = missionStatement.split()

    # get number of words in mission statement
    no_words = len(missionStatementWords_asList)
    half_no_words = int(no_words*0.5)

    # list to populate with random words from mission statement
    select_Words_atRandom = []

    # Non-unique version: words are drawn with replacement, so the same word
    # can be picked more than once. For a duplicate-free variant, append
    # `word` only when it is not already in select_Words_atRandom.
    while len(select_Words_atRandom) < half_no_words:

        randno = random.randint(0, no_words-1)
        word = missionStatementWords_asList[randno]
        select_Words_atRandom.append(word)

    constructed_missionStatement = " ".join(select_Words_atRandom)
    missionStatement = constructed_missionStatement

    # generate embedding for mission statement
    tfidf_w2v_missionStatement = get_hybrid_tfidf_word2Vec_Embedding(missionStatement)

    # The sample is empty for statements of fewer than two words and may hold
    # no in-vocabulary word; the embedding is then None. cosine() cannot
    # handle None, so record a failed match instead of aborting the run.
    if tfidf_w2v_missionStatement is None:
        incorrect_count = incorrect_count + 1
        continue

    collect_Cosine_Values = []

    for key, value in hybrid_tfidf_word2Vec_Embeddings.items():

        # stored embeddings can be None for the same reason; skip them
        if value is None:
            continue

        cosim = 1 - cosine(value, tfidf_w2v_missionStatement)
        collect_Cosine_Values.append([cosim, key])

    # sort to attain highest cosine values (best match is the last element)
    value_list_sorted = sorted(collect_Cosine_Values, key=itemgetter(0))
    if value_list_sorted and value_list_sorted[-1][1] == orgID:
        correct_count = correct_count + 1
    else:
        incorrect_count = incorrect_count + 1

# guard against an empty corpus (would otherwise divide by zero)
total_count = correct_count + incorrect_count
perc_correct = (correct_count / total_count) * 100 if total_count else 0.0
print(perc_correct)

### 3.2.1. This test attempts to measure how noise affects the algorithm's performance. A fraction of the original mission statement is replaced with random words (noise). 

In [None]:
# Noise test: keep the leading part of each mission statement, append random
# words, and report the percentage still matched to its own orgID.

# keep track of correct and incorrect matches
correct_count = 0
incorrect_count = 0

# loop through mission statements 
for k, v in missionStatements.items():

    missionStatement = v[3]
    orgID = k

    # substitute fraction of mission statement with random words to approximate noise

    missionStatementWords_asList = missionStatement.split()

    no_words = len(missionStatementWords_asList)
    fraction_of_words = int(no_words * (1/4))

    # NOTE(review): the slice keeps fraction_of_words + 1 words, i.e. one more
    # than a quarter -- presumably so at least one original word survives;
    # confirm this is intended.
    firstPart = missionStatementWords_asList[:fraction_of_words + 1]
    secondPart = randomwordgenerator.generate_random_words(n = int(no_words * (3/4)))

    constructed_missionStatement1 = " ".join(firstPart)
    constructed_missionStatement2 = " ".join(secondPart)
    missionStatement = constructed_missionStatement1 + " " + constructed_missionStatement2 

    # generate embedding for mission statement
    tfidf_w2v_missionStatement = get_hybrid_tfidf_word2Vec_Embedding(missionStatement)

    # The embedding is None when the constructed statement has no
    # in-vocabulary word. cosine() cannot handle None, so record a failed
    # match instead of aborting the whole run.
    if tfidf_w2v_missionStatement is None:
        incorrect_count = incorrect_count + 1
        continue

    collect_Cosine_Values = []

    for key, value in hybrid_tfidf_word2Vec_Embeddings.items():

        # stored embeddings can be None for the same reason; skip them
        if value is None:
            continue

        cosim = 1 - cosine(value, tfidf_w2v_missionStatement)
        collect_Cosine_Values.append([cosim, key])

    # sort to attain highest cosine values (best match is the last element)
    value_list_sorted = sorted(collect_Cosine_Values, key=itemgetter(0))
    if value_list_sorted and value_list_sorted[-1][1] == orgID:
        correct_count = correct_count + 1
    else:
        incorrect_count = incorrect_count + 1

# guard against an empty corpus (would otherwise divide by zero)
total_count = correct_count + incorrect_count
perc_correct = (correct_count / total_count) * 100 if total_count else 0.0
print(perc_correct)

### 3.2.2. This test is the same as in 3.2.1., except that the words are shuffled, so that strings from the original mission statement are not retained.  

In [None]:
# Shuffled-noise test: keep the leading part of each mission statement, add
# random words, shuffle everything, and report the percentage still matched
# to its own orgID.

# keep track of correct and incorrect matches
correct_count = 0
incorrect_count = 0

# loop through mission statements 
for k, v in missionStatements.items():

    missionStatement = v[3]
    orgID = k

    # substitute fraction of mission statement with random words to approximate noise

    missionStatementWords_asList = missionStatement.split()

    no_words = len(missionStatementWords_asList)
    fraction_of_words = int(no_words * (1/4))

    # NOTE(review): the slice keeps fraction_of_words + 1 words, i.e. one more
    # than a quarter -- presumably so at least one original word survives;
    # confirm this is intended.
    firstPart = missionStatementWords_asList[:fraction_of_words + 1]
    secondPart = randomwordgenerator.generate_random_words(n = int(no_words * (3/4)))

    # Version to shuffle the selected words
    combinedParts = firstPart + secondPart
    random.shuffle(combinedParts)
    missionStatement = " ".join(combinedParts)

    # generate embedding for mission statement
    tfidf_w2v_missionStatement = get_hybrid_tfidf_word2Vec_Embedding(missionStatement)

    # The embedding is None when the constructed statement has no
    # in-vocabulary word. cosine() cannot handle None, so record a failed
    # match instead of aborting the whole run.
    if tfidf_w2v_missionStatement is None:
        incorrect_count = incorrect_count + 1
        continue

    collect_Cosine_Values = []

    for key, value in hybrid_tfidf_word2Vec_Embeddings.items():

        # stored embeddings can be None for the same reason; skip them
        if value is None:
            continue

        cosim = 1 - cosine(value, tfidf_w2v_missionStatement)
        collect_Cosine_Values.append([cosim, key])

    # sort to attain highest cosine values (best match is the last element)
    value_list_sorted = sorted(collect_Cosine_Values, key=itemgetter(0))
    if value_list_sorted and value_list_sorted[-1][1] == orgID:
        correct_count = correct_count + 1
    else:
        incorrect_count = incorrect_count + 1

# guard against an empty corpus (would otherwise divide by zero)
total_count = correct_count + incorrect_count
perc_correct = (correct_count / total_count) * 100 if total_count else 0.0
print(perc_correct)

### 3.3.1. This test replaces the words in the original mission statement with synonyms. This test approximates how the algorithm handles cases where the meaning is retained but the actual words are different. 

In [None]:
# Synonym test: replace every word of each mission statement with a WordNet
# lemma name and report the percentage still matched to its own orgID.

# keep track of correct and incorrect matches
correct_count = 0
incorrect_count = 0

# loop through mission statements 
for k, v in missionStatements.items():

    missionStatement = v[3]
    orgID = k

    # replace with synonyms block
    missionStatementWords_asList = missionStatement.split()
    constructedMissionList = []

    for word in missionStatementWords_asList:

        # collect every lemma name WordNet lists across all synsets of the word
        synonyms = []

        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonyms.append(lemma.name())

        # De-duplicate and sort. The iteration order of a set of strings
        # depends on per-process hash randomisation, so taking "the first
        # element" of an unsorted set picks a different synonym after every
        # kernel restart; sorting makes the choice reproducible.
        a = sorted(set(synonyms))

        # replace with first synonym; a word without any WordNet entry
        # becomes an empty token
        if a:
            constructedMissionList.append(a[0])
        else:
            constructedMissionList.append("")

    constructed_missionStatement = " ".join(constructedMissionList)
    missionStatement = constructed_missionStatement

    # generate embedding for mission statement
    tfidf_w2v_missionStatement = get_hybrid_tfidf_word2Vec_Embedding(missionStatement)

    # The embedding is None when no replacement word is in the vocabulary.
    # cosine() cannot handle None, so record a failed match instead of
    # aborting the whole run.
    if tfidf_w2v_missionStatement is None:
        incorrect_count = incorrect_count + 1
        continue

    collect_Cosine_Values = []

    for key, value in hybrid_tfidf_word2Vec_Embeddings.items():

        # stored embeddings can be None for the same reason; skip them
        if value is None:
            continue

        cosim = 1 - cosine(value, tfidf_w2v_missionStatement)
        collect_Cosine_Values.append([cosim, key])

    # sort to attain highest cosine values (best match is the last element)
    value_list_sorted = sorted(collect_Cosine_Values, key=itemgetter(0))
    if value_list_sorted and value_list_sorted[-1][1] == orgID:
        correct_count = correct_count + 1
    else:
        incorrect_count = incorrect_count + 1

# guard against an empty corpus (would otherwise divide by zero)
total_count = correct_count + incorrect_count
perc_correct = (correct_count / total_count) * 100 if total_count else 0.0
print(perc_correct)

### 3.3.2. This is the same test as 3.3.1, except that no words that already exist in the mission statement are accepted. That is, if a synonym appears somewhere else in the mission statement, then this will not be considered an acceptable synonym. The procedure instead attempts to replace the word with the second or third synonym. The purpose is to attempt to retain the meaning of the mission statement without retaining any of the word tokens that were present in the original mission statement. 

In [None]:
# Strict synonym test: like the plain synonym test, but a replacement is only
# accepted when it does not occur anywhere in the original mission statement.
# Reports the percentage of statements still matched to their own orgID.

# keep track of correct and incorrect matches
correct_count = 0
incorrect_count = 0

# loop through mission statements 
for k, v in missionStatements.items():

    missionStatement = v[3]
    orgID = k

    # replace with synonyms block
    missionStatementWords_asList = missionStatement.split()
    constructedMissionList = []

    # set copy for constant-time "already in the statement" checks; the word
    # being replaced is itself a member, so this also rejects candidate == word
    original_words = set(missionStatementWords_asList)

    for word in missionStatementWords_asList:

        # collect every lemma name WordNet lists across all synsets of the word
        synonyms = []

        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonyms.append(lemma.name())

        # De-duplicate and sort. The iteration order of a set of strings
        # depends on per-process hash randomisation, so indexing an unsorted
        # set picks different synonyms after every kernel restart; sorting
        # makes the choice reproducible.
        a = sorted(set(synonyms))

        # Attempt to replace the word with one of the first three candidate synonyms.
        # A candidate is rejected when it equals the word to be replaced or already
        # occurs in the mission statement. If none of the first three candidates
        # qualifies, or there are no candidates at all, leave a blank replacement.
        replacement = ""
        for candidate in a[:3]:
            if candidate not in original_words:
                replacement = candidate
                break
        constructedMissionList.append(replacement)

    constructed_missionStatement = " ".join(constructedMissionList)
    missionStatement = constructed_missionStatement

    # generate embedding for mission statement
    tfidf_w2v_missionStatement = get_hybrid_tfidf_word2Vec_Embedding(missionStatement)

    # The embedding is None when no replacement word is in the vocabulary.
    # cosine() cannot handle None, so record a failed match instead of
    # aborting the whole run.
    if tfidf_w2v_missionStatement is None:
        incorrect_count = incorrect_count + 1
        continue

    collect_Cosine_Values = []

    for key, value in hybrid_tfidf_word2Vec_Embeddings.items():

        # stored embeddings can be None for the same reason; skip them
        if value is None:
            continue

        cosim = 1 - cosine(value, tfidf_w2v_missionStatement)
        collect_Cosine_Values.append([cosim, key])

    # sort to attain highest cosine values (best match is the last element)
    value_list_sorted = sorted(collect_Cosine_Values, key=itemgetter(0))
    if value_list_sorted and value_list_sorted[-1][1] == orgID:
        correct_count = correct_count + 1
    else:
        incorrect_count = incorrect_count + 1

# guard against an empty corpus (would otherwise divide by zero)
total_count = correct_count + incorrect_count
perc_correct = (correct_count / total_count) * 100 if total_count else 0.0
print(perc_correct)