# Cosine Similarity and GloVe
Testing whether word embeddings can help when comparing sentences by cosine similarity.

https://www.geeksforgeeks.org/python-measure-similarity-between-two-sentences-using-cosine-similarity/

Used as the reference for the cosine similarity explanation.

Pretrained GloVe vectors were obtained from the following URL:
https://nlp.stanford.edu/projects/glove/


Learned how to read the vectors from here:

https://medium.com/analytics-vidhya/basics-of-using-pre-trained-glove-vectors-in-python-d38905f356db


In [14]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import numpy as np
from scipy import spatial

In [13]:
# Two candidate sentences about the same subject, plus a question to score
# against each of them.
s1 = "Gyarados first appears in the Pokémon video game series in Pokémon Red and Blue, and later appears in every subsequent sequel."
s2 = "In the anime, Gyarados first appeared in Pokémon - I Choose You! swimming in a river."
# NOTE: the question spells the name "Gyrados" while both sentences use
# "Gyarados", so that token can never count as an exact-match overlap.
q = "What video game series did Gyrados first appear in?"

In [7]:
def cosine_sim(s1, s2):
    """Print and return the cosine similarity of two sentences.

    Each sentence is tokenized, English stopwords are dropped, and the
    remaining tokens are treated as a set (a binary bag of words). For two
    binary vectors the cosine reduces to
    ``len(A & B) / sqrt(len(A) * len(B))``, which is what is computed here.

    Args:
        s1: First sentence.
        s2: Second sentence.

    Returns:
        The similarity as a float between 0 and 1. Returns 0.0 when either
        sentence has no tokens left after stopword removal, instead of
        raising ZeroDivisionError.
    """
    t1 = word_tokenize(s1)
    t2 = word_tokenize(s2)

    # A set makes each stopword test O(1) instead of scanning a list.
    stops = set(stopwords.words('english'))

    # NOTE: the membership test is case-sensitive and tokens are not
    # normalised, so (assuming the stopword list is lowercase) capitalised
    # words such as "In" or "What" are kept, as are punctuation tokens.
    t1_set = {word for word in t1 if word not in stops}
    t2_set = {word for word in t2 if word not in stops}

    # The norm of a 0/1 vector is the square root of its number of ones, so
    # the product of the norms is sqrt(len(t1_set) * len(t2_set)).
    denominator = (len(t1_set) * len(t2_set)) ** 0.5
    if denominator:
        # The dot product of two 0/1 vectors is the size of the intersection.
        cosine = len(t1_set & t2_set) / denominator
    else:
        # One side is empty: there is nothing to compare.
        cosine = 0.0

    print("similarity: ", cosine)
    return cosine

# Score the question against each candidate sentence; every call prints its
# similarity.
cosine_sim(s1, q)
cosine_sim(s2, q)

similarity:  0.3651483716701107
similarity:  0.09128709291752768


# Attempting to use word embeddings in comparison as well

In [12]:
# Load the pretrained GloVe vectors into a dict mapping word -> float32 array.
embeddings_dict = {}
with open("glove_6B/glove.6B.50d.txt", 'r', encoding="utf-8") as f:
     for line in f:
        # The first whitespace-separated field is taken as the word and the
        # remaining fields are parsed as its vector components.
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector


In [15]:
# Inspect the loaded word -> vector mapping.
embeddings_dict

  4.6754e-01,  3.9251e-01, -1.3475e-01,  6.0021e-02,  5.1595e-01,
        -1.9336e+00,  4.5137e-01, -1.5086e-01, -3.3555e-02, -9.7559e-02,
         2.8652e+00,  3.4424e-02, -8.7831e-01, -1.2224e-01, -6.6944e-01,
        -2.8933e-01,  7.6775e-02, -9.8721e-01, -5.0272e-01, -2.5658e-02,
         3.0176e-01,  5.8119e-01, -2.2874e-01, -1.6283e-01,  3.7797e-01,
         1.3962e-01, -2.5891e-01,  7.1100e-01, -1.8220e-01,  4.1096e-01],
       dtype=float32),
 'investigation': array([ 1.252    , -0.61414  , -0.32648  ,  0.70506  , -0.075753 ,
         0.37693  , -0.057267 ,  0.5249   ,  0.93365  , -0.82696  ,
        -0.58949  ,  0.035164 , -1.495    ,  0.011154 ,  1.0813   ,
        -0.67989  , -0.54568  , -0.26465  ,  0.0084573, -0.13989  ,
         0.91637  ,  0.44943  ,  0.11898  , -0.51478  , -0.68206  ,
        -2.5211   , -0.024897 ,  0.25036  , -0.52294  ,  0.18746  ,
         2.2091   , -1.1696   , -0.69227  , -1.7039   ,  0.033632 ,
         0.14221  , -0.081491 , -0.27491  ,  0.55344

In [16]:
def find_closest_embeddings(embedding, embeddings=None):
    """Return every known word ordered from nearest to farthest from a vector.

    Args:
        embedding: Query vector; expected to have the same dimensionality as
            the stored vectors.
        embeddings: Optional mapping of word -> vector to search. Defaults to
            the module-level ``embeddings_dict``, so existing one-argument
            calls behave exactly as before.

    Returns:
        A list of all words in the mapping, sorted by ascending Euclidean
        distance between their vector and ``embedding``.
    """
    # Compare against None rather than truthiness so that an explicitly
    # passed empty mapping is searched (yielding []) instead of silently
    # falling back to the global table.
    vectors = embeddings_dict if embeddings is None else embeddings
    return sorted(
        vectors,
        key=lambda word: spatial.distance.euclidean(vectors[word], embedding),
    )
