In [1]:
import numpy as np
import codecs

"""
Using "Dependency Based" dataset from
url: https://levyomer.wordpress.com/2014/04/25/dependency-based-word-embeddings/
"""

"""
Code for loading bin file is from a blog post
url: https://blog.ekbana.com/loading-glove-pre-trained-word-embedding-model-from-text-file-faster-5d3e8f2b8455
"""
def convert_to_binary(embedding_path):
    """
    Convert a text embedding file into a vocabulary file plus a NumPy array file.

    Reads ``embedding_path + ".txt"`` (UTF-8). Every line after the first is split
    on whitespace: the first token is taken as the word and the remaining tokens
    are parsed as floats. Two files are written next to the input:

    * ``embedding_path + ".vocab"`` -- one word per line, in input order;
    * ``embedding_path + ".npy"``   -- the vectors, where row ``i`` belongs to
      the word on line ``i`` of the ``.vocab`` file.

    The first line of the text file is always discarded.
    NOTE(review): presumably this is meant to drop a "<count> <dim>" header line;
    if the input has no header, the first word is silently lost -- confirm.

    Whitespace-only lines are skipped.

    :param embedding_path: path of the embedding file WITHOUT its ".txt" extension.
    :return: None; the results are written to disk.
    :raises ValueError: if a token after the word cannot be parsed as a float.
    """
    vectors = []
    # Both handles live in one `with` so the input file is closed as well,
    # even if parsing fails half-way through.
    with codecs.open(embedding_path + ".txt", "r", encoding="utf-8") as text_file, \
            codecs.open(embedding_path + ".vocab", "w", encoding="utf-8") as vocab_write:
        for line_number, line in enumerate(text_file):
            if line_number == 0:
                # First line is never treated as an embedding row.
                continue
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline at end of file).
                continue
            vocab_write.write(fields[0].strip())
            vocab_write.write("\n")
            vectors.append([float(val) for val in fields[1:]])
    np.save(embedding_path + ".npy", np.array(vectors))
    
def load_embeddings_binary(embeddings_path):
    """
    Load embeddings from a ``.vocab`` / ``.npy`` file pair into a dictionary.

    Line ``i`` of ``embeddings_path + ".vocab"`` (whitespace-stripped) is used as
    the key for row ``i`` of the array stored in ``embeddings_path + ".npy"``.

    :param embeddings_path: common path of the two files, without extension.
    :return: dict mapping each word to its row of the loaded array.
    """
    with codecs.open(embeddings_path + '.vocab', 'r', 'utf-8') as vocab_file:
        words = [entry.strip() for entry in vocab_file]
    vectors = np.load(embeddings_path + '.npy')
    # Pair each word with the array row that has the same index.
    return {word: vectors[row] for row, word in enumerate(words)}

In [7]:
"""
Function to find the top-n most similar words, ranked by cosine similarity
(the dot product divided by the lengths of both vectors)
Written by us :)
"""
def n_similar(inputVec, n, keySpace, model):
    """
    Return the ``n`` keys whose vectors have the highest cosine similarity to ``inputVec``.

    For every key the score is ``(inputVec . model[key]) / |model[key]| / |inputVec|``.
    Results are ordered from most to least similar; when two keys score the same,
    the one that appears earlier in ``keySpace`` comes first.

    Only scores strictly greater than 0 are kept. If fewer than ``n`` keys
    qualify, the remaining slots keep their placeholders: ``''`` in the word list
    and ``0`` in the score list. A score of NaN (e.g. from a zero-length vector)
    never compares greater, so such keys are skipped.

    :param inputVec: 1-D numpy array to compare against.
    :param n: number of results to return.
    :param keySpace: iterable of keys of ``model`` to search.
    :param model: mapping from key to a vector with the same number of elements
        as ``inputVec`` (each one is reshaped to that length).
    :return: tuple ``(topWord, topDot)`` of two lists of length ``n``; each real
        score in ``topDot`` is a numpy array of shape ``(1, 1)``.
    """
    topWord = [''] * n
    topDot = [0] * n
    length = inputVec.shape[0]

    # The query row and its norm do not depend on the key, so compute them once
    # instead of once per vocabulary entry.
    query_row = np.reshape(inputVec, (1, length))
    lenInput = (query_row @ np.reshape(inputVec, (length, 1))) ** .5

    for key in keySpace:
        key_col = np.reshape(model[key], (length, 1))
        lenKey = (np.reshape(model[key], (1, length)) @ key_col) ** .5
        dot = query_row @ key_col / lenKey / lenInput

        # topDot is kept in non-increasing order, so the first slot that `dot`
        # strictly beats is where it belongs; everything after it shifts down
        # by one and the last entry falls off.
        for pos in range(n):
            if dot > topDot[pos]:
                topWord.insert(pos, key)
                topDot.insert(pos, dot)
                topWord.pop()
                topDot.pop()
                break
    return topWord, topDot

In [26]:
# `path` is the base name of the embedding files WITHOUT an extension:
# convert_to_binary reads "<path>.txt" and writes "<path>.vocab" and "<path>.npy",
# which load_embeddings_binary then reads back. Rename your downloaded file to
# "<path>.txt" (or change `path`) so the names match.
path = "deps"
# Re-parses the whole text file every time this cell is run.
convert_to_binary(path)
print('converted')
# x maps each vocabulary word to its embedding vector.
x = load_embeddings_binary(path)
# View over every word in the vocabulary; used below as the search space.
keys = x.keys()

converted


In [27]:
# Analogy query vector: king - man + woman.
# NOTE: the name `math` would shadow the standard-library `math` module if that
# module is ever imported in this notebook.
math = x['king'] - x['man'] + x['woman']
# Search the whole vocabulary (this includes 'king', 'man' and 'woman' themselves).
vals = keys
# n_similar returns (words, scores), so `length` holds similarity scores, not lengths.
# NOTE(review): the execution counts in this notebook are non-sequential
# (1, 7, 26, 27), so the saved output may not reflect the current code --
# re-run from a fresh kernel to confirm.
result, length = n_similar(math, 5, vals, x)
print(result)
print(length)

['soup', 'boing', 'workin', 'raving', 'goodnight']
[array([[0.34218405]]), array([[0.2252225]]), array([[0.21887097]]), array([[0.2166584]]), array([[0.2162438]])]
