In [29]:
import gensim
from sklearn.decomposition import TruncatedSVD
from scipy.spatial.distance import cosine

def find_best_k(word2vec_model, analogies, k_start, k_end):
    """Find the SVD dimensionality that solves the most analogies.

    The embedding matrix is decomposed once with an exact SVD; for every
    ``k`` in ``[k_start, k_end]`` the first ``k`` components are used as the
    reduced vectors. Each analogy ``(word1, word2, word3, expected)`` is read
    as "word1 is to word2 as word3 is to expected" and counts as solved when
    the vocabulary word whose reduced vector has the highest cosine
    similarity to ``word2 - word1 + word3`` (the three query words
    themselves excluded) equals ``expected``.

    Args:
        word2vec_model: object exposing ``vectors`` (2-D array, one row per
            word), ``key_to_index`` (word -> row index mapping) and
            ``index_to_key`` (row index -> word sequence).
        analogies: iterable of 4-tuples ``(word1, word2, word3, expected)``.
            Analogies whose first three words are not in ``key_to_index``
            are skipped.
        k_start: smallest number of components to try (inclusive, >= 1).
        k_end: largest number of components to try (inclusive). Values above
            the number of available components behave like the maximum.

    Returns:
        The smallest ``k`` achieving the highest number of solved analogies,
        or ``0`` when no ``k`` solves any analogy (including an empty range).

    Raises:
        ValueError: if the range is non-empty and ``k_start`` is below 1.
    """
    # Local import: the cell defining this function does not import numpy.
    import numpy as np

    if k_end < k_start:
        return 0
    if k_start < 1:
        raise ValueError("k_start must be at least 1")

    key_to_index = word2vec_model.key_to_index
    index_to_key = word2vec_model.index_to_key

    # Resolve the query words once; the lookup does not depend on k.
    resolved = []
    for word1, word2, word3, expected in analogies:
        query = [key_to_index.get(word, None) for word in (word1, word2, word3)]
        if any(index is None for index in query):
            continue
        resolved.append((query, expected))
    if not resolved:
        return 0

    # One deterministic decomposition instead of refitting a randomized SVD
    # for every k: the first k columns of U * S are the rank-k projection.
    left, singular_values, _ = np.linalg.svd(
        np.asarray(word2vec_model.vectors), full_matrices=False
    )
    projected = (left * singular_values)[:, :k_end]

    best_k = 0
    max_correct = 0
    for k in range(k_start, k_end + 1):
        reduced_vectors = projected[:, :k]
        norms = np.linalg.norm(reduced_vectors, axis=1)
        norms[norms == 0] = 1.0  # keep all-zero rows from producing NaNs
        unit_vectors = reduced_vectors / norms[:, None]

        correct = 0
        for (index1, index2, index3), expected in resolved:
            # word1 : word2 :: word3 : ?  =>  ? ~ word2 - word1 + word3
            target = (
                reduced_vectors[index2]
                - reduced_vectors[index1]
                + reduced_vectors[index3]
            )
            # Dividing by |target| would not change the argmax, so skip it.
            similarities = unit_vectors @ target
            # A query word must never be returned as its own answer.
            similarities[[index1, index2, index3]] = -np.inf
            closest_word = index_to_key[int(similarities.argmax())]
            if closest_word == expected:
                correct += 1

        # Strict comparison keeps the smallest k among ties.
        if correct > max_correct:
            max_correct = correct
            best_k = k

    return best_k


In [None]:
import gensim
from nltk.data import find
# Resolve the on-disk path of the pruned word2vec sample via NLTK's data finder.
word2vec_sample = str(
    find('models/word2vec_sample/pruned.word2vec.txt')
)
# Load the embeddings from the text (non-binary) word2vec format.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_sample,
    binary=False,
)

In [None]:
# Each tuple reads "word1 is to word2 as word3 is to expected" (capital -> country).
# NOTE(review): find_best_k looks the first three words up verbatim in
# key_to_index, so these upper-case tokens must exist in the model vocabulary
# with exactly this casing - confirm, otherwise every analogy is skipped.
analogies = [
    ('ATHENS', 'GREECE', 'BANGKOK', 'THAILAND'),  # was 'China': Bangkok is the capital of Thailand
    ('ATHENS', 'GREECE', 'BERLIN', 'GERMANY'),
    ('ATHENS', 'GREECE', 'BERN', 'SWITZERLAND'),
]
best_k = find_best_k(word2vec_model, analogies, 10, 50)
print(best_k)
