In [61]:
import numpy as np
from numpy.linalg import norm
import math

In [62]:
def load_w2v(filename):
    """Load word vectors from a UTF-8 text file into a dictionary.

    The first two lines of the file must each contain a single integer
    (read here as the vocabulary size and the vector dimension). Every
    following non-blank line is whitespace-separated: the first token is the
    word and the remaining tokens are the float components of its vector.

    Args:
        filename: Path of the vector file to read.

    Returns:
        dict mapping each word (str) to its vector (list of float).

    Raises:
        ValueError: If a header line is not an integer or a vector component
            is not a valid float.
        OSError: If the file cannot be opened.
    """
    w2v = {}
    # `with` guarantees the handle is closed even when parsing raises.
    with open(filename, 'r', encoding='utf-8') as w2v_file:
        # The header values are not needed to build the dict, but parsing
        # them keeps the original fail-fast behaviour on a malformed header.
        _vocab_size = int(w2v_file.readline())
        _vector_dim = int(w2v_file.readline())
        for line in w2v_file:
            fields = line.split()
            if not fields:
                # Blank / whitespace-only line (e.g. trailing newline): skip
                # it instead of failing with IndexError on fields[0].
                continue
            w2v[fields[0]] = [float(val) for val in fields[1:]]
    return w2v

In [63]:
def get_word_vector(word: str, model: dict):
    """Look up the vector stored for ``word`` in ``model``.

    Args:
        word: The word to look up.
        model: Mapping from word to its vector.

    Returns:
        The stored vector object itself when ``word`` is a key of ``model``,
        otherwise a new empty list.
    """
    return model[word] if word in model else []

In [64]:
def calculate_similarity_two_word(first_word: str, second_word: str, model: dict):
    """Compute the cosine similarity between the vectors of two words.

    Args:
        first_word: First word to compare.
        second_word: Second word to compare.
        model: Mapping from word to its vector.

    Returns:
        The cosine similarity as a float, ``0.0`` when either vector has zero
        norm (the cosine is undefined there), or ``None`` when either word has
        no vector in ``model`` (a message is printed in that case).
    """
    first_word_vector = get_word_vector(first_word, model)
    second_word_vector = get_word_vector(second_word, model)
    # Test the length rather than truthiness: `not vector` raises ValueError
    # when the stored vectors are NumPy arrays instead of lists.
    if len(first_word_vector) == 0:
        print("There is no word '{}' in '{}'".format(first_word, 'Model'))
        return None
    if len(second_word_vector) == 0:
        print("There is no word '{}' in '{}'".format(second_word, 'Model'))
        return None

    denominator = norm(first_word_vector) * norm(second_word_vector)
    if denominator == 0:
        # An all-zero vector would otherwise divide by zero and yield nan
        # together with a RuntimeWarning.
        return 0.0
    return float(np.dot(first_word_vector, second_word_vector) / denominator)
   

In [65]:
def find_K_nearest_word(input_word: str, k_parameter: int, model: dict) -> list:
    """Return the words whose vectors are most cosine-similar to ``input_word``.

    Args:
        input_word: The query word.
        k_parameter: Maximum number of neighbours to return.
        model: Mapping from word to its vector.

    Returns:
        Up to ``k_parameter`` words ordered from most to least similar,
        never including ``input_word`` itself. Words with equal similarity
        keep their order in ``model``. An empty list is returned when
        ``input_word`` has no vector in ``model`` (a message is printed once),
        when its vector has zero norm, or when ``k_parameter`` is not
        positive.
    """
    if input_word not in model or len(model[input_word]) == 0:
        # Report the missing word once and stop; scoring every vocabulary
        # entry against a missing query would only produce None scores that
        # cannot be sorted.
        print("There is no word '{}' in '{}'".format(input_word, 'Model'))
        return []
    if k_parameter <= 0:
        return []

    # Loop-invariant: look up and normalise the query vector only once.
    query_vector = np.asarray(model[input_word], dtype=float)
    query_norm = norm(query_vector)
    if query_norm == 0:
        # Cosine similarity is undefined for an all-zero query vector.
        return []

    scored_words = []
    for word, vector in model.items():
        if word == input_word:
            # Exclude the query by name instead of assuming it sorts first;
            # another word with an identical vector may tie with it.
            continue
        candidate_vector = np.asarray(vector, dtype=float)
        candidate_norm = norm(candidate_vector)
        if candidate_norm == 0:
            # Empty or all-zero vector: the similarity would be nan, which
            # makes the sort order below meaningless.
            continue
        similarity = np.dot(query_vector, candidate_vector) / (query_norm * candidate_norm)
        scored_words.append((word, similarity))

    # list.sort is stable, also with reverse=True, so ties keep model order.
    scored_words.sort(key=lambda item: item[1], reverse=True)
    return [word for word, _ in scored_words[:k_parameter]]

In [68]:
# Interactive driver: load the vectors, ask for a query word and K, then
# print the K nearest words, one per line.
# The vector file path is relative to the notebook's working directory.
model = load_w2v('./W2V_150.txt')
input_word = input("Type input: ")
# int() raises ValueError if the typed value is not an integer.
k_parameter = int(input("K paramater: "))
list_nearest = find_K_nearest_word(input_word, k_parameter, model)
print("The most {} nearest words with '{}' are:".format(k_parameter, input_word))
for word in list_nearest:
    print(word)


Type input:  biển
K paramater:  10


The most 10 nearest words with 'biển' are:
vùng_biển
bãi_biển
bờ_biển
biển_đông
mặt_biển
sông_trà_khúc
phá_tam_giang
vịnh
bán_đảo
biển_cả
