In [56]:
import numpy as np
from numpy.linalg import norm
from scipy import stats
import pandas as pd
from scipy import spatial
import math

In [57]:
def load_w2v(filename):
    """Load word vectors from a text file into a ``{word: [float, ...]}`` dict.

    Layout as consumed by this function: the first two lines each hold a
    single integer (the original code named them vocabulary size and vector
    dimension); every following non-blank line is a word followed by its
    whitespace-separated numeric components.

    Parameters
    ----------
    filename : str or path-like
        Path of the UTF-8 encoded vector file.

    Returns
    -------
    dict
        Maps each word to the list of floats found on its line. If a word
        occurs on several lines, the last occurrence wins.

    Raises
    ------
    ValueError
        If a header line is not an integer or a component is not a number.
    """
    w2v = {}
    # `with` guarantees the file is closed even when parsing raises part-way.
    with open(filename, 'r', encoding='utf-8') as w2v_file:
        # The two header values are not needed to build the dict, but they are
        # still parsed so that a malformed header keeps failing loudly.
        for _ in range(2):
            int(w2v_file.readline())
        for line in w2v_file:
            parts = line.split()
            if not parts:
                # Blank line (e.g. trailing newline at end of file): nothing to store.
                continue
            # split() already removed surrounding whitespace from the word.
            w2v[parts[0]] = [float(val) for val in parts[1:]]
    return w2v

In [58]:
# Parse the local vector file once; later cells read the resulting dict as `model`.
model = load_w2v('./W2V_150.txt')

In [59]:
def get_word_vector(word: str, model: dict):
    """Return the vector stored for ``word`` in ``model``.

    An empty list is returned when ``word`` is not a key of ``model``, so
    callers can detect a miss without catching ``KeyError``.
    """
    return model[word] if word in model else []

In [83]:
def calculate_similarity_two_word(first_word: str, second_word: str, model: dict):
    """Return the cosine similarity between the vectors of two words.

    Parameters
    ----------
    first_word, second_word : str
        Words to compare; each is looked up through ``get_word_vector``.
    model : dict
        Mapping from word to its vector (a list or array of numbers).

    Returns
    -------
    float or None
        ``dot(a, b) / (|a| * |b|)`` for the two vectors. ``None`` is returned,
        after printing a message, when a word has no vector in ``model`` or
        when one of the vectors has zero norm (the cosine is undefined then).
    """
    first_word_vector = get_word_vector(first_word, model)
    second_word_vector = get_word_vector(second_word, model)

    # Test emptiness via len(): `not vector` raises ValueError when the stored
    # vector is a numpy array with more than one element.
    lookups = ((first_word, first_word_vector), (second_word, second_word_vector))
    for word, vector in lookups:
        if vector is None or len(vector) == 0:
            print("There is no word '{}' in '{}'".format(word, 'Model'))
            return None

    denominator = norm(first_word_vector) * norm(second_word_vector)
    if denominator == 0:
        # An all-zero vector would otherwise give 0/0 -> nan plus a RuntimeWarning.
        print("Similarity of '{}' and '{}' is undefined: zero-norm vector".format(first_word, second_word))
        return None

    return np.dot(first_word_vector, second_word_vector) / denominator
   

In [84]:
# Interactive check: read two words from the user and report their similarity under `model`.
word1 = input("First word: ")
word2 = input("Second word: ")
# `similarity` may be None (e.g. when a word is missing from `model`); it is printed as-is.
similarity = calculate_similarity_two_word(word1, word2, model = model)
print("Similarity of 2 word '{}' and '{}' is: {}".format(word1, word2, similarity))

First word:  ưu
Second word:  khuyết


Similarity of 2 word 'ưu' and 'khuyết' is: 0.17679862835626717


In [85]:
# Non-interactive example with a hard-coded word pair; overwrites the module-level `similarity`.
similarity = calculate_similarity_two_word('động', 'tĩnh', model = model)
print("Similarity of 2 word is: {}".format(similarity))

Similarity of 2 word is: 0.2770859598682775
