# 1. Cosine similarity  
Given pre-trained embeddings of Vietnamese words, implement a function for calculating cosine similarity between word pairs. Test your program using word pairs in the ViSim-400 dataset (in directory Datasets/ViSim-400). Use the Pearson correlation coefficient (https://en.wikipedia.org/wiki/Pearson_correlation_coefficient) and Spearman's rank correlation coefficient (https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient) to evaluate the correlation between your results and the similarity scores in the dataset.

In [1]:
import numpy as np
import scipy
from scipy import stats

In [4]:
def read_file_to_list(path):
    """Read a UTF-8 encoded text file and return its lines as a list.

    Parameters
    ----------
    path : str or path-like
        Location of the file to read.

    Returns
    -------
    list of str
        One entry per line, in file order. Line terminators are kept, exactly
        as returned by ``file.readlines()``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the content is not valid UTF-8.
    """
    # The context manager guarantees the handle is closed even when reading
    # or decoding fails part-way through.
    with open(path, encoding='utf8') as file:
        return file.readlines()

In [5]:
w2v_path = 'W2V_150.txt'
dataset = read_file_to_list(w2v_path)

# The file starts with a two-line header: the vocabulary size on the first
# line (77021 words per the original note) and the vector dimensionality on
# the second (150). Everything after that is one word per line.
num_of_words = int(dataset[0])
num_of_dimensions = int(dataset[1])
dataset = dataset[2:]

# Maps each word to its embedding as a 1-D numpy array.
embeddings = {}

for line in dataset:
    s = line.split(' ')
    word = s[0]
    # Components start at index 2: index 0 is the word and index 1 is an
    # empty string (per the original note), so both are skipped.
    vector = [float(s[i + 2]) for i in range(num_of_dimensions)]
    embeddings[word] = np.array(vector)
# dict(list(embeddings.items())[0:2])

Implement a function for calculating cosine similarity between word pairs.

In [6]:
def cosine_similarity(A, B):
    """Return the cosine of the angle between two vectors.

    Parameters
    ----------
    A, B : array-like of numbers
        Vectors of the same length. Plain lists and tuples are accepted as
        well as numpy arrays.

    Returns
    -------
    float
        ``sum(A*B) / sqrt(sum(A**2) * sum(B**2))``. If either vector has zero
        norm the cosine is undefined; ``0.0`` is returned in that case
        instead of NaN so downstream sorting and statistics stay well-defined.
    """
    a = np.asarray(A, dtype=float)
    b = np.asarray(B, dtype=float)
    denominator = np.sqrt(np.sum(a ** 2) * np.sum(b ** 2))
    # Guard against 0/0, which would otherwise yield NaN plus a RuntimeWarning.
    if denominator == 0:
        return 0.0
    return np.sum(a * b) / denominator

chuẩn hóa kết quả về thang \[0, 10\]

In [7]:
# Linearly rescale the cosine similarity from [-1, 1] to [0, 10].
(1 + cosine_similarity(embeddings['cute'], embeddings['xinh_xắn']) ) / 2 * 10

7.420035500261395

Test your program using word pairs in ViSim-400 dataset

In [8]:
def generate_floats(start, end, size, rng=None):
    """Draw random floats uniformly from the half-open interval [start, end).

    The previous implementation sampled the upper half of the interval and
    then flipped signs at random, which is only uniform over [start, end)
    when the interval is symmetric about zero (e.g. ``(-2, 2)``). For any
    other interval it produced values outside the requested range.

    Parameters
    ----------
    start : float
        Inclusive lower bound.
    end : float
        Exclusive upper bound.
    size : int or tuple of int
        Shape of the returned array.
    rng : object with a ``uniform(low, high, size)`` method, optional
        Random source, e.g. ``np.random.RandomState(seed)``, for reproducible
        draws. Defaults to numpy's global random state.

    Returns
    -------
    numpy.ndarray
        Array of the requested shape with values in [start, end).
    """
    sampler = np.random if rng is None else rng
    return sampler.uniform(start, end, size)

In [9]:
visim_path = "Visim-400.txt"
lines = read_file_to_list(visim_path)[1:]  # drop the first line of the file

results = []

for line in lines:
    s = line.split()
    w1 = s[0].strip()  # word 1

    # Words missing from the pre-trained embeddings are given a random vector
    # with components drawn from the range (-2, 2). The vector is stored in
    # `embeddings`, so a word that appears again reuses the same vector.
    if w1 not in embeddings:
        embeddings[w1] = generate_floats(-2, 2, num_of_dimensions)

    w2 = s[1].strip()  # word 2
    if w2 not in embeddings:
        embeddings[w2] = generate_floats(-2, 2, num_of_dimensions)

    # Rescale the cosine from [-1, 1] to [0, 10].
    cosine = cosine_similarity(embeddings[w1], embeddings[w2])
    sim = (1 + cosine) / 2 * 10
    results.append(sim)

# results[:5]

## Evaluate the correlation between the results and similarity scores in the dataset

In [10]:
# Split every dataset line into its whitespace-separated columns.
l = [line.split() for line in lines]
# Column 4 holds the Sim2 score (0-10 scale, per the original note).
# The builtin `float` is used because the `np.float` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
scores = np.array(l)[:,4].astype(float)

Using Pearson correlation coefficient

In [11]:
# Pearson correlation between `results` and `scores`; displays the coefficient and its p-value.
stats.pearsonr(results, scores)

(0.3357354165588622, 5.39275709671148e-12)

Using Spearman correlation coefficient

In [12]:
# Spearman rank correlation between `results` and `scores`; displays the coefficient and its p-value.
stats.spearmanr(results, scores)

SpearmanrResult(correlation=0.29379062171006726, pvalue=2.0968429623116215e-09)

# 2. K-nearest words  
Given a word w, find k most-similar words of w using the function implemented in 1.

In [13]:
# Words in the insertion order of `embeddings`; position i here lines up with
# the i-th value of `embeddings`.
vocab = [*embeddings.keys()]
vocab[:10]

['những',
 'tiết_lộ',
 'thú_vị',
 'về',
 'wallpaper',
 'mặc_định',
 'của',
 'windows',
 'xp',
 'chắc_hẳn']

In [14]:
# Embedding vectors in the insertion order of `embeddings`.
X = [*embeddings.values()]

In [15]:
from sklearn.cluster import KMeans

# Partition the embedding vectors into 20 clusters. The fixed random_state
# makes the centroid initialisation repeatable.
kmeans = KMeans(n_clusters=20, n_init=10, max_iter=100, tol=1e-04, random_state=0)
clustering = kmeans.fit(X)

In [16]:
# Cluster index the fitted KMeans model assigned to each row of X.
labels = clustering.labels_

In [17]:
def knearest_words(k, w):
    """Return indices into `vocab` of the words most similar to `w`.

    Only words that the fitted KMeans model (`clustering`) placed in the same
    cluster as `w` are compared, so the result is an approximation of the
    true nearest neighbours. Candidates are limited to the first
    `num_of_words` vocabulary entries.

    Relies on the notebook-level names `embeddings`, `vocab`, `labels`,
    `clustering`, `num_of_words` and `cosine_similarity`.

    Parameters
    ----------
    k : int
        Maximum number of neighbours to return.
    w : str
        Query word; must be a key of `embeddings`.

    Returns
    -------
    numpy.ndarray of int
        Up to `k` vocabulary indices, ordered from most to least similar.
        The query word itself is never included. Fewer than `k` indices are
        returned when the cluster holds fewer than `k` other words.

    Raises
    ------
    KeyError
        If `w` is not in `embeddings`.
    """
    query = embeddings[w]
    # reshape(1, -1) adapts to whatever dimensionality the embeddings have
    # instead of hard-coding 150.
    label = clustering.predict(query.reshape(1, -1)).item()

    n_candidates = min(num_of_words, len(vocab))
    # -inf marks "not a candidate". Unlike -1 it cannot collide with a
    # legitimate cosine value.
    scores = np.full(n_candidates, -np.inf)
    for i in range(n_candidates):
        # Exclude the query word by identity rather than assuming it will be
        # ranked first.
        if labels[i] == label and vocab[i] != w:
            scores[i] = cosine_similarity(embeddings[vocab[i]], query)

    order = np.argsort(scores)[::-1]
    # Drop non-candidates, and NaN similarities, which argsort places last
    # and the reversal would therefore rank first.
    order = order[np.isfinite(scores[order])]
    return order[:max(k, 0)]

In [18]:
# Translate the neighbour indices returned for 'xinh' back into words.
neighbour_indices = knearest_words(10, 'xinh')
kq = [vocab[idx] for idx in neighbour_indices]
kq

['đẹp',
 'dễ_thương',
 'đáng_yêu',
 'kute',
 'tôn_dáng',
 'đỏm_dáng',
 'nuột',
 'oách',
 'manly',
 'dừ']