In [45]:
import gensim
import io
import sys
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
import math

In [46]:
# Load the word vectors from the text-format (binary=False) word2vec file
# "Word2vec.vec". measure_word_similar indexes this object by word.
model = gensim.models.KeyedVectors.load_word2vec_format("Word2vec.vec", binary=False)

In [47]:
def read_file(link_file):
    """Read a UTF-8 text file and return its lines without the first one.

    Args:
        link_file: Path of the file to read.

    Returns:
        list[str]: Every line after the first one (the first line is dropped),
        with line endings kept. Empty when the file has at most one line.
    """
    # The context manager closes the handle even if reading raises, instead of
    # leaving the file open until the garbage collector gets to it.
    with open(link_file, encoding='utf-8') as f:
        return f.readlines()[1:]

def get_cosine_distance(u, v):
    """Return dot(u, v) / (|u| * |v|) for two indexable numeric sequences.

    Despite the name, the returned value is the cosine of the angle between
    the two vectors (a similarity), not a distance.

    The dot product and |u| are accumulated over ``len(u)`` indices, while
    |v| is accumulated over ``len(v)`` indices.
    """
    dot_product = sum(u[k] * v[k] for k in range(len(u)))
    squared_len_u = sum(u[k] * u[k] for k in range(len(u)))
    squared_len_v = sum(v[k] * v[k] for k in range(len(v)))
    denominator = math.sqrt(squared_len_u) * math.sqrt(squared_len_v)
    return dot_product / denominator

def get_dice_distance(u, v):
    """Return 2 * sum(min(u_i, v_i)) / sum(u_i + v_i) over the indices of ``u``.

    Both sums run over ``len(u)`` positions, so ``v`` must be indexable at
    every one of them.
    """
    overlap = 0
    total = 0
    for k in range(len(u)):
        overlap += min(u[k], v[k])
        total += u[k] + v[k]
    return 2.0 * overlap / total

In [48]:
# Load the noun, verb and adjective word-pair files from datasets/ViCon-400.
# read_file drops the first line of each file and returns the remaining lines.
# NOTE(review): the adjective file is named 600_adj_pairs.txt while the other
# two are 400_* — confirm this filename is intended.
noun_pairs_dataset = read_file("datasets/ViCon-400/400_noun_pairs.txt")
verb_pairs_dataset = read_file("datasets/ViCon-400/400_verb_pairs.txt")
adj_pairs_dataset = read_file("datasets/ViCon-400/600_adj_pairs.txt")

In [49]:
def measure_word_similar(dataset, distance_type):
    """Score every word pair in ``dataset`` with the chosen measure.

    Args:
        dataset: Iterable of text lines. The first two whitespace-separated
            tokens of each line are the two words to compare; any further
            tokens are ignored.
        distance_type: ``"COSINE"`` (uses ``get_cosine_distance``) or
            ``"DICE"`` (uses ``get_dice_distance``).

    Returns:
        list: One score per pair that could be scored, in input order. Lines
        with fewer than two tokens, pairs containing a word that ``model``
        cannot look up, and pairs whose score divides by zero are skipped.

    Raises:
        ValueError: If ``distance_type`` is neither ``"COSINE"`` nor ``"DICE"``.
    """
    # Reject unknown measures up front; otherwise no score would ever be
    # computed and the caller would silently get an empty list.
    if distance_type not in ("DICE", "COSINE"):
        raise ValueError(
            "distance_type must be 'COSINE' or 'DICE', got %r" % (distance_type,)
        )

    results = []
    for line in dataset:
        words = line.split()
        # Blank or malformed lines (e.g. a trailing newline at end of file)
        # carry no pair to score.
        if len(words) < 2:
            continue
        # split() already removes surrounding whitespace from each token.
        word1, word2 = words[0], words[1]
        try:
            vector1 = model[word1]  # vector lookup; KeyError if the word is unknown
            vector2 = model[word2]
            if distance_type == "DICE":
                distance = get_dice_distance(vector1, vector2)
            else:
                distance = get_cosine_distance(vector1, vector2)
        except (KeyError, ZeroDivisionError):
            # Best effort: skip pairs that cannot be scored, but let every
            # other error (including KeyboardInterrupt) propagate.
            continue
        results.append(distance)
    return results

In [50]:
# Score each word-pair dataset twice: first with the "DICE" measure, then with
# the "COSINE" measure. Each result is the list returned by
# measure_word_similar for that dataset.
result_noun_pairs_dataset = measure_word_similar(noun_pairs_dataset, "DICE")
result_verb_pairs_dataset = measure_word_similar(verb_pairs_dataset, "DICE")
result_adj_pairs_dataset = measure_word_similar(adj_pairs_dataset, "DICE")
result_cosine_noun_pairs_dataset = measure_word_similar(noun_pairs_dataset, "COSINE")
result_cosine_verb_pairs_dataset = measure_word_similar(verb_pairs_dataset, "COSINE")
result_cosine_adj_pairs_dataset = measure_word_similar(adj_pairs_dataset, "COSINE")