In [1]:
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from datetime import datetime

In [2]:
def get_similarity(w1, w2, embeddings, size):
    """Neighbourhood-based similarity between two words.

    For each word, the `size` nearest neighbours of the *other* word are
    retrieved with `embeddings.similar_by_vector`, and the cosine similarity
    between the word's own vector and every one of those neighbour vectors is
    averaged. The result is the mean of the two directional averages.

    Parameters
    ----------
    w1, w2 : keys looked up in `embeddings`.
    embeddings : object supporting `embeddings[key]` and
        `embeddings.similar_by_vector(vector, topn=...)`.
    size : number of neighbours requested (passed as `topn`).

    Returns
    -------
    The averaged similarity, or the sentinel ``-10`` when either word is
    missing from `embeddings` (i.e. the lookup raises ``KeyError``).
    """
    try:
        w1_vector = embeddings[w1]
        w2_vector = embeddings[w2]
    except KeyError:
        # Out-of-vocabulary sentinel: deliberately outside the [-1, 1] cosine
        # range so callers can tell it apart from a real score.
        return -10

    def mean_cosine_to_neighbourhood(query_vector, anchor_vector):
        """Average cosine of `query_vector` against the neighbours of `anchor_vector`."""
        neighbours = embeddings.similar_by_vector(anchor_vector, topn=size)
        if not neighbours:
            # No neighbours returned: the mean of nothing is NaN, which is
            # what averaging an empty list would give.
            return np.nan
        neighbour_vectors = [embeddings[word] for word, _score in neighbours]
        # One batched call: the query vector is checked and normalised once
        # rather than once per neighbour.
        return np.average(cosine_similarity([query_vector], neighbour_vectors))

    w1_cosine = mean_cosine_to_neighbourhood(w1_vector, w2_vector)
    w2_cosine = mean_cosine_to_neighbourhood(w2_vector, w1_vector)

    return np.average([w1_cosine, w2_cosine])

In [11]:
# Read the experiment data and order the rows by the `language` column
# (ascending), then preview the first rows.
source = '../data/experiment_final.csv'
pd_dataset = pd.read_csv(source).sort_values('language')
pd_dataset.head()

Unnamed: 0,c,i,id,l1_c,l1_i,language,type,code
2827,plans,projects,person_2799,plans,projectes,Catalan,RN,ca
2733,waste,lose,person_2421,malgastar,perdre,Catalan,RV,ca
2732,ending,end,person_1465,final,final,Catalan,RN,ca
2731,shot,view,person_1465,disparat,mostra,Catalan,RN,ca
2730,do,join,person_1465,fer,unir,Catalan,RV,ca


In [13]:
# Neighbourhood sizes to evaluate — presumably the `size` values passed to
# get_similarity; confirm against the experiment loop.
neighbour_list = [5, 10, 25, 50, 75, 100]
# Embedding families to evaluate — presumably the `file_type` prefix given to
# get_embedding_file_name (e.g. 'cc' -> 'cc.<code>.vec'); confirm against usage.
embedding_list = ['cc']
# Directory holding the pretrained embedding files. The trailing slash matters:
# load_embedding_vector joins this with the file name by plain concatenation.
embedding_path = '../data/pretrained_embeddings/'

In [14]:
# English language name (e.g. "Catalan") -> two-letter code used in the
# embedding file names.
_LANGUAGE_CODES = {
    "Russian": "ru",
    "Turkish": "tr",
    "German": "de",
    "Japanese": "ja",
    "Spanish": "es",
    "Polish": "pl",
    "Italian": "it",
    "Catalan": "ca",
    "Korean": "ko",
    "French": "fr",
    "Chinese": "zh",
    "Portuguese": "pt",
    "Swedish": "sv",
    "Greek": "el",
    "Thai": "th",
    "Dutch": "nl",
}


def get_embedding_file_name(language, file_type):
    """Build the embedding file name ``<file_type>.<code>.vec`` for a language.

    Parameters
    ----------
    language : English language name, matched exactly (case-sensitive)
        against the supported names, e.g. ``"Catalan"``.
    file_type : prefix placed before the language code, e.g. ``"cc"``.

    Returns
    -------
    str such as ``"cc.ca.vec"``.

    Raises
    ------
    ValueError
        If `language` is not one of the supported names. Failing here avoids
        silently producing a malformed name with an empty code
        (``"<file_type>..vec"``) that would only fail later at load time.
    """
    try:
        code = _LANGUAGE_CODES[language]
    except (KeyError, TypeError):
        supported = ", ".join(sorted(_LANGUAGE_CODES))
        raise ValueError(
            "Unsupported language {!r}; expected one of: {}".format(language, supported)
        ) from None

    return file_type + '.' + code + '.vec'

def load_embedding_vector(file_name):
    """Load the vectors stored in `file_name` under `embedding_path`.

    The path is formed by appending `file_name` directly to the module-level
    `embedding_path` string, and the file is read with
    `KeyedVectors.load_word2vec_format`.
    """
    full_path = embedding_path + file_name
    # binary=False selects the word2vec C text format rather than the binary one.
    vectors = KeyedVectors.load_word2vec_format(full_path, binary=False)
    return vectors