In [1]:
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm
from polyglot.mapping import Embedding

In [2]:
# Neighbourhood sizes (top-k nearest neighbours) evaluated for every word pair.
neighbour_list = [5, 10, 25, 50, 75, 100]
# Embedding families to run; each name becomes part of the generated column names.
embedding_list = ['polyglot']
# Root directory that holds one sub-directory of embeddings per language code.
embedding_path = '../data/pretrained_embeddings/polyglot/embeddings2/'

In [3]:
def get_embedding_file_name(language):
    """Translate a language name into the short code of its embedding directory.

    Parameters
    ----------
    language : str
        Language name, e.g. "German" or "Chinese (Simplified)". Matching is
        exact and case-sensitive.

    Returns
    -------
    str
        The code for the language (e.g. 'de'), or an empty string when the
        language is not in the table.
    """
    codes_by_language = {
        "Russian": 'ru',
        "Turkish": 'tr',
        "German": 'de',
        "Japanese": 'ja',
        "Spanish": 'es',
        "Polish": 'pl',
        "Italian": 'it',
        "Catalan": 'ca',
        "Korean": 'ko',
        "French": 'fr',
        # Both spellings of Chinese share one embedding directory.
        "Chinese": 'zh',
        "Chinese (Simplified)": 'zh',
        "Portuguese": 'pt',
        "Swedish": 'sv',
        "Greek": 'el',
        "Thai": 'th',
        "Dutch": 'nl',
        "English": 'en',
    }
    return codes_by_language.get(language, '')

def load_embedding_vector(lang):
    """Load the embedding archive stored for one language code.

    The archive path is built as ``embedding_path + lang + '/embeddings_pkl.tar.bz2'``,
    where ``embedding_path`` is the notebook-level setting, and the file is
    opened with ``Embedding.load``.

    Parameters
    ----------
    lang : str
        Language code naming the sub-directory, e.g. 'en'.

    Returns
    -------
    The object returned by ``Embedding.load`` for that archive.
    """
    archive_path = embedding_path + lang + '/embeddings_pkl.tar.bz2'
    return Embedding.load(archive_path)

In [4]:
def proper_case(word):
    """Return ``word`` with its first character upper-cased and the rest untouched.

    Parameters
    ----------
    word : str
        Word to capitalise. May be empty.

    Returns
    -------
    str
        The capitalised word; an empty string is returned unchanged.
    """
    # word[:1] is '' for empty input, whereas word[0] would raise IndexError.
    return word[:1].upper() + word[1:]

def get_similarity(w1, w2, embeddings, size):
    """Score two words by how close each one lies to the other's nearest neighbours.

    The score is the mean of two averages: the cosine similarity between
    ``w1``'s vector and each of ``w2``'s ``size`` nearest neighbours, and the
    cosine similarity between ``w2``'s vector and each of ``w1``'s ``size``
    nearest neighbours.

    Each word is looked up independently: first exactly as given, then with its
    first letter capitalised. Previously a failed lookup of either word
    capitalised *both*, so a word that was already found could be replaced by
    its capitalised form, or the pair rejected because that form was missing.

    Parameters
    ----------
    w1, w2 : str
        The words to compare.
    embeddings
        Object supporting ``embeddings[word]`` (raising ``KeyError`` for unknown
        words) and ``embeddings.nearest_neighbors(word, top_k=...)``.
    size : int
        Number of nearest neighbours to use for each word.

    Returns
    -------
    float or int
        The averaged similarity, or the sentinel ``-10`` when either word is
        missing from ``embeddings`` in both spellings.
    """
    def resolve(word):
        # Returns (spelling found in the vocabulary, its vector).
        # Lets KeyError propagate when neither spelling is known.
        try:
            return word, embeddings[word]
        except KeyError:
            capitalised = proper_case(word)
            return capitalised, embeddings[capitalised]

    try:
        w1, w1_vec = resolve(w1)
        w2, w2_vec = resolve(w2)
    except KeyError:
        return -10

    # Neighbours are queried with the spelling that was actually found.
    w1_neighbours = embeddings.nearest_neighbors(w1, top_k=size)
    w2_neighbours = embeddings.nearest_neighbors(w2, top_k=size)

    w1_neighbour_vectors = [embeddings[word] for word in w1_neighbours]
    w2_neighbour_vectors = [embeddings[word] for word in w2_neighbours]

    # Each word is compared against the *other* word's neighbourhood.
    w1_cosine = np.average([cosine_similarity([w1_vec], [vec]) for vec in w2_neighbour_vectors])
    w2_cosine = np.average([cosine_similarity([w2_vec], [vec]) for vec in w1_neighbour_vectors])

    return np.average([w1_cosine, w2_cosine])

In [19]:
# Read the experiment table and order its rows by language, then preview it.
source = '../data/ms_final_experiments_with_wiki.csv'
pd_dataset = pd.read_csv(source).sort_values(by='language', ascending=True)
pd_dataset.head()

Unnamed: 0,c,i,id,l1_c,l1_i,language,type,code,l2_sim_cc_5,l2_sim_cc_10,...,l1_sim_cc_25,l1_sim_cc_50,l1_sim_cc_75,l1_sim_cc_100,l2_sim_wiki_5,l2_sim_wiki_10,l2_sim_wiki_25,l2_sim_wiki_50,l2_sim_wiki_75,l2_sim_wiki_100
0,plans,projects,person_2799,plans,projectes,Catalan,RN,ca,0.315914,0.289572,...,0.513301,0.508249,0.497365,0.494275,0.256624,0.263855,0.216694,0.194096,0.187172,0.185074
221,especially,specially,person_2421,especialment,especialment,Catalan,RY,ca,0.342199,0.336672,...,0.676789,0.635269,0.615089,0.602047,0.153657,0.153794,0.138989,0.123324,0.112214,0.106564
220,waste,lose,person_2421,malgastar,perdre,Catalan,RV,ca,0.197119,0.175892,...,0.425895,0.418449,0.414079,0.411164,0.092023,0.086759,0.080194,0.080709,0.076916,0.081092
219,ending,end,person_1465,final,final,Catalan,RN,ca,0.591592,0.500977,...,0.708057,0.666229,0.637293,0.616535,0.650926,0.507235,0.374291,0.322232,0.303969,0.291704
218,shot,view,person_1465,disparat,mostra,Catalan,RN,ca,0.2104,0.187495,...,0.237338,0.222958,0.225796,0.222587,0.104407,0.092781,0.089016,0.079768,0.077828,0.076215


In [20]:
def run_experiment_english(dataframe):
    """Add English neighbourhood-similarity columns to ``dataframe`` in place.

    For every embedding type in ``embedding_list`` the English embeddings are
    loaded, and for every size in ``neighbour_list`` each row's ``'c'`` / ``'i'``
    word pair is scored with ``get_similarity``. The scores are stored in a
    column named ``'l2_sim_<vector_type>_<size>'``. Loading and per-size
    timings are printed along the way.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table with the word columns ``'c'`` and ``'i'``; it gains one column
        per embedding type and neighbourhood size.

    Returns
    -------
    None
    """
    for vector_type in embedding_list:
        type_started = datetime.now()
        lang_code = get_embedding_file_name("English")
        print('Loading embeddings: ' + lang_code)
        embeddings = load_embedding_vector(lang_code)
        print('Embeddings ' + lang_code + ' loaded in ' + str(datetime.now() - type_started))

        row_count = dataframe.shape[0]
        for neighbour_size in neighbour_list:
            size_started = datetime.now()
            # Walk the two word columns in lockstep, one pair per row.
            word_pairs = zip(dataframe['c'], dataframe['i'])
            scores = [
                get_similarity(first_word, second_word, embeddings, neighbour_size)
                for first_word, second_word in tqdm(word_pairs, total=row_count)
            ]
            column_name = 'l2_sim_' + vector_type + '_' + str(neighbour_size)
            dataframe[column_name] = scores
            print('Finished run for neighbour size: ' + str(neighbour_size) + ' in ' + str(datetime.now() - size_started))

        print('Finished run for vector type: ' + vector_type + ' in ' + str(datetime.now() - type_started))

In [22]:
# Score every row's English word pair; the l2_sim_<type>_<size> columns are added to pd_dataset in place.
run_experiment_english(pd_dataset)

Loading embeddings: en


  0%|          | 3/4937 [00:00<03:51, 21.34it/s]

Embeddings en loaded in 0:00:02.179992


100%|██████████| 4937/4937 [03:33<00:00, 23.14it/s]
  0%|          | 3/4937 [00:00<03:46, 21.78it/s]

Finished run for neighbour size: 5 in 0:03:33.382582


100%|██████████| 4937/4937 [03:39<00:00, 22.44it/s]
  0%|          | 2/4937 [00:00<04:10, 19.67it/s]

Finished run for neighbour size: 10 in 0:03:39.987259


100%|██████████| 4937/4937 [04:02<00:00, 20.39it/s]
  0%|          | 2/4937 [00:00<04:42, 17.48it/s]

Finished run for neighbour size: 25 in 0:04:02.131728


100%|██████████| 4937/4937 [04:36<00:00, 17.87it/s]
  0%|          | 2/4937 [00:00<05:13, 15.76it/s]

Finished run for neighbour size: 50 in 0:04:36.297518


100%|██████████| 4937/4937 [05:08<00:00, 16.01it/s]
  0%|          | 2/4937 [00:00<05:51, 14.04it/s]

Finished run for neighbour size: 75 in 0:05:08.412277


100%|██████████| 4937/4937 [05:43<00:00, 14.37it/s]

Finished run for neighbour size: 100 in 0:05:43.475218
Finished run for vector type: polyglot in 0:26:45.867292





In [23]:
# Save the table, now including the English similarity columns; the row index is not written.
pd_dataset.to_csv('../data/ms_final_polyglot_english_2.csv', index = False)

In [5]:
def get_experiment_model(language, vector_type):
    """Load the embeddings for ``language`` and print how long loading took.

    Parameters
    ----------
    language : str
        Language name understood by ``get_embedding_file_name``.
    vector_type : str
        Accepted for the caller's benefit; not used when choosing the file.

    Returns
    -------
    The embeddings object produced by ``load_embedding_vector``.
    """
    started = datetime.now()
    lang_code = get_embedding_file_name(language)
    print('Loading embeddings: ' + lang_code)
    model = load_embedding_vector(lang_code)
    elapsed = datetime.now() - started
    print('Embeddings ' + lang_code + ' loaded in ' + str(elapsed))
    return model
# Accumulator filled by run_experiment_l1: neighbourhood size (as a string) -> list of scores.
new_columns = {'5': [], '10': [], '25': [], '50': [], '75': [], '100': []}
def run_experiment_l1(dataframe):
    """Score every row's ``'l1_c'`` / ``'l1_i'`` word pair with its own language's embeddings.

    Languages are processed in ``language_list`` order. For each language the
    matching rows of ``dataframe`` are scored once per size in
    ``neighbour_list``, and the scores are appended to
    ``new_columns[str(size)]``. ``dataframe`` itself is not modified.
    Timings are printed per neighbour size, language and vector type.

    The accumulator is emptied at the start of every call, so calling the
    function again (for instance after an interrupted run) does not leave
    stale or duplicated scores behind.

    NOTE(review): scores for every entry of ``embedding_list`` go into the same
    lists, so the lists only line up with the table when a single embedding
    type is configured.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table with the columns ``'language'``, ``'l1_c'`` and ``'l1_i'``.

    Returns
    -------
    None
    """
    # Reset in place so existing references to the lists stay valid.
    for previous_scores in new_columns.values():
        del previous_scores[:]

    for vector_type in embedding_list:
        vec_type_start = datetime.now()
        for language in language_list:
            language_start = datetime.now()
            embeddings = get_experiment_model(language, vector_type)
            # The language's rows are the same for every neighbour size: select them once.
            dset = dataframe[dataframe['language'] == language]
            for neighbour_size in neighbour_list:
                neighbour_size_start = datetime.now()
                # setdefault tolerates sizes that are missing from the initial dict.
                scores = new_columns.setdefault(str(neighbour_size), [])
                for ind, row in tqdm(dset.iterrows(), total=dset.shape[0]):
                    w1 = row['l1_c']
                    w2 = row['l1_i']
                    scores.append(get_similarity(w1, w2, embeddings, neighbour_size))
                print('Finished run for neighbour size: ' + str(neighbour_size) + ' in ' + str(datetime.now() - neighbour_size_start))
            print('Finished run for language: ' + language + ' in ' + str(datetime.now() - language_start))
        print('Finished run for vector type: ' + vector_type + ' in ' + str(datetime.now() - vec_type_start))

In [6]:
# Settings for the per-language run; same values as the configuration cell near the top.
neighbour_list = [5, 10, 25, 50, 75, 100]
embedding_list = ['polyglot']
embedding_path = '../data/pretrained_embeddings/polyglot/embeddings2/'
# Start from the table that already contains the English similarity columns.
source = '../data/ms_final_polyglot_english_2.csv'

pd_dataset = pd.read_csv(source)
pd_dataset = pd_dataset.sort_values(by = 'language', ascending = True)
# Distinct languages in order of first appearance in the sorted table;
# run_experiment_l1 iterates over this list.
language_list = pd_dataset['language'].unique()

In [None]:
# Score the l1_c / l1_i word pairs language by language; results are collected in new_columns.
run_experiment_l1(pd_dataset)

Loading embeddings: ca


  1%|          | 2/325 [00:00<00:17, 18.54it/s]

Embeddings ca loaded in 0:00:02.389188


100%|██████████| 325/325 [00:14<00:00, 21.96it/s]
  1%|          | 3/325 [00:00<00:15, 21.02it/s]

Finished run for neighbour size: 5 in 0:00:14.807869


100%|██████████| 325/325 [00:15<00:00, 21.31it/s]
  1%|          | 2/325 [00:00<00:16, 19.29it/s]

Finished run for neighbour size: 10 in 0:00:15.252374


100%|██████████| 325/325 [00:16<00:00, 19.62it/s]
  1%|          | 2/325 [00:00<00:19, 16.86it/s]

Finished run for neighbour size: 25 in 0:00:16.567938


100%|██████████| 325/325 [00:18<00:00, 17.51it/s]
  1%|          | 2/325 [00:00<00:23, 13.49it/s]

Finished run for neighbour size: 50 in 0:00:18.568238


100%|██████████| 325/325 [00:20<00:00, 15.68it/s]
  1%|          | 2/325 [00:00<00:23, 13.89it/s]

Finished run for neighbour size: 75 in 0:00:20.735994


100%|██████████| 325/325 [00:22<00:00, 14.17it/s]


Finished run for neighbour size: 100 in 0:00:22.942075
Finished run for language: Catalan in 0:01:51.264242
Loading embeddings: zh


  1%|          | 3/310 [00:00<00:14, 21.84it/s]

Embeddings zh loaded in 0:00:02.147104


100%|██████████| 310/310 [00:12<00:00, 23.98it/s]
  1%|          | 3/310 [00:00<00:13, 22.68it/s]

Finished run for neighbour size: 5 in 0:00:12.930677


100%|██████████| 310/310 [00:13<00:00, 23.39it/s]
  1%|          | 2/310 [00:00<00:15, 19.43it/s]

Finished run for neighbour size: 10 in 0:00:13.257554


100%|██████████| 310/310 [00:14<00:00, 21.41it/s]
  1%|          | 2/310 [00:00<00:17, 17.50it/s]

Finished run for neighbour size: 25 in 0:00:14.481830


100%|██████████| 310/310 [00:16<00:00, 18.84it/s]
  1%|          | 2/310 [00:00<00:20, 15.31it/s]

Finished run for neighbour size: 50 in 0:00:16.455381


100%|██████████| 310/310 [00:18<00:00, 16.83it/s]
  1%|          | 2/310 [00:00<00:20, 14.78it/s]

Finished run for neighbour size: 75 in 0:00:18.421487


100%|██████████| 310/310 [00:20<00:00, 15.13it/s]


Finished run for neighbour size: 100 in 0:00:20.495805
Finished run for language: Chinese (Simplified) in 0:01:38.195283
Loading embeddings: nl


 60%|██████    | 3/5 [00:00<00:00, 22.27it/s]

Embeddings nl loaded in 0:00:02.228380


100%|██████████| 5/5 [00:00<00:00, 22.29it/s]
 60%|██████    | 3/5 [00:00<00:00, 22.17it/s]

Finished run for neighbour size: 5 in 0:00:00.228454


100%|██████████| 5/5 [00:00<00:00, 21.97it/s]
 60%|██████    | 3/5 [00:00<00:00, 20.81it/s]

Finished run for neighbour size: 10 in 0:00:00.231913


100%|██████████| 5/5 [00:00<00:00, 20.54it/s]
 40%|████      | 2/5 [00:00<00:00, 17.92it/s]

Finished run for neighbour size: 25 in 0:00:00.247876


100%|██████████| 5/5 [00:00<00:00, 16.37it/s]
 40%|████      | 2/5 [00:00<00:00, 15.95it/s]

Finished run for neighbour size: 50 in 0:00:00.309900


100%|██████████| 5/5 [00:00<00:00, 15.76it/s]
 40%|████      | 2/5 [00:00<00:00, 14.26it/s]

Finished run for neighbour size: 75 in 0:00:00.321769


100%|██████████| 5/5 [00:00<00:00, 14.14it/s]


Finished run for neighbour size: 100 in 0:00:00.357744
Finished run for language: Dutch in 0:00:03.930616
Loading embeddings: fr


  0%|          | 3/794 [00:00<00:42, 18.49it/s]

Embeddings fr loaded in 0:00:02.134273


100%|██████████| 794/794 [00:34<00:00, 22.82it/s]
  0%|          | 3/794 [00:00<00:35, 22.16it/s]

Finished run for neighbour size: 5 in 0:00:34.792262


100%|██████████| 794/794 [00:35<00:00, 22.19it/s]
  0%|          | 3/794 [00:00<00:38, 20.41it/s]

Finished run for neighbour size: 10 in 0:00:35.791464


 66%|██████▌   | 521/794 [00:25<00:13, 20.48it/s]

In [9]:
# Attach the collected scores to the table, one column per neighbourhood size.
# The lists are assigned by position, so each must hold exactly one score per
# row of pd_dataset, in the table's current row order.
for size_key, size_scores in new_columns.items():
    pd_dataset['l1_sim_polyglot_' + str(size_key)] = size_scores

In [10]:
# Save the table with the l1_sim_polyglot_* columns included; the row index is not written.
pd_dataset.to_csv('../data/ms_final_with_polyglot_2.csv', index = False)

In [11]:
# Preview the first rows of the table that was just saved.
pd_dataset.head()

Unnamed: 0,c,i,id,l1_c,l1_i,language,type,code,l2_sim_cc_5,l2_sim_cc_10,...,l2_sim_polyglot_25,l2_sim_polyglot_50,l2_sim_polyglot_75,l2_sim_polyglot_100,l1_sim_polyglot_5,l1_sim_polyglot_10,l1_sim_polyglot_25,l1_sim_polyglot_50,l1_sim_polyglot_75,l1_sim_polyglot_100
0,plans,projects,person_2799,plans,projectes,Catalan,RN,ca,0.315914,0.289572,...,0.652385,0.64229,0.636711,0.636448,0.706547,0.719866,0.709139,0.681242,0.676333,0.667461
221,independence,freedom,person_2132,independència,llibertat,Catalan,RN,ca,0.4783,0.455483,...,0.633258,0.621581,0.608173,0.601868,0.67352,0.658533,0.656046,0.656317,0.647447,0.646497
220,stars,starts,person_2127,estrelles,comença,Catalan,RN,ca,0.068227,0.068642,...,0.312807,0.32328,0.315747,0.309649,-0.086862,-0.071272,-0.057636,-0.068105,-0.065801,-0.070737
219,time,hours,person_2127,temps,hores,Catalan,RN,ca,0.464017,0.407275,...,0.270096,0.236612,0.215306,0.203645,0.254001,0.308201,0.277933,0.270908,0.253394,0.249989
218,coming,following,person_2127,venint,següents,Catalan,RJ,ca,0.183514,0.166062,...,0.274542,0.268298,0.258854,0.258467,0.017321,-0.002845,-0.019284,-0.027436,-0.031775,-0.024451


In [3]:
# Load the English and Russian embeddings for a manual look at their neighbourhoods.
archive_suffix = '/embeddings_pkl.tar.bz2'
english_embedding = Embedding.load(embedding_path + 'en' + archive_suffix)
russian_embedding = Embedding.load(embedding_path + 'ru' + archive_suffix)

In [4]:
# Ten nearest neighbours of an English word pair and of a Russian word pair,
# printed one list per line in the order: English, English, Russian, Russian.
getting_n = english_embedding.nearest_neighbors("getting", top_k=10)
acq_n = english_embedding.nearest_neighbors("acquiring", top_k=10)

w1_n = russian_embedding.nearest_neighbors("получение", top_k=10)
w2_n = russian_embedding.nearest_neighbors("приобретения", top_k=10)

for neighbour_words in (getting_n, acq_n, w1_n, w2_n):
    print(neighbour_words)

['pulling', 'putting', 'keeping', 'sneaking', 'LEFTnot', 'fetching', 'carrying', 'practically', 'staking', 'soliciting']
['adopting', 'supplying', 'overseeing', 'establishing', 'securing', 'obtaining', 'administering', 'purchasing', 'introducing', 'executing']
['устранение', 'сохранение', 'поддержание', 'создание', 'осуществление', 'восстановление', 'составление', 'поощрение', 'улучшение', 'уничтожение']
['достижения', 'установления', 'покупки', 'признания', 'использования', 'преобразования', 'формирования', 'посещения', 'объявления', 'построения']
