In [1]:
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm
from polyglot.mapping import Embedding

In [2]:
# Neighbourhood sizes to evaluate; each value is passed as `size` to get_similarity
# and ends up as the numeric suffix of a result column.
neighbour_list = [5, 10, 25, 50, 75, 100]
# Embedding families to run; the name becomes part of the result column names.
embedding_list = ['polyglot']
# Root directory of the embeddings: load_embedding_vector appends
# '<language code>/embeddings_pkl.tar.bz2' to this prefix (note the trailing slash).
embedding_path = '../data/pretrained_embeddings/polyglot/embeddings2/'

In [2]:
def get_embedding_file_name(language):
    """Map a language name from the dataset to its two-letter embedding code.

    Both "Chinese" and "Chinese (Simplified)" map to 'zh'. A name that is not
    in the table yields an empty string.
    """
    codes_by_language = {
        "Russian": 'ru',
        "Turkish": 'tr',
        "German": 'de',
        "Japanese": 'ja',
        "Spanish": 'es',
        "Polish": 'pl',
        "Italian": 'it',
        "Catalan": 'ca',
        "Korean": 'ko',
        "French": 'fr',
        "Chinese": 'zh',
        "Chinese (Simplified)": 'zh',
        "Portuguese": 'pt',
        "Swedish": 'sv',
        "Greek": 'el',
        "Thai": 'th',
        "Dutch": 'nl',
        "English": 'en',
    }
    return codes_by_language.get(language, '')

def load_embedding_vector(lang):
    """Load the embedding archive for language code `lang` with polyglot's Embedding.load.

    The archive is expected at '<embedding_path><lang>/embeddings_pkl.tar.bz2'.
    """
    archive_path = ''.join([embedding_path, lang, '/embeddings_pkl.tar.bz2'])
    return Embedding.load(archive_path)

In [3]:
def _mean_cosine(vector, neighbour_vectors):
    """Mean cosine similarity between `vector` and each of `neighbour_vectors`."""
    if len(neighbour_vectors) == 0:
        # Nothing to compare against, so the mean is undefined.
        return np.nan
    # One batched call replaces one scikit-learn call (with its input
    # validation overhead) per neighbour.
    return np.average(cosine_similarity([vector], neighbour_vectors))


def get_similarity(w1, w2, embeddings, size, oov_score=-10):
    """Neighbourhood-based similarity between two words.

    Each word's vector is compared (cosine similarity) with the vectors of the
    OTHER word's `size` nearest neighbours; the two resulting means are then
    averaged into a single score.

    Parameters
    ----------
    w1, w2
        The words to compare; used as keys into `embeddings`.
    embeddings
        Object supporting `embeddings[word]` and
        `embeddings.nearest_neighbors(word, top_k=...)`.
    size
        Number of nearest neighbours requested for each word.
    oov_score
        Value returned when either word cannot be looked up. The default of
        -10 lies outside the [-1, 1] range of a cosine similarity, so it can be
        told apart from a real score.

    Returns
    -------
    The averaged neighbourhood similarity, or `oov_score` when looking up
    `w1` or `w2` raises KeyError.
    """
    try:
        w1_vec = embeddings[w1]
        w2_vec = embeddings[w2]
    except KeyError:
        return oov_score

    w1_neighbours = embeddings.nearest_neighbors(w1, top_k=size)
    w2_neighbours = embeddings.nearest_neighbors(w2, top_k=size)

    w1_neighbour_vectors = [embeddings[word] for word in w1_neighbours]
    w2_neighbour_vectors = [embeddings[word] for word in w2_neighbours]

    # Cross comparison: w1 against w2's neighbourhood and vice versa.
    w1_cosine = _mean_cosine(w1_vec, w2_neighbour_vectors)
    w2_cosine = _mean_cosine(w2_vec, w1_neighbour_vectors)

    return np.average([w1_cosine, w2_cosine])

In [46]:
# Read the experiment table and order its rows by language in one step,
# then preview the first rows.
source = '../data/ms_final_experiments_with_wiki.csv'
pd_dataset = pd.read_csv(source).sort_values('language', ascending=True)
pd_dataset.head()

Unnamed: 0,c,i,id,l1_c,l1_i,language,type,code,l2_sim_cc_5,l2_sim_cc_10,...,l1_sim_cc_25,l1_sim_cc_50,l1_sim_cc_75,l1_sim_cc_100,l2_sim_wiki_5,l2_sim_wiki_10,l2_sim_wiki_25,l2_sim_wiki_50,l2_sim_wiki_75,l2_sim_wiki_100
0,plans,projects,person_2799,plans,projectes,Catalan,RN,ca,0.315914,0.289572,...,0.513301,0.508249,0.497365,0.494275,0.256624,0.263855,0.216694,0.194096,0.187172,0.185074
221,especially,specially,person_2421,especialment,especialment,Catalan,RY,ca,0.342199,0.336672,...,0.676789,0.635269,0.615089,0.602047,0.153657,0.153794,0.138989,0.123324,0.112214,0.106564
220,waste,lose,person_2421,malgastar,perdre,Catalan,RV,ca,0.197119,0.175892,...,0.425895,0.418449,0.414079,0.411164,0.092023,0.086759,0.080194,0.080709,0.076916,0.081092
219,ending,end,person_1465,final,final,Catalan,RN,ca,0.591592,0.500977,...,0.708057,0.666229,0.637293,0.616535,0.650926,0.507235,0.374291,0.322232,0.303969,0.291704
218,shot,view,person_1465,disparat,mostra,Catalan,RN,ca,0.2104,0.187495,...,0.237338,0.222958,0.225796,0.222587,0.104407,0.092781,0.089016,0.079768,0.077828,0.076215


In [47]:
def run_experiment_english(dataframe):
    """Score every ('c', 'i') word pair with the English embeddings.

    For each entry of `embedding_list` the English embeddings are loaded, and
    for each size in `neighbour_list` get_similarity is evaluated on columns
    'c' and 'i' of every row. Each pass writes its scores to the column
    'l2_sim_<vector type>_<size>' of `dataframe`, which is modified in place.
    Loading and run times are printed along the way.
    """
    for vector_type in embedding_list:
        started_vector_type = datetime.now()
        lang_code = get_embedding_file_name("English")
        print('Loading embeddings: {}'.format(lang_code))
        embeddings = load_embedding_vector(lang_code)
        print('Embeddings {} loaded in {}'.format(lang_code, datetime.now() - started_vector_type))

        for neighbour_size in neighbour_list:
            started_size = datetime.now()
            # Walk the two word columns in lockstep, in row order.
            word_pairs = zip(dataframe['c'], dataframe['i'])
            scores = [
                get_similarity(first, second, embeddings, neighbour_size)
                for first, second in tqdm(word_pairs, total=dataframe.shape[0])
            ]
            dataframe['l2_sim_' + vector_type + '_' + str(neighbour_size)] = scores
            print('Finished run for neighbour size: {} in {}'.format(neighbour_size, datetime.now() - started_size))

        print('Finished run for vector type: {} in {}'.format(vector_type, datetime.now() - started_vector_type))

In [48]:
# Adds the 'l2_sim_polyglot_<size>' columns to pd_dataset in place
# (one column per entry of neighbour_list).
run_experiment_english(pd_dataset)

Loading embeddings: en


  0%|          | 3/4937 [00:00<03:52, 21.23it/s]

Embeddings en loaded in 0:00:02.218950


100%|██████████| 4937/4937 [03:36<00:00, 22.76it/s]
  0%|          | 3/4937 [00:00<03:52, 21.22it/s]

Finished run for neighbour size: 5 in 0:03:36.879297


100%|██████████| 4937/4937 [03:43<00:00, 22.11it/s]
  0%|          | 2/4937 [00:00<04:14, 19.38it/s]

Finished run for neighbour size: 10 in 0:03:43.256651


100%|██████████| 4937/4937 [04:01<00:00, 20.42it/s]
  0%|          | 2/4937 [00:00<04:57, 16.58it/s]

Finished run for neighbour size: 25 in 0:04:01.723889


100%|██████████| 4937/4937 [04:37<00:00, 17.80it/s]
  0%|          | 2/4937 [00:00<05:19, 15.43it/s]

Finished run for neighbour size: 50 in 0:04:37.384221


100%|██████████| 4937/4937 [05:10<00:00, 15.91it/s]
  0%|          | 2/4937 [00:00<05:47, 14.18it/s]

Finished run for neighbour size: 75 in 0:05:10.384337


100%|██████████| 4937/4937 [05:45<00:00, 14.30it/s]

Finished run for neighbour size: 100 in 0:05:45.228115
Finished run for vector type: polyglot in 0:26:57.076164





In [54]:
# Save the table including the English polyglot columns; the DataFrame index is not written.
pd_dataset.to_csv('../data/ms_final_polyglot_english.csv', index = False)

In [4]:
def get_experiment_model(language, vector_type):
    """Load and return the embeddings for `language`, printing the load time.

    `language` is a language name understood by get_embedding_file_name.
    `vector_type` is not used by this function.
    """
    load_started = datetime.now()
    lang_code = get_embedding_file_name(language)
    print('Loading embeddings: {}'.format(lang_code))
    model = load_embedding_vector(lang_code)
    elapsed = datetime.now() - load_started
    print('Embeddings {} loaded in {}'.format(lang_code, elapsed))
    return model
# Score accumulator filled by run_experiment_l1: one independent list per
# neighbourhood size, keyed by the size written as a string.
new_columns = {size_key: [] for size_key in ('5', '10', '25', '50', '75', '100')}
def run_experiment_l1(dataframe):
    """Score every ('l1_c', 'l1_i') word pair with its own language's embeddings.

    For each entry of `embedding_list` and each language in `language_list`,
    the embeddings of that language are loaded and get_similarity is evaluated
    on columns 'l1_c' and 'l1_i' of the rows whose 'language' equals it, once
    per size in `neighbour_list`. Scores are appended to the module-level
    `new_columns[str(size)]` lists, in `language_list` order and, within a
    language, in `dataframe` row order. `dataframe` itself is not modified.

    The accumulator lists are emptied on entry, so calling the function again
    replaces the previous scores instead of stacking a second set on top of
    them. Progress and timings are printed along the way.
    """
    # The accumulator lives at module level; without this reset a second call
    # would double the list lengths and break the later column assignment.
    for accumulated in new_columns.values():
        del accumulated[:]

    # NOTE(review): scores are keyed by neighbour size only, so with more than
    # one entry in embedding_list the lists would mix vector types.
    for vector_type in embedding_list:
        vec_type_start = datetime.now()
        for language in language_list:
            language_start = datetime.now()
            embeddings = get_experiment_model(language, vector_type)
            # The per-language subset does not depend on the neighbour size,
            # so it is selected once per language.
            dset = dataframe[dataframe['language'] == language]
            for neighbour_size in neighbour_list:
                neighbour_size_start = datetime.now()
                scores = new_columns[str(neighbour_size)]
                for ind, row in tqdm(dset.iterrows(), total=dset.shape[0]):
                    w1 = row['l1_c']
                    w2 = row['l1_i']
                    scores.append(get_similarity(w1, w2, embeddings, neighbour_size))
                print('Finished run for neighbour size: ' + str(neighbour_size) + ' in ' + str(datetime.now() - neighbour_size_start))
            print('Finished run for language: ' + language + ' in ' + str(datetime.now() - language_start))
        print('Finished run for vector type: ' + vector_type + ' in ' + str(datetime.now() - vec_type_start))

In [5]:
# Configuration for the L1 run. Re-declared here with the same values as the
# configuration cell at the top of the notebook.
neighbour_list = [5, 10, 25, 50, 75, 100]
embedding_list = ['polyglot']
embedding_path = '../data/pretrained_embeddings/polyglot/embeddings2/'
# Input is the table saved after the English run.
source = '../data/ms_final_polyglot_english.csv'

pd_dataset = pd.read_csv(source)
# Sorting by language keeps each language's rows contiguous, which the
# positional assignment of the accumulated score lists relies on.
pd_dataset = pd_dataset.sort_values(by = 'language', ascending = True)
# Distinct languages in the order they appear in the sorted table.
language_list = pd_dataset['language'].unique()

In [6]:
# Fills the module-level new_columns lists with the per-language scores;
# pd_dataset itself is not changed by this call.
run_experiment_l1(pd_dataset)

Loading embeddings: ca


  1%|          | 2/325 [00:00<00:17, 18.62it/s]

Embeddings ca loaded in 0:00:02.360815


100%|██████████| 325/325 [00:14<00:00, 21.76it/s]
  1%|          | 3/325 [00:00<00:15, 20.82it/s]

Finished run for neighbour size: 5 in 0:00:14.940685


100%|██████████| 325/325 [00:15<00:00, 21.24it/s]
  1%|          | 2/325 [00:00<00:16, 19.28it/s]

Finished run for neighbour size: 10 in 0:00:15.302875


100%|██████████| 325/325 [00:16<00:00, 19.60it/s]
  1%|          | 2/325 [00:00<00:19, 16.79it/s]

Finished run for neighbour size: 25 in 0:00:16.585558


100%|██████████| 325/325 [00:18<00:00, 17.36it/s]
  1%|          | 2/325 [00:00<00:21, 14.83it/s]

Finished run for neighbour size: 50 in 0:00:18.730687


100%|██████████| 325/325 [00:20<00:00, 15.67it/s]
  1%|          | 2/325 [00:00<00:27, 11.83it/s]

Finished run for neighbour size: 75 in 0:00:20.748252


100%|██████████| 325/325 [00:22<00:00, 14.17it/s]


Finished run for neighbour size: 100 in 0:00:22.934544
Finished run for language: Catalan in 0:01:51.603862
Loading embeddings: zh


  1%|          | 2/310 [00:00<00:15, 19.74it/s]

Embeddings zh loaded in 0:00:02.155713


100%|██████████| 310/310 [00:13<00:00, 23.77it/s]
  1%|          | 3/310 [00:00<00:14, 21.42it/s]

Finished run for neighbour size: 5 in 0:00:13.043911


100%|██████████| 310/310 [00:13<00:00, 23.04it/s]
  1%|          | 3/310 [00:00<00:14, 20.64it/s]

Finished run for neighbour size: 10 in 0:00:13.457090


100%|██████████| 310/310 [00:14<00:00, 21.36it/s]
  1%|          | 2/310 [00:00<00:20, 15.01it/s]

Finished run for neighbour size: 25 in 0:00:14.519836


100%|██████████| 310/310 [00:16<00:00, 18.73it/s]
  1%|          | 2/310 [00:00<00:18, 16.27it/s]

Finished run for neighbour size: 50 in 0:00:16.554347


100%|██████████| 310/310 [00:18<00:00, 16.78it/s]
  1%|          | 2/310 [00:00<00:22, 13.62it/s]

Finished run for neighbour size: 75 in 0:00:18.478450


100%|██████████| 310/310 [00:20<00:00, 15.10it/s]


Finished run for neighbour size: 100 in 0:00:20.540344
Finished run for language: Chinese (Simplified) in 0:01:38.755089
Loading embeddings: nl


 60%|██████    | 3/5 [00:00<00:00, 22.78it/s]

Embeddings nl loaded in 0:00:02.129841


100%|██████████| 5/5 [00:00<00:00, 22.64it/s]
 40%|████      | 2/5 [00:00<00:00, 17.62it/s]

Finished run for neighbour size: 5 in 0:00:00.225480


100%|██████████| 5/5 [00:00<00:00, 20.01it/s]
 60%|██████    | 3/5 [00:00<00:00, 20.54it/s]

Finished run for neighbour size: 10 in 0:00:00.254377


100%|██████████| 5/5 [00:00<00:00, 20.28it/s]
 40%|████      | 2/5 [00:00<00:00, 17.91it/s]

Finished run for neighbour size: 25 in 0:00:00.251044


100%|██████████| 5/5 [00:00<00:00, 17.78it/s]
 40%|████      | 2/5 [00:00<00:00, 16.31it/s]

Finished run for neighbour size: 50 in 0:00:00.286093


100%|██████████| 5/5 [00:00<00:00, 15.73it/s]
 40%|████      | 2/5 [00:00<00:00, 14.29it/s]

Finished run for neighbour size: 75 in 0:00:00.322302


100%|██████████| 5/5 [00:00<00:00, 13.97it/s]


Finished run for neighbour size: 100 in 0:00:00.362300
Finished run for language: Dutch in 0:00:03.835173
Loading embeddings: fr


  0%|          | 3/794 [00:00<00:34, 22.75it/s]

Embeddings fr loaded in 0:00:02.168066


100%|██████████| 794/794 [00:34<00:00, 22.83it/s]
  0%|          | 3/794 [00:00<00:36, 21.78it/s]

Finished run for neighbour size: 5 in 0:00:34.787100


100%|██████████| 794/794 [00:35<00:00, 22.12it/s]
  0%|          | 3/794 [00:00<00:39, 20.22it/s]

Finished run for neighbour size: 10 in 0:00:35.904645


100%|██████████| 794/794 [00:38<00:00, 20.38it/s]
  0%|          | 2/794 [00:00<00:43, 18.11it/s]

Finished run for neighbour size: 25 in 0:00:38.971529


100%|██████████| 794/794 [00:44<00:00, 17.91it/s]
  0%|          | 2/794 [00:00<00:49, 16.04it/s]

Finished run for neighbour size: 50 in 0:00:44.349649


100%|██████████| 794/794 [00:49<00:00, 16.02it/s]
  0%|          | 2/794 [00:00<00:55, 14.34it/s]

Finished run for neighbour size: 75 in 0:00:49.564546


100%|██████████| 794/794 [00:55<00:00, 14.39it/s]


Finished run for neighbour size: 100 in 0:00:55.163228
Finished run for language: French in 0:04:20.912312
Loading embeddings: de


  1%|          | 3/285 [00:00<00:12, 22.82it/s]

Embeddings de loaded in 0:00:02.182337


100%|██████████| 285/285 [00:08<00:00, 32.01it/s]
  1%|          | 3/285 [00:00<00:12, 21.88it/s]

Finished run for neighbour size: 5 in 0:00:08.908319


100%|██████████| 285/285 [00:09<00:00, 30.64it/s]
  1%|          | 3/285 [00:00<00:13, 20.54it/s]

Finished run for neighbour size: 10 in 0:00:09.305011


100%|██████████| 285/285 [00:10<00:00, 28.38it/s]
  1%|          | 2/285 [00:00<00:15, 17.75it/s]

Finished run for neighbour size: 25 in 0:00:10.045627


100%|██████████| 285/285 [00:11<00:00, 24.92it/s]
  1%|          | 2/285 [00:00<00:17, 15.96it/s]

Finished run for neighbour size: 50 in 0:00:11.441224


100%|██████████| 285/285 [00:12<00:00, 22.42it/s]
  1%|          | 2/285 [00:00<00:22, 12.32it/s]

Finished run for neighbour size: 75 in 0:00:12.717241


100%|██████████| 285/285 [00:14<00:00, 20.06it/s]


Finished run for neighbour size: 100 in 0:00:14.216862
Finished run for language: German in 0:01:08.820712
Loading embeddings: el


  1%|          | 3/353 [00:00<00:15, 22.43it/s]

Embeddings el loaded in 0:00:02.281267


100%|██████████| 353/353 [00:14<00:00, 24.78it/s]
  1%|          | 3/353 [00:00<00:15, 22.34it/s]

Finished run for neighbour size: 5 in 0:00:14.249009


100%|██████████| 353/353 [00:14<00:00, 23.91it/s]
  1%|          | 2/353 [00:00<00:17, 19.97it/s]

Finished run for neighbour size: 10 in 0:00:14.771139


100%|██████████| 353/353 [00:16<00:00, 21.99it/s]
  1%|          | 2/353 [00:00<00:19, 17.91it/s]

Finished run for neighbour size: 25 in 0:00:16.053968


100%|██████████| 353/353 [00:18<00:00, 19.44it/s]
  1%|          | 2/353 [00:00<00:21, 16.25it/s]

Finished run for neighbour size: 50 in 0:00:18.167293


100%|██████████| 353/353 [00:20<00:00, 17.43it/s]
  1%|          | 2/353 [00:00<00:30, 11.42it/s]

Finished run for neighbour size: 75 in 0:00:20.253081


100%|██████████| 353/353 [00:22<00:00, 15.52it/s]


Finished run for neighbour size: 100 in 0:00:22.748327
Finished run for language: Greek in 0:01:48.527925
Loading embeddings: it


  1%|          | 3/335 [00:00<00:14, 22.96it/s]

Embeddings it loaded in 0:00:02.129993


100%|██████████| 335/335 [00:14<00:00, 22.90it/s]
  1%|          | 3/335 [00:00<00:15, 21.66it/s]

Finished run for neighbour size: 5 in 0:00:14.634722


100%|██████████| 335/335 [00:15<00:00, 22.22it/s]
  1%|          | 3/335 [00:00<00:16, 20.68it/s]

Finished run for neighbour size: 10 in 0:00:15.080319


100%|██████████| 335/335 [00:16<00:00, 20.42it/s]
  1%|          | 2/335 [00:00<00:25, 13.03it/s]

Finished run for neighbour size: 25 in 0:00:16.411680


100%|██████████| 335/335 [00:18<00:00, 18.03it/s]
  1%|          | 2/335 [00:00<00:20, 16.36it/s]

Finished run for neighbour size: 50 in 0:00:18.582865


100%|██████████| 335/335 [00:20<00:00, 16.20it/s]
  1%|          | 2/335 [00:00<00:22, 14.48it/s]

Finished run for neighbour size: 75 in 0:00:20.685767


100%|██████████| 335/335 [00:23<00:00, 14.42it/s]


Finished run for neighbour size: 100 in 0:00:23.228915
Finished run for language: Italian in 0:01:50.758668
Loading embeddings: ja


  9%|▉         | 18/192 [00:00<00:01, 137.12it/s]

Embeddings ja loaded in 0:00:02.156944


100%|██████████| 192/192 [00:00<00:00, 196.64it/s]
  9%|▉         | 18/192 [00:00<00:01, 134.38it/s]

Finished run for neighbour size: 5 in 0:00:00.981164


100%|██████████| 192/192 [00:01<00:00, 188.23it/s]
  9%|▉         | 18/192 [00:00<00:01, 123.96it/s]

Finished run for neighbour size: 10 in 0:00:01.024385


100%|██████████| 192/192 [00:01<00:00, 175.30it/s]
  6%|▌         | 11/192 [00:00<00:01, 99.40it/s]

Finished run for neighbour size: 25 in 0:00:01.099575


100%|██████████| 192/192 [00:01<00:00, 154.31it/s]
  6%|▌         | 11/192 [00:00<00:02, 85.26it/s]

Finished run for neighbour size: 50 in 0:00:01.248758


100%|██████████| 192/192 [00:01<00:00, 135.36it/s]
  6%|▌         | 11/192 [00:00<00:02, 80.21it/s]

Finished run for neighbour size: 75 in 0:00:01.423014


100%|██████████| 192/192 [00:01<00:00, 123.62it/s]


Finished run for neighbour size: 100 in 0:00:01.557443
Finished run for language: Japanese in 0:00:09.494765
Loading embeddings: ko


  1%|          | 2/185 [00:00<00:16, 11.24it/s]

Embeddings ko loaded in 0:00:02.584285


100%|██████████| 185/185 [00:14<00:00, 12.37it/s]
  1%|          | 2/185 [00:00<00:16, 11.18it/s]

Finished run for neighbour size: 5 in 0:00:14.960468


100%|██████████| 185/185 [00:15<00:00, 12.07it/s]
  1%|          | 2/185 [00:00<00:17, 10.74it/s]

Finished run for neighbour size: 10 in 0:00:15.327421


100%|██████████| 185/185 [00:16<00:00, 11.54it/s]
  1%|          | 1/185 [00:00<00:18,  9.88it/s]

Finished run for neighbour size: 25 in 0:00:16.031164


100%|██████████| 185/185 [00:17<00:00, 10.75it/s]
  1%|          | 1/185 [00:00<00:20,  9.11it/s]

Finished run for neighbour size: 50 in 0:00:17.205877


100%|██████████| 185/185 [00:18<00:00, 10.03it/s]
  1%|          | 1/185 [00:00<00:20,  8.84it/s]

Finished run for neighbour size: 75 in 0:00:18.455561


100%|██████████| 185/185 [00:19<00:00,  9.47it/s]


Finished run for neighbour size: 100 in 0:00:19.547384
Finished run for language: Korean in 0:01:44.121378
Loading embeddings: pl


  0%|          | 1/295 [00:00<00:52,  5.59it/s]

Embeddings pl loaded in 0:00:05.227718


100%|██████████| 295/295 [00:51<00:00,  5.72it/s]
  0%|          | 1/295 [00:00<00:55,  5.26it/s]

Finished run for neighbour size: 5 in 0:00:51.560283


100%|██████████| 295/295 [00:51<00:00,  5.69it/s]
  0%|          | 1/295 [00:00<00:53,  5.48it/s]

Finished run for neighbour size: 10 in 0:00:51.884041


100%|██████████| 295/295 [00:52<00:00,  5.57it/s]
  0%|          | 1/295 [00:00<00:55,  5.26it/s]

Finished run for neighbour size: 25 in 0:00:52.958680


100%|██████████| 295/295 [00:54<00:00,  5.39it/s]
  0%|          | 1/295 [00:00<00:58,  5.05it/s]

Finished run for neighbour size: 50 in 0:00:54.758139


100%|██████████| 295/295 [00:56<00:00,  5.22it/s]
  0%|          | 0/295 [00:00<?, ?it/s]

Finished run for neighbour size: 75 in 0:00:56.550155


100%|██████████| 295/295 [00:58<00:00,  5.02it/s]


Finished run for neighbour size: 100 in 0:00:58.721064
Finished run for language: Polish in 0:05:31.666275
Loading embeddings: pt


  1%|          | 3/284 [00:00<00:13, 21.61it/s]

Embeddings pt loaded in 0:00:02.203310


100%|██████████| 284/284 [00:12<00:00, 22.67it/s]
  1%|          | 3/284 [00:00<00:12, 21.66it/s]

Finished run for neighbour size: 5 in 0:00:12.533741


100%|██████████| 284/284 [00:12<00:00, 22.12it/s]
  1%|          | 3/284 [00:00<00:13, 20.35it/s]

Finished run for neighbour size: 10 in 0:00:12.845631


100%|██████████| 284/284 [00:13<00:00, 20.40it/s]
  1%|          | 2/284 [00:00<00:15, 17.81it/s]

Finished run for neighbour size: 25 in 0:00:13.928295


100%|██████████| 284/284 [00:15<00:00, 17.89it/s]
  1%|          | 2/284 [00:00<00:17, 16.16it/s]

Finished run for neighbour size: 50 in 0:00:15.878070


100%|██████████| 284/284 [00:17<00:00, 16.08it/s]
  1%|          | 2/284 [00:00<00:19, 14.70it/s]

Finished run for neighbour size: 75 in 0:00:17.663085


100%|██████████| 284/284 [00:19<00:00, 14.42it/s]


Finished run for neighbour size: 100 in 0:00:19.704952
Finished run for language: Portuguese in 0:01:34.768534
Loading embeddings: ru


  1%|          | 3/340 [00:00<00:14, 22.76it/s]

Embeddings ru loaded in 0:00:02.223475


100%|██████████| 340/340 [00:13<00:00, 24.48it/s]
  1%|          | 3/340 [00:00<00:15, 22.34it/s]

Finished run for neighbour size: 5 in 0:00:13.890975


100%|██████████| 340/340 [00:14<00:00, 23.98it/s]
  1%|          | 3/340 [00:00<00:16, 20.33it/s]

Finished run for neighbour size: 10 in 0:00:14.180258


100%|██████████| 340/340 [00:15<00:00, 21.78it/s]
  1%|          | 2/340 [00:00<00:19, 17.30it/s]

Finished run for neighbour size: 25 in 0:00:15.617731


100%|██████████| 340/340 [00:17<00:00, 19.36it/s]
  1%|          | 2/340 [00:00<00:22, 14.72it/s]

Finished run for neighbour size: 50 in 0:00:17.570568


100%|██████████| 340/340 [00:19<00:00, 17.28it/s]
  1%|          | 2/340 [00:00<00:23, 14.63it/s]

Finished run for neighbour size: 75 in 0:00:19.679851


100%|██████████| 340/340 [00:21<00:00, 15.47it/s]


Finished run for neighbour size: 100 in 0:00:21.988659
Finished run for language: Russian in 0:01:45.155064
Loading embeddings: es


  0%|          | 3/796 [00:00<00:34, 22.76it/s]

Embeddings es loaded in 0:00:02.187653


100%|██████████| 796/796 [00:34<00:00, 23.12it/s]
  0%|          | 3/796 [00:00<00:36, 21.91it/s]

Finished run for neighbour size: 5 in 0:00:34.436467


100%|██████████| 796/796 [00:35<00:00, 22.47it/s]
  0%|          | 3/796 [00:00<00:38, 20.36it/s]

Finished run for neighbour size: 10 in 0:00:35.437274


100%|██████████| 796/796 [00:38<00:00, 20.63it/s]
  0%|          | 2/796 [00:00<00:44, 17.95it/s]

Finished run for neighbour size: 25 in 0:00:38.585655


100%|██████████| 796/796 [00:43<00:00, 18.13it/s]
  0%|          | 2/796 [00:00<00:50, 15.79it/s]

Finished run for neighbour size: 50 in 0:00:43.908057


100%|██████████| 796/796 [00:48<00:00, 16.30it/s]
  0%|          | 2/796 [00:00<00:55, 14.36it/s]

Finished run for neighbour size: 75 in 0:00:48.850557


100%|██████████| 796/796 [00:54<00:00, 14.60it/s]


Finished run for neighbour size: 100 in 0:00:54.512751
Finished run for language: Spanish in 0:04:17.922946
Loading embeddings: sv


  7%|▋         | 3/44 [00:00<00:01, 22.89it/s]

Embeddings sv loaded in 0:00:02.136387


100%|██████████| 44/44 [00:01<00:00, 24.38it/s]
  7%|▋         | 3/44 [00:00<00:01, 21.94it/s]

Finished run for neighbour size: 5 in 0:00:01.809750


100%|██████████| 44/44 [00:01<00:00, 23.63it/s]
  7%|▋         | 3/44 [00:00<00:02, 20.47it/s]

Finished run for neighbour size: 10 in 0:00:01.866582


100%|██████████| 44/44 [00:01<00:00, 22.07it/s]
  5%|▍         | 2/44 [00:00<00:02, 17.99it/s]

Finished run for neighbour size: 25 in 0:00:01.998861


100%|██████████| 44/44 [00:02<00:00, 19.34it/s]
  5%|▍         | 2/44 [00:00<00:02, 16.24it/s]

Finished run for neighbour size: 50 in 0:00:02.279816


100%|██████████| 44/44 [00:02<00:00, 17.37it/s]
  5%|▍         | 2/44 [00:00<00:02, 14.43it/s]

Finished run for neighbour size: 75 in 0:00:02.538301


100%|██████████| 44/44 [00:02<00:00, 15.39it/s]


Finished run for neighbour size: 100 in 0:00:02.862942
Finished run for language: Swedish in 0:00:15.496261
Loading embeddings: th


  4%|▍         | 5/122 [00:00<00:02, 41.94it/s]

Embeddings th loaded in 0:00:01.201089


100%|██████████| 122/122 [00:02<00:00, 49.61it/s]
  4%|▍         | 5/122 [00:00<00:02, 40.05it/s]

Finished run for neighbour size: 5 in 0:00:02.463985


100%|██████████| 122/122 [00:02<00:00, 45.92it/s]
  3%|▎         | 4/122 [00:00<00:03, 33.98it/s]

Finished run for neighbour size: 10 in 0:00:02.661124


100%|██████████| 122/122 [00:03<00:00, 40.21it/s]
  2%|▏         | 3/122 [00:00<00:04, 27.77it/s]

Finished run for neighbour size: 25 in 0:00:03.038478


100%|██████████| 122/122 [00:03<00:00, 32.81it/s]
  2%|▏         | 3/122 [00:00<00:05, 23.54it/s]

Finished run for neighbour size: 50 in 0:00:03.723119


100%|██████████| 122/122 [00:04<00:00, 27.80it/s]
  2%|▏         | 3/122 [00:00<00:05, 20.74it/s]

Finished run for neighbour size: 75 in 0:00:04.392451


100%|██████████| 122/122 [00:05<00:00, 24.11it/s]


Finished run for neighbour size: 100 in 0:00:05.064608
Finished run for language: Thai in 0:00:22.551788
Loading embeddings: tr


  1%|▏         | 4/272 [00:00<00:08, 30.51it/s]

Embeddings tr loaded in 0:00:02.168491


100%|██████████| 272/272 [00:10<00:00, 27.07it/s]
  1%|▏         | 4/272 [00:00<00:09, 29.66it/s]

Finished run for neighbour size: 5 in 0:00:10.052989


100%|██████████| 272/272 [00:10<00:00, 26.29it/s]
  1%|▏         | 4/272 [00:00<00:09, 27.25it/s]

Finished run for neighbour size: 10 in 0:00:10.351336


100%|██████████| 272/272 [00:11<00:00, 23.89it/s]
  1%|          | 2/272 [00:00<00:17, 15.66it/s]

Finished run for neighbour size: 25 in 0:00:11.392198


100%|██████████| 272/272 [00:12<00:00, 21.39it/s]
  1%|          | 2/272 [00:00<00:16, 16.21it/s]

Finished run for neighbour size: 50 in 0:00:12.722004


100%|██████████| 272/272 [00:14<00:00, 19.02it/s]
  1%|          | 2/272 [00:00<00:18, 14.70it/s]

Finished run for neighbour size: 75 in 0:00:14.308182


100%|██████████| 272/272 [00:15<00:00, 17.10it/s]

Finished run for neighbour size: 100 in 0:00:15.908215
Finished run for language: Turkish in 0:01:16.905993
Finished run for vector type: polyglot in 0:29:41.297570





In [8]:
# Attach every accumulated score list as an 'l1_sim_polyglot_<size>' column.
# Assignment is positional, so each list must match pd_dataset's row order.
for size_key, scores in new_columns.items():
    pd_dataset['l1_sim_polyglot_' + str(size_key)] = scores

In [9]:
# Save the table with both the l2_sim_polyglot_* and l1_sim_polyglot_* columns;
# the DataFrame index is not written.
pd_dataset.to_csv('../data/ms_final_with_polyglot.csv', index = False)

In [10]:
# Preview the first rows to check that the new polyglot columns are present.
pd_dataset.head()

Unnamed: 0,c,i,id,l1_c,l1_i,language,type,code,l2_sim_cc_5,l2_sim_cc_10,...,l2_sim_polyglot_25,l2_sim_polyglot_50,l2_sim_polyglot_75,l2_sim_polyglot_100,l1_sim_polyglot_5,l1_sim_polyglot_10,l1_sim_polyglot_25,l1_sim_polyglot_50,l1_sim_polyglot_75,l1_sim_polyglot_100
0,plans,projects,person_2799,plans,projectes,Catalan,RN,ca,0.315914,0.289572,...,0.652385,0.64229,0.636711,0.636448,0.706547,0.719866,0.709139,0.681242,0.676333,0.667461
221,independence,freedom,person_2132,independència,llibertat,Catalan,RN,ca,0.4783,0.455483,...,0.633258,0.621581,0.608173,0.601868,0.67352,0.658533,0.656046,0.656317,0.647447,0.646497
220,stars,starts,person_2127,estrelles,comença,Catalan,RN,ca,0.068227,0.068642,...,0.312807,0.32328,0.315747,0.309649,-0.086862,-0.071272,-0.057636,-0.068105,-0.065801,-0.070737
219,time,hours,person_2127,temps,hores,Catalan,RN,ca,0.464017,0.407275,...,0.270096,0.236612,0.215306,0.203645,0.254001,0.308201,0.277933,0.270908,0.253394,0.249989
218,coming,following,person_2127,venint,següents,Catalan,RJ,ca,0.183514,0.166062,...,0.274542,0.268298,0.258854,0.258467,0.017321,-0.002845,-0.019284,-0.027436,-0.031775,-0.024451
