In [1]:
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm
from polyglot.mapping import Embedding

In [2]:
# Neighbourhood sizes (passed as top_k to nearest_neighbors) evaluated for every word pair.
neighbour_list = [5, 10, 25, 50, 75, 100]
# Embedding families to run; each name becomes part of the result column names.
embedding_list = ['polyglot']
# Root directory; the archive for a language is read from <embedding_path><code>/embeddings_pkl.tar.bz2.
embedding_path = '../data/pretrained_embeddings/polyglot/embeddings2/'

In [3]:
def get_embedding_file_name(language):
    """Map a language name from the data set to its two-letter embedding code.

    The match is exact and case-sensitive. Both "Chinese" and
    "Chinese (Simplified)" map to 'zh'. Any name that is not listed yields
    an empty string.
    """
    codes_by_language = {
        "Russian": 'ru',
        "Turkish": 'tr',
        "German": 'de',
        "Japanese": 'ja',
        "Spanish": 'es',
        "Polish": 'pl',
        "Italian": 'it',
        "Catalan": 'ca',
        "Korean": 'ko',
        "French": 'fr',
        "Chinese": 'zh',
        "Chinese (Simplified)": 'zh',
        "Portuguese": 'pt',
        "Swedish": 'sv',
        "Greek": 'el',
        "Thai": 'th',
        "Dutch": 'nl',
        "English": 'en',
    }
    return codes_by_language.get(language, '')

def load_embedding_vector(lang):
    """Load the embedding archive for the language code ``lang``.

    The archive is read from ``<embedding_path><lang>/embeddings_pkl.tar.bz2``
    and the value returned is the result of calling ``normalize_words()`` on
    the loaded embedding.
    """
    archive_path = embedding_path + lang + '/embeddings_pkl.tar.bz2'
    return Embedding.load(archive_path).normalize_words()

In [4]:
def proper_case(word):
    """Return ``word`` with its first character upper-cased, the rest unchanged.

    Slicing (``word[:1]``) is used rather than indexing so that an empty
    string is returned as-is instead of raising ``IndexError``.
    """
    return word[:1].upper() + word[1:]

def get_similarity(w1, w2, embeddings, size):
    """Neighbourhood-based similarity between two words.

    Each word vector is compared (cosine similarity) with the vectors of the
    ``size`` nearest neighbours of the *other* word; the two mean
    similarities are then averaged.

    Parameters
    ----------
    w1, w2 : str
        The words to compare.
    embeddings
        Object supporting ``embeddings[word]`` (raising ``KeyError`` for an
        unknown word) and ``embeddings.nearest_neighbors(word, top_k=...)``.
    size : int
        Number of nearest neighbours requested for each word.

    Returns
    -------
    float
        The averaged similarity, or the sentinel ``-10`` when either word is
        missing from the vocabulary both as given and with its first letter
        capitalised.
    """
    def _lookup(word):
        # Fall back to the capitalised spelling for this word only. A word
        # that is already found must keep its own spelling and vector, and
        # must not be rejected just because the other word needed re-casing.
        try:
            return word, embeddings[word]
        except KeyError:
            capitalised = proper_case(word)
            return capitalised, embeddings[capitalised]

    def _mean_cosine(vec, neighbours):
        # No neighbours (e.g. size == 0): the mean is undefined.
        if len(neighbours) == 0:
            return np.nan
        neighbour_vectors = [embeddings[word] for word in neighbours]
        # One vectorised call instead of one cosine_similarity call per neighbour.
        return np.average(cosine_similarity([vec], neighbour_vectors))

    try:
        w1, w1_vec = _lookup(w1)
        w2, w2_vec = _lookup(w2)
    except KeyError:
        return -10

    # Neighbours are queried with the spelling that was actually found.
    w1_neighbours = embeddings.nearest_neighbors(w1, top_k=size)
    w2_neighbours = embeddings.nearest_neighbors(w2, top_k=size)

    w1_cosine = _mean_cosine(w1_vec, w2_neighbours)
    w2_cosine = _mean_cosine(w2_vec, w1_neighbours)

    return np.average([w1_cosine, w2_cosine])

In [5]:
# Read the experiment data and order it by language in a single step,
# then preview the first rows as the cell output.
source = '../data/ms_final_experiments_with_wiki.csv'
pd_dataset = pd.read_csv(source).sort_values(by='language', ascending=True)
pd_dataset.head()

Unnamed: 0,c,i,id,l1_c,l1_i,language,type,code,l2_sim_cc_5,l2_sim_cc_10,...,l1_sim_cc_25,l1_sim_cc_50,l1_sim_cc_75,l1_sim_cc_100,l2_sim_wiki_5,l2_sim_wiki_10,l2_sim_wiki_25,l2_sim_wiki_50,l2_sim_wiki_75,l2_sim_wiki_100
0,plans,projects,person_2799,plans,projectes,Catalan,RN,ca,0.315914,0.289572,...,0.513301,0.508249,0.497365,0.494275,0.256624,0.263855,0.216694,0.194096,0.187172,0.185074
221,especially,specially,person_2421,especialment,especialment,Catalan,RY,ca,0.342199,0.336672,...,0.676789,0.635269,0.615089,0.602047,0.153657,0.153794,0.138989,0.123324,0.112214,0.106564
220,waste,lose,person_2421,malgastar,perdre,Catalan,RV,ca,0.197119,0.175892,...,0.425895,0.418449,0.414079,0.411164,0.092023,0.086759,0.080194,0.080709,0.076916,0.081092
219,ending,end,person_1465,final,final,Catalan,RN,ca,0.591592,0.500977,...,0.708057,0.666229,0.637293,0.616535,0.650926,0.507235,0.374291,0.322232,0.303969,0.291704
218,shot,view,person_1465,disparat,mostra,Catalan,RN,ca,0.2104,0.187495,...,0.237338,0.222958,0.225796,0.222587,0.104407,0.092781,0.089016,0.079768,0.077828,0.076215


In [6]:
def run_experiment_english(dataframe):
    """Score the English word pair (``c``, ``i``) of every row of ``dataframe``.

    For each entry of ``embedding_list`` the English embeddings are loaded,
    and for each size in ``neighbour_list`` every row is scored with
    ``get_similarity``. The scores are written in place to a column named
    ``l2_sim_<vector_type>_<neighbour_size>``. Loading and run times are
    printed along the way.
    """
    for vector_type in embedding_list:
        type_started = datetime.now()
        lang_code = get_embedding_file_name("English")
        print('Loading embeddings: {}'.format(lang_code))
        embeddings = load_embedding_vector(lang_code)
        print('Embeddings {} loaded in {}'.format(lang_code, datetime.now() - type_started))
        for neighbour_size in neighbour_list:
            size_started = datetime.now()
            column_name = 'l2_sim_' + vector_type + '_' + str(neighbour_size)
            progress = tqdm(dataframe.iterrows(), total=len(dataframe))
            dataframe[column_name] = [
                get_similarity(row['c'], row['i'], embeddings, neighbour_size)
                for _, row in progress
            ]
            print('Finished run for neighbour size: {} in {}'.format(
                neighbour_size, datetime.now() - size_started))
        print('Finished run for vector type: {} in {}'.format(
            vector_type, datetime.now() - type_started))

In [7]:
# Adds one l2_sim_<vector_type>_<size> column per neighbourhood size to pd_dataset, in place.
run_experiment_english(pd_dataset)

Loading embeddings: en


  0%|          | 2/4937 [00:00<04:12, 19.54it/s]

Embeddings en loaded in 0:00:02.239987


100%|██████████| 4937/4937 [03:35<00:00, 22.88it/s]
  0%|          | 3/4937 [00:00<03:46, 21.77it/s]

Finished run for neighbour size: 5 in 0:03:35.774328


100%|██████████| 4937/4937 [03:42<00:00, 22.15it/s]
  0%|          | 2/4937 [00:00<04:14, 19.40it/s]

Finished run for neighbour size: 10 in 0:03:42.852787


100%|██████████| 4937/4937 [04:02<00:00, 20.32it/s]
  0%|          | 2/4937 [00:00<04:51, 16.93it/s]

Finished run for neighbour size: 25 in 0:04:02.928900


100%|██████████| 4937/4937 [04:38<00:00, 17.76it/s]
  0%|          | 2/4937 [00:00<05:21, 15.33it/s]

Finished run for neighbour size: 50 in 0:04:38.020663


100%|██████████| 4937/4937 [05:11<00:00, 15.85it/s]
  0%|          | 2/4937 [00:00<05:49, 14.12it/s]

Finished run for neighbour size: 75 in 0:05:11.450604


100%|██████████| 4937/4937 [05:48<00:00, 14.19it/s]

Finished run for neighbour size: 100 in 0:05:48.021875
Finished run for vector type: polyglot in 0:27:01.289648





In [8]:
# Checkpoint the frame with the English scores; the row index is not written to the file.
pd_dataset.to_csv('../data/ms_final_polyglot_english_3.csv', index = False)

In [9]:
def get_experiment_model(language, vector_type):
    """Load the embeddings for ``language`` and print how long loading took.

    ``language`` is translated to its file code with
    ``get_embedding_file_name``. ``vector_type`` is accepted but not used by
    this function.
    """
    started = datetime.now()
    lang_code = get_embedding_file_name(language)
    print('Loading embeddings: {}'.format(lang_code))
    model = load_embedding_vector(lang_code)
    print('Embeddings {} loaded in {}'.format(lang_code, datetime.now() - started))
    return model
# Module-level accumulator: one independent result list per neighbourhood size (string key).
new_columns = {size_key: [] for size_key in ('5', '10', '25', '50', '75', '100')}
def run_experiment_l1(dataframe):
    """Score the native-language word pair (``l1_c``, ``l1_i``) of every row.

    For each entry of ``embedding_list`` and each language in
    ``language_list`` the language's embeddings are loaded and the rows of
    that language are scored with ``get_similarity`` for every size in
    ``neighbour_list``.

    ``dataframe`` is not modified. Scores are collected in the module-level
    ``new_columns`` dict under the key ``str(neighbour_size)``. They are
    appended language by language (in ``language_list`` order) and, within a
    language, in the row order of ``dataframe``. The lists therefore line up
    positionally with ``dataframe`` only if its rows are grouped by language
    in that same order.
    """
    # Start from empty lists so that calling this function again (e.g. by
    # re-running the cell) does not append a second copy of every score.
    for collected in new_columns.values():
        collected.clear()

    for vector_type in embedding_list:
        vec_type_start = datetime.now()
        for language in language_list:
            language_start = datetime.now()
            embeddings = get_experiment_model(language, vector_type)
            # The rows of a language do not depend on the neighbourhood size,
            # so the subset is selected once per language.
            dset = dataframe[dataframe['language'] == language]
            for neighbour_size in neighbour_list:
                neighbour_size_start = datetime.now()
                # setdefault tolerates a size that was not pre-declared in new_columns.
                scores = new_columns.setdefault(str(neighbour_size), [])
                for ind, row in tqdm(dset.iterrows(), total=dset.shape[0]):
                    w1 = row['l1_c']
                    w2 = row['l1_i']
                    scores.append(get_similarity(w1, w2, embeddings, neighbour_size))
                print('Finished run for neighbour size: ' + str(neighbour_size) + ' in ' + str(datetime.now() - neighbour_size_start))
            print('Finished run for language: ' + language + ' in ' + str(datetime.now() - language_start))
        print('Finished run for vector type: ' + vector_type + ' in ' + str(datetime.now() - vec_type_start))

In [11]:
# Same values as the configuration cell near the top of the notebook,
# re-declared here together with the path of the English-scores checkpoint.
neighbour_list = [5, 10, 25, 50, 75, 100]
embedding_list = ['polyglot']
embedding_path = '../data/pretrained_embeddings/polyglot/embeddings2/'
source = '../data/ms_final_polyglot_english_3.csv'

# Reload the checkpoint and keep rows of the same language contiguous: the
# scores collected by run_experiment_l1 are attached to this frame by position.
# (A bare `pd_dataset.head()` used to sit here; it was not the cell's last
# expression, so nothing was displayed, and it has been dropped.)
pd_dataset = pd.read_csv(source).sort_values(by='language', ascending=True)

# Languages in order of first appearance in the sorted frame.
language_list = pd_dataset['language'].unique()

In [12]:
# Fills the module-level new_columns lists; pd_dataset itself is not modified by this call.
run_experiment_l1(pd_dataset)

Loading embeddings: ca


  1%|          | 2/325 [00:00<00:17, 18.90it/s]

Embeddings ca loaded in 0:00:02.396885


100%|██████████| 325/325 [00:14<00:00, 21.96it/s]
  1%|          | 3/325 [00:00<00:15, 20.53it/s]

Finished run for neighbour size: 5 in 0:00:14.801677


100%|██████████| 325/325 [00:15<00:00, 21.41it/s]
  1%|          | 2/325 [00:00<00:16, 19.29it/s]

Finished run for neighbour size: 10 in 0:00:15.187926


100%|██████████| 325/325 [00:16<00:00, 19.70it/s]
  1%|          | 2/325 [00:00<00:18, 17.30it/s]

Finished run for neighbour size: 25 in 0:00:16.501834


100%|██████████| 325/325 [00:18<00:00, 17.49it/s]
  1%|          | 2/325 [00:00<00:22, 14.58it/s]

Finished run for neighbour size: 50 in 0:00:18.582384


100%|██████████| 325/325 [00:20<00:00, 15.64it/s]
  1%|          | 2/325 [00:00<00:23, 14.02it/s]

Finished run for neighbour size: 75 in 0:00:20.782808


100%|██████████| 325/325 [00:23<00:00, 14.13it/s]


Finished run for neighbour size: 100 in 0:00:23.008228
Finished run for language: Catalan in 0:01:51.262206
Loading embeddings: zh


  1%|          | 3/310 [00:00<00:13, 23.35it/s]

Embeddings zh loaded in 0:00:02.147062


100%|██████████| 310/310 [00:12<00:00, 24.13it/s]
  1%|          | 3/310 [00:00<00:13, 22.86it/s]

Finished run for neighbour size: 5 in 0:00:12.852711


100%|██████████| 310/310 [00:13<00:00, 23.49it/s]
  1%|          | 2/310 [00:00<00:15, 19.76it/s]

Finished run for neighbour size: 10 in 0:00:13.203564


100%|██████████| 310/310 [00:14<00:00, 21.49it/s]
  1%|          | 2/310 [00:00<00:17, 18.06it/s]

Finished run for neighbour size: 25 in 0:00:14.432572


100%|██████████| 310/310 [00:16<00:00, 18.78it/s]
  1%|          | 2/310 [00:00<00:18, 16.42it/s]

Finished run for neighbour size: 50 in 0:00:16.511780


100%|██████████| 310/310 [00:18<00:00, 16.73it/s]
  1%|          | 2/310 [00:00<00:21, 14.59it/s]

Finished run for neighbour size: 75 in 0:00:18.531675


100%|██████████| 310/310 [00:21<00:00, 14.46it/s]


Finished run for neighbour size: 100 in 0:00:21.436917
Finished run for language: Chinese (Simplified) in 0:01:39.120198
Loading embeddings: nl


 60%|██████    | 3/5 [00:00<00:00, 22.42it/s]

Embeddings nl loaded in 0:00:02.241336


100%|██████████| 5/5 [00:00<00:00, 22.12it/s]
 60%|██████    | 3/5 [00:00<00:00, 21.90it/s]

Finished run for neighbour size: 5 in 0:00:00.230671


100%|██████████| 5/5 [00:00<00:00, 21.55it/s]
 40%|████      | 2/5 [00:00<00:00, 18.34it/s]

Finished run for neighbour size: 10 in 0:00:00.236418


100%|██████████| 5/5 [00:00<00:00, 18.59it/s]
 40%|████      | 2/5 [00:00<00:00, 17.23it/s]

Finished run for neighbour size: 25 in 0:00:00.273117


100%|██████████| 5/5 [00:00<00:00, 16.91it/s]
 40%|████      | 2/5 [00:00<00:00, 15.52it/s]

Finished run for neighbour size: 50 in 0:00:00.300800


100%|██████████| 5/5 [00:00<00:00, 15.42it/s]
 40%|████      | 2/5 [00:00<00:00, 13.54it/s]

Finished run for neighbour size: 75 in 0:00:00.328604


100%|██████████| 5/5 [00:00<00:00, 13.17it/s]


Finished run for neighbour size: 100 in 0:00:00.384134
Finished run for language: Dutch in 0:00:03.998979
Loading embeddings: fr


  0%|          | 3/794 [00:00<00:36, 21.66it/s]

Embeddings fr loaded in 0:00:02.301798


100%|██████████| 794/794 [00:36<00:00, 21.97it/s]
  0%|          | 3/794 [00:00<00:37, 21.30it/s]

Finished run for neighbour size: 5 in 0:00:36.145767


100%|██████████| 794/794 [00:36<00:00, 21.51it/s]
  0%|          | 2/794 [00:00<00:41, 19.05it/s]

Finished run for neighbour size: 10 in 0:00:36.920478


100%|██████████| 794/794 [00:40<00:00, 19.60it/s]
  0%|          | 2/794 [00:00<00:45, 17.45it/s]

Finished run for neighbour size: 25 in 0:00:40.515618


100%|██████████| 794/794 [00:46<00:00, 17.20it/s]
  0%|          | 2/794 [00:00<00:53, 14.93it/s]

Finished run for neighbour size: 50 in 0:00:46.175357


100%|██████████| 794/794 [00:52<00:00, 15.17it/s]
  0%|          | 2/794 [00:00<00:56, 13.97it/s]

Finished run for neighbour size: 75 in 0:00:52.343931


100%|██████████| 794/794 [00:58<00:00, 13.67it/s]


Finished run for neighbour size: 100 in 0:00:58.100692
Finished run for language: French in 0:04:32.507271
Loading embeddings: de


  1%|          | 3/285 [00:00<00:13, 21.47it/s]

Embeddings de loaded in 0:00:02.267136


100%|██████████| 285/285 [00:12<00:00, 22.20it/s]
  1%|          | 3/285 [00:00<00:13, 20.65it/s]

Finished run for neighbour size: 5 in 0:00:12.843715


100%|██████████| 285/285 [00:13<00:00, 21.64it/s]
  1%|          | 2/285 [00:00<00:14, 19.48it/s]

Finished run for neighbour size: 10 in 0:00:13.177081


100%|██████████| 285/285 [00:14<00:00, 19.66it/s]
  1%|          | 2/285 [00:00<00:16, 17.12it/s]

Finished run for neighbour size: 25 in 0:00:14.496985


100%|██████████| 285/285 [00:16<00:00, 17.19it/s]
  1%|          | 2/285 [00:00<00:19, 14.87it/s]

Finished run for neighbour size: 50 in 0:00:16.580611


100%|██████████| 285/285 [00:18<00:00, 15.38it/s]
  1%|          | 2/285 [00:00<00:20, 14.00it/s]

Finished run for neighbour size: 75 in 0:00:18.537572


100%|██████████| 285/285 [00:20<00:00, 13.87it/s]


Finished run for neighbour size: 100 in 0:00:20.545246
Finished run for language: German in 0:01:38.452614
Loading embeddings: el


  1%|          | 3/353 [00:00<00:16, 21.66it/s]

Embeddings el loaded in 0:00:02.299375


100%|██████████| 353/353 [00:14<00:00, 23.77it/s]
  1%|          | 3/353 [00:00<00:16, 20.83it/s]

Finished run for neighbour size: 5 in 0:00:14.856631


100%|██████████| 353/353 [00:15<00:00, 23.10it/s]
  1%|          | 2/353 [00:00<00:18, 19.36it/s]

Finished run for neighbour size: 10 in 0:00:15.285549


100%|██████████| 353/353 [00:16<00:00, 21.26it/s]
  1%|          | 2/353 [00:00<00:20, 17.02it/s]

Finished run for neighbour size: 25 in 0:00:16.604694


100%|██████████| 353/353 [00:18<00:00, 19.10it/s]
  1%|          | 2/353 [00:00<00:22, 15.45it/s]

Finished run for neighbour size: 50 in 0:00:18.490369


100%|██████████| 353/353 [00:20<00:00, 17.07it/s]
  1%|          | 2/353 [00:00<00:24, 14.53it/s]

Finished run for neighbour size: 75 in 0:00:20.689939


100%|██████████| 353/353 [00:22<00:00, 15.52it/s]


Finished run for neighbour size: 100 in 0:00:22.754670
Finished run for language: Greek in 0:01:50.984892
Loading embeddings: it


  1%|          | 3/335 [00:00<00:15, 21.98it/s]

Embeddings it loaded in 0:00:02.139436


100%|██████████| 335/335 [00:14<00:00, 23.21it/s]
  1%|          | 3/335 [00:00<00:14, 22.66it/s]

Finished run for neighbour size: 5 in 0:00:14.441021


100%|██████████| 335/335 [00:14<00:00, 22.51it/s]
  1%|          | 3/335 [00:00<00:16, 20.73it/s]

Finished run for neighbour size: 10 in 0:00:14.885667


100%|██████████| 335/335 [00:16<00:00, 20.60it/s]
  1%|          | 2/335 [00:00<00:27, 12.16it/s]

Finished run for neighbour size: 25 in 0:00:16.263962


100%|██████████| 335/335 [00:18<00:00, 17.90it/s]
  1%|          | 2/335 [00:00<00:20, 16.23it/s]

Finished run for neighbour size: 50 in 0:00:18.714382


100%|██████████| 335/335 [00:20<00:00, 16.07it/s]
  1%|          | 2/335 [00:00<00:22, 14.73it/s]

Finished run for neighbour size: 75 in 0:00:20.855683


100%|██████████| 335/335 [00:23<00:00, 14.42it/s]


Finished run for neighbour size: 100 in 0:00:23.230505
Finished run for language: Italian in 0:01:50.537719
Loading embeddings: ja


  9%|▉         | 18/192 [00:00<00:01, 135.18it/s]

Embeddings ja loaded in 0:00:02.206417


100%|██████████| 192/192 [00:00<00:00, 201.43it/s]
  9%|▉         | 18/192 [00:00<00:01, 120.28it/s]

Finished run for neighbour size: 5 in 0:00:00.957311


100%|██████████| 192/192 [00:00<00:00, 192.31it/s]
  9%|▉         | 18/192 [00:00<00:01, 102.67it/s]

Finished run for neighbour size: 10 in 0:00:01.002570


100%|██████████| 192/192 [00:01<00:00, 174.33it/s]
  6%|▌         | 11/192 [00:00<00:01, 96.56it/s]

Finished run for neighbour size: 25 in 0:00:01.106631


100%|██████████| 192/192 [00:01<00:00, 154.91it/s]
  6%|▌         | 11/192 [00:00<00:02, 89.17it/s]

Finished run for neighbour size: 50 in 0:00:01.269596


100%|██████████| 192/192 [00:01<00:00, 139.15it/s]
  6%|▌         | 11/192 [00:00<00:02, 80.96it/s]

Finished run for neighbour size: 75 in 0:00:01.384152


100%|██████████| 192/192 [00:01<00:00, 122.94it/s]


Finished run for neighbour size: 100 in 0:00:01.565974
Finished run for language: Japanese in 0:00:09.496275
Loading embeddings: ko


  1%|          | 2/185 [00:00<00:09, 18.43it/s]

Embeddings ko loaded in 0:00:02.596239


100%|██████████| 185/185 [00:09<00:00, 20.55it/s]
  1%|          | 2/185 [00:00<00:09, 18.62it/s]

Finished run for neighbour size: 5 in 0:00:09.008977


100%|██████████| 185/185 [00:09<00:00, 19.99it/s]
  1%|          | 2/185 [00:00<00:11, 16.34it/s]

Finished run for neighbour size: 10 in 0:00:09.256868


100%|██████████| 185/185 [00:10<00:00, 18.46it/s]
  1%|          | 2/185 [00:00<00:11, 15.54it/s]

Finished run for neighbour size: 25 in 0:00:10.026238


100%|██████████| 185/185 [00:11<00:00, 16.78it/s]
  1%|          | 2/185 [00:00<00:13, 13.73it/s]

Finished run for neighbour size: 50 in 0:00:11.030760


100%|██████████| 185/185 [00:12<00:00, 14.97it/s]
  1%|          | 2/185 [00:00<00:14, 12.92it/s]

Finished run for neighbour size: 75 in 0:00:12.365298


100%|██████████| 185/185 [00:13<00:00, 13.94it/s]


Finished run for neighbour size: 100 in 0:00:13.276638
Finished run for language: Korean in 0:01:07.565167
Loading embeddings: pl


  0%|          | 1/295 [00:00<00:51,  5.73it/s]

Embeddings pl loaded in 0:00:05.288269


100%|██████████| 295/295 [00:51<00:00,  5.72it/s]
  0%|          | 1/295 [00:00<00:51,  5.74it/s]

Finished run for neighbour size: 5 in 0:00:51.566306


100%|██████████| 295/295 [00:52<00:00,  5.67it/s]
  0%|          | 1/295 [00:00<00:52,  5.58it/s]

Finished run for neighbour size: 10 in 0:00:52.076555


100%|██████████| 295/295 [00:53<00:00,  5.52it/s]
  0%|          | 1/295 [00:00<00:55,  5.32it/s]

Finished run for neighbour size: 25 in 0:00:53.402128


100%|██████████| 295/295 [00:55<00:00,  5.34it/s]
  0%|          | 1/295 [00:00<00:58,  5.05it/s]

Finished run for neighbour size: 50 in 0:00:55.252539


100%|██████████| 295/295 [00:57<00:00,  5.16it/s]
  0%|          | 0/295 [00:00<?, ?it/s]

Finished run for neighbour size: 75 in 0:00:57.194472


100%|██████████| 295/295 [00:59<00:00,  4.97it/s]


Finished run for neighbour size: 100 in 0:00:59.329990
Finished run for language: Polish in 0:05:34.119216
Loading embeddings: pt


  1%|          | 3/284 [00:00<00:12, 22.18it/s]

Embeddings pt loaded in 0:00:02.171654


100%|██████████| 284/284 [00:12<00:00, 22.99it/s]
  1%|          | 3/284 [00:00<00:12, 22.23it/s]

Finished run for neighbour size: 5 in 0:00:12.356523


100%|██████████| 284/284 [00:12<00:00, 22.26it/s]
  1%|          | 3/284 [00:00<00:13, 20.51it/s]

Finished run for neighbour size: 10 in 0:00:12.760531


100%|██████████| 284/284 [00:13<00:00, 20.40it/s]
  1%|          | 2/284 [00:00<00:15, 17.83it/s]

Finished run for neighbour size: 25 in 0:00:13.924064


100%|██████████| 284/284 [00:15<00:00, 17.88it/s]
  1%|          | 2/284 [00:00<00:18, 15.26it/s]

Finished run for neighbour size: 50 in 0:00:15.889091


100%|██████████| 284/284 [00:17<00:00, 15.88it/s]
  1%|          | 2/284 [00:00<00:20, 13.69it/s]

Finished run for neighbour size: 75 in 0:00:17.893673


100%|██████████| 284/284 [00:19<00:00, 14.37it/s]


Finished run for neighbour size: 100 in 0:00:19.760908
Finished run for language: Portuguese in 0:01:34.767731
Loading embeddings: ru


  1%|          | 3/340 [00:00<00:16, 20.96it/s]

Embeddings ru loaded in 0:00:02.200803


100%|██████████| 340/340 [00:13<00:00, 24.72it/s]
  1%|          | 3/340 [00:00<00:15, 22.01it/s]

Finished run for neighbour size: 5 in 0:00:13.759816


100%|██████████| 340/340 [00:14<00:00, 23.94it/s]
  1%|          | 3/340 [00:00<00:16, 20.51it/s]

Finished run for neighbour size: 10 in 0:00:14.208219


100%|██████████| 340/340 [00:15<00:00, 22.01it/s]
  1%|          | 2/340 [00:00<00:18, 17.95it/s]

Finished run for neighbour size: 25 in 0:00:15.454916


100%|██████████| 340/340 [00:17<00:00, 19.28it/s]
  1%|          | 2/340 [00:00<00:22, 15.02it/s]

Finished run for neighbour size: 50 in 0:00:17.637652


100%|██████████| 340/340 [00:19<00:00, 17.19it/s]
  1%|          | 2/340 [00:00<00:23, 14.38it/s]

Finished run for neighbour size: 75 in 0:00:19.779119


100%|██████████| 340/340 [00:22<00:00, 15.39it/s]


Finished run for neighbour size: 100 in 0:00:22.090392
Finished run for language: Russian in 0:01:45.134512
Loading embeddings: es


  0%|          | 3/796 [00:00<00:36, 21.60it/s]

Embeddings es loaded in 0:00:02.149734


100%|██████████| 796/796 [00:34<00:00, 22.86it/s]
  0%|          | 3/796 [00:00<00:36, 22.02it/s]

Finished run for neighbour size: 5 in 0:00:34.830200


100%|██████████| 796/796 [00:35<00:00, 22.21it/s]
  0%|          | 3/796 [00:00<00:38, 20.56it/s]

Finished run for neighbour size: 10 in 0:00:35.843827


100%|██████████| 796/796 [00:39<00:00, 20.31it/s]
  0%|          | 2/796 [00:00<00:43, 18.06it/s]

Finished run for neighbour size: 25 in 0:00:39.203596


100%|██████████| 796/796 [00:44<00:00, 17.84it/s]
  0%|          | 2/796 [00:00<00:49, 16.02it/s]

Finished run for neighbour size: 50 in 0:00:44.629467


100%|██████████| 796/796 [00:50<00:00, 15.88it/s]
  0%|          | 2/796 [00:00<00:55, 14.22it/s]

Finished run for neighbour size: 75 in 0:00:50.130409


100%|██████████| 796/796 [00:55<00:00, 14.33it/s]


Finished run for neighbour size: 100 in 0:00:55.560093
Finished run for language: Spanish in 0:04:22.351785
Loading embeddings: sv


  5%|▍         | 2/44 [00:00<00:02, 19.87it/s]

Embeddings sv loaded in 0:00:02.154899


100%|██████████| 44/44 [00:01<00:00, 24.58it/s]
  7%|▋         | 3/44 [00:00<00:01, 22.42it/s]

Finished run for neighbour size: 5 in 0:00:01.794967


100%|██████████| 44/44 [00:01<00:00, 24.45it/s]
  5%|▍         | 2/44 [00:00<00:02, 14.80it/s]

Finished run for neighbour size: 10 in 0:00:01.804251


100%|██████████| 44/44 [00:02<00:00, 21.99it/s]
  5%|▍         | 2/44 [00:00<00:02, 15.31it/s]

Finished run for neighbour size: 25 in 0:00:02.005339


100%|██████████| 44/44 [00:02<00:00, 19.35it/s]
  5%|▍         | 2/44 [00:00<00:02, 16.31it/s]

Finished run for neighbour size: 50 in 0:00:02.278443


100%|██████████| 44/44 [00:02<00:00, 17.25it/s]
  5%|▍         | 2/44 [00:00<00:02, 14.43it/s]

Finished run for neighbour size: 75 in 0:00:02.555343


100%|██████████| 44/44 [00:02<00:00, 15.26it/s]


Finished run for neighbour size: 100 in 0:00:02.888539
Finished run for language: Swedish in 0:00:15.489008
Loading embeddings: th


  4%|▍         | 5/122 [00:00<00:02, 42.83it/s]

Embeddings th loaded in 0:00:01.209929


100%|██████████| 122/122 [00:02<00:00, 50.05it/s]
  4%|▍         | 5/122 [00:00<00:02, 40.62it/s]

Finished run for neighbour size: 5 in 0:00:02.442406


100%|██████████| 122/122 [00:02<00:00, 47.11it/s]
  3%|▎         | 4/122 [00:00<00:03, 33.40it/s]

Finished run for neighbour size: 10 in 0:00:02.594190


100%|██████████| 122/122 [00:03<00:00, 40.42it/s]
  2%|▏         | 3/122 [00:00<00:04, 25.70it/s]

Finished run for neighbour size: 25 in 0:00:03.022548


100%|██████████| 122/122 [00:03<00:00, 32.63it/s]
  2%|▏         | 3/122 [00:00<00:04, 23.83it/s]

Finished run for neighbour size: 50 in 0:00:03.743705


100%|██████████| 122/122 [00:04<00:00, 27.64it/s]
  2%|▏         | 3/122 [00:00<00:05, 20.58it/s]

Finished run for neighbour size: 75 in 0:00:04.418298


100%|██████████| 122/122 [00:05<00:00, 24.09it/s]


Finished run for neighbour size: 100 in 0:00:05.069293
Finished run for language: Thai in 0:00:22.504053
Loading embeddings: tr


  1%|▏         | 4/272 [00:00<00:08, 30.83it/s]

Embeddings tr loaded in 0:00:02.149041


100%|██████████| 272/272 [00:09<00:00, 27.32it/s]
  1%|▏         | 4/272 [00:00<00:08, 29.98it/s]

Finished run for neighbour size: 5 in 0:00:09.962776


100%|██████████| 272/272 [00:10<00:00, 26.47it/s]
  1%|▏         | 4/272 [00:00<00:09, 27.59it/s]

Finished run for neighbour size: 10 in 0:00:10.279054


100%|██████████| 272/272 [00:11<00:00, 24.22it/s]
  1%|          | 2/272 [00:00<00:14, 18.03it/s]

Finished run for neighbour size: 25 in 0:00:11.235794


100%|██████████| 272/272 [00:12<00:00, 21.14it/s]
  1%|          | 2/272 [00:00<00:17, 15.86it/s]

Finished run for neighbour size: 50 in 0:00:12.871450


100%|██████████| 272/272 [00:14<00:00, 18.97it/s]
  1%|          | 2/272 [00:00<00:20, 13.09it/s]

Finished run for neighbour size: 75 in 0:00:14.341214


100%|██████████| 272/272 [00:15<00:00, 17.02it/s]

Finished run for neighbour size: 100 in 0:00:15.990070
Finished run for language: Turkish in 0:01:16.831570
Finished run for vector type: polyglot in 0:29:55.123786





In [13]:
# Attach each collected score list as an l1_sim_polyglot_<size> column.
# Lists are assigned by position, so pd_dataset must still be in the row
# order it had when the scores were collected.
for key, scores in new_columns.items():
    pd_dataset['l1_sim_polyglot_' + str(key)] = scores

In [14]:
# Write the frame with the added l1_sim_polyglot_* columns; the row index is not written.
pd_dataset.to_csv('../data/ms_final_with_polyglot_3.csv', index = False)

In [15]:
# Preview the first rows of the final frame.
pd_dataset.head()

Unnamed: 0,c,i,id,l1_c,l1_i,language,type,code,l2_sim_cc_5,l2_sim_cc_10,...,l2_sim_polyglot_25,l2_sim_polyglot_50,l2_sim_polyglot_75,l2_sim_polyglot_100,l1_sim_polyglot_5,l1_sim_polyglot_10,l1_sim_polyglot_25,l1_sim_polyglot_50,l1_sim_polyglot_75,l1_sim_polyglot_100
0,plans,projects,person_2799,plans,projectes,Catalan,RN,ca,0.315914,0.289572,...,0.666137,0.654442,0.646425,0.649371,0.751942,0.744027,0.726129,0.70585,0.694399,0.688358
221,independence,freedom,person_2132,independència,llibertat,Catalan,RN,ca,0.4783,0.455483,...,0.641429,0.614032,0.611689,0.604454,0.666178,0.661519,0.652071,0.664238,0.66661,0.661649
220,stars,starts,person_2127,estrelles,comença,Catalan,RN,ca,0.068227,0.068642,...,0.306266,0.308084,0.302099,0.312348,-0.095584,-0.052711,-0.037385,-0.041637,-0.040575,-0.040005
219,time,hours,person_2127,temps,hores,Catalan,RN,ca,0.464017,0.407275,...,0.279604,0.247636,0.219364,0.209386,0.25388,0.316569,0.298065,0.284307,0.273589,0.26786
218,coming,following,person_2127,venint,següents,Catalan,RJ,ca,0.183514,0.166062,...,0.310895,0.281859,0.282857,0.279149,0.053143,0.028941,0.003454,-0.000485,0.012008,0.015254


In [3]:
# Load the English and Russian embeddings directly; unlike load_embedding_vector,
# normalize_words() is not applied here. Requires embedding_path to be defined.
english_embedding = Embedding.load(embedding_path + 'en/embeddings_pkl.tar.bz2')
russian_embedding = Embedding.load(embedding_path + 'ru/embeddings_pkl.tar.bz2')

In [4]:
# Ten nearest neighbours for an English word pair and a Russian word pair,
# printed one list per line.
getting_n = english_embedding.nearest_neighbors("getting", top_k=10)
acq_n = english_embedding.nearest_neighbors("acquiring", top_k=10)
w1_n = russian_embedding.nearest_neighbors("получение", top_k=10)
w2_n = russian_embedding.nearest_neighbors("приобретения", top_k=10)

print(getting_n, acq_n, w1_n, w2_n, sep='\n')

['pulling', 'putting', 'keeping', 'sneaking', 'LEFTnot', 'fetching', 'carrying', 'practically', 'staking', 'soliciting']
['adopting', 'supplying', 'overseeing', 'establishing', 'securing', 'obtaining', 'administering', 'purchasing', 'introducing', 'executing']
['устранение', 'сохранение', 'поддержание', 'создание', 'осуществление', 'восстановление', 'составление', 'поощрение', 'улучшение', 'уничтожение']
['достижения', 'установления', 'покупки', 'признания', 'использования', 'преобразования', 'формирования', 'посещения', 'объявления', 'построения']
