In [1]:
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm

In [2]:
def get_similarity(w1, w2, embeddings, size):
    """Score two words by comparing each one with the other's neighbourhood.

    The vector of ``w1`` is compared (cosine similarity) with the ``size``
    nearest neighbours of ``w2`` and vice versa; the two per-word averages
    are then averaged into a single score.

    Args:
        w1: First word, used as a key into ``embeddings``.
        w2: Second word, used as a key into ``embeddings``.
        embeddings: Word-vector model supporting ``embeddings[word]`` and
            ``similar_by_vector(vector, topn=...)`` returning
            ``(word, score)`` pairs (a gensim ``KeyedVectors`` in this
            notebook).
        size: Number of nearest neighbours retrieved for each word.

    Returns:
        The averaged neighbourhood similarity. ``-10`` is returned as an
        out-of-vocabulary sentinel when either word is missing from
        ``embeddings`` (it lies outside the cosine range [-1, 1], so it is
        easy to filter out later). ``nan`` is returned when no neighbours
        are available.
    """
    try:
        w1_vector = embeddings[w1]
        w2_vector = embeddings[w2]
    except KeyError:
        return -10

    # NOTE(review): neighbours are looked up by vector, so each word may be
    # returned as its own nearest neighbour -- confirm this is intended.
    w1_neighbour_words = [
        neighbour[0] for neighbour in embeddings.similar_by_vector(w1_vector, topn=size)
    ]
    w2_neighbour_words = [
        neighbour[0] for neighbour in embeddings.similar_by_vector(w2_vector, topn=size)
    ]

    if not w1_neighbour_words or not w2_neighbour_words:
        # Averaging over an empty neighbourhood is undefined.
        return np.nan

    # Stack each neighbourhood into one matrix so that a single batched
    # cosine_similarity call replaces one call per neighbour.
    w1_neighbour_matrix = np.vstack([embeddings[word] for word in w1_neighbour_words])
    w2_neighbour_matrix = np.vstack([embeddings[word] for word in w2_neighbour_words])

    w1_cosine = np.average(cosine_similarity([w1_vector], w2_neighbour_matrix))
    w2_cosine = np.average(cosine_similarity([w2_vector], w1_neighbour_matrix))

    return np.average([w1_cosine, w2_cosine])

In [3]:
# Load the experiment data and group the rows by language. The L1 scores are
# later collected language by language and attached to this frame by position,
# so each language's rows need to sit together in the frame.
source = '../data/experiment_final.csv'
pd_dataset = pd.read_csv(source).sort_values(by='language', ascending=True)
pd_dataset.head()

Unnamed: 0,c,i,id,l1_c,l1_i,language,type,code
2827,plans,projects,person_2799,plans,projectes,Catalan,RN,ca
2733,waste,lose,person_2421,malgastar,perdre,Catalan,RV,ca
2732,ending,end,person_1465,final,final,Catalan,RN,ca
2731,shot,view,person_1465,disparat,mostra,Catalan,RN,ca
2730,do,join,person_1465,fer,unir,Catalan,RV,ca


In [41]:
# Language name (as it appears in the dataset) -> language part of the
# embedding file name. English carries an extra ".300" component.
_EMBEDDING_LANGUAGE_CODES = {
    "Russian": "ru",
    "Turkish": "tr",
    "German": "de",
    "Japanese": "ja",
    "Spanish": "es",
    "Polish": "pl",
    "Italian": "it",
    "Catalan": "ca",
    "Korean": "ko",
    "French": "fr",
    "Chinese": "zh",
    "Chinese (Simplified)": "zh",
    "Portuguese": "pt",
    "Swedish": "sv",
    "Greek": "el",
    "Thai": "th",
    "Dutch": "nl",
    "English": "en.300",
}


def get_embedding_file_name(language, file_type):
    """Build the embedding file name for a language.

    Args:
        language: Language name, e.g. ``"Catalan"`` or
            ``"Chinese (Simplified)"``.
        file_type: File name prefix identifying the embedding family,
            e.g. ``"cc"``.

    Returns:
        ``"<file_type>.<language code>.vec"``, e.g. ``"cc.ca.vec"`` or
        ``"cc.en.300.vec"`` for English.

    Raises:
        ValueError: If ``language`` has no known code. Previously this case
            silently produced a malformed name such as ``"cc..vec"``.
    """
    try:
        language_code = _EMBEDDING_LANGUAGE_CODES[language]
    except KeyError:
        raise ValueError('Unsupported embedding language: ' + repr(language))
    return file_type + '.' + language_code + '.vec'

def load_embedding_vector(file_name):
    """Load word vectors from a word2vec text-format file.

    Args:
        file_name: Name of the vector file; it is appended to the
            module-level ``embedding_path`` prefix.

    Returns:
        The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    vector_file = embedding_path + file_name
    return KeyedVectors.load_word2vec_format(
        vector_file,
        binary=False,  # C text format
        unicode_errors='ignore',  # do not abort on undecodable bytes
    )

In [5]:
# Neighbourhood sizes evaluated for every word pair (passed on as the
# number of nearest neighbours to retrieve).
neighbour_list = [5, 10, 25, 50, 75, 100]
# Embedding file name prefixes to evaluate; each is combined with a language
# code to form the file name.
embedding_list = ['cc']
# Path prefix prepended to every embedding file name when loading.
embedding_path = '../data/pretrained_embeddings/'

In [6]:
def run_experiment_english(dataframe):
    """Score the English word pairs (columns ``c`` and ``i``) of ``dataframe``.

    For every embedding type in ``embedding_list`` the English vectors are
    loaded once, then each neighbour size in ``neighbour_list`` is evaluated
    over all rows. The scores are written back into ``dataframe`` (in place)
    as columns named ``l2_sim_<vector type>_<neighbour size>``. Timing
    information is printed along the way.

    Args:
        dataframe: Frame providing the ``c`` and ``i`` word columns; it is
            modified in place.
    """
    row_count = dataframe.shape[0]
    for vector_type in embedding_list:
        vec_type_start = datetime.now()
        embedding_file_name = get_embedding_file_name("English", vector_type)
        print('Loading embeddings: ' + embedding_file_name)
        embeddings = load_embedding_vector(embedding_file_name)
        print('Embeddings ' + embedding_file_name + ' loaded in ' + str(datetime.now() - vec_type_start))
        for neighbour_size in neighbour_list:
            neighbour_size_start = datetime.now()
            column_name = 'l2_sim_' + vector_type + '_' + str(neighbour_size)
            word_pairs = zip(dataframe['c'], dataframe['i'])
            dataframe[column_name] = [
                get_similarity(first_word, second_word, embeddings, neighbour_size)
                for first_word, second_word in tqdm(word_pairs, total=row_count)
            ]
            print('Finished run for neighbour size: ' + str(neighbour_size) + ' in ' + str(datetime.now() - neighbour_size_start))
        print('Finished run for vector type: ' + vector_type + ' in ' + str(datetime.now() - vec_type_start))

In [7]:
# Compute the English (l2_sim_*) similarity columns; pd_dataset is updated in place.
run_experiment_english(pd_dataset)

Loading embeddings: cc.en.300.vec


  0%|          | 0/4937 [00:00<?, ?it/s]

Embeddings cc.en.300.vec loaded in 0:07:06.017325


100%|██████████| 4937/4937 [11:50<00:00,  6.95it/s]
  0%|          | 1/4937 [00:00<13:25,  6.13it/s]

Finished run for neighbour size: 5 in 0:11:50.284184


100%|██████████| 4937/4937 [11:58<00:00,  6.87it/s]
  0%|          | 1/4937 [00:00<14:06,  5.83it/s]

Finished run for neighbour size: 10 in 0:11:58.396067


100%|██████████| 4937/4937 [12:34<00:00,  6.54it/s]
  0%|          | 1/4937 [00:00<15:04,  5.46it/s]

Finished run for neighbour size: 25 in 0:12:34.896380


100%|██████████| 4937/4937 [13:32<00:00,  6.07it/s]
  0%|          | 0/4937 [00:00<?, ?it/s]

Finished run for neighbour size: 50 in 0:13:32.924132


100%|██████████| 4937/4937 [14:34<00:00,  5.64it/s]
  0%|          | 0/4937 [00:00<?, ?it/s]

Finished run for neighbour size: 75 in 0:14:34.614367


100%|██████████| 4937/4937 [15:35<00:00,  5.28it/s]


Finished run for neighbour size: 100 in 0:15:35.121350
Finished run for vector type: cc in 1:27:12.255671


In [10]:
# Checkpoint the frame after the English run.
# NOTE(review): "exnglish" in the file name looks like a typo for "english";
# left unchanged because other code may read this exact path.
pd_dataset.to_csv('../data/ms_final_exnglish_experiments2.csv', index = False)

In [20]:
# Distinct languages in the dataset, in order of first appearance; since the
# frame is sorted by language this is also alphabetical order.
language_list = pd_dataset['language'].unique()
# pd_dataset[pd_dataset['language'] == 'Catalan'].count()
language_list

array(['Catalan', 'Chinese (Simplified)', 'Dutch', 'French', 'German',
       'Greek', 'Italian', 'Japanese', 'Korean', 'Polish', 'Portuguese',
       'Russian', 'Spanish', 'Swedish', 'Thai', 'Turkish'], dtype=object)

In [26]:
def get_experiment_model(language, vector_type):
    """Load the embeddings for one language and report how long it took.

    Args:
        language: Language name understood by ``get_embedding_file_name``.
        vector_type: Embedding file name prefix (e.g. ``'cc'``).

    Returns:
        The loaded embeddings, as returned by ``load_embedding_vector``.
    """
    load_start = datetime.now()
    file_name = get_embedding_file_name(language, vector_type)
    print('Loading embeddings: ' + file_name)
    model = load_embedding_vector(file_name)
    elapsed = datetime.now() - load_start
    print('Embeddings ' + file_name + ' loaded in ' + str(elapsed))
    return model
# Module-level accumulator for the L1 similarity scores, keyed by neighbour
# size as a string. The keys must match the values in neighbour_list, and the
# lists keep growing across calls to run_experiment_l1.
new_columns = {'5': [], '10': [], '25': [], '50': [], '75': [], '100': []}
def run_experiment_l1(dataframe, languages=None):
    """Score the native-language word pairs (``l1_c`` / ``l1_i``) per language.

    For every embedding type in ``embedding_list`` and every language, the
    language's embeddings are loaded once and each neighbour size in
    ``neighbour_list`` is evaluated over that language's rows.

    The scores are NOT written to ``dataframe``; they are appended to the
    module-level ``new_columns[str(neighbour_size)]`` lists, in the order the
    languages are visited and, within a language, in ``dataframe`` row order.
    Calling this function again appends further scores to the same lists.
    The lists are keyed by neighbour size only, so scores from several
    embedding types would end up in the same list.

    Args:
        dataframe: Frame with ``language``, ``l1_c`` and ``l1_i`` columns.
        languages: Languages to process, in order. Defaults to the
            module-level ``language_list``.
    """
    if languages is None:
        languages = language_list

    for vector_type in embedding_list:
        vec_type_start = datetime.now()
        for language in languages:
            language_start = datetime.now()
            embeddings = get_experiment_model(language, vector_type)
            # The rows of a language do not depend on the neighbour size,
            # so select them once per language.
            dset = dataframe[dataframe['language'] == language]
            for neighbour_size in neighbour_list:
                neighbour_size_start = datetime.now()
                scores = new_columns[str(neighbour_size)]
                for w1, w2 in tqdm(zip(dset['l1_c'], dset['l1_i']), total=dset.shape[0]):
                    scores.append(get_similarity(w1, w2, embeddings, neighbour_size))
                print('Finished run for neighbour size: ' + str(neighbour_size) + ' in ' + str(datetime.now() - neighbour_size_start))
            print('Finished run for language: ' + language + ' in ' + str(datetime.now() - language_start))
        print('Finished run for vector type: ' + vector_type + ' in ' + str(datetime.now() - vec_type_start))

In [27]:
# Collect L1 similarity scores into new_columns. In the recorded run this
# stopped with a UnicodeDecodeError while loading the Korean vectors.
run_experiment_l1(pd_dataset)

Loading embeddings: cc.ca.vec


  0%|          | 0/325 [00:00<?, ?it/s]

Embeddings cc.ca.vec loaded in 0:07:09.292250


100%|██████████| 325/325 [00:55<00:00,  5.90it/s]
  0%|          | 1/325 [00:00<00:49,  6.54it/s]

Finished run for neighbour size: 5 in 0:00:55.072082


100%|██████████| 325/325 [00:51<00:00,  6.26it/s]
  0%|          | 1/325 [00:00<00:51,  6.27it/s]

Finished run for neighbour size: 10 in 0:00:51.956015


100%|██████████| 325/325 [00:54<00:00,  5.99it/s]
  0%|          | 1/325 [00:00<00:56,  5.76it/s]

Finished run for neighbour size: 25 in 0:00:54.256719


100%|██████████| 325/325 [00:58<00:00,  5.59it/s]
  0%|          | 1/325 [00:00<00:59,  5.45it/s]

Finished run for neighbour size: 50 in 0:00:58.187528


100%|██████████| 325/325 [01:02<00:00,  5.23it/s]
  0%|          | 1/325 [00:00<01:03,  5.13it/s]

Finished run for neighbour size: 75 in 0:01:02.130179


100%|██████████| 325/325 [01:06<00:00,  4.91it/s]


Finished run for neighbour size: 100 in 0:01:06.134163
Finished run for language: Catalan in 0:12:57.030242
Loading embeddings: cc.zh.vec


  0%|          | 0/310 [00:00<?, ?it/s]

Embeddings cc.zh.vec loaded in 0:07:07.722281


100%|██████████| 310/310 [00:52<00:00,  5.94it/s]
  0%|          | 1/310 [00:00<00:47,  6.50it/s]

Finished run for neighbour size: 5 in 0:00:52.232433


100%|██████████| 310/310 [00:49<00:00,  6.30it/s]
  0%|          | 1/310 [00:00<00:50,  6.07it/s]

Finished run for neighbour size: 10 in 0:00:49.233231


100%|██████████| 310/310 [00:51<00:00,  6.08it/s]
  0%|          | 1/310 [00:00<00:53,  5.75it/s]

Finished run for neighbour size: 25 in 0:00:51.016803


100%|██████████| 310/310 [00:54<00:00,  5.66it/s]
  0%|          | 1/310 [00:00<00:56,  5.44it/s]

Finished run for neighbour size: 50 in 0:00:54.760819


100%|██████████| 310/310 [01:01<00:00,  5.01it/s]
  0%|          | 0/310 [00:00<?, ?it/s]

Finished run for neighbour size: 75 in 0:01:01.918040


100%|██████████| 310/310 [01:03<00:00,  4.90it/s]


Finished run for neighbour size: 100 in 0:01:03.325404
Finished run for language: Chinese (Simplified) in 0:12:40.587322
Loading embeddings: cc.nl.vec


  0%|          | 0/5 [00:00<?, ?it/s]

Embeddings cc.nl.vec loaded in 0:07:00.517282


100%|██████████| 5/5 [00:04<00:00,  1.03it/s]
 20%|██        | 1/5 [00:00<00:00,  7.17it/s]

Finished run for neighbour size: 5 in 0:00:04.837619


100%|██████████| 5/5 [00:00<00:00,  6.53it/s]
 20%|██        | 1/5 [00:00<00:00,  6.47it/s]

Finished run for neighbour size: 10 in 0:00:00.769387


100%|██████████| 5/5 [00:00<00:00,  6.20it/s]
 20%|██        | 1/5 [00:00<00:00,  6.25it/s]

Finished run for neighbour size: 25 in 0:00:00.810457


100%|██████████| 5/5 [00:00<00:00,  5.82it/s]
 20%|██        | 1/5 [00:00<00:00,  5.86it/s]

Finished run for neighbour size: 50 in 0:00:00.863132


100%|██████████| 5/5 [00:00<00:00,  5.41it/s]
 20%|██        | 1/5 [00:00<00:00,  5.47it/s]

Finished run for neighbour size: 75 in 0:00:00.928672


100%|██████████| 5/5 [00:01<00:00,  4.26it/s]


Finished run for neighbour size: 100 in 0:00:01.180490
Finished run for language: Dutch in 0:07:10.280808
Loading embeddings: cc.fr.vec


  0%|          | 0/794 [00:00<?, ?it/s]

Embeddings cc.fr.vec loaded in 0:07:14.978056


100%|██████████| 794/794 [01:54<00:00,  6.95it/s]
  0%|          | 1/794 [00:00<01:41,  7.84it/s]

Finished run for neighbour size: 5 in 0:01:54.232184


100%|██████████| 794/794 [01:52<00:00,  7.05it/s]
  0%|          | 1/794 [00:00<01:43,  7.65it/s]

Finished run for neighbour size: 10 in 0:01:52.561033


100%|██████████| 794/794 [01:59<00:00,  6.64it/s]
  0%|          | 1/794 [00:00<01:53,  6.96it/s]

Finished run for neighbour size: 25 in 0:01:59.572932


100%|██████████| 794/794 [02:09<00:00,  6.14it/s]
  0%|          | 1/794 [00:00<02:03,  6.40it/s]

Finished run for neighbour size: 50 in 0:02:09.366889


100%|██████████| 794/794 [02:18<00:00,  5.73it/s]
  0%|          | 1/794 [00:00<02:12,  6.00it/s]

Finished run for neighbour size: 75 in 0:02:18.655738


100%|██████████| 794/794 [02:30<00:00,  5.29it/s]


Finished run for neighbour size: 100 in 0:02:30.193006
Finished run for language: French in 0:19:59.921305
Loading embeddings: cc.de.vec


  0%|          | 0/285 [00:00<?, ?it/s]

Embeddings cc.de.vec loaded in 0:07:02.294696


100%|██████████| 285/285 [00:47<00:00,  6.01it/s]
  0%|          | 1/285 [00:00<00:46,  6.11it/s]

Finished run for neighbour size: 5 in 0:00:47.396841


100%|██████████| 285/285 [00:44<00:00,  6.47it/s]
  0%|          | 1/285 [00:00<00:47,  5.99it/s]

Finished run for neighbour size: 10 in 0:00:44.050140


100%|██████████| 285/285 [00:46<00:00,  6.17it/s]
  0%|          | 1/285 [00:00<00:53,  5.29it/s]

Finished run for neighbour size: 25 in 0:00:46.164080


100%|██████████| 285/285 [00:49<00:00,  5.78it/s]
  0%|          | 1/285 [00:00<00:54,  5.25it/s]

Finished run for neighbour size: 50 in 0:00:49.281134


100%|██████████| 285/285 [00:53<00:00,  5.36it/s]
  0%|          | 0/285 [00:00<?, ?it/s]

Finished run for neighbour size: 75 in 0:00:53.172270


100%|██████████| 285/285 [00:56<00:00,  5.03it/s]


Finished run for neighbour size: 100 in 0:00:56.627613
Finished run for language: German in 0:11:59.373275
Loading embeddings: cc.el.vec


  0%|          | 0/353 [00:00<?, ?it/s]

Embeddings cc.el.vec loaded in 0:07:10.691420


100%|██████████| 353/353 [01:00<00:00,  5.84it/s]
  0%|          | 1/353 [00:00<00:55,  6.31it/s]

Finished run for neighbour size: 5 in 0:01:00.410533


100%|██████████| 353/353 [00:57<00:00,  6.16it/s]
  0%|          | 1/353 [00:00<00:58,  5.98it/s]

Finished run for neighbour size: 10 in 0:00:57.344507


100%|██████████| 353/353 [00:59<00:00,  5.91it/s]
  0%|          | 1/353 [00:00<01:02,  5.63it/s]

Finished run for neighbour size: 25 in 0:00:59.701793


100%|██████████| 353/353 [01:04<00:00,  5.50it/s]
  0%|          | 1/353 [00:00<01:07,  5.23it/s]

Finished run for neighbour size: 50 in 0:01:04.199789


100%|██████████| 353/353 [01:08<00:00,  5.16it/s]
  0%|          | 0/353 [00:00<?, ?it/s]

Finished run for neighbour size: 75 in 0:01:08.444983


100%|██████████| 353/353 [01:12<00:00,  4.86it/s]


Finished run for neighbour size: 100 in 0:01:12.675986
Finished run for language: Greek in 0:13:33.860647
Loading embeddings: cc.it.vec


  0%|          | 0/335 [00:00<?, ?it/s]

Embeddings cc.it.vec loaded in 0:07:04.711908


100%|██████████| 335/335 [00:55<00:00,  6.01it/s]
  0%|          | 1/335 [00:00<00:53,  6.24it/s]

Finished run for neighbour size: 5 in 0:00:55.725125


100%|██████████| 335/335 [00:52<00:00,  6.36it/s]
  0%|          | 1/335 [00:00<00:56,  5.96it/s]

Finished run for neighbour size: 10 in 0:00:52.674337


100%|██████████| 335/335 [00:55<00:00,  6.08it/s]
  0%|          | 1/335 [00:00<01:03,  5.23it/s]

Finished run for neighbour size: 25 in 0:00:55.094282


100%|██████████| 335/335 [00:59<00:00,  5.67it/s]
  0%|          | 1/335 [00:00<01:04,  5.21it/s]

Finished run for neighbour size: 50 in 0:00:59.123614


100%|██████████| 335/335 [01:03<00:00,  5.29it/s]
  0%|          | 0/335 [00:00<?, ?it/s]

Finished run for neighbour size: 75 in 0:01:03.346365


100%|██████████| 335/335 [01:07<00:00,  4.97it/s]


Finished run for neighbour size: 100 in 0:01:07.382719
Finished run for language: Italian in 0:12:58.447443
Loading embeddings: cc.ja.vec


  0%|          | 0/192 [00:00<?, ?it/s]

Embeddings cc.ja.vec loaded in 0:07:09.714264


100%|██████████| 192/192 [00:31<00:00,  6.16it/s]
  1%|          | 1/192 [00:00<00:29,  6.48it/s]

Finished run for neighbour size: 5 in 0:00:31.151865


100%|██████████| 192/192 [00:27<00:00,  7.00it/s]
  0%|          | 0/192 [00:00<?, ?it/s]

Finished run for neighbour size: 10 in 0:00:27.428547


100%|██████████| 192/192 [00:29<00:00,  6.59it/s]
  1%|          | 1/192 [00:00<00:33,  5.77it/s]

Finished run for neighbour size: 25 in 0:00:29.127981


100%|██████████| 192/192 [00:31<00:00,  6.13it/s]
  1%|          | 1/192 [00:00<00:35,  5.42it/s]

Finished run for neighbour size: 50 in 0:00:31.342228


100%|██████████| 192/192 [00:33<00:00,  5.70it/s]
  0%|          | 0/192 [00:00<?, ?it/s]

Finished run for neighbour size: 75 in 0:00:33.706907


100%|██████████| 192/192 [00:36<00:00,  5.32it/s]


Finished run for neighbour size: 100 in 0:00:36.078619
Finished run for language: Japanese in 0:10:18.938850
Loading embeddings: cc.ko.vec


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0: invalid continuation byte

In [30]:
# Report how many scores have been collected so far for each neighbour size.
for size_key, scores in new_columns.items():
    print(str(size_key) + ':' + str(len(scores)))

5:2599
10:2599
25:2599
50:2599
75:2599
100:2599


In [31]:
# Languages still to be processed after the first L1 run stopped at Korean;
# this is the tail of language_list starting from Korean.
pending_langs = ['Korean', 'Polish', 'Portuguese', 'Russian', 'Spanish', 'Swedish', 'Thai', 'Turkish']

In [32]:
# Load the Korean vectors directly, without unicode_errors, to reproduce the
# decode failure in isolation.
# NOTE(review): as recorded, this cell raises UnicodeDecodeError, and `test`
# is not used anywhere else in the visible notebook.
test = KeyedVectors.load_word2vec_format(embedding_path + 'cc.ko.vec', binary=False)  # C text format

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0: invalid continuation byte

In [39]:
# Snapshot of the scores collected so far, with every value turned into a
# string, keyed exactly like new_columns.
back_up_col = {
    size_key: [str(score) for score in scores]
    for size_key, scores in new_columns.items()
}

In [40]:
import json

# Persist the stringified partial results as a JSON document.
with open('back_up_tex.txt', 'w') as backup_file:
    json.dump(back_up_col, backup_file)

In [43]:
def run_experiment_l1(dataframe, languages=None):
    """Resume the L1 scoring for the languages that are still pending.

    For every embedding type in ``embedding_list`` and every language, the
    language's embeddings are loaded once and each neighbour size in
    ``neighbour_list`` is evaluated over that language's rows
    (columns ``l1_c`` / ``l1_i``).

    The scores are NOT written to ``dataframe``; they are appended to the
    module-level ``new_columns[str(neighbour_size)]`` lists, after whatever
    those lists already contain, in the order the languages are visited and,
    within a language, in ``dataframe`` row order. The lists are keyed by
    neighbour size only, so scores from several embedding types would end up
    in the same list.

    Args:
        dataframe: Frame with ``language``, ``l1_c`` and ``l1_i`` columns.
        languages: Languages to process, in order. Defaults to the
            module-level ``pending_langs``.
    """
    if languages is None:
        languages = pending_langs

    for vector_type in embedding_list:
        vec_type_start = datetime.now()
        for language in languages:
            language_start = datetime.now()
            embeddings = get_experiment_model(language, vector_type)
            # The rows of a language do not depend on the neighbour size,
            # so select them once per language.
            dset = dataframe[dataframe['language'] == language]
            for neighbour_size in neighbour_list:
                neighbour_size_start = datetime.now()
                scores = new_columns[str(neighbour_size)]
                for w1, w2 in tqdm(zip(dset['l1_c'], dset['l1_i']), total=dset.shape[0]):
                    scores.append(get_similarity(w1, w2, embeddings, neighbour_size))
                print('Finished run for neighbour size: ' + str(neighbour_size) + ' in ' + str(datetime.now() - neighbour_size_start))
            print('Finished run for language: ' + language + ' in ' + str(datetime.now() - language_start))
        print('Finished run for vector type: ' + vector_type + ' in ' + str(datetime.now() - vec_type_start))

In [44]:
# Resume the L1 run for the pending languages; scores are appended to the
# partial results already held in new_columns.
# NOTE(review): load_embedding_vector now passes unicode_errors='ignore', so
# on a fresh top-to-bottom run the earlier full-language call may no longer
# stop at Korean. This call would then append the pending languages a second
# time -- confirm before re-running the whole notebook.
run_experiment_l1(pd_dataset)

Loading embeddings: cc.ko.vec


  0%|          | 0/185 [00:00<?, ?it/s]

Embeddings cc.ko.vec loaded in 0:07:11.711635


100%|██████████| 185/185 [00:31<00:00,  5.80it/s]
  1%|          | 1/185 [00:00<00:25,  7.15it/s]

Finished run for neighbour size: 5 in 0:00:31.921732


100%|██████████| 185/185 [00:28<00:00,  6.47it/s]
  1%|          | 1/185 [00:00<00:27,  6.70it/s]

Finished run for neighbour size: 10 in 0:00:28.585561


100%|██████████| 185/185 [00:29<00:00,  6.20it/s]
  1%|          | 1/185 [00:00<00:29,  6.28it/s]

Finished run for neighbour size: 25 in 0:00:29.833846


100%|██████████| 185/185 [00:32<00:00,  5.76it/s]
  1%|          | 1/185 [00:00<00:32,  5.68it/s]

Finished run for neighbour size: 50 in 0:00:32.110382


100%|██████████| 185/185 [00:34<00:00,  5.37it/s]
  1%|          | 1/185 [00:00<00:35,  5.17it/s]

Finished run for neighbour size: 75 in 0:00:34.466499


100%|██████████| 185/185 [00:36<00:00,  5.06it/s]


Finished run for neighbour size: 100 in 0:00:36.531080
Finished run for language: Korean in 0:10:25.162136
Loading embeddings: cc.pl.vec


  0%|          | 0/295 [00:00<?, ?it/s]

Embeddings cc.pl.vec loaded in 0:07:03.711049


100%|██████████| 295/295 [00:51<00:00,  5.69it/s]
  0%|          | 1/295 [00:00<00:44,  6.59it/s]

Finished run for neighbour size: 5 in 0:00:51.833794


100%|██████████| 295/295 [00:48<00:00,  6.10it/s]
  0%|          | 1/295 [00:00<00:46,  6.30it/s]

Finished run for neighbour size: 10 in 0:00:48.401648


100%|██████████| 295/295 [00:50<00:00,  5.83it/s]
  0%|          | 1/295 [00:00<00:55,  5.32it/s]

Finished run for neighbour size: 25 in 0:00:50.638030


100%|██████████| 295/295 [00:54<00:00,  5.45it/s]
  0%|          | 0/295 [00:00<?, ?it/s]

Finished run for neighbour size: 50 in 0:00:54.111388


100%|██████████| 295/295 [00:57<00:00,  5.11it/s]
  0%|          | 1/295 [00:00<00:57,  5.13it/s]

Finished run for neighbour size: 75 in 0:00:57.763103


100%|██████████| 295/295 [01:01<00:00,  4.79it/s]


Finished run for neighbour size: 100 in 0:01:01.604794
Finished run for language: Polish in 0:12:28.506445
Loading embeddings: cc.pt.vec


  0%|          | 0/284 [00:00<?, ?it/s]

Embeddings cc.pt.vec loaded in 0:07:16.397818


100%|██████████| 284/284 [00:50<00:00,  5.66it/s]
  0%|          | 1/284 [00:00<00:49,  5.76it/s]

Finished run for neighbour size: 5 in 0:00:50.188008


100%|██████████| 284/284 [00:46<00:00,  6.10it/s]
  0%|          | 1/284 [00:00<00:52,  5.40it/s]

Finished run for neighbour size: 10 in 0:00:46.557223


100%|██████████| 284/284 [00:48<00:00,  5.80it/s]
  0%|          | 0/284 [00:00<?, ?it/s]

Finished run for neighbour size: 25 in 0:00:48.948331


100%|██████████| 284/284 [00:52<00:00,  5.43it/s]
  0%|          | 0/284 [00:00<?, ?it/s]

Finished run for neighbour size: 50 in 0:00:52.331980


100%|██████████| 284/284 [00:55<00:00,  5.10it/s]
  0%|          | 0/284 [00:00<?, ?it/s]

Finished run for neighbour size: 75 in 0:00:55.721916


100%|██████████| 284/284 [00:59<00:00,  4.80it/s]


Finished run for neighbour size: 100 in 0:00:59.114639
Finished run for language: Portuguese in 0:12:29.729882
Loading embeddings: cc.ru.vec


  0%|          | 0/340 [00:00<?, ?it/s]

Embeddings cc.ru.vec loaded in 0:07:04.674586


100%|██████████| 340/340 [00:55<00:00,  6.15it/s]
  0%|          | 1/340 [00:00<00:56,  6.03it/s]

Finished run for neighbour size: 5 in 0:00:55.329443


100%|██████████| 340/340 [00:52<00:00,  6.52it/s]
  0%|          | 1/340 [00:00<00:58,  5.82it/s]

Finished run for neighbour size: 10 in 0:00:52.170976


100%|██████████| 340/340 [00:54<00:00,  6.21it/s]
  0%|          | 1/340 [00:00<01:02,  5.41it/s]

Finished run for neighbour size: 25 in 0:00:54.729058


100%|██████████| 340/340 [00:58<00:00,  5.78it/s]
  0%|          | 0/340 [00:00<?, ?it/s]

Finished run for neighbour size: 50 in 0:00:58.808903


100%|██████████| 340/340 [01:02<00:00,  5.41it/s]
  0%|          | 0/340 [00:00<?, ?it/s]

Finished run for neighbour size: 75 in 0:01:02.816858


100%|██████████| 340/340 [01:06<00:00,  5.08it/s]


Finished run for neighbour size: 100 in 0:01:06.911859
Finished run for language: Russian in 0:12:55.888151
Loading embeddings: cc.es.vec


  0%|          | 0/796 [00:00<?, ?it/s]

Embeddings cc.es.vec loaded in 0:07:04.909528


100%|██████████| 796/796 [02:09<00:00,  6.16it/s]
  0%|          | 1/796 [00:00<01:56,  6.83it/s]

Finished run for neighbour size: 5 in 0:02:09.237948


100%|██████████| 796/796 [02:06<00:00,  6.27it/s]
  0%|          | 1/796 [00:00<02:00,  6.59it/s]

Finished run for neighbour size: 10 in 0:02:06.916814


100%|██████████| 796/796 [02:12<00:00,  5.99it/s]
  0%|          | 1/796 [00:00<02:10,  6.09it/s]

Finished run for neighbour size: 25 in 0:02:12.992009


100%|██████████| 796/796 [02:22<00:00,  5.57it/s]
  0%|          | 1/796 [00:00<02:19,  5.68it/s]

Finished run for neighbour size: 50 in 0:02:22.876694


100%|██████████| 796/796 [02:32<00:00,  5.21it/s]
  0%|          | 1/796 [00:00<02:32,  5.23it/s]

Finished run for neighbour size: 75 in 0:02:32.705465


100%|██████████| 796/796 [02:42<00:00,  4.90it/s]


Finished run for neighbour size: 100 in 0:02:42.560131
Finished run for language: Spanish in 0:21:12.629313
Loading embeddings: cc.sv.vec


  0%|          | 0/44 [00:00<?, ?it/s]

Embeddings cc.sv.vec loaded in 0:07:05.588783


100%|██████████| 44/44 [00:11<00:00,  3.89it/s]
  2%|▏         | 1/44 [00:00<00:06,  6.20it/s]

Finished run for neighbour size: 5 in 0:00:11.307525


100%|██████████| 44/44 [00:07<00:00,  5.95it/s]
  2%|▏         | 1/44 [00:00<00:07,  6.04it/s]

Finished run for neighbour size: 10 in 0:00:07.401635


100%|██████████| 44/44 [00:07<00:00,  5.79it/s]
  0%|          | 0/44 [00:00<?, ?it/s]

Finished run for neighbour size: 25 in 0:00:07.604443


100%|██████████| 44/44 [00:08<00:00,  5.28it/s]
  2%|▏         | 1/44 [00:00<00:08,  5.26it/s]

Finished run for neighbour size: 50 in 0:00:08.340819


100%|██████████| 44/44 [00:08<00:00,  5.01it/s]
  0%|          | 0/44 [00:00<?, ?it/s]

Finished run for neighbour size: 75 in 0:00:08.796875


100%|██████████| 44/44 [00:09<00:00,  4.73it/s]


Finished run for neighbour size: 100 in 0:00:09.302184
Finished run for language: Swedish in 0:07:58.744868
Loading embeddings: cc.th.vec


  0%|          | 0/122 [00:00<?, ?it/s]

Embeddings cc.th.vec loaded in 0:07:06.783297


100%|██████████| 122/122 [00:21<00:00,  5.69it/s]
  1%|          | 1/122 [00:00<00:20,  5.84it/s]

Finished run for neighbour size: 5 in 0:00:21.442978


100%|██████████| 122/122 [00:17<00:00,  6.82it/s]
  1%|          | 1/122 [00:00<00:19,  6.09it/s]

Finished run for neighbour size: 10 in 0:00:17.885704


100%|██████████| 122/122 [00:18<00:00,  6.55it/s]
  1%|          | 1/122 [00:00<00:20,  5.84it/s]

Finished run for neighbour size: 25 in 0:00:18.643837


100%|██████████| 122/122 [00:20<00:00,  6.05it/s]
  1%|          | 1/122 [00:00<00:21,  5.51it/s]

Finished run for neighbour size: 50 in 0:00:20.156787


100%|██████████| 122/122 [00:21<00:00,  5.65it/s]
  1%|          | 1/122 [00:00<00:23,  5.17it/s]

Finished run for neighbour size: 75 in 0:00:21.580317


100%|██████████| 122/122 [00:23<00:00,  5.29it/s]


Finished run for neighbour size: 100 in 0:00:23.078599
Finished run for language: Thai in 0:09:09.982138
Loading embeddings: cc.tr.vec


  0%|          | 0/272 [00:00<?, ?it/s]

Embeddings cc.tr.vec loaded in 0:07:03.522402


100%|██████████| 272/272 [00:48<00:00,  5.64it/s]
  0%|          | 1/272 [00:00<00:44,  6.16it/s]

Finished run for neighbour size: 5 in 0:00:48.240702


100%|██████████| 272/272 [00:44<00:00,  6.07it/s]
  0%|          | 1/272 [00:00<00:47,  5.70it/s]

Finished run for neighbour size: 10 in 0:00:44.815759


100%|██████████| 272/272 [00:46<00:00,  5.82it/s]
  0%|          | 1/272 [00:00<00:49,  5.46it/s]

Finished run for neighbour size: 25 in 0:00:46.766195


100%|██████████| 272/272 [00:50<00:00,  5.42it/s]
  0%|          | 0/272 [00:00<?, ?it/s]

Finished run for neighbour size: 50 in 0:00:50.226212


100%|██████████| 272/272 [00:53<00:00,  5.09it/s]
  0%|          | 0/272 [00:00<?, ?it/s]

Finished run for neighbour size: 75 in 0:00:53.489758


100%|██████████| 272/272 [00:56<00:00,  4.78it/s]


Finished run for neighbour size: 100 in 0:00:56.892230
Finished run for language: Turkish in 0:12:04.380668
Finished run for vector type: cc in 1:38:45.024622


In [48]:
# Check the number of collected scores per neighbour size; each list should
# now hold one score per dataset row.
for size_key, scores in new_columns.items():
    print(str(size_key) + ':' + str(len(scores)))

5:4937
10:4937
25:4937
50:4937
75:4937
100:4937


In [50]:
# Preview the frame, which at this point includes the l2_sim_cc_* columns.
pd_dataset.head()

Unnamed: 0,c,i,id,l1_c,l1_i,language,type,code,l2_sim_cc_5,l2_sim_cc_10,l2_sim_cc_25,l2_sim_cc_50,l2_sim_cc_75,l2_sim_cc_100
2827,plans,projects,person_2799,plans,projectes,Catalan,RN,ca,0.315914,0.289572,0.261106,0.246446,0.235596,0.228922
2733,waste,lose,person_2421,malgastar,perdre,Catalan,RV,ca,0.197119,0.175892,0.164163,0.14997,0.149026,0.142875
2732,ending,end,person_1465,final,final,Catalan,RN,ca,0.591592,0.500977,0.443042,0.402888,0.387598,0.379977
2731,shot,view,person_1465,disparat,mostra,Catalan,RN,ca,0.2104,0.187495,0.18507,0.189705,0.1673,0.161793
2730,do,join,person_1465,fer,unir,Catalan,RV,ca,0.225478,0.194971,0.201452,0.19757,0.192571,0.189414


In [52]:
# Attach the collected L1 scores as l1_sim_cc_<neighbour size> columns.
# The lists are assigned by position, so this relies on the scores having
# been collected in the same order as the rows of pd_dataset (language by
# language, following the frame's language-sorted order).
for size_key, scores in new_columns.items():
    pd_dataset['l1_sim_cc_' + str(size_key)] = scores

In [54]:
# Write the final frame (English and L1 similarity columns) to disk without the index.
pd_dataset.to_csv('../data/ms_final_experiments.csv', index = False)

In [55]:
# Rebuild the stringified snapshot of all collected scores and store it as a
# JSON document next to the notebook.
back_up_col = {
    size_key: [str(score) for score in scores]
    for size_key, scores in new_columns.items()
}

with open('final_similarity_dict.txt', 'w') as backup_file:
    json.dump(back_up_col, backup_file)