In [1]:
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm
from polyglot.mapping import Embedding

In [2]:
# Neighbourhood sizes; presumably the top-k cut-offs behind the `*_sim_<type>_<k>`
# columns of the experiment CSVs -- TODO confirm (not referenced by the visible cells).
neighbour_list = [5, 10, 25, 50, 75, 100]
# The two embedding families handled by the loader helpers in this notebook.
embedding_list = ['polyglot', 'cc']
# Root directory that get_embedding_file_name() prepends to every embedding file path.
embedding_path = '../data/pretrained_embeddings/'
# embedding_path = '../data/pretrained_embeddings/polyglot/embeddings2/'

In [3]:
# Dataset language name -> code used inside the embedding file names.
# Both spellings of Chinese found in the data map to the same code.
_LANGUAGE_CODES = {
    'Russian': 'ru',
    'Turkish': 'tr',
    'German': 'de',
    'Japanese': 'ja',
    'Spanish': 'es',
    'Polish': 'pl',
    'Italian': 'it',
    'Catalan': 'ca',
    'Korean': 'ko',
    'French': 'fr',
    'Chinese': 'zh',
    'Chinese (Simplified)': 'zh',
    'Portuguese': 'pt',
    'Swedish': 'sv',
    'Greek': 'el',
    'Thai': 'th',
    'Dutch': 'nl',
    'English': 'en',
}


def get_embedding_file_name(language, file_type, base_path=None):
    """Build the path of the pretrained embedding file for a language.

    Parameters
    ----------
    language : str
        Language name as it appears in the dataset (e.g. ``"Catalan"``).
    file_type : str
        ``'cc'`` for the ``.vec`` text files or ``'polyglot'`` for the
        ``embeddings_pkl.tar.bz2`` archives.
    base_path : str, optional
        Directory prefix (including the trailing separator). Defaults to the
        notebook-level ``embedding_path``.

    Returns
    -------
    str
        ``<base>cc.<code>.vec`` for ``'cc'`` (English uses the code ``en.300``)
        or ``<base>polyglot/embeddings2/<code>/embeddings_pkl.tar.bz2``.

    Raises
    ------
    ValueError
        If ``language`` or ``file_type`` is not supported. Previously such
        input silently produced an empty or malformed path.
    """
    if file_type not in ('cc', 'polyglot'):
        raise ValueError('Unsupported embedding type: ' + repr(file_type))
    try:
        lang_code = _LANGUAGE_CODES[language]
    except KeyError:
        raise ValueError('Unsupported language: ' + repr(language)) from None

    if base_path is None:
        base_path = embedding_path

    if file_type == 'cc':
        # The English cc file is the only one carrying the dimension in its name.
        if language == 'English':
            lang_code += '.300'
        return base_path + file_type + '.' + lang_code + '.vec'
    return base_path + file_type + '/embeddings2/' + lang_code + '/embeddings_pkl.tar.bz2'

def load_embedding_vector(language, file_type):
    """Load the pretrained embeddings of `language` for the given family.

    Parameters
    ----------
    language : str
        Language name understood by ``get_embedding_file_name``.
    file_type : str
        ``'cc'`` loads the file with ``KeyedVectors.load_word2vec_format``
        (text format); ``'polyglot'`` loads it with ``Embedding.load`` and
        returns the result of ``normalize_words()``.

    Returns
    -------
    The loaded embedding object (a gensim ``KeyedVectors`` or a polyglot
    ``Embedding``, depending on ``file_type``).

    Raises
    ------
    ValueError
        If ``file_type`` is neither ``'cc'`` nor ``'polyglot'``.
    """
    # Validate before doing any work: an unknown type used to fall through both
    # branches and die with UnboundLocalError on `return vec`.
    if file_type not in ('cc', 'polyglot'):
        raise ValueError('Unsupported embedding type: ' + repr(file_type))

    file_name = get_embedding_file_name(language, file_type)
    print('Loading file: ' + file_name)
    if file_type == 'cc':
        print('Loading fasttext')
        # C text format; undecodable bytes in the file are ignored.
        return KeyedVectors.load_word2vec_format(file_name, binary=False, unicode_errors='ignore')

    print('Loading polyglot')
    return Embedding.load(file_name).normalize_words()

In [4]:
def proper_case(word):
    """Return `word` with its first character upper-cased and the rest untouched.

    Unlike ``str.capitalize`` the remainder of the string is not lower-cased.
    An empty string is returned unchanged (slicing avoids the IndexError that
    ``word[0]`` raised on empty input).
    """
    return word[:1].upper() + word[1:]

def get_neigbours(word, embeddings, vector_type, size, sorting_type = 'desc'):
    """Collect the `size` nearest neighbours of `word` from `embeddings`.

    Parameters
    ----------
    word : str
        Query word. For ``'polyglot'`` a failed lookup is retried once with the
        first letter upper-cased.
    embeddings
        Object supporting ``embeddings[word]`` plus ``nearest_neighbors``
        (``'polyglot'``) or ``similar_by_vector`` (``'cc'``).
    vector_type : str
        ``'polyglot'`` or ``'cc'``; any other value yields two empty lists.
    size : int
        Number of neighbours requested.
    sorting_type : str
        ``'asc'`` reverses the order delivered by the backend; anything else
        keeps it.

    Returns
    -------
    tuple
        ``(names, details)`` where ``names`` is a list of neighbour words and
        ``details`` a parallel list of ``{'word', 'similarity'}`` dicts with
        the similarity stored as a string. When the word is not in the
        vocabulary the sentinel ``(-10, -10)`` is returned instead.
    """
    try:
        query_vec = embeddings[word]
    except KeyError:
        if vector_type != 'polyglot':
            return -10, -10
        # Retry with a capitalised first letter before giving up.
        word = proper_case(word)
        try:
            query_vec = embeddings[word]
        except KeyError:
            return -10, -10

    names = []
    details = []
    if vector_type == 'polyglot':
        neighbours = embeddings.nearest_neighbors(word, top_k = size)
        neighbour_vecs = [embeddings[neighbour] for neighbour in neighbours]
        similarities = cosine_similarity([query_vec], neighbour_vecs)[0]
        for neighbour, similarity in zip(neighbours, similarities):
            names.append(neighbour)
            details.append({'word': neighbour, 'similarity': str(similarity)})
    elif vector_type == 'cc':
        # Ask for one extra hit because the query word itself is among the results.
        ranked = embeddings.similar_by_vector(query_vec, topn = size + 1)
        # Drop the query word by name rather than by position: on a similarity
        # tie it is not guaranteed to be the first entry.
        ranked = [pair for pair in ranked if pair[0] != word][:size]
        for neighbour, similarity in ranked:
            names.append(neighbour)
            details.append({'word': neighbour, 'similarity': str(similarity)})

    if sorting_type == 'asc':
        names.reverse()
        details.reverse()
    return names, details

In [5]:
# source = '../data/ms_final_with_polyglot_2.csv'
# pd_dataset = pd.read_csv(source)
# pd_dataset = pd_dataset.sort_values(by = 'language', ascending = True)
# pd_dataset.head()

In [6]:
# test = load_embedding_vector("English", "polyglot")

In [7]:
# test = test.normalize_words()
# n = test.nearest_neighbors("hello")
# print(n)
# n_v = [test[word] for word in n]
# n_s = cosine_similarity([test["hello"]], n_v)

# for i, item in enumerate(n_s[0]):
#     print(item)
# res1, res2 = get_neigbours("hello", test, "polyglot", 10, 'desc')
# print(res1)
# print(res2)

In [8]:
# test = load_embedding_vector("English", "cc")

In [9]:
# n = test.similar_by_vector(test["hello"])
# def get_formatted_neighbours(neighbours, vector_type):
#     res1 = []
#     res2 = []
#     return res1, res2

In [10]:
# Load the fastText experiment cases and order them by language.
source = '../data/fasttext_experiments.csv'
pd_dataset = pd.read_csv(source)
pd_dataset = pd_dataset.sort_values(by = 'language', ascending = True)
# Preview the first rows.
pd_dataset.head()

Unnamed: 0,c,i,id,l1_c,l1_i,language,type,code,l2_sim_cc_5,l2_sim_cc_10,...,l2_sim_cc_50,l2_sim_cc_75,l2_sim_cc_100,l1_sim_cc_5,l1_sim_cc_10,l1_sim_cc_25,l1_sim_cc_50,l1_sim_cc_75,l1_sim_cc_100,case_id
0,plans,projects,person_2799,plans,projectes,Catalan,RN,ca,0.315914,0.289572,...,0.246446,0.235596,0.228922,0.53406,0.509746,0.513301,0.508249,0.497365,0.494275,case_0001
221,stars,starts,person_2127,estrelles,comença,Catalan,RN,ca,0.068227,0.068642,...,0.062946,0.059115,0.056152,0.168406,0.16743,0.177438,0.174349,0.173036,0.175587,case_0222
220,time,hours,person_2127,temps,hores,Catalan,RN,ca,0.464017,0.407275,...,0.373413,0.359142,0.35413,0.380932,0.359519,0.346928,0.349675,0.35266,0.349513,case_0221
219,coming,following,person_2127,venint,següents,Catalan,RJ,ca,0.183514,0.166062,...,0.170379,0.165272,0.161477,0.182136,0.181615,0.211532,0.218019,0.222701,0.225519,case_0220
218,equipment,material,person_2127,equip,material,Catalan,RN,ca,0.279342,0.251406,...,0.225062,0.223828,0.219889,0.352026,0.340105,0.32313,0.316861,0.310028,0.312223,case_0219


In [11]:
# Accumulators keyed by case_id: neighbour word lists, and the same neighbours
# paired with their similarity. Filled in place by the get_*_data helpers.
fasttext_result_set = {}
fasttext_result_set_sim = {}
def get_english_data(dataframe, vector_type, rset1, rset2, size=100):
    """Store the English neighbours of every case's `c` and `i` words.

    Loads the English embeddings of `vector_type` once, then for each row of
    `dataframe` looks up the neighbours of the `c` and `i` columns and writes
    them under the row's `case_id`.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns ``'c'``, ``'i'`` and ``'case_id'``.
    vector_type : str
        Embedding family passed to ``load_embedding_vector`` / ``get_neigbours``.
    rset1 : dict
        Updated in place: ``rset1[case_id]['i_nn' | 'c_nn']`` receives the
        neighbour word lists.
    rset2 : dict
        Updated in place with the same keys, holding the word/similarity dicts.
    size : int
        Number of neighbours requested per word (default 100, the value that
        used to be hard-coded).

    Returns
    -------
    None
    """
    load_started = datetime.now()
    print('Loading embeddings')
    embeddings = load_embedding_vector("English", vector_type)
    print('Embeddings for english, ' + vector_type + ' vectors loaded in ' + str(datetime.now() - load_started))

    for _, row in tqdm(dataframe.iterrows(), total=dataframe.shape[0]):
        case_id = row['case_id']
        c_n, c_n_d = get_neigbours(row['c'], embeddings, vector_type, size, 'desc')
        i_n, i_n_d = get_neigbours(row['i'], embeddings, vector_type, size, 'desc')

        # Create the per-case entry on first sight; keep whatever is already there.
        words_entry = rset1.setdefault(case_id, {})
        sims_entry = rset2.setdefault(case_id, {})

        words_entry['i_nn'] = i_n
        words_entry['c_nn'] = c_n
        sims_entry['i_nn'] = i_n_d
        sims_entry['c_nn'] = c_n_d


In [12]:
# get_english_data(pd_dataset, "cc", fasttext_result_set, fasttext_result_set_sim)

In [13]:
import json

In [14]:
# with open('fasttext_result_set_english.json', 'w') as fp:
#     json.dump(fasttext_result_set, fp)
# with open('fasttext_result_set_sim_english.json', 'w') as fp:
#     json.dump(fasttext_result_set_sim, fp)


In [15]:
# Distinct languages present in the fastText dataset, in order of first appearance.
language_list = pd_dataset['language'].unique()

def get_l1_data(dataframe, language_list, vector_type, rset1, rset2, size=100):
    """Store the native-language neighbours of every case's `l1_c` and `l1_i` words.

    For each language the matching embeddings are loaded once and applied to
    the rows of `dataframe` whose ``'language'`` column equals that language.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns ``'language'``, ``'l1_c'``, ``'l1_i'`` and
        ``'case_id'``.
    language_list : iterable of str
        Languages to process, in order.
    vector_type : str
        Embedding family passed to ``load_embedding_vector`` / ``get_neigbours``.
    rset1 : dict
        Updated in place: ``rset1[case_id]['l1_i_nn' | 'l1_c_nn']`` receives
        the neighbour word lists.
    rset2 : dict
        Updated in place with the same keys, holding the word/similarity dicts.
    size : int
        Number of neighbours requested per word (default 100, the value that
        used to be hard-coded).

    Returns
    -------
    None
    """
    for language in language_list:
        language_start = datetime.now()
        print('Loading embeddings')
        embeddings = load_embedding_vector(language, vector_type)
        print('Embeddings for ' + language + ' , ' + vector_type + ' vectors loaded in ' + str(datetime.now() - language_start))

        dset = dataframe[dataframe['language'] == language]
        for _, row in tqdm(dset.iterrows(), total=dset.shape[0]):
            case_id = row['case_id']
            c_n, c_n_d = get_neigbours(row['l1_c'], embeddings, vector_type, size, 'desc')
            i_n, i_n_d = get_neigbours(row['l1_i'], embeddings, vector_type, size, 'desc')

            # Create the per-case entry on first sight; keep whatever is already there.
            words_entry = rset1.setdefault(case_id, {})
            sims_entry = rset2.setdefault(case_id, {})

            words_entry['l1_i_nn'] = i_n
            words_entry['l1_c_nn'] = c_n
            sims_entry['l1_i_nn'] = i_n_d
            sims_entry['l1_c_nn'] = c_n_d

        # Drop the reference now: otherwise this language's vectors stay alive
        # while the next language's file is being loaded.
        del embeddings
        print('Finished run for language: ' + language + ' in ' + str(datetime.now() - language_start))

In [16]:
# get_l1_data(pd_dataset, language_list, "cc", fasttext_result_set, fasttext_result_set_sim)

In [17]:
# with open('fasttext_result_set.json', 'w') as fp:
#     json.dump(fasttext_result_set, fp)
# with open('fasttext_result_set_sim.json', 'w') as fp:
#     json.dump(fasttext_result_set_sim, fp)


In [18]:
# Load the polyglot experiment cases and order them by language.
# Note: `source` is rebound here; it previously pointed at the fastText CSV.
source = '../data/polyglot_experiment.csv'
pd2_dataset = pd.read_csv(source)
pd2_dataset = pd2_dataset.sort_values(by = 'language', ascending = True)
# Preview the first rows.
pd2_dataset.head()

Unnamed: 0,c,i,id,l1_c,l1_i,language,type,code,l2_sim_cc_5,l2_sim_cc_10,...,l2_sim_polyglot_50,l2_sim_polyglot_75,l2_sim_polyglot_100,l1_sim_polyglot_5,l1_sim_polyglot_10,l1_sim_polyglot_25,l1_sim_polyglot_50,l1_sim_polyglot_75,l1_sim_polyglot_100,case_id
0,plans,projects,person_2799,plans,projectes,Catalan,RN,ca,0.315914,0.289572,...,0.64229,0.636711,0.636448,0.706547,0.719866,0.709139,0.681242,0.676333,0.667461,case_0001
211,especially,specially,person_2421,especialment,especialment,Catalan,RY,ca,0.342199,0.336672,...,0.5139,0.486702,0.479133,0.828464,0.793318,0.749138,0.71661,0.689578,0.673222,case_0009
210,waste,lose,person_2421,malgastar,perdre,Catalan,RV,ca,0.197119,0.175892,...,0.098449,0.101967,0.105505,0.63133,0.636749,0.636339,0.637038,0.635354,0.634739,case_0002
209,ending,end,person_1465,final,final,Catalan,RN,ca,0.591592,0.500977,...,0.409819,0.411196,0.40494,0.784284,0.765748,0.735284,0.709147,0.690993,0.676256,case_0003
208,bored,boring,person_1642,avorrit,avorrit,Catalan,RJ,ca,0.513438,0.50285,...,0.627174,0.611835,0.5976,0.733787,0.727427,0.721897,0.708325,0.698112,0.69043,case_0026


In [19]:
# Accumulators for the polyglot run, keyed by case_id: neighbour word lists,
# and the same neighbours paired with their similarity.
poly_result_set = {}
poly_result_set_sim = {}

In [20]:
def _dump_result_sets(prefix, suffix, neighbours, similarities):
    """Write both result dicts as JSON into the current working directory.

    Produces ``<prefix>_result_set<suffix>.json`` (neighbour words) and
    ``<prefix>_result_set_sim<suffix>.json`` (words with similarities).
    """
    with open(prefix + '_result_set' + suffix + '.json', 'w') as fp:
        json.dump(neighbours, fp)
    with open(prefix + '_result_set_sim' + suffix + '.json', 'w') as fp:
        json.dump(similarities, fp)


# Start every run from empty accumulators so re-executing the cell is idempotent.
poly_result_set = {}
poly_result_set_sim = {}
fasttext_result_set = {}
fasttext_result_set_sim = {}
language_list = pd_dataset['language'].unique()
language_list2 = pd2_dataset['language'].unique()

# fastText: English neighbours first (snapshot saved with the `_eng` suffix),
# then the native-language neighbours are added to the same dicts and saved again.
get_english_data(pd_dataset, "cc", fasttext_result_set, fasttext_result_set_sim)
_dump_result_sets('fasttext', '_eng', fasttext_result_set, fasttext_result_set_sim)
get_l1_data(pd_dataset, language_list, "cc", fasttext_result_set, fasttext_result_set_sim)
_dump_result_sets('fasttext', '', fasttext_result_set, fasttext_result_set_sim)

# polyglot: same two-step procedure on the polyglot dataset.
get_english_data(pd2_dataset, "polyglot", poly_result_set, poly_result_set_sim)
_dump_result_sets('poly', '_eng', poly_result_set, poly_result_set_sim)
get_l1_data(pd2_dataset, language_list2, "polyglot", poly_result_set, poly_result_set_sim)
_dump_result_sets('poly', '', poly_result_set, poly_result_set_sim)

Loading embeddings


  0%|          | 0/4937 [00:00<?, ?it/s]

Loading file: ../data/pretrained_embeddings/cc.en.300.vec
Loading fasttext
Embeddings for english, cc vectors loaded in 0:07:09.103982


100%|██████████| 4937/4937 [11:48<00:00,  6.97it/s]


Loading embeddings
Loading file: ../data/pretrained_embeddings/cc.ca.vec
Loading fasttext


  0%|          | 0/325 [00:00<?, ?it/s]

Embeddings for Catalan , cc vectors loaded in 0:07:01.579917


100%|██████████| 325/325 [00:55<00:00,  5.89it/s]


Finished run for language: Catalan in 0:07:56.736568
Loading embeddings
Loading file: ../data/pretrained_embeddings/cc.zh.vec
Loading fasttext


  0%|          | 0/310 [00:00<?, ?it/s]

Embeddings for Chinese (Simplified) , cc vectors loaded in 0:07:06.395914


100%|██████████| 310/310 [00:52<00:00,  5.94it/s]


Finished run for language: Chinese (Simplified) in 0:07:58.625196
Loading embeddings
Loading file: ../data/pretrained_embeddings/cc.nl.vec
Loading fasttext


  0%|          | 0/5 [00:00<?, ?it/s]

Embeddings for Dutch , cc vectors loaded in 0:07:01.502012


100%|██████████| 5/5 [00:04<00:00,  1.03it/s]


Finished run for language: Dutch in 0:07:06.351772
Loading embeddings
Loading file: ../data/pretrained_embeddings/cc.fr.vec
Loading fasttext


  0%|          | 0/794 [00:00<?, ?it/s]

Embeddings for French , cc vectors loaded in 0:07:09.132825


100%|██████████| 794/794 [01:54<00:00,  6.96it/s]


Finished run for language: French in 0:09:03.195464
Loading embeddings
Loading file: ../data/pretrained_embeddings/cc.de.vec
Loading fasttext


  0%|          | 0/285 [00:00<?, ?it/s]

Embeddings for German , cc vectors loaded in 0:07:01.714963


100%|██████████| 285/285 [00:47<00:00,  6.05it/s]


Finished run for language: German in 0:07:48.861614
Loading embeddings
Loading file: ../data/pretrained_embeddings/cc.el.vec
Loading fasttext


  0%|          | 0/353 [00:00<?, ?it/s]

Embeddings for Greek , cc vectors loaded in 0:07:15.235736


100%|██████████| 353/353 [01:00<00:00,  5.85it/s]


Finished run for language: Greek in 0:08:15.551755
Loading embeddings
Loading file: ../data/pretrained_embeddings/cc.it.vec
Loading fasttext


  0%|          | 0/335 [00:00<?, ?it/s]

Embeddings for Italian , cc vectors loaded in 0:07:07.340468


100%|██████████| 335/335 [00:55<00:00,  6.01it/s]


Finished run for language: Italian in 0:08:03.103744
Loading embeddings
Loading file: ../data/pretrained_embeddings/cc.ja.vec
Loading fasttext


  0%|          | 0/192 [00:00<?, ?it/s]

Embeddings for Japanese , cc vectors loaded in 0:07:13.301315


100%|██████████| 192/192 [00:30<00:00,  6.23it/s]


Finished run for language: Japanese in 0:07:44.129173
Loading embeddings
Loading file: ../data/pretrained_embeddings/cc.ko.vec
Loading fasttext


  0%|          | 0/185 [00:00<?, ?it/s]

Embeddings for Korean , cc vectors loaded in 0:07:09.108958


100%|██████████| 185/185 [00:31<00:00,  5.83it/s]


Finished run for language: Korean in 0:07:40.850976
Loading embeddings
Loading file: ../data/pretrained_embeddings/cc.pl.vec
Loading fasttext


  0%|          | 0/295 [00:00<?, ?it/s]

Embeddings for Polish , cc vectors loaded in 0:07:05.725739


100%|██████████| 295/295 [00:51<00:00,  5.74it/s]


Finished run for language: Polish in 0:07:57.105373
Loading embeddings
Loading file: ../data/pretrained_embeddings/cc.pt.vec
Loading fasttext


  0%|          | 0/284 [00:00<?, ?it/s]

Embeddings for Portuguese , cc vectors loaded in 0:07:00.727999


100%|██████████| 284/284 [00:49<00:00,  5.74it/s]


Finished run for language: Portuguese in 0:07:50.226254
Loading embeddings
Loading file: ../data/pretrained_embeddings/cc.ru.vec
Loading fasttext


  0%|          | 0/340 [00:00<?, ?it/s]

Embeddings for Russian , cc vectors loaded in 0:07:15.373764


100%|██████████| 340/340 [00:54<00:00,  6.21it/s]


Finished run for language: Russian in 0:08:10.162014
Loading embeddings
Loading file: ../data/pretrained_embeddings/cc.es.vec
Loading fasttext


  0%|          | 0/796 [00:00<?, ?it/s]

Embeddings for Spanish , cc vectors loaded in 0:07:08.690533


100%|██████████| 796/796 [02:08<00:00,  6.22it/s]


Finished run for language: Spanish in 0:09:16.767241
Loading embeddings
Loading file: ../data/pretrained_embeddings/cc.sv.vec
Loading fasttext


  0%|          | 0/44 [00:00<?, ?it/s]

Embeddings for Swedish , cc vectors loaded in 0:07:05.156546


100%|██████████| 44/44 [00:11<00:00,  3.93it/s]


Finished run for language: Swedish in 0:07:16.367477
Loading embeddings
Loading file: ../data/pretrained_embeddings/cc.th.vec
Loading fasttext


  0%|          | 0/122 [00:00<?, ?it/s]

Embeddings for Thai , cc vectors loaded in 0:07:09.325433


100%|██████████| 122/122 [00:21<00:00,  5.76it/s]


Finished run for language: Thai in 0:07:30.522057
Loading embeddings
Loading file: ../data/pretrained_embeddings/cc.tr.vec
Loading fasttext


  0%|          | 0/272 [00:00<?, ?it/s]

Embeddings for Turkish , cc vectors loaded in 0:07:07.057581


100%|██████████| 272/272 [00:47<00:00,  5.70it/s]


Finished run for language: Turkish in 0:07:54.767356
Loading embeddings
Loading file: ../data/pretrained_embeddings/polyglot/embeddings2/en/embeddings_pkl.tar.bz2
Loading polyglot


  0%|          | 2/4449 [00:00<03:52, 19.13it/s]

Embeddings for english, polyglot vectors loaded in 0:00:02.177144


100%|██████████| 4449/4449 [03:17<00:00, 22.56it/s]


Loading embeddings
Loading file: ../data/pretrained_embeddings/polyglot/embeddings2/ca/embeddings_pkl.tar.bz2
Loading polyglot


  1%|          | 2/311 [00:00<00:22, 13.76it/s]

Embeddings for Catalan , polyglot vectors loaded in 0:00:02.319872


100%|██████████| 311/311 [00:14<00:00, 21.03it/s]


Finished run for language: Catalan in 0:00:17.116839
Loading embeddings
Loading file: ../data/pretrained_embeddings/polyglot/embeddings2/zh/embeddings_pkl.tar.bz2
Loading polyglot


  1%|          | 2/288 [00:00<00:15, 17.98it/s]

Embeddings for Chinese (Simplified) , polyglot vectors loaded in 0:00:02.149796


100%|██████████| 288/288 [00:12<00:00, 22.75it/s]


Finished run for language: Chinese (Simplified) in 0:00:14.812962
Loading embeddings
Loading file: ../data/pretrained_embeddings/polyglot/embeddings2/nl/embeddings_pkl.tar.bz2
Loading polyglot


 60%|██████    | 3/5 [00:00<00:00, 23.24it/s]

Embeddings for Dutch , polyglot vectors loaded in 0:00:02.148005


100%|██████████| 5/5 [00:00<00:00, 23.09it/s]


Finished run for language: Dutch in 0:00:02.369277
Loading embeddings
Loading file: ../data/pretrained_embeddings/polyglot/embeddings2/fr/embeddings_pkl.tar.bz2
Loading polyglot


  0%|          | 3/767 [00:00<00:33, 22.77it/s]

Embeddings for French , polyglot vectors loaded in 0:00:02.189251


100%|██████████| 767/767 [00:34<00:00, 22.33it/s]


Finished run for language: French in 0:00:36.542739
Loading embeddings
Loading file: ../data/pretrained_embeddings/polyglot/embeddings2/de/embeddings_pkl.tar.bz2
Loading polyglot


  1%|          | 3/275 [00:00<00:11, 22.85it/s]

Embeddings for German , polyglot vectors loaded in 0:00:02.161819


100%|██████████| 275/275 [00:12<00:00, 22.39it/s]


Finished run for language: German in 0:00:14.448951
Loading embeddings
Loading file: ../data/pretrained_embeddings/polyglot/embeddings2/el/embeddings_pkl.tar.bz2
Loading polyglot


  1%|          | 3/320 [00:00<00:14, 22.13it/s]

Embeddings for Greek , polyglot vectors loaded in 0:00:02.215674


100%|██████████| 320/320 [00:14<00:00, 22.42it/s]


Finished run for language: Greek in 0:00:16.495443
Loading embeddings
Loading file: ../data/pretrained_embeddings/polyglot/embeddings2/it/embeddings_pkl.tar.bz2
Loading polyglot


  1%|          | 3/326 [00:00<00:14, 23.05it/s]

Embeddings for Italian , polyglot vectors loaded in 0:00:02.147150


100%|██████████| 326/326 [00:14<00:00, 22.31it/s]


Finished run for language: Italian in 0:00:16.761423
Loading embeddings
Loading file: ../data/pretrained_embeddings/polyglot/embeddings2/ko/embeddings_pkl.tar.bz2
Loading polyglot


  0%|          | 0/166 [00:00<?, ?it/s]

Embeddings for Korean , polyglot vectors loaded in 0:00:02.648320


100%|██████████| 166/166 [00:16<00:00, 10.32it/s]


Finished run for language: Korean in 0:00:18.734360
Loading embeddings
Loading file: ../data/pretrained_embeddings/polyglot/embeddings2/pl/embeddings_pkl.tar.bz2
Loading polyglot


  0%|          | 1/287 [00:00<00:56,  5.06it/s]

Embeddings for Polish , polyglot vectors loaded in 0:00:05.350116


100%|██████████| 287/287 [00:55<00:00,  5.16it/s]


Finished run for language: Polish in 0:01:00.995176
Loading embeddings
Loading file: ../data/pretrained_embeddings/polyglot/embeddings2/pt/embeddings_pkl.tar.bz2
Loading polyglot


  1%|          | 3/276 [00:00<00:12, 22.24it/s]

Embeddings for Portuguese , polyglot vectors loaded in 0:00:02.213266


100%|██████████| 276/276 [00:12<00:00, 22.66it/s]


Finished run for language: Portuguese in 0:00:14.400013
Loading embeddings
Loading file: ../data/pretrained_embeddings/polyglot/embeddings2/ru/embeddings_pkl.tar.bz2
Loading polyglot


  1%|          | 3/301 [00:00<00:13, 22.57it/s]

Embeddings for Russian , polyglot vectors loaded in 0:00:02.220222


100%|██████████| 301/301 [00:13<00:00, 22.21it/s]


Finished run for language: Russian in 0:00:15.776340
Loading embeddings
Loading file: ../data/pretrained_embeddings/polyglot/embeddings2/es/embeddings_pkl.tar.bz2
Loading polyglot


  0%|          | 3/759 [00:00<00:35, 21.46it/s]

Embeddings for Spanish , polyglot vectors loaded in 0:00:02.189325


100%|██████████| 759/759 [00:33<00:00, 22.47it/s]


Finished run for language: Spanish in 0:00:35.971155
Loading embeddings
Loading file: ../data/pretrained_embeddings/polyglot/embeddings2/sv/embeddings_pkl.tar.bz2
Loading polyglot


  8%|▊         | 3/40 [00:00<00:01, 23.11it/s]

Embeddings for Swedish , polyglot vectors loaded in 0:00:02.195466


100%|██████████| 40/40 [00:01<00:00, 22.70it/s]


Finished run for language: Swedish in 0:00:03.962193
Loading embeddings
Loading file: ../data/pretrained_embeddings/polyglot/embeddings2/th/embeddings_pkl.tar.bz2
Loading polyglot


  5%|▍         | 5/102 [00:00<00:02, 41.67it/s]

Embeddings for Thai , polyglot vectors loaded in 0:00:01.230054


100%|██████████| 102/102 [00:02<00:00, 42.34it/s]


Finished run for language: Thai in 0:00:03.644029
Loading embeddings
Loading file: ../data/pretrained_embeddings/polyglot/embeddings2/tr/embeddings_pkl.tar.bz2
Loading polyglot


  1%|          | 2/226 [00:00<00:13, 17.03it/s]

Embeddings for Turkish , polyglot vectors loaded in 0:00:02.302487


100%|██████████| 226/226 [00:09<00:00, 22.66it/s]


Finished run for language: Turkish in 0:00:12.282282


In [25]:
# for key in fasttext_result_set:
#     print(key + ':' + str(len(fasttext_result_set[key])))
# Spot-check a single case: list the keys stored for it in each fastText result dict.
print(fasttext_result_set['case_3309'].keys())
print(fasttext_result_set_sim['case_3309'].keys())

dict_keys(['i_nn', 'c_nn', 'l1_i_nn', 'l1_c_nn'])
dict_keys(['i_nn', 'c_nn', 'l1_i_nn', 'l1_c_nn'])


In [26]:
# fasttext_result_set

{'case_0001': {},
 'i_nn': ['gives',
  'giving',
  'gave',
  'Give',
  'provide',
  'get',
  'gve',
  'want',
  'take',
  'ask',
  'let',
  'try',
  'given',
  'bring',
  'chance',
  'allow',
  'help',
  '.Give',
  'make',
  'need',
  'offer',
  'lend',
  'send',
  'you',
  'tell',
  'put',
  'gving',
  'givng',
  '.give',
  'it.Give',
  'time.Give',
  'opportunity',
  'gice',
  'me.Give',
  'wanted',
  'giveing',
  'encourage',
  'GIve',
  'you.Give',
  'them.Give',
  'beg',
  'invite',
  'andgive',
  'them',
  'lets',
  'leave',
  'givs',
  'giv',
  'givin',
  'implore',
  'come',
  'receive',
  'convince',
  'givem',
  'wants',
  'remind',
  'Gave',
  'bestow',
  "'ll",
  'Gives',
  'add',
  'oppurtunity',
  'opporunity',
  'seek',
  'hope',
  'entice',
  'deprive',
  'togive',
  'advise',
  'oppotunity',
  'enable',
  'deliver',
  'guve',
  'persuade',
  'will',
  'deserve',
  'gladly',
  'nudge',
  'gimme',
  'withhold',
  'givee',
  'impart',
  'look',
  'obtain',
  'owe',
  'abl