In [1]:
import io
import os
import numpy as np
from numpy.linalg import svd
from sklearn.metrics.pairwise import cosine_similarity
from mono_embedding_loading import load_monolingual_embedding
from subspace_creation import create_translation_dictionary, extract_seed_dictionary, align_monolingual_subspaces
from multi_embedding_learning import learn_projection_matrix, LEARNING_METHODS
from multi_embedding_evaluation import evaluate_multilingual_embedding
from googletrans import Translator

# Inducing multilingual word embeddings - DEMO

### Load fastText monolingual embeddings

In [6]:
# Load the English and French fastText embeddings, capping each at n_max=50000 entries.
# Each call yields the embedding matrix plus the id->word and word->id lookups.
emb_en, id2word_en, word2id_en = load_monolingual_embedding(
    path='fastText_mon_emb/wiki.en.vec',
    n_max=50000,
)
emb_fr, id2word_fr, word2id_fr = load_monolingual_embedding(
    path='fastText_mon_emb/wiki.fr.vec',
    n_max=50000,
)

In [4]:
# Benchmark: time one call that loads the English embedding file with n_max=50000.
%timeit load_monolingual_embedding(path = 'fastText_mon_emb/wiki.en.vec', n_max=50000)

2.97 s ± 304 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# Sanity check: show the embedding matrix shape and the first 100 id -> word entries.
print(emb_en.shape)
first_ids = list(id2word_en)[:100]
print({word_id: id2word_en[word_id] for word_id in first_ids})

(50000, 300)
{0: ',', 1: '.', 2: 'the', 3: '</s>', 4: 'of', 5: '-', 6: 'in', 7: 'and', 8: "'", 9: ')', 10: '(', 11: 'to', 12: 'a', 13: 'is', 14: 'was', 15: 'on', 16: 's', 17: 'for', 18: 'as', 19: 'by', 20: 'that', 21: 'it', 22: 'with', 23: 'from', 24: 'at', 25: 'he', 26: 'this', 27: 'be', 28: 'i', 29: 'an', 30: 'utc', 31: 'his', 32: 'not', 33: '–', 34: 'are', 35: 'or', 36: 'talk', 37: 'which', 38: 'also', 39: 'has', 40: 'were', 41: 'but', 42: 'have', 43: '#', 44: 'one', 45: 'rd', 46: 'new', 47: 'first', 48: 'page', 49: 'no', 50: 'you', 51: 'they', 52: 'had', 53: 'article', 54: 't', 55: 'who', 56: '?', 57: 'all', 58: 'their', 59: 'there', 60: 'been', 61: 'made', 62: 'its', 63: 'people', 64: 'may', 65: 'after', 66: '%', 67: 'other', 68: 'should', 69: 'two', 70: 'score', 71: 'her', 72: 'can', 73: 'would', 74: 'more', 75: 'if', 76: 'she', 77: 'about', 78: 'when', 79: 'time', 80: 'team', 81: 'american', 82: 'such', 83: 'th', 84: 'do', 85: 'discussion', 86: 'links', 87: 'only', 88: 'some', 8

### Optional: Create expert dictionary using Google Translate API

In [None]:
# NOTE: this cell currently fails because the Google API request limit is exceeded;
# use the MUSE expert dictionary instead.
google_dict = create_translation_dictionary(
    id2word_en.values(),
    'en',
    'fr',
    20000,
    write_to_path='expert_dictionaries/GOOGLE_en-fr.0-20000.txt',
)

In [None]:
# Preview the first 10 entries of the Google-translated dictionary.
preview_keys = list(google_dict)[:10]
print({source_word: google_dict[source_word] for source_word in preview_keys})

### Extract seed dictionary from expert dictionary using the vocabularies from monolingual word embeddings

In [8]:
# Build the training seed dictionary from the MUSE expert dictionary, using the
# English (source) and French (target) vocabularies, then show the first 100 word pairs.
seed_dict_indices_train, seed_dict_words_train = extract_seed_dictionary(
    'expert_dictionaries/MUSE_en-fr.0-5000.txt',
    s_word2id=word2id_en,
    t_word2id=word2id_fr,
)
print('\n', seed_dict_words_train[:100])

Found 10369 valid translation pairs.
503 other pairs contained at least one unknown word (0 in source language, 503 in target language).

 [('the', 'le'), ('the', 'les'), ('the', 'la'), ('and', 'et'), ('was', 'fut'), ('was', 'etait'), ('was', 'était'), ('for', 'pour'), ('that', 'que'), ('that', 'cela'), ('with', 'avec'), ('from', 'du'), ('from', 'de'), ('from', 'depuis'), ('this', 'ceci'), ('this', 'cet'), ('this', 'cette'), ('this', 'cela'), ('this', 'ce'), ('utc', 'utc'), ('his', 'sa'), ('his', 'his'), ('his', 'ses'), ('his', 'son'), ('not', 'not'), ('not', 'non'), ('not', 'pas'), ('are', 'sont'), ('talk', 'parler'), ('talk', 'parle'), ('talk', 'talk'), ('talk', 'parlez'), ('which', 'lesquels'), ('which', 'laquelle'), ('which', 'lequel'), ('also', 'également'), ('also', 'aussi'), ('were', 'étaient'), ('but', 'mais'), ('have', 'avoir'), ('have', 'ont'), ('one', 'un'), ('one', 'une'), ('one', 'one'), ('new', 'nouveau'), ('new', 'nouvelle'), ('new', 'nouvelles'), ('new', 'nouveaux'), ('

In [3]:
# Benchmark: time the seed-dictionary extraction for the MUSE en-fr training dictionary.
%timeit extract_seed_dictionary('expert_dictionaries/MUSE_en-fr.0-5000.txt', s_word2id=word2id_en, t_word2id=word2id_fr)

12.8 ms ± 288 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Create aligned monolingual subspaces from seed dictionary

In [9]:
# Derive the source/target training subspaces from both embeddings and the
# index pairs of the seed dictionary.
X_s_train, X_t_train = align_monolingual_subspaces(
    emb_en,
    emb_fr,
    seed_dict_indices_train,
)

Resulting subspace dimension: (10369, 300)


### Put it all together: Learn projection matrix W from training dictionary

In [2]:
# End-to-end: learn the projection matrix W directly from the two embedding files
# and the MUSE training dictionary.
W = learn_projection_matrix(
    'fastText_mon_emb/wiki.en.vec',
    'fastText_mon_emb/wiki.fr.vec',
    'expert_dictionaries/MUSE_en-fr.0-5000.txt',
)

Found 10369 valid translation pairs.
503 other pairs contained at least one unknown word (0 in source language, 503 in target language).
Resulting subspace dimension: (10369, 300)


### Apply the learned projection matrix W to the test dictionary and evaluate with precision@k for k = 1, 5, 10

In [3]:
# Evaluate W on the held-out MUSE test dictionary with k=1 (precision@1).
accuracy_1, translations_1 = evaluate_multilingual_embedding(
    'fastText_mon_emb/wiki.en.vec',
    'fastText_mon_emb/wiki.fr.vec',
    W,
    'expert_dictionaries/MUSE_en-fr.5000-6500.txt',
    k=1,
)

Found 2943 valid translation pairs.
0 other pairs contained at least one unknown word (0 in source language, 0 in target language).


In [19]:
# Report precision@1 and the top-1 candidate for a few sample source words.
sample_words = ['recommend', 'geographical', 'developer']
print('Accuracy with k=1: {}'.format(accuracy_1))
print('-' * 60)
print('Examples of Top 1 translations:')
for source_word in sample_words:
    candidates = translations_1[source_word]
    print('{} -> {}'.format(source_word, candidates))

Accuracy with k=1: 0.752
------------------------------------------------------------
Examples of Top 1 translations:
recommend -> ['recommander']
geographical -> ['géographiques']
developer -> ['développeur']


In [17]:
# Evaluate W on the held-out MUSE test dictionary with k=5 (precision@5).
accuracy_5, translations_5 = evaluate_multilingual_embedding(
    'fastText_mon_emb/wiki.en.vec',
    'fastText_mon_emb/wiki.fr.vec',
    W,
    'expert_dictionaries/MUSE_en-fr.5000-6500.txt',
    k=5,
)

Found 2943 valid translation pairs.
0 other pairs contained at least one unknown word (0 in source language, 0 in target language).


In [21]:
# Report precision@5 and the five candidates returned for a few sample source words.
sample_words = ['recommend', 'geographical', 'developer']
print('Accuracy with k=5: {}'.format(accuracy_5))
print('-' * 60)
print('Examples of Top 5 translations:')
for source_word in sample_words:
    candidates = translations_5[source_word]
    print('{} -> {}'.format(source_word, candidates))

Accuracy with k=5: 0.8773333333333333
------------------------------------------------------------
Examples of Top 5 translations:
recommend -> ['recommander', 'recommande', 'proposerais', 'demanderais', 'souhaitable']
geographical -> ['géographiques', 'geographique', 'biogéographique', 'géographies', 'géographique']
developer -> ['développeur', 'développeurs', 'softworks', 'programmeur', 'solidworks']


In [20]:
# Evaluate W on the held-out MUSE test dictionary with k=10 (precision@10).
accuracy_10, translations_10 = evaluate_multilingual_embedding(
    'fastText_mon_emb/wiki.en.vec',
    'fastText_mon_emb/wiki.fr.vec',
    W,
    'expert_dictionaries/MUSE_en-fr.5000-6500.txt',
    k=10,
)

Found 2943 valid translation pairs.
0 other pairs contained at least one unknown word (0 in source language, 0 in target language).


In [22]:
# Report precision@10 and the ten candidates returned for a few sample source words.
# The heading says "Top 10" so that it matches the k=10 evaluation whose results
# (accuracy_10 / translations_10) are printed here.
print('Accuracy with k=10: {}'.format(accuracy_10))
print('-' * 60)
print('Examples of Top 10 translations:')
for word in ['recommend', 'geographical', 'developer']:
    print('{} -> {}'.format(word, translations_10[word]))

Accuracy with k=10: 0.9006666666666666
------------------------------------------------------------
Examples of Top 5 translations:
recommend -> ['recommander', 'recommande', 'proposerais', 'demanderais', 'souhaitable', 'apprécierais', 'conseillerais', 'proposerai', 'envisagez', 'déconseiller']
geographical -> ['géographiques', 'geographique', 'biogéographique', 'géographies', 'géographique', 'biogéographiques', 'géographiquement', 'géomorphologique', 'phytogéographie', 'géomorphologiques']
developer -> ['développeur', 'développeurs', 'softworks', 'programmeur', 'solidworks', 'activision', 'software', 'microïds', 'entrepreneur', 'concepteur']
