In [1]:
import io
import os
import numpy as np
from numpy.linalg import svd
from sklearn.metrics.pairwise import cosine_similarity
from mono_embedding_loading import load_monolingual_embedding
from subspace_creation import create_translation_dictionary, extract_seed_dictionary, align_monolingual_subspaces
from multi_embedding_learning import learn_projection_matrix, LEARNING_METHODS
from multi_embedding_evaluation import evaluate_multilingual_embedding
from googletrans import Translator

# Inducing multilingual word embedding - DEMO

### Load fastText monolingual embeddings

In [None]:
# Read the English and German fastText vectors, each capped at n_max entries.
# Every call hands back the embedding matrix together with the id -> word and
# word -> id lookups for that language.
emb_en, id2word_en, word2id_en = load_monolingual_embedding(
    path='fastText_mon_emb/wiki.en.vec',
    n_max=50000,
)
emb_de, id2word_de, word2id_de = load_monolingual_embedding(
    path='fastText_mon_emb/wiki.de.vec',
    n_max=50000,
)

In [4]:
# Benchmark loading the English embedding with the 50,000-entry cap.
# NOTE(review): %timeit repeats the load several times, so this cell is slow on a full re-run.
%timeit load_monolingual_embedding(path = 'fastText_mon_emb/wiki.en.vec', n_max=50000)

2.97 s ± 304 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
# Sanity check: shape of the German embedding matrix, followed by the first
# 100 id -> word entries of its vocabulary.
print(emb_de.shape)
print(dict(list(id2word_de.items())[:100]))

(50000, 300)
{0: '.', 1: ',', 2: '</s>', 3: '-', 4: 'der', 5: ')', 6: 'die', 7: '(', 8: 'und', 9: 'in', 10: 'von', 11: "'", 12: 'den', 13: 'im', 14: 'das', 15: 'des', 16: 'mit', 17: 'ist', 18: 'er', 19: 'zu', 20: 'für', 21: 'auf', 22: 'ein', 23: 'als', 24: 'dem', 25: 'eine', 26: 'wurde', 27: '–', 28: 'auch', 29: 'sich', 30: 'nicht', 31: 'an', 32: 'es', 33: 'nach', 34: 'war', 35: 'bei', 36: 'aus', 37: 'rd', 38: 'am', 39: 'sie', 40: 'cest', 41: 'bis', 42: '"', 43: 'ich', 44: 'einer', 45: 'zum', 46: 'werden', 47: 'sind', 48: 'oder', 49: 'wird', 50: 'durch', 51: 'cet', 52: 'score', 53: 'zur', 54: 'einen', 55: 'dass', 56: 'über', 57: 'einem', 58: 'um', 59: 'aber', 60: '?', 61: 'wie', 62: 'nur', 63: 'noch', 64: 'so', 65: 'unter', 66: 'hat', 67: 'artikel', 68: 'man', 69: 's', 70: 'vor', 71: 'wurden', 72: '/', 73: 'sein', 74: 'da', 75: 'de', 76: 'diese', 77: 'the', 78: 'vom', 79: 'kann', 80: 'of', 81: 'hier', 82: 'diskussion', 83: 'a', 84: 'seine', 85: 'seit', 86: 'dieser', 87: 'jahr', 88: 'we

### Optional: Create expert dictionary using Google Translate API

In [None]:
# Currently fails because the Google Translate API request limit is exceeded;
# use the MUSE expert dictionary instead.
# NOTE(review): this call translates en -> fr, whereas the remaining cells of
# this notebook work with en -> de -- confirm that this is intended.
google_dict = create_translation_dictionary(
    id2word_en.values(),
    'en',
    'fr',
    20000,
    write_to_path='expert_dictionaries/GOOGLE_en-fr.0-20000.txt',
)

In [None]:
# Show the first 10 entries of the Google-based dictionary.
print(dict(list(google_dict.items())[:10]))

### Extract seed dictionary from expert dictionary using the vocabularies from monolingual word embeddings

In [8]:
# Match the MUSE training dictionary against both embedding vocabularies.
# The helper reports how many pairs are valid and how many contain a word
# that is unknown in the source or target vocabulary.
seed_dict_indices_train, seed_dict_words_train = extract_seed_dictionary(
    'expert_dictionaries/MUSE_en-de.0-5000.txt',
    s_word2id=word2id_en,
    t_word2id=word2id_de,
)

# Preview the first 100 (source word, target word) pairs.
print('\n', seed_dict_words_train[:100])

Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).

 [('the', 'die'), ('the', 'der'), ('the', 'dem'), ('the', 'den'), ('the', 'das'), ('and', 'sowie'), ('and', 'und'), ('was', 'war'), ('was', 'wurde'), ('for', 'für'), ('that', 'dass'), ('that', 'das'), ('with', 'mit'), ('from', 'vom'), ('from', 'von'), ('from', 'ab'), ('from', 'aus'), ('this', 'dieser'), ('this', 'diese'), ('this', 'das'), ('utc', 'utc'), ('his', 'seinem'), ('his', 'seinen'), ('his', 'seine'), ('his', 'sein'), ('his', 'seiner'), ('not', 'not'), ('not', 'nicht'), ('not', 'kein'), ('are', 'sind'), ('talk', 'vortrag'), ('talk', 'gespräch'), ('talk', 'reden'), ('talk', 'talk'), ('which', 'welches'), ('which', 'welcher'), ('which', 'welche'), ('which', 'welchen'), ('also', 'ausserdem'), ('also', 'ebenso'), ('also', 'außerdem'), ('also', 'ebenfalls'), ('also', 'auch'), ('has', 'verfügt'), ('has', 'hat'), ('were', 'wurde

In [9]:
# Benchmark the seed-dictionary extraction.
# NOTE(review): the helper prints its summary on every timing run, which floods the cell output.
%timeit extract_seed_dictionary('expert_dictionaries/MUSE_en-de.0-5000.txt', s_word2id=word2id_en, t_word2id=word2id_de)

Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
Found 13700 valid translation pairs in expert dictio

Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
Found 13700 valid translation pairs in expert dictio

### Create aligned monolingual subspaces from seed dictionary

In [10]:
# Build the aligned source/target training matrices from the seed-dictionary
# index pairs (the helper prints the resulting subspace dimension).
X_s_train, X_t_train = align_monolingual_subspaces(
    emb_en,
    emb_de,
    seed_dict_indices_train,
)

Resulting subspace dimension: (13700, 300)


### Put it all together: Learn projection matrix W from training dictionary

In [2]:
# End-to-end variant: pass the two embedding files and the training dictionary
# as paths and get back the projection matrix W.
# NOTE(review): these relative paths point into '../../../../../04_Data', unlike
# the 'fastText_mon_emb/...' paths used by the other cells -- confirm which
# working directory this cell expects.
W = learn_projection_matrix(
    '../../../../../04_Data/fastText_mon_emb/wiki.en.vec',
    '../../../../../04_Data/fastText_mon_emb/wiki.de.vec',
    '../../../../../04_Data/expert_dictionaries/en-de/MUSE_en-de.0-5000.txt',
)


Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
Resulting subspace dimension: (13700, 300)


In [12]:
# Alternative call style: supply the already-loaded embedding matrices plus
# their word2id dictionaries instead of file paths.
W_2 = learn_projection_matrix(
    emb_en,
    emb_de,
    'expert_dictionaries/MUSE_en-de.0-5000.txt',
    word2id_en,
    word2id_de,
)

Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
Resulting subspace dimension: (13700, 300)


In [13]:
# Count the entries in which the two projection matrices differ.
# A result of 0 means both call styles produced the same W.
# np.count_nonzero does the count inside NumPy instead of iterating the
# array with nested builtin sum() calls.
np.count_nonzero(W != W_2)

0

### Apply the learned projection matrix W and evaluate the resulting multilingual word embedding with precision@k

#### Test functionality of the evaluation function

In [14]:
# Load smaller vocabularies for the evaluation check: 10,000 English and
# 20,000 German entries.
emb_en_eval, id2word_en_eval, word2id_en_eval = load_monolingual_embedding(
    path='fastText_mon_emb/wiki.en.vec',
    n_max=10000,
)
emb_de_eval, id2word_de_eval, word2id_de_eval = load_monolingual_embedding(
    path='fastText_mon_emb/wiki.de.vec',
    n_max=20000,
)

In [15]:
# Evaluate W on the held-out MUSE test dictionary using the in-memory
# embeddings and their word2id dictionaries; only the accuracy is kept.
accuracy_arrays, _ = evaluate_multilingual_embedding(
    emb_en_eval,
    emb_de_eval,
    W,
    'expert_dictionaries/MUSE_en-de.5000-6500.txt',
    word2id_en_eval,
    word2id_de_eval,
    k=1,
)

Found 2327 valid translation pairs in expert dictionary.
1333 other pairs contained at least one unknown word (0 in source language, 1333 in target language).
Aims to find correct translations between 1365 source words and 2146 target words.


In [5]:
# Same evaluation, but driven by the fastText file paths: in this call style
# no word2id dictionaries have to be passed. s_nmax / t_nmax use the same
# vocabulary caps as the in-memory evaluation embeddings.
accuracy_textfile, _ = evaluate_multilingual_embedding(
    'fastText_mon_emb/wiki.en.vec',
    'fastText_mon_emb/wiki.de.vec',
    W,
    'expert_dictionaries/MUSE_en-de.5000-6500.txt',
    s_nmax=10000,
    t_nmax=20000,
    k=1,
)

Found 2327 valid translation pairs in expert dictionary.
1333 other pairs contained at least one unknown word (0 in source language, 1333 in target language).
Aims to find correct translations between 1365 source words and 2146 target words.


In [6]:
# The array-based and the path-based evaluation should report the same accuracy.
accuracy_arrays == accuracy_textfile # same results

True

#### Evaluate multilingual word embedding

In [11]:
# Precision@1 on the test dictionary. No s_nmax / t_nmax are given here, so
# the helper's own defaults decide how much of each vocabulary is loaded.
accuracy_1, translations_1 = evaluate_multilingual_embedding(
    'fastText_mon_emb/wiki.en.vec',
    'fastText_mon_emb/wiki.de.vec',
    W,
    'expert_dictionaries/MUSE_en-de.5000-6500.txt',
    k=1,
)

Found 3660 valid translation pairs.
0 other pairs contained at least one unknown word (0 in source language, 0 in target language).


In [12]:
# Report the k=1 accuracy, then list the top-ranked candidate for a few
# example English words.
accuracy_line = 'Accuracy with k=1: {}'.format(accuracy_1)
print(accuracy_line)
print('-' * 60)
print('Examples of Top 1 translations:')
for en_word in ('recommend', 'geographical', 'developer'):
    candidates = translations_1[en_word]
    print('{} -> {}'.format(en_word, candidates))

Accuracy with k=1: 0.6893333333333334
------------------------------------------------------------
Examples of Top 1 translations:
recommend -> ['empfehle']
geographical -> ['geografische']
developer -> ['entwickler']


In [8]:
# Precision@5 on the same test dictionary.
accuracy_5, translations_5 = evaluate_multilingual_embedding(
    'fastText_mon_emb/wiki.en.vec',
    'fastText_mon_emb/wiki.de.vec',
    W,
    'expert_dictionaries/MUSE_en-de.5000-6500.txt',
    k=5,
)

Found 3660 valid translation pairs.
0 other pairs contained at least one unknown word (0 in source language, 0 in target language).


In [9]:
# Report the k=5 accuracy, then list the five best candidates for a few
# example English words.
accuracy_line = 'Accuracy with k=5: {}'.format(accuracy_5)
print(accuracy_line)
print('-' * 60)
print('Examples of Top 5 translations:')
for en_word in ('recommend', 'geographical', 'developer'):
    candidates = translations_5[en_word]
    print('{} -> {}'.format(en_word, candidates))

Accuracy with k=5: 0.86
------------------------------------------------------------
Examples of Top 5 translations:
recommend -> ['empfehle', 'vorschlagen', 'empfehlen', 'zweifelsfall', 'hilfreich']
geographical -> ['geografische', 'geographischen', 'geographische', 'geografischen', 'geografisch']
developer -> ['entwickler', 'projektentwickler', 'hauptentwickler', 'mitentwickler', 'softwareentwickler']


In [10]:
# Precision@10 on the same test dictionary.
accuracy_10, translations_10 = evaluate_multilingual_embedding(
    'fastText_mon_emb/wiki.en.vec',
    'fastText_mon_emb/wiki.de.vec',
    W,
    'expert_dictionaries/MUSE_en-de.5000-6500.txt',
    k=10,
)

Found 3660 valid translation pairs.
0 other pairs contained at least one unknown word (0 in source language, 0 in target language).


In [11]:
# Report the k=10 accuracy, then list the ten best candidates for a few
# example English words.
print('Accuracy with k=10: {}'.format(accuracy_10))
print('-' * 60)
# The header previously said "Top 5" (copied from the k=5 cell) although this
# cell prints the top-10 candidate lists.
print('Examples of Top 10 translations:')
for word in ['recommend', 'geographical', 'developer']:
    print('{} -> {}'.format(word, translations_10[word]))

Accuracy with k=10: 0.8953333333333333
------------------------------------------------------------
Examples of Top 5 translations:
recommend -> ['empfehle', 'vorschlagen', 'empfehlen', 'zweifelsfall', 'hilfreich', 'sinnvoll', 'abraten', 'befürworte', 'anzuraten', 'überlege']
geographical -> ['geografische', 'geographischen', 'geographische', 'geografischen', 'geografisch', 'geographisch', 'geografischer', 'geographisches', 'geografisches', 'geographischer']
developer -> ['entwickler', 'projektentwickler', 'hauptentwickler', 'mitentwickler', 'softwareentwickler', 'entwicklers', 'spieleentwickler', 'entwicklerstudio', 'softwarefirma', 'entwicklerstudios']
