## Imports

In [2]:
import importlib

In [3]:
from ir_crosslingual.embeddings import embeddings
# Re-execute the already-imported module so that edits made to embeddings.py
# after the kernel started are picked up without restarting the kernel.
# As the last expression of the cell, the reloaded module object is echoed.
importlib.reload(embeddings)

<module 'ir_crosslingual.embeddings.embeddings' from '/Users/i500969/Desktop/Admin/Uni-Mannheim/02_Courses/2020_FSS/Information-Retrieval/03_Project/03_Implementation/07_Word-Embeddings/ir-crosslingual/ir_crosslingual/embeddings/embeddings.py'>

## Induction of multilingual word embeddings

### Load fastText monolingual embeddings

In [4]:
# Create one WordEmbeddings object per language and load its vectors, with
# n_max=50000 passed as the load limit for both languages.
# NOTE(review): later class-level calls identify languages only by their code
# ('en' / 'de'), so the constructor presumably registers each instance on the
# class -- confirm in embeddings.py before reordering these statements.
english = embeddings.WordEmbeddings('en')
english.load_vec_embeddings(n_max=50000)

german = embeddings.WordEmbeddings('de')
german.load_vec_embeddings(n_max=50000)

In [5]:
# Sanity check of the loaded German embeddings: first the shape of the
# embedding matrix, then the word -> id mapping of the first 100 entries.
print(german.embeddings.shape)
vocab_preview = list(german.word2id)[:100]
print({word: german.word2id[word] for word in vocab_preview})

(50000, 300)
{'.': 0, ',': 1, '</s>': 2, '-': 3, 'der': 4, ')': 5, 'die': 6, '(': 7, 'und': 8, 'in': 9, 'von': 10, "'": 11, 'den': 12, 'im': 13, 'das': 14, 'des': 15, 'mit': 16, 'ist': 17, 'er': 18, 'zu': 19, 'für': 20, 'auf': 21, 'ein': 22, 'als': 23, 'dem': 24, 'eine': 25, 'wurde': 26, '–': 27, 'auch': 28, 'sich': 29, 'nicht': 30, 'an': 31, 'es': 32, 'nach': 33, 'war': 34, 'bei': 35, 'aus': 36, 'rd': 37, 'am': 38, 'sie': 39, 'cest': 40, 'bis': 41, '"': 42, 'ich': 43, 'einer': 44, 'zum': 45, 'werden': 46, 'sind': 47, 'oder': 48, 'wird': 49, 'durch': 50, 'cet': 51, 'score': 52, 'zur': 53, 'einen': 54, 'dass': 55, 'über': 56, 'einem': 57, 'um': 58, 'aber': 59, '?': 60, 'wie': 61, 'nur': 62, 'noch': 63, 'so': 64, 'unter': 65, 'hat': 66, 'artikel': 67, 'man': 68, 's': 69, 'vor': 70, 'wurden': 71, '/': 72, 'sein': 73, 'da': 74, 'de': 75, 'diese': 76, 'the': 77, 'vom': 78, 'kann': 79, 'of': 80, 'hier': 81, 'diskussion': 82, 'a': 83, 'seine': 84, 'seit': 85, 'dieser': 86, 'jahr': 87, 'wenn':

### Extraction of seed dictionaries

In [6]:
# Extract the seed dictionary for both translation directions, English -> German
# first and German -> English second. Each call logs how many translation pairs
# of the expert dictionary were valid.
for src_code, trg_code in (('en', 'de'), ('de', 'en')):
    embeddings.WordEmbeddings.set_seed_dictionary(src_lang=src_code, trg_lang=trg_code)

---- INFO: Found 13700 valid translation pairs in expert dictionary.
---- INFO: 977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
---- INFO: Found 10604 valid translation pairs in expert dictionary.
---- INFO: 262 other pairs contained at least one unknown word (0 in source language, 262 in target language).


### Align subspaces

In [7]:
# Align the monolingual embeddings for each translation direction.
# Per direction the source-side object is aligned first (source=True) and the
# target-side object second (source=False); every call logs the dimension of
# the resulting subspace.
alignment_plan = (
    ('en-de', english, german),
    ('de-en', german, english),
)
for direction, source_emb, target_emb in alignment_plan:
    source_emb.align_monolingual_embeddings(languages=direction, source=True)
    target_emb.align_monolingual_embeddings(languages=direction, source=False)

---- INFO: Resulting subspace dimension: (13700, 300)
---- INFO: Resulting subspace dimension: (13700, 300)
---- INFO: Resulting subspace dimension: (10604, 300)
---- INFO: Resulting subspace dimension: (10604, 300)


### Put it all together: Learn projection matrix W from training dictionary

In [8]:
# Learn the projection matrices; the call returns two values, bound here as
# the en -> de and de -> en matrices (both directions are logged as learned).
# NOTE(review): extract_seed=False / align_subspaces=False presumably skip the
# seed-dictionary and alignment steps because they were already run in the
# preceding cells -- confirm against learn_projection_matrix.
W_ende, W_deen = embeddings.WordEmbeddings.learn_projection_matrix(
    src_lang='en',
    trg_lang='de',
    extract_seed=False,
    align_subspaces=False,
)

---- INFO: Learn projection matrix for en-de
---- DONE: Projection matrix learned from en to de
---- INFO: Learn projection matrix for de-en
---- DONE: Projection matrix learned from de to en


## Evaluation of multilingual word embeddings

In [11]:
# Create separate WordEmbeddings objects flagged with evaluation=True and load
# their vectors for the evaluation cells that follow.
# NOTE(review): the German side is loaded with n_max=200000 while English uses
# n_max=50000 -- presumably to give the translation retrieval a larger target
# vocabulary; confirm that this asymmetry is intentional.
english_eval = embeddings.WordEmbeddings('en', evaluation=True)
english_eval.load_vec_embeddings(n_max=50000)

german_eval = embeddings.WordEmbeddings('de', evaluation=True)
german_eval.load_vec_embeddings(n_max=200000)

In [12]:
# Evaluate English -> German translation retrieval using only the single best
# candidate (k=1); the call returns the accuracy and the retrieved translations.
accuracy1, translations1 = embeddings.WordEmbeddings.evaluate_multilingual_embeddings(
    src_lang='en',
    trg_lang='de',
    k=1,
)


---- INFO: Found 3660 valid translation pairs in expert dictionary.
---- INFO: 0 other pairs contained at least one unknown word (0 in source language, 0 in target language).
---- INFO: Aims to find correct translations between 1500 source words and 3429 target words
---- INFO: Start determination of top k=1 translations
---- INFO: Start identification of correct translations
---- DONE: Accuracy with k=1: 0.6893333333333334
------------------------------------------------------------
Examples of Top 1 translations:
recommend -> ['empfehle']
geographical -> ['geografische']
developer -> ['entwickler']


In [104]:
# Evaluate English -> German translation retrieval over the top 5 candidates
# (k=5); the call returns the accuracy and the retrieved translations.
accuracy5, translations5 = embeddings.WordEmbeddings.evaluate_multilingual_embeddings(
    src_lang='en',
    trg_lang='de',
    k=5,
)


---- INFO: Found 3660 valid translation pairs in expert dictionary.
---- INFO: 0 other pairs contained at least one unknown word (0 in source language, 0 in target language).
---- INFO: Aims to find correct translations between 1500 source words and 3429 target words
---- INFO: Start determination of top k=5 translations
---- INFO: Start identification of correct translations
---- DONE: Accuracy with k=5: 0.86
------------------------------------------------------------
Examples of Top 5 translations:
recommend -> ['empfehle', 'vorschlagen', 'empfehlen', 'zweifelsfall', 'hilfreich']
geographical -> ['geografische', 'geographischen', 'geographische', 'geografischen', 'geografisch']
developer -> ['entwickler', 'projektentwickler', 'hauptentwickler', 'mitentwickler', 'softwareentwickler']


In [105]:
# Evaluate English -> German translation retrieval over the top 10 candidates
# (k=10); the call returns the accuracy and the retrieved translations.
accuracy10, translations10 = embeddings.WordEmbeddings.evaluate_multilingual_embeddings(
    src_lang='en',
    trg_lang='de',
    k=10,
)


---- INFO: Found 3660 valid translation pairs in expert dictionary.
---- INFO: 0 other pairs contained at least one unknown word (0 in source language, 0 in target language).
---- INFO: Aims to find correct translations between 1500 source words and 3429 target words
---- INFO: Start determination of top k=10 translations
---- INFO: Start identification of correct translations
---- DONE: Accuracy with k=10: 0.8953333333333333
------------------------------------------------------------
Examples of Top 10 translations:
recommend -> ['empfehle', 'vorschlagen', 'empfehlen', 'zweifelsfall', 'hilfreich', 'sinnvoll', 'abraten', 'befürworte', 'anzuraten', 'überlege']
geographical -> ['geografische', 'geographischen', 'geographische', 'geografischen', 'geografisch', 'geographisch', 'geografischer', 'geographisches', 'geografisches', 'geographischer']
developer -> ['entwickler', 'projektentwickler', 'hauptentwickler', 'mitentwickler', 'softwareentwickler', 'entwicklers', 'spieleentwickler', 'en