In [2]:
import io
import os
import numpy as np
from numpy.linalg import svd
from sklearn.metrics.pairwise import cosine_similarity
from mono_embedding_loading import load_monolingual_embedding
from subspace_creation import create_translation_dictionary, extract_seed_dictionary, align_monolingual_subspaces
from multi_embedding_learning import projection_learning, LEARNING_METHODS
from googletrans import Translator

# Inducing multilingual word embeddings - DEMO

### Load fastText monolingual embeddings

In [3]:
# Load the English and French fastText vectors together with their
# id -> word and word -> id lookups; both loads pass n_max=50000.
emb_en, id2word_en, word2id_en = load_monolingual_embedding(
    path='fastText_mon_emb/wiki.en.vec',
    n_max=50000,
)
emb_fr, id2word_fr, word2id_fr = load_monolingual_embedding(
    path='fastText_mon_emb/wiki.fr.vec',
    n_max=50000,
)

In [4]:
# Sanity check: shape of the English embedding matrix, followed by the
# first 30 id -> word entries of the English vocabulary.
print(emb_en.shape)
print({
    word_id: id2word_en[word_id]
    for word_id in list(id2word_en)[:30]
})

(50000, 300)
{0: ',', 1: '.', 2: 'the', 3: '</s>', 4: 'of', 5: '-', 6: 'in', 7: 'and', 8: "'", 9: ')', 10: '(', 11: 'to', 12: 'a', 13: 'is', 14: 'was', 15: 'on', 16: 's', 17: 'for', 18: 'as', 19: 'by', 20: 'that', 21: 'it', 22: 'with', 23: 'from', 24: 'at', 25: 'he', 26: 'this', 27: 'be', 28: 'i', 29: 'an'}


### Optional: Create expert dictionary using Google Translate API

In [None]:
# NOTE: this cell currently fails because the Google API request limit is
# exceeded; use the MUSE expert dictionary instead (see the cells below).
google_dict = create_translation_dictionary(
    id2word_en.values(),
    'en',
    'fr',
    20000,
    write_to_path='expert_dictionaries/GOOGLE_en-fr.0-20000.txt',
)

In [None]:
# Show the first 10 entries of the Google-translated dictionary.
print({
    source_word: google_dict[source_word]
    for source_word in list(google_dict)[:10]
})

### Extract seed dictionary from expert dictionary using the vocabularies from monolingual word embeddings

In [4]:
# Build the training seed dictionary from the MUSE en-fr file, resolving
# words through the English (source) and French (target) vocabularies.
seed_dict_indices_train, seed_dict_words_train = extract_seed_dictionary(
    'expert_dictionaries/MUSE_en-fr.0-5000.txt',
    s_word2id=word2id_en,
    t_word2id=word2id_fr,
)
# Print the first 100 word pairs, preceded by a newline.
print('\n', seed_dict_words_train[:100])

Found 10369 valid translation pairs.
503 other pairs contained at least one unknown word (0 in source language, 503 in target language).

 [('the', 'le'), ('the', 'les'), ('the', 'la'), ('and', 'et'), ('was', 'fut'), ('was', 'etait'), ('was', 'était'), ('for', 'pour'), ('that', 'que'), ('that', 'cela'), ('with', 'avec'), ('from', 'du'), ('from', 'de'), ('from', 'depuis'), ('this', 'ceci'), ('this', 'cet'), ('this', 'cette'), ('this', 'cela'), ('this', 'ce'), ('utc', 'utc'), ('his', 'sa'), ('his', 'his'), ('his', 'ses'), ('his', 'son'), ('not', 'not'), ('not', 'non'), ('not', 'pas'), ('are', 'sont'), ('talk', 'parler'), ('talk', 'parle'), ('talk', 'talk'), ('talk', 'parlez'), ('which', 'lesquels'), ('which', 'laquelle'), ('which', 'lequel'), ('also', 'également'), ('also', 'aussi'), ('were', 'étaient'), ('but', 'mais'), ('have', 'avoir'), ('have', 'ont'), ('one', 'un'), ('one', 'une'), ('one', 'one'), ('new', 'nouveau'), ('new', 'nouvelle'), ('new', 'nouvelles'), ('new', 'nouveaux'), ('

### Create aligned monolingual subspaces from seed dictionary

In [5]:
# Build the source/target training matrices from the two embeddings and the
# training seed-dictionary index pairs.
X_s_train, X_t_train = align_monolingual_subspaces(
    emb_en,
    emb_fr,
    seed_dict_indices_train,
)

Resulting subspace dimension: (10369, 300)


### Put it all together: Learn projection matrix W from training dictionary

In [6]:
# One-call pipeline: pass the two embedding files and the training dictionary
# and keep the returned matrix W, which is applied to the test vectors below.
W = projection_learning(
    'fastText_mon_emb/wiki.en.vec',
    'fastText_mon_emb/wiki.fr.vec',
    'expert_dictionaries/MUSE_en-fr.0-5000.txt',
)

Found 10369 valid translation pairs.
503 other pairs contained at least one unknown word (0 in source language, 503 in target language).
Resulting subspace dimension: (10369, 300)


### Apply learned projection matrix W to test dictionary

In [7]:
# Build the held-out test seed dictionary (MUSE en-fr entries 5000-6500 file)
# using the same English/French vocabularies as for training.
seed_dict_indices_test, seed_dict_words_test = extract_seed_dictionary(
    'expert_dictionaries/MUSE_en-fr.5000-6500.txt',
    s_word2id=word2id_en,
    t_word2id=word2id_fr,
)

Found 2729 valid translation pairs.
214 other pairs contained at least one unknown word (0 in source language, 214 in target language).


In [8]:
# Build the source/target test matrices from the test seed-dictionary pairs.
X_s_test, X_t_test = align_monolingual_subspaces(
    emb_en,
    emb_fr,
    seed_dict_indices_test,
)

Resulting subspace dimension: (2729, 300)


In [9]:
# Top-1 retrieval accuracy on the test pairs.
# All source vectors are projected with W and compared against every target
# vector in a single cosine_similarity call, instead of calling it once per
# row (which re-processed the full target matrix on every iteration).
similarities_test = cosine_similarity(X_s_test @ W, X_t_test)
# Pair i counts as correct when the most similar target row for source i is i.
hits_test = similarities_test.argmax(axis=1) == np.arange(len(X_s_test))
accuracy = int(hits_test.sum()) / len(X_s_test)
accuracy

0.4789300109930377

In [10]:
def _first_pair_per_source(pairs):
    """Return the pairs of `pairs` whose first element has not occurred before.

    Order is preserved and, for each distinct first element, only the first
    pair encountered is kept. A set tracks the first elements already seen,
    so membership checks are constant-time instead of rebuilding a list of
    seen keys on every iteration. Assumes the first elements are hashable.
    """
    seen_sources = set()
    unique_pairs = []
    for pair in pairs:
        source = pair[0]
        if source not in seen_sources:
            seen_sources.add(source)
            unique_pairs.append(pair)
    return unique_pairs


# Keep a single pair per source entry, for both the index and the word view
# of the test dictionary.
seed_dict_indices_unique = _first_pair_per_source(seed_dict_indices_test)
seed_dict_words_unique = _first_pair_per_source(seed_dict_words_test)

In [11]:
# Rebuild the test matrices from the de-duplicated index pairs
# (one pair per source entry).
X_s_test2, X_t_test2 = align_monolingual_subspaces(
    emb_en,
    emb_fr,
    seed_dict_indices_unique,
)

Resulting subspace dimension: (1488, 300)


In [12]:
# Top-1 retrieval accuracy on the de-duplicated test pairs.
# All source vectors are projected with W and compared against every target
# vector in a single cosine_similarity call, instead of calling it once per
# row (which re-processed the full target matrix on every iteration).
similarities_test2 = cosine_similarity(X_s_test2 @ W, X_t_test2)
# Pair i counts as correct when the most similar target row for source i is i.
hits_test2 = similarities_test2.argmax(axis=1) == np.arange(len(X_s_test2))
accuracy = int(hits_test2.sum()) / len(X_s_test2)
accuracy

0.8145161290322581