In [1]:
import numpy as np
from sentence_vector_creation import load_sentences, preprocess_sentences, tf_idf, transform_into_sentence_vectors, AGGREGATION_METHODS, LANGUAGES
import re
import os
import io
from numpy.linalg import svd
from sklearn.metrics.pairwise import cosine_similarity

import os,sys,inspect
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir) 
from induce_multilingual_embedding_space.mono_embedding_loading import load_monolingual_embedding

# Supervised classifier for cross-lingual retrieval (L2R) - DEMO

### Load Europarl sentences and transform to sentence vector embeddings

In [2]:
# Load the English and German fastText word embeddings. The second argument (50000) is the
# n_max cap on how many words are loaded per language.
emb_en, id2word_en, word2id_en = load_monolingual_embedding('../induce_multilingual_embedding_space/fastText_mon_emb/wiki.en.vec', 50000)
emb_de, id2word_de, word2id_de = load_monolingual_embedding('../induce_multilingual_embedding_space/fastText_mon_emb/wiki.de.vec', 50000)

In [3]:
# Load the English and German sides of the Europarl de-en corpus.
# NOTE(review): the second argument (1) is presumably the number of sentences to load -- confirm
# against load_sentences in sentence_vector_creation.
sen_en = load_sentences('europarl_datasets/de-en/europarl-v7.de-en.en', 1)
sen_de = load_sentences('europarl_datasets/de-en/europarl-v7.de-en.de', 1)

In [6]:
# Turn the loaded sentences into sentence vectors, using the monolingual word embeddings of the
# respective language and the 'tf_idf' aggregation method.
sen_emb_en, id2sentence_en = transform_into_sentence_vectors(sen_en, 'english', emb_en, word2id_en, agg_method='tf_idf')
sen_emb_de, id2sentence_de = transform_into_sentence_vectors(sen_de, 'german', emb_de, word2id_de, agg_method='tf_idf')
# Show the shape of the German sentence-embedding matrix as the cell output.
sen_emb_de.shape

(1, 300)

### Learn the projection matrix and compute the cosine similarity of two sentence translations

In [7]:
LEARNING_METHODS = {'procrustes'}  # todo: implement further methods


def learn_projection_matrix(s_vecs, t_vecs, train_expert_dict, s_word2id: dict = None, t_word2id: dict = None,
                            method: str = 'procrustes', n_max: int = 50000):
    """
    Learns projection matrix W that maps source language monolingual embedding into multilingual word embedding space.
    :param s_vecs: path of fastText source monolingual embedding text file or array of word embedding
    :param t_vecs: path of fastText target monolingual embedding text file or array of word embedding
                   (same type as s_vecs required)
    :param train_expert_dict: external expert training translation dictionary (path of a text file or a
                              Python dictionary, as accepted by extract_seed_dictionary)
    :param s_word2id: word/id dictionary, needed if only word vector embeddings are specified in s_vecs
    :param t_word2id: word/id dictionary, needed if only word vector embeddings are specified in t_vecs
    :param method: method to solve the learning problem
    :param n_max: maximum number of most frequent words that are loaded in monolingual word embeddings
    :return: projection matrix W (apply it as ``source_vectors @ W``)
    :raises ValueError: if method is unknown or no usable seed translation pair could be extracted
    :raises TypeError: if s_vecs and t_vecs are of different kinds, or if embeddings are passed as arrays
                       without both word2id dictionaries
    :raises FileNotFoundError: if embeddings are passed as paths and one of the files does not exist
    """
    if method not in LEARNING_METHODS:
        raise ValueError("Method must be one of {}.".format(LEARNING_METHODS))
    if isinstance(s_vecs, str) != isinstance(t_vecs, str):
        raise TypeError("s_vecs and t_vecs must both be file paths or both be embedding arrays.")

    if isinstance(s_vecs, str):
        # Embeddings given as file paths: fail early with a clear message instead of silently
        # treating a mistyped path as if it were an embedding array.
        for path in (s_vecs, t_vecs):
            if not os.path.isfile(path):
                raise FileNotFoundError("Embedding file not found: {}".format(path))
        # The word2id mappings read from the files take precedence over the passed-in ones.
        s_full, _, s_word2id = load_monolingual_embedding(s_vecs, n_max)
        t_full, _, t_word2id = load_monolingual_embedding(t_vecs, n_max)
    else:
        # Embeddings given as arrays: both vocabularies are needed to look up the dictionary words.
        if s_word2id is None or t_word2id is None:
            raise TypeError("word2id dictionaries have to be specified if embeddings are given as numpy arrays.")
        s_full, t_full = s_vecs, t_vecs

    seed = extract_seed_dictionary(train_expert_dict, s_word2id, t_word2id)
    # extract_seed_dictionary returns False for an unusable dictionary; an empty pair list would
    # make the SVD below operate on an all-zero matrix and yield a meaningless W.
    if not seed or not seed[0]:
        raise ValueError("No valid translation pairs could be extracted from the expert dictionary.")
    d_index = seed[0]
    s_emb, t_emb = align_monolingual_subspaces(s_full, t_full, d_index)

    if method == 'procrustes':
        # Orthogonal Procrustes: the orthogonal W minimising ||s_emb @ W - t_emb||_F is U @ Vt,
        # where U, Vt come from the SVD of s_emb^T @ t_emb.
        U, _, Vt = svd(np.transpose(s_emb) @ t_emb)
        return U @ Vt

    # Only reachable if a method is registered in LEARNING_METHODS without an implementation here.
    raise NotImplementedError("Method '{}' is not implemented.".format(method))

def extract_seed_dictionary(expert_dict, s_word2id: dict, t_word2id: dict):
    """
    Extract seed dictionary from external expert dictionary according to vocabularies included in monolingual embedding
    spaces.
    :param expert_dict: external expert dictionary (either path of a text file with one whitespace-separated
                        "source target" pair per line, or Python dictionary mapping source to target words)
    :param s_word2id: source dictionary of words and indices as returned from load_monolingual_embedding
    :param t_word2id: target dictionary of words and indices as returned from load_monolingual_embedding
    :return: (index_pairs, word_pairs) of the resulting seed dictionary, or False if expert_dict is neither
             an existing text file nor a Python dictionary
    """
    malformed = 0

    if isinstance(expert_dict, str) and os.path.isfile(expert_dict):
        summary = ('Found {} valid translation pairs in expert dictionary.\n'
                   '{} other pairs contained at least one unknown word ({} in source language, {} in target language).')
        candidates = []
        with io.open(expert_dict, 'r', encoding='utf-8') as file:
            for line in file:
                tokens = line.split()
                if len(tokens) != 2:
                    # Blank lines are ignored silently; anything else that is not exactly
                    # "source target" is counted and skipped instead of crashing the unpacking.
                    if tokens:
                        malformed += 1
                    continue
                candidates.append((tokens[0], tokens[1]))
    elif isinstance(expert_dict, dict):
        summary = ('Found {} valid translation pairs.\n'
                   '{} other pairs contained at least one unknown word ({} in source language, {} in target language).')
        candidates = expert_dict.items()
    else:
        print('Invalid translation dictionary type. Text file or Python dictionary is required.')
        return False

    index_pairs = []
    word_pairs = []
    misfit = 0
    misfit_s = 0
    misfit_t = 0

    for s_word, t_word in candidates:
        s_known = s_word in s_word2id
        t_known = t_word in t_word2id
        if s_known and t_known:
            index_pairs.append((s_word2id[s_word], t_word2id[t_word]))
            word_pairs.append((s_word, t_word))
        else:
            # A pair counts once as a misfit, but each unknown side is tallied separately.
            misfit += 1
            misfit_s += int(not s_known)
            misfit_t += int(not t_known)

    print(summary.format(len(word_pairs), misfit, misfit_s, misfit_t))
    if malformed:
        print('Skipped {} malformed line(s) that did not contain exactly two words.'.format(malformed))
    return index_pairs, word_pairs


def align_monolingual_subspaces(s_emb: np.ndarray, t_emb: np.ndarray, seed_dictionary: list):
    """
    Build row-aligned source/target embedding subspaces from a seed dictionary.

    Row i of the returned source subspace and row i of the returned target subspace belong to the
    i-th index pair of the seed dictionary.
    :param s_emb: monolingual source embedding as returned from load_monolingual_embedding
    :param t_emb: monolingual target embedding as returned from load_monolingual_embedding
    :param seed_dictionary: index pairs of seed dictionary as returned from extract_seed_dictionary
    :return: aligned source and target subspaces
    """
    # Source side: first component of every index pair selects the embedding row.
    source_rows = []
    for pair in seed_dictionary:
        source_rows.append(pair[0])
    aligned_source = s_emb[source_rows]

    # Target side: second component of every index pair selects the embedding row.
    target_rows = []
    for pair in seed_dictionary:
        target_rows.append(pair[1])
    aligned_target = t_emb[target_rows]

    print("Resulting subspace dimension: {}".format(aligned_source.shape))
    return aligned_source, aligned_target

In [8]:
# Learn the projection matrix W from the English to the German embedding space using the MUSE en-de
# expert dictionary. The embeddings are passed as already-loaded arrays, so the matching word2id
# dictionaries are handed over as well.
W = learn_projection_matrix(emb_en, emb_de, '../induce_multilingual_embedding_space/expert_dictionaries/MUSE_en-de.0-5000.txt', s_word2id=word2id_en, t_word2id=word2id_de)

Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
Resulting subspace dimension: (13700, 300)


In [9]:
# Project the first English sentence vector with W and compare it to the first German sentence
# vector (presumably its translation in the parallel corpus -- confirm). Both vectors are reshaped
# to 2-D row vectors of shape (1, n) before being passed to cosine_similarity; [0] takes the first
# row of the result.
cosine_similarity((sen_emb_en[0] @ W).reshape(1, -1), sen_emb_de[0].reshape(1,-1))[0]

array([0.69952961])