In [1]:
import numpy as np
from sentence_vector_creation import load_sentences, preprocess_sentences, tf_idf, transform_into_sentence_vectors, AGGREGATION_METHODS, LANGUAGES
import re
import os
import io
from numpy.linalg import svd
from sklearn.metrics.pairwise import cosine_similarity

import os,sys,inspect
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir) 
from induce_multilingual_embedding_space.mono_embedding_loading import load_monolingual_embedding

# Supervised classifier for cross-lingual retrieval (L2R) - DEMO

### Load Europarl sentences and transform to sentence vector embeddings

In [2]:
emb_en, id2word_en, word2id_en = load_monolingual_embedding('../induce_multilingual_embedding_space/fastText_mon_emb/wiki.en.vec', 50000)
emb_de, id2word_de, word2id_de = load_monolingual_embedding('../induce_multilingual_embedding_space/fastText_mon_emb/wiki.de.vec', 50000)

In [3]:
sen_en = load_sentences('europarl_datasets/de-en/europarl-v7.de-en.en', 1)
sen_de = load_sentences('europarl_datasets/de-en/europarl-v7.de-en.de', 1)

In [6]:
sen_emb_en, id2sentence_en = transform_into_sentence_vectors(sen_en, 'english', emb_en, word2id_en, agg_method='tf_idf')
sen_emb_de, id2sentence_de = transform_into_sentence_vectors(sen_de, 'german', emb_de, word2id_de, agg_method='tf_idf')
sen_emb_de.shape

(1, 300)

### Compute cosine similarity of two sentence translations

In [7]:
LEARNING_METHODS = {'procrustes'}  # todo: implement further methods


def learn_projection_matrix(s_vecs, t_vecs, train_expert_dict, s_word2id: dict = None, t_word2id: dict = None,
                            method: str = 'procrustes', n_max: int = 50000):
    """
    Learns projection matrix W that maps source language monolingual embedding into multilingual word embedding space.
    :param s_vecs: path of fastText source monolingual embedding text file or array of word embedding
    :param t_vecs: path of fastText target monolingual embedding text file or array of word embedding
                   (same type as s_vecs required)
    :param train_expert_dict: path of external expert training translation dictionary
    :param s_word2id: word/id dictionary, needed if only word vector embeddings are specified in s_vecs
    :param t_word2id: word/id dictionary, needed if only word vector embeddings are specified in t_vecs
    :param method: method to solve the learning problem
    :param n_max: maximum number of most frequent words that are loaded in monolingual word embeddings
    :return: projection matrix W
    """
    if method not in LEARNING_METHODS:
        raise ValueError("Method must be one of {}.".format(LEARNING_METHODS))
    if isinstance(s_vecs, np.ndarray) and s_word2id is None:
        raise TypeError("word2id dictionaries have to be specified if embeddings are given as numpy arrays.")

    if isinstance(s_vecs, str) and os.path.isfile(s_vecs):
        l1_emb, l1_id2word, l1_word2id = load_monolingual_embedding(s_vecs, n_max)
        l2_emb, l2_id2word, l2_word2id = load_monolingual_embedding(t_vecs, n_max)
        d_index, d_word = extract_seed_dictionary(train_expert_dict, l1_word2id, l2_word2id)
        s_emb, t_emb = align_monolingual_subspaces(l1_emb, l2_emb, d_index)
    else:
        d_index, d_word = extract_seed_dictionary(train_expert_dict, s_word2id, t_word2id)
        s_emb, t_emb = align_monolingual_subspaces(s_vecs, t_vecs, d_index)

    if method == 'procrustes':
        U, s, Vt = svd(np.transpose(s_emb) @ t_emb)
        W = U @ Vt
        return W

def extract_seed_dictionary(expert_dict, s_word2id: dict, t_word2id: dict):
    """
    Extract seed dictionary from external expert dictionary according to vocabularies included in monolingual embedding
    spaces.
    :param expert_dict: external expert dictionary (either text file or Python dictionary)
    :param s_word2id: source dictionary of words and indices as returned from load_monolingual_embedding
    :param t_word2id: target dictionary of indices and words as returned from load_monolingual_embedding
    :return: index/word pairs of resulting seed dictionary
    """
    index_pairs = []
    word_pairs = []
    misfit = 0
    misfit_s = 0
    misfit_t = 0

    if isinstance(expert_dict, str) and os.path.isfile(expert_dict):
        with io.open(expert_dict, 'r', encoding='utf-8') as file:
            for index, word_pair in enumerate(file):
                s_word, t_word = word_pair.rstrip().split()
                if s_word in s_word2id and t_word in t_word2id:
                    index_pairs.append((s_word2id[s_word], t_word2id[t_word]))
                    word_pairs.append((s_word, t_word))
                else:
                    misfit += 1
                    misfit_s += int(s_word not in s_word2id)
                    misfit_t += int(t_word not in t_word2id)
            print('Found {} valid translation pairs in expert dictionary.\n'
                  '{} other pairs contained at least one unknown word ({} in source language, {} in target language).'
                  .format(len(word_pairs), misfit, misfit_s, misfit_t))
            return index_pairs, word_pairs

    elif isinstance(expert_dict, dict):
        for s_word, t_word in expert_dict.items():
            if s_word in s_word2id and t_word in t_word2id:
                index_pairs.append((s_word2id[s_word], t_word2id[t_word]))
                word_pairs.append((s_word, t_word))
            else:
                misfit += 1
                misfit_s += int(s_word not in s_word2id)
                misfit_t += int(t_word not in t_word2id)
        print('Found {} valid translation pairs.\n'
              '{} other pairs contained at least one unknown word ({} in source language, {} in target language).'
              .format(len(word_pairs), misfit, misfit_s, misfit_t))
        return index_pairs, word_pairs

    else:
        print('Invalid translation dictionary type. Text file or Python dictionary is required.')
        return False


def align_monolingual_subspaces(s_emb: np.ndarray, t_emb: np.ndarray, seed_dictionary: list):
    """
    Create aligned monolingual subspaces from seed dictionary.
    :param s_emb: monolingual source embedding as returned from load_monolingual_embedding
    :param t_emb: monolingual target embedding as returned from load_monolingual_embedding
    :param seed_dictionary: index pairs of seed dictionary as returned from extract_seed_dictionary
    :return: aligned source and target subspaces
    """
    s_subspace = s_emb[[tuples[0] for tuples in seed_dictionary]]
    t_subspace = t_emb[[tuples[1] for tuples in seed_dictionary]]
    print("Resulting subspace dimension: {}".format(s_subspace.shape))
    return s_subspace, t_subspace

In [8]:
W = learn_projection_matrix(emb_en, emb_de, '../induce_multilingual_embedding_space/expert_dictionaries/MUSE_en-de.0-5000.txt', s_word2id=word2id_en, t_word2id=word2id_de)

Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
Resulting subspace dimension: (13700, 300)


In [9]:
cosine_similarity((sen_emb_en[0] @ W).reshape(1, -1), sen_emb_de[0].reshape(1,-1))[0]

array([0.69952961])

### Create training dataset

In [18]:
import pandas as pd
from xml.dom import minidom


def process_tuv(tuv):
    lang = tuv.getAttribute("lang")
    if lang == '':
        lang = tuv.getAttribute("xml:lang")
    seg = tuv.getElementsByTagName('seg')[0]
    txt = seg.childNodes[0].data
    return lang, txt

def tmx_dataframe(path):

    """takes in a path to TMX translation file and outputs the metadata and a pandas dataframe.
    Args:
        param1 (str): The path to the TMX translation file
    Returns:
        Metadata: The header of the TMX file, which contains metadata
        DataFrame: A Pandas Dataframe. Each line item consists of source_language, source_sentence, target_language, target_sentence
    """
    # parse an xml file by name
    tmx = minidom.parse(path)

    # Get metadata
    metadata = {}
    header = tmx.getElementsByTagName('header')[0]
    for key in header.attributes.keys():
        metadata[key] = header.attributes[key].value
        
    srclang = metadata['srclang']

    # Get translation sentences
    body = tmx.getElementsByTagName('body')[0]
    translation_units = body.getElementsByTagName('tu')
    items = []
    count_unpaired = 0
    for tu in translation_units:
        if len(tu.getElementsByTagName('tuv')) < 2:
            print("Unpaired translation. Ignoring...")
            count_unpaired = count_unpaired + 1
        else:
            srclang, srcsentence = process_tuv(tu.getElementsByTagName('tuv')[0])
            targetlang, targetsentence = process_tuv(tu.getElementsByTagName('tuv')[1])
            item = {
                'source_sentence': srcsentence,
                'target_sentence': targetsentence
            }
            items.append(item)

    df = pd.DataFrame(items)
    if count_unpaired > 0:
        print("The data contained %d unpaired translations which were ignored" % (count_unpaired))
    return metadata, df

In [19]:
meta, df = tmx_dataframe('europarl_datasets/de-en.tmx')

In [57]:
df_train = df[:50000]
df_train.loc[:, 'translation'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [58]:
df_wrong_translations =  pd.concat([df[:50000][['source_sentence']]]*4, ignore_index=True)

In [61]:
df_wrong_translations['target_sentence'] = np.random.choice(df[50000:]['target_sentence'], 200000)
df_wrong_translations.loc[:, 'translation'] = 0

In [62]:
df_wrong_translations

Unnamed: 0,source_sentence,target_sentence,translation
0,"Ich erkläre die am Freitag, dem 17. Dezember u...",I will now respond briefly to a few criticisms.,0
1,"Wie Sie feststellen konnten, ist der gefürchte...","However, it does not prevent states or suprana...",0
2,Im Parlament besteht der Wunsch nach einer Aus...,"It is very easy in the external field, perhaps...",0
3,Heute möchte ich Sie bitten - das ist auch der...,We have only 13 months to bring along the deci...,0
4,"Ich bitte Sie, sich zu einer Schweigeminute zu...",The orchestrator of this state terrorism is th...,0
5,Wie Sie sicher aus der Presse und dem Fernsehe...,The third difference concerns voting in the Go...,0
6,"Zu den Attentatsopfern, die es in jüngster Zei...",In the first case the aid goes to the owner an...,0
7,"Wäre es angemessen, wenn Sie, Frau Präsidentin...",The achievements of the programme in boosting ...,0
8,"Ja, Herr Evans, ich denke, daß eine derartige ...",There was good coordination between the Commit...,0
9,"Wenn das Haus damit einverstanden ist, werde i...",You could call this ‘slit and chuck’.,0


In [63]:
df_train = df_train.append(df_wrong_translations, ignore_index=True)

In [64]:
df_train

Unnamed: 0,source_sentence,target_sentence,translation
0,"Ich erkläre die am Freitag, dem 17. Dezember u...",I declare resumed the session of the European ...,1
1,"Wie Sie feststellen konnten, ist der gefürchte...","Although, as you will have seen, the dreaded '...",1
2,Im Parlament besteht der Wunsch nach einer Aus...,You have requested a debate on this subject in...,1
3,Heute möchte ich Sie bitten - das ist auch der...,"In the meantime, I should like to observe a mi...",1
4,"Ich bitte Sie, sich zu einer Schweigeminute zu...","Please rise, then, for this minute' s silence.",1
5,Wie Sie sicher aus der Presse und dem Fernsehe...,You will be aware from the press and televisio...,1
6,"Zu den Attentatsopfern, die es in jüngster Zei...",One of the people assassinated very recently i...,1
7,"Wäre es angemessen, wenn Sie, Frau Präsidentin...","Would it be appropriate for you, Madam Preside...",1
8,"Ja, Herr Evans, ich denke, daß eine derartige ...","Yes, Mr Evans, I feel an initiative of the typ...",1
9,"Wenn das Haus damit einverstanden ist, werde i...","If the House agrees, I shall do as Mr Evans ha...",1
