In [29]:
import io
import os
import warnings

import numpy as np
import torch

In [38]:
def read_txt_embeddings(emb_path, max_vocab=1000, full_vocab=False):
    """
    Reload pretrained embeddings from a text file.

    The first line of the file is a header made of exactly two
    whitespace-separated fields (e.g. ``2519370 300``); it is printed as-is.
    Every following line is ``<word> <v1> <v2> ...`` with space-separated
    values.

    Args:
        emb_path: path of the embedding text file.
        max_vocab: stop reading once this many distinct words have been kept.
            A value <= 0 disables the limit. Ignored when `full_vocab` is True.
        full_vocab: when False, words are lower-cased and only the first
            occurrence of each lower-cased word is kept (later ones are
            dropped silently). When True, the original casing is kept, the
            whole file is read, and a word that appears twice triggers a
            warning; its second vector is dropped.

    Returns:
        A tuple ``(id2word, word2id, embeddings)``: the id -> word dict, the
        word -> id dict (ids follow file order), and a float torch tensor
        holding one row per kept word, row ``i`` belonging to word id ``i``.

    Raises:
        AssertionError: if the header line does not have exactly two fields.
    """
    word2id = {}
    vectors = []

    # load pretrained embeddings
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        for i, line in enumerate(f):
            if i == 0:
                print(line.strip())
                split = line.split()
                assert len(split) == 2
            else:
                word, vect = line.rstrip().split(' ', 1)
                if not full_vocab:
                    word = word.lower()
                vect = np.fromstring(vect, sep=' ')
                if np.linalg.norm(vect) == 0:  # avoid to have null embeddings
                    vect[0] = 0.01
                if word in word2id:
                    if full_vocab:
                        # The original code called an undefined `logger` with
                        # an undefined `source` flag here, so any duplicate
                        # raised NameError. Report the duplicate through the
                        # warnings machinery and name the file instead.
                        warnings.warn(
                            "Word '%s' found twice in embedding file %s"
                            % (word, emb_path),
                            stacklevel=2,
                        )
                else:
                    word2id[word] = len(word2id)
                    # vect[None] adds a leading axis so rows can be concatenated
                    vectors.append(vect[None])
            if max_vocab > 0 and len(word2id) >= max_vocab and not full_vocab:
                break

    assert len(word2id) == len(vectors)
    print("Loaded %i pre-trained word embeddings from %s." % (len(vectors), emb_path))

    # compute new vocabulary / embeddings
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.concatenate(vectors, 0)
    embeddings = torch.from_numpy(embeddings).float()

    return id2word, word2id, embeddings

In [39]:
# Text-format (.vec) embedding files: English is used as the source language
# and German as the target language throughout this notebook.
emb_file_src, emb_file_tgt = (
    "/mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.en.vec",
    "/mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.de.vec",
)

In [40]:
# Load both embedding tables with the loader's defaults spelled out: reading
# stops after the first 1000 distinct lower-cased words of each file.
id2word_src, word2id_src, emb_src = read_txt_embeddings(
    emb_file_src, max_vocab=1000, full_vocab=False
)
id2word_tgt, word2id_tgt, emb_tgt = read_txt_embeddings(
    emb_file_tgt, max_vocab=1000, full_vocab=False
)

2519370 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.en.vec.
2275233 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.de.vec.


In [41]:
def load_dictionary(path, word2id1, word2id2):
    """
    Read a bilingual dictionary file and keep the pairs covered by both vocabularies.

    Each line of the file holds two whitespace-separated, lower-case words:
    a word of the first language followed by its translation. A pair is kept
    only when the first word is a key of `word2id1` and the second a key of
    `word2id2`; a summary of kept and rejected pairs is printed.

    Args:
        path: path of the dictionary file (must exist).
        word2id1: word -> id mapping for the first language.
        word2id2: word -> id mapping for the second language.

    Returns:
        A tuple ``(dico, pairs)``. `pairs` is the list of kept
        ``(word1, word2)`` tuples sorted by the id of `word1` in `word2id1`;
        `dico` is a torch LongTensor of size (n, 2) whose row i holds the two
        ids of ``pairs[i]``.

    Raises:
        AssertionError: if `path` is not a file or a line is not lower-case.
        ValueError: if a line does not split into exactly two words.
    """
    assert os.path.isfile(path)

    kept = []
    missing_any = missing_src = missing_tgt = 0

    with io.open(path, 'r', encoding='utf-8') as handle:
        for entry in handle:
            assert entry == entry.lower()
            src_word, tgt_word = entry.rstrip().split()
            src_known = src_word in word2id1
            tgt_known = tgt_word in word2id2
            if src_known and tgt_known:
                kept.append((src_word, tgt_word))
                continue
            # at least one side is out of vocabulary: count which one(s)
            missing_any += 1
            missing_src += int(not src_known)
            missing_tgt += int(not tgt_known)

    summary = ("Found %i pairs of words in the dictionary (%i unique). "
               "%i other pairs contained at least one unknown word "
               "(%i in lang1, %i in lang2)")
    distinct_sources = {src_word for src_word, _ in kept}
    print(summary % (len(kept), len(distinct_sources),
                     missing_any, missing_src, missing_tgt))

    # order pairs by the id of their source word (stable sort keeps file
    # order among translations of the same word)
    kept.sort(key=lambda pair: word2id1[pair[0]])

    dico = torch.LongTensor(len(kept), 2)
    for row, (src_word, tgt_word) in enumerate(kept):
        dico[row, 0] = word2id1[src_word]
        dico[row, 1] = word2id2[tgt_word]

    return dico, kept

In [42]:
# Dictionary file of source/target word pairs; only pairs whose two words are
# both present in the loaded vocabularies are kept.
dict_path = "/mnt/ssd-201-112-01/cpii.local/lptang/dictionaries/MUSE/en-de.0-5000.txt"
dico, dico_words = load_dictionary(
    path=dict_path,
    word2id1=word2id_src,
    word2id2=word2id_tgt,
)

Found 801 pairs of words in the dictionary (552 unique). 13876 other pairs contained at least one unknown word (11737 in lang1, 13306 in lang2)


In [43]:
# Display the kept (source word, target word) pairs, ordered by source word id.
dico_words

[('the', 'die'),
 ('the', 'der'),
 ('the', 'dem'),
 ('the', 'den'),
 ('the', 'das'),
 ('and', 'sowie'),
 ('and', 'und'),
 ('was', 'war'),
 ('was', 'wurde'),
 ('for', 'für'),
 ('that', 'dass'),
 ('that', 'das'),
 ('with', 'mit'),
 ('from', 'vom'),
 ('from', 'von'),
 ('from', 'ab'),
 ('from', 'aus'),
 ('this', 'dieser'),
 ('this', 'diese'),
 ('this', 'das'),
 ('his', 'seinem'),
 ('his', 'seinen'),
 ('his', 'seine'),
 ('his', 'sein'),
 ('his', 'seiner'),
 ('not', 'nicht'),
 ('not', 'kein'),
 ('are', 'sind'),
 ('which', 'welches'),
 ('which', 'welche'),
 ('also', 'ebenso'),
 ('also', 'außerdem'),
 ('also', 'ebenfalls'),
 ('also', 'auch'),
 ('has', 'hat'),
 ('were', 'wurden'),
 ('were', 'waren'),
 ('but', 'doch'),
 ('but', 'aber'),
 ('but', 'allerdings'),
 ('but', 'jedoch'),
 ('have', 'haben'),
 ('have', 'habe'),
 ('one', 'einen'),
 ('one', 'einer'),
 ('one', 'eine'),
 ('one', 'ein'),
 ('new', 'neue'),
 ('new', 'neu'),
 ('new', 'neuen'),
 ('first', 'erste'),
 ('first', 'erster'),
 ('first',

In [44]:
# Keep the first listed translation of each source word, for the first 10
# distinct source words only. Relies on pairs of the same source word being
# adjacent in dico_words.
dico_words_uniq = []
prev_word1 = ""
for word1, word2 in dico_words:
    if word1 == prev_word1:
        # another translation of the word already recorded: skip it
        continue
    dico_words_uniq.append((word1, word2))
    prev_word1 = word1
    if len(dico_words_uniq) >= 10:
        break

In [45]:
# Display the selected pairs: one translation per distinct source word.
dico_words_uniq

[('the', 'die'),
 ('and', 'sowie'),
 ('was', 'war'),
 ('for', 'für'),
 ('that', 'dass'),
 ('with', 'mit'),
 ('from', 'vom'),
 ('this', 'dieser'),
 ('his', 'seinem'),
 ('not', 'nicht')]

In [48]:
# Convert the selected word pairs into an (n, 2) tensor of ids:
# column 0 = source word id, column 1 = target word id.
dico_uniq = torch.LongTensor(len(dico_words_uniq), 2)
for i, (word1, word2) in enumerate(dico_words_uniq):
    dico_uniq[i] = torch.LongTensor([word2id_src[word1], word2id_tgt[word2]])
dico_uniq

tensor([[  2,   6],
        [  7, 100],
        [ 14,  34],
        [ 17,  20],
        [ 20,  55],
        [ 22,  16],
        [ 23,  78],
        [ 26,  86],
        [ 31, 154],
        [ 32,  30]])

In [50]:
# Placeholder for an average similarity; nothing in this cell updates it.
avg_cos = 0
# Cosine similarity between each selected source-word embedding and the
# embedding of its paired target word, computed row by row (dim=1) in one call
# instead of looping over the pairs.
cos_all = torch.cosine_similarity(
    emb_src[dico_uniq[:, 0]],
    emb_tgt[dico_uniq[:, 1]],
    dim=1,
)

In [52]:
# Display the per-pair cosine similarities (one value per row of dico_uniq).
cos_all

tensor([ 0.0193,  0.0257,  0.0758,  0.0467,  0.0366,  0.0144,  0.0531,  0.1040,
        -0.0562,  0.1039])