In [19]:
import io
from collections import defaultdict

In [1]:
lang_list = ["en", "de", "es", "it", "fi", "ar", "he", "ms", "ja", "zh", "vi", "th"]

In [42]:
def load_identical_char_dico(word2id1, word2id2):
    """
    Build a dictionary of identical character strings.
    """
#     pairs = [(w1, w1) for w1 in word2id1.keys() if w1 in word2id2 and len(w1) > 1]
    pairs = [(w1, w1) for w1 in word2id1.keys() if w1 in word2id2]
    if len(pairs) == 0:
        raise Exception("No identical character strings were found. "
                        "Please specify a dictionary.")

    print("Found %i pairs of identical character strings." % len(pairs))

#     # sort the dictionary by source word frequencies
#     pairs = sorted(pairs, key=lambda x: word2id1[x[0]])
#     dico = torch.LongTensor(len(pairs), 2)
#     for i, (word1, word2) in enumerate(pairs):
#         dico[i, 0] = word2id1[word1]
#         dico[i, 1] = word2id2[word2]

    return len(pairs), pairs

In [43]:
def load_word2id(emb_path, max_vocab=1000, full_vocab=False):
    """
    Reload pretrained embeddings from a text file.
    """
    word2id = {}

    # load pretrained embeddings 
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        for i, line in enumerate(f):
            if i == 0:
                print(line.strip())
                split = line.split()
                num_words = split[0]
                assert len(split) == 2
            else:
                word, _ = line.rstrip().split(' ', 1)
                if not full_vocab:
                    word = word.lower()
                if word in word2id:
                    if full_vocab:
                        print("Word '%s' found twice in %s embedding file"
                                       % (word, 'source' if source else 'target'))
                else:
                    word2id[word] = len(word2id)
            if max_vocab > 0 and len(word2id) >= max_vocab and not full_vocab:
                break

    print("Loaded %i pre-trained word embeddings from %s." % (len(word2id), emb_path))

    return word2id, num_words

In [44]:
emb_file_list = ["/mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.%s.vec"%lang for lang in lang_list]

In [30]:
for emb_file_i in emb_file_list:
    word2id1, num_words1 = load_word2id(emb_file_i)
    for emb_file_j in emb_file_list:
        if emb_file_j == emb_file_i:
            continue
        word2id2, num_words2 = load_word2id(emb_file_j)
        load_identical_char_dico(word2id1, word2id2)

2519370 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.en.vec.
2275233 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.de.vec.
Found 150 pairs of identical character strings.
985667 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.es.vec.
Found 140 pairs of identical character strings.
871053 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.it.vec.
Found 141 pairs of identical character strings.
730483 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.fi.vec.
Found 128 pairs of identical character strings.
610977 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.ar.vec.
Found 75 pairs of identical character strings.
488936 300
Load

Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.en.vec.
Found 163 pairs of identical character strings.
2275233 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.de.vec.
Found 116 pairs of identical character strings.
985667 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.es.vec.
Found 109 pairs of identical character strings.
871053 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.it.vec.
Found 104 pairs of identical character strings.
730483 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.fi.vec.
Found 105 pairs of identical character strings.
610977 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.ar.vec.
Found 86 pairs of identica

In [50]:
iden_num = defaultdict(dict)
for lang_i in lang_list:
    emb_file_i = "/mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.%s.vec"%lang_i
    max_vocab = 1000
    word2id1, num_words1 = load_word2id(emb_file_i, max_vocab)
    iden_num[lang_i]["num_words"] = num_words1
    iden_num[lang_i]["max_vocab"] = max_vocab
    
    for lang_j in lang_list:
        emb_file_j = "/mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.%s.vec"%lang_j
        if emb_file_j == emb_file_i:
            continue
        word2id2, num_words2 = load_word2id(emb_file_j, max_vocab)
        iden_tmp, iden_pair = load_identical_char_dico(word2id1, word2id2)
        iden_num[lang_i][lang_j] = {"num": iden_tmp, "pairs": iden_pair}

2519370 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.en.vec.
2275233 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.de.vec.
Found 109 pairs of identical character strings.
985667 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.es.vec.
Found 96 pairs of identical character strings.
871053 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.it.vec.
Found 98 pairs of identical character strings.
730483 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.fi.vec.
Found 85 pairs of identical character strings.
610977 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.ar.vec.
Found 36 pairs of identical character strings.
488936 300
Loaded 

871053 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.it.vec.
Found 63 pairs of identical character strings.
730483 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.fi.vec.
Found 64 pairs of identical character strings.
610977 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.ar.vec.
Found 49 pairs of identical character strings.
488936 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.he.vec.
Found 17 pairs of identical character strings.
580000 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.ja.vec.
Found 28 pairs of identical character strings.
332647 300
Loaded 1000 pre-trained word embeddings from /mnt/ssd-201-112-01/cpii.local/lptang/WordEmb/fasttext/wiki.zh.vec.
Found 38 pairs of ide

In [51]:
# max_vocab = 10000
iden_num

defaultdict(dict,
            {'en': {'num_words': '2519370',
              'max_vocab': 1000,
              'de': {'num': 109,
               'pairs': [('the', 'the'),
                ('</s>', '</s>'),
                ('of', 'of'),
                ('in', 'in'),
                ('and', 'and'),
                ('to', 'to'),
                ('was', 'was'),
                ('on', 'on'),
                ('for', 'for'),
                ('from', 'from'),
                ('at', 'at'),
                ('an', 'an'),
                ('also', 'also'),
                ('rd', 'rd'),
                ('new', 'new'),
                ('all', 'all'),
                ('score', 'score'),
                ('team', 'team'),
                ('american', 'american'),
                ('links', 'links'),
                ('see', 'see'),
                ('so', 'so'),
                ('world', 'world'),
                ('university', 'university'),
                ('national', 'national'),
                ('wikiped

In [32]:
# max_vocab = 1000
iden_num

defaultdict(dict,
            {'en': {'num_words': '2519370',
              'max_vocab': 1000,
              'de': 150,
              'es': 140,
              'it': 141,
              'fi': 128,
              'ar': 75,
              'he': 44,
              'ms': 163,
              'ja': 66,
              'zh': 84,
              'vi': 96,
              'th': 63},
             'de': {'num_words': '2275233',
              'max_vocab': 1000,
              'en': 150,
              'es': 110,
              'it': 128,
              'fi': 110,
              'ar': 65,
              'he': 43,
              'ms': 116,
              'ja': 67,
              'zh': 67,
              'vi': 81,
              'th': 52},
             'es': {'num_words': '985667',
              'max_vocab': 1000,
              'en': 140,
              'de': 110,
              'it': 208,
              'fi': 107,
              'ar': 72,
              'he': 44,
              'ms': 109,
              'ja': 67,
              '