In [1]:
import numpy as np
import pandas as pd
import utils

__author__ = "Olivares Castillo José Luis"

In [2]:
def read(file, threshold=0, vocabulary=None, dtype='float'):
    """Parse word embeddings from an open text file.

    The first line is a header holding two space-separated integers: the
    number of entries and the vector dimensionality. Each following line is
    a word, a single space, and the space-separated vector components.

    Args:
        file: Open text file object positioned at the header line.
        threshold: When positive, read at most this many entries; otherwise
            read as many entries as the header announces.
        vocabulary: Optional container of words. When given, only words found
            in it are kept (the same number of lines is still scanned).
        dtype: NumPy dtype used for the parsed vectors.

    Returns:
        A ``(words, matrix)`` tuple: the list of kept words and a NumPy array
        with one row per kept word.
    """
    # Copyright (C) 2016-2018  Mikel Artetxe <artetxem@gmail.com>
    # https://github.com/artetxem/vecmap/blob/master/embeddings.py
    header_fields = file.readline().split(' ')
    announced = int(header_fields[0])
    count = min(threshold, announced) if threshold > 0 else announced
    dim = int(header_fields[1])

    keep_everything = vocabulary is None
    words = []
    if keep_everything:
        # Every scanned line is kept, so the matrix can be allocated up front.
        matrix = np.empty((count, dim), dtype=dtype)
    else:
        # The number of matches is unknown in advance; collect rows in a list.
        matrix = []

    for row in range(count):
        word, vec = file.readline().split(' ', 1)
        if keep_everything:
            words.append(word)
            matrix[row] = np.fromstring(vec, sep=' ', dtype=dtype)
        elif word in vocabulary:
            words.append(word)
            matrix.append(np.fromstring(vec, sep=' ', dtype=dtype))

    if keep_everything:
        return (words, matrix)
    return (words, np.array(matrix, dtype=dtype))

In [3]:
def read_lexicon(source):
    """Load one split of the bilingual lexicon as two parallel word lists.

    Args:
        source: ``"train"`` or ``"test"``, selecting which lexicon file under
            ``../dataset/`` is read.

    Returns:
        A ``(src, trg)`` tuple of lists holding, for every non-blank line of
        the file, its first and second whitespace-separated token.
        ``None`` when ``source`` is neither ``"train"`` nor ``"test"``.

    Raises:
        IndexError: If a non-blank line contains only one token.
        OSError: If the lexicon file cannot be opened.
    """
    lexicon_paths = {
        "train": "../dataset/OPUS_en_it_europarl_train_5K.txt",
        "test": "../dataset/OPUS_en_it_europarl_test.txt",
    }
    # Calling source.__eq__("train") directly returns NotImplemented for
    # non-string arguments, and NotImplemented is truthy, so any such value
    # used to select the train file. Only real strings may pick a file now.
    path = lexicon_paths.get(source) if isinstance(source, str) else None
    if path is None:
        return None

    src, trg = [], []
    with open(path, "r", encoding='utf-8') as file:
        for line in file:
            fields = line.split()  # split once, reuse for both columns
            if not fields:
                continue  # tolerate blank lines (e.g. a trailing newline)
            src.append(fields[0])
            trg.append(fields[1])
    return (src, trg)

In [4]:
def get_vectors(lexicon, words, embeddings, dtype='float'):
    """Collect the embedding row of every lexicon entry into one matrix.

    Args:
        lexicon: Sequence of words to look up.
        words: Vocabulary sequence; ``words[k]`` labels ``embeddings[k]``.
        embeddings: 2-D NumPy array of word vectors.
        dtype: NumPy dtype of the returned matrix.

    Returns:
        Array of shape ``(len(lexicon), embeddings.shape[1])``. Row ``i`` is
        the vector of ``lexicon[i]`` (first occurrence in ``words``), or all
        zeros when the word is not in ``words``.
    """
    # One pass over the vocabulary replaces a linear `in` test plus a linear
    # `.index()` call per lexicon entry. setdefault keeps the first position
    # of a duplicated word, which is what list.index() returns.
    first_position = {}
    for position, word in enumerate(words):
        first_position.setdefault(word, position)

    # Zero-initialised so out-of-vocabulary rows are well defined; np.empty
    # would leave arbitrary memory contents in them.
    matrix = np.zeros((len(lexicon), embeddings.shape[1]), dtype=dtype)
    for row, entry in enumerate(lexicon):
        position = first_position.get(entry)
        if position is not None:
            matrix[row] = embeddings[position]
    return matrix

In [5]:
# Load the "train" lexicon as two parallel word lists (first and second column
# of each line) and display their lengths as a quick sanity check.
# NOTE(review): presumably `en` is the English side and `it` the Italian side,
# going by the variable names and the dataset file name — confirm.
en,it=read_lexicon("train")
len(en),len(it)

(5000, 5000)

In [6]:
# Parse the EN embeddings file into a vocabulary list and a vector matrix.
# The `with` block closes the file handle as soon as parsing finishes instead
# of leaving it open for the rest of the kernel session.
with open("../dataset/EN.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt",
          encoding="utf-8", errors="surrogateescape") as source_vec:
    words_en, en_vec = read(source_vec)

In [7]:
# Build one embedding row per entry of the `en` lexicon list, looked up in the
# vocabulary/matrix pair parsed from the EN embeddings file; the last line
# displays the resulting matrix shape.
src_vec=get_vectors(en,words_en,en_vec)
src_vec.shape

(5000, 300)

In [8]:
# Parse the IT embeddings file into a vocabulary list and a vector matrix.
# The `with` block closes the file handle as soon as parsing finishes instead
# of leaving it open for the rest of the kernel session.
with open("../dataset/IT.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt",
          encoding="utf-8", errors="surrogateescape") as target_vec:
    words_it, it_vec = read(target_vec)

In [9]:
# Build one embedding row per entry of the `it` lexicon list, looked up in the
# vocabulary/matrix pair parsed from the IT embeddings file; the last line
# displays the resulting matrix shape.
trg_vec=get_vectors(it,words_it,it_vec)
trg_vec.shape

(5000, 300)

In [41]:
# Inspect the vocabulary list parsed from the EN embeddings file.
# NOTE(review): this displays the entire list; a slice such as words_en[:20]
# would keep the saved output short.
words_en

['the',
 ',',
 '.',
 'of',
 'and',
 'to',
 'in',
 'a',
 'is',
 '"',
 'for',
 'on',
 'that',
 ')',
 '(',
 'with',
 'was',
 'as',
 'it',
 'by',
 'be',
 ':',
 "'s",
 'are',
 'at',
 'this',
 'from',
 'you',
 'or',
 'i',
 'an',
 'he',
 'have',
 "'",
 'not',
 '-',
 'which',
 'his',
 'will',
 'has',
 'but',
 'we',
 'they',
 'all',
 'their',
 'were',
 'can',
 ';',
 'one',
 'also',
 'there',
 'had',
 'more',
 'been',
 'if',
 'who',
 'new',
 'your',
 'other',
 '?',
 'its',
 'when',
 'about',
 'first',
 'up',
 'would',
 'out',
 'do',
 'time',
 'some',
 'so',
 'may',
 'our',
 'what',
 'into',
 'two',
 'no',
 'these',
 'people',
 'only',
 'after',
 'any',
 '!',
 'her',
 'she',
 'over',
 'than',
 'work',
 'them',
 "n't",
 'most',
 'such',
 'then',
 'my',
 'many',
 'information',
 'where',
 'years',
 'use',
 'now',
 'year',
 'should',
 'well',
 'how',
 'very',
 'being',
 'through',
 'like',
 '%',
 'between',
 'made',
 'used',
 'world',
 'us',
 'just',
 'could',
 '£',
 'see',
 'part',
 'school',
 'him

In [46]:
# Dump the source-side lexicon vectors as plain text: one matrix row per line,
# components separated by single spaces. The lexicon words are not written.
with open("../dataset/NEW-EN","w") as file:
    for row in src_vec:
        file.write(" ".join(str(component) for component in row) + "\n")

In [48]:
# Dump the target-side lexicon vectors as plain text: one matrix row per line,
# components separated by single spaces. The lexicon words are not written.
with open("../dataset/NEW-IT","w") as file:
    for row in trg_vec:
        file.write(" ".join(str(component) for component in row) + "\n")

In [43]:
# Spot check: find the position of the word "per" in the IT vocabulary list.
words_it.index("per")

12

In [45]:
# Spot check: display embedding row 12 of the IT matrix — 12 is the position
# that words_it.index("per") returned in the recorded output.
it_vec[12]

array([-0.004587, -0.09339 ,  0.024194, -0.160234, -0.081758,  0.02754 ,
        0.001967, -0.007735, -0.07485 ,  0.004046, -0.015186,  0.043899,
        0.060886, -0.084293,  0.02517 ,  0.019767, -0.088462, -0.100752,
       -0.076238, -0.029609, -0.037766,  0.074355, -0.011544,  0.124296,
       -0.131977,  0.016333, -0.024172,  0.039691, -0.051303, -0.046599,
        0.020477,  0.177797,  0.001731, -0.031561,  0.080046, -0.02657 ,
        0.041981,  0.024949,  0.100902, -0.018006,  0.171553, -0.153255,
        0.031861,  0.049595,  0.05172 ,  0.056013, -0.089203, -0.070196,
       -0.035462,  0.012322, -0.031717, -0.013635,  0.014406, -0.121697,
        0.07401 , -0.003795,  0.136772,  0.027047, -0.145845,  0.069214,
       -0.133257, -0.105563, -0.121019, -0.049852, -0.095362, -0.029586,
       -0.120687, -0.104069,  0.018176,  0.052571,  0.085504,  0.008815,
       -0.042495, -0.012995,  0.138161, -0.023372, -0.054404,  0.032455,
       -0.067908, -0.01588 ,  0.015618,  0.044195, 