In [1]:
import gzip
import io
import numpy as np
import utils
from zipfile import ZipFile
from collections import Counter

__author__ = "Olivares Castillo José Luis"

In [2]:
def read(file, threshold=0, vocabulary=None, dtype='float'):
    """Parse a word2vec-style text embedding file.

    The first line must hold ``"<count> <dim>"``; every following line holds
    a word, one space, and the space-separated vector components.

    Arguments:
        file -- open text-mode file object positioned at the header line.
        threshold {int} -- if > 0, read at most this many rows after the
            header; otherwise read the number of rows the header announces.
        vocabulary -- optional container supporting ``in``; when given, only
            rows whose word is in it are kept.
        dtype -- NumPy dtype of the returned matrix.

    Return:
        tuple -- ``(words, matrix)``; ``matrix[i]`` is the vector of
        ``words[i]``. With a vocabulary that matches nothing the matrix has
        shape ``(0, dim)``.

    Raises:
        ValueError -- if the file ends before the expected number of rows,
            or a row does not contain a word followed by a vector.
    """
    header = file.readline().split(' ')
    count = int(header[0]) if threshold <= 0 else min(threshold, int(header[0]))
    dim = int(header[1])
    filtering = vocabulary is not None

    words = []
    # Without a vocabulary the final size is known, so preallocate; with one,
    # the number of surviving rows is unknown until the scan is finished.
    matrix = [] if filtering else np.empty((count, dim), dtype=dtype)

    for i in range(count):
        line = file.readline()
        if not line:
            # readline() returns '' only at EOF: the header promised more rows.
            raise ValueError(
                "embedding file ended after %d of %d expected rows" % (i, count))
        word, vec = line.split(' ', 1)
        if not filtering:
            words.append(word)
            matrix[i] = np.fromstring(vec, sep=' ', dtype=dtype)
        elif word in vocabulary:
            words.append(word)
            matrix.append(np.fromstring(vec, sep=' ', dtype=dtype))

    if not filtering:
        return words, matrix
    if not matrix:
        # np.array([]) would be 1-D; keep the result 2-D so .shape[1] works.
        return words, np.empty((0, dim), dtype=dtype)
    return words, np.array(matrix, dtype=dtype)

def get_vectors(lexicon, words, embeddings, dtype='float'):
    """Gather the embedding row of every lexicon entry.

    Arguments:
        lexicon {list} -- words whose vectors are wanted, in output order.
        words {list} -- words labelling the rows of ``embeddings``.
        embeddings {numpy.ndarray} -- 2-D matrix; row ``j`` belongs to
            ``words[j]``.
        dtype -- NumPy dtype of the returned matrix.

    Return:
        numpy.ndarray -- matrix of shape ``(len(lexicon), embeddings.shape[1])``.
        Row ``i`` is the vector of ``lexicon[i]``; entries that do not occur
        in ``words`` get an all-zero row.
    """
    # Map each word to its first row, the same row list.index() would return,
    # so every lookup is O(1) instead of a scan over `words`.
    first_row = {}
    for position, word in enumerate(words):
        first_row.setdefault(word, position)

    # Zero-initialised so rows of missing words are deterministic rather than
    # whatever bytes happened to be in freshly allocated memory.
    matrix = np.zeros((len(lexicon), embeddings.shape[1]), dtype=dtype)
    for i, entry in enumerate(lexicon):
        position = first_row.get(entry)
        if position is not None:
            matrix[i] = embeddings[position]
    return matrix

In [3]:
# Load the "es-na.train" lexicon through the project helper; it returns two
# sequences (presumably aligned source/target word lists — confirm in utils).
src,trg=utils.get_lexicon("es-na.train")
# Display both lengths to check that the two sides have the same size.
len(src),len(trg)

(496, 496)

In [4]:
# Read the full source-side node2vec embedding file, then pick the rows that
# correspond to the lexicon entries in `src`. The context manager closes the
# file as soon as read() has consumed it.
# errors="surrogateescape" lets undecodable bytes through instead of raising.
with open("datasets/es-na/es.node2vec.norm.embeddings", errors="surrogateescape") as source_vec:
    words_en, en_vec = read(source_vec)
src_vec = get_vectors(src, words_en, en_vec)
src_vec.shape

(496, 128)

In [5]:
# Read the full target-side node2vec embedding file, then pick the rows that
# correspond to the lexicon entries in `trg`. The handle gets its own name so
# it is not shadowed by the matrix and is closed when the block exits;
# `target_vec` ends up bound to the embedding matrix.
with open("datasets/es-na/na.node2vec.norm.embeddings", errors="surrogateescape") as target_file:
    words_trg, target_vec = read(target_file)
trg_vec = get_vectors(trg, words_trg, target_vec)
trg_vec.shape

(496, 128)

In [6]:
# Write one "<word> <v1> <v2> ..." line per row of src_vec, labelled with the
# matching entry of src. Lines are newline-separated and the last line has no
# trailing newline.
n_rows = src_vec.shape[0]
with open("datasets/es-na/es.496.128d.train.norm.n2v", "w") as out:
    for row in range(n_rows):
        values = " ".join(str(value) for value in src_vec[row])
        terminator = "" if row == n_rows - 1 else "\n"
        out.write(src[row] + " " + values + terminator)

In [7]:
# Write one "<word> <v1> <v2> ..." line per row of trg_vec, labelled with the
# matching entry of trg. Lines are newline-separated and the last line has no
# trailing newline.
n_rows = trg_vec.shape[0]
with open("datasets/es-na/na.496.128d.train.norm.n2v", "w") as out:
    for row in range(n_rows):
        values = " ".join(str(value) for value in trg_vec[row])
        terminator = "" if row == n_rows - 1 else "\n"
        out.write(trg[row] + " " + values + terminator)

In [None]:
# Load the "en-it.train" lexicon through the project helper and display the
# length of each side.
src_lex, trg_lex = utils.get_lexicon("en-it.train")
len(src_lex), len(trg_lex)

In [None]:
# Count how many times each entry occurs in trg_lex (exposes duplicates).
asd = Counter(trg_lex)

In [None]:
# Display how often "per" occurs in trg_lex; a Counter yields 0 for absent keys.
asd["per"]

In [None]:
# Keep only the embedding rows whose word occurs in src_lex. read() accepts
# just (file, threshold, vocabulary, dtype), so no other keyword is passed.
# The context manager closes the file once read() has consumed it.
# NOTE(review): this path has no "en-it/" directory and no explicit encoding,
# unlike the matching cell for the "it" file — confirm both are intended.
with open("datasets/en.200k.300d.fst", "r") as filesrc:
    wordssrc, vecsrc = read(filesrc, vocabulary=set(src_lex))
vecsrc.shape

In [None]:
# Keep only the embedding rows whose word occurs in trg_lex. read() accepts
# just (file, threshold, vocabulary, dtype), so no other keyword is passed.
# The context manager closes the file once read() has consumed it.
with open("datasets/en-it/it.200k.300d.fst", "r", encoding="utf-8") as filetrg:
    wordstrg, vectrg = read(filetrg, vocabulary=set(trg_lex))
vectrg.shape

In [None]:
# Spot check: first lexicon entry of each side next to the first word that
# read() returned for each file, plus the first two components of rows 0 and 3
# of vectrg.
src_lex[0],trg_lex[0],wordssrc[0],wordstrg[0],vectrg[0][:2],vectrg[3][:2]

In [None]:
# Display the first five entries of wordstrg (IndexError if it has fewer).
[wordstrg[i] for i in range(5)]

In [None]:
# Display the first two rows of vectrg in full.
vectrg[0],vectrg[1]

In [None]:
# Display the shape of the filtered source matrix.
vecsrc.shape

In [None]:
def write_to_file(path,matrix,lex):
    """Write labelled vectors to a UTF-8 text file.

    Arguments:
        path {str} -- destination file; overwritten if it exists.
        matrix {numpy.ndarray} -- one vector per row.
        lex {list} -- label for each row; ``lex[i]`` is written in front of
            ``matrix[i]``.

    Each line is ``"<label> <v1> <v2> ..."``. Lines are separated by a
    newline and the last line has no trailing newline.
    """
    n_rows = matrix.shape[0]
    final_row = n_rows - 1
    with open(path, "w", encoding="utf-8") as out:
        for row in range(n_rows):
            components = " ".join(str(value) for value in matrix[row])
            line = lex[row] + " " + components
            if row == final_row:
                out.write(line)
            else:
                out.write(line + "\n")

In [None]:
# Label each row with the word read() returned for it: vecsrc[i] is the vector
# of wordssrc[i] (embedding-file order), which need not match src_lex order.
write_to_file(path="datasets/en-it/en.5k.300d.fst",matrix=vecsrc,lex=wordssrc)

In [None]:
# Label each row with the word read() returned for it: vectrg[i] is the vector
# of wordstrg[i] (embedding-file order), which need not match trg_lex order.
write_to_file(path="datasets/en-it/it.5k.300d.fst",matrix=vectrg,lex=wordstrg)

In [None]:
# Display three sizes side by side.
# NOTE(review): src_words and src_matrix are not assigned in any cell visible
# here — confirm they are defined before this cell runs on a fresh kernel.
src_vec.shape,len(src_words),src_matrix.shape

In [None]:
# De-duplicate src_lex; going through set() means the order of tm is arbitrary.
tm=list(set(src_lex))

In [None]:
# Pair the unique lexicon entries with src_words.
# NOTE(review): src_words is not assigned in any cell visible here — confirm.
x=[tm,src_words]

In [None]:
# Replace each collection in x with a set of its elements (rebinds x).
x=[set(a) for a in x]

In [None]:
# Print every entry of tm that is absent from src_words, then how many there
# were; the count stays available as `w`.
# NOTE(review): src_words is not assigned in any cell visible here — confirm.
missing = [entry for entry in tm if entry not in src_words]
for entry in missing:
    print(entry)
w = len(missing)
print(w)

In [None]:
# Scratch arithmetic (evaluates to 1500).
# NOTE(review): what the two operands stand for is not shown here — confirm or remove.
1310+190