In [1]:
import numpy as np
import pandas as pd
import utils

__author__ = "Olivares Castillo José Luis"

In [2]:
def read(file, threshold=0, vocabulary=None, dtype='float'):
    # Copyright (C) 2016-2018  Mikel Artetxe <artetxem@gmail.com>
    # https://github.com/artetxem/vecmap/blob/master/embeddings.py
    """Read word embeddings from an open text file.

    The first line is a header ``"<count> <dim>"``; each following line is
    ``"<word> <v1> ... <v_dim>"`` with the word separated from its vector by a
    single space.

    Arguments:
        file {file object} -- open text stream (anything exposing ``readline``).
        threshold {int} -- maximum number of embedding lines to read; values
                           <= 0 mean "use the count from the header"
                           (default: {0})
        vocabulary {collection} -- if given, only words contained in it are
                                   kept (default: {None})
        dtype {str} -- NumPy dtype of the returned matrix (default: {'float'})

    Return:
        tuple -- (list of words, numpy.ndarray with one row per kept word).
                 If the file holds fewer rows than requested, only the rows
                 actually present are returned.
    """
    header = file.readline().split(' ')
    declared = int(header[0])
    count = declared if threshold <= 0 else min(threshold, declared)
    dim = int(header[1])
    keep_all = vocabulary is None
    # Hash list/tuple vocabularies once so the per-line membership test is not
    # a linear scan; other containers (set, dict, str, ...) are used unchanged.
    if isinstance(vocabulary, (list, tuple)):
        vocabulary = set(vocabulary)
    words = []
    matrix = np.empty((count, dim), dtype=dtype) if keep_all else []
    for i in range(count):
        line = file.readline()
        if not line:
            # EOF before the declared count: stop instead of failing on the
            # unpacking of an empty line.
            break
        word, vec = line.split(' ', 1)
        if keep_all:
            words.append(word)
            matrix[i] = np.fromstring(vec, sep=' ', dtype=dtype)
        elif word in vocabulary:
            words.append(word)
            matrix.append(np.fromstring(vec, sep=' ', dtype=dtype))
    if keep_all:
        if len(words) < count:
            # Drop the preallocated rows that were never filled.
            matrix = matrix[:len(words)]
        return (words, matrix)
    return (words, np.array(matrix, dtype=dtype))


def closest_word_to(top_10, words):
    """Translate ranked ``(index, distance)`` pairs back into words.

    Arguments:
        top_10 {list} -- list of 2-tuples: a word index and its distance.
        words {list} -- list of words addressed by those indices.
    Return:
        list -- the words of ``top_10``, in the same order.
    """
    selected = []
    for entry in top_10:
        # Only the index is needed; the distance is discarded.
        position, _distance = entry
        selected.append(words[position])
    return selected


def get_lexicon(source):
    """Load one of the known bilingual lexicons by its short name.

    Arguments:
        source {str} -- lexicon key, e.g. ``"en-it.train"`` or ``"en-fi.test"``.
    Return:
        tuple (2) -- source-language words and target-language words, as
                     returned by `load_lexicon`.
    Raises:
        ValueError -- if ``source`` is not one of the known keys.
    """
    lexicon_files = {
        "en-it.train": "../dataset/dictionaries/en-it.train.drive.txt",
        "en-it.test": "../dataset/dictionaries/en-it.test.drive.txt",
        "en-de.test": "../dataset/dictionaries/en-de.test.txt",
        "en-de.train": "../dataset/dictionaries/en-de.train.txt",
        "en-es.test": "../dataset/dictionaries/en-es.test.txt",
        "en-es.train": "../dataset/dictionaries/en-es.train.txt",
        "en-fi.test": "../dataset/dictionaries/en-fi.test.txt",
        "en-fi.train": "../dataset/dictionaries/en-fi.train.txt",
    }
    # A table lookup replaces the chain of `source.__eq__(...)` calls: calling
    # the dunder directly returns NotImplemented (which is truthy) for non-str
    # arguments, so e.g. get_lexicon(5) used to load the first lexicon.
    if not isinstance(source, str) or source not in lexicon_files:
        raise ValueError("Archivo no encontrado %s" % (source,))
    src, trg = load_lexicon(lexicon_files[source])
    return (src, trg)
    
def load_lexicon(source):
    """Helper of `get_lexicon`: read a two-column lexicon file.

    Every non-blank line is split on whitespace; the first field is the source
    word and the second the target word. Additional fields are ignored.

    Arguments:
        source {str} -- path of the UTF-8 encoded file to load.
    Return:
        tuple (2) -- list of source words and list of target words.
    """
    src, trg = [], []
    with open(source, "r", encoding='utf-8') as file:
        for line in file:
            fields = line.split()  # split once instead of once per column
            if not fields:
                # Blank line (e.g. a trailing newline at the end of the file).
                continue
            src.append(fields[0])
            trg.append(fields[1])
    return (src, trg)


def get_vectors(lexicon, words, embeddings, dtype='float'):
    """Collect the embedding row of every word in a lexicon.

    Arguments:
        lexicon {list} -- words whose vectors are wanted.
        words {list} -- words labelling the rows of ``embeddings``.
        embeddings {numpy.ndarray} -- 2-D matrix, one row per entry of ``words``.
        dtype {str} -- NumPy dtype of the result (default: {'float'})
    Return:
        numpy.ndarray -- matrix of shape ``(len(lexicon), embeddings.shape[1])``;
                         row ``i`` is the vector of ``lexicon[i]``, or all zeros
                         when that word does not appear in ``words``.
    """
    # Map each word to its first row, matching what `words.index` would return,
    # so each lookup is O(1) instead of a scan over the whole vocabulary.
    first_row = {}
    for row, word in enumerate(words):
        first_row.setdefault(word, row)
    # Zero-initialised so rows of out-of-vocabulary words are deterministic
    # rather than whatever uninitialised memory np.empty would hand back.
    matrix = np.zeros((len(lexicon), embeddings.shape[1]), dtype=dtype)
    for i, entry in enumerate(lexicon):
        row = first_row.get(entry)
        if row is not None:
            matrix[i] = embeddings[row]
    return matrix


def open_file(source):
    """Open one of the known embedding files for reading.

    Arguments:
        source {str} -- file key: a language code (``"en"``, ``"it"``, ``"es"``,
                        ``"de"``, ``"fi"``) or ``"1"`` / ``"2"``.
    Return:
        _io.TextIOWrapper -- open text handle; the caller is responsible for
                             closing it.
    Raises:
        ValueError -- if ``source`` is not one of the known keys.
    """
    # key -> (path, encoding). None keeps open()'s platform-default encoding,
    # which is what the "1" and "2" entries have always used.
    embedding_files = {
        "en": ("../dataset/en.200k.300d.embeddings", "utf-8"),
        "it": ("../dataset/en-it/it.200k.300d.embeddings", "utf-8"),
        "es": ("../dataset/en-es/es.200k.300d.embeddings", "utf-8"),
        "de": ("../dataset/en-de/de.200k.300d.embeddings", "utf-8"),
        "fi": ("../dataset/en-fi/fi.200k.300d.embeddings", "utf-8"),
        "1": ("../dataset/data/EN.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt", None),
        "2": ("../dataset/data/IT.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt", None),
    }
    # A table lookup replaces the chain of `source.__eq__(...)` calls: calling
    # the dunder directly returns NotImplemented (which is truthy) for non-str
    # arguments, so e.g. open_file(5) used to try to open the English file.
    if not isinstance(source, str) or source not in embedding_files:
        raise ValueError("Archivo no encontrado %s" % (source,))
    path, encoding = embedding_files[source]
    return open(path, encoding=encoding, errors="surrogateescape")

In [46]:
# Load the "en-fi.train" lexicon as two parallel word lists (source, target).
src,trg=get_lexicon("en-fi.train")
# Display the length of both lists.
len(src),len(trg)

(5000, 5000)

In [47]:
# Peek at the first source/target entry of the lexicon.
src[0],trg[0]

('bodies', 'elinten')

In [48]:
# Read the English embeddings; the context manager closes the handle as soon
# as `read` is done with it instead of leaving it open for the kernel's lifetime.
with open("../dataset/normalize/unit-center/en.emb.txt",errors="surrogateescape") as source_vec:
    words_en,en_vec=read(source_vec)
# Select the embedding row of every source-side lexicon word.
src_vec=get_vectors(src,words_en,en_vec)
src_vec.shape

(5000, 300)

In [49]:
# Read the Finnish embeddings. The file handle gets its own name so that
# `target_vec` only ever refers to the embedding matrix, and the context
# manager closes the handle once `read` returns.
# NOTE(review): the saved output of this cell is a FileNotFoundError for this
# path -- confirm the file exists before relying on `trg_vec`.
with open("../dataset/normalize/unit-center/fi.emb.txt",errors="surrogateescape") as target_file:
    words_trg,target_vec=read(target_file)
# Select the embedding row of every target-side lexicon word.
trg_vec=get_vectors(trg,words_trg,target_vec)
trg_vec.shape

FileNotFoundError: [Errno 2] No such file or directory: '../dataset/normalize/unit-center/fi.emb.txt'

In [None]:
# First component of the first source word's vector, looked up directly in the
# full embedding matrix.
en_vec[words_en.index(src[0])][0]

In [42]:
# Show the first source/target lexicon entry again.
# NOTE(review): this cell's execution count (42) is lower than that of the
# cells above it (46-49), so its saved output may be stale -- re-run top to bottom.
src[0],trg[0]

('arts', 'espectáculo')

In [43]:
# First component of the first row of the source and target lexicon matrices.
src_vec[0][0],trg_vec[0][0]

(0.0102355, 0.00434003)

In [44]:
# Write the source-side lexicon embeddings, one "<word> <v1> ... <vn>" row per
# line. UTF-8 is requested explicitly (the lexicon words were read as UTF-8)
# so the output does not depend on the platform's default encoding.
with open("../dataset/normalize/unit-center/en-fi/en-train.5k.300d.embeddings.unit-center","w",encoding="utf-8") as file:
    for i in range(src_vec.shape[0]):
        # Newlines go between rows only, so the file has no trailing newline.
        if i > 0:
            file.write("\n")
        file.write(src[i]+" "+" ".join(map(str,src_vec[i])))

In [45]:
# Write the target-side lexicon embeddings, one "<word> <v1> ... <vn>" row per
# line. UTF-8 is requested explicitly (the lexicon words were read as UTF-8)
# so non-ASCII words do not depend on the platform's default encoding.
with open("../dataset/normalize/unit-center/en-fi/fi-train.5k.300d.embeddings.unit-center","w",encoding="utf-8") as file:
    for i in range(trg_vec.shape[0]):
        # Newlines go between rows only, so the file has no trailing newline.
        if i > 0:
            file.write("\n")
        file.write(trg[i]+" "+" ".join(map(str,trg_vec[i])))

In [None]:
# Position of the word "per" in `words_it`.
# NOTE(review): `words_it` is not assigned in any visible cell of this
# notebook; on a fresh kernel this presumably raises NameError -- confirm or remove.
words_it.index("per")