In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import utils

#from scipy.spatial.distance import cdist
__author__ = "Olivares Castillo José Luis"

tf.__version__

  from ._conv import register_converters as _register_converters


'1.6.0'

In [2]:
def read(file, threshold=0, vocabulary=None, dtype='float'):
    # Copyright (C) 2016-2018  Mikel Artetxe <artetxem@gmail.com>
    # https://github.com/artetxem/vecmap/blob/master/embeddings.py
    """Read word embeddings from an already opened text stream.

    The first line must be a header of the form ``"<count> <dim>"``. Every
    following line holds a word, one space, and the space-separated
    components of its vector.

    Arguments:
        file {file-like} -- open text stream positioned at the header line.
        threshold {int} -- maximum number of entries to read; a value <= 0
                           reads as many entries as the header announces
                           (default: {0})
        vocabulary {container} -- when given, only the words contained in it
                                  are kept (default: {None})
        dtype {str} -- NumPy dtype of the returned matrix (default: {'float'})

    Return:
        tuple -- (list of words, numpy.ndarray with one embedding per row)
    """
    header_fields = file.readline().split(' ')
    announced = int(header_fields[0])
    count = min(threshold, announced) if threshold > 0 else announced
    dim = int(header_fields[1])

    kept_words = []

    if vocabulary is None:
        # Every entry is kept, so the matrix can be allocated up front and
        # filled row by row.
        embeddings = np.empty((count, dim), dtype=dtype)
        for row in range(count):
            token, raw_vector = file.readline().split(' ', 1)
            kept_words.append(token)
            embeddings[row] = np.fromstring(raw_vector, sep=' ', dtype=dtype)
        return (kept_words, embeddings)

    # Filtering by vocabulary: the final row count is unknown, so rows are
    # collected in a list and converted once at the end.
    kept_rows = []
    for _ in range(count):
        token, raw_vector = file.readline().split(' ', 1)
        if token in vocabulary:
            kept_words.append(token)
            kept_rows.append(np.fromstring(raw_vector, sep=' ', dtype=dtype))
    return (kept_words, np.array(kept_rows, dtype=dtype))


def closest_word_to(top_10, words):
    """Translate the indices stored in ``top_10`` into their words.

    Arguments:
        top_10 {list} -- list of 2-tuples ``(index, distance)``; only the
                         index is used.
        words {list} -- list of words addressed by those indices.
    Return:
        list -- the words referenced by ``top_10``, in the same order.
    """
    selected = []
    for entry in top_10:
        position, _distance = entry
        selected.append(words[position])
    return selected


def get_lexicon(source):
    """Load one of the known bilingual lexicons by name.

    Arguments:
        source {str} -- lexicon name, e.g. ``"en-it.test"``; the file read is
                        ``../dataset/dictionaries/<source>.txt``.
    Return:
        tuple (2) -- (source-language words, target-language words) as
                     returned by `load_lexicon`.
    Raises:
        ValueError -- if `source` is not one of the known lexicon names.
    """
    known_lexicons = (
        "en-it.train", "en-it.test",
        "en-de.test", "en-de.train",
        "en-es.test", "en-es.train",
        "en-fi.test", "en-fi.train",
    )
    # Use a real equality/membership test. Calling `source.__eq__(...)`
    # directly returns NotImplemented (which is truthy) for non-string
    # arguments, so e.g. `get_lexicon(None)` used to fall into the first
    # branch and load "en-it.train" instead of being rejected.
    if not isinstance(source, str) or source not in known_lexicons:
        raise ValueError("Archivo no encontrado %s" % (source,))
    src, trg = load_lexicon("../dataset/dictionaries/" + source + ".txt")
    return (src, trg)
    
def load_lexicon(source):
    """Read a two-column, whitespace-separated lexicon file.

    Helper of `get_lexicon`. The first field of every line is collected as
    the source word and the second as the target word; any further fields
    are ignored.

    Arguments:
        source {str} -- path of the file to read (decoded as UTF-8).
    Return:
        tuple (2) -- (list of source words, list of target words), aligned
                     by position.
    Raises:
        IndexError -- if a non-blank line has fewer than two fields.
    """
    src, trg = [], []
    with open(source, "r", encoding='utf-8') as file:
        for line in file:
            fields = line.split()  # split once instead of once per column
            if not fields:
                # Blank or whitespace-only line (e.g. an empty line at the
                # end of the file): skip it instead of crashing.
                continue
            src.append(fields[0])
            trg.append(fields[1])
    return (src, trg)


def get_vectors(lexicon, words, embeddings, dtype='float'):
    """Collect the embedding of every lexicon word into one matrix.

    Row ``i`` of the result is the embedding of ``lexicon[i]``. Words that do
    not appear in ``words`` get an all-zero row (previously those rows were
    left as uninitialised memory from ``np.empty``).

    Arguments:
        lexicon {list} -- words whose vectors are requested.
        words {list} -- words aligned with the rows of `embeddings`.
        embeddings {numpy.ndarray} -- 2-D matrix with one embedding per row.
        dtype {str} -- NumPy dtype of the returned matrix (default: {'float'})
    Return:
        numpy.ndarray -- matrix of shape (len(lexicon), embeddings.shape[1]).
    """
    # Build the word -> row lookup once. `setdefault` keeps the first
    # occurrence of a repeated word, matching what `list.index` returned,
    # while avoiding a linear scan of `words` for every lexicon entry.
    row_of = {}
    for position, word in enumerate(words):
        row_of.setdefault(word, position)

    matrix = np.zeros((len(lexicon), embeddings.shape[1]), dtype=dtype)
    for row, word in enumerate(lexicon):
        position = row_of.get(word)
        if position is not None:
            matrix[row] = embeddings[position]
    return matrix


def open_file(source, normalize):
    """Open the embeddings file of a language for reading.

    Arguments:
        source {str} -- language code: "en", "it", "es", "de" or "fi".
        normalize {bool} -- when truthy, the file is taken from the
                            "normalize/unit-center/" subfolder of the dataset.
    Return:
        _io.TextIOWrapper -- open text stream (UTF-8, "surrogateescape"
                             error handler). The caller is responsible for
                             closing it.
    Raises:
        ValueError -- if `source` is not one of the known language codes.
    """
    embedding_files = {
        "en": "en.200k.300d.embeddings",
        "it": "en-it/it.200k.300d.embeddings",
        "es": "en-es/es.200k.300d.embeddings",
        "de": "en-de/de.200k.300d.embeddings",
        "fi": "en-fi/fi.200k.300d.embeddings",
    }
    # A dictionary lookup replaces the `source.__eq__(...)` chain, whose
    # NotImplemented result (truthy) made non-string arguments open the
    # English file instead of being rejected.
    relative_path = embedding_files.get(source) if isinstance(source, str) else None
    if relative_path is None:
        raise ValueError("Archivo no encontrado %s" % (source,))

    norm_path = "normalize/unit-center/" if normalize else ""
    # Every language uses the same error handler; the "fi" branch used to
    # pass the misspelled name "surr|ogateescape".
    return open("../dataset/" + norm_path + relative_path,
                encoding="utf-8", errors="surrogateescape")

In [3]:
# Switch between the raw embeddings/model (False) and the variants stored
# under "normalize/unit-center/" (embeddings) and "<model>/norm" (checkpoint).
normalize=False

In [4]:
# Evaluation settings per target language code: the test lexicon to load
# ("idioma"), the target embeddings to open ("objetivo") and the model
# directory name ("model"). Every entry follows the same "en-<code>" pattern.
LEER = {
    codigo: {
        "idioma": "en-" + codigo + ".test",
        "objetivo": codigo,
        "model": "en-" + codigo,
    }
    for codigo in ("de", "fi", "it", "es")
}

In [5]:
# Select the settings entry used by the rest of the notebook; change the key
# to one of the other entries of LEER to evaluate another language.
lex = LEER["it"]

In [6]:
# Load the selected test lexicon as two position-aligned word lists
# (first column -> source words, second column -> target words).
words_scr_lexicon, words_trg_lexicon = get_lexicon(lex["idioma"])

In [7]:
# Number of entries on each side of the lexicon (one pair per lexicon line).
print(len(words_scr_lexicon), len(words_trg_lexicon))

1869 1869


In [None]:
# Load the English (source) embeddings. The `with` block closes the file as
# soon as `read` has consumed it; previously the handle was never closed and
# the name `source_vec` was reused for both the handle and the matrix.
with open_file('en', normalize=normalize) as source_file:
    words_src, source_vec = read(source_file)

# Unique source words of the test lexicon. Sorting makes the row order of
# `src_vec` reproducible across kernels (plain set iteration order is not).
eval_src = sorted(set(words_scr_lexicon))
src_vec = get_vectors(eval_src, words_src, source_vec)
print(src_vec.shape)

In [None]:
# Load the full target-language embeddings; nearest neighbours are searched
# over all of them. The `with` block closes the file once it has been read
# (previously the handle was never closed).
with open_file(lex["objetivo"], normalize=normalize) as target_file:
    words_trg, target_vec = read(target_file)
print(target_vec.shape)

In [None]:
# Source-word embeddings that are fed to the restored model below.
test_vectors = src_vec

In [None]:
# Models trained on normalized embeddings live in an extra "/norm" folder.
norm_model = "/norm" if normalize else ""
# Show the path of the meta graph that is going to be restored.
"".join(['./models/', lex["model"], norm_model, '/model1111_gpu/model2250.ckpt.meta'])

In [None]:
# Start from an empty default graph so that re-running this cell does not
# import the model on top of a previously imported copy.
tf.reset_default_graph()
print(lex["model"])
sess = tf.Session()

# Folder that holds both the meta graph and the checkpoint files.
checkpoint_dir = './models/' + lex["model"] + norm_model + '/model1111_gpu/'
saver = tf.train.import_meta_graph(checkpoint_dir + 'model2250.ckpt.meta')
saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir))

In [None]:
# Look up, by name, the tensors of the restored graph that must be fed.
graph = tf.get_default_graph()
# Model input; it is fed with the source-word embeddings (`test_vectors`).
X = graph.get_tensor_by_name("input/input_es:0")
# Fed with 1 at inference time; presumably the dropout keep probability
# (TODO confirm against the training script).
kprob = graph.get_tensor_by_name("dropout_prob:0")

In [None]:
# Debugging aid: uncomment to list the name of every node in the restored graph.
#([n.name for n in graph.as_graph_def().node])

In [None]:
# Tensor fetched as the network prediction. The commented-out lines name
# other tensors of the graph that were presumably tried as alternatives.
output_NN = graph.get_tensor_by_name("xw_plus_b_1:0")
#output_NN = graph.get_tensor_by_name("nah_predicted:0")
#code = graph.get_tensor_by_name("xw_plus_b_2:0")
print(output_NN)

# Feed all source vectors in one batch; `kprob` is fixed to 1 for inference
# (presumably "keep every unit", i.e. dropout disabled -- confirm).
feed_dict = {X: test_vectors, kprob: 1}
# `pred` holds one predicted vector per row of `test_vectors`.
pred = sess.run(output_NN, feed_dict)
print(pred.shape)

In [None]:
%%time
top_10 = [utils.get_top10_closest(pred[_], target_vec) for _ in range(pred.shape[0])]

In [None]:
%%time
closest = [closest_word_to(top_10[_], words_trg) for _ in range(pred.shape[0])]

In [None]:
# Map every evaluated source word to its list of candidate translations.
resultados = dict(zip(eval_src, closest))

In [None]:
# Gold standard built from the aligned lexicon lists. Presumably a mapping
# from each source word to all of its reference translations (the evaluation
# cell indexes it by source word and iterates the result) -- confirm against
# utils.gold_dict.
gold = utils.gold_dict(words_scr_lexicon, words_trg_lexicon)

In [None]:
%%time
p1, p5, p10 = 0, 0, 0
list_en_eval = list(resultados.keys())
hits, not_found = [], []

for palabra_gold in list_en_eval:
    for i in gold[palabra_gold]:
        if i in resultados[palabra_gold]:
            hits.append(resultados[palabra_gold].index(i))
    if hits.__len__() > 0:
        if min(hits) == 0:
            p1 += 1
            p5 += 1
            p10 += 1
        if min(hits) >= 1 and min(hits) <= 5:
            p5 += 1
            p10 += 1
        if min(hits) > 5 and min(hits) < 10:
            p10 += 1
    else:
        not_found.append(palabra_gold)
    hits.clear()

length = list_en_eval.__len__()
print("not found:", not_found.__len__(), "-", not_found.__len__() / length, "%")
print("P@1:", p1, "\tP@5:", p5, "\tP@10:", p10)
print("P@1:", p1 / length, "\tP@5:", p5 /length, "\tP@10:", p10 / length)