In [2]:
import pandas as pd
import numpy as np
import utils

In [3]:
def read(file, threshold=0, vocabulary=None, dtype='float'):
    # Copyright (C) 2016-2018  Mikel Artetxe <artetxem@gmail.com>
    # https://github.com/artetxem/vecmap/blob/master/embeddings.py
    """Función para leer un archivo con los word embeddings.
    Arguments:
        file {str} -- archivo a leer.
        threshold {int} -- Número a embeddings a leer
                           (default:{El indicado en la cabecera del archivo})
        vocabulary {str} -- Para solo acceder a vectores según el lexicon definido
                            (default:{None})
    
    Return:
        tuple -- lista de palabras y su correspondiente matriz de embeddings
    """
    header = file.readline().split(' ')
    count = int(header[0]) if threshold <= 0 else min(threshold, int(header[0]))
    dim = int(header[1])
    words = []
    matrix = np.empty((count, dim), dtype=dtype) if vocabulary is None else []
    for i in range(count):
        word, vec = file.readline().split(' ', 1)
        if vocabulary is None:
            words.append(word)
            matrix[i] = np.fromstring(vec, sep=' ', dtype=dtype)
        elif word in vocabulary:
            words.append(word)
            matrix.append(np.fromstring(vec, sep=' ', dtype=dtype))
    return (words, matrix) if vocabulary is None else (words, np.array(matrix, dtype=dtype))


def closest_word_to(top_10, words):
    """Función para retornar las palabras de top_10 mediante su índice
    Arguments:
        top_10 {list} -- lista de tupla con índice una palabra y su distancia.
        words {list} -- lista de palabras
    Return:
        list -- lista con palabras del top_10
    """
    return [words[index] for index, _ in top_10]


def get_lexicon(source):
    """Función para cargar lexicones
    Arguments:
        source {str} -- Nombre de archivo a cargar
    Return:
        list (2) -- lista con palabras del lexicon indicado.
    """
    if source.__eq__("en-it.train"):
        src,trg = load_lexicon("../dataset/dictionaries/en-it.train.txt")
        return (src,trg)
    elif source.__eq__("en-it.test"):
        src,trg = load_lexicon("../dataset/dictionaries/en-it.test.txt")
        return (src,trg)
    elif source.__eq__("en-de.test"):
        src,trg = load_lexicon("../dataset/dictionaries/en-de.test.txt")
        return (src,trg)
    elif source.__eq__("en-de.train"):
        src,trg = load_lexicon("../dataset/dictionaries/en-de.train.txt")
        return (src,trg)
    elif source.__eq__("en-es.test"):
        src,trg = load_lexicon("../dataset/dictionaries/en-es.test.txt")
        return (src,trg)
    elif source.__eq__("en-es.train"):
        src,trg = load_lexicon("../dataset/dictionaries/en-es.train.txt")
        return (src,trg)
    elif source.__eq__("en-fi.test"):
        src,trg = load_lexicon("../dataset/dictionaries/en-fi.test.txt")
        return (src,trg)
    elif source.__eq__("en-fi.train"):
        src,trg = load_lexicon("../dataset/dictionaries/en-fi.train.txt")
        return (src,trg)
    else:
        print("ERR: dataset inválido")
        return None,None
    
def load_lexicon(source):
    """Función auxiliar de `get_lexicon` para cargar lexicones
    Arguments:
        source {str} -- Nombre de archivo a cargar
    Return:
        list (2) -- lista con palabras del lexicon indicado.
    """
    src, trg = list(), list()
    with open(source, "r", encoding='utf-8') as file:
        for line in file:
            src.append(line.split()[0])
            trg.append(line.split()[1])
    return (src, trg)


def get_vectors(lexicon, words, embeddings, dtype='float'):
    """Función para cargar vectores del lexicon indicado.
    Arguments:
        lexicon {list} -- lista de palabras del lexicon
        words {list} -- lista con palabras de los vectores.
        embeddings {numpy.ndarray} -- matriz con embeddings
    Return:
        numpy.ndarray -- Matriz con embeddings del lexicon
    """
    matrix = np.empty((len(lexicon), embeddings.shape[1]), dtype=dtype)
    for i in range(len(lexicon)):
        if lexicon[i] in words:
            matrix[i] = embeddings[words.index(lexicon[i])]
    return np.asarray(matrix, dtype=dtype)


def open_file(source):
    """Función para leer archivos
    Arguments:
        source {str} -- Archivo a leer
    Return:
        _io.TextIOWrapper -- Apuntador a fichero a leer
    """
    if source.__eq__("en"):
        return open("../dataset/en.200k.300d.embeddings",encoding="utf-8", errors="surrogateescape")
    elif source.__eq__("it"):
        return open("../dataset/en-it/it.200k.300d.embeddings",encoding="utf-8", errors="surrogateescape")
    elif source.__eq__("es"):
        return open("../dataset/en-es/es.200k.300d.embeddings",encoding="utf-8", errors="surrogateescape")
    elif source.__eq__("de"):
        return open("../dataset/en-de/de.200k.300d.embeddings",encoding="utf-8", errors="surrogateescape")
    elif source.__eq__("fi"):
        return open("../dataset/en-fi/fi.200k.300d.embeddings",encoding="utf-8", errors="surr|ogateescape")
    else:
        print("ERROR: dataset inválido")
        return None

In [4]:
def revisar_palabras_duplicadas(source):
    print("en-"+source)
    w1test, w2test = get_lexicon("en-"+source+".test")
    w1train, w2train = get_lexicon("en-"+source+".train")
    i=0
    x=[]
    print("w1...")
    for word in w1test:
        if word in w1train:
            i+=1
            print(w2test.index(word),word,w2train[w2train.index(word)])
    print(i,end="\t")
    x = [w1test,w1train]
    x = [set(a) for a in x]
    f=set.intersection(*x)
    print(f.__len__())
    ##
    i=0
    x.clear()
    print("w2...")
    for word in w2test:
        if word in w2train:
            i+=1
            print(w2test.index(word),word,w2train[w2train.index(word)])
    print(i,end="\t")
    x = [w2test,w2train]
    x = [set(a) for a in x]
    f=set.intersection(*x)
    print(f.__len__())
    print("##################")

In [6]:
revisar_palabras_duplicadas("it")
revisar_palabras_duplicadas("de")
revisar_palabras_duplicadas("fi")
revisar_palabras_duplicadas("es")

en-it
w1...
0	0
w2...
2 cucinare cucinare
4 musicali musicali
8 passeggeri passeggeri
14 disposti disposti
16 ultimi ultimi
30 problema problema
40 sono sono
42 prodotto prodotto
43 inchiesta inchiesta
44 indagine indagine
49 grande grande
50 nominati nominati
65 profitti profitti
66 dichiarato dichiarato
68 sufficiente sufficiente
88 norme norme
89 standard standard
92 golfo golfo
98 sole sole
104 paura paura
122 data data
134 comunità comunità
141 santa santa
142 attori attori
144 uccelli uccelli
161 spazio spazio
168 trasformazione trasformazione
170 mantenere mantenere
171 foto foto
173 rubrica rubrica
174 mai mai
178 stessa stessa
185 acquistato acquistato
186 facoltà facoltà
200 occidentale occidentale
202 capo capo
208 attività attività
211 supplementari supplementari
214 lord lord
223 consentire consentire
232 bancari bancari
243 deve deve
251 obiettivo obiettivo
252 scopo scopo
254 aiuto aiuto
272 esterne esterne
279 presidente presidente
290 fondi fondi
319 risposta risposta


In [8]:
def get_vectors(lexicon, words, embeddings, dtype='float'):
    matrix = np.empty((len(lexicon), embeddings.shape[1]), dtype=dtype)
    tmp=[]
    for i in range(len(lexicon)):
        if lexicon[i] in words:
            matrix[i] = embeddings[words.index(lexicon[i])]
            tmp.append(words.index(lexicon[i]))
    return np.asarray(matrix, dtype=dtype),tmp

In [70]:
w1test, w2test = get_lexicon("en-it.test")
w1train, w2train = get_lexicon("en-it.train")
source_vec = open_file('fi')
words_src, src_vec = read(source_vec)

In [71]:
unique,index=np.unique(src_vec,axis=0,return_counts=True)

In [72]:
len(words_src),len(set(words_src)),unique.shape,index.shape

(200000, 200000, (200000, 300), (200000,))

In [27]:
eval_src = list(set(w2test))
src_vec,tmp = get_vectors(eval_src, words_src, source_vec)
print(src_vec.shape)

eval_src = list(set(w2train))
src_vec1,tmp1 = get_vectors(eval_src, words_src, source_vec)

print(src_vec1.shape)
x = [tmp,tmp1]
x = [set(a) for a in x]
f=set.intersection(*x)
print("intersection")
print(f.__len__())
if f:
    print('lista')
    f=list(f)
    print(words_src[f[0]])

(1849, 300)
(4549, 300)
intersection
35
lista
critica


In [22]:
words_src

['</s>',
 ',',
 '.',
 'die',
 'der',
 'und',
 'in',
 'den',
 '"',
 'zu',
 'das',
 'von',
 'ist',
 'mit',
 'des',
 'nicht',
 'für',
 'auf',
 'im',
 'sich',
 'ein',
 'eine',
 'es',
 'sie',
 'auch',
 'dem',
 'als',
 '(',
 ')',
 'werden',
 'an',
 'ich',
 '-',
 'er',
 'sind',
 'bei',
 'aus',
 'nach',
 'wie',
 'wird',
 'oder',
 'dass',
 'so',
 'einer',
 'hat',
 'aber',
 'nur',
 'um',
 'wir',
 'einen',
 'durch',
 'über',
 'noch',
 'man',
 'zum',
 ':',
 'zur',
 'einem',
 'wenn',
 'am',
 'war',
 'haben',
 'kann',
 'vor',
 '?',
 'diese',
 'sein',
 'mehr',
 'was',
 'wurde',
 'können',
 'daß',
 'bis',
 'dieser',
 '!',
 'dann',
 'vom',
 'ihre',
 'unter',
 'schon',
 '/',
 'immer',
 'hier',
 'eines',
 'wieder',
 'uns',
 'ihr',
 'alle',
 'doch',
 ';',
 'seine',
 'sehr',
 'da',
 'gegen',
 'keine',
 'gibt',
 'menschen',
 'habe',
 'hatte',
 'anderen',
 'zeit',
 'denn',
 'diesem',
 'sondern',
 'selbst',
 'mir',
 'zwischen',
 'dies',
 'ihrer',
 'waren',
 'seiner',
 'wurden',
 'ohne',
 'damit',
 'sei',
 'ja

In [20]:
tmp.__len__(),tmp1.__len__()

(1500, 3442)

In [None]:
import tensorflow as tf
import utils
import pandas as pd
import numpy as np
from sys import argv
from collections import defaultdict


__author__ = "Olivares Castillo José Luis"

In [None]:
es, na = utils.load_embeddings("n2v")

In [None]:
na.head()

In [None]:
na_dummy = na.drop(na.columns[0], axis=1)
type(na_dummy)

In [None]:
na_dummy.head()

In [None]:
na_vectores1 = na_dummy.values.astype(np.float64)
na_vectores1

In [None]:

eval_set = utils.get_lexicon("eval")
eval_es = list(set(eval_set["esp"]))
eval_es_index = [int(es[es[0] == palabra].index[0])
                 for palabra in eval_es]

In [None]:
eval_es_vectores = utils.get_vectors(es, eval_es_index)
test_vectors = np.array([np.array(es.iloc[indice][1::]).astype(np.float64) for indice in eval_es_index])

In [None]:
eval_es_vectores = utils.get_vectors(es, eval_es_index)
test_vectors = np.array([np.array(es.iloc[indice][1::]).astype(np.float64) for indice in eval_es_index])

In [None]:
sess = tf.Session()
#saver = tf.train.import_meta_graph('./models/model1111_gpu/model2250.ckpt.meta')
#saver.restore(sess, tf.train.latest_checkpoint('./models/model1111_gpu/'))
saver = tf.train.import_meta_graph('./models/model_joyce/modeljoyce.ckpt.meta')
saver.restore(sess, tf.train.latest_checkpoint('./models/model_joyce/'))


In [None]:
graph = tf.get_default_graph()
X = graph.get_tensor_by_name("input/input_es:0")
#y = graph.get_tensor_by_name("input/target_na:0")
kprob = graph.get_tensor_by_name("dropout_prob:0")

In [None]:
output_NN = graph.get_tensor_by_name("xw_plus_b_1:0")
#output_NN = graph.get_tensor_by_name("dense_2/BiasAdd:0")
#output_NN = graph.get_tensor_by_name("output_1:0")

In [None]:
feed_dict = {X: test_vectors, kprob: 1}
pred = sess.run(output_NN, feed_dict)

In [None]:
top_10 = [utils.get_top10_closest(pred[_], na_vectores1)
          for _ in range(pred.shape[0])]

In [None]:
closest = [utils.get_closest_words_to(top_10[_], na)
           for _ in range(pred.shape[0])]

In [None]:
resultados = {palabra_es: top_10_nah for (palabra_es, top_10_nah) in zip(eval_es, closest)}

In [None]:
esp = list(eval_set["esp"].values)
nah = list(eval_set["nah"].values)
pares_eval = list(zip(esp, nah))
gold = defaultdict(list)
for palabra_es, palabra_na in pares_eval:
    gold[palabra_es].append(palabra_na)
gold = dict(gold)

In [None]:
p1 = 0
p5 = 0
p10 = 0
list_esp_eval = (list(resultados.keys()))
hits = list()
not_found = list()
# Se buscan las traducciones gold standard dentro de las predicciones y se obtiene 
# P@K, sino se encuentran, se añade a una lista de no encontrados.
for palabra_gold in list_esp_eval:
    for i in gold[palabra_gold]:
        if i in resultados[palabra_gold]:
            hits.append(resultados[palabra_gold].index(i))
    if hits.__len__() > 0:
        if min(hits) == 0:
            p1 += 1
            p5 += 1
            p10 += 1
        if min(hits) >= 1 and min(hits) <= 5:
            p5 += 1
            p10 += 1
        if min(hits) > 5 and min(hits) <= 10:
            p10 += 1
    else:
        not_found.append(palabra_gold)
    hits.clear()

In [None]:
length = list_esp_eval.__len__()
print("not found:", not_found.__len__(), "-", not_found.__len__() / length, "%")
print("P@1:", p1,"\tP@5:", p5 , "\tP@10:", p10)
print("P@1:", p1 / length,"\tP@5:", p5 / length, "\tP@10:", p10 / length)