In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import utils
from collections import defaultdict
from scipy.spatial.distance import cdist
__author__ = "Olivares Castillo José Luis"

  from ._conv import register_converters as _register_converters


In [2]:
def read(file, threshold=0, vocabulary=None, dtype='float'):
    # Copyright (C) 2016-2018  Mikel Artetxe <artetxem@gmail.com>
    # https://github.com/artetxem/vecmap/blob/master/embeddings.py
    header = file.readline().split(' ')
    count = int(header[0]) if threshold <= 0 else min(
        threshold, int(header[0]))
    dim = int(header[1])
    words = []
    matrix = np.empty((count, dim), dtype=dtype) if vocabulary is None else []
    for i in range(count):
        word, vec = file.readline().split(' ', 1)
        if vocabulary is None:
            words.append(word)
            matrix[i] = np.fromstring(vec, sep=' ', dtype=dtype)
        elif word in vocabulary:
            words.append(word)
            matrix.append(np.fromstring(vec, sep=' ', dtype=dtype))
    return (words, matrix) if vocabulary is None else (words, np.array(matrix, dtype=dtype))


def closest_word_to(top_10, words):
    return [words[index] for index, _ in top_10]


def read_lexicon(source):
    src, trg = list(), list()
    if source.__eq__("train"):
        with open("../dataset/OPUS_en_it_europarl_train_5K.txt", "r", encoding='utf-8') as file:
            for line in file:
                src.append(line.split()[0])
                trg.append(line.split()[1])
        return (src, trg)
    elif source.__eq__("test"):
        with open("../dataset/OPUS_en_it_europarl_test.txt", "r", encoding='utf-8') as file:
            for line in file:
                src.append(line.split()[0])
                trg.append(line.split()[1])
        return (src, trg)


def get_vectors(lexicon, words, embeddings, dtype='float'):
    matrix = np.empty((len(lexicon), embeddings.shape[1]), dtype=dtype)
    for i in range(len(lexicon)):
        if lexicon[i] in words:
            matrix[i] = embeddings[words.index(lexicon[i])]
    return np.asarray(matrix, dtype=dtype)

def get_distance(matrix_A,matrix_B,metric='cosine'):
    distances=1-cdist(matrix_A,matrix_B,metric=metric)
    tmp_distances,tmp_vect = [], []
    for i in range(distances.shape[0]):
        tmp_vect = list(enumerate(distances[i]))
        tmp_vect = sorted(tmp_vect, key=lambda dist: dist[1],reverse=True)
        tmp_distances.append(tmp_vect[:10])
        tmp_vect.clear()
    del tmp_vect
    del distances
    return tmp_distances

In [3]:
en, it = read_lexicon("test")
print(len(en), len(it))

1869 1869


In [4]:
source_vec = open("../dataset/EN.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt",
                  encoding="utf-8", errors="surrogateescape")
words_en, en_vec = read(source_vec)
eval_en = list(set(en))
src_vec = get_vectors(eval_en, words_en, en_vec)
print(src_vec.shape)

(1500, 300)


In [5]:
target_vec = open("../dataset/IT.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt",
                  encoding="utf-8", errors="surrogateescape")
words_it, it_vec = read(target_vec)
#eval_it = list(set(it))
#trg_vec = get_vectors(eval_it, words_it, it_vec)
#print(trg_vec.shape)

In [6]:
test_vectors = src_vec

In [66]:
tf.reset_default_graph()
sess = tf.Session()
saver = tf.train.import_meta_graph('./models/en-it/model1111_gpu/model2250.ckpt.meta')
saver.restore(sess, tf.train.latest_checkpoint('./models/en-it/model1111_gpu/'))

INFO:tensorflow:Restoring parameters from ./models/en-it/model1111_gpu/model2250.ckpt


In [67]:
graph = tf.get_default_graph()
X = graph.get_tensor_by_name("input/input_es:0")
kprob = graph.get_tensor_by_name("dropout_prob:0")

In [68]:
#([n.name for n in graph.as_graph_def().node])

In [79]:
output_NN = graph.get_tensor_by_name("xw_plus_b_1:0")
output_NN = graph.get_tensor_by_name("nah_predicted:0")
#code = graph.get_tensor_by_name("xw_plus_b_2:0")
print(output_NN)

feed_dict = {X: test_vectors, kprob: 1}
pred = sess.run(output_NN, feed_dict)
print(pred.shape)

Tensor("nah_predicted:0", shape=(?, 300), dtype=float64)
(1500, 300)


In [None]:
%%time
top_10 = [utils.get_top10_closest(pred[_], it_vec) for _ in range(pred.shape[0])]
#top_10 = get_distance(pred,it_vec)

In [71]:
top_10[0]

[(184414, 0.6064151646113275),
 (111215, 0.5998494307886582),
 (131190, 0.5987590655385501),
 (155511, 0.5936802291217208),
 (189738, 0.5890346989521987),
 (130154, 0.587551836488596),
 (110398, 0.5873961566001653),
 (131011, 0.5817305121265384),
 (137713, 0.5803647044725598),
 (120262, 0.5789179687972329)]

In [72]:
closest = [closest_word_to(top_10[_], words_it) for _ in range(pred.shape[0])]

In [73]:
eval_en[0]

'diktat'

In [74]:
closest[0]

['partitismo',
 'unanimismo',
 'contropoteri',
 'centraliste',
 'tecnocrazie',
 'politicismo',
 'delegittima',
 'burocratismo',
 'verticismo',
 'partitocratico']

In [75]:
resultados = {palabra_en: top_10_it
              for (palabra_en, top_10_it) in zip(eval_en, closest)}

In [76]:
pares_eval = list(zip(en, it))
gold = defaultdict(list)

for palabra_en, palabra_it in pares_eval:
    gold[palabra_en].append(palabra_it)
gold = dict(gold)

In [77]:
gold["diktat"]

['diktat']

In [78]:
%%time
p1, p5, p10 = 0, 0, 0
list_en_eval = list(resultados.keys())
hits, not_found = [], []

for palabra_gold in list_en_eval:
    for i in gold[palabra_gold]:
        if i in resultados[palabra_gold]:
            hits.append(resultados[palabra_gold].index(i))
    if hits.__len__() > 0:
        if min(hits) == 0:
            p1 += 1
            p5 += 1
            p10 += 1
        if min(hits) >= 1 and min(hits) <= 5:
            p5 += 1
            p10 += 1
        if min(hits) > 5 and min(hits) < 10:
            p10 += 1
    else:
        not_found.append(palabra_gold)
    hits.clear()

length = list_en_eval.__len__()
print("not found:", not_found.__len__(),
      "-", not_found.__len__() / length, "%")
print("P@1:", p1, "\tP@5:", p5, "\tP@10:", p10)
print("P@1:", p1 / length, "\tP@5:", p5 /
      length, "\tP@10:", p10 / length)


not found: 626 - 0.41733333333333333 %
P@1: 573 	P@5: 817 	P@10: 874
P@1: 0.382 	P@5: 0.5446666666666666 	P@10: 0.5826666666666667
CPU times: user 5.85 ms, sys: 3.32 ms, total: 9.17 ms
Wall time: 16.4 ms


In [65]:
not found: 704 - 0.4693333333333333 %
P@1: 510 	P@5: 737 	P@10: 796
P@1: 0.34 	P@5: 0.49133333333333334 	P@10: 0.5306666666666666

SyntaxError: invalid syntax (<ipython-input-65-20030d5a977a>, line 1)

In [None]:
not found: 689 - 0.4593333333333333 %
P@1: 515 	P@5: 753 	P@10: 811
P@1: 0.3433333333333333 	P@5: 0.502 	P@10: 0.5406666666666666

In [None]:
not found: 702 - 0.468 %
P@1: 503 	P@5: 735 	P@10: 798
P@1: 0.3353333333333333 	P@5: 0.49 	P@10: 0.532

In [None]:
not found: 719 - 0.47933333333333333 %
P@1: 496 	P@5: 725 	P@10: 781
P@1: 0.33066666666666666 	P@5: 0.48333333333333334 	P@10: 0.5206666666666667