In [20]:
from batch_sim.nn_vec import nn_vec
import argparse
import csv
import glob
import os
import pickle
import re
import sys
#from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
from collections import defaultdict
import numpy as np
from projlearn import MODELS
from multiprocessing import cpu_count

In [33]:
# Evaluation settings, collected in a single dictionary.
# Commented-out entries are alternative values for the same key.
args = {
    #'w2v': 'GoogleNews-vectors-negative300.bin',
    'w2v': "wiki-news-300d-1M-subword.vec",
    'test': 'test.npz',
    #'test': 'validation.npz',
    'subsumptions': 'subsumptions-test.txt',
    #'subsumptions': 'subsumptions-validation.txt',
    'non_optimized': False,
    # one worker thread per CPU reported by multiprocessing
    'threads': cpu_count(),
    #'path': ['./en_300-k25-l0.0', './en_300-k25-l1.0'],
    'path': ['./ft-300-k25-l0.1'],
}


In [34]:
# double-check the arguments by displaying the dictionary
args

{'w2v': 'wiki-news-300d-1M-subword.vec',
 'test': 'test.npz',
 'subsumptions': 'subsumptions-test.txt',
 'non_optimized': False,
 'threads': 4,
 'path': ['./ft-300-k25-l0.1']}

In [9]:
# load word2vec pre-trained embeddings
# KeyedVectors is used because the Word2Vec import at the top of the notebook
# is commented out, so the name Word2Vec is undefined on a fresh kernel.
# The on-disk format is picked from the file extension: ".bin" files are read
# as binary, anything else (e.g. ".vec") as text.
w2v = KeyedVectors.load_word2vec_format(
    args['w2v'],
    binary=args['w2v'].endswith('.bin'),
    unicode_errors='ignore',
)
# init_sims(replace=True) is run before the evaluation cell reads w2v.syn0norm.
w2v.init_sims(replace=True)


In [23]:
# Load the binary-format embeddings; the file name is args['w2v'] with a
# ".bin" suffix appended.
w2v = KeyedVectors.load_word2vec_format(
    "{}{}".format(args['w2v'], ".bin"),
    binary=True,
)
# init_sims(replace=True) is run before the evaluation cell reads w2v.syn0norm.
w2v.init_sims(replace=True)

In [29]:
# Pull the three evaluation arrays out of the .npz archive in one go; the
# generator is consumed while the archive is still open.
with np.load(args['test']) as npz:
    X_index_test, Y_all_test, Z_all_test = (
        npz[key] for key in ('X_index', 'Y_all', 'Z_all')
    )

# Rows of Z_all_test selected by the first column of X_index_test.
X_all_test = Z_all_test[X_index_test[:, 0], :]

# Read the tab-separated subsumption file, keeping the first two columns of
# every row as a tuple.
with open(args['subsumptions']) as f:
    reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
    subsumptions_test = [(row[0], row[1]) for row in reader]

# remove out-of-vocab entries
def confirmVocab(wordList, vocab=None):
    """Return the entries of ``wordList`` whose first two items are both in the vocabulary.

    Parameters
    ----------
    wordList : iterable
        Indexable entries (e.g. ``(hyponym, hypernym)`` tuples); items 0 and 1
        of each entry are checked.
    vocab : container, optional
        Any object supporting ``in``. When omitted, ``w2v.vocab`` of the
        notebook-level embeddings is used, looked up at call time, which is
        the behaviour existing one-argument calls rely on.

    Returns
    -------
    list
        The entries that passed the check, in their original order.
    """
    if vocab is None:
        vocab = w2v.vocab
    return [pair for pair in wordList if pair[0] in vocab and pair[1] in vocab]


# Keep only the pairs whose two words are both in the embedding vocabulary.
subsumptions_test = confirmVocab(subsumptions_test)

# The remaining pairs must line up one-to-one with the rows of X_all_test;
# show both counts, then enforce that they match.
print(len(subsumptions_test), len(X_all_test))
assert X_all_test.shape[0] == len(subsumptions_test)


1535 1535


In [30]:
def extract(clusters, Y_hat_clusters):
    """Interleave per-cluster predictions back into the order given by ``clusters``.

    ``Y_hat_clusters`` maps a cluster label to an indexable sequence of
    predictions. For each label in ``clusters`` the next not-yet-used
    prediction of that cluster is taken, so the k-th occurrence of a label
    receives that cluster's k-th prediction.

    Returns the collected predictions as a NumPy array.
    """
    # Position of the next unused prediction, per cluster.
    cursor = dict.fromkeys(Y_hat_clusters, 0)
    ordered = []

    for label in clusters:
        position = cursor[label]
        ordered.append(Y_hat_clusters[label][position])
        cursor[label] = position + 1

    # Every element of `clusters` must have consumed exactly one prediction.
    assert sum(cursor.values()) == len(clusters)

    return np.array(ordered)

def compute_ats(measures, total=None):
    """Compute one score per entry of ``measures`` as (sum of its values) / total.

    Parameters
    ----------
    measures : sequence of dict
        Each dict maps an example to a numeric score; the caller in this
        notebook stores 1. for a hit within the top j + 1 and 0. otherwise.
    total : int, optional
        Denominator. When omitted, ``len(subsumptions_test)`` (the notebook
        global) is used, which is the behaviour existing one-argument calls
        rely on.

    Returns
    -------
    list of float
        One value per dict in ``measures``, in the same order.

    Raises
    ------
    ZeroDivisionError
        If the denominator is zero.
    """
    if total is None:
        total = len(subsumptions_test)
    return [sum(hits.values()) / total for hits in measures]

def compute_auc(ats):
    """Return half the sum of every pair of neighbouring values in ``ats``.

    This is the trapezoid rule with unit spacing; the result is not divided
    by the number of intervals. Fewer than two values give 0.0.
    """
    return sum(left + right for left, right in zip(ats, ats[1:])) / 2


In [35]:
# Evaluate every model directory listed in args['path'] on the loaded test set.
for path in args['path']:
    print('Doing "%s" on "%s" and "%s".' % (path, args['test'], args['subsumptions']), flush=True)

    # NOTE(review): unpickling can execute arbitrary code -- only load
    # kmeans.pickle files from directories you trust.
    # The `with` block closes the file handle as soon as loading finishes.
    with open(os.path.join(path, 'kmeans.pickle'), 'rb') as kmeans_file:
        kmeans = pickle.load(kmeans_file)
    print('The number of clusters is %d.' % (kmeans.n_clusters), flush=True)

    # Cluster label of every test pair, predicted from the offset Y - X.
    clusters_test  = kmeans.predict(Y_all_test - X_all_test)

    #model_types = ['baseline', 'regularized_hyponym', 'regularized_synonym']
    model_types = ['regularized_synonym_phi']
    for m in model_types:
        # Kept as a lookup so an unknown model name still fails here
        # (presumably MODELS maps names to model classes). The name `m`,
        # not this object, labels the files and the messages below.
        model = MODELS[m]
        try:
            # Format the file name before joining, so a '%' inside `path`
            # cannot be misread as a format directive.
            with np.load(os.path.join(path, '%s.test.npz' % m)) as npz:
                Y_hat_clusters = {int(cluster): npz[cluster] for cluster in npz.files}
        except FileNotFoundError:
            Y_hat_clusters = {}

        # A missing file, or one without an array per cluster, skips the model.
        if kmeans.n_clusters != len(Y_hat_clusters):
            print('Missing the output for the model "%s"!' % m, file=sys.stderr, flush=True)
            continue

        Y_all_hat = extract(clusters_test, Y_hat_clusters)

        assert len(subsumptions_test) == Y_all_hat.shape[0]

        # measures[j] maps a (hyponym, hypernym) pair to 1. when the gold
        # hypernym is among the top j + 1 retrieved words, else 0.
        measures = [{} for _ in range(10)]

        if not args['non_optimized']:
            # normalize Y_all_hat to make dot product equal to cosine and monotonically decreasing function of euclidean distance
            Y_all_hat_norm = Y_all_hat / np.linalg.norm(Y_all_hat,axis=1)[:,np.newaxis]
            print('nn_vec...')
            similar_indices = nn_vec(Y_all_hat_norm, w2v.syn0norm, topn=10, sort=True, return_sims=False, nthreads=args['threads'], verbose=False)
            print('nn_vec results covert...')
            similar_words = [[w2v.index2word[ind] for ind in row] for row in similar_indices]
            print('done')

        for i, (hyponym, hypernym) in enumerate(subsumptions_test):
            if args['non_optimized']:
                # Slow path: one most_similar query per example.
                Y_hat  = Y_all_hat[i].reshape(X_all_test.shape[1],)
                actual = [w for w, _ in w2v.most_similar(positive=[Y_hat], topn=10)]
            else:
                actual = similar_words[i]

            for j in range(0, len(measures)):
                measures[j][(hyponym, hypernym)] = 1. if hypernym in actual[:j + 1] else 0.

            # Progress report every 100 examples; compute_ats divides by the
            # full test-set size, so these running values grow towards the
            # final scores.
            if (i + 1) % 100 == 0:
                ats = compute_ats(measures)
                auc = compute_auc(ats)
                ats_string = ', '.join(['A@%d=%.6f' % (j + 1, ats[j]) for j in range(len(ats))])
                print('%d examples out of %d done for "%s/%s": %s. AUC=%.6f.' % (
                    i + 1,
                    len(subsumptions_test),
                    path,
                    m,
                    ats_string,
                    auc),
                file=sys.stderr, flush=True)

        ats = compute_ats(measures)
        auc = compute_auc(ats)
        ats_string = ', '.join(['A@%d=%.4f' % (j + 1, ats[j]) for j in range(len(ats))])
        print('For "%s/%s": overall %s. AUC=%.6f.' % (
            path,
            m,
            ats_string,
            auc),
        flush=True)

Doing "./ft-300-k25-l0.1" on "test.npz" and "subsumptions-test.txt".
The number of clusters is 25.
nn_vec...


100 examples out of 1535 done for "./ft-300-k25-l0.1/<class 'projlearn.regularized_synonym_phi.RegularizedSynonymPhi'>": A@1=0.025407, A@2=0.028013, A@3=0.029316, A@4=0.029316, A@5=0.029316, A@6=0.029316, A@7=0.029316, A@8=0.029967, A@9=0.029967, A@10=0.029967. AUC=0.262215.
200 examples out of 1535 done for "./ft-300-k25-l0.1/<class 'projlearn.regularized_synonym_phi.RegularizedSynonymPhi'>": A@1=0.055375, A@2=0.059283, A@3=0.061238, A@4=0.063192, A@5=0.063844, A@6=0.064495, A@7=0.064495, A@8=0.065798, A@9=0.065798, A@10=0.066450. AUC=0.569055.
300 examples out of 1535 done for "./ft-300-k25-l0.1/<class 'projlearn.regularized_synonym_phi.RegularizedSynonymPhi'>": A@1=0.089902, A@2=0.097068, A@3=0.099023, A@4=0.102280, A@5=0.104235, A@6=0.104886, A@7=0.104886, A@8=0.106840, A@9=0.106840, A@10=0.107492. AUC=0.924756.
400 examples out of 1535 done for "./ft-300-k25-l0.1/<class 'projlearn.regularized_synonym_phi.RegularizedSynonymPhi'>": A@1=0.124430, A@2=0.133550, A@3=0.136156, A@4=0.139

nn_vec results covert...
done
For "./ft-300-k25-l0.1/<class 'projlearn.regularized_synonym_phi.RegularizedSynonymPhi'>": overall A@1=0.5101, A@2=0.5459, A@3=0.5622, A@4=0.5694, A@5=0.5752, A@6=0.5798, A@7=0.5831, A@8=0.5870, A@9=0.5889, A@10=0.5896. AUC=5.141368.
