In [1]:
from gensim.models.keyedvectors import KeyedVectors
import numpy as np

import io
import os
import codecs
import argparse
import random

from collections import Counter
from collections import defaultdict
from itertools import product

# very useful feature used to reload python modules
from importlib import reload

# import module that loads data, tokenises the tuples, initialises the embeddings matrix
import crim_data
import semeval_data

import multiprojection_model
import multiprojection_dual
# contains code to evaluate according to semeval2018 metrics
import semeval_eval
import crim_evaluator
import crim_dual_evaluator


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


# SemEval 2018, Task 9 Preamble

In [2]:
# initialise embeddings and normalise to unit-norm
# Exactly one loader is active; the commented-out lines are the alternative embedding files.
# The resulting `model` is the word-vector lookup used by every later cell (membership tests, embeddings matrix).
#model = KeyedVectors.load_word2vec_format('embeddings/w2v_umbc_8w_10n_300d.txt', binary=False)
#model = KeyedVectors.load_word2vec_format('embeddings/glove_umbc_15w_300d.txt', binary=False)
model = KeyedVectors.load_word2vec_format('embeddings/fast_umbc_5ng_8w_300d.vec', binary=False)

#model.save_word2vec_format('embeddings/GoogleNews-vectors-negative300.txt', binary=False)
# replace=True keeps only the normalised vectors.
# NOTE(review): init_sims is deprecated in gensim >= 4.0 - confirm the pinned gensim version before upgrading.
model.init_sims(replace=True)

In [3]:
# ignore words which don't feature in embeddings model
def read_subsumptions(filenames, w2v, word_type='Both'):
    """Read aligned query/gold files and return the hyponym-hypernym pairs covered by `w2v`.

    Parameters
    ----------
    filenames : pair of paths
        (query file, gold file). Line i of the query file holds "<term>\\t<category>";
        line i of the gold file holds that term's tab-separated hypernyms.
    w2v : container supporting `in`
        Embeddings vocabulary; queries and hypernyms that are not in it are dropped.
    word_type : str
        Category to keep (compared against the query file's category column);
        'Both' keeps every category.

    Returns
    -------
    (is_concept, subsumptions)
        `subsumptions` is a list of (query, hypernym) tuples, lower-cased and with
        spaces replaced by underscores. `is_concept` is a parallel list holding 0
        when the query's category is 'Entity' and 1 otherwise.
    """
    hypo, hyper = filenames

    subsumptions = []
    is_concept = []

    # Explicit UTF-8: the terms contain non-ASCII characters, so the platform's
    # default encoding must not decide how they are decoded.
    with open(hypo, mode='r', encoding='utf-8') as f_hypo, \
            open(hyper, mode='r', encoding='utf-8') as f_hyper:
        for x, y in zip(f_hypo, f_hyper):
            x = x.strip()
            if not x:
                # a blank query line carries no term/category pair to unpack
                continue
            query, category = x.split("\t")
            query = query.replace(" ", "_").lower()
            y = y.strip()
            # check that we have embeddings for query word
            if query in w2v and (word_type == category or word_type == 'Both'):
                for h in y.split("\t"):
                    h = h.replace(" ", "_").lower()
                    if h in w2v:
                        subsumptions.append((query, h))
                        is_concept.append(0 if category == 'Entity' else 1)

    return is_concept, subsumptions
                    
# ignore vocab entries not having corresponding embeddings
def read_vocab(filename, w2v):
    """Return the vocabulary terms of `filename` that are present in `w2v`.

    The file is read one term per line. Each term is stripped, lower-cased and has
    its spaces replaced by underscores before the membership test; terms missing
    from `w2v` are dropped. File order is preserved.
    """
    # Explicit UTF-8: the vocabulary contains non-ASCII terms, so decoding must not
    # depend on the platform's default encoding.
    with open(filename, mode='r', encoding='utf-8') as f:
        normalised = (line.strip().replace(" ", "_").lower() for line in f)
        return [word for word in normalised if word in w2v]

In [4]:
# SemEval data files, listed in the order trial / test / training
data_file_names = ['1A.english.%s.data.txt'%(split) for split in ('trial', 'test', 'training')]
gold_file_names = ['1A.english.%s.gold.txt'%(split) for split in ('trial', 'test', 'training')]
vocab_file_name = '1A.english.vocabulary.txt'

# pair every query file with its gold file: 0 = validation; 1 = test; 2 = training
file_names = list(zip(data_file_names, gold_file_names))

# one dictionary per dataset, keyed by word type (concepts, entities, or both)
validation, test, training = {}, {}, {}
word_types = ['Concept', 'Entity','Both']
for w in word_types:
    validation[w] = read_subsumptions(file_names[0], model, w)[1]
    test[w] = read_subsumptions(file_names[1], model, w)[1]
    if w != 'Both':
        training[w] = read_subsumptions(file_names[2], model, w)[1]
    else:
        # the concept/entity flags are only kept for the complete training set
        is_concept, training[w] = read_subsumptions(file_names[2], model, w)

vocabulary = read_vocab(vocab_file_name, model)

# hypernym dictionary over all three datasets: hyponym -> list of its hypernyms
hyper_dict = defaultdict(list)
for x, y in validation['Both'] + test['Both'] + training['Both']:
    hyper_dict[x].append(y)

# freeze the dictionary: looking up an unknown hyponym now raises KeyError
hyper_dict.default_factory = None

# print some quick stats
for w in word_types:
    print (w)
    print ("Tuples in validation set: %d; tuples in test set: %d; tuples in training: %d"
          % tuple(len(split[w]) for split in (validation, test, training)))

    print ("Unique hyponyms in validation set: %d; hyponyms in test set: %d; hyponyms in training: %d"
          % tuple(len({x for (x, y) in split[w]}) for split in (validation, test, training)))
    print ("-"*30)

print ("Vocab size: %d" % len(vocabulary))

# 647 terms were missing from the model vocab due to them appearing only once in the corpus.
# According to the technical paper, only words appearing at least 5 times were considered for vocab
# from within the general-purpose corpus

# Also, the word épée is found without accents in the model.  Don't know why. But we'll ignore this single
# word since it should not make a difference in the scheme of things.

Concept
Tuples in validation set: 112; tuples in test set: 4935; tuples in training: 7182
Unique hyponyms in validation set: 30; hyponyms in test set: 1057; hyponyms in training: 978
------------------------------
Entity
Tuples in validation set: 88; tuples in test set: 2112; tuples in training: 4595
Unique hyponyms in validation set: 20; hyponyms in test set: 443; hyponyms in training: 521
------------------------------
Both
Tuples in validation set: 200; tuples in test set: 7047; tuples in training: 11777
Unique hyponyms in validation set: 50; hyponyms in test set: 1499; hyponyms in training: 1499
------------------------------
Vocab size: 218106


In [5]:
# everything SemevalData is given: the embeddings, the three datasets, the
# vocabulary and the concept/entity flags of the training tuples
args = dict(
    w2v=model,
    train=training,
    test=test,
    validation=validation,
    vocabulary=vocabulary,
    is_concept=is_concept,
)

data = semeval_data.SemevalData(args)

Initialising SemevalData...
Creating tokenizer
Dataset vocabulary size is 219034
Vocab size is 219034 words
Initialising negative sampler
Tokenising all dataset tuples
Creating embeddings matrix
Done!


In [12]:
def train_and_evaluate(hyp_model, train_split, valid_split, test_split):
    """Fit `hyp_model`, then score it on the Concept, Entity and Both test splits.

    `train_split` and `valid_split` are handed to `hyp_model.fit`. `test_split` is
    indexed by word type ('Concept', 'Entity', 'Both'); each entry is passed to the
    model's evaluator for prediction and converted back to words (through the
    notebook-level `data` object) for scoring.

    Returns a list of three dictionaries (Concept, Entity, Both, in that order), each
    mapping a score name to its mean over all queries, rounded to five decimals.
    """
    hyp_model.fit(train_split, valid_split)
    # should be redundant, since the evaluator is meant to be linked to the model already
    hyp_model.evaluator.set_model(hyp_model.model)

    def _evaluate(word_type):
        print ("Evaluating model on %s" % (word_type) )
        split_tokens = test_split[word_type]
        # predictions of the trained model for this split
        predicted = hyp_model.evaluator.predict(split_tokens)
        # the scorer works on words, not tokens
        gold_tuples = data.token_to_words(split_tokens)
        scorer = semeval_eval.HypernymEvaluation(gold_tuples)
        metric_names, per_query = scorer.get_evaluation_scores(predicted)
        # column-wise mean of the per-query score rows (MRR, MAP, ...)
        averaged = dict.fromkeys(metric_names, 0.0)
        for col in range(len(metric_names)):
            column_total = sum(row[col] for row in per_query)
            averaged[metric_names[col]] = float('%.5f' % (column_total / len(per_query)))
        return averaged

    return [_evaluate(word_type) for word_type in ['Concept', 'Entity', 'Both']]
    

# MULTI-PROJECTION Section

In [None]:
embeddings_layer = multiprojection_model.get_embeddings_model(data.embeddings_matrix, 1)

In [13]:
# standard model parameters - we won't be changing these
args.update({
    'data':              data,
    'embeddings_layer':  embeddings_layer,
    'epochs':            15,
    'batch_size':        32,
    'synonym_sample_n':  1,
    'phi_k':             1,
    'lambda_c':          0.,
    'negative_sample_n': 10,
    'save_path':         'ft_semeval.npz',
    'patience':          3,
    'eval_after_epoch':  True,
    'lr':                0.001,
    'beta1':             0.9,
    'beta2':             0.9,
    'clip_value':        1.,
})

# parameter grid: every combination of the three lists below is one experiment
_clusters = [10, 5, 1]         # alternative: [10]
_lambda_c = [0., 0.1, 1.]      # alternative: [0, 0.1, 1]
_neg_count = [10]              # alternative: [10, 5, 1]

parameters = [_clusters, _lambda_c, _neg_count]
param_list = list(product(*parameters))

# hypernymy discovery model, created once and reset with new args for every experiment
hyp_model = multiprojection_model.MultiProjModel(args)



In [14]:
# one score dictionary per parameter combination: score name -> list of scores
final_scores = {k:defaultdict(list) for k in param_list}

for idx2, _param in enumerate(param_list):
    print ("Running test with following parameters: phi_k: %d; lambda_c: %0.2f; neg_count: %d" \
           % (_param[0], _param[1], _param[2]))

    # settings that vary between experiments
    args.update(phi_k=_param[0], lambda_c=_param[1], negative_sample_n=_param[2])

    for w in ['Both']:
        # train on the listed training sets (currently only the combined one)
        print ("Training model on %s" % (w))
        args['save_path'] = 'ft_semeval_%s.npz' % (w)
        hyp_model.reset_model(args=args)

        all_scores = train_and_evaluate(hyp_model,
                                        data.train_data_token[w],
                                        data.valid_data_token[w],
                                        data.test_data_token)

        # keep the Concept, Entity and Both scores of every metric, in that order
        for scores in all_scores:
            for s in scores:
                final_scores[_param][s].append(scores[s])

    print ("")
    print ("Finished %d from %d experiments" % (idx2+1, len(param_list)))
    print ("-"*30)

Running test with following parameters: phi_k: 10; lambda_c: 0.00; neg_count: 10
Training model on Both
Fitting model with following parameters:
 batch_size=32;
 phi_k=10;
 lambda_c=0.00;
 epochs=15;
 negative_count=10;
 synonym_count=1
Optimizer parameters:
 lr=0.00100;
 beta1=0.900;
 beta2=0.900;
 clip=1.00
--------------------
Epoch: 1; Training Loss: 0.44248; Test Loss: 0.27799; Test MAP: 0.00080; Test MRR: 0.00200
Epoch: 2; Training Loss: 0.25851; Test Loss: 0.19276; Test MAP: 0.07218; Test MRR: 0.13889
Epoch: 3; Training Loss: 0.19556; Test Loss: 0.14808; Test MAP: 0.10019; Test MRR: 0.19450
Epoch: 4; Training Loss: 0.15900; Test Loss: 0.13329; Test MAP: 0.11655; Test MRR: 0.20286
Epoch: 5; Training Loss: 0.13677; Test Loss: 0.11714; Test MAP: 0.12212; Test MRR: 0.21852
Epoch: 6; Training Loss: 0.12074; Test Loss: 0.12016; Test MAP: 0.09525; Test MRR: 0.18300
Epoch: 7; Training Loss: 0.11036; Test Loss: 0.11579; Test MAP: 0.12051; Test MRR: 0.21667
Epoch: 8; Training Loss: 0.1015

Epoch: 4; Training Loss: 0.21803; Test Loss: 0.17618; Test MAP: 0.07017; Test MRR: 0.14867
Epoch: 5; Training Loss: 0.19044; Test Loss: 0.15555; Test MAP: 0.06695; Test MRR: 0.12241
Epoch: 6; Training Loss: 0.17005; Test Loss: 0.13644; Test MAP: 0.12920; Test MRR: 0.22810
Epoch: 7; Training Loss: 0.15478; Test Loss: 0.12589; Test MAP: 0.12358; Test MRR: 0.23733
Epoch: 8; Training Loss: 0.14159; Test Loss: 0.12316; Test MAP: 0.10282; Test MRR: 0.19333
Epoch: 9; Training Loss: 0.13181; Test Loss: 0.12714; Test MAP: 0.12300; Test MRR: 0.21852
Early Stop invoked at epoch 9
Done!
Evaluating model on Concept
Done 100
Done 200
Done 300
Done 400
Done 500
Done 600
Done 700
Done 800
Done 900
Done 1000
Evaluating model on Entity
Done 100
Done 200
Done 300
Done 400
Evaluating model on Both
Done 100
Done 200
Done 300
Done 400
Done 500
Done 600
Done 700
Done 800
Done 900
Done 1000
Done 1100
Done 1200
Done 1300
Done 1400

Finished 5 from 9 experiments
------------------------------
Running test with 

In [15]:
final_scores

{(10, 0.0, 10): defaultdict(list,
             {'MRR': [0.19424, 0.41247, 0.2587],
              'MAP': [0.09542, 0.20048, 0.12654],
              'P@1': [0.14286, 0.33409, 0.19947],
              'P@5': [0.09308, 0.19195, 0.12236],
              'P@10': [0.08757, 0.18038, 0.11504]}),
 (10, 0.1, 10): defaultdict(list,
             {'MRR': [0.20307, 0.38316, 0.25643],
              'MAP': [0.0978, 0.19038, 0.12523],
              'P@1': [0.15421, 0.30023, 0.19746],
              'P@5': [0.09399, 0.18213, 0.1201],
              'P@10': [0.08853, 0.17363, 0.11374]}),
 (10, 1.0, 10): defaultdict(list,
             {'MRR': [0.20497, 0.385, 0.25831],
              'MAP': [0.1004, 0.18851, 0.12648],
              'P@1': [0.15043, 0.30474, 0.19613],
              'P@5': [0.09724, 0.18134, 0.12216],
              'P@10': [0.09173, 0.17209, 0.11552]}),
 (5, 0.0, 10): defaultdict(list,
             {'MRR': [0.19229, 0.37878, 0.24758],
              'MAP': [0.09421, 0.19232, 0.12328],
            

In [17]:
# one tab-separated row per (parameter combination, metric), showing its first three scores
row_format = "%d\t%0.1f\t%d\t%s\t%0.5f\t%0.5f\t%0.5f"
for k, v in final_scores.items():
    cl_size, lam, neg = k[0], k[1], k[2]
    for k2, v2 in v.items():
        print (row_format % (cl_size, lam, neg, k2, v2[0], v2[1], v2[2]))

10	0.0	10	MRR	0.19424	0.41247	0.25870
10	0.0	10	MAP	0.09542	0.20048	0.12654
10	0.0	10	P@1	0.14286	0.33409	0.19947
10	0.0	10	P@5	0.09308	0.19195	0.12236
10	0.0	10	P@10	0.08757	0.18038	0.11504
10	0.1	10	MRR	0.20307	0.38316	0.25643
10	0.1	10	MAP	0.09780	0.19038	0.12523
10	0.1	10	P@1	0.15421	0.30023	0.19746
10	0.1	10	P@5	0.09399	0.18213	0.12010
10	0.1	10	P@10	0.08853	0.17363	0.11374
10	1.0	10	MRR	0.20497	0.38500	0.25831
10	1.0	10	MAP	0.10040	0.18851	0.12648
10	1.0	10	P@1	0.15043	0.30474	0.19613
10	1.0	10	P@5	0.09724	0.18134	0.12216
10	1.0	10	P@10	0.09173	0.17209	0.11552
5	0.0	10	MRR	0.19229	0.37878	0.24758
5	0.0	10	MAP	0.09421	0.19232	0.12328
5	0.0	10	P@1	0.14191	0.30248	0.18946
5	0.0	10	P@5	0.08988	0.18363	0.11765
5	0.0	10	P@10	0.08747	0.17776	0.11421
5	0.1	10	MRR	0.19051	0.40600	0.25437
5	0.1	10	MAP	0.09258	0.19872	0.12401
5	0.1	10	P@1	0.14570	0.32957	0.20013
5	0.1	10	P@5	0.08939	0.18845	0.11872
5	0.1	10	P@10	0.08461	0.18195	0.11343
5	1.0	10	MRR	0.17847	0.35379	0.23040
5	1.0	10	MAP	0.084

In [None]:
# Tab-separated table of every collected score per (parameter combination, metric).
# The score columns are generated from the list itself instead of being hard-coded to
# nine entries: with a single training set each metric only holds three scores, and
# indexing v2[3] .. v2[8] raised IndexError.
for k, v in final_scores.items():
    cl_size = k[0]
    lam = k[1]
    neg = k[2]
    for k2, v2 in v.items():
        print ("%d\t%0.1f\t%d\t%s\t%s"
               % (cl_size, lam, neg, k2, "\t".join("%0.5f" % score for score in v2)))
    

In [None]:
hyp_model.load_model()
#weights = np.load(hyp_model.save_path)

In [None]:
# score the model's predictions in the usual way, once per word type
predictions = {}
for w in ['Concept', 'Entity', 'Both']:
    print ("Doing %s" % (w))
    predictions[w] = hyp_model.evaluator.predict(data.test_data_token[w])
    # the scorer is built from the word form of this test split
    test_tuples = data.token_to_words(data.test_data_token[w])
    scorer = semeval_eval.HypernymEvaluation(test_tuples)
    score_names, all_scores = scorer.get_evaluation_scores(predictions[w])
    # mean of every score (MRR, MAP, ...) over all queries, rounded to five decimals
    scores = dict.fromkeys(score_names, 0.0)
    for k in range(len(score_names)):
        scores[score_names[k]] = float('%.5f' % (sum(score_list[k] for score_list in all_scores) / len(all_scores)))

    print (scores)


In [None]:
# Scatter + regression of per-query average precision against the median hypernym frequency.
# The frame returned by get_score_freq_matrix names its score column 'AP' (there is no
# 'MAP' column), so 'AP' is what has to be plotted.
# NOTE(review): `score_freq` is only created further down the notebook - run that cell first.
plt.figure(figsize=(12,8))
ax = sns.regplot(x="Median Freq", y="AP", data=score_freq, x_jitter=0.02, y_jitter=0.01);
ax.set(ylim=(-0.05, 1.05))

# DUAL Model Section

In [6]:
def train_and_evaluate_dual(dual_model, train_split, valid_split, test_split):
    """Fit `dual_model`, then score it on the Concept, Entity and Both test splits.

    `train_split` and `valid_split` are handed to `dual_model.fit`. `test_split` is
    indexed by word type ('Concept', 'Entity', 'Both'); every entry is scored with
    `evaluate_only_dual`.

    Returns a list of three score dictionaries (Concept, Entity, Both, in that order).
    """
    # fit model
    dual_model.fit(train_split, valid_split)
    # this step should not be required since the model is dynamically linked to the evaluator
    dual_model.evaluator.set_model(dual_model.feature_extractor, dual_model.concept_model, dual_model.entity_model)

    scores_all = []
    # evaluate trained model on word in either category separately and together
    for w in ['Concept', 'Entity', 'Both']:
        print ("Evaluating model on %s" % (w) )
        # the scoring itself is shared with evaluate_only_dual; predictions are not needed here
        _, scores = evaluate_only_dual(dual_model, test_split[w])
        scores_all.append(scores)
    return scores_all

def evaluate_only_dual(dual_model, test_split):
    """Predict and score one tokenised test split with an already trained `dual_model`.

    Returns `(predictions, scores)`: the evaluator's predictions for `test_split`, and
    a dictionary mapping every score name to its mean over all queries, rounded to
    five decimals.
    """
    predictions = dual_model.evaluator.predict(test_split)
    # this converts the tokens back to words for evaluation
    test_tuples = data.token_to_words(test_split)
    # here we have a scorer that will mark our effort according to this particular test split
    scorer = semeval_eval.HypernymEvaluation(test_tuples)
    # get scores
    score_names, all_scores = scorer.get_evaluation_scores(predictions)
    # initialise scores (MRR, MAP, ...)
    scores = {s:0.0 for s in score_names }
    for k in range(len(score_names)):
        scores[score_names[k]] = float('%.5f' % (sum([score_list[k] for score_list in all_scores]) / len(all_scores)))

    return predictions, scores

In [7]:
embeddings_layer = multiprojection_model.get_embeddings_model(data.embeddings_matrix, 1)

Instructions for updating:
Colocations handled automatically by placer.


In [8]:
# standard model parameters - we won't be changing these
args.update({
    'data':              data,
    'embeddings_layer':  embeddings_layer,
    'epochs':            20,
    'batch_size':        64,
    'synonym_sample_n':  1,
    'phi_k':             1,
    'lambda_c':          0.,
    'negative_sample_n': 10,
    'save_path':         'dual_ft_semeval.npz',
    'patience':          5,
    'eval_after_epoch':  True,
    'lr':                0.001,
    'beta1':             0.9,
    'beta2':             0.9,
    'clip_value':        1.,
})

# dual model, created once and reset with new args for every experiment
dual_model = multiprojection_dual.MultiProjModelDual(args)

# parameter grid: every combination of the three lists below is one experiment
_clusters = [1, 5, 10]         # alternative: [10, 5, 1]
_lambda_c = [0.]               # alternative: [0, 0.1, 1]
_neg_count = [10]              # alternative: [10, 5, 1]

parameters = [_clusters, _lambda_c, _neg_count]
param_list = list(product(*parameters))


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [9]:
# one score dictionary per parameter combination: score name -> list of scores
final_scores = {k:defaultdict(list) for k in param_list}

for idx2, _param in enumerate(param_list):
    print ("Running test with following parameters: phi_k: %d; lambda_c: %0.2f; neg_count: %d" \
           % (_param[0], _param[1], _param[2]))

    # settings that vary between experiments
    args.update(phi_k=_param[0], lambda_c=_param[1], negative_sample_n=_param[2])

    for w in ['Both']:
        # train on the listed training sets (currently only the combined one)
        print ("Training model on %s" % (w))
        args['save_path'] = 'dual_ft_semeval_%s.npz' % (w)
        dual_model.reset_model(args=args)

        all_scores = train_and_evaluate_dual(dual_model,
                                             data.train_data_token[w],
                                             data.valid_data_token[w],
                                             data.test_data_token)

        # keep the Concept, Entity and Both scores of every metric, in that order
        for scores in all_scores:
            for s in scores:
                final_scores[_param][s].append(scores[s])

    print ("")
    print ("Finished %d from %d experiments" % (idx2+1, len(param_list)))
    print ("-"*30)

    

Running test with following parameters: phi_k: 1; lambda_c: 0.00; neg_count: 10
Training model on Both
Fitting model with following parameters:
 batch_size=64;
 phi_k=1;
 lambda_c=0.00;
 epochs=20;
 negative_count=10;
 synonym_count=1
Optimizer parameters:
 lr=0.00100;
 beta1=0.900;
 beta2=0.900;
 clip=1.00
--------------------
Instructions for updating:
Use tf.cast instead.
Epoch: 1; Training Loss: 0.65261; Test Loss: 0.00000; Test MAP: 0.00000; Test MRR: 0.00000
Epoch: 2; Training Loss: 0.57333; Test Loss: 0.00000; Test MAP: 0.00000; Test MRR: 0.00000
Epoch: 3; Training Loss: 0.50473; Test Loss: 0.00000; Test MAP: 0.00000; Test MRR: 0.00000
Epoch: 4; Training Loss: 0.44798; Test Loss: 0.00000; Test MAP: 0.00000; Test MRR: 0.00000
Epoch: 5; Training Loss: 0.40237; Test Loss: 0.00000; Test MAP: 0.00000; Test MRR: 0.00000
Epoch: 6; Training Loss: 0.36658; Test Loss: 0.00000; Test MAP: 0.00000; Test MRR: 0.00000
Epoch: 7; Training Loss: 0.33891; Test Loss: 0.00000; Test MAP: 0.00000; Tes

In [11]:
# one tab-separated row per (parameter combination, metric), showing its first three scores
row_format = "%d\t%0.1f\t%d\t%s\t%0.5f\t%0.5f\t%0.5f"
for k, v in final_scores.items():
    cl_size, lam, neg = k[0], k[1], k[2]
    for k2, v2 in v.items():
        print (row_format % (cl_size, lam, neg, k2, v2[0], v2[1], v2[2]))

1	0.0	10	MRR	0.10734	0.21708	0.13984
1	0.0	10	MAP	0.04743	0.10768	0.06527
1	0.0	10	P@1	0.08515	0.17381	0.11141
1	0.0	10	P@5	0.04508	0.10260	0.06211
1	0.0	10	P@10	0.04245	0.09854	0.05905
5	0.0	10	MRR	0.19376	0.40343	0.25585
5	0.0	10	MAP	0.09394	0.20040	0.12546
5	0.0	10	P@1	0.14664	0.32506	0.19947
5	0.0	10	P@5	0.09010	0.18758	0.11897
5	0.0	10	P@10	0.08608	0.18370	0.11499
10	0.0	10	MRR	0.21118	0.38461	0.26257
10	0.0	10	MAP	0.09865	0.19040	0.12581
10	0.0	10	P@1	0.16367	0.32054	0.21014
10	0.0	10	P@5	0.09314	0.18025	0.11895
10	0.0	10	P@10	0.08905	0.17245	0.11374


# Analysis of Prediction quality vs hypernym freq in training set

In [None]:
# perform a frequency analysis of all hypernyms in the training set; query terms will not feature in the test set,
# but test terms will certainly be related to hypernyms found in the training set.
def get_hypernym_freq(dataset):
    """Return the relative frequency of every hypernym in `dataset`.

    `dataset` is converted to (query, hypernym) word tuples through the notebook-level
    `data` object. The result maps each hypernym to its share of all hypernym
    occurrences, rounded to five decimals.
    """
    hypernym_counts = Counter(hypernym for _query, hypernym in data.token_to_words(dataset))
    # total number of hypernym occurrences (duplicates included)
    total_occurrences = sum(hypernym_counts.values())
    return {word: round((count / total_occurrences), 5) for word, count in hypernym_counts.items()}

In [None]:
np.sum(list(hyper_freq.values()))

In [None]:
predictions = yummy.evaluator.predict(test_data_split[4])

In [None]:
predictions

In [None]:
# score the predictions in the usual way
test_tuples = data.token_to_words(test_data_split[4])
scorer = semeval_eval.HypernymEvaluation(test_tuples)
score_names, all_scores = scorer.get_evaluation_scores(predictions)
# mean of every score (MRR, MAP, ...) over all queries, rounded to five decimals
scores = dict.fromkeys(score_names, 0.0)
for k in range(len(score_names)):
    scores[score_names[k]] = float('%.5f' % (sum(score_list[k] for score_list in all_scores) / len(all_scores)))

# ground-truth hypernyms of the test split of interest: query word -> list of hypernyms
ground_truth = defaultdict(list)
for x, y in test_tuples:
    ground_truth[x].append(y)
# freeze the dictionary: looking up an unknown query now raises KeyError
ground_truth.default_factory = None

In [None]:
# all MAP scores
list(predictions.items())[34]

In [None]:
def get_score_freq_matrix(test_data, predictions, hyper_freq, jitter=False):
    """Build a per-query table of average precision vs. gold-hypernym frequency.

    Parameters
    ----------
    test_data : tokenised test tuples; converted to (query, hypernym) words through
        the notebook-level `data` object.
    predictions : dict mapping each query word to its predicted hypernyms.
    hyper_freq : dict mapping a hypernym to its frequency; hypernyms that are
        missing from it count as frequency 0.
    jitter : when True, Gaussian noise (mean 0, std 0.01) is added to both numeric
        columns to make a scatterplot easier to read.

    Returns
    -------
    pandas.DataFrame with one row per query, in sorted query order, and the columns
    'AP' (score rounded to 3 decimals), 'Median Freq' (median frequency of the
    query's gold hypernyms) and 'word' (the query itself).
    """
    # we score as per the usual way the predictions
    test_tuples = data.token_to_words(test_data)
    scorer = semeval_eval.HypernymEvaluation(test_tuples)
    _score_names, all_scores = scorer.get_evaluation_scores(predictions)

    # we create a dictionary of ground truth hypernyms for the test split of interest
    ground_truth = defaultdict(list)
    for x, y in test_tuples:
        ground_truth[x].append(y)
    ground_truth.default_factory = None

    # retain MAP scores only from score list of lists
    # NOTE(review): assumes MAP is the second score column - confirm against semeval_eval
    all_map = np.round(np.asarray(all_scores)[:,1], 3)

    # Sort the query words explicitly so rows are processed in the same order as the
    # evaluator (NOTE(review): assumes the evaluator emits its rows in sorted key order).
    # The same list labels the rows below, so every word stays attached to its own score.
    query_words = sorted(predictions.keys())

    # one row per query: AP score, and median frequency of its ground-truth hypernyms
    score_freq_matrix = np.zeros((all_map.shape[0], 2))
    for idx, w in enumerate(query_words):
        score_freq_matrix[idx][0] = all_map[idx]
        gold = ground_truth[w]
        score_freq_matrix[idx][1] = np.median([hyper_freq[g] if g in hyper_freq else 0. for g in gold])

    if jitter:
        # add some jitter to the signal to make it easier to interpret in the scatterplot
        mu, sigma = 0, 0.01
        # noise with the same shape as the score/frequency matrix
        noise = np.random.normal(mu, sigma, (all_map.shape[0], 2) )
        score_freq_matrix =  score_freq_matrix + noise

    score_freq = pd.DataFrame(score_freq_matrix, columns=['AP', 'Median Freq'])
    # add query word to data frame, in the row order used above
    score_freq = score_freq.assign(word=pd.Series(query_words).values)
    return score_freq

In [None]:
#score_freq.loc[score_freq.MAP == 1.,].sort_values('Median Freq', ascending=False)

# Build one score/frequency frame per model for the same test fold (test_data_split[4]).
# NOTE(review): `test_data_split`, `predictions_2` and a notebook-level `hyper_freq` are not
# defined in any visible cell - this cell relies on leftover kernel state; confirm where they come from.

# prediction contains the generated hypernyms for YAMANE on the 5th fold of the training_data
score_freq_yam=get_score_freq_matrix(test_data_split[4], predictions, hyper_freq)

# prediction_2 contains the generated hypernyms for CRIM on the 5th fold of the training data
score_freq=get_score_freq_matrix(test_data_split[4], predictions_2, hyper_freq)

In [None]:
#sns.relplot(x="Median Freq", y="MAP", data=score_freq);
# AP against median hypernym frequency for the frame in score_freq_yam
fig, ax = plt.subplots(figsize=(12,8))
sns.regplot(x="Median Freq", y="AP", data=score_freq_yam, x_jitter=0.01, y_jitter=0.005, marker="x", ax=ax);
ax.set(ylim=(-0.05, 1.05))

In [None]:
# AP against median hypernym frequency for the frame in score_freq
fig, ax = plt.subplots(figsize=(12,8))
sns.regplot(x="Median Freq", y="AP", data=score_freq, x_jitter=0.01, y_jitter=0.005, ax=ax);
ax.set(ylim=(-0.05, 1.05))


In [None]:
top_yummy_wrong

In [None]:
# analyze top-ranked word for term words scoring 0 AP
# The frames built by get_score_freq_matrix name the score column 'AP'; there is no
# 'MAP' column, so filtering on `.MAP` raised AttributeError.
yummy_wrong_terms = score_freq_yam.loc[score_freq_yam.AP==0, 'word'].tolist()
crim_wrong_terms  = score_freq.loc[score_freq.AP==0, 'word'].tolist()

# highest-ranked prediction of every query that scored 0
top_crim_wrong = [predictions_2[w][0] for w in crim_wrong_terms]
top_yummy_wrong = [predictions[w][0] for w in yummy_wrong_terms]

# the 15 most frequent wrong top predictions as (count, word), ties broken by word (descending)
top_crim_wrong =  sorted([(v,k) for k, v in Counter(top_crim_wrong).items()], reverse=True)[:15]
top_yummy_wrong =  sorted([(v,k) for k, v in Counter(top_yummy_wrong).items()], reverse=True)[:15]

# multiply words according to frequency
yummy_wrong_flat = [word for count, word in top_yummy_wrong for _repeat in range(count)]
crim_wrong_flat = [word for count, word in top_crim_wrong for _repeat in range(count)]

# (model name, word) rows, CRIM first, then Yamane
combined_wrong_list = list(zip(['CRIM'] * len(crim_wrong_flat), crim_wrong_flat))
combined_wrong_list.extend(list(zip(['Yamane'] * len(yummy_wrong_flat), yummy_wrong_flat)))

In [None]:
incorrect_df =  pd.DataFrame(combined_wrong_list, columns=['Model', 'Highest Ranked Incorrect Word'])

In [None]:
incorrect_df.loc[incorrect_df.Model == 'CRIM',]

In [None]:
# bar chart of the most frequent wrong top predictions of the CRIM model
crim_rows = incorrect_df.loc[incorrect_df.Model == 'CRIM',]
bar_palette = sns.cubehelix_palette(15, start=2, rot=0.35, dark=0.47, light=0.85, reverse=True)

plt.figure(figsize=(12,8))
g = sns.countplot(x="Highest Ranked Incorrect Word", palette=bar_palette, data=crim_rows)
g.set_xticklabels(g.get_xticklabels(), rotation=30)

In [None]:
# the ten most frequent hypernyms, highest frequency first
ranked_by_freq = sorted(((v, k) for k, v in hyper_freq.items()), reverse=True)
for w in ranked_by_freq[:10]:
    print (w[1], w[0])

hyper_freq['action']

In [None]:
# bar chart of the most frequent wrong top predictions of the Yamane model
yamane_rows = incorrect_df.loc[incorrect_df.Model == 'Yamane',]
bar_palette = sns.cubehelix_palette(15, start=2, rot=0.35, dark=0.47, light=0.85, reverse=True)

plt.figure(figsize=(12,8))
g = sns.countplot(x="Highest Ranked Incorrect Word", palette=bar_palette, data=yamane_rows)
g.set_xticklabels(g.get_xticklabels(), rotation=30)

In [None]:
# the ten most frequent hypernyms, highest frequency first
ranked_by_freq = sorted(((v, k) for k, v in hyper_freq.items()), reverse=True)
for w in ranked_by_freq[:10]:
    print (w[1], w[0])

In [None]:
# least frequent hypernyms
# The minimum is computed once up front; it used to be recomputed for every item (quadratic).
# default=None keeps an empty frequency table from raising - nothing is printed in that case.
lowest_freq = min(hyper_freq.values(), default=None)
for w in [(v, k) for k, v in hyper_freq.items() if v == lowest_freq][:10]:
    print (w[1], w[0])

In [None]:
# queries with a perfect AP although their gold hypernyms are rare (median frequency below 0.005)
perfect_but_rare = (score_freq.AP >= 1.0) & (score_freq['Median Freq'] < 0.005)
good_words = score_freq.loc[perfect_but_rare, 'word'].tolist()
score_freq.loc[perfect_but_rare, ]

In [None]:
for w in good_words:
    print ("%s: %s" % (w, ", ".join(predictions_2[w])))
    print ("%s: %s" % (w, ", ".join(hyper_dict[w])))
    print ("-"*30)
    
#print (hyper_dict['intercourse'])
#predictions_2['intercourse']


# Scratch Pad

In [None]:
reload(semeval_data)
reload(multiprojection_model)
reload(crim_evaluator)
reload(multiprojection_dual)
reload(crim_dual_evaluator)

In [None]:
# Number of distinct vocabulary terms that have no vector in the embeddings model.
# read_vocab requires the embeddings as second argument (the old one-argument call raised
# TypeError) and already drops such terms, so the raw file is read here without filtering.
with open(vocab_file_name, mode='r', encoding='utf-8') as f:
    raw_vocabulary = {word.strip().replace(" ","_").lower() for word in f}
len(raw_vocabulary.difference(vocabulary))

In [None]:
# remove vocab term having no vector in embeddings
def get_terms_having_vectors(w2v, dataset):
    """Return the (query, hypernym) tuples of `dataset` whose two words are both in `w2v`."""
    kept = []
    for q, h in dataset:
        # drop the pair as soon as one of its words has no vector
        if q not in w2v:
            continue
        if h not in w2v:
            continue
        kept.append((q, h))
    return kept

# drop, from every dataset and word type, the tuples containing a word without embeddings
for w in word_types:
    for split in (validation, test, training):
        split[w] = get_terms_having_vectors(model, split[w])

# same filter for the vocabulary
vocabulary = [term for term in vocabulary if term in model]