In [1]:
from gensim.models.keyedvectors import KeyedVectors
import numpy as np

import codecs
import argparse
import csv
import random

from collections import defaultdict

# very useful feature used to reload python modules
from importlib import reload

# import module that loads data, tokenises the tuples, initialises the embeddings matrix
import crim_data

import multiprojection_model
# contains code to evaluate according to semeval2018 metrics
import semeval_eval
import crim_evaluator

from sklearn.model_selection import KFold

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# initialise embeddings and normalise to unit-norm
# Exactly one load line should be active; the commented-out lines below are
# alternative embedding files.
#model = KeyedVectors.load_word2vec_format('embeddings/GoogleNews-vectors-negative300.bin', binary=True)
#model = KeyedVectors.load_word2vec_format('embeddings/glove.42B.300d.txt', binary=False)
model = KeyedVectors.load_word2vec_format('embeddings/wiki-news-300d-1M.vec', binary=False)

# disabled: would write the loaded vectors back out in text word2vec format
#model.save_word2vec_format('embeddings/GoogleNews-vectors-negative300.txt', binary=False)
# NOTE(review): init_sims is deprecated in gensim 4.x - confirm which gensim
# version this notebook targets before upgrading.
model.init_sims(replace=True)

In [None]:
# import Ustalov et al. prepare module
import prepare

# Read the subsumption pairs of each partition and the synonym dictionary.
# The file names are relative paths, so they resolve against the notebook's
# current working directory.
sub_train = prepare.read_subsumptions('subsumptions-train.txt')
sub_test = prepare.read_subsumptions('subsumptions-test.txt')
sub_validation = prepare.read_subsumptions('subsumptions-validation.txt')
synonyms = prepare.read_synonyms('synonyms.txt')  

def get_terms_having_vectors(w2v, dataset):
    """Keep only the pairs whose two terms both have an embedding vector.

    Args:
        w2v: container supporting ``term in w2v`` membership tests.
        dataset: iterable of 2-element pairs of terms.

    Returns:
        A new list of ``(first, second)`` tuples, in the original order, for
        which both terms are present in ``w2v``.
    """
    kept_pairs = []
    for first_term, second_term in dataset:
        # a pair is unusable as soon as either side lacks a vector
        if first_term in w2v and second_term in w2v:
            kept_pairs.append((first_term, second_term))
    return kept_pairs

# Filter all three partitions down to pairs that are fully covered by the embeddings.
sub_train, sub_test, sub_validation = [
    get_terms_having_vectors(model, partition)
    for partition in (sub_train, sub_test, sub_validation)
]
synonyms = prepare.get_synonymys_having_vectors(synonyms, model)
# presumably a defaultdict: clearing the factory stops lookups from creating new keys
synonyms.default_factory = None

# Build the hyponym -> list-of-hypernyms lookup over every partition.
hyper_dict = defaultdict(list)
for hyponym, hypernym in sub_train + sub_test + sub_validation:
    hyper_dict[hyponym].append(hypernym)

# From here on a missing hyponym raises KeyError instead of inserting an empty list.
hyper_dict.default_factory = None

print("Total number of tuples in entire set: %d"
      % (len(sub_train) + len(sub_test) + len(sub_validation)))
print("Unique hyponyms in set: %d"
      % len({hypo for hypo, _ in sub_train + sub_test + sub_validation}))

In [None]:
# Inputs for CrimData: the embeddings, the three filtered partitions, the
# synonym dictionary and limited_vocab_n (presumably a vocabulary size limit -
# confirm in crim_data).
args = dict(
    w2v=model,
    train=sub_train,
    test=sub_test,
    validation=sub_validation,
    synonyms=synonyms,
    limited_vocab_n=250000,
)
data = crim_data.CrimData(args)

In [None]:
# Stack the full dataset into an array so that folds can be picked with index arrays.
all_data_tokens = np.asarray(data.all_data_token)

# 5 folds; shuffle is left at KFold's default
kf = KFold(n_splits=5)

# Gather the training part and the held-out part of every fold.
train_data_split = []
test_data_split = []
for train_idx, test_idx in kf.split(all_data_tokens[:, 0]):
    train_data_split.append(all_data_tokens[train_idx])
    test_data_split.append(all_data_tokens[test_idx])

# Report how many tuples ended up on each side of each fold.
for fold_train, fold_test in zip(train_data_split, test_data_split):
    print("Training tuples: %d; test tuples: %d" % (len(fold_train), len(fold_test)))

In [None]:
def train_and_evaluate_1_fold(hyp_model, train_split, test_split):
    """Fit ``hyp_model`` on one fold and return its averaged evaluation scores.

    The model is fitted with ``train_split``; ``test_split`` is passed to
    ``fit`` as well (only to measure the test loss) and is then used to
    generate and score predictions.

    Depends on the notebook-level ``data`` object (to turn tokens back into
    words) and on the ``semeval_eval`` module.

    Args:
        hyp_model: model exposing ``fit``, ``model`` and ``evaluator``.
        train_split: token tuples used for training.
        test_split: token tuples used for test loss, prediction and scoring.

    Returns:
        dict mapping every score name reported by the scorer to the mean of
        that score over all scored entries, formatted to 5 decimal places.
    """
    hyp_model.fit(train_split, test_split)
    # should be redundant because the evaluator is linked to the model, kept as a safeguard
    hyp_model.evaluator.set_model(hyp_model.model)
    predictions = hyp_model.evaluator.predict(test_split)

    # the scorer works on words, so map the held-out tokens back first
    gold_tuples = data.token_to_words(test_split)
    fold_scorer = semeval_eval.HypernymEvaluation(gold_tuples)
    score_names, all_scores = fold_scorer.get_evaluation_scores(predictions)

    def _column_mean(col):
        # average one metric over every scored entry, keeping 5 decimals
        column_total = sum([row[col] for row in all_scores])
        return float('%.5f' % (column_total / len(all_scores)))

    return {name: _column_mean(col) for col, name in enumerate(score_names)}
    

In [None]:
from itertools import product

# Baseline training settings. phi_k, lambda_c and negative_sample_n are only
# starting values: the grid-search loop overwrites them for every combination.
args.update({
    'data': data,
    'epochs': 15,
    'batch_size': 32,
    'synonym_sample_n': 5,
    'phi_k': 1,
    'lambda_c': 0.,
    'negative_sample_n': 10,
})

# candidate values for each tuned hyper-parameter
_clusters = [1, 5, 10]
_lambda_c = [0, 0.1, 1]
_neg_count = [1, 5, 10]

parameters = [_clusters, _lambda_c, _neg_count]

# every (clusters, lambda_c, neg_count) combination of the candidate values
param_list = [combo for combo in product(*parameters)]

# a single model instance, re-initialised through reset_model for each run
hyp_model = multiprojection_model.MultiProjModel(args)


In [None]:
# For every parameter combination: score name -> list of per-fold values.
final_scores = {combo: defaultdict(list) for combo in param_list}

for _param in param_list:
    phi_k, lambda_c, negative_sample_n = _param
    print("Running test with following parameters: phi_k: %d; lambda_c: %0.2f; neg_count: %d"
          % (phi_k, lambda_c, negative_sample_n))

    # args is the dict the model is reset from, so update it in place
    args['phi_k'] = phi_k
    args['lambda_c'] = lambda_c
    args['negative_sample_n'] = negative_sample_n

    # one train/evaluate run per fold to obtain a score distribution
    for fold_train, fold_test in zip(train_data_split, test_data_split):
        hyp_model.reset_model(args=args)

        scores = train_and_evaluate_1_fold(hyp_model, fold_train, fold_test)
        for score_name, score_value in scores.items():
            final_scores[_param][score_name].append(score_value)

In [None]:
# show the per-fold scores collected for every parameter combination
final_scores

{'MRR': 0.48189,
 'MAP': 0.28752,
 'P@1': 0.46398,
 'P@5': 0.27521,
 'P@10': 0.25954}
 
Fitting model with following parameters: batch_size=32; phi_k=10; lambda_c=1.00; epochs=15; negative_count=10; synonym_count=5
 {'MRR': 0.45751,
 'MAP': 0.30459,
 'P@1': 0.39407,
 'P@5': 0.30215,
 'P@10': 0.28868}

{'MRR': 0.26624,
 'MAP': 0.17138,
 'P@1': 0.19492,
 'P@5': 0.1721,
 'P@10': 0.16572}
 
{'MRR': 0.43478,
 'MAP': 0.29167,
 'P@1': 0.38136,
 'P@5': 0.28898,
 'P@10': 0.27513}
 
{'MRR': 0.52221,
 'MAP': 0.3812,
 'P@1': 0.49467,
 'P@5': 0.37676,
 'P@10': 0.36168}

# Scratch Pad

In [None]:
# Re-import the project modules so that edits to their source files take effect
# without restarting the kernel.
# NOTE(review): objects created before the reload (e.g. data, hyp_model) keep
# using the old class definitions until they are re-created.
reload(crim_data)
reload(multiprojection_model)
reload(crim_evaluator)

In [None]:
# Generate predictions for the first test fold with the current model, then
# display them. The assignment must come before the display: `predictions` is
# not defined at notebook level anywhere earlier, so showing it first raises
# NameError on a fresh kernel.
#hyp_model.evaluator.predict_word('mare')
predictions = hyp_model.evaluator.predict(test_data_split[0])
predictions

In [None]:
# Score the scratch predictions against the word tuples of the first test fold.
test_tuples = data.token_to_words(test_data_split[0])
scorer = semeval_eval.HypernymEvaluation(test_tuples)

score_names, all_scores = scorer.get_evaluation_scores(predictions)

# mean of every metric over all scored entries, kept to 5 decimals
scores = {
    name: float('%.5f' % (sum([row[col] for row in all_scores]) / len(all_scores)))
    for col, name in enumerate(score_names)
}

scores

In [None]:
from itertools import product
# Rebuild the hyper-parameter grid and an empty score container.
# NOTE(review): this rebinds final_scores to empty per-combination dicts, so
# running this cell discards any scores already collected under that name.
_clusters, _lambda_c, _neg_count = [1, 5, 10], [0, 0.1, 1], [1, 5, 10]

parameters = [_clusters, _lambda_c, _neg_count]

# every (clusters, lambda_c, neg_count) combination
param_list = [combo for combo in product(*parameters)]
final_scores = dict((combo, defaultdict(list)) for combo in param_list)