In [None]:
from gensim.models.keyedvectors import KeyedVectors
import numpy as np

import codecs
import argparse
import csv
import random

from collections import defaultdict

# very useful feature used to reload python modules
from importlib import reload

# import module that loads data, tokenises the tuples, initialises the embeddings matrix
import crim_data

import multiprojection_model
import yamane_model
# contains code to evaluate according to semeval2018 metrics
import semeval_eval
import crim_evaluator
import yamane_evaluator

from sklearn.model_selection import KFold

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from collections import Counter

In [None]:
# initialise embeddings and normalise to unit-norm
# Exactly one of the three loaders below is meant to be active; the commented-out
# lines are alternative embedding files that can be swapped in.
# NOTE(review): load_word2vec_format expects a word2vec-style header line; confirm
# that this GloVe text file has been converted accordingly.
#model = KeyedVectors.load_word2vec_format('embeddings/GoogleNews-vectors-negative300.bin', binary=True)
model = KeyedVectors.load_word2vec_format('embeddings/glove.42B.300d.txt', binary=False)
#model = KeyedVectors.load_word2vec_format('embeddings/wiki-news-300d-1M.vec', binary=False)

#model.save_word2vec_format('embeddings/GoogleNews-vectors-negative300.txt', binary=False)
# replace=True overwrites the loaded vectors with their normalised versions
# NOTE(review): init_sims is deprecated in gensim 4.x — confirm the installed version.
model.init_sims(replace=True)

In [None]:
# import Ustalov et al. prepare module
import prepare

# read the subsumption pairs of the three predefined partitions,
# in the order train, test, validation
sub_train, sub_test, sub_validation = [
    prepare.read_subsumptions(path)
    for path in ('subsumptions-train.txt',
                 'subsumptions-test.txt',
                 'subsumptions-validation.txt')
]


# remove vocab term having no vector in embeddings
def get_terms_having_vectors(w2v, dataset):
    """Keep only the pairs of ``dataset`` whose two terms are both in ``w2v``.

    w2v     -- any container supporting ``term in w2v`` (here: the embeddings)
    dataset -- iterable of 2-element (q, h) pairs
    Returns a new list of ``(q, h)`` tuples in their original order.
    """
    kept = []
    for q, h in dataset:
        # a pair is dropped as soon as either of its terms has no vector
        if q not in w2v:
            continue
        if h not in w2v:
            continue
        kept.append((q, h))
    return kept

# drop every pair for which either term has no embedding vector
sub_train, sub_test, sub_validation = [
    get_terms_having_vectors(model, pairs)
    for pairs in (sub_train, sub_test, sub_validation)
]

# all remaining pairs, in the order train, test, validation
all_pairs = sub_train + sub_test + sub_validation

# create hypernym dictionary: first element of each pair -> list of second elements
hyper_dict = defaultdict(list)
for hyponym, hypernym in all_pairs:
    hyper_dict[hyponym].append(hypernym)

# from here on a lookup of an unknown key raises KeyError instead of inserting []
hyper_dict.default_factory = None

# to ensure that synonyms are not also hypernyms
synonyms = prepare.read_synonyms('synonyms.txt', hyper_dict)
synonyms = prepare.get_synonymys_having_vectors(synonyms, model)
synonyms.default_factory = None

print ("Total number of tuples in entire set: %d" % len(all_pairs))
print ("Unique hyponyms in set: %d" % len({hyponym for hyponym, _ in all_pairs}))

In [None]:
# settings handed to CrimData: the embeddings, the three data partitions,
# the synonyms and the 'limited_vocab_n' setting
args = dict(
    w2v=model,
    train=sub_train,
    test=sub_test,
    validation=sub_validation,
    synonyms=synonyms,
    limited_vocab_n=250000,
)
data = crim_data.CrimData(args)

In [None]:
# convert full dataset to array
all_data_tokens = np.asarray(data.all_data_token)

# 5-fold cross-validation over the full tuple set.
# The original call was KFold(n_splits=5, random_state=42). Without shuffle=True
# the seed has no effect on the split, and scikit-learn >= 0.24 rejects that
# combination with a ValueError. The seed is dropped (rather than enabling
# shuffling) so the folds stay the same unshuffled ones the original produced.
kf = KFold(n_splits=5)

# split data into 5 different train-test folds
train_data_split = []
test_data_split = []
for train_idx, test_idx in kf.split(all_data_tokens[:,0]):
    # kf.split yields index arrays; use them to select the rows of each fold
    train_data_split.append(all_data_tokens[train_idx])
    test_data_split.append(all_data_tokens[test_idx])

# output training-test split sizes
for tr, te in zip(train_data_split, test_data_split):
    print ("Training tuples: %d; test tuples: %d" % (len(tr), len(te)))

In [None]:
def train_and_evaluate_1_fold(hyp_model, train_split, test_split):
    """Fit ``hyp_model`` on one fold and return its averaged evaluation scores.

    hyp_model   -- model object exposing ``fit``, ``model`` and ``evaluator``
    train_split -- tokenised training tuples of the fold
    test_split  -- tokenised test tuples of the fold
    Returns a dict mapping each score name reported by the scorer to the mean
    of that score over all scored entries, rounded to 5 decimals.
    Relies on the notebook-level ``data`` object and the ``semeval_eval`` module.
    """
    # fit model; the test split is only used to measure the test loss
    hyp_model.fit(train_split, test_split)

    # this step should not be required since the model is dynamically linked to the evaluator
    hyp_model.evaluator.set_model(hyp_model.model)

    # predictions of the trained model for the held-out tuples
    predictions = hyp_model.evaluator.predict(test_split)

    # the scorer works on words, so the tokens are converted back first
    gold_tuples = data.token_to_words(test_split)
    scorer = semeval_eval.HypernymEvaluation(gold_tuples)
    score_names, all_scores = scorer.get_evaluation_scores(predictions)

    # column-wise mean of the score rows (MRR, MAP, ...), rounded to 5 decimals
    return {
        name: float('%.5f' % (sum(row[col] for row in all_scores) / len(all_scores)))
        for col, name in enumerate(score_names)
    }
    

# MULTI-PROJECTION Section

In [None]:
# Create the embeddings model for the multi-projection experiments from the
# embeddings matrix held by `data`.
# NOTE(review): the second argument (5) is a bare literal; it presumably has to
# agree with args['phi_k'] — confirm against multiprojection_model.get_embeddings_model.
embeddings_layer = multiprojection_model.get_embeddings_model(data.embeddings_matrix, 5)

In [None]:
from itertools import product

# baseline model parameters; 'phi_k', 'lambda_c' and 'negative_sample_n' are
# overwritten per experiment by the grid search, the rest stay fixed
args.update({
    'data':              data,
    'embeddings_layer':  embeddings_layer,
    'epochs':            10,
    'batch_size':        32,
    'synonym_sample_n':  5,
    'phi_k':             5,
    'lambda_c':          0.,
    'negative_sample_n': 10,
    'save_path':         'glove_multiproj.npz',
    'patience':          2,
    'eval_after_epoch':  True,
})

# generate parameter combinations
_clusters = [10, 5, 1]
_lambda_c = [0, 0.1, 1]
_neg_count = [10, 5, 1]

parameters = [_clusters, _lambda_c, _neg_count]

# every (clusters, lambda_c, neg_count) combination, first axis varying slowest
param_list = list(product(_clusters, _lambda_c, _neg_count))

# initialise hypernymy discovery model which we will reuse by resetting the model with new args
hyp_model = multiprojection_model.MultiProjModel(args)


In [None]:
# initialise final_scores dictionary: parameter combination -> score name -> per-fold scores
final_scores = {combo: defaultdict(list) for combo in param_list}
n_experiments = len(param_list)

for run_no, _param in enumerate(param_list, start=1):
    phi_k, lambda_c, neg_count = _param
    print ("Running test with following parameters: phi_k: %d; lambda_c: %0.2f; neg_count: %d"
           % (phi_k, lambda_c, neg_count))

    args.update(phi_k=phi_k, lambda_c=lambda_c, negative_sample_n=neg_count)

    # iterate over every split to get score distribution
    for fold_train, fold_test in zip(train_data_split, test_data_split):
        # the model is reset with the current args before every fold
        hyp_model.reset_model(args=args)

        fold_scores = train_and_evaluate_1_fold(hyp_model, fold_train, fold_test)
        for metric, value in fold_scores.items():
            final_scores[_param][metric].append(value)
    print ("")
    print ("Finished %d from %d experiments" % (run_no, n_experiments))
    print ("-"*30)

In [None]:
# raw view of the collected results: parameter combination -> score name -> list of per-fold scores
final_scores

In [None]:
# Print one CSV row per (parameter combination, score name):
#   clusters, lambda_c, neg_count, score name, then one score per fold.
# The per-fold columns are joined dynamically so the row adapts to the number
# of folds instead of assuming exactly five.
for combo, metric_scores in final_scores.items():
    cl_size = combo[0]
    lam = combo[1]
    neg = combo[2]
    for metric, fold_scores in metric_scores.items():
        per_fold = ",".join("%0.5f" % score for score in fold_scores)
        print ("%d,%0.1f,%d,%s,%s" % (cl_size, lam, neg, metric, per_fold))
    

In [None]:
# Re-fit the current hyp_model on the first fold after setting its epochs attribute to 5.
# NOTE(review): assumes fit() reads hyp_model.epochs at call time — confirm.
hyp_model.epochs=5
hyp_model.fit(train_data_split[0], test_data_split[0])

In [None]:
# Inspect the model weights, skipping the first two weight arrays.
# NOTE(review): the first two arrays are presumably the embedding weights — confirm.
hyp_model.model.get_weights()[2:]


In [None]:
# NOTE(review): presumably restores the weights saved under args['save_path'] — confirm
# against multiprojection_model.
hyp_model.load_model()

# YAMANE Section

In [None]:
def yam_train_and_evaluate_1_fold(yam, train_split, test_split):
    """Fit the ensemble ``yam`` on one fold and return its averaged evaluation scores.

    yam         -- ensemble object exposing ``fit`` and ``evaluator``
    train_split -- tokenised training tuples of the fold
    test_split  -- tokenised test tuples of the fold
    Returns a dict mapping each score name reported by the scorer to the mean
    of that score over all scored entries, rounded to 5 decimals.
    Relies on the notebook-level ``data`` object and the ``semeval_eval`` module.
    """
    # fit model; the test split is only used to measure the test loss
    yam.fit(train_split, test_split)

    # this step should not be required since the model is dynamically linked to the evaluator
    yam.evaluator.set_ensemble(yam)

    # predictions of the trained ensemble for the held-out tuples
    predictions = yam.evaluator.predict(test_split)

    # the scorer works on words, so the tokens are converted back first
    gold_tuples = data.token_to_words(test_split)
    scorer = semeval_eval.HypernymEvaluation(gold_tuples)
    score_names, all_scores = scorer.get_evaluation_scores(predictions)

    # column-wise mean of the score rows (MRR, MAP, ...), rounded to 5 decimals
    return {
        name: float('%.5f' % (sum(row[col] for row in all_scores) / len(all_scores)))
        for col, name in enumerate(score_names)
    }
    

In [None]:
# Create the embeddings model for the Yamane experiments from the same embeddings
# matrix; kept under a separate name so the multi-projection layer is not overwritten.
embeddings_layer2 = yamane_model.get_embeddings_model(data.embeddings_matrix)

In [None]:
from itertools import product

# initialise Yamane Model
# note: this rebinds `args`, so the multi-projection settings are no longer in it
args = {
    'data': data,
    'embeddings_layer': embeddings_layer2,
    'lr': 0.001,
    'lambda_c': 0.16,
    'negative_sample_n': 5,
    'epochs': 10,
    'save_path': 'glove_yamane_016.npz',
    'patience': 2,
}

# generate parameter combinations (a single combination at present)
_lambda_c = [0.16]
_neg_count = [5]

parameters = [_lambda_c, _neg_count]
param_list = list(product(_lambda_c, _neg_count))

yummy = yamane_model.YamaneEnsemble(args)


In [None]:
# initialise final_scores dictionary: parameter combination -> score name -> per-fold scores
final_scores = {combo: defaultdict(list) for combo in param_list}
n_experiments = len(param_list)

for run_no, _param in enumerate(param_list, start=1):
    lambda_c, neg_count = _param
    print ("Running test with following parameters: lambda_c: %0.2f; neg_count: %d"
           % (lambda_c, neg_count))

    args.update(lambda_c=lambda_c, negative_sample_n=neg_count)

    # iterate over every split to get score distribution
    for fold_train, fold_test in zip(train_data_split, test_data_split):
        # the ensemble is reset with the current args before every fold
        yummy.reset_ensemble(args=args)

        fold_scores = yam_train_and_evaluate_1_fold(yummy, fold_train, fold_test)
        for metric, value in fold_scores.items():
            final_scores[_param][metric].append(value)
    print ("")
    print ("Finished %d from %d experiments" % (run_no, n_experiments))
    print ("-"*30)

In [None]:
# Count how many entries of yummy.sample_clusters carry each cluster id.
# (A bare `yummy.sample_clusters` line used to precede this; only the last
# expression of a cell is displayed, so it had no visible effect.)
Counter(yummy.sample_clusters)

In [None]:
# nicer output of final scores: one CSV row per (parameter combination, score name)
#   lambda_c, neg_count, score name, then one score per fold.
# The per-fold columns are joined dynamically so the row adapts to the number
# of folds instead of assuming exactly five.
for combo, metric_scores in final_scores.items():
    lam = combo[0]
    neg = combo[1]
    for metric, fold_scores in metric_scores.items():
        per_fold = ",".join("%0.5f" % score for score in fold_scores)
        print ("%0.2f,%d,%s,%s" % (lam, neg, metric, per_fold))

# get mean scores: average of each score over the folds
for metric_scores in final_scores.values():
    for metric, fold_scores in metric_scores.items():
        print ("%s: %0.5f" % (metric, np.mean(fold_scores)))

In [None]:
# peek into each cluster: print at most the first 30 tuples assigned to it
# NOTE(review): indexes train_data_split[4], i.e. assumes yummy.sample_clusters
# refers to the last fold that was trained — confirm.
n_clusters = len(Counter(yummy.sample_clusters))
for c in range(n_clusters):
    print ("Cluster %d" % (c))
    member_rows = np.where(yummy.sample_clusters == c)[0]
    for i in member_rows[:30]:
        print (data.tokenizer.sequences_to_texts([train_data_split[4][i]]))
    print ("-" * 30)

In [None]:
# get hypernym representation by total in cluster
def get_hypernym_rep_in_cluster(fold, clusters, c, top_running_perc):
    """List the most frequent second-column words of cluster ``c``.

    fold             -- 2-D array of token pairs; column 1 is looked up in
                        ``data.tokenizer.index_word``
    clusters         -- array of cluster ids, compared element-wise with ``c``
    c                -- cluster id to report on
    top_running_perc -- stop once the accumulated share exceeds this value
    Returns a list of ``(word, count, unique_words_in_cluster, share)`` tuples,
    ordered by descending count (ties broken by descending word); the entry
    that pushes the accumulated share over the threshold is still included.
    """
    member_rows = np.where(clusters == c)
    words = [data.tokenizer.index_word[token] for token in fold[:, 1][member_rows]]

    # group hypernyms and count instances
    hyper_freq = Counter(words)
    total_pairs = sum(hyper_freq.values())
    total_uniq_hypers = len(hyper_freq)

    ranked = sorted(hyper_freq.items(), key=lambda item: (item[1], item[0]), reverse=True)

    result = []
    running_total = 0.
    for word, count in ranked:
        perc_total = round(count / (1. * total_pairs), 5)
        running_total += perc_total
        result.append((word, count, total_uniq_hypers, perc_total))
        if running_total > top_running_perc:
            break
    return result

# for every cluster, print word, count, number of unique words and share as CSV,
# using a running-share threshold of 0.4
cluster_ids = range(len(Counter(yummy.sample_clusters)))
for c in cluster_ids:
    print ("Cluster %d" % (c))
    top_hypernyms = get_hypernym_rep_in_cluster(train_data_split[4], yummy.sample_clusters, c, 0.4)
    for row in top_hypernyms:
        # row is the 4-tuple (word, count, unique words, share)
        print ("%s,%d,%d,%0.5f" % row)
    print ("-"*30)

# Scratch Pad

In [None]:
# Re-import the project modules after editing their source files.
# Objects created before the reload (data, hyp_model, yummy) keep using the
# class definitions they were created from; re-create them to pick up changes.
reload(crim_data)
reload(multiprojection_model)
reload(crim_evaluator)
reload(yamane_model)
reload(yamane_evaluator)

In [None]:
predictions

In [None]:
# get queries from tuples
#hyp_model.evaluator.predict_word('mare')
predictions = hyp_model.evaluator.predict(test_data_split[0])

In [None]:
# score the current `predictions` against the first test fold
test_tuples = data.token_to_words(test_data_split[0])
scorer = semeval_eval.HypernymEvaluation(test_tuples)

# get scores
score_names, all_scores = scorer.get_evaluation_scores(predictions)

# mean of every score column, rounded to 5 decimals
scores = {
    name: float('%.5f' % (sum(row[col] for row in all_scores) / len(all_scores)))
    for col, name in enumerate(score_names)
}

scores

In [None]:
from itertools import product
# create score dictionary
# WARNING: this rebinds param_list and final_scores, discarding any results
# collected in final_scores so far
_clusters = [1, 5, 10]
_lambda_c = [0, 0.1, 1]
_neg_count = [1, 5, 10]

parameters = [_clusters, _lambda_c, _neg_count]

param_list = list(product(_clusters, _lambda_c, _neg_count))
final_scores = {combo: defaultdict(list) for combo in param_list}

In [None]:
# evaluate the current Yamane ensemble on the last test fold without re-training
predictions = yummy.evaluator.predict(test_data_split[4])
test_tuples = data.token_to_words(test_data_split[4])
scorer = semeval_eval.HypernymEvaluation(test_tuples)
# get scores
score_names, all_scores = scorer.get_evaluation_scores(predictions)
# mean of every score column (MRR, MAP, ...), rounded to 5 decimals
scores = {
    name: float('%.5f' % (sum(row[col] for row in all_scores) / len(all_scores)))
    for col, name in enumerate(score_names)
}

scores