In [1]:
from gensim.models.keyedvectors import KeyedVectors
import numpy as np

import io
import os
import unicodecsv as csv
import codecs
import argparse
import random

from collections import Counter
from collections import defaultdict

# very useful feature used to reload python modules
from importlib import reload

# import module that loads data, tokenises the tuples, initialises the embeddings matrix
import crim_data
import semeval_data

import multiprojection_model
# contains code to evaluate according to semeval2018 metrics
import semeval_eval
import crim_evaluator


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


# SemEval 2018, Task 9 Preamble

In [2]:
# initialise embeddings and normalise to unit-norm
# The active line loads the vectors from a word2vec-format text file (binary=False).
# The two commented-out loaders are alternative embedding files kept for reference.
#model = KeyedVectors.load_word2vec_format('embeddings/GoogleNews-vectors-negative300.bin', binary=True)
#model = KeyedVectors.load_word2vec_format('embeddings/glove.42B.300d.txt', binary=False)
model = KeyedVectors.load_word2vec_format('embeddings/fast_umbc_5ng_8w_300d.vec', binary=False)

#model.save_word2vec_format('embeddings/GoogleNews-vectors-negative300.txt', binary=False)
# NOTE(review): `init_sims` is reported as deprecated in gensim >= 4.0 - confirm against the
# installed gensim version before re-running on a fresh environment.
model.init_sims(replace=True)

In [3]:
# ignore words which don't feature in embeddings model
def read_subsumptions(filenames, w2v, word_type='Both'):
    """Read aligned query/gold files and return the pairs covered by the embeddings.

    Parameters
    ----------
    filenames : tuple of (str, str)
        ``(data_file, gold_file)``.  Every line of the data file is ``query<TAB>category``;
        the line at the same position in the gold file holds the tab-separated hypernyms
        of that query.
    w2v : container supporting ``in``
        Embedding vocabulary.  Queries and hypernyms that are not in it are dropped.
    word_type : str
        Keep only queries whose category equals ``word_type``; 'Both' keeps every category.

    Returns
    -------
    (is_concept, subsumptions)
        ``subsumptions`` is a list of ``(query, hypernym)`` tuples (lower-cased, spaces
        replaced by underscores).  ``is_concept`` is a parallel list holding 0 when the
        query's category is 'Entity' and 1 otherwise.
    """
    hypo, hyper = filenames

    subsumptions = []
    is_concept = []

    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the platform
    # locale, which can mangle accented terms on machines that are not UTF-8 by default.
    with open(hypo, mode='r', encoding='utf-8') as f_hypo, \
            open(hyper, mode='r', encoding='utf-8') as f_hyper:
        for x, y in zip(f_hypo, f_hyper):
            query, category = x.strip().split("\t")
            query = query.replace(" ", "_").lower()
            y = y.strip()
            # check that we have embeddings for query word
            if query in w2v and (word_type == category or word_type == 'Both'):
                for h in y.split("\t"):
                    h = h.replace(" ", "_").lower()
                    if h in w2v:
                        subsumptions.append((query, h))
                        is_concept.append(0 if category == 'Entity' else 1)

    return is_concept, subsumptions
                    
# ignore vocab entries that have no corresponding embeddings
def read_vocab(filename, w2v=None):
    """Read a vocabulary file (one term per line) and normalise every term.

    Each term is stripped, its spaces are replaced by underscores and it is lower-cased.

    Parameters
    ----------
    filename : str
        Path of the vocabulary file.
    w2v : container supporting ``in``, optional
        Embedding vocabulary.  When given, only terms found in it are returned.  When
        omitted (``None``) every term is returned, which allows the unfiltered vocabulary
        to be compared with the filtered one.

    Returns
    -------
    list of str
        The normalised terms, in file order.
    """
    # Decode explicitly as UTF-8 so accented terms do not depend on the platform locale.
    with open(filename, mode='r', encoding='utf-8') as f:
        words = (line.strip().replace(" ", "_").lower() for line in f)
        # `is None` (not truthiness) so that an empty or exotic container is still honoured
        return [word for word in words if w2v is None or word in w2v]

In [4]:
# import SemEval data
split_names = ['trial', 'test', 'training']
data_file_names = ['1A.english.%s.data.txt' % name for name in split_names]
gold_file_names = ['1A.english.%s.gold.txt' % name for name in split_names]
vocab_file_name = '1A.english.vocabulary.txt'

# pair every data file with its gold file: 0 = validation; 1 = test; 2 = training
file_names = list(zip(data_file_names, gold_file_names))
validation_files, test_files, training_files = file_names

# one dictionary per dataset, keyed by word type (concepts only, entities only, or both)
validation, test, training = {}, {}, {}
word_types = ['Concept', 'Entity','Both']
for word_type in word_types:
    validation[word_type] = read_subsumptions(validation_files, model, word_type)[1]
    test[word_type] = read_subsumptions(test_files, model, word_type)[1]
    training_flags, training[word_type] = read_subsumptions(training_files, model, word_type)
    # the concept/entity flags are only kept for the complete training set
    if word_type == 'Both':
        is_concept = training_flags

vocabulary = read_vocab(vocab_file_name, model)

# create hypernym dictionary: query term -> every hypernym seen in any of the three datasets
hyper_dict = defaultdict(list)
for hyponym, hypernym in validation['Both'] + test['Both'] + training['Both']:
    hyper_dict[hyponym].append(hypernym)

# from here on, looking up an unknown term raises KeyError instead of creating an empty entry
hyper_dict.default_factory = None

# print some quick stats
for word_type in word_types:
    splits = (validation[word_type], test[word_type], training[word_type])
    print (word_type)
    print ("Tuples in validation set: %d; tuples in test set: %d; tuples in training: %d"
           % tuple(len(pairs) for pairs in splits))

    print ("Unique hyponyms in validation set: %d; hyponyms in test set: %d; hyponyms in training: %d"
           % tuple(len({pair[0] for pair in pairs}) for pairs in splits))
    print ("-"*30)

print ("Vocab size: %d" % len(vocabulary))

# 647 terms were missing from the model vocab due to them appearing only once in the corpus.
# According to the technical paper, only words appearing at least 5 times were considered for vocab 
# from within the general-purpose corpus

# Also, the word épée is found without accents in the model; the reason is unknown. We ignore this single
# word since it should not make a difference in the scheme of things.

Concept
Tuples in validation set: 112; tuples in test set: 4935; tuples in training: 7182
Unique hyponyms in validation set: 30; hyponyms in test set: 1057; hyponyms in training: 978
------------------------------
Entity
Tuples in validation set: 88; tuples in test set: 2112; tuples in training: 4595
Unique hyponyms in validation set: 20; hyponyms in test set: 443; hyponyms in training: 521
------------------------------
Both
Tuples in validation set: 200; tuples in test set: 7047; tuples in training: 11777
Unique hyponyms in validation set: 50; hyponyms in test set: 1499; hyponyms in training: 1499
------------------------------
Vocab size: 218106


In [5]:
# bundle the embeddings, the three datasets, the vocabulary and the concept flags;
# the same dictionary is passed to semeval_data.SemevalData here and extended later on
args = dict(
    w2v=model,
    train=training,
    test=test,
    validation=validation,
    vocabulary=vocabulary,
    is_concept=is_concept,
)

data = semeval_data.SemevalData(args)



Initialising SemevalData...
Creating tokenizer
Dataset vocabulary size is 219034
Vocab size is 219034 words
Initialising negative sampler
Tokenising all dataset tuples
Creating embeddings matrix
Done!


In [6]:
def train_and_evaluate(hyp_model, train_split, test_split):
    """Fit `hyp_model` on `train_split` and score its predictions for `test_split`.

    Relies on the notebook-level `data` object (to turn tokens back into words) and on
    the `semeval_eval` module (to compute the evaluation scores).

    Returns a dictionary mapping every score name reported by the scorer to the mean of
    that score over all rows of scores, kept to 5 decimal places.
    """
    # the test split is only used to measure the test loss while fitting
    hyp_model.fit(train_split, test_split)
    # this step should not be required since the model is dynamically linked to the evaluator
    hyp_model.evaluator.set_model(hyp_model.model)

    # generate predictions with the trained model
    predictions = hyp_model.evaluator.predict(test_split)

    # the scorer works on words, so convert the tokens of this split back first
    scorer = semeval_eval.HypernymEvaluation(data.token_to_words(test_split))
    score_names, all_scores = scorer.get_evaluation_scores(predictions)

    # average each score column (MRR, MAP, ...) over all rows
    n_rows = len(all_scores)
    return {
        name: float('%.5f' % (sum(row[col] for row in all_scores) / n_rows))
        for col, name in enumerate(score_names)
    }
    

# MULTI-PROJECTION Section

In [7]:
embeddings_layer = multiprojection_model.get_embeddings_model(data.embeddings_matrix, 1)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Colocations handled automatically by placer.


In [26]:
from itertools import product

# standard model parameters - we won't be changing these
# (phi_k, lambda_c and negative_sample_n are overwritten per experiment by the grid below)
args.update({
    'data':              data,
    'embeddings_layer':  embeddings_layer,
    'epochs':            15,
    'batch_size':        32,
    'synonym_sample_n':  1,
    'phi_k':             1,
    'lambda_c':          0.,
    'negative_sample_n': 10,
    'save_path':         'ft_semeval.npz',
    'patience':          5,
    'eval_after_epoch':  True,
    'lr':                0.001,
    'beta1':             0.9,
    'beta2':             0.9,
    'clip_value':        1.,
})

# generate parameter combinations; the commented-out lists are wider grids kept for reference
#_clusters = [10, 5, 1]
_clusters = [20]
#_lambda_c = [0, 0.1, 1]
_lambda_c = [0.1]
#_neg_count = [10, 5, 1]
_neg_count = [10]

parameters = [_clusters, _lambda_c, _neg_count]

# every (clusters, lambda_c, neg_count) combination of the lists above
param_list = list(product(_clusters, _lambda_c, _neg_count))

# initialise hypernymy discovery model which we will reuse by resetting the model with new args
hyp_model = multiprojection_model.MultiProjModel(args)



In [27]:
# one list of values per score name for every parameter combination
final_scores = {combo: defaultdict(list) for combo in param_list}

n_experiments = len(param_list)
for run_no, _param in enumerate(param_list, start=1):
    n_clusters, lambda_c, neg_count = _param
    print ("Running test with following parameters: phi_k: %d; lambda_c: %0.2f; neg_count: %d" \
           % (n_clusters, lambda_c, neg_count))

    args['phi_k'] = n_clusters
    args['lambda_c'] = lambda_c
    args['negative_sample_n'] = neg_count

    # reset the shared model so that it picks up the current parameter combination
    hyp_model.reset_model(args=args)

    # train on the full training set, evaluate on the validation set
    scores = train_and_evaluate(hyp_model,
                                data.train_data_token['Both'],
                                data.valid_data_token['Both'])
    for score_name, value in scores.items():
        final_scores[_param][score_name].append(value)

    print ("")
    print ("Finished %d from %d experiments" % (run_no, n_experiments))
    print ("-"*30)

Running test with following parameters: phi_k: 20; lambda_c: 0.10; neg_count: 10
Fitting model with following parameters:
 batch_size=32;
 phi_k=20;
 lambda_c=0.10;
 epochs=15;
 negative_count=10;
 synonym_count=1
Optimizer parameters:
 lr=0.00100;
 beta1=0.900;
 beta2=0.900;
 clip=1.00
--------------------
Epoch: 1; Training Loss: 0.35098; Test Loss: 0.20820; Test MAP: 0.05572; Test MRR: 0.11667
Epoch: 2; Training Loss: 0.18792; Test Loss: 0.13223; Test MAP: 0.11574; Test MRR: 0.24200
Epoch: 3; Training Loss: 0.13827; Test Loss: 0.11501; Test MAP: 0.12083; Test MRR: 0.20922
Epoch: 4; Training Loss: 0.11500; Test Loss: 0.12621; Test MAP: 0.12610; Test MRR: 0.23233
Epoch: 5; Training Loss: 0.10265; Test Loss: 0.13158; Test MAP: 0.10870; Test MRR: 0.22667
Epoch: 6; Training Loss: 0.09542; Test Loss: 0.12986; Test MAP: 0.09583; Test MRR: 0.17719
Epoch: 7; Training Loss: 0.09169; Test Loss: 0.14730; Test MAP: 0.13086; Test MRR: 0.24300
Epoch: 8; Training Loss: 0.08821; Test Loss: 0.12704; 

In [28]:
final_scores

{(20, 0.1, 10): defaultdict(list,
             {'MRR': [0.243],
              'MAP': [0.13086],
              'P@1': [0.18],
              'P@5': [0.13067],
              'P@10': [0.12043]})}

In [None]:
# Print one CSV row per (parameter combination, score name):
#   clusters,lambda_c,negatives,score_name,value_1[,value_2,...]
for (cl_size, lam, neg), scores_by_name in final_scores.items():
    for score_name, values in scores_by_name.items():
        # The experiment loop appends one value per run, so the list length depends on how
        # many runs were made; format whatever is there instead of indexing exactly five
        # entries (which raises IndexError when fewer runs were recorded).
        formatted_values = ",".join("%0.5f" % value for value in values)
        print ("%d,%0.1f,%d,%s,%s" % (cl_size, lam, neg, score_name, formatted_values))
    

In [None]:
#hyp_model.load_model()
#weights = np.load(hyp_model.save_path)

In [29]:
predictions = hyp_model.evaluator.predict(data.test_data_token['Both'])

Done 100
Done 200
Done 300
Done 400
Done 500
Done 600
Done 700
Done 800
Done 900
Done 1000
Done 1100
Done 1200
Done 1300
Done 1400


In [30]:
# score the test-set predictions in the usual way
test_tuples = data.token_to_words(data.test_data_token['Both'])
scorer = semeval_eval.HypernymEvaluation(test_tuples)
score_names, all_scores = scorer.get_evaluation_scores(predictions)

# average each score column (MRR, MAP, ...) over all rows, kept to 5 decimal places
n_score_rows = len(all_scores)
scores = {
    name: float('%.5f' % (sum(row[col] for row in all_scores) / n_score_rows))
    for col, name in enumerate(score_names)
}


In [31]:
scores

{'MRR': 0.25797,
 'MAP': 0.12555,
 'P@1': 0.20013,
 'P@5': 0.12258,
 'P@10': 0.11428}

In [25]:
scores

{'MRR': 0.25982,
 'MAP': 0.12531,
 'P@1': 0.20814,
 'P@5': 0.11955,
 'P@10': 0.11424}

In [19]:
scores

{'MRR': 0.2738,
 'MAP': 0.13202,
 'P@1': 0.21948,
 'P@5': 0.12572,
 'P@10': 0.11991}

{'20th_century': ['historical_period',
  'time_period',
  'time',
  'course_of_study',
  'inition',
  'musical_performance',
  'musical_work',
  'course_of_instruction',
  'historical_region',
  'literary_work',
  'musical_time',
  'context',
  'literary_movement',
  'determent',
  'piece_of_music'],
 '25th_hour': ['movie',
  'motion_picture',
  'television_program',
  'television_production',
  'film_production',
  'feature_film',
  'television',
  'film',
  'motion-picture_film',
  'television_system',
  'musical_film',
  'broadcasting',
  'video_game',
  'movie_industry',
  'television_studio'],
 'aac': ['software_application',
  'communication_software',
  'application_program',
  'computing_system',
  'application_software',
  'computer_language',
  'applications_software',
  'computer_system',
  'software_program',
  'software',
  'personal_computer',
  'software_package',
  'programing_language',
  'data_format',
  'data_input_device'],
 'aare': ['channel',
  'conduit',
  'desti

In [13]:
scores

{'MRR': 0.2379,
 'MAP': 0.11526,
 'P@1': 0.19213,
 'P@5': 0.10909,
 'P@10': 0.10434}

In [None]:
# Regression plot of the per-query score against the median frequency of its gold hypernyms.
# The frame returned by get_score_freq_matrix names its score column 'AP' (it has no 'MAP'
# column), so 'AP' is what must be plotted here.
# NOTE(review): `score_freq` is only assigned in a later cell of this notebook, so this cell
# cannot run in top-to-bottom order - consider moving it below that cell.
plt.figure(figsize=(12,8))
ax = sns.regplot(x="Median Freq", y="AP", data=score_freq, x_jitter=0.02, y_jitter=0.01);
ax.set(ylim=(-0.05, 1.05))

# Train model 

# Analysis of Prediction quality vs hypernym freq in training set

In [None]:
# Frequency analysis of all hypernyms in a dataset (intended for the training set): query terms
# do not feature in the test set, but test terms will certainly be related to hypernyms found
# in the training set.
def get_hypernym_freq(dataset):
    """Return the relative frequency of every hypernym in `dataset`.

    `dataset` is converted to (query, hypernym) word pairs with the notebook-level
    `data.token_to_words`.  The result maps each hypernym to its number of occurrences
    divided by the total number of pairs, rounded to 5 decimal places.
    """
    hypernym_counts = Counter(hypernym for _, hypernym in data.token_to_words(dataset))
    # total number of hypernym occurrences (i.e. of pairs), not the number of distinct hypernyms
    total_occurrences = sum(hypernym_counts.values())
    return {
        word: round(count / total_occurrences, 5)
        for word, count in hypernym_counts.items()
    }

In [None]:
np.sum(list(hyper_freq.values()))

In [None]:
# NOTE(review): neither `yummy` nor `test_data_split` is defined in any cell visible in this
# notebook; this cell presumably relies on state left over from an earlier session - confirm
# (and add the defining cells) before re-running on a fresh kernel.
predictions = yummy.evaluator.predict(test_data_split[4])

In [None]:
predictions

In [None]:
# score the predictions for this split in the usual way
test_tuples = data.token_to_words(test_data_split[4])
scorer = semeval_eval.HypernymEvaluation(test_tuples)
score_names, all_scores = scorer.get_evaluation_scores(predictions)

# average each score column (MRR, MAP, ...) over all rows, kept to 5 decimal places
n_score_rows = len(all_scores)
scores = {
    name: float('%.5f' % (sum(row[col] for row in all_scores) / n_score_rows))
    for col, name in enumerate(score_names)
}

# ground truth hypernyms for the test split of interest: query term -> list of gold hypernyms
ground_truth = defaultdict(list)
for query, hypernym in test_tuples:
    ground_truth[query].append(hypernym)
# looking up a query without gold hypernyms now raises KeyError instead of adding an entry
ground_truth.default_factory = None

In [None]:
# inspect a single entry of the predictions dictionary (the item at position 34 in dict order)
list(predictions.items())[34]

In [None]:
def get_score_freq_matrix(test_data, predictions, hyper_freq, jitter=False):
    """Build a per-query table of AP score against median gold-hypernym frequency.

    Parameters
    ----------
    test_data
        Tokenised tuples of the split that `predictions` was generated for; converted back
        to (query, hypernym) word pairs with the notebook-level `data.token_to_words`.
    predictions : dict
        Maps every query word to its predicted hypernyms.
    hyper_freq : dict
        Maps a hypernym to its frequency; gold hypernyms missing from it count as 0.
    jitter : bool
        When True, Gaussian noise (mean 0, standard deviation 0.01) is added to both numeric
        columns to make overlapping points easier to tell apart in a scatter plot.

    Returns
    -------
    pandas.DataFrame
        Columns 'AP', 'Median Freq' and 'word', one row per query word, in sorted word order.
    """
    # we score as per the usual way the predictions
    test_tuples = data.token_to_words(test_data)
    scorer = semeval_eval.HypernymEvaluation(test_tuples)
    _, all_scores = scorer.get_evaluation_scores(predictions)

    # dictionary of ground truth hypernyms for the test split of interest
    ground_truth = defaultdict(list)
    for query, hypernym in test_tuples:
        ground_truth[query].append(hypernym)
    # a query without gold hypernyms raises KeyError below instead of silently scoring 0
    ground_truth.default_factory = None

    # retain only the second score column of every row
    # NOTE(review): column 1 is assumed to be the (M)AP score - confirm against semeval_eval
    all_map = np.round(np.asarray(all_scores)[:, 1], 3)

    # Sort the query words explicitly to process them in the same order as the evaluator.
    # NOTE(review): this assumes the evaluator emits its score rows in sorted query order -
    # confirm against semeval_eval.
    query_words = sorted(predictions.keys())

    # one row per query: [AP score, median frequency of its gold hypernyms]; hypernyms that
    # are absent from `hyper_freq` are assigned a frequency of 0
    score_freq_matrix = np.zeros((all_map.shape[0], 2))
    for idx, w in enumerate(query_words):
        score_freq_matrix[idx][0] = all_map[idx]
        gold = ground_truth[w]
        score_freq_matrix[idx][1] = np.median([hyper_freq.get(g, 0.) for g in gold])

    if jitter:
        # add some jitter to the signal to make it easier to interpret in the scatterplot;
        # the noise has the same shape as the score/frequency matrix
        mu, sigma = 0, 0.01
        noise = np.random.normal(mu, sigma, (all_map.shape[0], 2))
        score_freq_matrix = score_freq_matrix + noise

    score_freq = pd.DataFrame(score_freq_matrix, columns=['AP', 'Median Freq'])
    # Attach the query words in the SAME sorted order used to fill the rows above; using the
    # unsorted dictionary order here could pair a word with another word's scores.
    score_freq = score_freq.assign(word=pd.Series(query_words).values)
    return score_freq

In [None]:
#score_freq.loc[score_freq.MAP == 1.,].sort_values('Median Freq', ascending=False)

# prediction contains the generated hypernyms for YAMANE on the 5th fold of the training_data
score_freq_yam=get_score_freq_matrix(test_data_split[4], predictions, hyper_freq)

# prediction_2 contains the generated hypernyms for CRIM on the 5th fold of the training data
score_freq=get_score_freq_matrix(test_data_split[4], predictions_2, hyper_freq)

In [None]:
#sns.relplot(x="Median Freq", y="MAP", data=score_freq);
plt.figure(figsize=(12,8))
ax = sns.regplot(x="Median Freq", y="AP", data=score_freq_yam, x_jitter=0.01, y_jitter=0.005, marker="x");
ax.set(ylim=(-0.05, 1.05))

In [None]:
plt.figure(figsize=(12,8))
ax = sns.regplot(x="Median Freq", y="AP", data=score_freq, x_jitter=0.01, y_jitter=0.005);
ax.set(ylim=(-0.05, 1.05))


In [None]:
top_yummy_wrong

In [None]:
# analyze the top-ranked predicted word for query terms scoring an AP of 0
# (the frames built by get_score_freq_matrix name their score column 'AP'; they have no
# 'MAP' column, so filtering on `.MAP` raises AttributeError)
yummy_wrong_terms = score_freq_yam.loc[score_freq_yam.AP==0, 'word'].tolist()
crim_wrong_terms  = score_freq.loc[score_freq.AP==0, 'word'].tolist()

# first-ranked prediction for every such term
top_crim_wrong = [predictions_2[w][0] for w in crim_wrong_terms]
top_yummy_wrong = [predictions[w][0] for w in yummy_wrong_terms]

# keep the 15 most frequent of those words as (count, word) pairs, highest count first
top_crim_wrong =  sorted([(v,k) for k, v in Counter(top_crim_wrong).items()], reverse=True)[:15]
top_yummy_wrong =  sorted([(v,k) for k, v in Counter(top_yummy_wrong).items()], reverse=True)[:15]

# multiply words according to frequency: each word is repeated `count` times
yummy_wrong_flat = [word for count, word in top_yummy_wrong for _ in range(count)]
crim_wrong_flat = [word for count, word in top_crim_wrong for _ in range(count)]

# (model name, word) rows, CRIM first and Yamane second
combined_wrong_list = [('CRIM', word) for word in crim_wrong_flat]
combined_wrong_list.extend(('Yamane', word) for word in yummy_wrong_flat)

In [None]:
incorrect_df =  pd.DataFrame(combined_wrong_list, columns=['Model', 'Highest Ranked Incorrect Word'])

In [None]:
incorrect_df.loc[incorrect_df.Model == 'CRIM',]

In [None]:
plt.figure(figsize=(12,8))
g = sns.countplot(x="Highest Ranked Incorrect Word", 
                  palette=sns.cubehelix_palette(15, start=2, rot=0.35, dark=0.47, light=0.85, reverse=True), 
                  data=incorrect_df.loc[incorrect_df.Model == 'CRIM',])
g.set_xticklabels(g.get_xticklabels(), rotation=30)

In [None]:
for w in sorted([(v, k)for k, v in hyper_freq.items()], reverse=True)[:10]:
    print (w[1], w[0])
    
hyper_freq['action']

In [None]:
plt.figure(figsize=(12,8))
g = sns.countplot(x="Highest Ranked Incorrect Word", 
                  palette=sns.cubehelix_palette(15, start=2, rot=0.35, dark=0.47, light=0.85, reverse=True), 
                  data=incorrect_df.loc[incorrect_df.Model == 'Yamane',])
g.set_xticklabels(g.get_xticklabels(), rotation=30)

In [None]:
for w in sorted([(v, k)for k, v in hyper_freq.items()], reverse=True)[:10]:
    print (w[1], w[0])

In [None]:
# least frequent hypernyms (at most 10 are printed)
# The minimum is computed once up front; evaluating min() inside the filter would rescan the
# whole dictionary for every entry.  `default=None` keeps an empty dictionary a no-op.
min_freq = min(hyper_freq.values(), default=None)
for word, freq in [(k, v) for k, v in hyper_freq.items() if v == min_freq][:10]:
    print (word, freq)

In [None]:
# let's see good scores low frequency: rows with an AP of at least 1.0 whose median
# gold-hypernym frequency is below 0.005
perfect_and_rare = (score_freq.AP >= 1.0) & (score_freq['Median Freq'] < 0.005)
good_words = score_freq.loc[perfect_and_rare, 'word'].tolist()
score_freq.loc[perfect_and_rare, :]

In [None]:
for w in good_words:
    print ("%s: %s" % (w, ", ".join(predictions_2[w])))
    print ("%s: %s" % (w, ", ".join(hyper_dict[w])))
    print ("-"*30)
    
#print (hyper_dict['intercourse'])
#predictions_2['intercourse']


# Fitting linear regression model on median freq , MAP data

In [None]:
from sklearn.linear_model import LinearRegression

slr = LinearRegression()
yummy_array =  score_freq_yam.loc[:,['Median Freq', 'AP']].values


slr.fit(yummy_array[:,0].reshape(-1,1), yummy_array[:,1])
print (slr.coef_[0], slr.intercept_)
print (slr.score(yummy_array[:,0].reshape(-1,1), yummy_array[:,1]))

In [None]:
crim_array =  score_freq.loc[:,['Median Freq', 'AP']].values


slr.fit(crim_array[:,0].reshape(-1,1), crim_array[:,1])
print (slr.coef_[0], slr.intercept_)
print (slr.score(crim_array[:,0].reshape(-1,1), crim_array[:,1]))

# Scratch Pad

In [None]:
reload(semeval_data)
reload(multiprojection_model)
reload(crim_evaluator)

In [None]:
len(set(read_vocab(vocab_file_name)).difference(set(vocabulary)))

In [None]:
# remove vocab term having no vector in embeddings
def get_terms_having_vectors(w2v, dataset):
    """Return the (query, hypernym) pairs of `dataset` whose two words are both in `w2v`."""
    kept_pairs = []
    for query, hypernym in dataset:
        if query in w2v and hypernym in w2v:
            kept_pairs.append((query, hypernym))
    return kept_pairs

# remove any words which don't have corresponding embeddings, for every dataset and word type
for dataset in (validation, test, training):
    for w in word_types:
        dataset[w] = get_terms_having_vectors(model, dataset[w])

# same filter for the vocabulary list
vocabulary = [word for word in vocabulary if word in model]