In [2]:
import numpy as np
#from gensim.models.keyedvectors import KeyedVectors

# Evaluation routines

In [3]:
class HypernymEvaluation:
    """Ranks candidate hypernyms for query terms and scores the rankings.

    Attributes:
        dataset: pair ``(queries, hypernyms)`` of parallel sequences; entry
            ``i`` of each forms one gold (query, hypernym) pair.
        tokenizer: object exposing ``word_index`` and ``sequences_to_texts``.
        feature_extractor: model whose layers hold the term embeddings and the
            ``Phi*`` projection matrices.
        scorer: callable mapping a list with one feature matrix to a list with
            one score array (may be ``None`` for subclasses that do not use it).
    """

    # Rank cutoff shared by prediction and evaluation: predictors return the
    # best TOP_N candidates and the relevance vector has TOP_N slots.
    TOP_N = 15

    def __init__(self, dataset, tokenizer, feature_extractor, scorer):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor
        self.scorer = scorer

    def convert_hypernyms_to_one_line(self):
        """Group the gold hypernyms by query term.

        Returns:
            dict mapping each distinct query (inserted in sorted order) to the
            list of its gold hypernyms, in dataset order.
        """
        # Pre-seed every distinct query so the key order is sorted, then fill
        # the lists in a single pass instead of rescanning the dataset per query.
        one_line = {w: [] for w in sorted(set(self.dataset[0]))}
        for q, h in zip(*self.dataset):
            one_line[q].append(h)
        return one_line

    # taken from task_scorer.py provided with shared task resources
    def mean_reciprocal_rank(self, r):
        """Score is reciprocal of the rank of the first relevant item
        First element is 'rank 1'.  Relevance is binary (nonzero is relevant).
        Example from http://en.wikipedia.org/wiki/Mean_reciprocal_rank
        Args:
            r: Relevance scores (list or numpy) in rank order
                (first element is the first item)
        Returns:
            Reciprocal rank of the first relevant item, 0. if there is none
        """
        r = np.asarray(r).nonzero()[0]
        return 1. / (r[0] + 1) if r.size else 0.

    def precision_at_k(self, r, k, n):
        """Score is precision @ k, normalised by the number of gold items
        Relevance is binary (nonzero is relevant).
        Args:
            r: Relevance scores (list or numpy) in rank order
                (first element is the first item)
            k: Rank cutoff (>= 1)
            n: Number of gold items for the query; the hit count is divided
                by min(k, n) so a query with fewer than k gold items can
                still reach 1.0
        Returns:
            Precision @ k
        Raises:
            ValueError: len(r) must be >= k
        """
        assert k >= 1
        r = np.asarray(r)[:k] != 0
        if r.size != k:
            raise ValueError('Relevance score length < k')
        # np.mean(r) * k is the number of hits within the first k ranks.
        return (np.mean(r)*k)/min(k,n)

    def average_precision(self, r,n):
        """Score is the mean of precision @ k over every rank k = 1..len(r)
        Relevance is binary (nonzero is relevant).
        Unlike textbook average precision, every rank contributes, not only
        the ranks that hold a relevant item.
        Args:
            r: Relevance scores (list or numpy) in rank order
                (first element is the first item)
            n: Number of gold items for the query
        Returns:
            Average precision (0. for an empty relevance list)
        """
        r = np.asarray(r) != 0
        out = [self.precision_at_k(r, k + 1, n) for k in range(r.size)]
        if not out:
            return 0.
        return np.mean(out)

    def mean_average_precision(self, r, n):
        """Score is average precision for a single query
        Relevance is binary (nonzero is relevant).
        Args:
            r: Relevance scores (list or numpy) in rank order
                (first element is the first item)
            n: Number of gold items for the query
        Returns:
            Average precision; the mean over queries is left to the caller
        """
        return self.average_precision(r,n)

    # predictions is a dictionary whereby key is query term and value is a list of ranked hypernym predictions
    def get_evaluation_scores(self, predictions):
        """Score the ranked predictions of every query in the dataset.

        Args:
            predictions: dict mapping each query term to its ranked list of
                predicted hypernyms (best first). Only the first TOP_N
                predictions of each list are scored.
        Returns:
            Tuple ``(scores_names, all_scores)``; ``all_scores`` holds one
            ``[MRR, MAP, P@1, P@5, P@10]`` row per query, in sorted query order.
        Raises:
            KeyError: if a dataset query has no entry in ``predictions``.
        """
        scores_names = ['MRR', 'MAP', 'P@1', 'P@5', 'P@10']
        all_scores = []
        for query, gold_hyps in self.convert_hypernyms_to_one_line().items():
            gold_lookup = set(gold_hyps)
            gold_hyps_n = len(gold_hyps)

            # Fixed-length relevance vector. Every predicted rank is checked
            # even when the gold list is shorter than the prediction list, so a
            # correct hypernym at a late rank still counts. Predictions past
            # the cutoff are ignored rather than indexing out of range.
            r = [0] * self.TOP_N
            for j, pred_hyp in enumerate(predictions[query][:self.TOP_N]):
                if pred_hyp in gold_lookup:
                    r[j] = 1

            all_scores.append([
                self.mean_reciprocal_rank(r),
                self.mean_average_precision(r, gold_hyps_n),
                self.precision_at_k(r, 1, gold_hyps_n),
                self.precision_at_k(r, 5, gold_hyps_n),
                self.precision_at_k(r, 10, gold_hyps_n),
            ])
        return scores_names, all_scores

    # return predictions for user-defined list of terms
    def predict_ltr_hypernym(self, queries):
        """Rank the vocabulary as hypernym candidates for each query term.

        Reads the 'TermEmbedding_Q' / 'TermEmbedding_H' embedding layers and
        every Dense layer whose name starts with 'Phi' from the feature
        extractor, projects the query embedding through each Phi matrix,
        averages the projections and scores ``hypernym_embedding - projection``
        for every vocabulary row with ``self.scorer``.

        Args:
            queries: iterable of query terms; duplicates are collapsed.
        Returns:
            dict mapping each query to its TOP_N highest scoring terms.
        Raises:
            KeyError: if a query is missing from ``tokenizer.word_index``.
        """
        ordered_queries = sorted(set(queries))
        results = {}

        phi_matrix = np.asarray([layer.get_weights()[0]
                                 for layer in self.feature_extractor.layers
                                 if isinstance(layer, Dense) and layer.name.startswith('Phi')])
        embeddings_q = self.feature_extractor.get_layer(name='TermEmbedding_Q').get_weights()[0]
        embeddings_h = self.feature_extractor.get_layer(name='TermEmbedding_H').get_weights()[0]

        for idx, word in enumerate(ordered_queries):
            if (idx + 1) % 100 == 0:
                print ("Done", idx + 1)

            q_idx = self.tokenizer.word_index[word]
            # one projection per Phi matrix, averaged into a single vector
            word_phi = np.dot(embeddings_q[q_idx], phi_matrix)
            word_phi = np.mean(word_phi, axis=0)

            # Candidates come from the hypernym-side embeddings. Row 0 is
            # skipped, hence the +1 when mapping positions back to word ids.
            hyp_scores = self.scorer([embeddings_h[1:] - word_phi])
            top_words = np.argsort(hyp_scores[0].flatten())[::-1][:self.TOP_N] + 1
            results[word] = self.tokenizer.sequences_to_texts(top_words.reshape(-1,1))

        return results

    # return predictions for all terms initially passed to class
    def predict_ltr_hypernyms(self):
        """Rank hypernym candidates for every query term of the dataset."""
        return self.predict_ltr_hypernym(self.dataset[0])


class HypernymEvaluation_SquareDiff(HypernymEvaluation):
    """Evaluator for extractors that feed the squared difference to the scorer.

    Uses a single shared 'TermEmbedding' layer for queries and candidates.
    """

    def predict_ltr_hypernym(self, queries):
        """Return, for every distinct query, its 15 best scoring vocabulary terms.

        Each candidate is scored by ``self.scorer`` on the element-wise square
        of (candidate embedding - mean Phi projection of the query).
        """
        extractor = self.feature_extractor

        # stack the kernels of every Dense layer whose name starts with 'Phi'
        phi_matrix = np.asarray([
            layer.get_weights()[0]
            for layer in extractor.layers
            if type(layer) == Dense and layer.name.startswith('Phi')
        ])
        embeddings = extractor.get_layer(name='TermEmbedding').get_weights()[0]
        # row 0 is not treated as a candidate
        candidates = embeddings[1:]

        results = {}
        for done, word in enumerate(sorted(set(queries)), start=1):
            if done % 100 == 0:
                print ("Done", done)

            q_idx = self.tokenizer.word_index[word]
            projections = np.dot(embeddings[q_idx], phi_matrix)
            word_phi = np.mean(projections, axis=0)

            # squared vector difference, as per model
            features = (candidates - word_phi)**2
            hyp_scores = self.scorer([features])

            # best first; +1 undoes the offset from dropping row 0
            ranking = np.argsort(hyp_scores[0].flatten())[::-1]
            top_words = ranking[:15] + 1
            results[word] = self.tokenizer.sequences_to_texts(top_words.reshape(-1,1))

        return results
    
class HypernymEvaluation_MSE(HypernymEvaluation):
    """Evaluator that ranks candidates by dot product with the projected query.

    No scorer network is involved, so the base class receives ``scorer=None``.
    """

    def __init__(self, dataset, tokenizer, feature_extractor):
        HypernymEvaluation.__init__(self,
                                    dataset=dataset,
                                    tokenizer=tokenizer,
                                    feature_extractor=feature_extractor,
                                    scorer=None)

    def predict_ltr_hypernym(self, queries):
        """Return, for every distinct query, its 15 highest scoring vocabulary terms.

        The score of a candidate is the dot product between its embedding and
        the L2-normalised mean Phi projection of the query embedding.
        """
        extractor = self.feature_extractor

        # stack the kernels of every Dense layer whose name starts with 'Phi'
        phi_matrix = np.asarray([
            layer.get_weights()[0]
            for layer in extractor.layers
            if type(layer) == Dense and layer.name.startswith('Phi')
        ])
        embeddings = extractor.get_layer(name='TermEmbedding').get_weights()[0]
        # row 0 is not treated as a candidate
        candidates = embeddings[1:]

        results = {}
        for done, word in enumerate(sorted(set(queries)), start=1):
            if done % 100 == 0:
                print ("Done", done)

            q_idx = self.tokenizer.word_index[word]
            projections = np.dot(embeddings[q_idx], phi_matrix)
            word_phi = np.mean(projections, axis=0)

            # normalise phi projection (in place, on the freshly created mean)
            word_phi /= np.linalg.norm(word_phi)

            # dot product of every candidate embedding with the projection
            hyp_scores = np.dot(candidates, word_phi)

            # best first; +1 undoes the offset from dropping row 0
            ranking = np.argsort(hyp_scores.flatten())[::-1]
            top_words = ranking[:15] + 1
            results[word] = self.tokenizer.sequences_to_texts(top_words.reshape(-1,1))

        return results
    
class HypernymEvaluation_CRIM(HypernymEvaluation):
    """Evaluator for the multi-projection model with a 'Prediction' layer.

    No scorer network is involved, so the base class receives ``scorer=None``.
    """

    def __init__(self, dataset, tokenizer, feature_extractor):
        HypernymEvaluation.__init__(self,
                                    dataset=dataset,
                                    tokenizer=tokenizer,
                                    feature_extractor=feature_extractor,
                                    scorer=None)

    def predict_ltr_hypernym(self, queries):
        """Return, for every distinct query, its 15 highest scoring vocabulary terms.

        Candidate/projection dot products are combined with the kernel and
        bias read from the 'Prediction' layer; candidates are ranked on the
        first row of the result.
        """
        extractor = self.feature_extractor

        # stack the kernels of every Dense layer whose name starts with 'Phi'
        phi_matrix = np.asarray([
            layer.get_weights()[0]
            for layer in extractor.layers
            if type(layer) == Dense and layer.name.startswith('Phi')
        ])
        embeddings = extractor.get_layer(name='TermEmbedding').get_weights()[0]
        # row 0 is not treated as a candidate
        candidates = embeddings[1:]

        # kernel and bias of the 'Prediction' layer
        cluster_weight, bias = extractor.get_layer(name='Prediction').get_weights()[:2]

        results = {}
        for done, word in enumerate(sorted(set(queries)), start=1):
            if done % 100 == 0:
                print ("Done", done)

            q_idx = self.tokenizer.word_index[word]
            # one projection of the query per Phi matrix (not averaged here)
            word_phi = np.dot(embeddings[q_idx], phi_matrix)

            # dot product of every candidate with every projection, then the
            # weighted combination learned by the 'Prediction' layer
            projection_sims = np.dot(candidates, word_phi.T).T
            sim_matrix = np.dot(cluster_weight.T, projection_sims) + bias

            # best first; +1 undoes the offset from dropping row 0
            top_words = np.argsort(sim_matrix[0])[::-1][:15] + 1
            results[word] = self.tokenizer.sequences_to_texts(top_words.reshape(-1,1))

        return results
    
class HypernymEvaluation_CRIM_Max(HypernymEvaluation):
    """CRIM-style evaluator that pools similarities over normalised projections.

    No scorer network is involved, so the base class receives ``scorer=None``.

    The class previously carried a second ``predict_ltr_hypernym`` definition
    that silently replaced the one below and called ``self.scorer`` (always
    ``None`` here). That stray duplicate has been removed so the pooled
    implementation is the one that runs.
    """

    def __init__(self, dataset, tokenizer, feature_extractor):
        HypernymEvaluation.__init__(self, dataset, tokenizer, feature_extractor, None)

    def predict_ltr_hypernym(self, queries):
        """Return, for every distinct query, its 15 highest scoring vocabulary terms.

        Args:
            queries: iterable of query terms; duplicates are collapsed.
        Returns:
            dict mapping each query to a list of its 15 best ranked terms.
        Raises:
            KeyError: if a query is missing from ``tokenizer.word_index``.
        """
        ordered_queries = sorted(set(queries))
        results = {}

        # stack the kernels of every Dense layer whose name starts with 'Phi'
        phi_matrix = [l.get_weights()[0] for l in self.feature_extractor.layers if type(l) == Dense and l.name.startswith('Phi') ]
        phi_matrix = np.asarray(phi_matrix)
        embeddings = self.feature_extractor.get_layer(name='TermEmbedding').get_weights()[0]

        # kernel and bias of the 'Prediction' layer
        prediction_weights = self.feature_extractor.get_layer(name='Prediction').get_weights()
        cluster_weight = prediction_weights[0]
        bias = prediction_weights[1]

        for idx, word in enumerate(ordered_queries):
            if (idx + 1) % 100 == 0:
                print ("Done", idx + 1)

            q_idx = self.tokenizer.word_index[word]
            # one projection of the query per Phi matrix
            word_phi = np.dot(embeddings[q_idx], phi_matrix)

            # L2-normalise each projection separately
            word_phi /= np.linalg.norm(word_phi, axis=1).reshape(-1,1)

            # dot product of every candidate (row 0 skipped) with every projection
            sim_matrix = np.dot(embeddings[1:], word_phi.T)
            # NOTE(review): the class and variable names say "max" but the
            # pooling over projections is a mean; confirm which is intended.
            max_sim = np.mean(sim_matrix, 1).reshape(1,-1)
            # NOTE(review): this product looks shape-compatible only when the
            # 'Prediction' kernel has a single row - TODO confirm.
            sim_matrix = np.dot(cluster_weight.T, max_sim) + bias

            # best first; +1 undoes the offset from dropping row 0
            top_words = np.argsort(sim_matrix[0])[::-1][:15] + 1

            results[word] = self.tokenizer.sequences_to_texts(top_words.reshape(-1,1))

        return results


In [None]:
# test final MRR score

# NOTE(review): in the visible cells `s_vi` and `rel_score` are only assigned
# inside get_RankNet_model, so this line relies on notebook-level variables of
# the same name existing in the kernel - confirm before Restart & Run All.
get_score = K.function([s_vi], [rel_score])
he = HypernymEvaluation((data.valid_query, data.valid_hyper), data.tokenizer, feature_extractor, get_score)
predictions = he.predict_ltr_hypernyms()
#predictions = mrr_logger.predictions
_, all_scores = he.get_evaluation_scores(predictions)
# column 0 of every score row is the reciprocal rank; average it over all queries
mrr = round(sum(score_list[0] for score_list in all_scores) / len(all_scores), 5)
# print() call: the Python 2 print statement is a SyntaxError on Python 3
print(mrr)



In [None]:
# Rank hypernym candidates for the validation queries with the dot-product
# evaluator (it needs no scorer network, only the projection model's weights).
# Note that this rebinds `he`.
he = HypernymEvaluation_MSE((data.valid_query, data.valid_hyper), data.tokenizer, projection_model)
he.predict_ltr_hypernyms()

In [4]:
from collections import defaultdict

def get_synyonyms(hyponyms, hypernyms, n=15, similarity_model=None):
    """Collect nearest-neighbour terms of each hyponym that are not its hypernyms.

    Args:
        hyponyms: sequence of query terms, parallel to ``hypernyms``.
        hypernyms: sequence of gold hypernyms; entry ``i`` belongs to
            ``hyponyms[i]``.
        n: maximum number of neighbours kept per term.
        similarity_model: object exposing ``most_similar(term, topn=...)``
            that returns ``(word, similarity)`` pairs. Defaults to the
            notebook-level ``model`` variable, as before.
    Returns:
        dict mapping each distinct hyponym to at most ``n`` neighbour words,
        in the order returned by ``most_similar``.
    """
    if similarity_model is None:
        # backward-compatible default: the embedding model held at notebook scope
        similarity_model = model

    # prepare hypernym lookup dictionary
    hyper_lookup = defaultdict(set)
    for q, h in zip(hyponyms, hypernyms):
        hyper_lookup[q].add(h)

    synonyms = {}
    for term in set(hyponyms):
        # Exclude the gold hypernyms of *this term*. The lookup must be keyed
        # by the term, not by the candidate word being tested.
        term_hypernyms = hyper_lookup[term]
        # keep only the word of each (word, similarity) pair; works on Python 3,
        # where a zip object cannot be indexed
        neighbours = [word for word, _ in similarity_model.most_similar(term, topn=20)]
        synonyms[term] = [word for word in neighbours if word not in term_hypernyms][:n]

    return synonyms
    
#get_synyonyms(train_query + test_query + valid_query, train_hyper + test_hyper + valid_hyper)    
#get_synyonyms(valid_query, valid_hyper)    

def get_random(hyponyms, hypernyms, vocab, n = 15):
    """Draw random vocabulary words for each hyponym, excluding its hypernyms.

    Args:
        hyponyms: sequence of query terms, parallel to ``hypernyms``.
        hypernyms: sequence of gold hypernyms; entry ``i`` belongs to
            ``hyponyms[i]``.
        vocab: sequence of candidate words to sample from.
        n: maximum number of random words kept per term.
    Returns:
        dict mapping each distinct hyponym to at most ``n`` words sampled
        without replacement from ``vocab`` (fewer if sampled words had to be
        dropped because they are hypernyms of the term).
    """
    # prepare hypernym lookup dictionary
    hyper_lookup = defaultdict(set)
    for q, h in zip(hyponyms, hypernyms):
        hyper_lookup[q].add(h)

    # convert once instead of on every np.random.choice call
    vocab_array = np.asarray(vocab)
    # sampling without replacement cannot draw more words than the vocabulary holds
    sample_size = min(20, len(vocab))

    random_words = {}
    for term in set(hyponyms):
        # Exclude the gold hypernyms of *this term*. The lookup must be keyed
        # by the term, not by the sampled word being tested.
        term_hypernyms = hyper_lookup[term]
        some_words = np.random.choice(vocab_array, sample_size, replace=False)
        random_words[term] = [word for word in some_words if word not in term_hypernyms][:n]

    return random_words


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Data class that encapsulates all word-based data needed to train the various algorithms.
# We assume that any words that don't feature in the embeddings have already been filtered out.
class Data:
    """Bundle of the dataset splits, tokenizer, embedding matrix and negatives.

    Attributes set by the constructor: the six query/hypernym split lists,
    ``vocab``, ``embeddings_dim``, ``tokenizer``, ``embedding_matrix``,
    ``random_words`` and ``synonyms``.
    """

    def __init__(self,
                 train_query, train_hyper,
                 test_query, test_hyper,
                 valid_query, valid_hyper,
                 vocab, embeddings):
        # keep the raw splits so everything travels with one instance reference
        self.train_query, self.train_hyper = train_query, train_hyper
        self.test_query, self.test_hyper = test_query, test_hyper
        self.valid_query, self.valid_hyper = valid_query, valid_hyper
        self.vocab = vocab

        # dimensionality is read off the vector of a word assumed to be present
        self.embeddings_dim = embeddings['animal'].shape[0]

        all_queries = train_query + test_query + valid_query
        all_hypers = train_hyper + test_hyper + valid_hyper

        print ("Tokenising words...")
        # fit the tokenizer on every query plus the vocabulary; no character filtering
        self.tokenizer = Tokenizer(num_words = 300000, filters='')
        self.tokenizer.fit_on_texts(all_queries + vocab)

        print ("Creating embedding matrix...")
        # one row per tokenizer index, plus one extra row; rows start as zeros
        word_index = self.tokenizer.word_index
        n_rows = len(word_index) + 1
        self.embedding_matrix = np.zeros((n_rows, self.embeddings_dim), dtype='float32')

        for word, row in word_index.items():
            if row >= n_rows:
                continue
            vector = embeddings[word]
            if vector is None:
                continue
            # vectors are stored as they are (the original notes they are already normalised)
            self.embedding_matrix[row, :] = vector

        # confirm shape
        assert self.embedding_matrix.shape == (n_rows, self.embeddings_dim)

        print ("Creating random words/synonyms...")
        self.random_words = get_random(all_queries, all_hypers, vocab)
        self.synonyms = get_synyonyms(all_queries, all_hypers)

In [6]:
#data = Data(train_query, train_hyper, test_query, test_hyper, valid_query, valid_hyper, vocab, model)
import pickle
import os

dest = os.path.join('.', 'pickle')
#with open(os.path.join(dest, 'semeval_data.pkl'), 'wb') as pickle_file:
#    pickle.dump(data, pickle_file, protocol=2)

# NOTE: pickle.load can execute arbitrary code; only load a file you created yourself.
# The context manager closes the file handle even if unpickling fails.
with open(os.path.join(dest, 'semeval_data.pkl'), 'rb') as pickle_file:
    data = pickle.load(pickle_file)

# Prepare dataset for fitting RankNet

In [None]:
from collections import Counter
#term_count = Counter(data.train_query)

# Build parallel (query, hypernym, negative) lists from the training pairs.
n_negative = 1
query, hyper, negat = [], [], []

for q, h in zip(data.train_query, data.train_hyper):
    # draw n_negative distinct negatives from the random words stored for this query
    rands = np.random.choice(data.random_words[q], n_negative, replace=False).tolist()

    # (the query word itself is not added to the negatives)
    # repeat the positive pair once per negative so the three lists stay aligned
    query += [q] * n_negative
    hyper += [h] * n_negative
    negat += rands

# convert each list of terms into tokenizer index sequences
query_seq, hyper_seq, neg_seq = [data.tokenizer.texts_to_sequences(terms)
                                 for terms in (query, hyper, negat)]
    

## Prepare validation data for fitting RankNet

In [None]:
# Build parallel (query, hypernym, negative) lists from the validation pairs.
n_negative = 1
v_query, v_hyper, v_negat = [], [], []

for q, h in zip(data.valid_query, data.valid_hyper):
    # draw n_negative distinct negatives from the random words stored for this query
    rands = np.random.choice(data.random_words[q], n_negative, replace=False).tolist()

    # repeat the positive pair once per negative so the three lists stay aligned
    v_query += [q] * n_negative
    v_hyper += [h] * n_negative
    v_negat += rands

# convert each list of terms into tokenizer index sequences
v_query_seq, v_hyper_seq, v_neg_seq = [data.tokenizer.texts_to_sequences(terms)
                                       for terms in (v_query, v_hyper, v_negat)]

In [None]:
# Preview the first ten (query, hypernym, negative) triples. On Python 3 zip is
# lazy, so it is materialised to make the cell display the triples themselves.
list(zip(query[:10], hyper[:10], negat[:10]))

In [None]:
# First tokenised triple, followed by the length of each sequence list.
# print() calls and a list comprehension: the Python 2 print statement is a
# SyntaxError on Python 3, and a bare map() would print as an object.
print(query_seq[0], hyper_seq[0], neg_seq[0])
print([len(seq) for seq in (query_seq, hyper_seq, neg_seq)])

In [7]:
# keras imports

from tensorflow.keras.layers import Input, Dense, Embedding,  Flatten,  Dropout, Subtract, Activation, Lambda, concatenate, Dot
from tensorflow.keras.models import Model, save_model, load_model
from tensorflow.keras.initializers import RandomNormal, Zeros, Ones
from tensorflow.keras.regularizers import l2, l1, l1_l2
from tensorflow.keras.constraints import UnitNorm, MinMaxNorm
from tensorflow.keras.optimizers import Adam

from tensorflow.keras import backend as K
import tensorflow as tf

from tensorflow.keras import models
from tensorflow.keras.utils import get_custom_objects
from tensorflow.python.framework import dtypes
from tensorflow.keras.initializers import Initializer

# Feature Extractor/Pair-wise LTR Model code

In [8]:
class RandomIdentity(Initializer):
    """Initializer producing a diagonal matrix of small random values.

    The result is ``eye * noise`` with noise drawn from N(0, 0.01): the
    off-diagonal entries are exactly zero and the diagonal entries are the
    sampled noise (not ones).

    Args:
        dtype: default dtype of the generated tensor, used when ``__call__``
            is not given one.
    """

    def __init__(self, dtype=dtypes.float32):
        self.dtype = dtypes.as_dtype(dtype)

    def __call__(self, shape, dtype=None, partition_info=None, **kwargs):
        """Build the initial value.

        Args:
            shape: requested variable shape; the last two entries give the
                rows and columns (a one-entry shape is treated as square).
            dtype: requested dtype; falls back to the constructor's dtype.
            partition_info: accepted for signature compatibility, unused.
            **kwargs: extra keyword arguments a caller may pass, ignored.
        Returns:
            Tensor of shape (rows, cols) and the requested dtype.
        """
        # honour the requested dtype instead of always producing float32
        dtype = self.dtype if dtype is None else dtypes.as_dtype(dtype)

        # follow the requested shape instead of assuming a square kernel
        n_cols = int(shape[-1])
        n_rows = int(shape[-2]) if len(shape) > 1 else n_cols

        rnorm = K.random_normal((n_rows, n_cols), mean=0., stddev=0.01, dtype=dtype)
        # element-wise product keeps only the diagonal of the noise
        rident = tf.eye(n_rows, n_cols, dtype=dtype) * rnorm
        return rident

    def get_config(self):
        """Return the constructor arguments needed to re-create the initializer."""
        return {"dtype": self.dtype.name}

    
class RandomPlusIdentity(Initializer):
    """Initializer producing an identity matrix perturbed by small noise.

    The result is ``eye + noise`` with noise drawn from N(0, 0.01), so the
    diagonal is close to one and the off-diagonal entries are close to zero.

    Args:
        dtype: default dtype of the generated tensor, used when ``__call__``
            is not given one.
    """

    def __init__(self, dtype=dtypes.float32):
        self.dtype = dtypes.as_dtype(dtype)

    def __call__(self, shape, dtype=None, partition_info=None, **kwargs):
        """Build the initial value.

        Args:
            shape: requested variable shape; the last two entries give the
                rows and columns (a one-entry shape is treated as square).
            dtype: requested dtype; falls back to the constructor's dtype.
            partition_info: accepted for signature compatibility, unused.
            **kwargs: extra keyword arguments a caller may pass, ignored.
        Returns:
            Tensor of shape (rows, cols) and the requested dtype.
        """
        # honour the requested dtype instead of always producing float32
        dtype = self.dtype if dtype is None else dtypes.as_dtype(dtype)

        # follow the requested shape instead of assuming a square kernel
        n_cols = int(shape[-1])
        n_rows = int(shape[-2]) if len(shape) > 1 else n_cols

        rnorm = K.random_normal((n_rows, n_cols), mean=0., stddev=0.01, dtype=dtype)
        rident = tf.eye(n_rows, n_cols, dtype=dtype) + rnorm
        return rident

    def get_config(self):
        """Return the constructor arguments needed to re-create the initializer."""
        return {"dtype": self.dtype.name}
        

# Register both initializers under their class names in the Keras custom-object table.
get_custom_objects().update({
    'RandomIdentity': RandomIdentity,
    'RandomPlusIdentity': RandomPlusIdentity,
})


In [None]:
def get_RankNet_model(feature_extractor, dropout_rate):
    """Wrap a feature extractor in a pairwise RankNet scorer.

    Args:
        feature_extractor: model called on [hyponym, hypernym, negative]
            inputs and returning the feature tensors of the relevant and the
            irrelevant candidate.
        dropout_rate: rate of the Dropout applied after every hidden layer.
    Returns:
        Tuple ``(model, get_score)``: the compiled pairwise model, whose output
        is sigmoid(relevant score - irrelevant score), and a backend function
        mapping relevant-branch features to their score.
    """
    hypo_input = Input(shape=(1,), name='Hyponym')
    hyper_input = Input(shape=(1,), name='Hypernym')
    negative_input = Input(shape=(1,), name='Negative')
    model_inputs = [hypo_input, hyper_input, negative_input]

    s_vi, s_vj = feature_extractor(model_inputs)

    # The scoring tower is built once and applied to both candidates, so the
    # relevant and irrelevant branches share their weights.
    hidden_layers = [Dense(units, activation = "relu") for units in (128, 64, 32)]
    score_layer = Dense(1)

    def score_branch(features):
        # each hidden layer is followed by its own Dropout instance
        activations = features
        for hidden in hidden_layers:
            activations = hidden(activations)
            activations = Dropout(dropout_rate)(activations)
        return score_layer(activations)

    # "relevant" document score first, then "irrelevant" document score
    rel_score = score_branch(s_vi)
    irr_score = score_branch(s_vj)

    # probability that the relevant candidate outranks the irrelevant one
    diff = Subtract()([rel_score, irr_score])
    prob = Activation("sigmoid")(diff)

    model = Model(inputs=model_inputs, outputs=prob)
    model.compile(optimizer = 'adadelta', loss = "binary_crossentropy")

    get_score = K.function([s_vi], [rel_score])
    return model, get_score

# Standard model where we learn phi projection and Ranknet weights together

In [None]:
# inputs
# Feature extractor with a single projection matrix 'Phi'.
# Inputs are single word indices: the hyponym (query), a gold hypernym and a negative term.
hypo_input  = Input(shape=(1,), name='Hyponym')
hyper_input = Input(shape=(1,), name='Hypernym')
negative_input = Input(shape=(1,), name='Negative')

# lookup word embedding from word index; one layer is shared by all three inputs
embedding_layer = Embedding(data.embedding_matrix.shape[0], 
                           data.embedding_matrix.shape[1], name='TermEmbedding',
                           embeddings_constraint = UnitNorm(axis=1))

hypo_embedding = embedding_layer(hypo_input)    
hyper_embedding = embedding_layer(hyper_input)
neg_embedding = embedding_layer(negative_input)

# apply dropout (rate 0.3) to each looked-up embedding
hypo_embedding = Dropout(0.3, name='DropHypo')(hypo_embedding)
hyper_embedding = Dropout(0.3, name='DropHyper')(hyper_embedding)
neg_embedding = Dropout(0.3, name='DropNeg')(neg_embedding)

# first part of the feature extractor: bias-free linear projection of the hyponym.
# RandomIdentity starts the kernel as a diagonal matrix of small random values.
# NOTE(review): the output size is hard-coded to 200 and is later subtracted from
# the hypernym embedding, so this presumably assumes 200-d embeddings - confirm.
phi = Dense(200, activation=None, use_bias=False, 
            kernel_initializer=RandomIdentity(),
            name='Phi') (hypo_embedding)

# attempt to unit-normalise phi (currently disabled)
#phi = Lambda(lambda x: K.l2_normalize(x, axis=-1), name='NormPhi')(phi)

# flatten outputs
phi = Flatten(name='FlattenPhi')(phi)
# dropout phi
phi = Dropout(0.3, name='DropoutPhi')(phi)

hyper_embedding = Flatten(name='FlattenHyper')(hyper_embedding)    
neg_embedding = Flatten(name='FlattenNeg')(neg_embedding)

# extract features from query (hyponym) and doc i (relevant hypernym), doc j (irrelevant hypernym)
vi = Subtract(name='Sub1')([hyper_embedding, phi])
# square the subtraction
vi = Lambda(lambda x: K.square(x))(vi)

vj = Subtract(name='Sub2')([neg_embedding, phi])
# square the subtraction vector
vj = Lambda(lambda x: K.square(x))(vj)

# the extractor outputs the two squared-difference feature vectors;
# the embedding layer is loaded with the pre-built matrix and frozen
feature_extractor =  Model(inputs=[hypo_input, hyper_input, negative_input], outputs=[vi, vj])
feature_extractor.get_layer(name='TermEmbedding').set_weights([data.embedding_matrix])
feature_extractor.get_layer(name='TermEmbedding').trainable = False

########################### END OF FEATURE EXTRACTOR DEFINITION #################################




# Multi-phi approach

In [None]:
# inputs
# Feature extractor with phi_k projection matrices and separate query/hypernym embedding layers.
# Inputs are single word indices: the hyponym (query), a gold hypernym and a negative term.
hypo_input  = Input(shape=(1,), name='Hyponym')
hyper_input = Input(shape=(1,), name='Hypernym')
negative_input = Input(shape=(1,), name='Negative')

# lookup word embedding from word index: one layer for queries ...
query_embedding_layer = Embedding(data.embedding_matrix.shape[0], 
                           data.embedding_matrix.shape[1], name='TermEmbedding_Q',
                           embeddings_constraint = UnitNorm(axis=1))

# ... and one layer shared by the hypernym and the negative term
hyper_embedding_layer = Embedding(data.embedding_matrix.shape[0], 
                           data.embedding_matrix.shape[1], name='TermEmbedding_H',
                           embeddings_constraint = UnitNorm(axis=1))

hypo_embedding = query_embedding_layer(hypo_input)    
hyper_embedding = hyper_embedding_layer(hyper_input)
neg_embedding = hyper_embedding_layer(negative_input)

# apply dropout (rate 0.3) to each looked-up embedding
hypo_embedding = Dropout(0.3, name='DropHypo')(hypo_embedding)
hyper_embedding = Dropout(0.3, name='DropHyper')(hyper_embedding)
neg_embedding = Dropout(0.3, name='DropNeg')(neg_embedding)

phi_layer = []

# build k projection matrices, named 'Phi0' ... 'Phi<k-1>'; the evaluation
# classes pick them up by the 'Phi' name prefix
phi_k = 1
for i in range(phi_k):
    phi_layer.append(Dense(200, activation=None, use_bias=False, 
                           kernel_initializer=RandomIdentity(),
                           name='Phi%d' % (i)) (hypo_embedding))

if phi_k == 1:
    # flatten tensors
    phi = Flatten(name='FlattenPhi')(phi_layer[0])    
else:
    # join the k projections along axis 1 and average them into one vector
    phi = concatenate(phi_layer, axis=1)
    phi = Lambda(lambda x: K.mean(x, axis=1, keepdims=False))(phi)
    
        
# attempt to unit-normalise phi (currently disabled)
#phi = Lambda(lambda x: K.l2_normalize(x, axis=-1), name='NormPhi')(phi)
        
# dropout phi
phi = Dropout(0.3, name='DropoutPhi')(phi)
        
hyper_embedding = Flatten(name='FlattenHyper')(hyper_embedding) 
neg_embedding = Flatten(name='FlattenNeg')(neg_embedding)        

# extract features from query (hyponym) and doc i (relevant hypernym), doc j (irrelevant hypernym)
vi = Subtract(name='Sub1')([hyper_embedding, phi])
# square the subtraction
vi = Lambda(lambda x: K.square(x))(vi)

vj = Subtract(name='Sub2')([neg_embedding, phi])
# square the subtraction vector
vj = Lambda(lambda x: K.square(x))(vj)

# both embedding layers are loaded with the same pre-built matrix ...
feature_extractor =  Model(inputs=[hypo_input, hyper_input, negative_input], outputs=[vi, vj])
feature_extractor.get_layer(name='TermEmbedding_Q').set_weights([data.embedding_matrix])
feature_extractor.get_layer(name='TermEmbedding_H').set_weights([data.embedding_matrix])

# ... and frozen
feature_extractor.get_layer(name='TermEmbedding_Q').trainable = False
feature_extractor.get_layer(name='TermEmbedding_H').trainable = False

########################### END OF FEATURE EXTRACTOR DEFINITION #################################


# Feature extractor initially trained on MSE

After training, we create a new model (the feature extractor proper), copy the embedding and projection-matrix weights into it, set all of its layers to untrainable, and then train the LTR model on top of it.

In [None]:
def custom_loss(random_similarity, c):
    """Build an MSE loss with a penalty on the negative-term similarity.

    Args:
        random_similarity: tensor holding the similarity between the negative
            term and the hyponym projection; the model should drive it down.
        c: regularisation weight applied to the penalty.
    Returns:
        Loss function ``mse(y_true, y_pred)`` suitable for ``Model.compile``.
    """
    def mse(y_true, y_pred):
        # mean squared error over the last axis
        squared_error = K.mean(K.square(y_pred - y_true), axis=-1)
        # mean squared similarity to the negative term, scaled by c
        penalty = c * K.mean(K.square(random_similarity))
        return squared_error + penalty

    return mse


# Projection model trained on the similarity between the projected hyponym and its hypernym.
# Inputs are single word indices: the hyponym (query), a gold hypernym and a negative term.
hypo_input  = Input(shape=(1,), name='Hyponym')
hyper_input  = Input(shape=(1,), name='Hypernym')
negative_input = Input(shape=(1,), name='Negative')

# lookup word embedding from word index; one layer is shared by all three inputs
embedding_layer = Embedding(data.embedding_matrix.shape[0], 
                           data.embedding_matrix.shape[1], name='TermEmbedding',
                           embeddings_constraint = UnitNorm(axis=1))

hypo_embedding = embedding_layer(hypo_input)  
hyper_embedding = embedding_layer(hyper_input)  
neg_embedding = embedding_layer(negative_input)

# apply dropout (rate 0.3) to each looked-up embedding
hypo_embedding = Dropout(0.3, name='DropHypo')(hypo_embedding)
hyper_embedding = Dropout(0.3, name='DropHyper')(hyper_embedding)
neg_embedding = Dropout(0.3, name='DropNeg')(neg_embedding)

phi_layer = []

# build k projection matrices, named 'Phi0' ... 'Phi<k-1>', with small random-normal kernels
phi_k = 1
for i in range(phi_k):
    phi_layer.append(Dense(200, activation=None, use_bias=False, 
                           kernel_initializer=RandomNormal(mean=0., stddev=0.01),
                           name='Phi%d' % (i)) (hypo_embedding))

if phi_k == 1:
    # flatten tensors
    phi = Flatten(name='FlattenPhi')(phi_layer[0])    
else:
    # join the k projections along axis 1 and average them into one vector
    phi = concatenate(phi_layer, axis=1)
    phi = Lambda(lambda x: K.mean(x, axis=1, keepdims=False))(phi)
                    
# dropout phi (currently disabled)
#phi = Dropout(0.3, name='DropoutPhi')(phi)
neg_embedding = Flatten(name='FlattenNeg')(neg_embedding)        
hyper_embedding = Flatten(name='FlattenHyper')(hyper_embedding)

# normalised dot products (normalize=True) of the projection with the gold
# hypernym and with the negative term
hyper_similarity = Dot(axes=-1, normalize=True, name='DotProductHyper')([phi, hyper_embedding])
random_similarity = Dot(axes=-1, normalize=True, name='DotProductRand')([phi, neg_embedding])


# initialise custom loss function; it closes over the random_similarity tensor
mse = custom_loss(random_similarity, c=1.)

# extract features from query (hyponym) and doc i (relevant hypernym), doc j (irrelevant hypernym)
vi = Subtract(name='Sub1')([hyper_embedding, phi])
# square the subtraction
vi = Lambda(lambda x: K.square(x))(vi)

vj = Subtract(name='Sub2')([neg_embedding, phi])
# square the subtraction vector
vj = Lambda(lambda x: K.square(x))(vj)

# feature extractor built on the same layers as the projection model below,
# so the two models share their weights
feature_extractor = Model(inputs=[hypo_input, hyper_input, negative_input], outputs=[vi, vj])

# create projection model: outputs the hypernym similarity; embeddings are loaded and frozen
projection_model =  Model(inputs=[hypo_input, hyper_input, negative_input], outputs=hyper_similarity)
projection_model.get_layer(name='TermEmbedding').set_weights([data.embedding_matrix])
projection_model.get_layer(name='TermEmbedding').trainable = False
# NOTE(review): newer Keras releases spell this argument `learning_rate`; confirm
# that `lr` is accepted by the installed version.
adam = Adam(lr = 0.001, beta_1 = 0.9, beta_2 = 0.9, clipnorm=1.)
projection_model.compile(optimizer = adam, loss = mse)

########################### END OF FEATURE EXTRACTOR DEFINITION #################################

# Predict hyponym projection directly; minimise by computing MSE with target hypernym vector

In [None]:
def custom_loss(similarities, c):
    """Build a squared-error loss with two similarity penalties.

    Args:
        similarities: pair of tensors; the mean square of each is penalised.
        c: pair of regularisation weights, ``c[i]`` scaling ``similarities[i]``.
    Returns:
        Loss function ``mse(y_true, y_pred)`` suitable for ``Model.compile``.
    """
    def mse(y_true, y_pred):
        # squared error summed over the last axis, then averaged
        reconstruction = K.mean(K.sum(K.square(y_pred - y_true), axis=-1))
        first_penalty = c[0] * K.mean(K.square(similarities[0]))
        second_penalty = c[1] * K.mean(K.square(similarities[1]))
        return reconstruction + first_penalty + second_penalty

    return mse


# inputs: one word index for the hyponym (query) and one for a negative term
hypo_input  = Input(shape=(1,), name='Hyponym')
negative_input = Input(shape=(1,), name='Negative')

# lookup word embedding from word index
embedding_dim = data.embedding_matrix.shape[1]
embedding_layer = Embedding(data.embedding_matrix.shape[0],
                           embedding_dim, name='TermEmbedding',
                           embeddings_constraint = UnitNorm(axis=1))

hypo_embedding = embedding_layer(hypo_input)
neg_embedding = embedding_layer(negative_input)

# single linear projection Phi applied to the hyponym embedding.
# Its width is the embedding width (previously hard-coded to 200) because the
# projection is compared against embedding vectors, both in the Dot layers
# below and in the training targets.
phi = Dense(embedding_dim, activation=None, use_bias=False,
            kernel_initializer=RandomNormal(mean=0., stddev=0.01),
            name='Phi') (hypo_embedding)

phi = Flatten(name='FlattenPhi')(phi)
neg_embedding = Flatten(name='FlattenNeg')(neg_embedding)

# cosine similarity (normalize=True) of the projection with the negative term
# and with the original hyponym embedding; both are penalised by the loss
random_similarity = Dot(axes=-1, normalize=True, name='DotProductRand')([phi, neg_embedding])
query_similarity = Dot(axes=-1, normalize=True, name='DotProductRand2')([phi, hypo_embedding])

# initialise custom loss function
mse = custom_loss([random_similarity, query_similarity], c=[1., 0.1])

# create projection model; its output is the projected hyponym vector itself
projection_model =  Model(inputs=[hypo_input, negative_input], outputs=phi)
# load the pre-trained vectors into the embedding layer and freeze it
projection_model.get_layer(name='TermEmbedding').set_weights([data.embedding_matrix])
projection_model.get_layer(name='TermEmbedding').trainable = False
adam = Adam(lr = 0.001, beta_1 = 0.9, beta_2 = 0.9, clipnorm=1.)
projection_model.compile(optimizer = adam, loss = mse)

########################### END OF FEATURE EXTRACTOR DEFINITION #################################

# CRIM model - multi-phi + logistic regressor


In [9]:
# Build the CRIM-style classifier: phi_k linear projections of the hyponym
# embedding are compared with the candidate hypernym embedding, and a sigmoid
# unit turns the resulting similarity into a prediction.
dropout_rate = 0.3
phi_k = 10
# When True (and phi_k > 1) the per-projection similarities are averaged before
# the sigmoid layer; despite the name, the reduction used below is a mean.
max_or_combine = True

hypo_input  = Input(shape=(1,), name='Hyponym')
hyper_input = Input(shape=(1,), name='Hypernym')

# lookup word embedding from word index
# The same embedding layer is shared by both inputs.
embedding_layer = Embedding(data.embedding_matrix.shape[0], 
                           data.embedding_matrix.shape[1], name='TermEmbedding',
                           embeddings_constraint = UnitNorm(axis=1))

hypo_embedding = embedding_layer(hypo_input)
hyper_embedding = embedding_layer(hyper_input)

hypo_embedding = Dropout(dropout_rate, name='Dropout_Hypo')(hypo_embedding)
hyper_embedding = Dropout(dropout_rate, name='Dropout_Hyper')(hyper_embedding)

# One bias-free Dense projection per k, each as wide as the embedding.
# RandomIdentity is defined elsewhere in the notebook.
phi_layer = []
for i in range(phi_k):
    phi_layer.append(Dense(data.embedding_matrix.shape[1], activation=None, use_bias=False, 
                           activity_regularizer=None,
                           kernel_initializer=RandomIdentity(),
                           name='Phi%d' % (i)) (hypo_embedding))
if phi_k == 1:
    # flatten tensors
    phi = Flatten(name='Flatten_Phi')(phi_layer[0])
    hyper_embedding = Flatten(name='Flatten_Hyper')(hyper_embedding)
else:
    # join the phi_k projections along axis 1
    phi = concatenate(phi_layer, axis=1)

phi = Dropout(dropout_rate, name='Dropout_Phi')(phi)

# this is referred to as "s" in the "CRIM" paper
# normalize=True makes the Dot output a cosine similarity
phi_hyper = Dot(axes=-1, normalize=True, name='DotProduct1')([phi, hyper_embedding])

if phi_k > 1:
    if max_or_combine:
        # average the similarities over axis 1
        phi_hyper = Lambda(lambda x: K.mean(x, axis=1, keepdims=True))(phi_hyper)
    phi_hyper = Flatten(name='Flatten_PhiHyper')(phi_hyper)

predictions = Dense(1, activation="sigmoid", name='Prediction',
                    use_bias=True,
                    kernel_initializer='random_normal',
                    kernel_constraint= None,
                    bias_initializer=Zeros(),
                    kernel_regularizer=None,
                    bias_regularizer=None
                   ) (phi_hyper)

# create projection model
projection_model =  Model(inputs=[hypo_input, hyper_input], outputs=predictions)
# load the pre-trained vectors into the embedding layer and freeze it
projection_model.get_layer(name='TermEmbedding').set_weights([data.embedding_matrix])
projection_model.get_layer(name='TermEmbedding').trainable = False
adam = Adam(lr = 0.001, beta_1 = 0.9, beta_2 = 0.9, clipnorm=1.)
projection_model.compile(optimizer = adam, loss = 'binary_crossentropy')

########################### END OF FEATURE EXTRACTOR DEFINITION #################################

In [None]:
# Print the layer-by-layer summary of the model built in the previous cell.
projection_model.summary()


# Create copy of model but with separate hypo and hyper embeddings



In [10]:
# Same architecture as the CRIM model, but the hyponym and hypernym inputs get
# their own embedding layers (TermEmbedding_Q / TermEmbedding_H).
# The model is only constructed here; embedding weights are set and the model
# is compiled in a later cell.
dropout_rate = 0.3
phi_k = 10
# When True (and phi_k > 1) the per-projection similarities are averaged before
# the sigmoid layer; despite the name, the reduction used below is a mean.
max_or_combine = True

hypo_input  = Input(shape=(1,), name='Hyponym')
hyper_input = Input(shape=(1,), name='Hypernym')

# lookup word embedding from word index
embedding_layer_q = Embedding(data.embedding_matrix.shape[0], 
                           data.embedding_matrix.shape[1], name='TermEmbedding_Q',
                           embeddings_constraint = UnitNorm(axis=1))

embedding_layer_h = Embedding(data.embedding_matrix.shape[0], 
                           data.embedding_matrix.shape[1], name='TermEmbedding_H',
                           embeddings_constraint = UnitNorm(axis=1))


hypo_embedding = embedding_layer_q(hypo_input)
hyper_embedding = embedding_layer_h(hyper_input)

hypo_embedding = Dropout(dropout_rate, name='Dropout_Hypo')(hypo_embedding)
hyper_embedding = Dropout(dropout_rate, name='Dropout_Hyper')(hyper_embedding)

# One bias-free Dense projection per k, each as wide as the embedding.
# The 'Phi%d' names are what a later cell uses to copy trained weights across.
phi_layer = []
for i in range(phi_k):
    phi_layer.append(Dense(data.embedding_matrix.shape[1], activation=None, use_bias=False, 
                           activity_regularizer=None,
                           kernel_initializer=RandomIdentity(),
                           name='Phi%d' % (i)) (hypo_embedding))
if phi_k == 1:
    # flatten tensors
    phi = Flatten(name='Flatten_Phi')(phi_layer[0])
    hyper_embedding = Flatten(name='Flatten_Hyper')(hyper_embedding)
else:
    # join the phi_k projections along axis 1
    phi = concatenate(phi_layer, axis=1)

phi = Dropout(dropout_rate, name='Dropout_Phi')(phi)

# this is referred to as "s" in the "CRIM" paper
# normalize=True makes the Dot output a cosine similarity
phi_hyper = Dot(axes=-1, normalize=True, name='DotProduct1')([phi, hyper_embedding])

if phi_k > 1:
    if max_or_combine:
        # average the similarities over axis 1
        phi_hyper = Lambda(lambda x: K.mean(x, axis=1, keepdims=True))(phi_hyper)
    phi_hyper = Flatten(name='Flatten_PhiHyper')(phi_hyper)

predictions = Dense(1, activation="sigmoid", name='Prediction',
                    use_bias=True,
                    kernel_initializer='random_normal',
                    kernel_constraint= None,
                    bias_initializer=Zeros(),
                    kernel_regularizer=None,
                    bias_regularizer=None
                   ) (phi_hyper)

# create projection model
projection_model_fine =  Model(inputs=[hypo_input, hyper_input], outputs=predictions)
# Left disabled: this model has no layer named 'TermEmbedding', and it is
# not compiled in this cell.
#projection_model_fine.get_layer(name='TermEmbedding').set_weights([data.embedding_matrix])
#projection_model_fine.get_layer(name='TermEmbedding').trainable = False
#adam = Adam(lr = 0.001, beta_1 = 0.9, beta_2 = 0.9, clipnorm=1.)
#projection_model.compile(optimizer = adam, loss = 'binary_crossentropy')

# Write callback that returns MRR at the end of every epoch

In [11]:
from tensorflow.keras.callbacks import Callback

class MRRLogger(Callback):
    """Keras callback that scores the validation set after every epoch.

    The evaluator supplied through `set_evaluator` produces the predictions and
    the per-query score lists; the first two entries of each score list are
    averaged, stored in `self.mrr` / `self.map` and printed.
    """

    def set_evaluator(self, hypernym_evaluator):
        """Attach the evaluator object used to predict and score each epoch."""
        self.he = hypernym_evaluator

    def on_train_begin(self, logs={}):
        # fresh per-epoch histories for every call to fit()
        self.mrr, self.map = [], []

    def on_epoch_begin(self, epoch, logs={}):
        self.predictions = []

    def on_epoch_end(self, epoch, logs={}):
        # The validation data passed to fit() is not used here: the evaluator
        # already encapsulates the validation queries and gold hypernyms.
        self.predictions = self.he.predict_ltr_hypernyms()
        _names, per_query_scores = self.he.get_evaluation_scores(self.predictions)
        n_queries = len(per_query_scores)

        mrr_total = sum([scores[0] for scores in per_query_scores])
        map_total = sum([scores[1] for scores in per_query_scores])
        epoch_mrr = round(mrr_total / n_queries, 5)
        epoch_map = round(map_total / n_queries, 5)

        self.mrr.append(epoch_mrr)
        self.map.append(epoch_map)
        print ("; MRR: %.4f; MAP: %.4f" % (epoch_mrr, epoch_map))

        
class BestModelWeightSaver(Callback):
    """Keras callback that saves the model weights whenever validation improves.

    The tracked metric is sqrt(MAP * MRR), computed from the most recent
    entries of the attached logger's `map` and `mrr` lists. The logger is
    therefore expected to run before this callback each epoch (the notebook
    passes `callbacks=[mrr_logger, weight_saver]`).
    """

    def set_mrr_logger(self, mrr_logger):
        """Attach the logger whose `mrr` and `map` lists are read every epoch."""
        self.mrr_logger = mrr_logger

    def set_filepath(self, filepath):
        """Set the %-style path template; it is filled with (epoch, metric)."""
        # file path should include placeholders for epoch and validation metric
        self.filepath = filepath

    def on_train_begin(self, logs={}):
        self.best_metric = 0.

    def on_epoch_end(self, epoch, logs={}):
        # decide on whether we're going to overwrite weights or ignore this epoch
        # because of inferior results.
        # Use the logger attached via set_mrr_logger rather than whatever global
        # named `mrr_logger` happens to exist in the notebook.
        test_metric = np.sqrt(self.mrr_logger.map[-1] * self.mrr_logger.mrr[-1])
        if test_metric > self.best_metric:
            # new best sqrt(MAP * MRR): save weights to disc
            self.best_metric = test_metric
            weights = self.model.get_weights()
            # The weight arrays have different shapes; pack them into an explicit
            # 1-D object array so NumPy does not try (and, on recent versions,
            # fail) to build a ragged array. Loading 'arr_0' and calling
            # .tolist() still yields the list of weight arrays.
            packed = np.empty(len(weights), dtype=object)
            for i, w in enumerate(weights):
                packed[i] = w
            np.savez_compressed(self.filepath % (epoch, self.best_metric), packed)
                    

# Run Standard Model

In [None]:
# Train the RankNet model built on top of feature_extractor.
# Targets are all ones for both training and validation.
y = np.ones((len(query_seq), 1))
# NOTE(review): `v_query` is not defined in the visible part of the notebook
# (other cells use `v_query_seq`) - confirm it exists when this cell runs.
v_y = np.ones((len(v_query), 1))
# train model
NUM_EPOCHS = 10
BATCH_SIZE = 32

# get RankNet model and scorer
model, get_score = get_RankNet_model(feature_extractor, dropout_rate=0.2)

# initialise MRR callback

# initialise evaluator
he = HypernymEvaluation((data.valid_query, data.valid_hyper), data.tokenizer, feature_extractor, get_score)
mrr_logger = MRRLogger()
mrr_logger.set_evaluator(he)

# save weights whenever the validation metric improves; the template is
# filled with (epoch, metric)
weight_saver = BestModelWeightSaver()
weight_saver.set_mrr_logger(mrr_logger)
weight_saver.set_filepath('models/best_ltr_e%s_mrr%.4f')


# mrr_logger is listed first so its scores exist when weight_saver reads them
history = model.fit([query_seq, hyper_seq, neg_seq], y, 
                    validation_data = ([v_query_seq, v_hyper_seq, v_neg_seq], v_y),
                    batch_size = BATCH_SIZE, epochs = NUM_EPOCHS, verbose = 1,
                    callbacks=[mrr_logger, weight_saver]
                   )


# Run MSE-like model

In [None]:
# Prepare to run MSE model
# Target is 1.0 for every (hyponym, hypernym, negative) triple.
y = [1.] * len(hyper_seq)
v_y = [1.] * len(v_hyper_seq)

# train model
NUM_EPOCHS = 12
BATCH_SIZE = 32


# initialise MRR callback

# initialise evaluator
he = HypernymEvaluation_MSE((data.valid_query, data.valid_hyper), data.tokenizer, feature_extractor)
mrr_logger = MRRLogger()
mrr_logger.set_evaluator(he)

# save weights whenever the validation metric improves; the template is
# filled with (epoch, metric)
weight_saver = BestModelWeightSaver()
weight_saver.set_mrr_logger(mrr_logger)
weight_saver.set_filepath('models/best_ltr_e%s_mrr%.4f')

# mrr_logger is listed first so its scores exist when weight_saver reads them
history = projection_model.fit([query_seq, hyper_seq, neg_seq], y, 
                               validation_data = ([v_query_seq, v_hyper_seq, v_neg_seq], v_y),
                               batch_size = BATCH_SIZE, epochs = NUM_EPOCHS, verbose = 1,
                               callbacks=[mrr_logger, weight_saver])


# Run proper MSE model

In [None]:
# prepare y vectors: the regression target for each training pair is the
# embedding vector of its gold hypernym.
# The target width follows the embedding matrix (previously hard-coded to 200).
embedding_dim = data.embedding_matrix.shape[1]

y = np.zeros((len(hyper_seq), embedding_dim))
for idx, h in enumerate(hyper_seq):
    # h is a one-element index sequence; h[0] is the row in the embedding matrix
    y[idx] = data.embedding_matrix[h[0]]

v_y = np.zeros((len(v_hyper_seq), embedding_dim))
for idx, h in enumerate(v_hyper_seq):
    v_y[idx] = data.embedding_matrix[h[0]]


# train model
NUM_EPOCHS = 20
BATCH_SIZE = 512

he = HypernymEvaluation_MSE((data.valid_query, data.valid_hyper), data.tokenizer, projection_model)
mrr_logger = MRRLogger()
mrr_logger.set_evaluator(he)

# save weights whenever the validation metric improves; the template is
# filled with (epoch, metric)
weight_saver = BestModelWeightSaver()
weight_saver.set_mrr_logger(mrr_logger)
weight_saver.set_filepath('models/best_ltr_e%s_mrr%.4f')


# mrr_logger is listed first so its scores exist when weight_saver reads them
history = projection_model.fit([query_seq, neg_seq], y,
                               validation_data = ([v_query_seq, v_neg_seq], v_y),
                               batch_size = BATCH_SIZE, epochs = NUM_EPOCHS, verbose = 1,
                               callbacks=[mrr_logger, weight_saver])



# Train CRIM model using general fit function



In [12]:
# Build the CRIM training set: every gold (query, hypernym) pair is a positive
# instance, and each pair contributes n_negative sampled negative instances.
n_negative = 10

neg_crim_query, neg_crim_hyper = [], []
for q, h in zip(data.train_query, data.train_hyper):
    # draw n_negative distinct candidates from this query's pool
    # (mix of random and synonyms)
    rands = np.random.choice(data.random_words[q], n_negative, replace=False).tolist()
    neg_crim_query += [q] * n_negative
    neg_crim_hyper += rands

# positives first, then negatives; labels follow the same order
query = data.train_query + neg_crim_query
hyper = data.train_hyper + neg_crim_hyper
y = [1.] * len(data.train_query) + [0.] * len(neg_crim_query)

# convert words to index sequences with the shared tokenizer
query_seq = data.tokenizer.texts_to_sequences(query)
hyper_seq = data.tokenizer.texts_to_sequences(hyper)
v_query_seq = data.tokenizer.texts_to_sequences(data.valid_query)
v_hyper_seq = data.tokenizer.texts_to_sequences(data.valid_hyper)

# the validation set contains gold pairs only, so every label is 1
v_y = [1.] * len(v_query_seq)


In [None]:
# train model
# Fits the CRIM projection_model on the positive/negative instance lists
# (query_seq, hyper_seq, y) prepared in the preceding cell.
NUM_EPOCHS = 10
BATCH_SIZE = 64

# initialise MRR callback

# initialise evaluator
he = HypernymEvaluation_CRIM_Max((data.valid_query, data.valid_hyper), data.tokenizer, projection_model)
mrr_logger = MRRLogger()
mrr_logger.set_evaluator(he)

# save weights whenever the validation metric improves; the template is
# filled with (epoch, metric)
weight_saver = BestModelWeightSaver()
weight_saver.set_mrr_logger(mrr_logger)
weight_saver.set_filepath('models/best_ltr_e%s_mrr%.4f')


# mrr_logger is listed first so its scores exist when weight_saver reads them
history = projection_model.fit([query_seq, hyper_seq], y, validation_data = ([v_query_seq, v_hyper_seq], v_y), 
                               batch_size = BATCH_SIZE, epochs = NUM_EPOCHS, verbose = 1,
                               callbacks=[mrr_logger, weight_saver])

In [None]:
# Manually snapshot the current projection_model weights under a hand-written
# (epoch, metric) label.
snapshot_weights = projection_model.get_weights()
# The weight arrays have different shapes; pack them into an explicit 1-D
# object array so NumPy does not try (and, on recent versions, fail) to build
# a ragged array. Loading 'arr_0' and calling .tolist() still yields the list.
snapshot_packed = np.empty(len(snapshot_weights), dtype=object)
for snapshot_idx, snapshot_w in enumerate(snapshot_weights):
    snapshot_packed[snapshot_idx] = snapshot_w
np.savez_compressed('models/best_ltr_e%s_mrr%.4f' % (10, 0.1044), snapshot_packed)

In [15]:
# Spot-check: show the hypernyms the current projection_model predicts for one query.
he = HypernymEvaluation_CRIM_Max((data.valid_query, data.valid_hyper), data.tokenizer, projection_model)
he.predict_ltr_hypernym(['dirham'])

{'dirham': [u'monetary_unit',
  u'sale',
  u'the_treasury',
  u'amount_of_money',
  u'natural_person',
  u'paid',
  u'sum_of_money',
  u'stamp_duty',
  u'legal_tender',
  u'ration_card',
  u'market_price',
  u'net_worth',
  u'person',
  u'payment',
  u'gold_coin']}

In [14]:
# Initialise the fine-tuning model from the trained CRIM model:
# both embedding tables start from the pre-trained vectors and stay trainable,
# the Phi projections are copied over and frozen, and the sigmoid layer is
# copied over and left trainable.
projection_model_fine.get_layer(name='TermEmbedding_Q').set_weights([data.embedding_matrix])
projection_model_fine.get_layer(name='TermEmbedding_H').set_weights([data.embedding_matrix])

projection_model_fine.get_layer(name='TermEmbedding_H').trainable = True
projection_model_fine.get_layer(name='TermEmbedding_Q').trainable = True

# get projection matrices (a list comprehension, so this also works where
# map() returns an iterator)
dense = np.asarray([l.get_weights()[0] for l in projection_model.layers if l.name.startswith('Phi')])
# get sigmoid weights
lr_weights = projection_model.get_layer(name='Prediction').get_weights()

# inject the trained projection matrices into the matching Phi layers
phi_projections = [l for l in projection_model_fine.layers if l.name.startswith('Phi')]
if len(phi_projections) != len(dense):
    # a silent partial copy would leave some projections at their initial values
    raise ValueError('Phi layer count mismatch: source model has %d, fine-tuning model has %d'
                     % (len(dense), len(phi_projections)))
for idx, phi_projection in enumerate(phi_projections):
    phi_projection.set_weights([dense[idx]])
    phi_projection.trainable = False

projection_model_fine.get_layer(name='Prediction').set_weights(lr_weights)
projection_model_fine.get_layer(name='Prediction').trainable = True

# compile after the trainable flags are set
projection_model_fine.compile(optimizer='adadelta', loss='binary_crossentropy')

# Fine-tune model, but keeping Phi frozen


In [None]:
# train model
# Fine-tunes projection_model_fine on the same (query_seq, hyper_seq, y) data.
NUM_EPOCHS = 3
BATCH_SIZE = 64

# initialise MRR callback

# initialise evaluator
he = HypernymEvaluation_CRIM_Max((data.valid_query, data.valid_hyper), data.tokenizer, projection_model_fine)
mrr_logger = MRRLogger()
mrr_logger.set_evaluator(he)

# save weights whenever the validation metric improves; the template is
# filled with (epoch, metric)
weight_saver = BestModelWeightSaver()
weight_saver.set_mrr_logger(mrr_logger)
weight_saver.set_filepath('models/best_ltr_e%s_mrr%.4f')


# mrr_logger is listed first so its scores exist when weight_saver reads them
history = projection_model_fine.fit([query_seq, hyper_seq], y, validation_data = ([v_query_seq, v_hyper_seq], v_y), 
                               batch_size = BATCH_SIZE, epochs = NUM_EPOCHS, verbose = 1,
                               callbacks=[mrr_logger, weight_saver])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 129569 samples, validate on 200 samples
Epoch 1/3


# Train RankNet model from already trained feature extractor

In [None]:
# run ranknet on trained model
# first set feature_extractor Phi to non-trainable
# (only the layer named 'Phi0' is frozen here)
feature_extractor.get_layer(name='Phi0').trainable = False

# Targets are all ones for both training and validation.
y = np.ones((len(query_seq), 1))
# NOTE(review): `v_query` is not defined in the visible part of the notebook
# (other cells use `v_query_seq`) - confirm it exists when this cell runs.
v_y = np.ones((len(v_query), 1))
# train model
NUM_EPOCHS = 10
BATCH_SIZE = 32

# get RankNet model and scorer
model, get_score = get_RankNet_model(feature_extractor, dropout_rate=0.2)

# initialise MRR callback

# initialise evaluator
he = HypernymEvaluation_SquareDiff((data.valid_query, data.valid_hyper), data.tokenizer, feature_extractor, get_score)
mrr_logger = MRRLogger()
mrr_logger.set_evaluator(he)

# save weights whenever the validation metric improves; the template is
# filled with (epoch, metric)
weight_saver = BestModelWeightSaver()
weight_saver.set_mrr_logger(mrr_logger)
weight_saver.set_filepath('models/best_ltr_e%s_mrr%.4f')


# mrr_logger is listed first so its scores exist when weight_saver reads them
history = model.fit([query_seq, hyper_seq, neg_seq], y, 
                    validation_data = ([v_query_seq, v_hyper_seq, v_neg_seq], v_y),
                    batch_size = BATCH_SIZE, epochs = NUM_EPOCHS, verbose = 1,
                    callbacks=[mrr_logger, weight_saver]
                   )

# Load Model from Weights

In [13]:
# load saved weights
# The archive stores the list of weight arrays as a single object array under
# 'arr_0', so it can only be read with allow_pickle=True.
# NOTE: unpickling executes arbitrary code - only load weight files that were
# produced by this notebook.
with np.load('models/best_ltr_e3_mrr0.1257.npz', allow_pickle=True) as weight_archive:
    # read inside the `with` block so the file handle is closed afterwards
    model_weights = weight_archive['arr_0'].tolist()
projection_model.set_weights(model_weights)
                  
# refresh scorer
#get_score = K.function([s_vi], [rel_score])


In [None]:
# Print one row per epoch: epoch number, training loss, validation loss,
# validation MRR and validation MAP (the last two from mrr_logger).
epoch_rows = zip(history.history['loss'], history.history['val_loss'], mrr_logger.mrr, mrr_logger.map)
for idx, (loss, val_loss, mrr, mean_prec) in enumerate(epoch_rows):
    # a single formatted string prints the same space-separated row under both
    # Python 2 and Python 3
    print("%s %s %s %s %s" % (idx + 1, loss, val_loss, mrr, mean_prec))


In [None]:
# Spot-check: show the hypernyms the current projection_model predicts for one query.
he = HypernymEvaluation_CRIM_Max((data.valid_query, data.valid_hyper), data.tokenizer, projection_model)
he.predict_ltr_hypernym(['rod_laver'])



In [None]:
# Display the predictions stored by the logger at the end of the last epoch.
# NOTE(review): this dumps the whole object - consider showing only a small sample.
mrr_logger.predictions


# Evaluate on Test set

In [None]:
# Score the current projection_model on the test split and print the mean of
# every metric over all test queries.
print ("Generating predictions...")

test_pairs = (data.test_query, data.test_hyper)
he_test = HypernymEvaluation_CRIM_Max(test_pairs, data.tokenizer, projection_model)
ltr_predictions = he_test.predict_ltr_hypernyms()

print ("CRIM evaluation:")
score_names, all_scores = he_test.get_evaluation_scores(ltr_predictions)
for k, score_name in enumerate(score_names):
    # column k of every per-query score list belongs to metric k
    metric_total = sum([score_list[k] for score_list in all_scores])
    metric_mean = round(metric_total / len(all_scores), 5)
    print (score_name + ': ' + str(metric_mean))

# Rough work

In [None]:
# Rough check of the direct-projection model on the first training example:
# compare the loss Keras reports with distances recomputed by hand.
# print() with a single argument behaves the same under Python 2 and Python 3.
print(projection_model.test_on_batch([query_seq[0], neg_seq[0]], y[0].reshape(1,-1)))
print(np.sum(np.square(projection_model.predict([query_seq[0], neg_seq[0]]) - y[0])))
# kernel of the single projection layer named 'Phi'
phi = projection_model.get_layer(name='Phi').get_weights()[0]

# project the first query's embedding by hand
proj = np.dot(data.embedding_matrix[query_seq[0][0]], phi)
negative_ex =  data.embedding_matrix[neg_seq[0][0]]
pos_ex = y[0]

# squared distance of the projection to the target and to the negative example
print(np.sum(np.square(proj - pos_ex)))
print(np.sum(np.square(proj - negative_ex)))

In [None]:
# Rough check: project 'dog' with the `phi` matrix from the previous cell and
# compare the result with the embedding of 'animal'.
a = data.tokenizer.word_index['dog']
b = data.tokenizer.word_index['animal']
proj = np.dot(data.embedding_matrix[a], phi)


# squared distance between the projection and the original 'dog' embedding
# NOTE(review): this value is discarded - only the last expression of a cell is displayed.
np.sum(np.square(proj - data.embedding_matrix[a]))

# NOTE(review): import placed mid-notebook; consider moving it to the imports cell.
from sklearn.metrics.pairwise import cosine_similarity
# cosine similarity between the projected 'dog' vector and the 'animal' embedding
cosine_similarity(proj[np.newaxis,:], data.embedding_matrix[b][np.newaxis,:])


In [None]:
# Gather the pieces needed to score one example by hand: the kernels of every
# Dense layer whose name starts with 'Phi', three word vectors, and the
# weights of the 'Prediction' layer.
phi_kernels = []
for layer in projection_model.layers:
    if type(layer) == Dense and layer.name.startswith('Phi'):
        phi_kernels.append(layer.get_weights()[0])
phi_matrix = np.asarray(phi_kernels)

word_index = data.tokenizer.word_index
word = data.embedding_matrix[word_index['rod_laver']]
pos = data.embedding_matrix[word_index['athlete']]
neg = data.embedding_matrix[word_index['tennis_player']]

# get_weights() on the Prediction layer returns [kernel, bias]
prediction_weights = projection_model.get_layer(name='Prediction').get_weights()
cluster_weight = prediction_weights[0]
bias = prediction_weights[1]

#proj

#proj


In [None]:
# Display the kernel and bias of the 'Prediction' layer collected in the previous cell.
cluster_weight, bias

In [None]:
# Hand-computed score: apply every Phi kernel to `word`, average the
# projections, L2-normalise the mean, then feed its dot product with `neg`
# through the Prediction layer's weight and bias (no sigmoid applied).
proj = np.dot(word, phi_matrix)
proj = np.mean(proj, axis=0)

proj /= np.linalg.norm(proj)

# print() with a single argument behaves the same under Python 2 and Python 3
print(np.dot(proj, neg) * cluster_weight + bias)


