In [1]:
from gensim.models.keyedvectors import KeyedVectors
import numpy as np

### Import Word2Vec binary word embeddings

In [2]:
w2v = 'GoogleNews-vectors-negative300.txt'

In [None]:
#model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
# save to text version
#model.save_word2vec_format('GoogleNews-vectors-negative300.txt', binary=False)

In [3]:
model = KeyedVectors.load_word2vec_format(w2v, binary=False)
# pre-compute L2 norms of vectors
model.init_sims(replace=True)

In [4]:
import codecs
import os
import csv
from collections import defaultdict

def read_subsumptions(filename):
    subsumptions = []

    with codecs.open(filename, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)

        for row in reader:
            subsumptions.append((row[0], row[1]))

    return subsumptions

def read_synonyms(filename):
    synonyms = defaultdict(lambda: list())

    with codecs.open(filename,encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)

        for row in reader:
            for word in row[1].split(','):
                synonyms[row[0]].append(word)
    
    synonyms.default_factory = None
    return synonyms

In [5]:
train_subs = read_subsumptions('subsumptions-train.txt.orig')
test_subs = read_subsumptions('subsumptions-test.txt.orig')
valid_subs = read_subsumptions('subsumptions-validation.txt.orig')

synonyms = read_synonyms('synonyms.txt')

### Construct pre-trained word embeddings dictionary

In [6]:
# eliminate training tuples for which no embedding exists
from collections import Counter

def get_terms_having_vectors(dataset):    
    query, hyper = \
    zip(*[(q,h) for q, h in dataset 
          if q in model and h in model])
    
    return list(query), list(hyper)
    
train_query, train_hyper = get_terms_having_vectors(train_subs)
test_query, test_hyper = get_terms_having_vectors(test_subs)

assert len(train_query) == len(train_hyper)
assert len(test_query) == len(test_hyper)

### Remove OOV from synonym list

In [7]:
for k, v in list(synonyms.items()):
    if k not in model:
        synonyms.pop(k)
    else:
        for word in v:
            if word not in model:
                v.remove(word)
    
# flatten list of synonyms    
syns = [word for v in synonyms.values() for word in v]    

# confirm that all words in synonym vocab have embeddings representation
assert len(list(filter(lambda x: x in model, syns)))==len(syns)

### Define class Data which encapsulates all the bits and pieces we require for training algorithms

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Data class that encapsulates all word-based data I need to train the various algorithms
# We assume that we have all pre-filtered any words that don't feature in the embeddings
class Data:
    def __init__(self, train_query, train_hyper, test_query, test_hyper, synonyms, embeddings):
        # construct vocab made up from term and hypernyms 
        # we will choose negative samples from this vocab after exhausting
        # the synonyms
        self.neg_vocab = set(train_hyper + test_hyper)
        
        # encapsulate input variables so that all the data can be passed via class instance reference
        self.train_query = train_query
        self.train_hyper = train_hyper
        self.test_query = test_query
        self.test_hyper = test_hyper
        self.synonyms = synonyms
        
        # calculate size of term and hypernym dataset (train + test)
        n_hyponyms = len(set(train_query + test_query + syns))
        # hypernyms will be introduced in the model as either training,
        # gold positives, test gold positives (when evaluation) or
        # negative synonyms.
        n_hypernyms = len(set(train_hyper + test_hyper))

        # determine dimensionality of embeddings
        self.embeddings_dim = embeddings['animal'].shape[0]
        # intialise and fit tokenizer
        self.tokenizer = Tokenizer(num_words = n_hyponyms + n_hypernyms + 1, filters='!"#$%&()*+,-./:;<=>?@[\]^`{|}~)')
        self.tokenizer.fit_on_texts(train_query + train_hyper + test_query + test_hyper + syns)
        
        # construct embedding_matrix
        self.embedding_matrix = np.zeros((len(self.tokenizer.word_index)+1, self.embeddings_dim), dtype='float32')

        for word, i in self.tokenizer.word_index.items():
            if i < len(self.tokenizer.word_index) + 1:
                embedding_vector = embeddings[word]
                if embedding_vector is not None:
                    # normalise vector (already normalised)
                    #embedding_vector /= np.linalg.norm(embedding_vector)
                    self.embedding_matrix[i,:] = embedding_vector  
        # confirm shape
        assert self.embedding_matrix.shape == (len(self.tokenizer.word_index)+1, self.embeddings_dim)

In [9]:
data = Data(train_query, train_hyper, test_query, test_hyper, synonyms, model)

## Negative Sampling Strategies

In [10]:
# first exhaust synonyms;
# find the rest by drawing random terms from neg_vocab;
# however, make sure that chosen words are not valid hypernyms;
# finally, tokenise back to ids;    


# positive_sample and terms both expect tuples where positive_sample = (query, hyper)
# and terms = (all_query_terms, all_hyper_terms)
def get_negative_words(positive_sample, word_hypernyms, data, sample_size=5):
    neg_samples = []
    # we need to make a copy of the synonym list
    # synonmys will form part of out negative examples
    if positive_sample[0] in data.synonyms:
        neg_samples = list(synonyms[positive_sample[0]])        
    
    # there might not be enough; compound with random words
    if len(neg_samples) >= sample_size:
        # jumble negative sample indices        
        neg_samples = np.random.choice(neg_samples, sample_size, replace=False)
    else:
        # get current sample's hypernyms
        positive_hypernyms = word_hypernyms[positive_sample[0]]
        
        # eliminate correct hypernyms from neg_vocab
        word_choice = [nv for nv in data.neg_vocab if nv not in positive_hypernyms and nv not in neg_samples]        
        # choose m - len(neg_samples)
        neg_samples.extend(np.random.choice(word_choice, (sample_size-len(neg_samples))).tolist())
            
    return neg_samples
    

In [11]:
def get_random_negative_words(positive_sample, word_hypernyms, data, sample_size=5):
    neg_samples = []
    
    positive_hypernyms = word_hypernyms[positive_sample[0]]
    
    sample_space = filter(lambda w: w not in positive_hypernyms, data.tokenizer.word_index.keys())
    neg_samples = np.random.choice(list(sample_space), sample_size, replace=False)
    
    return neg_samples


In [12]:
# find most similar words to given hypernym which is not valid hypernym of word
def get_similar_hypernyms(positive_sample, word_hypernyms, data, sample_size=5):    
    word = data.tokenizer.word_index[positive_sample[1]]
    candidate_words = list(filter(lambda w: w != word, data.tokenizer.index_word.keys()))
    sims = list(map(lambda c: np.dot(data.embedding_matrix[c], data.embedding_matrix[word]), candidate_words))

    # get 30 most similar words to hypernyms
    most_sim_idx = np.argsort(sims)[::-1][:30]    
    similar_hypernyms = [data.tokenizer.index_word[candidate_words[idx]] for idx in most_sim_idx]
    
    # make sure that similar words are not actual hypernyms    
    positive_hypernym = word_hypernyms[positive_sample[0]]
    
    return list(filter(lambda x: x not in positive_hypernym, similar_hypernyms))[:sample_size]

In [13]:
# get one similar hypernym and random words
def get_similar_hyponyms(positive_sample, word_hypernyms, data, sample_size=5):    
    # get current sample's hypernyms
    positive_hypernyms = word_hypernyms[positive_sample[0]]
    
    # add query term to hypernym list
    positive_hypernyms.append(positive_sample[0])        

    
    # get candidate words - all vocab except hypernyms of current word and current word
    candidate_words = list(filter(lambda w: w not in positive_hypernyms, data.tokenizer.word_index.keys()))
    
    hypo_sims = list(map(lambda c: np.dot(        
                            data.embedding_matrix[data.tokenizer.word_index[c]], 
                            data.embedding_matrix[data.tokenizer.word_index[positive_sample[0]]]), candidate_words))
    
    most_sim_idx = np.argsort(hypo_sims)[::-1][:sample_size]
    return list(map(lambda i: candidate_words[i], most_sim_idx))

In [14]:
# get one similar hypernym and random words
def mix_sim_hyper_random(positive_sample, word_hypernyms, data, sample_size=5):    
    # init neg_samples
    neg_samples = []
    
    # get current sample's hypernyms
    positive_hypernyms = word_hypernyms[positive_sample[0]]
    
    # add query term to hypernym list
    positive_hypernyms.append(positive_sample[0])    
    
    # get candidate words - all vocab except hypernyms of current word and current word
    candidate_words = list(filter(lambda w: w not in positive_hypernyms, data.tokenizer.word_index.keys()))
    
    
    # find similarity of all candidate words w.r.t. current hypernym
    hyper_sims = list(map(lambda c: np.dot(
                            data.embedding_matrix[data.tokenizer.word_index[c]], 
                            data.embedding_matrix[data.tokenizer.word_index[positive_sample[1]]]), candidate_words))

    # get most similar word to hypernym which is not hypernym
    most_sim_idx = np.argsort(hyper_sims)[::-1][0]    
    # append most similar hypernym to negative samples
    neg_samples.append(candidate_words[most_sim_idx])
        
    if len(neg_samples) < sample_size:
        neg_samples.extend(get_negative_words(positive_sample, word_hypernyms, data, sample_size=sample_size-1))
    
    return neg_samples


In [15]:
# Create list of tuples where every element follows (word, negative_word)
def get_negative_tuples(terms, data, negative_words_lambda, sample_size):
    # convert terms to dictionary
    input_query, input_hyper = terms
    unq_input_query = sorted(list(set(input_query)))
    
    word_hypernyms = {}
    for w in unq_input_query:        
        word_hypernyms[w] = [h for q, h in zip(input_query, input_hyper) if q == w]
        
            
    negative_tuples = []    
    for words in zip(*terms):
        negatives = negative_words_lambda(words, word_hypernyms, data, sample_size)
        negative_tuples.extend(
                [(words, n) for n in negatives]
        )    
    return negative_tuples

In [565]:
get_negative_tuples((data.test_query, data.test_hyper), data, get_random_negative_words, 5)

[(('vision', 'experience'), 'bookstore'),
 (('vision', 'experience'), 'content'),
 (('vision', 'experience'), 'mold'),
 (('vision', 'experience'), 'missouri'),
 (('vision', 'experience'), 'man'),
 (('lime', 'citrus'), 'tobacco'),
 (('lime', 'citrus'), 'useful'),
 (('lime', 'citrus'), 'yew'),
 (('lime', 'citrus'), 'cyclical'),
 (('lime', 'citrus'), 'placental'),
 (('lime', 'plant'), 'dwell'),
 (('lime', 'plant'), 'look'),
 (('lime', 'plant'), 'player'),
 (('lime', 'plant'), 'company'),
 (('lime', 'plant'), 'rest'),
 (('lime', 'tree'), 'machine'),
 (('lime', 'tree'), 'separation'),
 (('lime', 'tree'), 'fold'),
 (('lime', 'tree'), 'handcart'),
 (('lime', 'tree'), 'mechanical'),
 (('lime', 'food'), 'grind'),
 (('lime', 'food'), 'ideology'),
 (('lime', 'food'), 'receive'),
 (('lime', 'food'), 'abalone'),
 (('lime', 'food'), 'hippopotamus'),
 (('lime', 'produce'), 'seaplane'),
 (('lime', 'produce'), 'exchange'),
 (('lime', 'produce'), 'sunlight'),
 (('lime', 'produce'), 'devolve'),
 (('lime'

In [16]:
# function that returns negative samples alongside set of positive samples
# we need to pass:
# the batch hyponym terms, batch of hypernym terms, negative_tuples, tokenizer 
# to create sequences
def extend_batch_with_negatives(batch_X_term, batch_X_hyper, negative_tuples,                              
                                tokenizer):
    # initialise negative tuples container
    positive_words = [(tokenizer.index_word[term_id], tokenizer.index_word[hyper_id]) \
                          for term_id, hyper_id in zip(batch_X_term.flatten(), batch_X_hyper.flatten())]
    
    # tokenize -ve samples
    neg_terms, neg_hyper = zip(*[(qh[0], h) for qh, h in negative_tuples if qh in positive_words])
    
    neg_terms_seq = tokenizer.texts_to_sequences(neg_terms)
    neg_hyper_seq = tokenizer.texts_to_sequences(neg_hyper)

    # before increasing size of our batch, let's set the actual y values
    # the first n terms are true (1s), and the rest are the -ve samples (0)
    batch_y_label = np.concatenate((
            np.ones(batch_X_term.shape[0]),
            np.zeros(len(neg_terms_seq))
    ))
    # finally, stack -ve sequences at the bottom of +ves to 
    # create our final training batch
    # at most, batch size will be 192 samples            

    batch_X_term = np.vstack((batch_X_term, np.array(neg_terms_seq)))
    batch_X_hyper = np.vstack((batch_X_hyper, np.array(neg_hyper_seq)))
    
    return batch_X_term, batch_X_hyper, batch_y_label

## Model Definition in Keras

In [17]:
from tensorflow.keras import backend as K
from tensorflow.keras.constraints import Constraint

class ForceToOne (Constraint):    
    def __call__(self, w):
        w /= w
        return w

In [24]:
from tensorflow.keras.layers import Input, Dense, Embedding, Dot, Flatten, Concatenate, concatenate, Reshape, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import RandomNormal, Zeros, Ones
from tensorflow.keras.regularizers import l2, l1, l1_l2
from tensorflow.keras.constraints import UnitNorm

from tensorflow.keras import backend as K
import tensorflow as tf

# Phi layer initialiser
def random_identity(shape, dtype="float32", partition_info=None):    
    rnorm = K.random_normal((shape[-1],shape[-1]), mean=0., stddev=0.05)
    #identity = K.eye(shape[-1], dtype='float32')        
    rident = tf.eye(shape[-1]) * rnorm
    return rident

def random_normal(shape, dtype="float32", partition_info=None): 
    return K.random_normal((shape[-1],shape[-1]), 
                             mean=0., stddev=0.05) 

def get_CRIM_model(phi_k=1, train_embeddings=False,\
                   embeddings_dim=300, vocab_size=1000,\
                   embeddings_matrix=None,
                   phi_init = None,
                   phi_activity_regularisation = None,
                   sigmoid_kernel_regularisation = None,
                   sigmoid_bias_regularisation = None,
                   sigmoid_kernel_constraint = None,
                   do_dropout = False
                  ):
    
    hypo_input  = Input(shape=(1,), name='Hyponym')
    hyper_input = Input(shape=(1,), name='Hypernym')
    
    embedding_layer = Embedding(vocab_size + 1, embeddings_dim, embeddings_constraint = UnitNorm(axis=1), 
                                name='TermEmbedding')
    
    
    hypo_embedding = embedding_layer(hypo_input)    
    hyper_embedding = embedding_layer(hyper_input)
    
    # Add Dropout to avoid overfit
    if do_dropout:
        hypo_embedding = Dropout(0.25)(hypo_embedding)
        hyper_embedding = Dropout(0.25)(hyper_embedding)
    
    phi_layer = []
    for i in range(phi_k):
        phi_layer.append(Dense(embeddings_dim, activation=None, use_bias=False, 
                               activity_regularizer=phi_activity_regularisation,
                               kernel_initializer=phi_init,                               
                               name='Phi%d' % (i))(hypo_embedding))

    #phi1 = Dense(embeddings_dim, activation=None, use_bias=False, 
                #kernel_initializer=random_identity, name='Phi1')(hypo_embedding)

    if phi_k == 1:
        # flatten tensors
        phi = Flatten()(phi_layer[0])
        hyper_embedding = Flatten()(hyper_embedding)    
    else:
        phi = concatenate(phi_layer, axis=1)

    
    # this is referred to as "s" in the "CRIM" paper    
    phi_hyper = Dot(axes=-1, normalize=False, name='DotProduct')([phi, hyper_embedding])
    
    if phi_k > 1:
        phi_hyper = Flatten()(phi_hyper)
    
    predictions = Dense(1, activation="sigmoid", name='Prediction',
                        use_bias=True,
                        kernel_initializer=Ones,
                        kernel_constraint= sigmoid_kernel_constraint,
                        bias_initializer=Zeros,                        
                        kernel_regularizer=sigmoid_kernel_regularisation,
                        bias_regularizer=sigmoid_bias_regularisation
                       )(phi_hyper)

    # instantiate model
    model = Model(inputs=[hypo_input, hyper_input], outputs=predictions)
        
    # inject pre-trained embedding weights into Embedding layer
    model.get_layer(name='TermEmbedding').set_weights([embeddings_matrix])
    model.get_layer(name='TermEmbedding').trainable = train_embeddings    

    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model


### The training algorithm incorporates mini-batch stochastic descent and negative sampling

In [139]:
def train(model,       # the model which parameters will be learnt
          epochs,      # number of epochs to run          
          batch_size,  # size of mini-batch
          m,           # number of negative samples
          data,        # data required for training                              
          neg_strategy
         ):

    # create negative tuples
    #negative_tuples = get_negative_tuples(data.train_query + data.test_query,
     #                                     data.train_hyper + data.test_hyper, data.neg_vocab, m)
    
    print ("Generating negative tuples...")
    negative_tuples = get_negative_tuples((data.train_query + data.test_query, data.train_hyper + data.test_hyper), 
                                           data, neg_strategy, m)
    print ("Negative tuples...ok")
    
    # create sequences
    term_train_seq = data.tokenizer.texts_to_sequences(data.train_query)
    hyper_train_seq = data.tokenizer.texts_to_sequences(data.train_hyper)

    term_test_seq = data.tokenizer.texts_to_sequences(data.test_query)
    hyper_test_seq = data.tokenizer.texts_to_sequences(data.test_hyper)
                
    samples = np.arange(len(term_train_seq))
    validation_samples = np.arange(len(term_test_seq))
    
    # train algorithm
    for epoch in range(epochs):
        # reset loss
        loss = 0.
        test_loss = 0.
                        
        np.random.shuffle(samples)        

        shuffled_X_term, shuffled_X_hyper =\
            np.array(term_train_seq, dtype='int32')[samples],\
            np.array(hyper_train_seq, dtype='int32')[samples]

        for b in range(0, len(samples), batch_size):
            # product mini-batch, consisting of 32 +ve samples
            batch_X_term = shuffled_X_term[b:b + batch_size] 
            batch_X_hyper = shuffled_X_hyper[b:b + batch_size]

            # complement +ve samples with negatives
            batch_X_term, batch_X_hyper, batch_y_label =\
            extend_batch_with_negatives(batch_X_term, batch_X_hyper,
                                        negative_tuples,
                                        data.tokenizer
                                       )            
            
            # shuffle validation set indices
            np.random.shuffle(validation_samples)
            # pick batch of shuffled test instances with size equal to training batch
            batch_X_test_term, batch_X_test_hyper =\
                np.array(term_test_seq, dtype='int32')[validation_samples[:batch_size]],\
                np.array(hyper_test_seq, dtype='int32')[validation_samples[:batch_size]]
            
            # distort test batch with some negatives to check how algorithm fares with
            # negatives
            batch_X_test_term, batch_X_test_hyper, batch_y_test_label =\
            extend_batch_with_negatives(batch_X_test_term, batch_X_test_hyper,
                                        negative_tuples,
                                        data.tokenizer
                                       )            

            # train on batch
            loss += model.train_on_batch([batch_X_term, batch_X_hyper], 
                                          batch_y_label)[0]
            
            test_loss += model.test_on_batch([batch_X_test_term, batch_X_test_hyper], 
                                              batch_y_test_label)[0]                
            
        print('Epoch:', epoch+1, 'Loss:', loss, 'Test Loss:', test_loss)    


In [692]:
from tensorflow.keras.initializers import RandomNormal, Zeros, Ones
from tensorflow.keras.regularizers import l2, l1, l1_l2

#rand_norm_m0_sd001 = RandomNormal(mean = 0.0, stddev=0.01, seed=42)
#rand_norm = RandomNormal(mean = 0.0, stddev=1., seed=42)

# negative sampling options
neg_sampling_options = {'synonym':get_negative_words, 
                        'mix_hyper_synonym': mix_sim_hyper_random,
                        'similar_hyponym': get_similar_hyponyms,
                        'random': get_random_negative_words
                       }

# phi random init options
phi_init_options = {'random_identity': random_identity, 'random_normal': random_normal}

# implement mini-batch stochastic training
epochs = 15

batch_size = 32

# number of negative samples
m = 10
phi_k = 1
train_embeddings = False
negative_option = 'mix_hyper_synonym'
phi_init_option = 'random_identity'
do_dropout = False
np.random.seed(42)

# create model
crim_model = get_CRIM_model(phi_k = phi_k, train_embeddings = train_embeddings,
                            embeddings_dim = data.embeddings_dim, vocab_size = len(data.tokenizer.word_counts),
                            embeddings_matrix = data.embedding_matrix,
                            phi_init = phi_init_options[phi_init_option],                            
                            sigmoid_kernel_regularisation = l2(0.001),
                            sigmoid_bias_regularisation = l2(0.001),
                            sigmoid_kernel_constraint = None,#ForceToOne(),
                            do_dropout = do_dropout
                           )

print ("Training started...")
print ('Epochs: ', epochs, 'Batch size: ', batch_size, 'm: ', m, 'pki_k: ', phi_k, 'train_embeddings: ', train_embeddings,
      'Negative sampling: ', negative_option, 'Phi Init: ', phi_init_option, 'Dropout: ', do_dropout)

train(crim_model, epochs, batch_size, m, data, neg_sampling_options[negative_option])

Training started...
Epochs:  15 Batch size:  32 m:  10 pki_k:  1 train_embeddings:  False Negative sampling:  mix_hyper_synonym Phi Init:  random_identity Dropout:  False
Generating negative tuples...
Negative tuples...ok
Epoch: 1 Loss: 57.722871869802475 Test Loss: 56.6726176738739
Epoch: 2 Loss: 35.62424777448177 Test Loss: 35.56502993404865
Epoch: 3 Loss: 29.496768444776535 Test Loss: 30.083672389388084
Epoch: 4 Loss: 25.99754847586155 Test Loss: 26.973308578133583
Epoch: 5 Loss: 23.48804147541523 Test Loss: 24.98801949620247
Epoch: 6 Loss: 21.520301684737206 Test Loss: 23.68659245967865
Epoch: 7 Loss: 19.911133125424385 Test Loss: 22.670902863144875
Epoch: 8 Loss: 18.51924930512905 Test Loss: 22.519148021936417
Epoch: 9 Loss: 17.283996485173702 Test Loss: 21.71220640093088
Epoch: 10 Loss: 16.171499885618687 Test Loss: 21.015945971012115
Epoch: 11 Loss: 15.155263021588326 Test Loss: 21.38067301362753
Epoch: 12 Loss: 14.221636325120926 Test Loss: 21.29163869470358
Epoch: 13 Loss: 13.

In [693]:
# have a look at the prediction layer weights
print (crim_model.get_layer(name='Prediction').get_weights())
#projs = ['Phi0', 'Phi1', 'Phi2', 'Phi3', 'Phi4']
projs = ['Phi0']
for p in projs:
    print (np.mean(crim_model.get_layer(name=p).get_weights()[0]))

[array([[2.9366813]], dtype=float32), array([-0.589812], dtype=float32)]
0.0014073642


<tensorflow.python.keras.engine.training.Model at 0x7faaf4135cf8>

## Evaluation  code

Main observations:<br>
1. Tendency is for the model to overfit if we make the model larger than 1 projection matrix;
1. Negative samples are important for the model to learn which words are not hypernyms;
1. Although the model does seem to learn the correct words that are related to hypernymy to the query terms, it does not stop it from predicting with high confidence that similar but completely unrelated words are also hypernyms;
    1. This is really apparent for animals where the model is not able to distinguish between vertebrate and invertebrate; mammal; animal; and so forth;
    1. It's possible that we did not have enough examples to distinguish the various types of animals from each other;
    1. Also, more targeted negative samples could have helped but these would have to be hand-created;


In [1010]:
hyper_candidates = [[data.tokenizer.word_index[hyper]] for hyper in data.tokenizer.word_index.keys()]

def crim_get_top_hypernyms(query, hyper_candidates, model, data, top):
    query_index = data.tokenizer.word_index[query]    
    valid_candidates = list(filter(lambda w: w != [query_index], hyper_candidates))
    
    candidate_sim = list(map(lambda x: model.predict([[query_index], x]).flatten()[0], valid_candidates))       
    top_idx = np.argsort(candidate_sim)[::-1][:top]    
    top_hyper = np.array(valid_candidates)[top_idx].flatten()
    
    return [(data.tokenizer.index_word[t], candidate_sim[top_idx[i]]) for i, t in enumerate(top_hyper)]


crim_get_top_hypernyms('sofa', hyper_candidates, crim_model, data, 15)

[('object', 0.9918138),
 ('artefact', 0.9729182),
 ('furnishing', 0.9690847),
 ('artifact', 0.95833313),
 ('place', 0.9513724),
 ('material', 0.9458266),
 ('furniture', 0.9362237),
 ('structure', 0.92470324),
 ('durable', 0.90841484),
 ('activity', 0.90772265),
 ('property', 0.8954227),
 ('piece', 0.8928697),
 ('equipment', 0.8792076),
 ('event', 0.8615166),
 ('building', 0.8571182)]

In [976]:
i = data.tokenizer.word_index['pool']
j = data.tokenizer.word_index['group']
crim_model.predict([[i], [j]])

array([[0.26786485]], dtype=float32)

### MRR, p@k evaluation

In [196]:
def convert_hypernyms_to_one_line(data):
    ordered_queries = sorted(list(set(data.test_query)))
    one_line = {}
    for w in ordered_queries:
        word_hypernyms = [h for q, h in zip(data.test_query, data.test_hyper) if q == w]
        one_line[w] = word_hypernyms
    return one_line

In [677]:
# taken from task_scorer.py provided with shared task resources
def mean_reciprocal_rank(r):
    """Score is reciprocal of the rank of the first relevant item
    First element is 'rank 1'.  Relevance is binary (nonzero is relevant).
    Example from http://en.wikipedia.org/wiki/Mean_reciprocal_rank
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
    Returns:
        Mean reciprocal rank
    """
    r = np.asarray(r).nonzero()[0]
    return 1. / (r[0] + 1) if r.size else 0.

def precision_at_k(r, k, n):
    """Score is precision @ k
    Relevance is binary (nonzero is relevant).
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
    Returns:
        Precision @ k
    Raises:
        ValueError: len(r) must be >= k
    """
    assert k >= 1
    r = np.asarray(r)[:k] != 0
    if r.size != k:
        raise ValueError('Relevance score length < k')
    return (np.mean(r)*k)/min(k,n)
    # Modified from the first version. Now the gold elements are taken into account

# used by Ustalov from https://github.com/nlpub/hyperstar/blob/master/evaluate.py
def compute_ats(data, measures):
    return [sum(measures[j].values()) / len(data.test_query) for j in range(len(measures))]



In [680]:
# predictions is a dictionary whereby key is query term and value is a list of ranked hypernym predictions
def get_evaluation_scores(data, predictions):
    all_scores = []    
    scores_names = ['MRR', 'P@1', 'P@5', 'P@10']
    for query, gold_hyps in convert_hypernyms_to_one_line(data).items():

        avg_pat1 = []
        avg_pat2 = []
        avg_pat3 = []

        pred_hyps = predictions[query]
        gold_hyps_n = len(gold_hyps)    
        r = [0 for i in range(15)]

        for j in range(len(pred_hyps)):
            if j < gold_hyps_n:
                pred_hyp = pred_hyps[j]
                if pred_hyp in gold_hyps:
                    r[j] = 1

        avg_pat1.append(precision_at_k(r,1,gold_hyps_n))
        avg_pat2.append(precision_at_k(r,5,gold_hyps_n))
        avg_pat3.append(precision_at_k(r,10,gold_hyps_n))    

        mrr_score_numb = mean_reciprocal_rank(r)
        avg_pat1_numb = sum(avg_pat1)/len(avg_pat1)
        avg_pat2_numb = sum(avg_pat2)/len(avg_pat2)
        avg_pat3_numb = sum(avg_pat3)/len(avg_pat3)

        score_results = [mrr_score_numb, avg_pat1_numb, avg_pat2_numb, avg_pat3_numb]
        all_scores.append(score_results)
    return scores_names, all_scores

def get_ustalov_evaluation_scores(data, predictions):
    measures = [{} for _ in range(10)]

    for i, (t,h) in enumerate(zip(data.test_query, data.test_hyper)):
        actual = predictions[t]
        for j in range(0, len(measures)):
            measures[j][(t, h)] = 1. if h in actual[:j + 1] else 0.

    ats = compute_ats(data, measures) 
    return ats


In [874]:
from sklearn.metrics.pairwise import cosine_similarity

# alternative hypernym generator by applying Phi weights to hyponym and see which 
# words are closest to this vector
def alt_get_hypernym(word, model, data, embeddings, top, bias = 0):
    q_idx = data.tokenizer.word_index[word]    
    
    q = embeddings[q_idx]    
        
    try:
        _phi = model.get_layer(name='Phi0').get_weights()[0]
    except ValueError:
        _phi = model.get_layer(name='Phi').get_weights()[0]
        
    #
    #_phi = model.get_layer(name='Phi0').get_weights()[0] +\
    #       model.get_layer(name='Phi1').get_weights()[0] +\
    #       model.get_layer(name='Phi2').get_weights()[0]    
    _proj = np.dot(q, _phi)
    #_proj /= np.linalg.norm(_proj)
    
    #sim = cosine_similarity(embeddings[1:], _proj.reshape(1,-1)).flatten() 
    sim = np.array(list(map(lambda v: np.dot(v, _proj), embeddings[1:]))) + bias
    
    return list(map(lambda i: (data.tokenizer.index_word[i+1], sim[i]), np.argsort(sim)[::-1][:top]))

def ustalov_get_hypernyms(word, _model, data, embeddings, top):
    q_idx = data.tokenizer.word_index[word]        
    q = embeddings[q_idx]       
    
    _phi = _model.get_layer(name='Phi0').get_weights()[0]
        
    Y_hat = np.dot(q, _phi)
    Y_hat /= np.linalg.norm(Y_hat)    
    
    return model.similar_by_vector(Y_hat)


In [667]:
ustalov_get_hypernyms('lime', crim_model, data, crim_model.get_layer(name="TermEmbedding").get_weights()[0], 300)

  if np.issubdtype(vec.dtype, np.int):


[('Ky.Busch_##-###', 0.5357945561408997),
 ('M.Kenseth_###-###', 0.5254930257797241),
 ('J.McMurray_###-###', 0.5196651816368103),
 ('E.Sadler_###-###', 0.5190649032592773),
 ('G.Biffle_###-###', 0.4628258943557739),
 ('Nasdaq_NASDAQ_TRIN', 0.4541865885257721),
 ('mso_para_margin_0in', 0.4513596296310425),
 ('T.Stewart_##-###', 0.44903600215911865),
 ('HuMax_IL8_TM', 0.43771377205848694),
 ('K.Kahne_###-###', 0.43192803859710693)]

In [1321]:
# test get hypernym routine
alt_get_hypernym('deaf',crim_model, data, crim_model.get_layer(name="TermEmbedding").get_weights()[0], 15)

[('person', 0.8147533),
 ('vertebrate', 0.63432205),
 ('deaf', 0.4659867),
 ('human', 0.4530565),
 ('responsive', 0.43644807),
 ('neural', 0.33325133),
 ('communication', 0.17331588),
 ('relational', 0.16485256),
 ('animal', 0.14382486),
 ('patient', 0.11207277),
 ('competent', 0.09620102),
 ('placental', 0.08047774),
 ('emotion', 0.0673254),
 ('profession', 0.06708583),
 ('familial', 0.056225777)]

In [572]:
def predict_crim_hypernyms(data, model):
    hyper_candidates = [[data.tokenizer.word_index[hyper]] for hyper in data.tokenizer.word_index.keys()]
    ordered_queries = sorted(list(set(data.test_query)))
    results = {}
        
    embeddings = crim_model.get_layer(name="TermEmbedding").get_weights()[0]                     
    for idx, word in enumerate(ordered_queries):        
        if (idx + 1) % 25 == 0:
            print ("Done", idx + 1)
        #predicted_hypers = crim_get_top_hypernyms(word, hyper_candidates, model, data, 15)
        predicted_hypers = alt_get_hypernym(word, model, data, embeddings, 15)
        results[word] = [h for h, p in predicted_hypers]
        
    return results

In [694]:
import math

crim_predictions = predict_crim_hypernyms(data, crim_model)

print ("CRIM evaluation:")
score_names, all_scores = get_evaluation_scores(data, crim_predictions)
for k in range(len(score_names)):
    print (score_names[k]+': '+str(round(sum([score_list[k] for score_list in all_scores]) / len(all_scores), 5)))
print ("")
print ("Ustalov-style evaluation:")
ats = get_ustalov_evaluation_scores(data, crim_predictions)
ats_string = ', '.join(['A@%d=%.4f' % (j + 1, ats[j]) for j in range(len(ats))])
print (ats_string)

Done 25
Done 50
Done 75
Done 100
Done 125
Done 150
Done 175
Done 200
Done 225
Done 250
Done 275
Done 300
Done 325
Done 350
Done 375
Done 400
Done 425
Done 450
CRIM evaluation:
MRR: 0.52341
P@1: 0.5
P@5: 0.37957
P@10: 0.35763

Ustalov-style evaluation:
A@1=0.1468, A@2=0.2429, A@3=0.3240, A@4=0.3851, A@5=0.4409, A@6=0.4649, A@7=0.4916, A@8=0.5123, A@9=0.5305, A@10=0.5416


## Evaluation Results
* Epochs:  15 Batch size:  32 m:  5 pki_k:  1 train_embeddings:  False Negative sampling:  synonym Phi Init:  random_normal

CRIM evaluation:<br>
MRR: 0.48746<br>
P@1: 0.43584<br>
P@5: 0.34417<br>
P@10: 0.32088<br>

* Epochs:  20 Batch size:  32 m:  5 pki_k:  1 train_embeddings:  False Negative sampling:  synonym Phi Init:  random_normal

CRIM evaluation:<br>
MRR: 0.4903<br>
P@1: 0.43584<br>
P@5: 0.35612<br>
P@10: 0.33251<br>

* Epochs:  10 Batch size:  32 m:  5 pki_k:  5 train_embeddings:  False Negative sampling:  synonym Phi Init:  random_normal
    * LR parameters are unconstrained
    
CRIM evaluation:<br>
MRR: 0.49336<br>
P@1: 0.45133<br>
P@5: 0.36176<br>
P@10: 0.34156<br>

* Epochs:  15 Batch size:  32 m:  10 pki_k:  1 train_embeddings:  False Negative sampling:  synonym Phi Init:  random_normal
    * LR parameters constrained

CRIM evaluation:<br>
MRR: 0.49779<br>
P@1: 0.45133<br>
P@5: 0.36622<br>
P@10: 0.3412<br>

* Epochs:  15 Batch size:  32 m:  10 pki_k:  1 train_embeddings:  False Negative sampling:  synonym Phi Init:  random_identity
    * LR parameters constrained

CRIM evaluation:<br>
MRR: 0.49926<br>
P@1: 0.45354<br>
P@5: 0.36947<br>
P@10: 0.34335<br>

* Epochs:  15 Batch size:  32 m:  5 pki_k:  1 train_embeddings:  False Negative sampling:  mix_hyper_synonym Phi Init:  random_identity

CRIM evaluation:<br>
MRR: 0.49819<br>
P@1: 0.46239<br>
P@5: 0.33621<br>
P@10: 0.31726<br>

* Epochs:  15 Batch size:  32 m:  10 pki_k:  1 train_embeddings:  False Negative sampling:  mix_hyper_synonym Phi Init:  random_identity

CRIM evaluation:<br>
MRR: 0.52017<br>
P@1: 0.48894<br>
P@5: 0.36892<br>
P@10: 0.34381<br>

Ustalov-style evaluation:<br>
A@1=0.1435, A@2=0.2455, A@3=0.3253, A@4=0.3792, A@5=0.4162, A@6=0.4455, A@7=0.4695, A@8=0.4844, A@9=0.4981, A@10=0.5117<br>

*With regularisation and no constraining of LR parameters*<br>

CRIM evaluation:<br>
MRR: 0.52341<br>
P@1: 0.5<br>
P@5: 0.37957<br>
P@10: 0.35763<br>

Ustalov-style evaluation:
A@1=0.1468, A@2=0.2429, A@3=0.3240, A@4=0.3851, A@5=0.4409, A@6=0.4649, A@7=0.4916, A@8=0.5123, A@9=0.5305, A@10=0.5416<br>


* Epochs:  15 Batch size:  32 m:  10 pki_k:  1 train_embeddings:  False Negative sampling:  synonym Phi Init:  random_identity Dropout:  True

CRIM evaluation:<br>
MRR: 0.47622<br>
P@1: 0.43363<br>
P@5: 0.32861<br>
P@10: 0.3021<br>

* Epochs:  15 Batch size:  32 m:  10 pki_k:  1 train_embeddings:  False Negative sampling:  random Phi Init:  random_identity Dropout:  False

CRIM evaluation:<br>
MRR: 0.49723<br>
P@1: 0.44912<br>
P@5: 0.37105<br>
P@10: 0.35053<br>

Ustalov evaluation:<br>
A@1=0.1318, A@2=0.2390, A@3=0.3260, A@4=0.3968, A@5=0.4487, A@6=0.4857, A@7=0.5149, A@8=0.5448, A@9=0.5623,< A@10=0.5766<br>

**Note how evaluation metrics paint a different picture of which algorithm fares best.  The random negative sampling model with unconstrained (but regularised) prediction layer parameters, beats the constrained model using a mix of similar hypernyms and random synonyms.**
**The latter yields slightly more accuracy highly-ranked hypernyms but then has the tendency to suggest incorrect results with lesser probability.  

# Yamane Implementation


In [18]:
## attempt custom constraint to keep weight fixed at 1.
from tensorflow.keras.constraints import Constraint
from tensorflow.keras import backend as K

class ForceToOne (Constraint):    
    def __call__(self, w):
        w /= w
        return w

In [19]:
from tensorflow.keras.layers import Input, Dense, Embedding, Dot, Flatten, Concatenate, concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.initializers import RandomNormal, Zeros, Ones
from tensorflow.keras import backend as K

import tensorflow as tf

def get_new_cluster_model(embedding_layer, 
                          phi_dim = 300,
                          phi_init = None,
                          phi_activity_regularisation = None,                          
                          sigmoid_bias_regularisation = None,
                          sigmoid_kernel_constraint = None,
                          do_dropout = False):
    
    hypo_input = Input(shape=(1,), name='Hyponym')    
    hyper_input = Input(shape=(1,), name='Hypernym')
    
    hypo_embedding, hyper_embedding = embedding_layer([hypo_input, hyper_input])
    
    if do_dropout:
        hypo_embedding = Dropout(0.25)(hypo_embedding)
        hyper_embedding = Dropout(0.25)(hyper_embedding)
                
    phi = Dense(phi_dim, activation=None, use_bias=False,                 
                kernel_initializer=phi_init,                
                name='Phi0')(hypo_embedding)
    
    # flatten phi and hyper_embedding tensors
    phi = Flatten()(phi)
    hyper_embedding = Flatten()(hyper_embedding)
    
    phi_hyper = Dot(axes=-1, normalize=False, name='DotProduct')([phi, hyper_embedding])    
    
    predictions = Dense(1, activation = "sigmoid", 
                        bias_initializer = Zeros,
                        kernel_initializer = Ones,
                        kernel_constraint = sigmoid_kernel_constraint,                        
                        bias_regularizer=sigmoid_bias_regularisation, 
                        name='Prediction')(phi_hyper)
    # instantiate model
    model = Model(inputs=[hypo_input, hyper_input], outputs=predictions)
    
    # compile using binary_crossentropy loss
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return model

### We don't need a unique embedding layer for every sub-model.  

Instead, we can create a separate model for the embeddings and set the weights according to the pre-trained embeddings

In [20]:
def get_embeddings_model(dim, embedding_matrix):
    hypo_input = Input(shape=(1,))
    hyper_input = Input(shape=(1,))

    word_embedding = Embedding(embedding_matrix.shape[0], dim, name='WE')

    hypo_embedding = word_embedding(hypo_input)
    hyper_embedding = word_embedding(hyper_input)

    embedding_model = Model(inputs=[hypo_input, hyper_input], outputs=[hypo_embedding, hyper_embedding])

    # inject pre-trained embeddings into this mini, resusable model/layer
    embedding_model.get_layer(name='WE').set_weights([embedding_matrix])
    embedding_model.get_layer(name='WE').trainable = False
    return embedding_model

In [21]:
class YamaneCluster:
    def __init__(self, embedding_layer, phi_dim, phi_init, sigmoid_kernel_constraint):
        
        self.model = get_new_cluster_model(embedding_layer = embedding_layer, 
                                           phi_dim = phi_dim, 
                                           phi_init = phi_init, 
                                           sigmoid_kernel_constraint = sigmoid_kernel_constraint)
        self.epoch_count = 0
        self.loss = 0.
        self.test_loss = 0.
    
    def increment_epoch(self):
        self.epoch_count += 1
        
    def update_loss(self, new_loss):
        self.loss += new_loss
        
    def update_test_loss(self, new_loss):
        self.test_loss += new_loss

In [22]:
def yamane_train(
    epochs,      # number of epochs to run
    m,           # number of negative samples
    data,        # class instance containing all the data required for training/testing        
    embedding_layer,
    threshold    = 0.15,     # threshold; similarity below this score will trigger new cluster
    negative_option = 'random', # inject lambda responsible for determining negative sample choice
    phi_init_option = None,
    sigmoid_kernel_constraint = None):    
  
    
    neg_sampling_options = {'synonym':get_negative_words, 
                            'mix_hyper_synonym': mix_sim_hyper_random,
                            'similar_hyponym': get_similar_hyponyms,
                            'random': get_random_negative_words
                           }
    
    phi_init_options = {'random_identity': random_identity, 'random_normal': random_normal}
        
    print ("Generating negative tuples...")
    negative_tuples = get_negative_tuples((data.train_query + data.test_query, data.train_hyper + data.test_hyper), 
                                           data, neg_sampling_options[negative_option], m)
    print ("Negative tuples...ok")
    
    # create sequences
    # we have two sets of inputs: one for training query and hypernym terms;
    #                             another for the validation query/hyper terms;
    term_train_seq = data.tokenizer.texts_to_sequences(data.train_query)
    hyper_train_seq = data.tokenizer.texts_to_sequences(data.train_hyper)

    term_test_seq = data.tokenizer.texts_to_sequences(data.test_query)
    hyper_test_seq = data.tokenizer.texts_to_sequences(data.test_hyper)
    
    # convert all to arrays
    term_train_seq, hyper_train_seq, term_test_seq, hyper_test_seq =\
    [np.array(x, dtype='int32') for x in [term_train_seq, hyper_train_seq, term_test_seq, hyper_test_seq]]
            
    # this list stores which cluster each training sequence pertains to
    sample_clusters = np.zeros(len(term_train_seq), dtype='int32')
    
    print ("m: ", m, "lambda: ", threshold, "max epoch per cluster: ", epochs, 
           "Negative sampling: ", negative_option, 'Phi Init: ', phi_init_option)
    
    
    print ("Sample clusters size: ", len(sample_clusters))
    # list containing 1 model per cluster
    clusters = []
    # add default model to our list of models
    # we share the embedding layer loaded with the pre-trained weights
    # append tuple where 1st element is the cluster and 2nd element is the 
    # number of epochs that cluster is trained
    
    clusters.append(YamaneCluster(embedding_layer, phi_dim=data.embeddings_dim,
                                  phi_init = phi_init_options[phi_init_option], sigmoid_kernel_constraint = sigmoid_kernel_constraint))
    
    # get training set indices
    indices = np.arange(len(term_train_seq))  
    
    # get test set indices
    test_indices = np.arange(len(term_test_seq))
            
    # initialise each training sample to cluster 0
    sample_clusters[indices] = 0        
    
    # seed random generator
    np.random.seed(42)
    
    # indicator of "current" sample cluster index
    z_i = 0
    
    # train algorithm
    #for epoch in range(epochs):
    epoch = 0
    test_loss = 0.    
    
    while np.min([c.epoch_count for c in clusters]) < epochs:
        # reset loss for each cluster                        
        for c in clusters:
            if c.epoch_count < epochs:                
                c.loss = 0.
            c.test_loss = 0.
        
        test_loss = 0.
        
        # shuffle indices every epoch
        np.random.shuffle(indices)
        
        # train algorithm by stochastic gradient descent, one sample at a time
        for idx, i in enumerate(indices):                        
            if (idx + 1) % 500 == 0:
                print ("Processed ", idx+1, "samples...")
            
            # calculate similarity on all clusters
            sim = list(map(lambda x: x.model.predict([term_train_seq[i], hyper_train_seq[i]]), clusters))
            max_sim = np.argmax(sim)
            #print "Term:", tokenizer.index_word[term_train_seq[i][0]], 'Hyper:', tokenizer.index_word[hyper_train_seq[i][0]], "Max Similarity cluster:", max_sim, "(sim = %0.8f)" % (sim[max_sim])
            # limit cluster creation to a max of 25.
            if ((sim[max_sim] < threshold) and (len(clusters) < 25)): 
                # add new cluster to list of clusters
                clusters.append(YamaneCluster(embedding_layer, phi_dim=data.embeddings_dim,
                                phi_init = phi_init_options[phi_init_option], sigmoid_kernel_constraint = sigmoid_kernel_constraint))
                
                # assign current cluster index to latest model
                z_i = len(clusters) - 1
                sample_clusters[i] = z_i
            else:            
                z_i = max_sim
                sample_clusters[i] = z_i                
                        
            # if current cluster reached/exceeded epoch count, skip current sample (i.e don't update cluster)
            if clusters[z_i].epoch_count < epochs:                                            
                # extend samples in cluster with negative samples
                batch_X_term, batch_X_hyper, batch_y_label =\
                    extend_batch_with_negatives(term_train_seq[i], 
                                                hyper_train_seq[i],
                                                negative_tuples,
                                                data.tokenizer
                                               )  

                # update parameters of cluster 
                clusters[z_i].update_loss(
                    clusters[z_i].model.train_on_batch([batch_X_term, batch_X_hyper], batch_y_label)[0]
                )
            
            # measure test loss 
            # every 32 samples (and updates are processed), we will test performance on validation set
            # of 32 randomly chosen samples. We will record test loss of every cluster and report on 
            # lowest loss
            
            if (idx + 1) % 100 == 0:
                np.random.shuffle(test_indices)
                batch_query, batch_hyper = term_test_seq[test_indices[:32]], hyper_test_seq[test_indices[:32]]
                batch_query, batch_hyper, test_y_label =\
                    extend_batch_with_negatives(batch_query, 
                                                batch_hyper,
                                                negative_tuples,
                                                data.tokenizer
                                               )  
                #batch_label = [1.] * batch_query.shape[0]
                for q, h, l in zip(batch_query, batch_hyper, test_y_label):                                    
                    test_losses = list(map(lambda c: c.model.test_on_batch([q, h], [l])[0], clusters))
                    best_cluster = np.argmin(test_losses)
                    clusters[best_cluster].update_test_loss(
                        test_losses[best_cluster]
                    )                    
                                                                                                                      
        # increase epoch count for clusters
        for cluster in clusters:            
            cluster.epoch_count += 1
                
        print('Epoch:', max([c.epoch_count for c in clusters]), 'Cluster #:', len(clusters) ,
              'Loss:', np.mean([c.loss for c in clusters]),
              'Test Loss:', np.mean([c.test_loss for c in clusters]))
    return clusters, sample_clusters

In [None]:
import datetime


# initialise embedding later which will be shared among all clusters
embedding_layer = get_embeddings_model(dim=data.embeddings_dim, embedding_matrix=data.embedding_matrix)
epochs = 15
m = 10

print ("Training started...")
clusters, sample_clusters =\
    yamane_train(epochs, m, 
                 data,
                 embedding_layer,
                 threshold = 0.2,
                 negative_option = 'random',
                 phi_init_option = 'random_normal',
                 sigmoid_kernel_constraint = ForceToOne())

print (datetime.datetime.now())

Training started...
Generating negative tuples...
Negative tuples...ok
m:  10 lambda:  0.2 max epoch per cluster:  15 Negative sampling:  random Phi Init:  random_normal
Sample clusters size:  4374
Processed  500 samples...
Processed  1000 samples...
Processed  1500 samples...
Processed  2000 samples...
Processed  2500 samples...
Processed  3000 samples...
Processed  3500 samples...
Processed  4000 samples...
Epoch: 1 Cluster #: 19 Loss: 104.56347475024431 Test Loss: 223.83506843689094
Processed  500 samples...
Processed  1000 samples...
Processed  1500 samples...
Processed  2000 samples...
Processed  2500 samples...
Processed  3000 samples...
Processed  3500 samples...
Processed  4000 samples...
Epoch: 2 Cluster #: 25 Loss: 55.59035038062371 Test Loss: 86.60853448438691
Processed  500 samples...


## Yamane Clusters

Analysing the hypernym element of the training pairs consistuting a cluster, a pattern emerges. The training algorithm has realised a semantic split and has clustered terms around particular hypernyms. 
Give a model trained on the following parameters:
* m:  10 lambda:  0.15 max epoch per cluster:  15 Negative sampling:  random Phi Init:  random_normal

, the most populated clusters where in descending order: 
* Counter({4: 743, 0: 674, 7: 311, 1: 279, 5: 236, 13: 166, 11: 149, 22: 139, 23: 138, 6: 135, 24: 118, 21: 117, 18: 116, 19: 108, 17: 106, 3: 105, 20: 97, 14: 94, 16: 90, 15: 89, 9: 80, 10: 78, 12: 72, 8: 68, 2: 66})

For instance, cluster 4, the most populated clusters, features animals (282), mammals (119) and birds (50).  Cluster 0 (second most populated), features chordates (203), vertebrates(201), placentals (108) and invertebrates (56). <br>

It is likely that the model is learning the optimal projections to transform any given hyponym to the hypernym which features frequently in a cluster.  For instance if we multiply the embedding of"barley" with projection matrix learnt from the samples in cluster 0, the resultant vector would be similar to "vertebrate" even though barley is clearly not an animal.  

When we compute the product of the projection matric in every cluster and compare the result with every word in the vocab, the top 15 most similar words are:

[('plant', 9.292432),<br>
 ('food', 5.7546735),<br>
 ('herb', 5.4948673),<br>
 ('vertebrate', 5.256508),<br>
 ('chordate', 4.754751),<br>
 ('cereal', 4.3708754),<br>
 ('placental', 4.155033),<br>
 ('grass', 4.0776873),<br>
 ('factory', 3.7920942),<br>
 ('beverage', 3.3642926),<br>
 ('shrub', 3.3227572),<br>
 ('animal', 3.1966455),<br>
 ('produce', 2.8567104),<br>
 ('ruminant', 2.2009413),<br>
 ('angiosperm', 2.1797898)]<br>

Turns out that 5 (out of 6)  of the hypernyms in the gold test data were correctly generated.  However note that chordate, vertebrate and placental features in the top 10 words.

In [1299]:
hypers_in_cluster = list(map(lambda x: data.train_hyper[x], np.where(sample_clusters == 15)[0]))
freq_hyper_cluster = Counter(hypers_in_cluster)
sorted(freq_hyper_cluster.items(), key=lambda kv: kv[1], reverse=True)

[('action', 2),
 ('state', 2),
 ('space', 2),
 ('education', 2),
 ('light', 2),
 ('gray', 2),
 ('aid', 2),
 ('number', 1),
 ('religion', 1),
 ('brief', 1),
 ('unspecified', 1),
 ('process', 1),
 ('representational', 1),
 ('rear', 1),
 ('low', 1),
 ('flash', 1),
 ('vocation', 1),
 ('payment', 1),
 ('settlement', 1),
 ('finite', 1),
 ('direction', 1),
 ('mental', 1),
 ('country', 1),
 ('affect', 1),
 ('brochure', 1),
 ('pamphlet', 1),
 ('uncontrollable', 1),
 ('flower', 1),
 ('event', 1),
 ('art', 1),
 ('growth', 1),
 ('grow', 1),
 ('scale', 1),
 ('data', 1),
 ('hit', 1),
 ('act', 1),
 ('approach', 1),
 ('machine', 1),
 ('crystal', 1),
 ('labor', 1),
 ('relax', 1),
 ('soil', 1),
 ('man', 1),
 ('direct', 1),
 ('container', 1),
 ('part', 1),
 ('group', 1),
 ('explode', 1),
 ('muscle', 1),
 ('grub', 1),
 ('shade', 1),
 ('possible', 1),
 ('ethnic', 1),
 ('church', 1),
 ('abandoned', 1),
 ('government', 1),
 ('wake', 1),
 ('sleep', 1),
 ('structural', 1),
 ('computer', 1),
 ('school', 1),
 ('

In [926]:
from collections import Counter
# show distribution of samples over the trained clusters
print (Counter(sample_clusters))
print ("-"*30)
print ("Train and test loss per cluster")

for idx, c in enumerate(clusters):
    print (idx, c.epoch_count, c.loss, c.test_loss)
    


Counter({4: 743, 0: 674, 7: 311, 1: 279, 5: 236, 13: 166, 11: 149, 22: 139, 23: 138, 6: 135, 24: 118, 21: 117, 18: 116, 19: 108, 17: 106, 3: 105, 20: 97, 14: 94, 16: 90, 15: 89, 9: 80, 10: 78, 12: 72, 8: 68, 2: 66})
------------------------------
Train and test loss per cluster
0 18 2.077968140053599 26.584612346770555
1 18 0.14291605495270687 2.4012529607851434
2 18 3.011270795017481 15.81435315310955
3 18 1.3532882147701457 22.07756668852926
4 18 3.5027269545994386 26.21724240878588
5 18 0.9470082458719844 20.615928533309784
6 18 0.6829920894379029 9.659515039220878
7 18 3.2296777088486124 13.683177031301511
8 18 4.120160838589072 36.14920896291733
9 18 6.023580802604556 28.57362263975665
10 18 4.613474638201296 55.94250735640526
11 17 1.688921358785592 4.030947924022257
12 17 4.820179348811507 44.305954933166504
13 17 3.86210466385819 27.297351107948998
14 17 2.7912403827067465 18.074579687789083
15 17 6.939839515835047 62.952078223228455
16 17 4.9971641432493925 30.62264429219067
1

In [1316]:
def yamane_get_top_hypernym(query, hyper_candidates, clusters, data, top):    
    query_index = data.tokenizer.word_index[query]
    # remove actual query from candidates    
    valid_candidates = list(filter(lambda x: x[0]!=query_index, hyper_candidates))
    hyper_probs = []
    for idx, hyper in enumerate(valid_candidates):    
        if (idx+1) % 500 == 0:
            print ("Done", idx+1)
        candidate_sim = list(map(lambda x: x.model.predict([[query_index], hyper]).flatten()[0], clusters))
        hyper_probs.append(np.max(candidate_sim))
    
    top_idx = np.argsort(hyper_probs)[::-1][:top]
    top_hyper = np.array(valid_candidates)[top_idx].flatten()
            
    return [(data.tokenizer.index_word[t], hyper_probs[top_idx[i]]) for i, t in enumerate(top_hyper)]


def alt_yamane_get_top(word, hyper_candidates, clusters, data, top, bias_list):
    
    yam_results = []
    for idx, c in enumerate(clusters):
        yam_results.extend(\
            alt_get_hypernym(word, c.model, data, embedding_layer.get_layer(name="WE").get_weights()[0], 10, bias=yamane_bias[idx])
                          )
    return sorted(yam_results, key= lambda x:x[1], reverse=True)[:top]

def predict_yamane_hypernyms(data, clusters):
    hyper_candidates = [[data.tokenizer.word_index[hyper]] for hyper in data.tokenizer.word_index.keys()]
    
    # store biases in list
    yamane_bias = list(map(lambda c: c.model.get_layer('Prediction').get_weights()[1][0], clusters))
    
    ordered_queries = sorted(list(set(data.test_query)))
    
    results = {}
    for idx, word in enumerate(ordered_queries):
        if (idx + 1) % 25 == 0:
            print ("Done", idx + 1)
        
        # find clusters best suited to this word
        word_id = data.tokenizer.word_index[word]
        
        cluster_probs = neigh.predict_proba(data.embedding_matrix[word_id].reshape(1,-1))
        cluster_idx = np.where(cluster_probs > 0.)[1]
        #print (cluster_idx)
        specific_clusters = map(lambda c: clusters[c], cluster_idx)
        specific_bias = map(lambda c: yamane_bias[c], cluster_idx)

        predicted_hypers = alt_yamane_get_top(word, hyper_candidates, specific_clusters, data, 15, specific_bias )
        
        
        #predicted_hypers = yamane_get_top_hypernym(word, hyper_candidates, clusters, data, 15)
        #predicted_hypers = alt_yamane_get_top(word, hyper_candidates, clusters, data, 15, yamane_bias)        
        
        results[word] = [h for h, p in predicted_hypers]
        
    return results

In [1322]:
#yamane_predictions = predict_yamane_hypernyms(data, clusters)
hyper_candidates = [[data.tokenizer.word_index[hyper]] for hyper in data.tokenizer.word_index.keys()]
yamane_bias = list(map(lambda c: c.model.get_layer('Prediction').get_weights()[1][0], clusters))
alt_yamane_get_top('deaf', hyper_candidates, clusters, data, 15, yamane_bias )

[('person', 4.9793377),
 ('animal', 2.47742),
 ('human', 1.8638403),
 ('placental', 1.6627152),
 ('instrument', 1.5909408),
 ('chordate', 1.2495062),
 ('move', 1.1529647),
 ('vertebrate', 1.0523603),
 ('device', 1.0224875),
 ('employee', 1.0139179),
 ('mammal', 0.95096517),
 ('primate', 0.9028685),
 ('plant', 0.75226915),
 ('place', 0.63126254),
 ('change', 0.5526115)]

In [933]:
#yamane_get_top_hypernym('lime', hyper_candidates, clusters, data, 15)

Done 500
Done 1000
Done 1500
Done 2000
Done 2500


[('plant', 0.9999012),
 ('vertebrate', 0.99234927),
 ('food', 0.9917059),
 ('herb', 0.9899386),
 ('invertebrate', 0.9893778),
 ('chordate', 0.980608),
 ('angiosperm', 0.9798537),
 ('factory', 0.9751135),
 ('tree', 0.96472),
 ('produce', 0.961811),
 ('object', 0.9617095),
 ('animal', 0.9588083),
 ('mollusk', 0.9435754),
 ('shrub', 0.93360406),
 ('placental', 0.92714185)]

In [1317]:
import math

yamane_predictions = predict_yamane_hypernyms(data, clusters)

print ("Yamane evaluation:")
score_names, all_scores = get_evaluation_scores(data, yamane_predictions)
for k in range(len(score_names)):
    print (score_names[k]+': '+str(round(sum([score_list[k] for score_list in all_scores]) / len(all_scores), 5)))
print ("")
print ("Ustalov-style evaluation:")
ats = get_ustalov_evaluation_scores(data, yamane_predictions)
ats_string = ', '.join(['A@%d=%.4f' % (j + 1, ats[j]) for j in range(len(ats))])
print (ats_string)


Done 25
Done 50
Done 75
Done 100
Done 125
Done 150
Done 175
Done 200
Done 225
Done 250
Done 275
Done 300
Done 325
Done 350
Done 375
Done 400
Done 425
Done 450
Yamane evaluation:
MRR: 0.49281
P@1: 0.46018
P@5: 0.36722
P@10: 0.34988

Ustalov-style evaluation:
A@1=0.1351, A@2=0.2286, A@3=0.3156, A@4=0.3734, A@5=0.4273, A@6=0.4604, A@7=0.4864, A@8=0.5091, A@9=0.5208, A@10=0.5351


In [1323]:
yamane_predictions['deaf']

['instrument',
 'move',
 'device',
 'change',
 'system',
 'build',
 'step',
 'join',
 'expand',
 'push',
 'receptive',
 'departure',
 'positive',
 'organ',
 'tool']

# Yamane evaluation

* m:  5 lambda:  0.15 max epoch per cluster:  13 Negative sampling:  mix_hyper_synonym Phi Init:  random_identity

Yamane evaluation:<br>
MRR: 0.45324<br>
P@1: 0.4292<br>
P@5: 0.33544<br>
P@10: 0.30875<br>

Ustalov-style evaluation:<br>
A@1=0.1260, A@2=0.2188, A@3=0.3013, A@4=0.3669, A@5=0.4273, A@6=0.4526, A@7=0.4883, A@8=0.5052, A@9=0.5221, A@10=0.5305<br>

* m:  1 lambda:  0.15 max epoch per cluster:  13 Negative sampling:  random Phi Init:  random_normal

Yamane evaluation:<br>
MRR: 0.46375<br>
P@1: 0.4292<br>
P@5: 0.31766<br>
P@10: 0.29952<br>

Ustalov-style evaluation:<br>
A@1=0.1260, A@2=0.2195, A@3=0.2825, A@4=0.3416, A@5=0.3877, A@6=0.4247, A@7=0.4558, A@8=0.4838, A@9=0.5039, A@10=0.5195<br>

* m:  10 lambda:  0.15 max epoch per cluster:  15 Negative sampling:  random Phi Init:  random_normal

Yamane evaluation:<br>
MRR: 0.45739<br>
P@1: 0.41593<br>
P@5: 0.32338<br>
P@10: 0.30494<br>

Ustalov-style evaluation:<br>
A@1=0.1221, A@2=0.2058, A@3=0.2825, A@4=0.3442, A@5=0.3922, A@6=0.4312, A@7=0.4565, A@8=0.4851, A@9=0.5097, A@10=0.5286<br>

Employed novel method that makes better use learnt clusters. Rather than getting closest words according to projection in every cluster (and then filtering to leave the best 15), I trained a KNN classifier which attempts to find the better suited cluster for the term in question.  Reduces obvious wrong words since certain clusters have learnt a projection matrix that would transform several words to the strongest hypernym featuring in the cluster.

Yamane evaluation:<br>
MRR: 0.49281<br>
P@1: 0.46018<br>
P@5: 0.36722<br>
P@10: 0.34988<br>

Ustalov-style evaluation:<br>
A@1=0.1351, A@2=0.2286, A@3=0.3156, A@4=0.3734, A@5=0.4273, A@6=0.4604, A@7=0.4864, A@8=0.5091, A@9=0.5208, A@10=0.5351<br>

In [1155]:
i = data.tokenizer.word_index['baby']
j = data.tokenizer.word_index['child']
list(map(lambda c: c.model.predict([[i], [j]]),  clusters))

#plant = .99990785 (1)
#vertebrate = .99481356 (0)

[array([[4.969965e-05]], dtype=float32),
 array([[0.00042733]], dtype=float32),
 array([[0.01260389]], dtype=float32),
 array([[0.00849416]], dtype=float32),
 array([[0.01360245]], dtype=float32),
 array([[0.0467734]], dtype=float32),
 array([[0.0040016]], dtype=float32),
 array([[0.00175099]], dtype=float32),
 array([[0.02694884]], dtype=float32),
 array([[0.0825713]], dtype=float32),
 array([[0.04480027]], dtype=float32),
 array([[0.00564445]], dtype=float32),
 array([[0.13343635]], dtype=float32),
 array([[0.00391912]], dtype=float32),
 array([[0.02489026]], dtype=float32),
 array([[0.0522318]], dtype=float32),
 array([[0.02140403]], dtype=float32),
 array([[0.0131434]], dtype=float32),
 array([[0.01463544]], dtype=float32),
 array([[0.01401767]], dtype=float32),
 array([[0.92166936]], dtype=float32),
 array([[0.00787069]], dtype=float32),
 array([[0.02718614]], dtype=float32),
 array([[0.00640392]], dtype=float32),
 array([[0.01362688]], dtype=float32)]

# Scratch Pad

In [83]:
# find most similar words to given word
word = data.tokenizer.word_index['adventure']
candidate_words = list(filter(lambda w: w != word, data.tokenizer.index_word.keys()))
sims = list(map(lambda c: np.dot(data.embedding_matrix[c], data.embedding_matrix[word]), candidate_words))

most_sim_idx = np.argsort(sims)[::-1][:20]
#print most_sim_idx

[(data.tokenizer.index_word[candidate_words[idx]], sims[idx]) for idx in most_sim_idx]


[('journey', 0.50752056),
 ('fun', 0.4609611),
 ('trip', 0.45013088),
 ('ride', 0.38148546),
 ('movie', 0.3800118),
 ('magical', 0.3711656),
 ('drama', 0.3643443),
 ('vacation', 0.36418122),
 ('genre', 0.36171854),
 ('hero', 0.34951678),
 ('romantic', 0.34861058),
 ('action', 0.34297746),
 ('comedy', 0.34139755),
 ('hobby', 0.34065622),
 ('tour', 0.33641037),
 ('warrior', 0.33598843),
 ('dream', 0.3273697),
 ('canoe', 0.32696596),
 ('entertainment', 0.32566392),
 ('experience', 0.32389012)]

In [1063]:
# find most similar words to given word
word = data.tokenizer.word_index['vertebrate']
candidate_words = list(filter(lambda w: w != word, data.tokenizer.index_word.keys()))
tuned_embeddings = crim_model.get_layer(name='TermEmbedding').get_weights()[0]

sims = list(map(lambda c: np.dot(tuned_embeddings[c], tuned_embeddings[word]), candidate_words))

most_sim_idx = np.argsort(sims)[::-1][:20]
#print most_sim_idx

[(data.tokenizer.index_word[candidate_words[idx]], sims[idx]) for idx in most_sim_idx]

[('flatworm', 0.5927298),
 ('mammal', 0.588327),
 ('species', 0.5788616),
 ('ichthyosaur', 0.57654685),
 ('arthropod', 0.5764982),
 ('hominid', 0.5761199),
 ('invertebrate', 0.5743749),
 ('archosaur', 0.5668075),
 ('monotreme', 0.5665382),
 ('morphological', 0.5580014),
 ('angiosperm', 0.5572169),
 ('chordate', 0.54813623),
 ('trilobite', 0.5478228),
 ('pterosaur', 0.5452015),
 ('organism', 0.5425958),
 ('herbivore', 0.53324604),
 ('gastropod', 0.52406955),
 ('annelid', 0.51789314),
 ('crinoid', 0.51332396),
 ('planarian', 0.51214486)]

In [1062]:
model.most_similar('vertebrate', topn=20)

  if np.issubdtype(vec.dtype, np.int):


[('vertebrates', 0.7684643268585205),
 ('mammalian', 0.7008777856826782),
 ('mammals', 0.6720753908157349),
 ('chordates', 0.6348909139633179),
 ('placental_mammals', 0.6302692294120789),
 ('terrestrial_vertebrates', 0.624941349029541),
 ('monotremes', 0.6244850754737854),
 ('crocodilians', 0.6234016418457031),
 ('vertebrate_evolution', 0.6229584217071533),
 ('arthropods', 0.6225826144218445),
 ('vertebrate_animals', 0.6223456859588623),
 ('metazoans', 0.6213091015815735),
 ('herbivorous_dinosaurs', 0.621167004108429),
 ('eukaryote', 0.6205124258995056),
 ('metazoan', 0.6191608309745789),
 ('theropod', 0.6179888844490051),
 ('Acanthostega', 0.6108344793319702),
 ('multicellular', 0.6099957823753357),
 ('Platynereis', 0.6092511415481567),
 ('multicellular_organisms', 0.6080698370933533)]

In [659]:
model.similar_by_vector(model.vectors[2043])



  if np.issubdtype(vec.dtype, np.int):


[('dog', 0.9999999403953552),
 ('dogs', 0.8680489659309387),
 ('puppy', 0.8106428980827332),
 ('pit_bull', 0.7803961038589478),
 ('pooch', 0.7627376914024353),
 ('cat', 0.7609456777572632),
 ('golden_retriever', 0.7500901818275452),
 ('German_shepherd', 0.7465174198150635),
 ('Rottweiler', 0.7437615394592285),
 ('beagle', 0.7418621182441711)]

In [665]:
model.vocab['dog'].index

print (model.vectors[2043][:10])

data.tokenizer.word_index['dog']
print (data.embedding_matrix[50][:10])


[ 0.01719806 -0.00749344 -0.05798202  0.05405104 -0.02833585  0.01924545
  0.01965492 -0.02768068 -0.00515942 -0.02129283]
[ 0.01719806 -0.00749344 -0.05798202  0.05405104 -0.02833585  0.01924545
  0.01965492 -0.02768068 -0.00515942 -0.02129283]


In [679]:
measures = [{} for _ in range(10)]

for i, (t,h) in enumerate(zip(data.test_query, data.test_hyper)):
    actual = crim_predictions[t]
    for j in range(0, len(measures)):
        measures[j][(t, h)] = 1. if h in actual[:j + 1] else 0.

ats = compute_ats(data, measures) 
ats_string = ', '.join(['A@%d=%.4f' % (j + 1, ats[j]) for j in range(len(ats))])
print (ats_string)

A@1=0.1318, A@2=0.2390, A@3=0.3260, A@4=0.3968, A@5=0.4487, A@6=0.4857, A@7=0.5149, A@8=0.5448, A@9=0.5623, A@10=0.5766


In [1052]:
#measures[9]

In [805]:
def sigmoid(x, derivative=False):
    sigm = 1. / (1. + np.exp(-x))
    if derivative:
        return sigm * (1. - sigm)
    return sigm

sigmoid(-2.05)

0.11405238127979088

In [871]:
print (data.tokenizer.word_index['cat'])
cat = data.embedding_matrix[27]

for i in (np.argsort(list(map(lambda v: np.dot(v, cat), data.embedding_matrix[1:])))[::-1][:15] + 1):
    print (data.tokenizer.index_word[i])




27
cat
dog
feline
puppy
pup
pet
chihuahua
poodle
rabbit
raccoon
rottweiler
squirrel
hamster
animal
fox


In [1016]:
list(map(lambda c: c.model.get_layer('Prediction').get_weights()[1][0], clusters))

[-2.1402805,
 -1.616902,
 -1.0481057,
 -1.237945,
 -2.1051261,
 -2.102034,
 -1.2530707,
 -1.7784206,
 -1.0288585,
 -1.04917,
 -1.0502187,
 -1.6933552,
 -1.0537158,
 -1.5516595,
 -1.218102,
 -1.1043793,
 -1.1490285,
 -1.2724837,
 -1.3093464,
 -1.300655,
 -1.2495532,
 -1.4313546,
 -1.4133878,
 -1.4823934,
 -1.4082193]

In [1324]:
# create KNN dataset
#set(map(lambda i: data.train_query[i], np.where(sample_clusters == 0)[0])
corn_cluster_idx = set(sample_clusters[np.where(np.array(data.train_query) == 'deaf')[0]])
print (corn_cluster_idx)
specific_clusters = map(lambda c: clusters[c], corn_cluster_idx)
specific_bias = map(lambda c: yamane_bias[c], corn_cluster_idx)

alt_yamane_get_top('deaf', hyper_candidates, specific_clusters, data, 15, specific_bias )




set()


[]

In [1257]:
# prepare knn dataset based on learnt clusters
# we will attempt to allocate 
train_seq = np.array(data.tokenizer.texts_to_sequences(train_query))

X_knn = {}
for idx, c in enumerate(clusters):
    cluster_ids = np.where(sample_clusters == idx)
    # we can reduce duplicate terms to unique terms    
    uniq_terms = np.unique(train_seq[cluster_ids])
    #print (uniq_terms)    
    X_knn[idx] = data.embedding_matrix[uniq_terms]  

X_features = X_knn[0]
y = np.zeros(X_knn[0].shape[0], dtype='int16')

for k in range(1,len(clusters)):
    X_features = np.vstack((X_features, X_knn[k]))
    y = np.hstack((y, np.array([k] * X_knn[k].shape[0])))

In [1297]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=5, weights='distance')
neigh.fit(X_features, y) 


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='distance')

In [1326]:
word_id = data.tokenizer.word_index['mouse']

cluster_probs = neigh.predict_proba(data.embedding_matrix[word_id].reshape(1,-1))
cluster_idx = np.where(cluster_probs > 0.)[1]
print (cluster_idx)
specific_clusters = map(lambda c: clusters[c], cluster_idx)
specific_bias = map(lambda c: yamane_bias[c], cluster_idx)

alt_yamane_get_top('mouse', hyper_candidates, specific_clusters, data, 15, specific_bias )



[ 0  4 11]


[('animal', 7.718166),
 ('vertebrate', 7.3052816),
 ('chordate', 6.3819933),
 ('placental', 5.169263),
 ('mammal', 4.9130435),
 ('device', 3.7047691),
 ('object', 3.5397835),
 ('carnivore', 3.4969392),
 ('feline', 3.0213947),
 ('canine', 2.9958434),
 ('primate', 2.7636704),
 ('reptile', 2.6483884),
 ('rodent', 2.5337505),
 ('bird', 2.5030375),
 ('creature', 2.3609676)]

159