# Projection Learning Models refactored for Python 3.6

In [1]:
from gensim.models.keyedvectors import KeyedVectors
import numpy as np

In [None]:
# Path of the pre-trained word2vec vectors; the file is read in text
# (non-binary) word2vec format below.
w2v = 'GoogleNews-vectors-negative300.txt'
model = KeyedVectors.load_word2vec_format(w2v, binary=False)
# pre-compute L2 norms of vectors
# NOTE(review): per the gensim documentation, replace=True keeps only the
# normalised vectors, so later dot products behave as cosine similarities
# -- confirm against the installed gensim version.
model.init_sims(replace=True)

In [None]:
import codecs
import os
import csv
from collections import defaultdict

def read_subsumptions(filename):
    """Read (hyponym, hypernym) pairs from a tab-separated UTF-8 file.

    Only the first two columns of every row are used; any further columns
    are ignored.

    Args:
        filename: path of the tab-separated file to read.

    Returns:
        A list of ``(row[0], row[1])`` tuples in file order.
    """
    subsumptions = []

    with codecs.open(filename, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)

        for row in reader:
            # csv.reader yields [] for a blank line; skip rows that do not
            # carry both columns instead of failing with IndexError
            if len(row) < 2:
                continue
            subsumptions.append((row[0], row[1]))

    return subsumptions

def read_synonyms(filename):
    """Read a term -> synonyms mapping from a tab-separated UTF-8 file.

    The first column is the term and the second column is a comma-separated
    list of its synonyms. Synonyms of a term that appears on several rows
    are concatenated in file order.

    Args:
        filename: path of the tab-separated file to read.

    Returns:
        A ``defaultdict`` mapping each term to its list of synonyms. Its
        ``default_factory`` is reset to ``None`` before returning, so looking
        up an unknown term raises ``KeyError`` instead of creating an entry.
    """
    synonyms = defaultdict(list)

    with codecs.open(filename, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)

        for row in reader:
            # csv.reader yields [] for a blank line; skip rows that do not
            # carry both columns instead of failing with IndexError
            if len(row) < 2:
                continue
            synonyms[row[0]].extend(row[1].split(','))

    # freeze the mapping: from here on missing keys must not be auto-created
    synonyms.default_factory = None
    return synonyms

In [None]:
# Load the (hyponym, hypernym) pairs of every dataset split.
train_subs = read_subsumptions('subsumptions-train.txt.orig')
test_subs = read_subsumptions('subsumptions-test.txt.orig')
# NOTE(review): valid_subs is loaded here but is not referenced again in the
# part of the notebook visible in this section.
valid_subs = read_subsumptions('subsumptions-validation.txt.orig')

# Mapping from a term to the list of its synonyms.
synonyms = read_synonyms('synonyms.txt')

### Construct pre-trained word embeddings dictionary

In [None]:
# eliminate training tuples for which no embedding exists
from collections import Counter

def get_terms_having_vectors(dataset, embeddings=None):
    """Keep only the (query, hypernym) pairs whose two terms both have a vector.

    Args:
        dataset: iterable of ``(query, hypernym)`` tuples.
        embeddings: container supporting ``term in embeddings``. Defaults to
            the module-level word2vec ``model``.

    Returns:
        A ``(queries, hypernyms)`` tuple of two parallel lists. Both lists
        are empty when no pair survives the filter.
    """
    if embeddings is None:
        embeddings = model

    pairs = [(q, h) for q, h in dataset
             if q in embeddings and h in embeddings]

    # zip(*[]) yields nothing, so unpacking it into two names would raise
    if not pairs:
        return [], []

    query, hyper = zip(*pairs)
    return list(query), list(hyper)
    
# Split each dataset into parallel query / hypernym lists, keeping only the
# pairs for which both terms are present in the embedding model.
train_query, train_hyper = get_terms_having_vectors(train_subs)
test_query, test_hyper = get_terms_having_vectors(test_subs)

# the query and hypernym lists must stay aligned element by element
assert len(train_query) == len(train_hyper)
assert len(test_query) == len(test_hyper)

### Remove OOV from synonym list

In [None]:
# Drop synonym entries whose key has no embedding; for the remaining entries
# drop every individual synonym that has no embedding.
for k, v in list(synonyms.items()):
    if k not in model:
        synonyms.pop(k)
    else:
        # Rebuild the list in place. Calling v.remove() while iterating over
        # v skips the element that follows each removed word, which can leave
        # out-of-vocabulary synonyms behind.
        v[:] = [word for word in v if word in model]

# flatten list of synonyms
syns = [word for v in synonyms.values() for word in v]

# confirm that all words in synonym vocab have embeddings representation
assert all(word in model for word in syns)

### Define class Data which encapsulates all the bits and pieces we require for training algorithms

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Data class that encapsulates all the word-based data needed to train the various algorithms.
# We assume that any word which does not feature in the embeddings has already been filtered out.
class Data:
    """Bundle the datasets, the tokenizer and the embedding matrix used for training.

    Args:
        train_query, train_hyper: parallel lists of training hyponyms/hypernyms.
        test_query, test_hyper: parallel lists of test hyponyms/hypernyms.
        synonyms: mapping from a term to the list of its synonyms.
        embeddings: object supporting ``embeddings[word]`` and returning the
            word's vector (here the word2vec model).

    Raises:
        ValueError: if the fitted tokenizer vocabulary is empty.
    """
    def __init__(self, train_query, train_hyper, test_query, test_hyper, synonyms, embeddings):
        # construct vocab made up from term and hypernyms
        # we will choose negative samples from this vocab after exhausting
        # the synonyms
        self.neg_vocab = set(train_hyper + test_hyper)

        # encapsulate input variables so that all the data can be passed via class instance reference
        self.train_query = train_query
        self.train_hyper = train_hyper
        self.test_query = test_query
        self.test_hyper = test_hyper
        self.synonyms = synonyms

        # flatten the synonym lists of the mapping that was passed in, rather
        # than depending on a module-level global being defined beforehand
        syns = [word for words in synonyms.values() for word in words]

        # calculate size of term and hypernym dataset (train + test)
        n_hyponyms = len(set(train_query + test_query + syns))
        # hypernyms will be introduced in the model as either training,
        # gold positives, test gold positives (when evaluation) or
        # negative synonyms.
        n_hypernyms = len(set(train_hyper + test_hyper))

        # initialise and fit tokenizer
        # (raw string: same characters as before, without relying on the
        # invalid escape sequence "\]")
        # NOTE(review): Tokenizer lower-cases its input by default, so the
        # keys of word_index are lower-case and are looked up in `embeddings`
        # in that form -- confirm that the datasets are lower-case.
        self.tokenizer = Tokenizer(num_words = n_hyponyms + n_hypernyms + 1,
                                   filters=r'!"#$%&()*+,-./:;<=>?@[\]^`{|}~)')
        self.tokenizer.fit_on_texts(train_query + train_hyper + test_query + test_hyper + syns)

        word_index = self.tokenizer.word_index
        if not word_index:
            raise ValueError('Cannot build embedding matrix: vocabulary is empty')

        # determine dimensionality of embeddings from a word that is known to
        # be in the vocabulary (instead of a hard-coded probe word)
        probe_word = next(iter(word_index))
        self.embeddings_dim = embeddings[probe_word].shape[0]

        # construct embedding_matrix; the row at index 0 stays all-zero
        # because word_index values start at 1
        self.embedding_matrix = np.zeros((len(word_index) + 1, self.embeddings_dim), dtype='float32')

        for word, i in word_index.items():
            embedding_vector = embeddings[word]
            if embedding_vector is not None:
                # vectors are copied as-is (no normalisation is applied here)
                self.embedding_matrix[i, :] = embedding_vector
        # confirm shape
        assert self.embedding_matrix.shape == (len(word_index) + 1, self.embeddings_dim)

In [None]:
# Bundle the filtered train/test pairs, the synonym mapping and the word2vec
# model into the single Data instance used by the training and evaluation code.
data = Data(train_query, train_hyper, test_query, test_hyper, synonyms, model)

## Negative Sampling Strategies

In [None]:
# Negative sampling by synonyms:
# first exhaust the synonyms of the query term;
# find the rest by drawing random terms from data.neg_vocab;
# however, make sure that the chosen words are not valid hypernyms.
def get_negative_words(positive_sample, word_hypernyms, data, sample_size=5):
    """Return `sample_size` negative hypernym words for one positive pair.

    Args:
        positive_sample: ``(query, hypernym)`` tuple; only the query is used.
        word_hypernyms: mapping from a query to its valid hypernyms.
        data: object exposing ``synonyms`` (term -> list of synonyms) and
            ``neg_vocab`` (iterable of candidate negative words).
        sample_size: number of negative words to return.

    Returns:
        A list of `sample_size` words. When the query has at least
        `sample_size` synonyms, a random subset of them (without
        replacement); otherwise all its synonyms followed by random words
        from ``data.neg_vocab`` that are neither valid hypernyms nor already
        chosen. The random words are drawn with replacement (the
        ``np.random.choice`` default), so they may repeat.

    Raises:
        KeyError: if fewer than `sample_size` synonyms exist and the query is
            missing from `word_hypernyms`.
        ValueError: raised by ``np.random.choice`` when random words are
            needed but no candidate is left.
    """
    query = positive_sample[0]

    # copy the synonym list so the shared synonym mapping is never mutated;
    # read it from `data` (not from a module-level global)
    neg_samples = list(data.synonyms[query]) if query in data.synonyms else []

    if len(neg_samples) >= sample_size:
        # enough synonyms: pick a random subset, returned as a plain list so
        # both branches have the same return type
        return np.random.choice(neg_samples, sample_size, replace=False).tolist()

    # there are not enough synonyms; compound with random words
    positive_hypernyms = set(word_hypernyms[query])
    already_chosen = set(neg_samples)

    # eliminate correct hypernyms and already chosen words from neg_vocab
    word_choice = [nv for nv in data.neg_vocab
                   if nv not in positive_hypernyms and nv not in already_chosen]
    # choose sample_size - len(neg_samples) additional words
    neg_samples.extend(np.random.choice(word_choice, sample_size - len(neg_samples)).tolist())

    return neg_samples
    

In [None]:
def get_random_negative_words(positive_sample, word_hypernyms, data, sample_size=5):
    """Draw `sample_size` distinct random vocabulary words that are not hypernyms of the query.

    Args:
        positive_sample: ``(query, hypernym)`` tuple; only the query is used.
        word_hypernyms: mapping from a query to its valid hypernyms.
        data: object whose ``tokenizer.word_index`` keys form the vocabulary.
        sample_size: number of words to draw (without replacement).

    Returns:
        The array returned by ``np.random.choice``.
    """
    excluded = word_hypernyms[positive_sample[0]]

    # every vocabulary word that is not a valid hypernym of the query
    eligible = [term for term in data.tokenizer.word_index.keys() if term not in excluded]

    return np.random.choice(eligible, sample_size, replace=False)


In [None]:
# Negative sampling by hypernym similarity: words closest to the gold hypernym
# that are not themselves valid hypernyms of the query.
def get_similar_hypernyms(positive_sample, word_hypernyms, data, sample_size=5):
    """Return up to `sample_size` words most similar to the hypernym of the pair.

    Similarity is the dot product between rows of ``data.embedding_matrix``.
    Only the 30 most similar words are considered, so fewer than
    `sample_size` words are returned when many of them are valid hypernyms
    of the query.

    Args:
        positive_sample: ``(query, hypernym)`` tuple.
        word_hypernyms: mapping from a query to its valid hypernyms.
        data: object exposing ``tokenizer`` and ``embedding_matrix``.
        sample_size: maximum number of words to return.
    """
    tokenizer = data.tokenizer
    vectors = data.embedding_matrix

    hyper_id = tokenizer.word_index[positive_sample[1]]
    other_ids = [idx for idx in tokenizer.index_word.keys() if idx != hyper_id]
    scores = [np.dot(vectors[idx], vectors[hyper_id]) for idx in other_ids]

    # positions of the 30 highest scoring candidates, best first
    ranked = np.argsort(scores)[::-1][:30]
    nearest = [tokenizer.index_word[other_ids[pos]] for pos in ranked]

    # make sure that similar words are not actual hypernyms
    true_hypernyms = word_hypernyms[positive_sample[0]]
    return [candidate for candidate in nearest if candidate not in true_hypernyms][:sample_size]

In [None]:
# Negative sampling by hyponym similarity: words closest to the query term
# that are neither the query itself nor one of its valid hypernyms.
def get_similar_hyponyms(positive_sample, word_hypernyms, data, sample_size=5):
    """Return the `sample_size` vocabulary words most similar to the query term.

    Similarity is the dot product between rows of ``data.embedding_matrix``.

    Args:
        positive_sample: ``(query, hypernym)`` tuple; only the query is used.
        word_hypernyms: mapping from a query to its valid hypernyms. It is
            not modified.
        data: object exposing ``tokenizer.word_index`` and ``embedding_matrix``.
        sample_size: maximum number of words to return.

    Returns:
        A list of at most `sample_size` words, most similar first.
    """
    query = positive_sample[0]

    # Work on a private copy: appending the query to the list stored in
    # word_hypernyms would grow that shared list on every call.
    excluded = set(word_hypernyms[query])
    excluded.add(query)

    word_index = data.tokenizer.word_index

    # get candidate words - all vocab except hypernyms of current word and current word
    candidate_words = [w for w in word_index.keys() if w not in excluded]
    if not candidate_words:
        return []

    # one matrix-vector product instead of a Python-level dot product per word
    candidate_ids = [word_index[w] for w in candidate_words]
    hypo_sims = np.dot(data.embedding_matrix[candidate_ids],
                       data.embedding_matrix[word_index[query]])

    most_sim_idx = np.argsort(hypo_sims)[::-1][:sample_size]
    return [candidate_words[i] for i in most_sim_idx]

In [None]:
# Mixed negative sampling: one word similar to the gold hypernym, the rest
# supplied by get_negative_words (synonyms, then random words).
def mix_sim_hyper_random(positive_sample, word_hypernyms, data, sample_size=5):
    """Return negative words: the word most similar to the hypernym, then synonym/random words.

    Args:
        positive_sample: ``(query, hypernym)`` tuple.
        word_hypernyms: mapping from a query to its valid hypernyms. It is
            not modified.
        data: object exposing ``tokenizer.word_index``, ``embedding_matrix``
            and whatever ``get_negative_words`` reads.
        sample_size: number of negative words requested.

    Returns:
        A list whose first element (when a candidate exists) is the
        vocabulary word with the highest dot-product similarity to the
        hypernym that is neither the query nor one of its valid hypernyms;
        the remaining elements come from ``get_negative_words``.
    """
    query, hypernym = positive_sample[0], positive_sample[1]

    # Work on a private copy: appending the query to the list stored in
    # word_hypernyms would grow that shared list on every call.
    excluded = set(word_hypernyms[query])
    excluded.add(query)

    word_index = data.tokenizer.word_index

    # get candidate words - all vocab except hypernyms of current word and current word
    candidate_words = [w for w in word_index.keys() if w not in excluded]

    neg_samples = []
    if candidate_words:
        # find similarity of all candidate words w.r.t. current hypernym
        candidate_ids = [word_index[w] for w in candidate_words]
        hyper_sims = np.dot(data.embedding_matrix[candidate_ids],
                            data.embedding_matrix[word_index[hypernym]])
        # append most similar non-hypernym word to negative samples
        neg_samples.append(candidate_words[int(np.argmax(hyper_sims))])

    if len(neg_samples) < sample_size:
        # Pass a one-entry overlay so that the query itself stays excluded
        # from the random draw without touching the shared mapping.
        neg_samples.extend(get_negative_words(positive_sample, {query: excluded}, data,
                                              sample_size=sample_size - len(neg_samples)))

    return neg_samples


In [None]:
# Create list of tuples where every element follows ((query, hypernym), negative_word)
def get_negative_tuples(terms, data, negative_words_lambda, sample_size):
    """Generate negative words for every positive (query, hypernym) pair.

    Args:
        terms: ``(all_query_terms, all_hyper_terms)`` - two parallel sequences.
        data: passed through unchanged to `negative_words_lambda`.
        negative_words_lambda: callable
            ``(positive_pair, word_hypernyms, data, sample_size) -> iterable``
            implementing the negative sampling strategy.
        sample_size: number of negatives requested per positive pair.

    Returns:
        A list of ``((query, hypernym), negative_word)`` tuples, grouped by
        positive pair in input order.
    """
    input_query, input_hyper = terms

    # group the hypernyms of every query in a single pass over the pairs
    # (hypernyms keep their input order)
    grouped = {}
    for q, h in zip(input_query, input_hyper):
        grouped.setdefault(q, []).append(h)
    # keys are inserted in sorted order
    word_hypernyms = {w: grouped[w] for w in sorted(grouped)}

    negative_tuples = []
    for words in zip(input_query, input_hyper):
        negatives = negative_words_lambda(words, word_hypernyms, data, sample_size)
        negative_tuples.extend((words, n) for n in negatives)
    return negative_tuples

In [None]:
# function that returns negative samples alongside set of positive samples
# we need to pass:
# the batch hyponym terms, batch of hypernym terms, negative_tuples, tokenizer
# to create sequences
def extend_batch_with_negatives(batch_X_term, batch_X_hyper, negative_tuples,
                                tokenizer):
    """Append the negative samples of a batch's positive pairs to that batch.

    Args:
        batch_X_term: array of hyponym token ids, one row per positive pair.
        batch_X_hyper: array of hypernym token ids, aligned with `batch_X_term`.
        negative_tuples: list of ``((query, hypernym), negative_word)`` tuples.
        tokenizer: object exposing ``index_word`` (id -> word) and
            ``texts_to_sequences``.

    Returns:
        ``(batch_X_term, batch_X_hyper, batch_y_label)`` where the positive
        rows come first (label 1) followed by the negative rows (label 0).
        When no negative tuple matches the batch, the inputs are returned
        unchanged together with an all-ones label vector.
    """
    # words of the positive pairs in this batch; a set keeps the membership
    # test below O(1) per negative tuple
    positive_words = {(tokenizer.index_word[term_id], tokenizer.index_word[hyper_id])
                      for term_id, hyper_id in zip(batch_X_term.flatten(), batch_X_hyper.flatten())}

    # negatives that belong to one of the positive pairs of this batch
    matched = [(qh[0], h) for qh, h in negative_tuples if qh in positive_words]

    positive_labels = np.ones(batch_X_term.shape[0])
    if not matched:
        # nothing to append; unpacking zip(*[]) or stacking an empty array
        # would raise
        return batch_X_term, batch_X_hyper, positive_labels

    # tokenize -ve samples
    neg_terms, neg_hyper = zip(*matched)
    neg_terms_seq = tokenizer.texts_to_sequences(neg_terms)
    neg_hyper_seq = tokenizer.texts_to_sequences(neg_hyper)

    # the first n labels are true (1s), and the rest are the -ve samples (0)
    batch_y_label = np.concatenate((positive_labels, np.zeros(len(neg_terms_seq))))

    # finally, stack -ve sequences at the bottom of +ves to
    # create our final training batch
    batch_X_term = np.vstack((batch_X_term, np.array(neg_terms_seq)))
    batch_X_hyper = np.vstack((batch_X_hyper, np.array(neg_hyper_seq)))

    return batch_X_term, batch_X_hyper, batch_y_label

# Evaluation Code

In [None]:
def convert_hypernyms_to_one_line(data):
    """Group the test hypernyms by query term.

    Args:
        data: object exposing the parallel lists ``test_query`` and
            ``test_hyper``.

    Returns:
        A dict mapping every distinct test query to the list of its gold
        hypernyms (in dataset order); keys are inserted in sorted order.
    """
    # single pass over the pairs instead of rescanning them for every query
    grouped = {}
    for q, h in zip(data.test_query, data.test_hyper):
        grouped.setdefault(q, []).append(h)
    return {w: grouped[w] for w in sorted(grouped)}

In [None]:
# taken from task_scorer.py provided with shared task resources
def mean_reciprocal_rank(r):
    """Return the reciprocal of the rank of the first relevant item.

    The first element is 'rank 1'. Relevance is binary (nonzero is relevant).
    Example from http://en.wikipedia.org/wiki/Mean_reciprocal_rank

    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)

    Returns:
        1 / rank of the first nonzero entry, or 0 when no entry is nonzero.
    """
    hits = np.nonzero(np.asarray(r))[0]
    if not hits.size:
        return 0.
    return 1. / (hits[0] + 1)

def precision_at_k(r, k, n):
    """Return precision @ k, normalised by the number of gold items.

    Relevance is binary (nonzero is relevant). The number of relevant items
    among the first k is divided by min(k, n), so the gold elements are
    taken into account (modified from the first version of the scorer).

    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: cut-off rank; must be >= 1
        n: number of gold items for the query

    Returns:
        Precision @ k

    Raises:
        ValueError: len(r) must be >= k
    """
    assert k >= 1
    top_k = np.asarray(r)[:k]
    relevant = top_k != 0
    if relevant.size != k:
        raise ValueError('Relevance score length < k')
    denominator = min(k, n)
    return (np.mean(relevant) * k) / denominator

# used by Ustalov from https://github.com/nlpub/hyperstar/blob/master/evaluate.py
def compute_ats(data, measures):
    """Average every measure dictionary over the number of test pairs.

    Args:
        data: object exposing ``test_query``.
        measures: list of dicts whose values are summed.

    Returns:
        One value per dict: the sum of its values divided by
        ``len(data.test_query)``.
    """
    return [sum(per_pair.values()) / len(data.test_query) for per_pair in measures]


In [None]:
# predictions is a dictionary whereby key is query term and value is a list of ranked hypernym predictions
def get_evaluation_scores(data, predictions):
    """Compute MRR and P@1/P@5/P@10 for every test query.

    For each query a 15-slot binary relevance vector is built: slot j is 1
    when the j-th prediction is a gold hypernym. Only the first
    ``len(gold hypernyms)`` predictions are inspected.

    Args:
        data: object passed to ``convert_hypernyms_to_one_line``.
        predictions: dict mapping a query to its ranked list of predictions.

    Returns:
        ``(scores_names, all_scores)`` where `all_scores` holds one
        ``[MRR, P@1, P@5, P@10]`` list per query.
    """
    scores_names = ['MRR', 'P@1', 'P@5', 'P@10']
    all_scores = []

    gold_by_query = convert_hypernyms_to_one_line(data)
    for query, gold_hyps in gold_by_query.items():
        pred_hyps = predictions[query]
        gold_hyps_n = len(gold_hyps)

        # binary relevance of the ranked predictions
        r = [0] * 15
        for j, pred_hyp in enumerate(pred_hyps):
            if j >= gold_hyps_n:
                break
            if pred_hyp in gold_hyps:
                r[j] = 1

        p_at_1 = precision_at_k(r, 1, gold_hyps_n)
        p_at_5 = precision_at_k(r, 5, gold_hyps_n)
        p_at_10 = precision_at_k(r, 10, gold_hyps_n)
        mrr = mean_reciprocal_rank(r)

        all_scores.append([mrr, p_at_1, p_at_5, p_at_10])
    return scores_names, all_scores

def get_ustalov_evaluation_scores(data, predictions):
    """Compute the A@1 .. A@10 scores over all test (query, hypernym) pairs.

    For every test pair and every cut-off j in 1..10, the pair scores 1 when
    the gold hypernym is among the first j predictions of the query, else 0.
    The per-cut-off results are aggregated by ``compute_ats``.

    Args:
        data: object exposing the parallel lists ``test_query`` and ``test_hyper``.
        predictions: dict mapping a query to its ranked list of predictions.

    Returns:
        The list returned by ``compute_ats`` (one value per cut-off).
    """
    measures = [{} for _ in range(10)]

    for t, h in zip(data.test_query, data.test_hyper):
        ranked = predictions[t]
        for j, at_j in enumerate(measures):
            at_j[(t, h)] = 1. if h in ranked[:j + 1] else 0.

    return compute_ats(data, measures)

# Keras Projection Learning Models

In [None]:
from tensorflow.keras import backend as K
from tensorflow.keras.constraints import Constraint

class ForceToOne (Constraint):
    """Weight constraint that resets every weight to exactly 1."""
    def __call__(self, w):
        # ones_like gives the value that `w / w` was meant to produce, but
        # without turning zero weights into NaN (0 / 0) and without applying
        # an in-place operator to the tensor that was passed in
        return K.ones_like(w)

In [None]:
from tensorflow.keras.layers import Input, Dense, Embedding, Dot, Flatten, Concatenate, concatenate, Reshape, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import RandomNormal, Zeros, Ones
from tensorflow.keras.regularizers import l2, l1, l1_l2
from tensorflow.keras.constraints import UnitNorm

from tensorflow.keras import backend as K
import tensorflow as tf

# Phi layer initialiser
def random_identity(shape, dtype="float32", partition_info=None):
    """Kernel initialiser: a square diagonal matrix with random normal entries.

    Only ``shape[-1]`` is used; `dtype` and `partition_info` are ignored.

    NOTE(review): the identity matrix is *multiplied* element-wise by the
    noise, which yields a diagonal matrix of small values (mean 0,
    stddev 0.05) rather than "identity plus noise" -- confirm this is the
    intended initialisation.
    """
    size = shape[-1]
    noise = K.random_normal((size, size), mean=0., stddev=0.05)
    return tf.eye(size) * noise

def random_normal(shape, dtype="float32", partition_info=None):
    """Kernel initialiser: a square matrix of normal noise (mean 0, stddev 0.05).

    Only ``shape[-1]`` is used; `dtype` and `partition_info` are ignored.
    """
    size = shape[-1]
    return K.random_normal((size, size), mean=0., stddev=0.05)

def get_CRIM_model(phi_k=1, train_embeddings=False,\
                   embeddings_dim=300, vocab_size=1000,\
                   embeddings_matrix=None,
                   phi_init = None,
                   phi_activity_regularisation = None,
                   sigmoid_kernel_regularisation = None,
                   sigmoid_bias_regularisation = None,
                   sigmoid_kernel_constraint = None,
                   do_dropout = False
                  ):
    """Build and compile the projection learning ("CRIM") Keras model.

    The model takes a hyponym id and a hypernym id, embeds both with one
    shared Embedding layer, applies `phi_k` bias-free linear projections to
    the hyponym embedding, takes the dot product of each projection with the
    hypernym embedding and feeds the result to a single sigmoid unit.

    Args:
        phi_k: number of projection (Dense) layers, named 'Phi0', 'Phi1', ...
        train_embeddings: whether the embedding layer is trainable.
        embeddings_dim: embedding size; also the output size of every projection.
        vocab_size: the embedding layer is created with vocab_size + 1 rows.
        embeddings_matrix: pre-trained weights loaded into the embedding
            layer. The default of None is passed straight to set_weights, so
            a matrix has to be supplied in practice.
        phi_init: kernel initialiser of the projection layers.
        phi_activity_regularisation: activity regulariser of the projection layers.
        sigmoid_kernel_regularisation: kernel regulariser of the output unit.
        sigmoid_bias_regularisation: bias regulariser of the output unit.
        sigmoid_kernel_constraint: kernel constraint of the output unit.
        do_dropout: apply Dropout(0.25) to both embeddings when True.

    Returns:
        The model, compiled with the 'rmsprop' optimizer,
        'binary_crossentropy' loss and the 'accuracy' metric.
    """
    
    hypo_input  = Input(shape=(1,), name='Hyponym')
    hyper_input = Input(shape=(1,), name='Hypernym')
    
    # a single embedding layer is shared by the hyponym and hypernym inputs
    embedding_layer = Embedding(vocab_size + 1, embeddings_dim, embeddings_constraint = UnitNorm(axis=1), 
                                name='TermEmbedding')
    
    
    hypo_embedding = embedding_layer(hypo_input)    
    hyper_embedding = embedding_layer(hyper_input)
    
    # Add Dropout to avoid overfit
    if do_dropout:
        hypo_embedding = Dropout(0.25)(hypo_embedding)
        hyper_embedding = Dropout(0.25)(hyper_embedding)
    
    # every projection is applied to the hyponym embedding only
    phi_layer = []
    for i in range(phi_k):
        phi_layer.append(Dense(embeddings_dim, activation=None, use_bias=False, 
                               activity_regularizer=phi_activity_regularisation,
                               kernel_initializer=phi_init,                               
                               name='Phi%d' % (i))(hypo_embedding))

    #phi1 = Dense(embeddings_dim, activation=None, use_bias=False, 
                #kernel_initializer=random_identity, name='Phi1')(hypo_embedding)

    if phi_k == 1:
        # flatten tensors
        phi = Flatten()(phi_layer[0])
        hyper_embedding = Flatten()(hyper_embedding)    
    else:
        # stack the projections along axis 1; hyper_embedding is left
        # unflattened in this branch
        phi = concatenate(phi_layer, axis=1)

    
    # this is referred to as "s" in the "CRIM" paper    
    phi_hyper = Dot(axes=-1, normalize=False, name='DotProduct')([phi, hyper_embedding])
    
    if phi_k > 1:
        phi_hyper = Flatten()(phi_hyper)
    
    # single sigmoid unit on top of the dot product(s); kernel starts at
    # ones and bias at zeros
    predictions = Dense(1, activation="sigmoid", name='Prediction',
                        use_bias=True,
                        kernel_initializer=Ones,
                        kernel_constraint= sigmoid_kernel_constraint,
                        bias_initializer=Zeros,                        
                        kernel_regularizer=sigmoid_kernel_regularisation,
                        bias_regularizer=sigmoid_bias_regularisation
                       )(phi_hyper)

    # instantiate model
    # NOTE: this local name shadows the module-level word2vec `model` inside this function only
    model = Model(inputs=[hypo_input, hyper_input], outputs=predictions)
        
    # inject pre-trained embedding weights into Embedding layer
    model.get_layer(name='TermEmbedding').set_weights([embeddings_matrix])
    model.get_layer(name='TermEmbedding').trainable = train_embeddings    

    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model


### The training algorithm incorporates mini-batch stochastic gradient descent and negative sampling

In [None]:
def train(model,       # the model which parameters will be learnt
          epochs,      # number of epochs to run          
          batch_size,  # size of mini-batch
          m,           # number of negative samples
          data,        # data required for training                              
          neg_strategy
         ):
    """Train `model` with mini-batches of positive pairs extended by negative samples.

    Negative tuples are generated once, up front, for the training AND the
    test pairs using `neg_strategy` (a callable with the signature expected
    by get_negative_tuples). In every epoch the training pairs are shuffled
    and processed in slices of `batch_size` positives; each slice is extended
    with its negatives before calling train_on_batch. After preparing every
    training batch, a random batch of test pairs of the same size (also
    extended with negatives) is evaluated with test_on_batch.

    The losses printed at the end of each epoch are sums over the batches of
    that epoch, not averages. Nothing is returned.
    """

    # create negative tuples
    #negative_tuples = get_negative_tuples(data.train_query + data.test_query,
     #                                     data.train_hyper + data.test_hyper, data.neg_vocab, m)
    
    print ("Generating negative tuples...")
    negative_tuples = get_negative_tuples((data.train_query + data.test_query, data.train_hyper + data.test_hyper), 
                                           data, neg_strategy, m)
    print ("Negative tuples...ok")
    
    # create sequences
    term_train_seq = data.tokenizer.texts_to_sequences(data.train_query)
    hyper_train_seq = data.tokenizer.texts_to_sequences(data.train_hyper)

    term_test_seq = data.tokenizer.texts_to_sequences(data.test_query)
    hyper_test_seq = data.tokenizer.texts_to_sequences(data.test_hyper)
                
    # index arrays that are shuffled in place to reorder the samples
    samples = np.arange(len(term_train_seq))
    validation_samples = np.arange(len(term_test_seq))
    
    # train algorithm
    for epoch in range(epochs):
        # reset loss
        loss = 0.
        test_loss = 0.
                        
        np.random.shuffle(samples)        

        shuffled_X_term, shuffled_X_hyper =\
            np.array(term_train_seq, dtype='int32')[samples],\
            np.array(hyper_train_seq, dtype='int32')[samples]

        for b in range(0, len(samples), batch_size):
            # produce mini-batch, consisting of up to batch_size +ve samples
            batch_X_term = shuffled_X_term[b:b + batch_size] 
            batch_X_hyper = shuffled_X_hyper[b:b + batch_size]

            # complement +ve samples with negatives
            batch_X_term, batch_X_hyper, batch_y_label =\
            extend_batch_with_negatives(batch_X_term, batch_X_hyper,
                                        negative_tuples,
                                        data.tokenizer
                                       )            
            
            # shuffle validation set indices
            np.random.shuffle(validation_samples)
            # pick batch of shuffled test instances with size equal to training batch
            batch_X_test_term, batch_X_test_hyper =\
                np.array(term_test_seq, dtype='int32')[validation_samples[:batch_size]],\
                np.array(hyper_test_seq, dtype='int32')[validation_samples[:batch_size]]
            
            # distort test batch with some negatives to check how algorithm fares with
            # negatives
            batch_X_test_term, batch_X_test_hyper, batch_y_test_label =\
            extend_batch_with_negatives(batch_X_test_term, batch_X_test_hyper,
                                        negative_tuples,
                                        data.tokenizer
                                       )            

            # train on batch
            # NOTE(review): [0] presumably selects the loss from a
            # [loss, metric] result -- confirm against how `model` was compiled
            loss += model.train_on_batch([batch_X_term, batch_X_hyper], 
                                          batch_y_label)[0]
            
            test_loss += model.test_on_batch([batch_X_test_term, batch_X_test_hyper], 
                                              batch_y_test_label)[0]                
            
        print('Epoch:', epoch+1, 'Loss:', loss, 'Test Loss:', test_loss)    


In [None]:
from tensorflow.keras.initializers import RandomNormal, Zeros, Ones
from tensorflow.keras.regularizers import l2, l1, l1_l2

#rand_norm_m0_sd001 = RandomNormal(mean = 0.0, stddev=0.01, seed=42)
#rand_norm = RandomNormal(mean = 0.0, stddev=1., seed=42)

# negative sampling options: name -> strategy callable passed to train()
# NOTE: get_similar_hypernyms is defined above but is not registered here
neg_sampling_options = {'synonym':get_negative_words, 
                        'mix_hyper_synonym': mix_sim_hyper_random,
                        'similar_hyponym': get_similar_hyponyms,
                        'random': get_random_negative_words
                       }

# phi random init options: name -> kernel initialiser of the projection layer(s)
phi_init_options = {'random_identity': random_identity, 'random_normal': random_normal}

# implement mini-batch stochastic training
epochs = 15

# number of positive pairs per mini-batch (negatives are added on top)
batch_size = 32

# number of negative samples
m = 10
# number of projection matrices
phi_k = 1
# keep the pre-trained embeddings frozen
train_embeddings = False
# keys into neg_sampling_options / phi_init_options
negative_option = 'mix_hyper_synonym'
phi_init_option = 'random_identity'
do_dropout = False
# seed NumPy's global random generator (used for shuffling and negative sampling)
np.random.seed(42)

# create model
crim_model = get_CRIM_model(phi_k = phi_k, train_embeddings = train_embeddings,
                            embeddings_dim = data.embeddings_dim, vocab_size = len(data.tokenizer.word_counts),
                            embeddings_matrix = data.embedding_matrix,
                            phi_init = phi_init_options[phi_init_option],                            
                            sigmoid_kernel_regularisation = l2(0.001),
                            sigmoid_bias_regularisation = l2(0.001),
                            sigmoid_kernel_constraint = None,#ForceToOne(),
                            do_dropout = do_dropout
                           )

# log the run configuration before training starts
print ("Training started...")
print ('Epochs: ', epochs, 'Batch size: ', batch_size, 'm: ', m, 'pki_k: ', phi_k, 'train_embeddings: ', train_embeddings,
      'Negative sampling: ', negative_option, 'Phi Init: ', phi_init_option, 'Dropout: ', do_dropout)

train(crim_model, epochs, batch_size, m, data, neg_sampling_options[negative_option])

## Evaluation  code

Main observations:<br>
1. Tendency is for the model to overfit if we make the model larger than 1 projection matrix;
1. Negative samples are important for the model to learn which words are not hypernyms;
1. Although the model does seem to learn which words are hypernyms of the query terms, this does not stop it from predicting, with high confidence, that similar but completely unrelated words are also hypernyms;
    1. This is really apparent for animals where the model is not able to distinguish between vertebrate and invertebrate; mammal; animal; and so forth;
    1. It's possible that we did not have enough examples to distinguish the various types of animals from each other;
    1. Also, more targeted negative samples could have helped but these would have to be hand-created;


In [None]:
# test whether two words are related by hypernymy
# look up the token ids of the candidate hyponym ('pool') and hypernym ('group')
i = data.tokenizer.word_index['pool']
j = data.tokenizer.word_index['group']
# the model's sigmoid output for the (hyponym id, hypernym id) pair
crim_model.predict([[i], [j]])

### Find candidate hypernyms

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# alternative hypernym generator by applying Phi weights to hyponym and see which
# words are closest to this vector
def alt_get_hypernym(word, model, data, embeddings, top, bias = 0):
    """Rank all vocabulary words by similarity to the projected vector of `word`.

    The embedding of `word` is multiplied by the kernel of the projection
    layer ('Phi0', falling back to 'Phi' when get_layer raises ValueError).
    Every embedding row except row 0 is then scored by its dot product with
    that projection, plus `bias`. The query word itself is not excluded from
    the candidates.

    Args:
        word: query (hyponym) word; must be in ``data.tokenizer.word_index``.
        model: Keras model holding the projection layer.
        data: object exposing ``tokenizer.word_index`` and ``tokenizer.index_word``.
        embeddings: 2-D array of embeddings indexed by token id.
        top: number of results to return.
        bias: constant added to every score.

    Returns:
        A list of at most `top` ``(word, score)`` tuples, highest score first.
    """
    q_idx = data.tokenizer.word_index[word]
    q = embeddings[q_idx]

    try:
        _phi = model.get_layer(name='Phi0').get_weights()[0]
    except ValueError:
        _phi = model.get_layer(name='Phi').get_weights()[0]

    _proj = np.dot(q, _phi)

    # score the whole vocabulary with one matrix-vector product; row 0 is
    # skipped, hence the +1 when mapping positions back to token ids
    sim = np.dot(embeddings[1:], _proj) + bias
    best = np.argsort(sim)[::-1][:top]

    return [(data.tokenizer.index_word[i + 1], sim[i]) for i in best]

# compare product of hyponym word embedding and Phi to all vectors in the
# word2vec model. Returns gibberish
def ustalov_get_hypernyms(word, _model, data, embeddings, top):
    """Return the word2vec neighbours of the projected embedding of `word`.

    The embedding of `word` is multiplied by the kernel of the 'Phi0' layer
    of `_model`, L2-normalised, and looked up in the module-level word2vec
    `model` with ``similar_by_vector``.

    Args:
        word: query (hyponym) word; must be in ``data.tokenizer.word_index``.
        _model: Keras model holding the 'Phi0' layer.
        data: object exposing ``tokenizer.word_index``.
        embeddings: 2-D array of embeddings indexed by token id.
        top: number of neighbours to return.

    Returns:
        Whatever ``model.similar_by_vector`` returns for the `top` nearest words.
    """
    q_idx = data.tokenizer.word_index[word]
    q = embeddings[q_idx]

    _phi = _model.get_layer(name='Phi0').get_weights()[0]

    Y_hat = np.dot(q, _phi)
    Y_hat /= np.linalg.norm(Y_hat)

    # honour `top`; it was previously ignored, so the library default number
    # of neighbours was always returned
    return model.similar_by_vector(Y_hat, topn=top)

In [None]:
# function which generates top 15 predictions for each hyponym query term
# and returns results as dictionary
def predict_crim_hypernyms(data, model):
    """Predict the 15 best hypernym candidates for every distinct test query.

    Args:
        data: object exposing ``test_query`` and whatever ``alt_get_hypernym`` reads.
        model: trained Keras model with a 'TermEmbedding' layer and a
            projection layer.

    Returns:
        A dict mapping each test query to its ranked list of predicted words.
    """
    ordered_queries = sorted(set(data.test_query))
    results = {}

    # read the embeddings from the model that was passed in, not from the
    # module-level `crim_model`
    embeddings = model.get_layer(name="TermEmbedding").get_weights()[0]
    for idx, word in enumerate(ordered_queries):
        # progress report every 25 queries
        if (idx + 1) % 25 == 0:
            print ("Done", idx + 1)
        predicted_hypers = alt_get_hypernym(word, model, data, embeddings, 15)
        results[word] = [h for h, p in predicted_hypers]

    return results

In [None]:
import math

# ranked hypernym predictions for every distinct test query
crim_predictions = predict_crim_hypernyms(data, crim_model)

print ("CRIM evaluation:")
score_names, all_scores = get_evaluation_scores(data, crim_predictions)
# all_scores holds one score list per query; report the mean of each
# score over all queries, rounded to 5 decimals
for k in range(len(score_names)):
    print (score_names[k]+': '+str(round(sum([score_list[k] for score_list in all_scores]) / len(all_scores), 5)))
print ("")
print ("Ustalov-style evaluation:")
# ats[j] is the A@(j+1) score
ats = get_ustalov_evaluation_scores(data, crim_predictions)
ats_string = ', '.join(['A@%d=%.4f' % (j + 1, ats[j]) for j in range(len(ats))])
print (ats_string)

----------------------------------------------------------------------------------------------------

# Yamane et al. Model

In [2]:
## attempt custom constraint to keep weight fixed at 1.
from tensorflow.keras.constraints import Constraint
from tensorflow.keras import backend as K

class ForceToOne (Constraint):
    """Weight constraint that resets every weight to exactly 1."""
    def __call__(self, w):
        # ones_like gives the value that `w / w` was meant to produce, but
        # without turning zero weights into NaN (0 / 0) and without applying
        # an in-place operator to the tensor that was passed in
        return K.ones_like(w)

In [1]:
from tensorflow.keras.layers import Input, Dense, Embedding, Dot, Flatten, Concatenate, concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.initializers import RandomNormal, Zeros, Ones
from tensorflow.keras import backend as K

import tensorflow as tf

def get_new_cluster_model(embedding_layer, 
                          phi_dim = 300,
                          phi_init = None,
                          phi_activity_regularisation = None,                          
                          sigmoid_bias_regularisation = None,
                          sigmoid_kernel_constraint = None,
                          do_dropout = False):
    """Build and compile one single-projection cluster model.

    Args:
        embedding_layer: callable applied to [hypo_input, hyper_input] that
            returns the hyponym and hypernym embeddings (two tensors).
        phi_dim: output size of the projection layer 'Phi0'.
        phi_init: kernel initialiser of the projection layer.
        phi_activity_regularisation: accepted but not used by this function.
        sigmoid_bias_regularisation: bias regulariser of the output unit.
        sigmoid_kernel_constraint: accepted but not used; the kernel
            constraint of the output unit is fixed to "UnitNorm".
        do_dropout: apply Dropout(0.25) to both embeddings when True.

    Returns:
        The model, compiled with the 'rmsprop' optimizer,
        'binary_crossentropy' loss and the 'accuracy' metric.
    """
    
    hypo_input = Input(shape=(1,), name='Hyponym')    
    hyper_input = Input(shape=(1,), name='Hypernym')
    
    hypo_embedding, hyper_embedding = embedding_layer([hypo_input, hyper_input])
    
    if do_dropout:
        hypo_embedding = Dropout(0.25)(hypo_embedding)
        hyper_embedding = Dropout(0.25)(hyper_embedding)
                
    # bias-free linear projection applied to the hyponym embedding only
    phi = Dense(phi_dim, activation=None, use_bias=False,                 
                kernel_initializer=phi_init,                
                name='Phi0')(hypo_embedding)
    
    # flatten phi and hyper_embedding tensors
    phi = Flatten()(phi)
    hyper_embedding = Flatten()(hyper_embedding)
    
    phi_hyper = Dot(axes=-1, normalize=False, name='DotProduct')([phi, hyper_embedding])    
    
    # single sigmoid unit on top of the dot product
    predictions = Dense(1, activation = "sigmoid", 
                        bias_initializer = Zeros,
                        kernel_initializer = Ones,
                        #kernel_constraint = sigmoid_kernel_constraint,                        
                        kernel_constraint = "UnitNorm",                        
                        bias_regularizer=sigmoid_bias_regularisation, 
                        name='Prediction')(phi_hyper)
    # instantiate model
    model = Model(inputs=[hypo_input, hyper_input], outputs=predictions)
    
    # compile using binary_crossentropy loss
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [2]:
# We don't need a unique embedding layer for every sub-model.  

# Instead, we can create a separate model for the embeddings and set the weights 
# according to the pre-trained embeddings

def get_embeddings_model(dim, embedding_matrix):
    """Build a reusable model that embeds a hyponym id and a hypernym id.

    Both inputs go through one shared, frozen Embedding layer named 'WE'
    whose weights are set to `embedding_matrix`.

    Args:
        dim: embedding dimensionality.
        embedding_matrix: pre-trained embedding weights; its first dimension
            is used as the number of embedding rows.

    Returns:
        A Keras model with two id inputs and two outputs (the hyponym and
        hypernym embeddings, in that order).
    """
    id_inputs = [Input(shape=(1,)), Input(shape=(1,))]

    shared_embedding = Embedding(embedding_matrix.shape[0], dim, name='WE')
    embedded = [shared_embedding(ids) for ids in id_inputs]

    embedding_model = Model(inputs=id_inputs, outputs=embedded)

    # inject pre-trained embeddings into this mini, reusable model/layer
    # and freeze them
    we_layer = embedding_model.get_layer(name='WE')
    we_layer.set_weights([embedding_matrix])
    we_layer.trainable = False
    return embedding_model

In [3]:
class YamaneCluster:
    """A single cluster: its own model plus epoch and loss bookkeeping."""

    def __init__(self, embedding_layer, phi_dim, phi_init, sigmoid_kernel_constraint):
        # every cluster gets a freshly built model; all arguments are forwarded unchanged
        self.model = get_new_cluster_model(
            embedding_layer=embedding_layer,
            phi_dim=phi_dim,
            phi_init=phi_init,
            sigmoid_kernel_constraint=sigmoid_kernel_constraint,
        )
        # number of training passes this cluster has been counted for
        self.epoch_count = 0
        # running sums of training / test loss
        self.loss = 0.
        self.test_loss = 0.

    def increment_epoch(self):
        """Count one more epoch for this cluster."""
        self.epoch_count = self.epoch_count + 1

    def update_loss(self, new_loss):
        """Add `new_loss` to the accumulated training loss."""
        self.loss = self.loss + new_loss

    def update_test_loss(self, new_loss):
        """Add `new_loss` to the accumulated test loss."""
        self.test_loss = self.test_loss + new_loss

In [None]:
def yamane_train(
    epochs,      # number of epochs to run
    m,           # number of negative samples
    data,        # class instance containing all the data required for training/testing        
    embedding_layer,
    threshold    = 0.15,     # threshold; similarity below this score will trigger new cluster
    negative_option = 'random', # inject lambda responsible for determining negative sample choice
    phi_init_option = None,
    sigmoid_kernel_constraint = None):    
    """Jointly train a list of YamaneCluster models and assign each training pair to one.

    Training pairs are visited one at a time in shuffled order.  Every existing
    cluster scores the pair; if the best score is below `threshold` (and fewer
    than 25 clusters exist) a new cluster is created for it, otherwise the pair
    goes to the best-scoring cluster.  The chosen cluster is then updated on the
    pair extended with negative samples, unless it already reached `epochs`.

    Parameters
    ----------
    epochs : passes continue while any cluster has epoch_count < epochs.
    m : forwarded to get_negative_tuples.
    data : object providing train_query, train_hyper, test_query, test_hyper,
        tokenizer and embeddings_dim.
    embedding_layer : forwarded to every YamaneCluster.
    threshold : best-cluster score below which a new cluster is created.
    negative_option : key into the negative-sampling table
        ('synonym', 'mix_hyper_synonym', 'similar_hyponym', 'random').
    phi_init_option : key into the phi initialiser table
        ('random_identity', 'random_normal').
        NOTE(review): the default None is not a key of that table, so calling
        without this argument raises KeyError when the first cluster is built.
    sigmoid_kernel_constraint : forwarded to every YamaneCluster.

    Returns
    -------
    (clusters, sample_clusters) : the list of YamaneCluster instances and an
    int32 array holding, per training pair, the index of the cluster it was
    last assigned to.
    """
  
    
    # lookup tables translating the string options into the callables defined elsewhere
    neg_sampling_options = {'synonym':get_negative_words, 
                            'mix_hyper_synonym': mix_sim_hyper_random,
                            'similar_hyponym': get_similar_hyponyms,
                            'random': get_random_negative_words
                           }
    
    phi_init_options = {'random_identity': random_identity, 'random_normal': random_normal}
        
    print ("Generating negative tuples...")
    # negatives are generated once, over the training and test pairs together
    negative_tuples = get_negative_tuples((data.train_query + data.test_query, data.train_hyper + data.test_hyper), 
                                           data, neg_sampling_options[negative_option], m)
    print ("Negative tuples...ok")
    
    # create sequences
    # we have two sets of inputs: one for training query and hypernym terms;
    #                             another for the test query/hyper terms;
    term_train_seq = data.tokenizer.texts_to_sequences(data.train_query)
    hyper_train_seq = data.tokenizer.texts_to_sequences(data.train_hyper)

    term_test_seq = data.tokenizer.texts_to_sequences(data.test_query)
    hyper_test_seq = data.tokenizer.texts_to_sequences(data.test_hyper)
    
    # convert all to arrays
    term_train_seq, hyper_train_seq, term_test_seq, hyper_test_seq =\
    [np.array(x, dtype='int32') for x in [term_train_seq, hyper_train_seq, term_test_seq, hyper_test_seq]]
            
    # this array stores which cluster each training sequence pertains to
    sample_clusters = np.zeros(len(term_train_seq), dtype='int32')
    
    print ("m: ", m, "lambda: ", threshold, "max epoch per cluster: ", epochs, 
           "Negative sampling: ", negative_option, 'Phi Init: ', phi_init_option)
    
    
    print ("Sample clusters size: ", len(sample_clusters))
    # list containing 1 YamaneCluster (model + epoch/loss counters) per cluster
    clusters = []
    # add default cluster to our list;
    # every cluster receives the same embedding layer loaded with the pre-trained weights
    
    clusters.append(YamaneCluster(embedding_layer, phi_dim=data.embeddings_dim,
                                  phi_init = phi_init_options[phi_init_option], sigmoid_kernel_constraint = sigmoid_kernel_constraint))
    
    # get training set indices
    indices = np.arange(len(term_train_seq))  
    
    # get test set indices
    test_indices = np.arange(len(term_test_seq))
            
    # initialise each training sample to cluster 0
    sample_clusters[indices] = 0        
    
    # seed random generator
    np.random.seed(42)
    
    # indicator of "current" sample cluster index
    z_i = 0
    
    # train algorithm
    #for epoch in range(epochs):
    # NOTE(review): `epoch` is never read below; progress is tracked per cluster via epoch_count
    epoch = 0    
    
    # keep making passes until every cluster (including ones created late) reached `epochs`
    while np.min([c.epoch_count for c in clusters]) < epochs:
        # reset loss for each cluster; the training loss of a finished cluster is kept,
        # the test loss is reset for every cluster
        for c in clusters:
            if c.epoch_count < epochs:                
                c.loss = 0.
            c.test_loss = 0.                
        
        # shuffle indices every epoch
        np.random.shuffle(indices)
        
        # train algorithm by stochastic gradient descent, one sample at a time
        for idx, i in enumerate(indices):                        
            if (idx + 1) % 500 == 0:
                print ("Processed ", idx+1, "samples...")
            
            # calculate similarity on all clusters
            sim = list(map(lambda x: x.model.predict([term_train_seq[i], hyper_train_seq[i]]), clusters))
            max_sim = np.argmax(sim)
            #print "Term:", tokenizer.index_word[term_train_seq[i][0]], 'Hyper:', tokenizer.index_word[hyper_train_seq[i][0]], "Max Similarity cluster:", max_sim, "(sim = %0.8f)" % (sim[max_sim])
            # limit cluster creation to a max of 25.
            if ((sim[max_sim] < threshold) and (len(clusters) < 25)): 
                # no cluster scores this pair high enough: add new cluster to list of clusters
                clusters.append(YamaneCluster(embedding_layer, phi_dim=data.embeddings_dim,
                                phi_init = phi_init_options[phi_init_option], sigmoid_kernel_constraint = sigmoid_kernel_constraint))
                
                # assign current cluster index to latest model
                z_i = len(clusters) - 1
                sample_clusters[i] = z_i
            else:            
                # otherwise the pair belongs to the best-scoring cluster
                z_i = max_sim
                sample_clusters[i] = z_i                
                        
            # if current cluster reached/exceeded epoch count, skip current sample (i.e don't update cluster)
            if clusters[z_i].epoch_count < epochs:                                            
                # extend the sample with negative samples
                # (presumably returns term ids, hypernym ids and labels — confirm against extend_batch_with_negatives)
                batch_X_term, batch_X_hyper, batch_y_label =\
                    extend_batch_with_negatives(term_train_seq[i], 
                                                hyper_train_seq[i],
                                                negative_tuples,
                                                data.tokenizer
                                               )  

                # update parameters of cluster; element 0 of train_on_batch's result is accumulated as the loss
                clusters[z_i].update_loss(
                    clusters[z_i].model.train_on_batch([batch_X_term, batch_X_hyper], batch_y_label)[0]
                )
            
            # measure test loss 
            # every 100 processed samples we evaluate on 32 randomly chosen test pairs,
            # extended with negatives. Each resulting sample is scored by every cluster and its
            # loss is credited to the cluster with the lowest loss
            
            if (idx + 1) % 100 == 0:
                np.random.shuffle(test_indices)
                batch_query, batch_hyper = term_test_seq[test_indices[:32]], hyper_test_seq[test_indices[:32]]
                batch_query, batch_hyper, test_y_label =\
                    extend_batch_with_negatives(batch_query, 
                                                batch_hyper,
                                                negative_tuples,
                                                data.tokenizer
                                               )  
                #batch_label = [1.] * batch_query.shape[0]
                for q, h, l in zip(batch_query, batch_hyper, test_y_label):                                    
                    test_losses = list(map(lambda c: c.model.test_on_batch([q, h], [l])[0], clusters))
                    best_cluster = np.argmin(test_losses)
                    clusters[best_cluster].update_test_loss(
                        test_losses[best_cluster]
                    )                    
                                                                                                                      
        # increase epoch count for all clusters, including those created during this pass
        # and those that already reached `epochs`
        for cluster in clusters:            
            cluster.epoch_count += 1
                
        # the reported losses are means over clusters of the per-cluster accumulated sums
        print('Epoch:', max([c.epoch_count for c in clusters]), 'Cluster #:', len(clusters) ,
              'Loss:', np.mean([c.loss for c in clusters]),
              'Test Loss:', np.mean([c.test_loss for c in clusters]))
    return clusters, sample_clusters

In [None]:
import datetime


# initialise the embedding layer which will be shared among all clusters
embedding_layer = get_embeddings_model(
    dim=data.embeddings_dim,
    embedding_matrix=data.embedding_matrix,
)
epochs, m = 15, 10

print ("Training started...")
# train the clusters; sample_clusters records the cluster assigned to each training pair
clusters, sample_clusters = yamane_train(
    epochs,
    m,
    data,
    embedding_layer,
    threshold=0.2,
    negative_option='mix_hyper_synonym',
    phi_init_option='random_normal',
    sigmoid_kernel_constraint=ForceToOne(),
)

# timestamp marking the end of training
print (datetime.datetime.now())

### Train KNN classifier on clustering data jointly learnt by model

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# distance-weighted 5-nearest-neighbour classifier; fitted below to map a term embedding to a cluster id
neigh = KNeighborsClassifier(n_neighbors=5, weights='distance')

# prepare knn dataset based on learnt clusters
# NOTE(review): sample_clusters is indexed like data.train_query inside yamane_train, whereas this
# tokenises the notebook-level train_query — confirm both hold the same pairs in the same order
train_seq = np.array(data.tokenizer.texts_to_sequences(train_query))

X_knn = {}
for idx, c in enumerate(clusters):
    cluster_ids = np.where(sample_clusters == idx)
    # we can reduce duplicate terms to unique terms
    uniq_terms = np.unique(train_seq[cluster_ids])
    X_knn[idx] = data.embedding_matrix[uniq_terms]

# stack the per-cluster feature rows and build the matching label vector.
# Labels are created with an explicit integer dtype: np.array([k] * 0) is an empty float64
# array, so a cluster that ended up with no samples used to promote every label to float
X_features = np.vstack([X_knn[k] for k in range(len(clusters))])
y = np.hstack([np.full(X_knn[k].shape[0], k, dtype='int16') for k in range(len(clusters))])

neigh.fit(X_features, y)

# Yamane Evaluation Code

In [None]:
# check likelihood of terms semantically related by hypernymy over all clusters
i = data.tokenizer.word_index['baby']
j = data.tokenizer.word_index['child']
# one prediction per cluster for the (baby, child) pair
[c.model.predict([[i], [j]]) for c in clusters]

## Yamane Clusters

Analysing the hypernym element of the training pairs constituting a cluster, a pattern emerges. The training algorithm has realised a semantic split and has clustered terms around particular hypernyms. 
Given a model trained with the following parameters:
* m:  10 lambda:  0.15 max epoch per cluster:  15 Negative sampling:  random Phi Init:  random_normal

, the most populated clusters were, in descending order: 
* Counter({4: 743, 0: 674, 7: 311, 1: 279, 5: 236, 13: 166, 11: 149, 22: 139, 23: 138, 6: 135, 24: 118, 21: 117, 18: 116, 19: 108, 17: 106, 3: 105, 20: 97, 14: 94, 16: 90, 15: 89, 9: 80, 10: 78, 12: 72, 8: 68, 2: 66})

For instance, cluster 4, the most populated cluster, features animals (282), mammals (119) and birds (50).  Cluster 0 (the second most populated) features chordates (203), vertebrates (201), placentals (108) and invertebrates (56). <br>

It is likely that the model is learning the optimal projections to transform any given hyponym to the hypernym which features frequently in a cluster.  For instance, if we multiply the embedding of "barley" with the projection matrix learnt from the samples in cluster 0, the resultant vector would be similar to "vertebrate", even though barley is clearly not an animal.  

When we compute the product with the projection matrix of every cluster and compare the result with every word in the vocab, the top 15 most similar words are:

[('plant', 9.292432),<br>
 ('food', 5.7546735),<br>
 ('herb', 5.4948673),<br>
 ('vertebrate', 5.256508),<br>
 ('chordate', 4.754751),<br>
 ('cereal', 4.3708754),<br>
 ('placental', 4.155033),<br>
 ('grass', 4.0776873),<br>
 ('factory', 3.7920942),<br>
 ('beverage', 3.3642926),<br>
 ('shrub', 3.3227572),<br>
 ('animal', 3.1966455),<br>
 ('produce', 2.8567104),<br>
 ('ruminant', 2.2009413),<br>
 ('angiosperm', 2.1797898)]<br>

It turns out that 5 (out of 6) of the hypernyms in the gold test data were correctly generated.  However, note that chordate, vertebrate and placental feature in the top 10 words.

In [None]:
# check out the hypernyms of the training pairs assigned to one particular cluster (here: 15)
hypers_in_cluster = [data.train_hyper[x] for x in np.where(sample_clusters == 15)[0]]

# hypernym frequencies, most frequent first
freq_hyper_cluster = Counter(hypers_in_cluster)
freq_hyper_cluster.most_common()

In [None]:
# return some basic cluster data
from collections import Counter
# how many training samples ended up in each trained cluster
print (Counter(sample_clusters))
print ("-"*30)
print ("Train and test loss per cluster")

# one row per cluster: index, epochs counted, accumulated train loss, accumulated test loss
for idx, c in enumerate(clusters):
    stats = (idx, c.epoch_count, c.loss, c.test_loss)
    print (*stats)

### Find Yamane candidate hypernyms

In [None]:
def alt_yamane_get_top(word, hyper_candidates, clusters, data, top, bias_list):
    """Collect hypernym candidates for `word` from several clusters and keep the best `top`.

    word             -- query term
    hyper_candidates -- not used by this function; kept for interface compatibility
    clusters         -- iterable of clusters (each exposing `.model`) to query
    data             -- forwarded to alt_get_hypernym
    top              -- number of (candidate, score) results to return
    bias_list        -- iterable of biases, one per entry of `clusters`, in the same order

    Returns the merged per-cluster results sorted by their second element in
    descending order, cut to `top` entries.

    Each cluster is paired with its own entry of `bias_list`.  Previously the
    parameter was ignored and a notebook-level `yamane_bias` was indexed by
    position, which picks the wrong bias (or fails with NameError) whenever
    `clusters` is a subset of all clusters.
    """
    # the shared embedding weights are the same for every cluster: fetch them once
    # (`embedding_layer` is the notebook-level embeddings model)
    embedding_weights = embedding_layer.get_layer(name="WE").get_weights()[0]

    yam_results = []
    # zip also accepts lazy iterables (e.g. map objects); it stops at the shorter input
    for c, bias in zip(clusters, bias_list):
        # 10 is forwarded positionally; presumably the number of candidates requested
        # per cluster — confirm against alt_get_hypernym
        yam_results.extend(
            alt_get_hypernym(word, c.model, data, embedding_weights, 10, bias=bias)
        )
    return sorted(yam_results, key=lambda x: x[1], reverse=True)[:top]


def predict_yamane_hypernyms(data, clusters):
    """Predict hypernyms for every distinct test query, using only the clusters the KNN selects.

    data     -- object providing tokenizer.word_index, embedding_matrix and test_query
    clusters -- list of trained clusters, each exposing `.model`

    For each query the notebook-level KNN classifier `neigh` gives a probability
    per cluster label; every cluster with non-zero probability is queried through
    alt_yamane_get_top and the 15 best candidates are kept.

    Returns a dict mapping each query word to its list of predicted hypernyms.
    """
    word_index = data.tokenizer.word_index
    hyper_candidates = [[word_index[hyper]] for hyper in word_index.keys()]

    # second weight array of each cluster's 'Prediction' layer, first element
    # (the bias of the single output unit)
    yamane_bias = [c.model.get_layer('Prediction').get_weights()[1][0] for c in clusters]

    ordered_queries = sorted(set(data.test_query))

    results = {}
    for idx, word in enumerate(ordered_queries):
        if (idx + 1) % 25 == 0:
            print ("Done", idx + 1)

        # find clusters best suited to this word
        word_id = word_index[word]
        cluster_probs = neigh.predict_proba(data.embedding_matrix[word_id].reshape(1,-1))

        # column j of predict_proba belongs to label neigh.classes_[j]; a cluster that
        # contributed no training rows has no column, so column positions must be mapped
        # back to cluster ids instead of being used as ids directly
        proba_columns = np.where(cluster_probs > 0.)[1]
        cluster_idx = [int(label) for label in neigh.classes_[proba_columns]]

        specific_clusters = [clusters[c] for c in cluster_idx]
        specific_bias = [yamane_bias[c] for c in cluster_idx]

        predicted_hypers = alt_yamane_get_top(word, hyper_candidates, specific_clusters, data, 15, specific_bias)

        # keep the words, drop the scores
        results[word] = [h for h, p in predicted_hypers]

    return results

In [None]:
# compute Yamane model performance results

import math

# predictions for every distinct test query
yamane_predictions = predict_yamane_hypernyms(data, clusters)

print ("Yamane evaluation:")
score_names, all_scores = get_evaluation_scores(data, yamane_predictions)
# report each score averaged over all entries of all_scores
for k, score_name in enumerate(score_names):
    mean_score = sum(score_list[k] for score_list in all_scores) / len(all_scores)
    print (score_name+': '+str(round(mean_score, 5)))
print ("")
print ("Ustalov-style evaluation:")
ats = get_ustalov_evaluation_scores(data, yamane_predictions)
# A@1, A@2, ... in the order returned
ats_string = ', '.join('A@%d=%.4f' % (rank, value) for rank, value in enumerate(ats, start=1))
print (ats_string)

-------------------------------------------------------------------------------------------------------------

# Scratch Pad

In [None]:
# scratch: build negative tuples for the test pairs with the random sampler;
# the last argument (5) is passed in the position yamane_train uses for m, the number of negative samples
get_negative_tuples((data.test_query, data.test_hyper), data, get_random_negative_words, 5)

## Introspect CRIM model

In [None]:
# have a look at the prediction layer weights
prediction_layer = crim_model.get_layer(name='Prediction')
print (prediction_layer.get_weights())

# mean of the first weight array of each projection layer
#projs = ['Phi0', 'Phi1', 'Phi2', 'Phi3', 'Phi4']
projs = ['Phi0']
for p in projs:
    first_weights = crim_model.get_layer(name=p).get_weights()[0]
    print (np.mean(first_weights))

In [None]:
# Find hypernyms by running every vocab word against each query term and reading off the prediction.
# Rather slow and inelegant.  A better (faster) way to evaluate the model is to apply the learnt Phi
# projection matrix to the query term and then look for the most similar words in the vocab using
# cosine similarity.

# If a single projection matrix is learnt, we can easily ignore the LR parameters (scalar multiple)
# as well as the bias (applied equally to all candidate hypernyms)

# every vocabulary index, each wrapped in its own single-element list
hyper_candidates = [[word_id] for word_id in data.tokenizer.word_index.values()]

def crim_get_top_hypernyms(query, hyper_candidates, model, data, top):
    """Score every candidate against `query` with `model` and return the `top` best.

    query            -- query word, looked up in data.tokenizer.word_index
    hyper_candidates -- list of single-element lists of vocabulary indices
    model            -- object whose predict([[query_index], candidate]) yields the score
    data             -- object providing tokenizer.word_index and tokenizer.index_word
    top              -- number of results to return

    Returns a list of (word, score) tuples, highest score first.
    """
    tokenizer = data.tokenizer
    query_index = tokenizer.word_index[query]

    # the query itself is not a valid hypernym candidate
    valid_candidates = [cand for cand in hyper_candidates if cand != [query_index]]

    # one model call per remaining candidate
    candidate_sim = [model.predict([[query_index], cand]).flatten()[0]
                     for cand in valid_candidates]

    # positions of the `top` highest scores, best first
    top_idx = np.argsort(candidate_sim)[::-1][:top]
    top_hyper = np.array(valid_candidates)[top_idx].flatten()

    return [(tokenizer.index_word[word_id], candidate_sim[pos])
            for word_id, pos in zip(top_hyper, top_idx)]


# smoke test: the 15 best-scoring candidates for 'sofa' according to crim_model
crim_get_top_hypernyms('sofa', hyper_candidates, crim_model, data, 15)

In [None]:
# test the get-hypernym routine on 'deaf', passing the first weight array of crim_model's
# "TermEmbedding" layer and 15 as the last positional argument
alt_get_hypernym('deaf',crim_model, data, crim_model.get_layer(name="TermEmbedding").get_weights()[0], 15)

In [None]:
# exceedingly slow, but technically correct method to find most likely hypernyms for all test terms
def yamane_get_top_hypernym(query, hyper_candidates, clusters, data, top):
    """Score every candidate against `query` on all clusters and return the `top` best.

    query            -- query word, looked up in data.tokenizer.word_index
    hyper_candidates -- list of single-element lists of vocabulary indices
    clusters         -- clusters whose `.model.predict` provides the score
    data             -- object providing tokenizer.word_index and tokenizer.index_word
    top              -- number of results to return

    A candidate's score is the maximum prediction over all clusters.
    Returns a list of (word, score) tuples, highest score first.
    """
    tokenizer = data.tokenizer
    query_index = tokenizer.word_index[query]

    # remove actual query from candidates
    valid_candidates = [cand for cand in hyper_candidates if cand[0] != query_index]

    hyper_probs = []
    for done, hyper in enumerate(valid_candidates, start=1):
        if done % 500 == 0:
            print ("Done", done)
        # best prediction any cluster gives for this (query, candidate) pair
        per_cluster = [cl.model.predict([[query_index], hyper]).flatten()[0] for cl in clusters]
        hyper_probs.append(np.max(per_cluster))

    # positions of the `top` highest scores, best first
    top_idx = np.argsort(hyper_probs)[::-1][:top]
    top_hyper = np.array(valid_candidates)[top_idx].flatten()

    return [(tokenizer.index_word[word_id], hyper_probs[pos])
            for word_id, pos in zip(top_hyper, top_idx)]

In [None]:
#yamane_predictions = predict_yamane_hypernyms(data, clusters)
# every vocabulary index, each wrapped in its own single-element list
hyper_candidates = [[word_id] for word_id in data.tokenizer.word_index.values()]
# per cluster: first element of the second weight array of its 'Prediction' layer
yamane_bias = [c.model.get_layer('Prediction').get_weights()[1][0] for c in clusters]
# try the fast lookup on a single word over all clusters
alt_yamane_get_top('deaf', hyper_candidates, clusters, data, 15, yamane_bias )