## Experiment with Yamane's Projection Learning method

In [1]:
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

This part loads the dataset, removes the training pairs for which no embedding vector exists, and prepares the terms for tokenisation: queries and hypernyms are tokenised separately, so every word becomes a single-element sequence (i.e. a list of one-word lists).

In [2]:
import codecs
import os
import csv
from collections import defaultdict

def read_subsumptions(filename):
    """Load (hyponym, hypernym) pairs from a tab-separated UTF-8 file.

    Only the first two columns of every row are used; quoting is disabled,
    so quote characters are kept as literal text.

    Returns a list of 2-tuples in file order.
    """
    with codecs.open(filename, encoding='utf-8') as handle:
        rows = csv.reader(handle, delimiter='\t', quoting=csv.QUOTE_NONE)
        return [(fields[0], fields[1]) for fields in rows]

def read_synonyms(filename):
    """Load a word -> synonyms mapping from a tab-separated UTF-8 file.

    Column 0 is the head word and column 1 a comma-separated synonym list.
    Rows that repeat a head word extend its existing list.

    Returns a ``defaultdict`` whose ``default_factory`` has been cleared, so
    looking up an unknown word raises ``KeyError`` like a plain dict.
    """
    mapping = defaultdict(list)

    with codecs.open(filename, encoding='utf-8') as handle:
        for fields in csv.reader(handle, delimiter='\t', quoting=csv.QUOTE_NONE):
            mapping[fields[0]].extend(fields[1].split(','))

    # freeze the mapping: missing keys must no longer be created on access
    mapping.default_factory = None
    return mapping

In [3]:
#os.path.exists('../Ustalov/subsumptions-train.txt.orig')

# one list of (hyponym, hypernym) pairs per split, as returned by read_subsumptions
# NOTE(review): valid_subs is not referenced by any other cell visible in this
# part of the notebook - confirm whether it is still needed
train_subs = read_subsumptions('../Ustalov/subsumptions-train.txt.orig')
test_subs = read_subsumptions('../Ustalov/subsumptions-test.txt.orig')
valid_subs = read_subsumptions('../Ustalov/subsumptions-validation.txt.orig')

# head word -> list of synonyms, as returned by read_synonyms
synonyms = read_synonyms('../Ustalov/synonyms.txt')

### Construct pre-trained word embeddings dictionary

In [4]:
import io

# Build a word -> vector lookup from the pre-trained GloVe text file.
# Every line has the form "<token> <v1> <v2> ... <vN>", separated by single spaces.
embeddings_index = {}
# Decode explicitly as UTF-8 (like the dataset readers) so that the keys are
# text strings comparable with the dataset terms, independently of the
# Python version and of the platform's default encoding.
with io.open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        # split on the literal separator only: on decoded text a bare split()
        # would also break tokens on Unicode whitespace (e.g. no-break space)
        values = line.rstrip().split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        
print ("Total number of word vectors is: ", len(embeddings_index))

('Total number of word vectors is: ', 400000)


In [5]:
# eliminate training tuples for which no embedding exists
from collections import Counter

def get_terms_having_vectors(dataset, embeddings=None):
    """Keep only the (query, hypernym) pairs whose two words both have a vector.

    Parameters
    ----------
    dataset : iterable of (query, hypernym) pairs
    embeddings : container supporting ``in``, optional
        Vocabulary of words that have a pre-trained vector.  Defaults to the
        notebook-level ``embeddings_index``.

    Returns
    -------
    (list, list)
        Parallel lists with the surviving query terms and hypernyms, in input
        order.  Both lists are empty when no pair survives.
    """
    if embeddings is None:
        embeddings = embeddings_index

    kept = [(q, h) for q, h in dataset if q in embeddings and h in embeddings]
    if not kept:
        # zip(*[]) yields nothing, so the two-way unpacking below would raise
        return [], []

    query, hyper = zip(*kept)
    return list(query), list(hyper)
    
# turn each split into parallel query / hypernym lists, keeping only the pairs
# for which both words have an embedding
train_query, train_hyper = get_terms_having_vectors(train_subs)
test_query, test_hyper = get_terms_having_vectors(test_subs)

# sanity check on the number of surviving pairs
# NOTE(review): the literals below are tied to the exact dataset and embedding
# files used; the asserts fail as soon as either input changes
assert len(train_query) == len(train_hyper) == 4338
assert len(test_query) == len(test_hyper) == 1533

### Remove OOV words from synonym list

In [6]:
# Drop every synonym entry whose head word has no embedding and strip the
# out-of-vocabulary words from the remaining synonym lists.
for k, v in list(synonyms.items()):
    if k not in embeddings_index:
        synonyms.pop(k)
    else:
        # Rebuild the list contents in one go.  Calling v.remove() while
        # iterating over v shifts the remaining items, so the word following
        # every removed word was never checked.
        v[:] = [word for word in v if word in embeddings_index]

# flatten list of synonyms
syns = [word for v in synonyms.values() for word in v]

# confirm that all words in synonym vocab have embeddings representation
assert all(word in embeddings_index for word in syns)

### Define class Data which encapsulates all the bits and pieces we require for training algorithms

In [7]:
# Data class that encapsulates all the word-based data needed to train the various algorithms.
# We assume that all words missing from the embeddings have already been filtered out.
class Data:
    """Bundle the datasets, the tokenizer and the embedding matrix used for training.

    Attributes set by the constructor:
      neg_vocab        -- set of all train/test hypernyms (pool for random negatives)
      train_query, train_hyper, test_query, test_hyper, synonyms
                       -- the constructor arguments, stored as given
      embeddings_dim   -- length of the embedding vectors
      tokenizer        -- Tokenizer fitted on all queries, hypernyms and synonyms
      embedding_matrix -- float32 array of shape (len(word_index) + 1, embeddings_dim);
                          row i holds the unit-normalised vector of the word with
                          tokenizer index i, rows without a vector stay zero
    """
    def __init__(self, train_query, train_hyper, test_query, test_hyper, synonyms, embeddings):
        # vocab made up of all hypernyms; negative samples are drawn from it
        # once the synonyms of a query term are exhausted
        self.neg_vocab = set(train_hyper + test_hyper)

        # keep the inputs so that all the data can be passed around via one instance
        self.train_query = train_query
        self.train_hyper = train_hyper
        self.test_query = test_query
        self.test_hyper = test_hyper
        self.synonyms = synonyms

        # flatten the synonym lists of the mapping that was passed in, instead
        # of depending on a notebook-level `syns` variable being defined
        syns = [word for words in synonyms.values() for word in words]

        # size of the hyponym side (train + test queries + synonyms)
        n_hyponyms = len(set(train_query + test_query + syns))
        # hypernyms enter the model as training gold positives, test gold
        # positives (during evaluation) or negative samples
        n_hypernyms = len(self.neg_vocab)

        # read the dimensionality off any stored vector (no specific word required)
        self.embeddings_dim = embeddings[next(iter(embeddings))].shape[0]

        # initialise and fit tokenizer
        self.tokenizer = Tokenizer(num_words = n_hyponyms + n_hypernyms + 1)
        self.tokenizer.fit_on_texts(train_query + train_hyper + test_query + test_hyper + syns)

        # construct embedding_matrix: one row per tokenizer index, plus row 0
        n_rows = len(self.tokenizer.word_index) + 1
        self.embedding_matrix = np.zeros((n_rows, self.embeddings_dim), dtype='float32')

        for word, i in self.tokenizer.word_index.items():
            embedding_vector = embeddings.get(word)
            if embedding_vector is None:
                continue

            norm = np.linalg.norm(embedding_vector)
            if norm > 0:
                # divide into a new array: an in-place `/=` would silently
                # rescale the vector stored in the caller's `embeddings` dict
                embedding_vector = embedding_vector / norm
            self.embedding_matrix[i, :] = embedding_vector

        # confirm shape
        assert self.embedding_matrix.shape == (n_rows, self.embeddings_dim)
        
        
        

In [8]:
# bundle the filtered datasets, the synonyms and the embeddings for the cells below
data = Data(train_query, train_hyper, test_query, test_hyper, synonyms, embeddings_index)

### Negative sampling strategies

In [402]:
# first exhaust the synonyms of the query term;
# find the rest by drawing random terms from data.neg_vocab;
# however, make sure that the chosen words are not valid hypernyms of the query.
#
# positive_sample is a (query, hyper) tuple;
# word_hypernyms maps every query term to the list of its gold hypernyms.
def get_negative_words(positive_sample, word_hypernyms, data, sample_size=5):
    """Return a list of `sample_size` negative words for one positive pair.

    Parameters
    ----------
    positive_sample : (query, hypernym) tuple; only the query is used
    word_hypernyms : dict mapping a query term to its gold hypernyms
    data : object exposing `synonyms` (word -> list) and `neg_vocab` (set of words)
    sample_size : number of negative words to return

    Returns
    -------
    list of str
        A random subset of the query's synonyms when there are at least
        `sample_size` of them; otherwise all the synonyms followed by random
        draws (with replacement) from `data.neg_vocab`, excluding the gold
        hypernyms of the query and the synonyms already chosen.

    Raises
    ------
    KeyError   if random words are needed and the query is missing from `word_hypernyms`
    ValueError if random words are needed but no eligible word is left
    """
    query = positive_sample[0]

    # synonyms form the first part of the negative examples; take a copy of
    # the list held by `data` so that it is never modified
    neg_samples = list(data.synonyms[query]) if query in data.synonyms else []

    if len(neg_samples) >= sample_size:
        # enough synonyms: pick a random subset, returned as a plain list so
        # that both branches have the same return type
        return np.random.choice(neg_samples, sample_size, replace=False).tolist()

    # there are not enough synonyms; compound with random words
    positive_hypernyms = set(word_hypernyms[query])
    already_chosen = set(neg_samples)

    # eliminate correct hypernyms and chosen synonyms from neg_vocab
    word_choice = [nv for nv in data.neg_vocab
                   if nv not in positive_hypernyms and nv not in already_chosen]
    # choose the missing sample_size - len(neg_samples) words
    neg_samples.extend(np.random.choice(word_choice, sample_size - len(neg_samples)).tolist())

    return neg_samples
    

In [403]:
# find the words most similar to the given hypernym which are not valid hypernyms of the query
def get_similar_hypernyms(positive_sample, word_hypernyms, data, sample_size=5):
    """Return up to `sample_size` words that resemble the hypernym but are not gold hypernyms.

    Similarity is the dot product between rows of `data.embedding_matrix`.
    The 30 words most similar to `positive_sample[1]` are considered, the gold
    hypernyms of `positive_sample[0]` are removed from them, and the first
    `sample_size` remaining words are returned (most similar first).  Fewer
    words are returned when the filtered pool is smaller than `sample_size`.
    """
    hyper_id = data.tokenizer.word_index[positive_sample[1]]

    # every known token id except the hypernym itself
    candidate_ids = np.array([w for w in data.tokenizer.index_word.keys() if w != hyper_id],
                             dtype='int64')

    # one matrix-vector product instead of one dot product per candidate
    sims = np.dot(data.embedding_matrix[candidate_ids], data.embedding_matrix[hyper_id])

    # get 30 most similar words to hypernym
    most_sim_idx = np.argsort(sims)[::-1][:30]
    similar_hypernyms = [data.tokenizer.index_word[int(candidate_ids[idx])] for idx in most_sim_idx]

    # make sure that similar words are not actual hypernyms
    positive_hypernym = set(word_hypernyms[positive_sample[0]])

    return [w for w in similar_hypernyms if w not in positive_hypernym][:sample_size]

In [416]:
# get the words most similar to the query term, skipping the query and its gold hypernyms
def get_similar_hyponyms(positive_sample, word_hypernyms, data, sample_size=5):
    """Return the `sample_size` words whose embeddings are closest to the query term.

    Candidates are all the tokenizer words except the query itself and its gold
    hypernyms; similarity is the dot product between rows of
    `data.embedding_matrix`.  The result is ordered from most to least similar.
    `word_hypernyms` is only read, never modified.
    """
    query = positive_sample[0]
    word_index = data.tokenizer.word_index

    # Build a private exclusion set.  Appending the query to
    # word_hypernyms[query] would modify the caller's mapping on every call.
    excluded = set(word_hypernyms[query])
    excluded.add(query)

    # get candidate words - all vocab except hypernyms of current word and current word
    candidate_words = [w for w in word_index.keys() if w not in excluded]
    candidate_ids = np.array([word_index[w] for w in candidate_words], dtype='int64')

    # one matrix-vector product instead of one dot product per candidate
    hypo_sims = np.dot(data.embedding_matrix[candidate_ids],
                       data.embedding_matrix[word_index[query]])

    most_sim_idx = np.argsort(hypo_sims)[::-1][:sample_size]
    return [candidate_words[i] for i in most_sim_idx]
    

In [405]:
# get one word similar to the hypernym, then top up with get_negative_words
def mix_sim_hyper_random(positive_sample, word_hypernyms, data, sample_size=5):
    """Return negatives made of one hard example plus synonym/random words.

    The first negative is the word most similar (dot product between rows of
    `data.embedding_matrix`) to the hypernym `positive_sample[1]`, chosen among
    all tokenizer words that are neither the query nor one of its gold
    hypernyms.  When `sample_size` is greater than 1, the remaining
    `sample_size - 1` negatives come from `get_negative_words`, with the query
    itself excluded from its random pool.  `word_hypernyms` is only read.
    """
    query, hyper = positive_sample[0], positive_sample[1]
    word_index = data.tokenizer.word_index

    # Gold hypernyms plus the query itself, built as a private copy.
    # Appending to word_hypernyms[query] would grow the caller's list on every call.
    excluded_words = list(word_hypernyms[query]) + [query]
    excluded = set(excluded_words)

    # get candidate words - all vocab except hypernyms of current word and current word
    candidate_words = [w for w in word_index.keys() if w not in excluded]
    candidate_ids = np.array([word_index[w] for w in candidate_words], dtype='int64')

    # find similarity of all candidate words w.r.t. current hypernym
    hyper_sims = np.dot(data.embedding_matrix[candidate_ids],
                        data.embedding_matrix[word_index[hyper]])

    # get most similar word to hypernym which is not hypernym
    most_sim_idx = np.argsort(hyper_sims)[::-1][0]
    neg_samples = [candidate_words[most_sim_idx]]

    if len(neg_samples) < sample_size:
        # get_negative_words only looks up the entry of the query term, so a
        # one-entry mapping is enough to also keep the query out of its pool
        neg_samples.extend(get_negative_words(positive_sample, {query: excluded_words}, data,
                                              sample_size=sample_size - 1))

    return neg_samples


In [418]:
# Create list of tuples where every element follows ((query, hypernym), negative_word)
def get_negative_tuples(terms, data, negative_words_lambda, sample_size):
    """Generate the negative words of every positive pair.

    Parameters
    ----------
    terms : (queries, hypernyms) - two parallel sequences describing the positive pairs
    data : passed through untouched to `negative_words_lambda`
    negative_words_lambda : callable(positive_pair, word_hypernyms, data, sample_size)
        returning an iterable of negative words for that pair
    sample_size : passed through to `negative_words_lambda`

    Returns
    -------
    list of ((query, hypernym), negative_word) tuples, grouped by positive pair
    in input order.
    """
    input_query, input_hyper = terms

    # map every query term to the list of its gold hypernyms, in a single
    # pass over the pairs (instead of rescanning all pairs per unique term)
    word_hypernyms = {}
    for q, h in zip(input_query, input_hyper):
        word_hypernyms.setdefault(q, []).append(h)

    negative_tuples = []
    for words in zip(input_query, input_hyper):
        negatives = negative_words_lambda(words, word_hypernyms, data, sample_size)
        negative_tuples.extend((words, n) for n in negatives)
    return negative_tuples


In [422]:
# demo: two negatives (one similar-hypernym word plus one synonym/random word)
# for every positive pair of the test set
get_negative_tuples((data.test_query, data.test_hyper), data, mix_sim_hyper_random, 2)
# NOTE(review): the commented-out calls below pass a (queries, hypernyms) tuple
# (and, in one case, extra arguments) where the strategy functions index their
# second argument by query term - they look like leftovers of an older signature
#get_negative_words(('zebra', 'mammal'), (data.train_query, data.train_hyper), data.synonyms, data.neg_vocab, 2)

#get_negative_words(('mackerel', 'fish'), (data.train_query, data.train_hyper), data)
#get_negative_words(('lime', 'citrus'), (data.train_query + data.test_query, data.train_hyper + data.test_hyper), data, 5)
#get_similar_hypernyms(('mackerel', 'fish'), (data.train_query, data.train_hyper), data)
#get_negative_tuples((['cat'], ['animal']), data, get_similar_hyponyms, 5)

#get_similar_hyponyms(('cat', 'animal'), (data.test_query, data.test_hyper), data)


[(('vision', 'experience'), 'knowledge'),
 (('vision', 'experience'), 'sight'),
 (('lime', 'citrus'), 'grapefruit'),
 (('lime', 'citrus'), 'coconut'),
 (('lime', 'plant'), 'factory'),
 (('lime', 'plant'), 'tangerine'),
 (('lime', 'tree'), 'pine'),
 (('lime', 'tree'), 'peach'),
 (('lime', 'food'), 'eat'),
 (('lime', 'food'), 'strawberry'),
 (('lime', 'produce'), 'develop'),
 (('lime', 'produce'), 'orange'),
 (('lime', 'fruit'), 'vegetable'),
 (('lime', 'fruit'), 'peach'),
 (('train', 'artefact'), 'arthropod'),
 (('train', 'artefact'), 'scooter'),
 (('train', 'conveyance'), 'unclean'),
 (('train', 'conveyance'), 'scooter'),
 (('train', 'transport'), 'traffic'),
 (('train', 'transport'), 'motorcycle'),
 (('train', 'vehicle'), 'car'),
 (('train', 'vehicle'), 'scooter'),
 (('train', 'artifact'), 'magical'),
 (('train', 'artifact'), 'ferry'),
 (('train', 'practice'), 'training'),
 (('train', 'practice'), 'helicopter'),
 (('train', 'transportation'), 'traffic'),
 (('train', 'transportation'),

In [None]:
# collect the (query, negative_word) pairs generated for the positive pair ('zebra', 'placental')
# NOTE(review): in the cells visible here `neg_tuples` is only assigned further
# down, so this cell fails on a fresh top-to-bottom run
zip(*[(xy[0], x_) for xy, x_ in neg_tuples if xy == ('zebra', 'placental')])

In [14]:
# function that returns the batch of positive samples extended with their negative samples
# we need to pass:
# the batch hyponym terms, batch of hypernym terms, negative_tuples, tokenizer 
# to create sequences
def extend_batch_with_negatives(batch_X_term, batch_X_hyper, negative_tuples,
                                tokenizer):
    """Append to a batch of positive pairs the negative pairs generated for them.

    Parameters
    ----------
    batch_X_term, batch_X_hyper : arrays of token ids, one row per positive pair
    negative_tuples : iterable of ((query, hypernym), negative_word) tuples
    tokenizer : object exposing `index_word` and `texts_to_sequences`

    Returns
    -------
    (terms, hypers, labels)
        `terms` / `hypers` are the input arrays with one extra row per matching
        negative tuple stacked underneath; `labels` holds 1 for each positive
        row followed by 0 for each negative row.  When no negative tuple
        matches the batch, the inputs are returned unchanged with all-one labels.
    """
    # words of the positive pairs in this batch; a set keeps the membership
    # test below constant-time while scanning every negative tuple
    positive_words = set(
        (tokenizer.index_word[term_id], tokenizer.index_word[hyper_id])
        for term_id, hyper_id in zip(batch_X_term.flatten(), batch_X_hyper.flatten()))

    # (query, negative_word) for every negative generated for a pair of the batch
    matched = [(qh[0], h) for qh, h in negative_tuples if qh in positive_words]
    if not matched:
        # nothing to add; zip(*[]) could not be unpacked into two names
        return batch_X_term, batch_X_hyper, np.ones(batch_X_term.shape[0])

    # tokenize -ve samples
    neg_terms, neg_hyper = zip(*matched)
    neg_terms_seq = tokenizer.texts_to_sequences(neg_terms)
    neg_hyper_seq = tokenizer.texts_to_sequences(neg_hyper)

    # the first n labels are true (1s), and the rest are the -ve samples (0s)
    batch_y_label = np.concatenate((
            np.ones(batch_X_term.shape[0]),
            np.zeros(len(neg_terms_seq))
    ))

    # finally, stack -ve sequences at the bottom of +ves to create the final batch
    batch_X_term = np.vstack((batch_X_term, np.array(neg_terms_seq)))
    batch_X_hyper = np.vstack((batch_X_hyper, np.array(neg_hyper_seq)))

    return batch_X_term, batch_X_hyper, batch_y_label

In [424]:
# non-essential code; just testing the negative_extender on a one-pair batch

_terms = np.array(data.tokenizer.texts_to_sequences(train_query[4:5]))
_hypers = np.array(data.tokenizer.texts_to_sequences(train_hyper[4:5]))

# single-argument print() calls behave the same under Python 2 and Python 3
print("Generating negative Tuples")
neg_tuples = get_negative_tuples((data.train_query + data.test_query, 
                                  data.train_hyper + data.test_hyper), data, get_negative_words, 1)


_terms, _hypers, _lab = extend_batch_with_negatives(_terms, _hypers, neg_tuples, data.tokenizer)
_hypers = data.tokenizer.sequences_to_texts(_hypers)
# one line per row of the extended batch: query, (negative) hypernym, label
for idx, _t in enumerate(data.tokenizer.sequences_to_texts(_terms)):
    print("%s %s %s" % (_t, _hypers[idx], _lab[idx]))

Generating negative Tuples
swordfish chordate 1.0
swordfish tuna 0.0


In [None]:
# example on how to look up the negative samples generated for a few query terms;
# every element of neg_tuples has the form ((query, hypernym), negative_word),
# so the query term is the first item of the first element
list(zip(*[(qh[0], neg) for qh, neg in neg_tuples if qh[0] in ['cat', 'poplar', 'black']]))

In [None]:
# print the num_words limit the tokenizer was created with
print(data.tokenizer.num_words)
# print number of unique words
print(len(data.tokenizer.word_index))

## Model Definition in Keras

In [None]:
from tensorflow.keras import backend as K
# base class of ForceToOne; imported here so that the cell runs on a fresh kernel
from tensorflow.keras.constraints import Constraint

class ForceToOne (Constraint):
    """Weight constraint that pins every constrained weight to exactly 1."""
    def __call__(self, w):
        # build the ones directly: the former `w /= w` evaluates to NaN for
        # any weight that is exactly 0 (0 / 0)
        return K.ones_like(w)


In [948]:
from tensorflow.keras.layers import Input, Dense, Embedding, Dot, Flatten, Concatenate, concatenate, Reshape, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import RandomNormal, Zeros, Ones
from tensorflow.keras.regularizers import l2, l1, l1_l2
from tensorflow.keras.constraints import UnitNorm

from tensorflow.keras import backend as K
import tensorflow as tf


def get_CRIM_model(phi_k=1, train_embeddings=False,\
                   embeddings_dim=300, vocab_size=1000,\
                   embeddings_matrix=None,
                   phi_init = None,
                   phi_activity_regularisation = None,
                   sigmoid_kernel_regularisation = None,
                   sigmoid_bias_regularisation = None,
                   sigmoid_kernel_constraint = None
                  ):
    """Build and compile the projection-learning model.

    The hyponym and hypernym ids go through one shared embedding layer.  The
    hyponym embedding is projected by `phi_k` bias-free linear layers
    ('Phi0', 'Phi1', ...), each projection is dot-multiplied with the hypernym
    embedding, and the resulting score(s) feed a single sigmoid unit.

    Parameters
    ----------
    phi_k : number of projection layers
    train_embeddings : whether the embedding layer is trainable
    embeddings_dim : length of the embedding vectors (and of the projections)
    vocab_size : number of words; the embedding layer gets vocab_size + 1 rows
    embeddings_matrix : optional array of pre-trained weights for the embedding
        layer; when None the layer keeps its default initialisation
    phi_init : kernel initialiser of the projection layers
    phi_activity_regularisation : activity regulariser of the projection layers
    sigmoid_kernel_regularisation, sigmoid_bias_regularisation,
    sigmoid_kernel_constraint : settings of the final sigmoid layer

    Returns
    -------
    A model compiled with rmsprop, binary cross-entropy and accuracy.
    """
    hypo_input  = Input(shape=(1,), name='Hyponym')
    hyper_input = Input(shape=(1,), name='Hypernym')

    # one embedding layer shared by both inputs
    embedding_layer = Embedding(vocab_size + 1, embeddings_dim, embeddings_constraint = UnitNorm(axis=1),
                                name='TermEmbedding')
    hypo_embedding = embedding_layer(hypo_input)
    hyper_embedding = embedding_layer(hyper_input)

    # Add Dropout to avoid overfit
    #hypo_embedding = Dropout(0.5)(hypo_embedding)
    #hyper_embedding = Dropout(0.5)(hyper_embedding)

    # phi_k linear projections of the hyponym embedding
    phi_layer = [
        Dense(embeddings_dim, activation=None, use_bias=False,
              activity_regularizer=phi_activity_regularisation,
              kernel_initializer=phi_init,
              name='Phi%d' % (i))(hypo_embedding)
        for i in range(phi_k)
    ]

    if phi_k == 1:
        # flatten tensors
        phi = Flatten()(phi_layer[0])
        hyper_embedding = Flatten()(hyper_embedding)
    else:
        # stack the projections along axis 1, one per projection layer
        phi = concatenate(phi_layer, axis=1)

    # this is referred to as "s" in the "CRIM" paper
    phi_hyper = Dot(axes=-1, normalize=False, name='DotProduct')([phi, hyper_embedding])

    if phi_k > 1:
        phi_hyper = Flatten()(phi_hyper)

    # initialiser instances (rather than the bare classes) are the documented
    # form accepted by every Keras version
    predictions = Dense(1, activation="sigmoid", name='Prediction',
                        use_bias=True,
                        kernel_initializer=Ones(),
                        kernel_constraint= sigmoid_kernel_constraint,
                        bias_initializer=Zeros(),
                        kernel_regularizer=sigmoid_kernel_regularisation,
                        bias_regularizer=sigmoid_bias_regularisation
                       )(phi_hyper)

    # instantiate model
    model = Model(inputs=[hypo_input, hyper_input], outputs=predictions)

    # inject pre-trained embedding weights into Embedding layer (when given;
    # set_weights([None]) would fail with the default argument)
    term_embedding = model.get_layer(name='TermEmbedding')
    if embeddings_matrix is not None:
        term_embedding.set_weights([embeddings_matrix])
    term_embedding.trainable = train_embeddings

    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

    return model


### Plot model

In [None]:
# NOTE(review): this import comes from the standalone `keras` package while the
# other cells import from `tensorflow.keras` (alternative kept on the next
# line) - confirm which one matches the installed environment
from keras.utils.vis_utils import plot_model
#from tensorflow.keras.utils import plot_model

# write a diagram of the layer graph, with shapes and layer names, to model.png
# NOTE(review): no notebook-level variable named `model` is assigned in the
# cells visible here (the model built further down is called `crim_model`)
plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)

### Implement mini-batch stochastic training with negative sampling

At every epoch, we randomly shuffle the training set and split it into mini-batches of `batch_size` positive samples (32 in the experiments below).
Every positive sample in a batch is then complemented with the m negative samples generated for it.

### The training algorithm incorporates mini-batch stochastic descent and negative sampling

In [152]:
def train(model,       # the model which parameters will be learnt
          epochs,      # number of epochs to run          
          batch_size,  # size of mini-batch
          m,           # number of negative samples
          data,        # data required for training                              
          neg_strategy
         ):
    """Train `model` with mini-batches of positive pairs extended with negatives.

    The negative tuples of all train and test pairs are generated once, with
    `neg_strategy` and `m` negatives per pair.  In every epoch the training
    pairs are shuffled and cut into batches of `batch_size` positives; each
    batch is extended with its negatives and passed to `train_on_batch`.
    After assembling every training batch, a random batch of test pairs
    (extended the same way) is scored with `test_on_batch`, which is used for
    monitoring only.  The summed train and test losses are printed per epoch.

    Returns None; `model` is updated in place.
    """
    # single-argument print() calls behave the same under Python 2 and Python 3
    print("Generating negative tuples...")
    negative_tuples = get_negative_tuples((data.train_query + data.test_query, data.train_hyper + data.test_hyper), 
                                           data, neg_strategy, m)
    print("Negative tuples...ok")

    # create the id sequences once; they do not change between batches/epochs
    X_term_train = np.array(data.tokenizer.texts_to_sequences(data.train_query), dtype='int32')
    X_hyper_train = np.array(data.tokenizer.texts_to_sequences(data.train_hyper), dtype='int32')

    X_term_test = np.array(data.tokenizer.texts_to_sequences(data.test_query), dtype='int32')
    X_hyper_test = np.array(data.tokenizer.texts_to_sequences(data.test_hyper), dtype='int32')

    samples = np.arange(len(X_term_train))
    validation_samples = np.arange(len(X_term_test))

    # train algorithm
    for epoch in range(epochs):
        # reset the per-epoch loss accumulators
        loss = 0.
        test_loss = 0.

        np.random.shuffle(samples)
        shuffled_X_term = X_term_train[samples]
        shuffled_X_hyper = X_hyper_train[samples]

        for b in range(0, len(samples), batch_size):
            # produce mini-batch of +ve samples and complement it with negatives
            batch_X_term, batch_X_hyper, batch_y_label =\
            extend_batch_with_negatives(shuffled_X_term[b:b + batch_size],
                                        shuffled_X_hyper[b:b + batch_size],
                                        negative_tuples,
                                        data.tokenizer
                                       )

            # pick a random batch of test instances, of the same size as the
            # training batch, and distort it with negatives as well
            np.random.shuffle(validation_samples)
            test_idx = validation_samples[:batch_size]
            batch_X_test_term, batch_X_test_hyper, batch_y_test_label =\
            extend_batch_with_negatives(X_term_test[test_idx],
                                        X_hyper_test[test_idx],
                                        negative_tuples,
                                        data.tokenizer
                                       )

            # train on batch; element 0 of the returned metrics is the loss
            loss += model.train_on_batch([batch_X_term, batch_X_hyper], 
                                          batch_y_label)[0]

            test_loss += model.test_on_batch([batch_X_test_term, batch_X_test_hyper], 
                                              batch_y_test_label)[0]

        print('Epoch:', epoch+1, 'Loss:', loss, 'Test Loss:', test_loss)    


### Harness code to pass in the various parameters to the training algorithm

In [1104]:
from tensorflow.keras.initializers import RandomNormal, Zeros, Ones
from tensorflow.keras.regularizers import l2, l1, l1_l2

# Phi layer initialiser
def random_identity(shape, dtype="float32", partition_info=None):
    """Initialise a square projection matrix as the identity plus Gaussian noise.

    Only the last entry of `shape` is used: the result is a
    (shape[-1], shape[-1]) float32 tensor equal to the identity matrix with
    N(0, 0.01) noise added to every entry.  `dtype` and `partition_info` are
    accepted for compatibility with the initialiser call signature and ignored.
    """
    identity = K.eye(shape[-1], dtype='float32')

    rnorm = K.random_normal((shape[-1],shape[-1]), 
                             mean=0., stddev=0.01)

    # add the noise: an element-wise product with the identity would zero
    # every off-diagonal entry and leave only the tiny noise values on the
    # diagonal, i.e. a near-zero matrix instead of a noisy identity
    return identity + rnorm

def random_normal(shape, dtype="float32", partition_info=None): 
    """Initialise a weight tensor of the requested `shape` with N(0, 0.05) noise.

    `dtype` and `partition_info` are accepted for compatibility with the
    initialiser call signature and ignored.
    """
    # honour the full requested shape (identical to the former
    # (shape[-1], shape[-1]) for the square kernels used in this notebook)
    return K.random_normal(shape,
                           mean=0., stddev=0.05)

#rand_norm_m0_sd001 = RandomNormal(mean = 0.0, stddev=0.01, seed=42)
#rand_norm = RandomNormal(mean = 0.0, stddev=1., seed=42)

# negative sampling options
neg_sampling_options = {'synonym':get_negative_words, 
                        'mix_hyper_synonym': mix_sim_hyper_random,
                        'similar_hyponym': get_similar_hyponyms
                       }

# phi random init options
phi_init_options = {'random_identity': random_identity, 'random_normal': random_normal}

# implement mini-batch stochastic training
epochs = 8

batch_size = 32

# number of negative samples
m = 10
phi_k = 1
train_embeddings = True
negative_option = 'mix_hyper_synonym'
phi_init_option = 'random_normal'
np.random.seed(42)

# The embedding layer is created with vocab_size + 1 rows and then loaded with
# data.embedding_matrix, so derive the size from the matrix itself instead of
# hard-coding the row count of one particular dataset.
vocab_size = data.embedding_matrix.shape[0] - 1

# create model
crim_model = get_CRIM_model(phi_k = phi_k, train_embeddings = train_embeddings,
                            embeddings_dim = data.embeddings_dim, vocab_size = vocab_size,
                            embeddings_matrix = data.embedding_matrix,
                            phi_init = phi_init_options[phi_init_option],                            
                            sigmoid_kernel_regularisation = None,
                            sigmoid_bias_regularisation = None,
                            sigmoid_kernel_constraint = ForceToOne()
                           )

# single-argument print() behaves the same under Python 2 and Python 3
print("Training started...")
print ('Epochs: ', epochs, 'Batch size: ', batch_size, 'm: ', m, 'pki_k: ', phi_k, 'train_embeddings: ', train_embeddings,
      'Negative sampling: ', negative_option, 'Phi Init: ', phi_init_option)

train(crim_model, epochs, batch_size, m, data, neg_sampling_options[negative_option])

Training started...
('Epochs: ', 8, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 1, 'train_embeddings: ', True, 'Negative sampling: ', 'mix_hyper_synonym', 'Phi Init: ', 'random_normal')
Generating negative tuples...
Negative tuples...ok
('Epoch:', 1, 'Loss:', 57.355670511722565, 'Test Loss:', 59.72325420379639)
('Epoch:', 2, 'Loss:', 27.334548115730286, 'Test Loss:', 36.40574422478676)
('Epoch:', 3, 'Loss:', 22.817064203321934, 'Test Loss:', 33.09851683676243)
('Epoch:', 4, 'Loss:', 20.19028501957655, 'Test Loss:', 31.51719556748867)
('Epoch:', 5, 'Loss:', 18.312923535704613, 'Test Loss:', 30.627734020352364)
('Epoch:', 6, 'Loss:', 16.621946439146996, 'Test Loss:', 29.96564643085003)
('Epoch:', 7, 'Loss:', 15.119618479162455, 'Test Loss:', 29.58227378129959)
('Epoch:', 8, 'Loss:', 13.738165482878685, 'Test Loss:', 31.057048320770264)


In [1035]:
# have a look at the weights of the final sigmoid layer ('Prediction'):
# the list holds its kernel followed by its bias
crim_model.get_layer(name='Prediction').get_weights()
#model.get_layer(name='Phi0').get_weights()

[array([[1.]], dtype=float32), array([-0.59664816], dtype=float32)]

In [1037]:
# get the mean value of the kernel of every listed projection layer
projs = ['Phi0']#, 'Phi1', 'Phi2']
for p in projs:
    print(np.mean(crim_model.get_layer(name=p).get_weights()[0]))


0.0010779559


### Evaluation  code

Main observations:<br>
1. Tendency is for the model to overfit if we make the model larger than 1 projection matrix;
1. Negative samples are important for the model to learn which words are not hypernyms;
1. Although the model does seem to learn the correct words that are related to hypernymy to the query terms, it does not stop it from predicting with high confidence that similar but completely unrelated words are also hypernyms;
    1. This is really apparent for animals where the model is not able to distinguish between vertebrate and invertebrate; mammal; animal; and so forth;
    1. It's possible that we did not have enough examples to distinguish the various types of animals from each other;
    1. Also, more targeted negative samples could have helped but these would have to be hand-created;



In [677]:
from sklearn.metrics import f1_score

def evaluate_crim(data, model, neg_strategy, n_negatives=2):
    """Score `model` on the training pairs, the test pairs and a negative-augmented test set.

    Parameters
    ----------
    data : object exposing `tokenizer`, `train_query`, `train_hyper`,
        `test_query` and `test_hyper`
    model : compiled model whose `evaluate` returns (loss, accuracy)
    neg_strategy : negative sampling function used to augment the test set;
        a falsy value skips the augmented evaluation
    n_negatives : number of negatives requested per test pair (default 2)

    Returns
    -------
    (train_accuracy, basic_test_accuracy, basic_test_f1,
     negative_test_accuracy, negative_test_f1), each rounded to 5 decimals.
    The two "negative" metrics are 0. when `neg_strategy` is falsy.  The F1
    scores threshold the predicted probabilities at 0.5.
    """
    def _to_ids(words):
        # one single-token row per word
        return np.array(data.tokenizer.texts_to_sequences(words), dtype='int32')

    def _accuracy(X_term, X_hyper, labels):
        _, accuracy = np.round(model.evaluate([X_term, X_hyper], labels), 5)
        return accuracy

    def _f1(X_term, X_hyper, labels):
        predictions = np.asarray(model.predict([X_term, X_hyper])).flatten()
        binary_predictions = (predictions >= 0.5).astype(float)
        return np.round(f1_score(labels, binary_predictions), 5)

    # metrics that are only computed when a negative strategy is given
    negative_test_accuracy = 0.
    negative_test_f1 = 0.

    # training pairs: gold positives only, so every label is 1
    term_train_seq = _to_ids(data.train_query)
    hyper_train_seq = _to_ids(data.train_hyper)
    # single-argument print() calls behave the same under Python 2 and Python 3
    print("Evaluating training set...")
    train_accuracy = _accuracy(term_train_seq, hyper_train_seq, np.ones(len(term_train_seq)))

    # test pairs: gold positives only
    term_test_seq = _to_ids(data.test_query)
    hyper_test_seq = _to_ids(data.test_hyper)
    print("Evaluating given test dataset size ( %d ) items." % len(term_test_seq))
    y_test_labels = np.ones(len(term_test_seq))

    basic_test_accuracy = _accuracy(term_test_seq, hyper_test_seq, y_test_labels)
    basic_test_f1 = _f1(term_test_seq, hyper_test_seq, y_test_labels)

    # generate n_negatives negative tuples for every pair of the test set
    if neg_strategy:
        print("Augmenting basic test samples with negatives...")
        negative_tuples = get_negative_tuples((data.test_query, data.test_hyper), 
                                               data, neg_strategy, n_negatives)

        term_test_seq, hyper_test_seq, y_test_labels =\
            extend_batch_with_negatives(term_test_seq, hyper_test_seq,
                                        negative_tuples,
                                        data.tokenizer)

        print("Evaluating extended test dataset size ( %d ) items." % term_test_seq.shape[0])
        negative_test_accuracy = _accuracy(term_test_seq, hyper_test_seq, y_test_labels)
        negative_test_f1 = _f1(term_test_seq, hyper_test_seq, y_test_labels)

    return train_accuracy, basic_test_accuracy, basic_test_f1, negative_test_accuracy, negative_test_f1


In [1038]:
train_accuracy, basic_test_accuracy, basic_test_f1, negative_test_accuracy, negative_test_f1 =\
    evaluate_crim(data, crim_model, get_negative_words)

# one pre-formatted string, so the line prints identically under Python 2 and Python 3
print("Training accuracy: %s ; Test accuracy: %s ; Test F1: %s ; Negative Test accuracy: %s ; Negative Test F1: %s"
      % (train_accuracy, basic_test_accuracy, basic_test_f1, negative_test_accuracy, negative_test_f1))



Evaluating training set...
Evaluating given test dataset size ( 1533 ) items.
Augmenting basic test samples with negatives...
Evaluating extended test dataset size ( 4599 ) items.
Training accuracy: 0.71784 ; Test accuracy: 0.61709 ; Test F1: 0.76321 ; Negative Test accuracy: 0.84475 ; Negative Test F1: 0.72602


In [None]:
# print the model's score for every gold (query, hypernym) pair of the test set
for word,hyper in zip(test_query, test_hyper):
    score = crim_model.predict( [[data.tokenizer.word_index[word]], [data.tokenizer.word_index[hyper]]])
    # one pre-formatted string, so the line prints identically under Python 2 and Python 3
    print("%s %s %s" % (word, hyper, score))

### Experiment results
('Epochs: ', 15, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 1)<br>
Training accuracy: 0.7416; Testing accuracy: 0.72380; Test f1: 0.733013

('Epochs: ', 15, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 2)<br>
Training accuracy: 0.81766; Testing accuracy: 0.74435; Test f1: 0.75947

('Epochs: ', 15, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 5)<br>
Training accuracy: 0.88428; Testing accuracy: 0.75709; Test f1: 0.7779

('Epochs: ', 10, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 5)<br>
-- lesser readings like with phi_k = 2

### Try with a smaller number of negative samples and projection matrices
('Epochs: ', 10, 'Batch size: ', 32, 'm: ', 5, 'pki_k: ', 1)<br>
Training accuracy: 0.79599; Testing accuracy: 0.75339; Test f1: 0.77307 <br>
-- fairly reasonable results.  

('Epochs: ', 10, 'Batch size: ', 32, 'm: ', 1, 'pki_k: ', 1)<br>
Training accuracy: 0.98317; Testing accuracy: 0.78216; Test f1: 0.84217 <br>
-- smallest model gives best results

('Epochs: ', 10, 'Batch size: ', 32, 'm: ', 1, 'pki_k: ', 5)<br>
Training accuracy: 0.98986; Testing accuracy: 0.77476; Test f1: 0.83681 <br>
-- increasing the complexity of the model, increases overfit but doesn't really improve the quality of
   the predictions
   
### Attempt to fine-tune embeddings
Increases overfit.  When qualitatively studying hypernyms returned by model, I find quite a lot of overconfident matches.  Ex: matching "cold" (From training data):<br>
[('animal', 1.0), ('vertebrate', 1.0), ('vehicle', 1.0), ('artifact', 1.0), ('tool', 1.0), ('chordate', 1.0), ('creature', 1.0), ('bird', 1.0), ('mammal', 1.0), ('artefact', 0.9999999)]

Training accuracy: 0.98778; Testing accuracy: 0.72421; Test f1: 0.79844 

### Removal of sigmoid layer regularisation; phi layer initialised on random_normal distribution (mean=0, sd=0.01)
('Epochs: ', 20, 'Batch size: ', 32, 'm: ', 5, 'pki_k: ', 1, 'train_embeddings: ',False)
Training accuracy: 0.92001; Testing accuracy: 0.85236; Test f1: 0.74861 

### Cast CRIM as Yamane but with no soft-clustering, learning single projection matrix
('Epochs: ', 20, 'Batch size: ', 32, 'm: ', 5, 'pki_k: ', 1, 'train_embeddings: ', False, 'Negative sampling: ', 'synonym', 'Phi Init: ', 'random_normal') (std 0.05)

Training accuracy: 0.66897 ; Test accuracy: 0.56621 ; Test F1: 0.72303 ; Negative Test accuracy: 0.84497 ; Negative Test F1: 0.70886<br>

* Increasing projections does improve the evaluation metrics

('Epochs: ', 20, 'Batch size: ', 32, 'm: ', 5, 'pki_k: ', 3, 'train_embeddings: ', False, 'Negative sampling: ', 'synonym', 'Phi Init: ', 'random_normal') (std 0.01)

Training accuracy: 0.92808 ; Test accuracy: 0.64188 ; Test F1: 0.78188 ; Negative Test accuracy: 0.85236 ; Negative Test F1: 0.74348

* Increasing projections and loosening constraint that keeps LR weights at 1, the projection matrices still develop similarly.  Classification evaluation results improve:

Training accuracy: 0.9811 ; Test accuracy: 0.70189 ; Test F1: 0.82484 ; Negative Test accuracy: 0.84518 ; Negative Test F1: 0.7514



### This bit generates hypernyms
Requires a list of candidate hypernyms

In [1039]:
# every word known to the tokenizer is a candidate hypernym, stored as a
# single-element list holding its token id
hyper_candidates = [[token_id] for token_id in data.tokenizer.word_index.values()]

def crim_get_top_hypernyms(query, hyper_candidates, model, data, top):
    """Rank candidate hypernyms for `query` by the score `model` gives each pair.

    Args:
        query: query (hyponym) word; must be a key of `data.tokenizer.word_index`.
        hyper_candidates: list of single-element lists, each holding one
            tokenizer index.
        model: object whose `predict([[query_index], candidate])` returns an
            array holding a single score.
        data: object exposing `tokenizer.word_index` and `tokenizer.index_word`.
        top: maximum number of hypernyms to return.

    Returns:
        List of `(word, score)` tuples ordered from highest to lowest score.
        The query word itself is never returned.
    """
    query_index = data.tokenizer.word_index[query]

    # Build real lists rather than relying on filter()/map(): on Python 3 those
    # return lazy iterators, which np.argsort and positional indexing cannot use.
    valid_candidates = [cand for cand in hyper_candidates if cand != [query_index]]

    # one predict call per candidate; each call yields a single score
    candidate_sim = [model.predict([[query_index], cand]).flatten()[0]
                     for cand in valid_candidates]

    # indices of the `top` highest scores, best first
    top_idx = np.argsort(candidate_sim)[::-1][:top]
    top_hyper = np.array(valid_candidates)[top_idx].flatten()
    return [(data.tokenizer.index_word[t], candidate_sim[top_idx[i]]) for i, t in enumerate(top_hyper)]


crim_get_top_hypernyms('cat', hyper_candidates, crim_model, data, 15)
#print "-" * 30
#print crim_get_top_hypernyms('apartment', hyper_candidates, crim_model, data, 10)
#print "-" * 30
#print crim_get_top_hypernyms('carrot', hyper_candidates, crim_model, data, 10)
#print "-" * 30
#print crim_get_top_hypernyms('dagger', hyper_candidates, crim_model, data, 10)



[('creature', 0.96224177),
 ('human', 0.9570506),
 ('beast', 0.9486063),
 ('vertebrate', 0.948362),
 ('animal', 0.94165653),
 ('chordate', 0.9388889),
 ('object', 0.9273808),
 ('food', 0.92394197),
 ('hobby', 0.92351276),
 ('game', 0.91993695),
 ('insect', 0.9094172),
 ('mammal', 0.90893006),
 ('arthropod', 0.9065798),
 ('shape', 0.89415187),
 ('place', 0.8843403)]

In [852]:
# Spot-check a single (hyponym, hypernym) pair: look up the tokenizer indices of
# 'cat' and 'plant' and display the score crim_model assigns to that pair.
i = data.tokenizer.word_index['cat']
j = data.tokenizer.word_index['plant']
crim_model.predict([[i], [j]])

array([[0.25018686]], dtype=float32)

In [None]:
# Hypernyms in the training set exhibit a sort of power-law distribution:
# "animal" features in ~6.35% of the samples.

# In this universe the model will start to think that everything is an animal.
hyper_freq = Counter(data.train_hyper)

# Most frequent first, ties broken by hypernym in descending order.  items() and an
# index-based key replace the Python 2-only iteritems() and tuple-unpacking lambda,
# so the cell runs on both Python 2 and Python 3.
for key, value in sorted(hyper_freq.items(), key=lambda kv: (kv[1], kv[0]), reverse=True):
    print("%s: %s" % (key, value))

# Implementation of Yamane et al. 

## Clusters learnt together with projections

In [15]:
## attempt custom constraint to keep weight fixed at 1.
from tensorflow.keras.constraints import Constraint
from tensorflow.keras import backend as K

class ForceToOne (Constraint):
    """Weight constraint that resets every entry of the constrained tensor to 1."""

    def __call__(self, w):
        # Return a ones tensor of the same shape and dtype as `w`.  Computing
        # w / w gives the same result for non-zero weights but yields NaN
        # (0 / 0) as soon as any weight is exactly zero.
        return K.ones_like(w)


In [16]:
from tensorflow.keras.layers import Input, Dense, Embedding, Dot, Flatten, Concatenate, concatenate, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.initializers import RandomNormal, Zeros, Ones
from tensorflow.keras import backend as K

import tensorflow as tf

def get_new_cluster_model(embedding_layer, phi_dim):
    """Build and compile the pair-scoring model used for one cluster.

    The model takes a hyponym index and a hypernym index, looks both up through
    `embedding_layer`, projects the hyponym embedding through a bias-free linear
    `Phi` layer, takes the dot product of that projection with the hypernym
    embedding and feeds the result into a one-unit sigmoid `Prediction` layer.

    Args:
        embedding_layer: callable applied to `[hypo_input, hyper_input]` that
            returns the pair `(hypo_embedding, hyper_embedding)`.
        phi_dim: number of output units of the `Phi` projection.

    Returns:
        A model compiled with the rmsprop optimizer, binary cross-entropy loss
        and the accuracy metric.
    """
    hypo_input = Input(shape=(1,), name='Hyponym')    
    hyper_input = Input(shape=(1,), name='Hypernym')
    
    hypo_embedding, hyper_embedding = embedding_layer([hypo_input, hyper_input])
    
    def random_identity(shape, dtype="float32", partition_info=None):    
        """Kernel initializer for `Phi`; `dtype` and `partition_info` are ignored."""
        # NOTE(review): the matrix is always (shape[-1], shape[-1]), whatever
        # shape[0] is, so this presumably assumes a square kernel — confirm.
        identity = K.eye(shape[-1], dtype='float32')
    
        rnorm = K.random_normal((shape[-1],shape[-1]), 
                             mean=0., stddev=0.01)

        # NOTE(review): this is an element-wise product, so the result looks like a
        # diagonal matrix of small random values rather than identity plus noise —
        # confirm that this is the intended initialisation.
        return identity * rnorm

    # NOTE(review): rand_0_01 is not referenced again inside this function.
    rand_0_01 = RandomNormal(mean=0., stddev=0.01)
    
    # linear projection of the hyponym embedding (no bias, no activation)
    phi = Dense(phi_dim, activation=None, use_bias=False, 
                kernel_initializer=random_identity,
                #kernel_regularizer=l2(0.001),                
                name='Phi')(hypo_embedding)
    
    # flatten phi and hyper_embedding tensors
    phi = Flatten()(phi)
    hyper_embedding = Flatten()(hyper_embedding)
    
    # unnormalised dot product between the projected hyponym and the hypernym
    phi_hyper = Dot(axes=-1, normalize=False, name='DotProduct')([phi, hyper_embedding])
    force_to_one = ForceToOne()
    
    # one-unit sigmoid output; the kernel starts at ones and is constrained by
    # ForceToOne, the bias starts at zero and carries an l2 penalty
    predictions = Dense(1, activation="sigmoid", 
                        bias_initializer=Zeros,
                        kernel_initializer=Ones,
                        kernel_constraint= force_to_one,                        
                        bias_regularizer=l2(0.001), 
                        name='Prediction')(phi_hyper)
    # instantiate model
    model = Model(inputs=[hypo_input, hyper_input], outputs=predictions)
    
    # compile using binary_crossentropy loss
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
#model = get_new_cluster_model(embedding_layer, 300)
#model.get_layer(name='Phi').get_weights()[0][1]

### We don't need a unique embedding layer for every sub-model.  

Instead, we can create a separate model for the embeddings and set the weights according to the pre-trained embeddings

In [17]:
def get_embeddings_model(dim, embedding_matrix):
    """Build a two-input model that maps word indices to frozen pre-trained vectors.

    Args:
        dim: dimensionality of the embedding vectors.
        embedding_matrix: array of pre-trained weights; its first dimension is
            used as the vocabulary size of the embedding layer.

    Returns:
        A model taking `[hyponym_index, hypernym_index]` and returning
        `[hyponym_embedding, hypernym_embedding]`, both produced by one shared,
        non-trainable embedding layer named 'WE'.
    """
    hypo_input, hyper_input = Input(shape=(1,)), Input(shape=(1,))

    # a single layer serves both inputs, so both look up the same table
    shared_embedding = Embedding(embedding_matrix.shape[0], dim, name='WE')
    embedded_pair = [shared_embedding(hypo_input), shared_embedding(hyper_input)]

    embedding_model = Model(inputs=[hypo_input, hyper_input], outputs=embedded_pair)

    # inject the pre-trained embeddings into this mini, reusable model and freeze them
    shared_embedding.set_weights([embedding_matrix])
    shared_embedding.trainable = False
    return embedding_model

In [18]:
class YamaneCluster:
    """One cluster: its scoring model plus running training statistics.

    Attributes:
        model: model returned by `get_new_cluster_model`.
        epoch_count: counter advanced by `increment_epoch`.
        loss: running sum of the values passed to `update_loss`.
        test_loss: running sum of the values passed to `update_test_loss`.
    """

    def __init__(self, embedding_layer, phi_dim=300):
        self.model = get_new_cluster_model(embedding_layer, phi_dim)
        # all counters and accumulators start from zero
        self.epoch_count, self.loss, self.test_loss = 0, 0., 0.

    def increment_epoch(self):
        """Advance the epoch counter by one."""
        self.epoch_count = self.epoch_count + 1

    def update_loss(self, new_loss):
        """Add `new_loss` to the accumulated training loss."""
        self.loss = self.loss + new_loss

    def update_test_loss(self, new_loss):
        """Add `new_loss` to the accumulated test loss."""
        self.test_loss = self.test_loss + new_loss
        

### Yamane et al. training algorithm

In [32]:
def yamane_train(
    epochs,      # number of epochs every cluster must reach before training stops
    m,           # number of negative samples
    data,        # class instance containing all the data required for training/testing
    embedding_layer,
    threshold    = 0.15,     # threshold; similarity below this score will trigger new cluster
    neg_strategy = get_negative_words,    # inject lambda responsible for determining negative sample choice
    max_clusters = 25,       # upper bound on the number of clusters that may be created
    test_every   = 5000,     # probe the test loss after this many training samples (0/None disables)
    seed         = 42):      # value passed to np.random.seed before the first shuffle
    """Train one model per cluster, creating clusters on the fly.

    Training pairs are visited one at a time in shuffled order.  Every existing
    cluster scores the pair; the pair is assigned to the best-scoring cluster,
    unless that best score is below `threshold` and fewer than `max_clusters`
    clusters exist, in which case a new cluster is created for it.  The chosen
    cluster is then updated on the pair extended with negative samples, provided
    that cluster has not yet reached `epochs` epochs.  Passes over the training
    set repeat until every cluster has reached `epochs` epochs.

    Args:
        epochs: epoch count every cluster must reach before training stops.
        m: number of negative samples, forwarded to `get_negative_tuples`.
        data: object providing `train_query`, `train_hyper`, `test_query`,
            `test_hyper`, `tokenizer` and `embeddings_dim`.
        embedding_layer: shared embedding model handed to every cluster.
        threshold: a best score below this value triggers a new cluster.
        neg_strategy: negative-sampling strategy forwarded to `get_negative_tuples`.
        max_clusters: maximum number of clusters (previously hard-coded to 25).
        test_every: the test-loss probe runs whenever the number of samples
            processed in the current pass is a multiple of this value
            (previously hard-coded to 5000).  With fewer training pairs than
            `test_every` the probe never runs and the reported test loss stays
            0.0.  NOTE(review): the probe was never reached in the run recorded
            in this notebook (4338 pairs), so verify it before lowering this.
        seed: seed for numpy's global random generator (previously hard-coded to 42).

    Returns:
        Tuple `(clusters, sample_clusters)`: the list of `YamaneCluster` objects
        and an int32 array giving, for every training pair, the index of the
        cluster it was last assigned to.
    """
    print("Generating negative tuples...")
    negative_tuples = get_negative_tuples((data.train_query + data.test_query, data.train_hyper + data.test_hyper),
                                           data, neg_strategy, m)
    print("Negative tuples...ok")

    # Index sequences for the training and the test (query, hypernym) pairs.  The
    # iterable is parenthesised because the bare form is a syntax error on Python 3.
    term_train_seq, hyper_train_seq, term_test_seq, hyper_test_seq = [
        np.array(data.tokenizer.texts_to_sequences(words), dtype='int32')
        for words in (data.train_query, data.train_hyper, data.test_query, data.test_hyper)]

    # cluster index of every training sample; all samples start in cluster 0
    sample_clusters = np.zeros(len(term_train_seq), dtype='int32')

    print ("m: ", m, "lambda: ", threshold, "max epoch per cluster: ", epochs)
    print("Sample clusters size:  %s" % len(sample_clusters))

    # one YamaneCluster per cluster; all of them share the embedding layer that
    # was loaded with the pre-trained weights
    clusters = [YamaneCluster(embedding_layer, phi_dim=data.embeddings_dim)]

    indices = np.arange(len(term_train_seq))
    test_indices = np.arange(len(term_test_seq))

    # seed random generator
    np.random.seed(seed)

    while np.min([c.epoch_count for c in clusters]) < epochs:
        # reset the running losses; the training loss of a finished cluster is kept
        for c in clusters:
            if c.epoch_count < epochs:
                c.loss = 0.
            c.test_loss = 0.

        # shuffle indices every epoch
        np.random.shuffle(indices)

        # train by stochastic gradient descent, one sample at a time
        for idx, i in enumerate(indices):
            if (idx + 1) % 500 == 0:
                print("Processed  %s samples..." % (idx + 1))

            # score the sample on every cluster; a list (not a lazy map) is needed
            # because the result is indexed below
            sim = [c.model.predict([term_train_seq[i], hyper_train_seq[i]]) for c in clusters]
            max_sim = np.argmax(sim)

            if ((sim[max_sim] < threshold) and (len(clusters) < max_clusters)):
                # no cluster is similar enough: open a new one for this sample
                clusters.append(YamaneCluster(embedding_layer, phi_dim=data.embeddings_dim))
                z_i = len(clusters) - 1
            else:
                z_i = max_sim
            sample_clusters[i] = z_i

            # if current cluster reached/exceeded epoch count, skip current sample
            # (i.e. don't update cluster)
            if clusters[z_i].epoch_count < epochs:
                # extend the sample with negative samples
                batch_X_term, batch_X_hyper, batch_y_label =\
                    extend_batch_with_negatives(term_train_seq[i],
                                                hyper_train_seq[i],
                                                negative_tuples,
                                                data.tokenizer
                                               )

                # update parameters of cluster; element 0 of the result is the loss
                clusters[z_i].update_loss(
                    clusters[z_i].model.train_on_batch([batch_X_term, batch_X_hyper], batch_y_label)[0]
                )

            # Test-loss probe: score 32 randomly chosen test pairs (plus their
            # negatives) on every cluster and credit each example's loss to the
            # cluster with the lowest loss.
            if test_every and (idx + 1) % test_every == 0:
                np.random.shuffle(test_indices)
                batch_query, batch_hyper = term_test_seq[test_indices[:32]], hyper_test_seq[test_indices[:32]]
                batch_query, batch_hyper, test_y_label =\
                    extend_batch_with_negatives(batch_query,
                                                batch_hyper,
                                                negative_tuples,
                                                data.tokenizer
                                               )
                for q, h, l in zip(batch_query, batch_hyper, test_y_label):
                    test_losses = [c.model.test_on_batch([q, h], [l])[0] for c in clusters]
                    best_cluster = np.argmin(test_losses)
                    clusters[best_cluster].update_test_loss(
                        test_losses[best_cluster]
                    )

        # increase epoch count for every cluster, including those created during this pass
        for cluster in clusters:
            cluster.epoch_count += 1

        print('Epoch:', max([c.epoch_count for c in clusters]), 'Cluster #:', len(clusters) ,
              'Loss:', np.mean([c.loss for c in clusters]),
              'Test Loss:', np.mean([c.test_loss for c in clusters]))
    return clusters, sample_clusters


In [33]:
import datetime

# initialise embedding layer which will be shared among all clusters
embedding_layer = get_embeddings_model(dim=data.embeddings_dim, embedding_matrix=data.embedding_matrix)
epochs = 13
m = 5

# print(...) with a single argument produces the same output on Python 2 and Python 3
print("Training started...")
clusters, sample_clusters =\
    yamane_train(epochs, m,
                 data,
                 embedding_layer,
                 threshold = 0.15,
                 neg_strategy = mix_sim_hyper_random
                 )

# timestamp marking the end of the training run
print(datetime.datetime.now())

Training started...
Generating negative tuples...
Negative tuples...ok
('m: ', 5, 'lambda: ', 0.15, 'max epoch per cluster: ', 13)
Sample clusters size:  4338
Processed  500 samples...
Processed  1000 samples...
Processed  1500 samples...
Processed  2000 samples...
Processed  2500 samples...
Processed  3000 samples...
Processed  3500 samples...
Processed  4000 samples...
('Epoch:', 1, 'Cluster #:', 5, 'Loss:', 424.06980473771694, 'Test Loss:', 0.0)
Processed  500 samples...
Processed  1000 samples...
Processed  1500 samples...
Processed  2000 samples...
Processed  2500 samples...
Processed  3000 samples...
Processed  3500 samples...
Processed  4000 samples...
('Epoch:', 2, 'Cluster #:', 8, 'Loss:', 209.78459176700562, 'Test Loss:', 0.0)
Processed  500 samples...
Processed  1000 samples...
Processed  1500 samples...
Processed  2000 samples...
Processed  2500 samples...
Processed  3000 samples...
Processed  3500 samples...
Processed  4000 samples...
('Epoch:', 3, 'Cluster #:', 9, 'Loss:'

In [34]:
[c.epoch_count for c in clusters]

[20, 20, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 16, 16, 15, 14, 13, 13]

Running experiment to try and get the best performance out of the Yamane model.
* By constraining the weight to 1., we stop the model from being over-zealous in predicting hypernymy;
* Doing that increased the size of the weights in the Phi layer and the first cluster (0) was especially impacted;
* To keep the weights of phi small, I introduced activation regularisation, which is intended to keep the value of the phi.X_hypo dot product small.  However, this resulted in a large loss that did not decrease even after several iterations.
* Decreasing the threshold controls the number of clusters.  At lambda=0.2, we end up with around 20 clusters; while with a lambda of 0.15 we get eight clusters by the end of 30 epochs.
* The number of negative samples also affects the number of clusters.  Higher -ve sample amounts return more clusters.

I need to be able to measure performance better:
* For starters, I need to include loss on unseen data in the training algorithm;
* Secondly, I need to implement proper evaluation score.  Since this model is actually a collection of models (ensemble technique you could say), evaluation scores must be all hand-coded.

---------------------------------------

### Print some stats
* Number of samples per cluster;
* Loss per cluster

In [35]:
# show distribution of samples over the trained clusters
print(Counter(sample_clusters))
print("-"*30)
print("Train and test loss per cluster")

# one line per cluster: index, epoch count, accumulated training loss
# (formatted through %s so the output is the same on Python 2 and Python 3)
for idx, c in enumerate(clusters):
    print("%s %s %s" % (idx, c.epoch_count, c.loss))

Counter({0: 996, 1: 747, 2: 303, 3: 254, 4: 251, 5: 250, 17: 190, 16: 158, 11: 137, 15: 136, 13: 133, 7: 131, 6: 128, 10: 126, 9: 115, 12: 112, 14: 97, 8: 74})
------------------------------
Train and test loss per cluster
0 20 23.98701424896717
1 20 23.89376202598214
2 20 9.941689834464341
3 20 17.530611298047006
4 20 19.239212260581553
5 19 20.823654890991747
6 19 17.05653603747487
7 19 16.639262665063143
8 18 14.652702376246452
9 17 19.35460589081049
10 17 20.094996355473995
11 17 22.820394083857536
12 16 20.502011947333813
13 16 23.752162247896194
14 15 16.37061956524849
15 14 23.684391610324383
16 13 27.60558620095253
17 13 37.411818757653236


In [36]:
# list terms in particular cluster (here: cluster 2)
# NOTE(review): this reads the bare global `train_query`, whereas yamane_train
# fills sample_clusters by position in data.train_query — confirm both are the
# same list, otherwise the indices do not line up.
[train_query[i] for i in np.argwhere(sample_clusters == 2).flatten()]


# find in which clusters a particular query term ended
#word_id = [idx for idx, term in enumerate(train_query) if term=='freedom']
#sample_clusters[word_id]

['produce',
 'cloak',
 'cloak',
 'cloak',
 'add',
 'glove',
 'glove',
 'glove',
 'glass',
 'beet',
 'beet',
 'plum',
 'plum',
 'turnip',
 'turnip',
 'catfish',
 'wool',
 'cold',
 'mackerel',
 'rose',
 'flower',
 'wrench',
 'wrench',
 'wrench',
 'shirt',
 'shirt',
 'shirt',
 'couch',
 'couch',
 'couch',
 'light',
 'phrase',
 'head',
 'whole',
 'whole',
 'whole',
 'salt',
 'trout',
 'battleship',
 'battleship',
 'candy',
 'grave',
 'copper',
 'paint',
 'paint',
 'shovel',
 'shovel',
 'shovel',
 'pistol',
 'pistol',
 'pistol',
 'butter',
 'freezer',
 'freezer',
 'freezer',
 'freezer',
 'stove',
 'stove',
 'stove',
 'stove',
 'book',
 'fish',
 'goat',
 'bread',
 'cherry',
 'cherry',
 'log',
 'dance',
 'parsley',
 'parsley',
 'guitar',
 'guitar',
 'guitar',
 'sieve',
 'sieve',
 'sieve',
 'bathtub',
 'root',
 'root',
 'dress',
 'dress',
 'dress',
 'saxophone',
 'saxophone',
 'saxophone',
 'chip',
 'lettuce',
 'lettuce',
 'castle',
 'castle',
 'castle',
 'grapefruit',
 'grapefruit',
 'diamond

In [179]:
# Candidate pool for hypernym ranking: one single-element list [index] for every
# entry of the tokenizer vocabulary (rebuilt here so this cell can run on its own).
hyper_candidates = [[data.tokenizer.word_index[hyper]] for hyper in data.tokenizer.word_index.keys()]

def yamane_get_top_hypernym(query, hyper_candidates, clusters, data, top):
    """Rank candidate hypernyms for `query` using the best score over all clusters.

    Args:
        query: query (hyponym) word; must be a key of `data.tokenizer.word_index`.
        hyper_candidates: list of single-element lists, each holding one
            tokenizer index.
        clusters: objects whose `model.predict([[query_index], candidate])`
            returns an array holding a single score.
        data: object exposing `tokenizer.word_index` and `tokenizer.index_word`.
        top: maximum number of hypernyms to return.

    Returns:
        List of `(word, score)` tuples ordered from highest to lowest score,
        where a candidate's score is the maximum over all clusters.
    """
    query_index = data.tokenizer.word_index[query]

    # Remove the query itself from the candidates.  Real lists are built here
    # because filter()/map() return lazy iterators on Python 3, which np.max,
    # np.argsort and positional indexing cannot work with.
    valid_candidates = [cand for cand in hyper_candidates if cand[0] != query_index]

    # a candidate's score is the highest score any cluster assigns to the pair
    hyper_probs = [
        np.max([c.model.predict([[query_index], cand]).flatten()[0] for c in clusters])
        for cand in valid_candidates]

    top_idx = np.argsort(hyper_probs)[::-1][:top]
    top_hyper = np.array(valid_candidates)[top_idx].flatten()

    return [(data.tokenizer.index_word[t], hyper_probs[top_idx[i]]) for i, t in enumerate(top_hyper)]

#print yamane_get_top_hypernym('budgerigar', hyper_candidates, term_tokenizer, 10)

In [903]:
# Show the 15 best-ranked hypernyms (words only) for a single query word.
# `lizard_hyper` is a leftover name: it holds the ranking for whatever `word` is.
word = 'boat'
lizard_hyper = yamane_get_top_hypernym(word, hyper_candidates, clusters, data, 15)
[h for h, p in lizard_hyper]


['artefact',
 'artifact',
 'craft',
 'watercraft',
 'vehicle',
 'habitation',
 'conveyance',
 'document',
 'commerce',
 'object',
 'sport',
 'normative',
 'vessel',
 'teff',
 'kenaf']

### Evaluation logic for Yamane model

### Training Set Evaluation Outcome

* m = 5; epochs = 30; lambda = 0.15; activation_regularisation = l2_0.001.; Training Accuracy = 0.9207
* m = 5; epochs = 15; lambda = 0.15; kernel_regularisation = l2_0.001.; Training Accuracy = 0.1708
* m = 5; epochs = 15; lambda = 0.15; kernel_regularisation = NA; Training Accuracy = 0.9506

Changed algorithm slightly so that each cluster gets trained on the same number of epochs.<br>
This improved training accuracy but had no discernible effect on test performance. Model was subject
to higher variance and overfit.

* m = 5; epochs = 15; lambda = 0.15; no regularisation in phi; training accuracy = 0.9787920700783771
* m = 3; epochs = 30; lambda = 0.15; no regularisation on phi; training accuracy = 0.983402489626556
* m = 4; epochs = 10; lambda = 0.2; no phi regularisation; training accuracy = 0.9723374827109267


In [42]:
def evaluate_yamane(clusters, query_seq, hyper_seq, labels):
    """Accuracy of the cluster ensemble on labelled (query, hypernym) pairs.

    Each pair is scored by every cluster; the ensemble score is the maximum over
    the clusters, rounded to 0 or 1 and compared with the pair's label.

    Args:
        clusters: objects whose `model.predict((x_term, x_hyper))` returns the
            score of one pair.
        query_seq: sequence of query inputs, one entry per pair.
        hyper_seq: sequence of hypernym inputs, aligned with `query_seq`.
        labels: expected labels (0. or 1.), aligned with `query_seq`.

    Returns:
        Fraction of pairs whose rounded ensemble score equals the label.

    Raises:
        ValueError: if there are no pairs to evaluate.
    """
    best_scores = []

    for idx, (x_term, x_hyper) in enumerate(zip(query_seq, hyper_seq)):
        if (idx+1) % 500 == 0:
            print("Done %s" % (idx+1))

        # a list (not a lazy map) so np.max reduces over the actual scores
        hyp_prob_clusters = [c.model.predict((x_term, x_hyper)) for c in clusters]
        best_scores.append(np.max(hyp_prob_clusters))

    if not best_scores:
        raise ValueError('cannot compute accuracy on an empty dataset')

    # every wrong prediction contributes |round(score) - label| = 1 to the error count
    errors = np.sum(np.abs(np.round(np.array(best_scores), 0) - labels))
    accuracy = 1. - (errors / len(query_seq))

    return accuracy

In [45]:
# measure accuracy on training set itself
input_list = [data.train_query, data.train_hyper]
query_seq, hyper_seq = map(lambda x: np.array(data.tokenizer.texts_to_sequences(x), dtype='int32'), input_list)
print("Train dataset size: %s" % query_seq.shape[0])

# every training pair is a positive example, hence a label of 1. for all of them
train_accuracy = evaluate_yamane(clusters, query_seq, hyper_seq, [1.]*query_seq.shape[0])
print(train_accuracy)

Train dataset size: 4338
Done 500
Done 1000
Done 1500
Done 2000
Done 2500
Done 3000
Done 3500
Done 4000
0.9612724757952974


In [47]:
# evaluate trained yamane model
# we need to do this from first principles since our model is really an ensemble of models
negative_tuples = get_negative_tuples((data.train_query + data.test_query,
                                       data.train_hyper + data.test_hyper),
                                       data, get_negative_words, 2)

# Read the test pairs from `data` (as the training-set cell and the negative tuples
# above do) rather than from the bare globals test_query / test_hyper, so the cell
# does not depend on leftover kernel state.
input_list = [data.test_query, data.test_hyper]

query_seq, hyper_seq = map(lambda x: np.array(data.tokenizer.texts_to_sequences(x), dtype='int32'), input_list)
print("Test dataset size: %s" % query_seq.shape[0])


# add the negative pairs and obtain the label of every (positive or negative) pair
query_seq, hyper_seq, y_labels =\
        extend_batch_with_negatives(query_seq, hyper_seq,
                                    negative_tuples,
                                    data.tokenizer)

print("Extended test dataset size: %s" % query_seq.shape[0])


accuracy = evaluate_yamane(clusters, query_seq, hyper_seq, y_labels)

print("Test set accuracy: %s" % accuracy)

Test dataset size: 1533
Extended test dataset size: 4599
Done 500
Done 1000
Done 1500
Done 2000
Done 2500
Done 3000
Done 3500
Done 4000
Done 4500
Test set accuracy: 0.8549684714068275


### Test Set Evaluation Outcome

* m = 5; epochs = 30; lambda = 0.15; activation_regularisation = l2_0.001.; Testing Accuracy = 0.5943
* m = 5; epochs = 15; lambda = 0.15; kernel_regularisation = l2_0.001.; Testing Accuracy = rubbish
* m = 5; epochs = 15; lambda = 0.15; kernel_regularisation = None; Testing Accuracy = 0.7764

After change of training algorithm to ensure that every cluster is updated for the same number of times... 
* m = 5; epochs = 15; lambda = 0.15; no phi regularisation; Testing accuracy = 0.77517
* m = 3; epochs = 30; lambda = 0.15; no phi regularisation; Testing accuracy = 0.78381
* m = 3; epochs = 20; lambda = 0.15; no phi regularisation; Testing accuracy = 0.78792

* m = 1; epochs = 20; lambda = 0.2; no phi regularisation; Testing accuracy = 0.696259
* m = 4; epochs = 10; lambda = 0.2; no phi regularisation; Testing accuracy = 0.776408


Introduced new negative strategy whereby one of the negative samples is always the most similar word
to the given hypernym that is not one of the word's hypernyms.

Extended dataset size: 4599.  Negatives are synonyms of the query words; clusters learnt = 20
* ('m: ', 5, 'lambda: ', 0.15, 'max epoch per cluster: ', 13)
    * Training accuracy = 0.96; Testing accuracy = 0.855

### Evaluate according to MRR

In [944]:
def convert_hypernyms_to_one_line(data):
    """Group the gold hypernyms of the test set by query word.

    Args:
        data: object providing the aligned lists `test_query` and `test_hyper`.

    Returns:
        Dict mapping every distinct test query to the list of its hypernyms, in
        the order in which the pairs appear.  Queries are inserted in sorted order.
    """
    # single pass over the pairs instead of rescanning all pairs for every query
    grouped = {}
    for q, h in zip(data.test_query, data.test_hyper):
        grouped.setdefault(q, []).append(h)

    one_line = {}
    for w in sorted(set(data.test_query)):
        # a query without any paired hypernym still gets an (empty) entry
        one_line[w] = grouped.get(w, [])
    return one_line

In [116]:
def mean_reciprocal_rank(r):
    """Reciprocal rank of the first relevant item of one ranked list.

    Positions are 1-based, so a hit in the first position scores 1.0.
    Relevance is binary: any nonzero entry counts as relevant.
    See http://en.wikipedia.org/wiki/Mean_reciprocal_rank

    Args:
        r: relevance scores (list or numpy array) in rank order, best-ranked
            item first.

    Returns:
        1 / rank of the first relevant item, or 0. when nothing is relevant.
    """
    hit_positions = np.asarray(r).nonzero()[0]
    if not hit_positions.size:
        return 0.
    first_hit = hit_positions[0]
    return 1. / (first_hit + 1)

def precision_at_k(r, k, n):
    """Precision @ k, normalised by the number of gold items.

    Relevance is binary: any nonzero entry counts as relevant.  The number of
    relevant items among the first k is divided by min(k, n) rather than by k,
    so the number of gold elements is taken into account.

    Args:
        r: relevance scores (list or numpy array) in rank order, best-ranked
            item first.
        k: cut-off rank; must be at least 1.
        n: number of gold items for the query.

    Returns:
        (number of relevant items in the top k) / min(k, n).

    Raises:
        ValueError: if `r` holds fewer than k scores.
    """
    assert k >= 1
    top_k_is_relevant = np.asarray(r)[:k] != 0
    if top_k_is_relevant.size != k:
        raise ValueError('Relevance score length < k')
    # mean * k is the count of relevant items within the top k
    relevant_in_top_k = np.mean(top_k_is_relevant) * k
    return relevant_in_top_k / min(k, n)


In [144]:
def predict_yamane_hypernyms(data, clusters):
    """Predict the 15 best-ranked hypernyms for every distinct test query.

    Args:
        data: object providing `tokenizer.word_index` and `test_query`.
        clusters: trained clusters handed to `yamane_get_top_hypernym`.

    Returns:
        Dict mapping each distinct test query to its list of predicted hypernym
        words, best-ranked first.
    """
    hyper_candidates = [[data.tokenizer.word_index[hyper]] for hyper in data.tokenizer.word_index.keys()]

    ordered_queries = sorted(list(set(data.test_query)))

    results = {}
    for idx, word in enumerate(ordered_queries):
        if (idx + 1) % 100 == 0:
            print("Done %s" % (idx + 1))
        predicted_hypers = yamane_get_top_hypernym(word, hyper_candidates, clusters, data, 15)
        # Keep the word of every (word, score) pair.  The earlier
        # list(predicted_hypers[0]) stored [word, score] of the top hit only.
        results[word] = [hyper for hyper, _ in predicted_hypers]

    return results

In [1100]:
def alt_get_hypernym(word, model, data, embeddings, top):
    """Rank the vocabulary by cosine similarity to the projected query embedding.

    The query's embedding is multiplied by the kernel of the model's 'Phi0'
    layer, and every row of `embeddings` is compared with that projection.

    Args:
        word: query word; must be a key of `data.tokenizer.word_index`.
        model: model exposing a layer named 'Phi0' whose first weight array is
            the projection matrix.
        data: object exposing `tokenizer.word_index` and `tokenizer.index_word`.
        embeddings: 2-D array of embeddings, indexed by tokenizer index.
        top: maximum number of results to return.

    Returns:
        List of `(word, similarity)` tuples, most similar first.
    """
    q_idx = data.tokenizer.word_index[word]

    q = embeddings[q_idx]

    # only the first projection matrix is used
    _phi = model.get_layer(name='Phi0').get_weights()[0]

    _proj = np.dot(q, _phi)

    sim = cosine_similarity(embeddings, _proj.reshape(1,-1)).flatten()

    # NOTE(review): every row of `embeddings` is ranked, including the query's own
    # row and any row whose index has no entry in index_word (which would raise
    # KeyError if it reached the top) — confirm this is acceptable.
    # A list is returned (not a lazy map) so callers can index and display it on
    # Python 3 as well as Python 2.
    return [(data.tokenizer.index_word[i], sim[i]) for i in np.argsort(sim)[::-1][:top]]
    

In [1101]:
def predict_crim_hypernyms(data, model):
    """Predict the 15 best-ranked hypernyms for every distinct test query.

    Args:
        data: object providing `test_query` plus whatever `alt_get_hypernym`
            reads from it.
        model: model exposing a layer named "TermEmbedding" whose first weight
            array holds the word embeddings.

    Returns:
        Dict mapping each distinct test query to its list of predicted hypernym
        words, best-ranked first.
    """
    ordered_queries = sorted(list(set(data.test_query)))
    results = {}

    # Read the embeddings from the `model` argument; the previous version read
    # them from the global `crim_model`, silently ignoring the argument.
    embeddings = model.get_layer(name="TermEmbedding").get_weights()[0]

    for idx, word in enumerate(ordered_queries):
        if (idx + 1) % 25 == 0:
            print("Done %s" % (idx + 1))
        predicted_hypers = alt_get_hypernym(word, model, data, embeddings, 15)
        results[word] = [h for h, p in predicted_hypers]

    return results



In [1103]:
# get current sample's hypernyms
#predictions = predict_hypernyms(data, clusters)

alt_get_hypernym('cat',crim_model, data, crim_model.get_layer(name="TermEmbedding").get_weights()[0], 15)

[('creature', 0.50561774),
 ('human', 0.4879193),
 ('beast', 0.46308428),
 ('vertebrate', 0.46242478),
 ('animal', 0.4453914),
 ('chordate', 0.43889248),
 ('object', 0.41451705),
 ('food', 0.4079268),
 ('hobby', 0.4071234),
 ('game', 0.40058747),
 ('insect', 0.38279364),
 ('mammal', 0.38201594),
 ('arthropod', 0.3783149),
 ('shape', 0.36002675),
 ('place', 0.3468836)]

In [104]:
import pickle
import os

# Directory (relative to the working directory) for the pickled prediction
# dictionaries; it is created if it does not exist yet.
dest = os.path.join('.', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)

#pickle.dump(predictions, open(os.path.join(dest, 'yamane_epoch13_cluster18_prediction.pkl'), 'wb'), protocol=2)

In [184]:
#yamane_predictions = pickle.load(open(os.path.join(dest, 'yamane_epoch13_cluster18_prediction.pkl'), 'rb'))

In [942]:
# predictions is a dictionary whereby key is query term and value is a list of ranked hypernym predictions
def get_evaluation_scores(data, predictions):
    """Compute MRR, P@1, P@5 and P@10 for every test query.

    Args:
        data: object handed to `convert_hypernyms_to_one_line` to obtain the
            gold hypernyms of every test query.
        predictions: dict mapping each query to its ranked list of predicted
            hypernym words, best-ranked first.

    Returns:
        Tuple `(scores_names, all_scores)`: the names of the four scores and a
        list holding one `[MRR, P@1, P@5, P@10]` list per query.
    """
    scores_names = ['MRR', 'P@1', 'P@5', 'P@10']
    all_scores = []

    # items() works on Python 2 and Python 3; iteritems() exists on Python 2 only
    for query, gold_hyps in convert_hypernyms_to_one_line(data).items():
        pred_hyps = predictions[query]
        gold_hyps_n = len(gold_hyps)

        # Binary relevance per rank.  At least 15 slots as before, but never
        # fewer than the number of predictions, so long prediction lists cannot
        # index past the end.
        r = [0] * max(15, len(pred_hyps))

        # only the first gold_hyps_n predictions are allowed to count as hits
        for j, pred_hyp in enumerate(pred_hyps[:gold_hyps_n]):
            if pred_hyp in gold_hyps:
                r[j] = 1

        all_scores.append([
            mean_reciprocal_rank(r),
            precision_at_k(r, 1, gold_hyps_n),
            precision_at_k(r, 5, gold_hyps_n),
            precision_at_k(r, 10, gold_hyps_n),
        ])
    return scores_names, all_scores



#### Yamane models

Yamane evaluation:<br>
MRR: 0.456185185185<br>
P@1: 0.4311111111111111<br>
P@5: 0.32437037037037036<br>
P@10: 0.29980687830687813<br>
<hr>

#### CRIM models
* Simplest model evaluated: epochs: 10; m=1; phi_k = 1

CRIM evaluation<br>
MRR: 0.408481481481<br>
P@1: 0.3711111111111111<br>
P@5: 0.24518518518518506<br>
P@10: 0.2410123456790123<br>

* Model evaluated: epochs: 10; m = 10; phi_k = 10

CRIM evaluation:<br>
MRR: 0.337<br>
P@1: 0.2777777777777778<br>
P@5: 0.21485185185185185<br>
P@10: 0.20493121693121696<br>

* ('Epochs: ', 7, 'Batch size: ', 32, 'm: ', 5, 'pki_k: ', 5, 'train_embeddings: ', False)
    * Removed regularisation from final layer
    
CRIM evaluation:<br>
MRR: 0.33891005291<br>
P@1: 0.2777777777777778<br>
P@5: 0.22551851851851862<br>
P@10: 0.2137513227513228<br>

* ('Epochs: ', 20, 'Batch size: ', 32, 'm: ', 5, 'pki_k: ', 1, 'train_embeddings: ', False)
    * Negative sampling based on hyponym synonyms/random words; no regularisation, random normal init, synonym random negative sampling<br>
    
CRIM evaluation:<br>
MRR: 0.402243386243<br>
P@1: 0.3466666666666667<br>
P@5: 0.28040740740740716<br>
P@10: 0.26355996472663124   <br>

* ('Epochs: ', 15, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 1, 'train_embeddings: ', False, 'Negative sampling: ', 'synonym', 'Phi Init: ', 'random_normal')
* More negative samples have an adverse impact on the results<br>

CRIM evaluation:<br>
MRR: 0.370518518519<br>
P@1: 0.31333333333333335<br>
P@5: 0.25762962962962943<br>
P@10: 0.24290740740740735<br>

* ('Epochs: ', 15, 'Batch size: ', 32, 'm: ', 1, 'pki_k: ', 1, 'train_embeddings: ', False, 'Negative sampling: ', 'similar_hyponym', 'Phi Init: ', 'random_normal')

CRIM evaluation:<br>
MRR: 0.335593554594<br>
P@1: 0.3022222222222222<br>
P@5: 0.17833333333333348<br>
P@10: 0.16412345679012352<br>

* Modified CRIM to be equal to Yamane but with no soft-clustering.  Learned just one projection matrix;
* Evaluation results based on accuracy/F1 were quite poor but MRR, P@k scores were highest I managed using the CRIM model;

CRIM evaluation:<br>
MRR: 0.439148148148<br>
P@1: 0.3888888888888889<br>
P@5: 0.3091851851851849<br>
P@10: 0.29187566137566123<br>

* Keeping the LR kernel weight fixed and increasing the number of matrices yields projection matrices that are quite similar to each other.  To evaluate, I added all projection matrices together (3 in this case).  I got similar results but not as good.  Adding more projection matrices helped accuracy/F1 when recasting the problem as classification but did not do much with respect to MRR and P@k

CRIM evaluation:<br>
MRR: 0.41137037037<br>
P@1: 0.35555555555555557<br>
P@5: 0.2919629629629628<br>
P@10: 0.27328395061728383<br>

* Again kept LR kernel weight fixed to 1 but changed negative sampling strategy to a mix of negative hypernym and synonym. Improved likelihood of getting high-ranked positive hypernyms but quality of subsequent hypernyms degraded as can be attested by the p@5 and p@10 scores.

('Epochs: ', 15, 'Batch size: ', 32, 'm: ', 5, 'pki_k: ', 1, 'train_embeddings: ', False, 'Negative sampling: ', 'mix_hyper_synonym', 'Phi Init: ', 'random_normal')

CRIM evaluation:<br>
MRR: 0.452407407407<br>
P@1: 0.4111111111111111<br>
P@5: 0.2835185185185183<br>
P@10: 0.2627098765432097<br>

* Increasing m to 10, actually reduced the scores.

('Epochs: ', 15, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 1, 'train_embeddings: ', False, 'Negative sampling: ', 'mix_hyper_synonym', 'Phi Init: ', 'random_normal')

CRIM evaluation:<br>
MRR: 0.426833333333<br>
P@1: 0.3888888888888889<br>
P@5: 0.2716296296296294<br>
P@10: 0.24958289241622564<br>

* Decreasing batch size and increasing epochs more or less returns the same result as when we had a 32 sample batch size and lower epoch count

('Epochs: ', 30, 'Batch size: ', 12, 'm: ', 5, 'pki_k: ', 1, 'train_embeddings: ', False, 'Negative sampling: ', 'mix_hyper_synonym', 'Phi Init: ', 'random_normal')

CRIM evaluation:<br>
MRR: 0.455185185185<br>
P@1: 0.41555555555555557<br>
P@5: 0.3007037037037034<br>
P@10: 0.27982186948853605<br>

* Increasing batch size also degrades performance

('Epochs: ', 20, 'Batch size: ', 52, 'm: ', 5, 'pki_k: ', 1, 'train_embeddings: ', False, 'Negative sampling: ', 'mix_hyper_synonym', 'Phi Init: ', 'random_normal')

CRIM evaluation:<br>
MRR: 0.437962962963<br>
P@1: 0.39111111111111113<br>
P@5: 0.2701851851851849<br>
P@10: 0.25051234567901226<br>

* Tuning embeddings (constrained to unit norm) has an adverse effect on results

('Epochs: ', 8, 'Batch size: ', 32, 'm: ', 5, 'pki_k: ', 1, 'train_embeddings: ', True, 'Negative sampling: ', 'synonym', 'Phi Init: ', 'random_normal')

CRIM evaluation:<br>
MRR: 0.341307760141<br>
P@1: 0.28888888888888886<br>
P@5: 0.22011111111111115<br>
P@10: 0.21259435626102294<br>


In [1085]:
# Mean of every score over all test queries, first for the Yamane predictions and
# then for the CRIM predictions.  The score names are read from the same variable
# they are assigned to; the earlier version assigned `score_names` but read
# `scores_names`, which only worked when that name was left over in the kernel.
print("Yamane evaluation:")
score_names, all_scores = get_evaluation_scores(data, yamane_predictions)
for k in range(len(score_names)):
    print(score_names[k]+': '+str(sum([score_list[k] for score_list in all_scores]) / len(all_scores)))

crim_predictions = predict_crim_hypernyms(data, crim_model)

print("-"*30)
print("CRIM evaluation:")
score_names, all_scores = get_evaluation_scores(data, crim_predictions)
for k in range(len(score_names)):
    print(score_names[k]+': '+str(sum([score_list[k] for score_list in all_scores]) / len(all_scores)))
    


Yamane evaluation:
MRR: 0.456185185185
P@1: 0.4311111111111111
P@5: 0.32437037037037036
P@10: 0.29980687830687813
Done 25
Done 50
Done 75
Done 100
Done 125
Done 150
Done 175
Done 200
Done 225
Done 250
Done 275
Done 300
Done 325
Done 350
Done 375
Done 400
Done 425
Done 450
------------------------------
CRIM evaluation:
MRR: 0.417220458554
P@1: 0.37555555555555553
P@5: 0.2918148148148147
P@10: 0.2761075837742504


In [1099]:
# Display the Yamane predictions stored for the query term 'china'.
yamane_predictions['china']
#crim_predictions['cat']

['undemocratic',
 'place',
 'bowtie',
 'shaped',
 'invalid',
 'naivety',
 'roadrunner',
 'change',
 'depress',
 'enrage',
 'everything',
 'villa',
 'move',
 'smooth',
 'rendezvous']

### Analyse weights learned

In [46]:
# Dump the weights of the 'Prediction' layer of every cluster model,
# one line per cluster, prefixed with the cluster's position in `clusters`.
for idx, c in enumerate(clusters):
    _prediction_weights = c.model.get_layer(name='Prediction').get_weights()
    print("%s %s" % (idx, _prediction_weights))



0 [array([[1.]], dtype=float32), array([-2.7328842], dtype=float32)]
1 [array([[1.]], dtype=float32), array([-2.8369036], dtype=float32)]
2 [array([[1.]], dtype=float32), array([-2.006221], dtype=float32)]
3 [array([[1.]], dtype=float32), array([-2.5338047], dtype=float32)]
4 [array([[1.]], dtype=float32), array([-2.1483057], dtype=float32)]
5 [array([[1.]], dtype=float32), array([-2.0767462], dtype=float32)]
6 [array([[1.]], dtype=float32), array([-1.7475002], dtype=float32)]
7 [array([[1.]], dtype=float32), array([-1.6737198], dtype=float32)]
8 [array([[1.]], dtype=float32), array([-1.3753002], dtype=float32)]
9 [array([[1.]], dtype=float32), array([-1.7079883], dtype=float32)]
10 [array([[1.]], dtype=float32), array([-1.6500005], dtype=float32)]
11 [array([[1.]], dtype=float32), array([-1.7033082], dtype=float32)]
12 [array([[1.]], dtype=float32), array([-1.6039661], dtype=float32)]
13 [array([[1.]], dtype=float32), array([-1.6968616], dtype=float32)]
14 [array([[1.]], dtype=float32

In [None]:
# Mean over all entries of the first weight array of the 'Phi' layer
# belonging to the model at clusters[3].
np.mean(clusters[3].model.get_layer(name='Phi').get_weights()[0])

# Scratch Pad

In [None]:
# Convert test_query with term_tokenizer and display the first 100 sequences.
# NOTE(review): term_tokenizer is presumably a Keras Tokenizer - confirm.
term_tokenizer.texts_to_sequences(test_query)[:100]


In [None]:
# Scratch cell for inspecting one training batch.
# The bare expressions below are not the last statement of the cell, so the
# notebook does not display them; they are kept because they still evaluate
# (and therefore still fail loudly if a name or key is missing).
batch_X_hyper
negatives

a = [(1,2), (1,3), (1,4)]
zip(*a)

#for idx, term in enumerate(neg_terms):
    #print term, neg_hyper[idx]

term_tokenizer.texts_to_sequences(neg_terms)
term_tokenizer.index_word[507], term_tokenizer.index_word[130]

# Print "<term> <hypernym> <label>" for every position of the batch.
# The flattened hypernym ids are computed once up front instead of
# re-flattening batch_X_hyper on every iteration.
_flat_hyper_ids = batch_X_hyper.flatten()
for idx, t in enumerate(batch_X_term.flatten()):
    print("%s %s %s" % (term_tokenizer.index_word[t],
                        hyper_tokenizer.index_word[_flat_hyper_ids[idx]],
                        batch_y_label[idx]))

In [None]:
# Scratch: experiments with stacking column vectors.
# The tile() result is not the cell's last expression, so it is not displayed.
np.tile(np.zeros(1), (2, 1))

# Three stacked copies of [word_idx] as a (3, 1) column.
rpt = np.repeat([[word_idx]], repeats=3, axis=0)

# Displayed result: the fixed column [1, 2, 3] with `rpt` appended below it.
_fixed_column = [[1], [2], [3]]
np.vstack((_fixed_column, rpt))




In [None]:
# Scratch: groundwork for drawing negative samples for a single word.
m = 5  # number of negative samples

# Collect the second element of every `train` pair whose first element is 'jacket'.
word_hypernyms = []
for x, y in train:
    if x == 'jacket':
        word_hypernyms.append(y)
print(word_hypernyms)
#possible = [nv for nv in neg_vocab if nv not in word_hypernyms]
#np.random.choice(possible, 5).tolist()
#synonyms['piranha']

In [None]:
# Scratch check: display what get_negative_words (defined elsewhere in the
# notebook) returns for the word 'sausage' given train_query and train_hyper.
get_negative_words('sausage', train_query, train_hyper)

In [None]:
# Sanity check of np.dot on two 1-D vectors: [0, 1, 2] . [0, 1, 2] = 5.
np.dot(np.arange(3),np.arange(3))

In [None]:
# Scratch: convert two plain lists to int32 arrays and show the resulting shape.
a = np.array([1, 2], dtype='int32')
b = np.array([3, 4], dtype='int32')

# Single-argument call form: prints "(2,)" on Python 2 and on Python 3 alike.
print(b.shape)


In [1098]:
import math
from sklearn.metrics.pairwise import cosine_similarity

# Vocabulary indices of the query term and of the candidate hypernym.
q_idx = data.tokenizer.word_index['china']
h_idx = data.tokenizer.word_index['plant']

print("%s %s" % (q_idx, h_idx))

#print embeddings_index['swordfish'][:20]
# Embedding rows are read from the weights of the model's 'TermEmbedding' layer.
tuned_embeddings = crim_model.get_layer(name='TermEmbedding').get_weights()[0]
q = tuned_embeddings[q_idx]
h = tuned_embeddings[h_idx]
#q, h = embedding_layer.predict([[q_idx],[h_idx]])
#q, h = [x.flatten() for x in [q, h]]

# First weight array of the 'Phi0' layer, used below as the projection matrix.
_phi = crim_model.get_layer(name='Phi0').get_weights()[0]

print(_phi.shape)
# Projection of the query embedding through Phi0.
_proj = np.dot(q, _phi)

print("Similarity between given terms: %s" % (cosine_similarity(h.reshape(1, -1), _proj.reshape(1, -1)),))

# Cosine similarity of the projection against every embedding row;
# the cell displays the 15 highest-scoring (word, similarity) pairs.
sim = cosine_similarity(tuned_embeddings, _proj.reshape(1, -1)).flatten()
#sim = cosine_similarity(data.embedding_matrix, _proj.reshape(1,-1)).flatten()
list((data.tokenizer.index_word[i], sim[i]) for i in np.argsort(sim)[::-1][:15])



2014 2
(300, 300)
Similarity between given terms: [[-0.20150737]]


[('change', 0.49852777),
 ('make', 0.33062166),
 ('work', 0.3202718),
 ('action', 0.28008065),
 ('place', 0.26855475),
 ('hornet', 0.26388532),
 ('cockroach', 0.25421357),
 ('pheasant', 0.24992587),
 ('falcon', 0.24652898),
 ('salmon', 0.23833041),
 ('cathedral', 0.23767158),
 ('cod', 0.23764622),
 ('herring', 0.23626004),
 ('key', 0.23345503),
 ('carp', 0.23164552)]

In [1034]:
import math
from sklearn.metrics.pairwise import cosine_similarity

# Vocabulary indices of the query term and of the candidate hypernym.
q_idx = data.tokenizer.word_index['lime']
h_idx = data.tokenizer.word_index['invertebrate']

print("%s %s" % (q_idx, h_idx))

#print embeddings_index['swordfish'][:20]

# Read the embeddings from the model in this cell, the same way the projection
# matrix below is read.  Previously `tuned_embeddings` was whatever another
# cell had last left in the kernel, so it could come from a different model
# state than `_phi`.
tuned_embeddings = crim_model.get_layer(name='TermEmbedding').get_weights()[0]
q, h = tuned_embeddings[q_idx], tuned_embeddings[h_idx]
q, h = [x.flatten() for x in [q, h]]

# First weight array of the 'Phi0' layer, used below as the projection matrix.
_phi = crim_model.get_layer(name='Phi0').get_weights()[0]

print(_phi.shape)
# Projection of the query embedding through Phi0.
_proj = np.dot(q, _phi)

print("Similarity between given terms: %s" % (cosine_similarity(h.reshape(1, -1), _proj.reshape(1, -1)),))

# Cosine similarity of the projection against every embedding row;
# the cell displays the 15 highest-scoring (word, similarity) pairs.
sim = cosine_similarity(tuned_embeddings, _proj.reshape(1, -1)).flatten()
list((data.tokenizer.index_word[i], sim[i]) for i in np.argsort(sim)[::-1][:15])

158 12
(300, 300)
Similarity between given terms: [[0.35952055]]


[('artifact', 0.7524527),
 ('beast', 0.7518158),
 ('artefact', 0.73376083),
 ('predator', 0.7144881),
 ('creature', 0.6901133),
 ('shape', 0.66437286),
 ('raptor', 0.6472047),
 ('object', 0.6456276),
 ('veggie', 0.63704467),
 ('game', 0.63308823),
 ('canine', 0.61953783),
 ('insect', 0.616869),
 ('herbivore', 0.6158552),
 ('firearm', 0.60992),
 ('transport', 0.60256714)]

In [804]:
# Look up the vocabulary indices of the pair ('hart', 'mammal'), echo them,
# and display the model's prediction for that single pair.
q_idx, h_idx = [data.tokenizer.word_index[_term] for _term in ('hart', 'mammal')]
#embeddings_index['swordfish']
print("%s %s" % (q_idx, h_idx))

_pair_input = [[q_idx], [h_idx]]
crim_model.predict(_pair_input)

536 5


array([[0.36265057]], dtype=float32)

In [None]:
# Create list of tuples where every element follows (word, negative_word)
def get_negative_tuples(terms, synonyms, negative_vocab, negative_words_lambda, sample_size):
    """Build (word, negative_word) pairs for every distinct word in terms[0].

    Parameters
    ----------
    terms : sequence
        terms[0] is the collection of query words; terms[1] is forwarded to
        ``negative_words_lambda`` as the hypernym collection.
        NOTE(review): the previous body read ``query_terms`` / ``hyper_terms``
        from notebook globals instead of from this argument; confirm that
        callers pass the hypernyms as terms[1].
    synonyms
        Not used by this function; kept so existing calls keep working.
    negative_vocab
        Forwarded unchanged to ``negative_words_lambda``.
    negative_words_lambda : callable
        Called as ``f(word, query_terms, hyper_terms, negative_vocab,
        sample_size)`` and expected to return an iterable of negative words.
    sample_size
        Forwarded unchanged to ``negative_words_lambda``.

    Returns
    -------
    list of (word, negative_word) tuples.  Words are visited in set order, so
    the order of the groups in the result is not guaranteed.
    """
    # Derive both collections from the argument rather than from globals.
    query_terms, hyper_terms = terms[0], terms[1]

    negative_tuples = []
    for word in set(query_terms):
        negatives = negative_words_lambda(word, query_terms, hyper_terms, negative_vocab, sample_size)
        negative_tuples.extend((word, n) for n in negatives)
    return negative_tuples


In [485]:
# Worked example of an affine map: x . W^T + b with a (3, 4) weight matrix.
W = np.array([[0.2, -0.5, 0.1, 2.],
              [1.5, 1.3, 2.1, 0.],
              [0, 0.25, 0.2, -0.3]])
# Formatting into one string keeps the "<array> <shape>" output identical on
# Python 2 (where print(a, b) would print a tuple) and valid on Python 3.
print("%s %s" % (W, W.shape))

x = np.array([56, 231, 24, 2])
print("%s %s" % (x, x.shape))

# Displayed result: one value per row of W, plus the bias vector.
np.dot(x, W.T) + np.array([1.1, 3.2, -1.2])

[[ 0.2  -0.5   0.1   2.  ]
 [ 1.5   1.3   2.1   0.  ]
 [ 0.    0.25  0.2  -0.3 ]] (3, 4)
[ 56 231  24   2] (4,)


array([-96.8 , 437.9 ,  60.75])

In [1029]:
# Rank every other vocabulary entry by the dot product of its tuned embedding
# with the tuned embedding of 'guitar', and display the 20 highest scores.
word = data.tokenizer.word_index['guitar']  # vocabulary index, not the string
candidate_words = list(_w for _w in data.tokenizer.index_word.keys() if _w != word)
tuned_embeddings = crim_model.get_layer(name='TermEmbedding').get_weights()[0]

_query_vec = tuned_embeddings[word]
sims = list(np.dot(tuned_embeddings[_cand], _query_vec) for _cand in candidate_words)

# Positions into candidate_words / sims, best score first.
most_sim_idx = np.argsort(sims)[::-1][:20]
#print most_sim_idx

[(data.tokenizer.index_word[candidate_words[idx]], sims[idx]) for idx in most_sim_idx]

[('piano', 0.9856831),
 ('cello', 0.98091674),
 ('flute', 0.97312176),
 ('saxophone', 0.9671867),
 ('sword', 0.9248958),
 ('bus', 0.91819954),
 ('bookcase', 0.91581094),
 ('wardrobe', 0.913466),
 ('spear', 0.9124167),
 ('scooter', 0.9069559),
 ('hatchet', 0.906389),
 ('stereo', 0.9054192),
 ('bottle', 0.8994448),
 ('onion', 0.89725924),
 ('pineapple', 0.89628285),
 ('cherry', 0.8947779),
 ('phone', 0.89401215),
 ('musket', 0.89235973),
 ('bass', 0.89191216),
 ('strawberry', 0.88773155)]

In [1031]:
# Same ranking as for the tuned embeddings, but scored with the rows of
# data.embedding_matrix: dot product against the 'guitar' row, top 20 shown.
word = data.tokenizer.word_index['guitar']  # vocabulary index, not the string
candidate_words = list(_w for _w in data.tokenizer.index_word.keys() if _w != word)
sims = list(
    np.dot(data.embedding_matrix[_cand], data.embedding_matrix[word])
    for _cand in candidate_words
)

# Positions into candidate_words / sims, best score first.
most_sim_idx = np.argsort(sims)[::-1][:20]
#print most_sim_idx

[(data.tokenizer.index_word[candidate_words[idx]], sims[idx]) for idx in most_sim_idx]


[('bass', 0.7609487),
 ('piano', 0.71672153),
 ('saxophone', 0.6717821),
 ('harmonica', 0.66169775),
 ('mandolin', 0.6379906),
 ('banjo', 0.6303143),
 ('violin', 0.6158097),
 ('keyboard', 0.59280485),
 ('trumpet', 0.5794264),
 ('cello', 0.5722961),
 ('band', 0.5655428),
 ('trombone', 0.5609751),
 ('musician', 0.5495929),
 ('flute', 0.53228563),
 ('clarinet', 0.53192836),
 ('music', 0.5316814),
 ('ukulele', 0.52557594),
 ('album', 0.51117814),
 ('drum', 0.5047761),
 ('song', 0.5043962)]

In [1105]:
# Show the CPU details of the machine running this kernel.
# Shell escape that reads /proc/cpuinfo, so it only works on Linux hosts.
!cat /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 78
model name	: Intel(R) Core(TM) i7-6500U CPU @ 2.50GHz
stepping	: 3
cpu MHz		: 2591.996
cache size	: 4096 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 2
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 22
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid pni pclmulqdq ssse3 cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx rdrand hypervisor lahf_lm abm 3dnowprefetch pti avx2 rdseed clflushopt
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf
bogomips	: 5183.99
clflush size	: 64
cache_alignment	: 64
address sizes	: 39 bits physical, 48 bits virtual
power management:

processor	: 1
vendor_id	: GenuineIntel
cpu family	: 6
model		: 78
model name	: Intel(R) Core(TM) i7-6500U CPU @ 2.50GHz
steppin