# Load pre-trained UMBC vectors

In [1]:
import numpy as np
from gensim.models.keyedvectors import KeyedVectors

In [2]:
# Path to the pre-trained UMBC vectors, stored in plain-text word2vec format
w2v = 'embeddings/1A_en_UMBC_tokenized.vectors.txt'
model = KeyedVectors.load_word2vec_format(w2v, binary=False)
# pre-compute L2 norms of vectors
# NOTE(review): replace=True presumably overwrites the raw vectors with their
# unit-length versions -- confirm against the gensim version in use
model.init_sims(replace=True)

In [17]:
# number of terms in the loaded embedding vocabulary
len(model.index2word)

# embeddings size is 219523 terms

219523

In [62]:
# Spot-check the 20 nearest neighbours of a single vocabulary entry.
# NOTE(review): u'igeoe' does not look like a real word and every similarity in
# the output is low (< 0.27) -- presumably a probe of a noisy/rare token.
model.most_similar(u'igeoe', topn=20)

[(u'tailgating', 0.26962268352508545),
 (u'bleacher', 0.25520360469818115),
 (u'basketball_tournament', 0.2459111511707306),
 (u'board_meeting', 0.2328670769929886),
 (u'gridiron', 0.2320120930671692),
 (u'football_game', 0.22912628948688507),
 (u'grotto', 0.22740451991558075),
 (u'tailgater', 0.2264767587184906),
 (u'tailgate_party', 0.22612455487251282),
 (u'photo_opportunity', 0.22608546912670135),
 (u'pep_rally', 0.22473984956741333),
 (u'commencement_day', 0.2242063581943512),
 (u'hibachi', 0.21942704916000366),
 (u'bleachers', 0.21716344356536865),
 (u'stupe', 0.2161717265844345),
 (u'football_season', 0.21609929203987122),
 (u'pregame', 0.2155630886554718),
 (u'basketball_game', 0.21550601720809937),
 (u'pre-game', 0.21545638144016266),
 (u'predawn', 0.21508227288722992)]

In [3]:
# Sanity check: the model vocabulary contains no title-cased entry.
# str.istitle() only flags title case (e.g. "Paris"); all-caps or mixed-case
# tokens are not detected by this check.
assert not any(term.istitle() for term in model.vocab.keys())


## Load training, test data

In [3]:
#import codecs
import io
import os
import unicodecsv as csv
from collections import defaultdict

def read_subsumptions(filenames):
    """Read (hyponym, hypernym) pairs from a data file and its gold file.

    Args:
        filenames: pair ``(hypo, hyper)`` of paths. Both are tab-separated
            UTF-8 files; the first column of each row in ``hypo`` is the query
            term, and every column of the matching row in ``hyper`` is one of
            its gold hypernyms.

    Returns:
        List of ``(query, hypernym)`` tuples, one per gold hypernym. Both terms
        are lower-cased and have spaces replaced by underscores.

    Raises:
        AssertionError: if the two files do not have the same number of rows.
    """
    hypo, hyper = filenames

    # unicodecsv decodes the bytes itself, so both files are opened in binary
    # mode (text mode / io.open would hand it already-decoded strings).
    with open(hypo, mode='rb') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE, encoding='utf-8')
        data_list = [row[0] for row in reader]

    with open(hyper, mode='rb') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE, encoding='utf-8')
        gold_list = [row for row in reader]

    # make sure we have the same number of elements in each list
    assert len(data_list) == len(gold_list)

    subsumptions = []
    for data_item, gold_terms in zip(data_list, gold_list):
        # normalise the query once per row rather than once per gold term
        data_item = data_item.replace(" ", "_").lower()
        for gold_item in gold_terms:
            subsumptions.append((data_item, gold_item.replace(" ", "_").lower()))

    return subsumptions

def read_vocab(filename):
    """Read the vocabulary file and return its normalised terms.

    Args:
        filename: path to a tab-separated UTF-8 file; only the first column of
            each row is used.

    Returns:
        List of terms in file order, lower-cased and with spaces replaced by
        underscores. Blank rows are skipped.
    """
    vocab = []
    # unicodecsv expects a byte stream, hence binary mode
    with open(filename, mode='rb') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE, encoding='utf-8')
        for row in reader:
            if not row:
                # a blank line yields an empty row; row[0] would raise IndexError
                continue
            vocab.append(row[0].replace(" ", "_").lower())

    return vocab


In [4]:
# Paths of the shared-task files for each split, in the order: trial, test, training
data_file_names = list(map(lambda x: './data/shared_task/1A.english.%s.data.txt'%(x), ['trial', 'test', 'training']))
gold_file_names = list(map(lambda x: './data/shared_task/1A.english.%s.gold.txt'%(x), ['trial', 'test', 'training']))
vocab_file_name = './data/shared_task/1A.english.vocabulary.txt'

# materialise the pairs so they can be indexed (zip is a lazy iterator on Python 3)
file_names = list(zip(data_file_names, gold_file_names))
# 0 = trial; 1 = test; 2 = train
# the trial split is used as the validation set
valid_subs = read_subsumptions(file_names[0])
test_subs = read_subsumptions(file_names[1])
train_subs = read_subsumptions(file_names[2])
vocabulary = read_vocab(vocab_file_name)


In [7]:
# ensure that the model vocabulary has at least as many entries as the given
# vocabulary (a size comparison only -- it does not verify that every given
# term actually has a vector)
assert len(model.index2word) >= len(vocabulary)

In [5]:
# eliminate training tuples for which no embedding exists
from collections import Counter

def get_terms_having_vectors(dataset, embeddings=None):
    """Keep only the (query, hypernym) pairs for which both terms have a vector.

    Args:
        dataset: iterable of ``(query, hypernym)`` pairs.
        embeddings: any container supporting ``term in embeddings``. Defaults
            to the notebook-level ``model``.

    Returns:
        Two aligned lists ``(queries, hypernyms)``; both are empty when no pair
        survives the filter.
    """
    if embeddings is None:
        embeddings = model

    kept = [(q, h) for q, h in dataset
            if q in embeddings and h in embeddings]

    if not kept:
        # zip(*[]) yields nothing, so unpacking it into two names would raise
        return [], []

    query, hyper = zip(*kept)
    return list(query), list(hyper)


# Drop every pair (and vocabulary entry) that has no embedding.
train_query, train_hyper = get_terms_having_vectors(train_subs)
test_query, test_hyper = get_terms_having_vectors(test_subs)
valid_query, valid_hyper = get_terms_having_vectors(valid_subs)
vocab = list(term for term in vocabulary if term in model)

# queries and hypernyms must remain aligned pair-by-pair in every split
assert (len(train_query), len(test_query), len(valid_query)) == \
       (len(train_hyper), len(test_hyper), len(valid_hyper))

In [6]:
from collections import Counter

# person is the most popular hypernym in training set (310 times), followed by city (63)
hypernym_distrib = Counter(train_hyper)
# sort by (count, term) descending so that ties are listed in reverse alphabetical order
for v, k in sorted(((value, key) for (key, value) in hypernym_distrib.items()), reverse=True):
    # printing an explicit tuple gives the same "(term, count)" output on Python 2 and 3
    print((k, v))

# There are 4,233 unique hypernyms in all
# (function-call form: the bare `print x` statement is a SyntaxError on Python 3)
print(len(hypernym_distrib))

(u'person', 310)
(u'city', 63)
(u'work_of_art', 44)
(u'leader', 41)
(u'picture', 40)
(u'writer', 39)
(u'natural_phenomenon', 37)
(u'animal', 35)
(u'movie', 34)
(u'locale', 34)
(u'film', 34)
(u'technical_specification', 33)
(u'moving-picture_show', 33)
(u'state', 32)
(u'moving_picture', 32)
(u'motion_picture', 31)
(u'constructed_structure', 31)
(u'computer_software', 31)
(u'software', 30)
(u'software_package', 29)
(u'show', 29)
(u'plant', 29)
(u'computer_programme', 29)
(u'computer_program', 29)
(u'sport', 28)
(u'software_program', 28)
(u'picture_show', 28)
(u'phenomenon', 28)
(u'country', 27)
(u'computer_code', 27)
(u'company', 27)
(u'chief', 27)
(u'physical_phenomenon', 26)
(u'channel', 26)
(u'transportation', 25)
(u'town', 25)
(u'movement', 25)
(u'communication_medium', 25)
(u'transmission_channel', 24)
(u'medium', 24)
(u'transport', 23)
(u'social_event', 23)
(u'public_building', 23)
(u'move', 23)
(u'function', 23)
(u'politician', 22)
(u'musical_work', 22)
(u'mechanism', 22)
(u'boss'

(u'register', 1)
(u'refuse', 1)
(u'reformist', 1)
(u'reformer', 1)
(u'reflex_response', 1)
(u'reflection', 1)
(u'refined_sugar', 1)
(u'reference_work', 1)
(u'reference_book', 1)
(u'referee', 1)
(u'reduction', 1)
(u'redox', 1)
(u'red', 1)
(u'rectification', 1)
(u'recreational_vehicle', 1)
(u'record_producer', 1)
(u'record_company', 1)
(u'reconstruction', 1)
(u'reckoning', 1)
(u'reciprocating_engine', 1)
(u'recipe', 1)
(u'recess', 1)
(u'receiver', 1)
(u'realty', 1)
(u'raw_material', 1)
(u'rational_motive', 1)
(u'rapidity', 1)
(u'rap_music', 1)
(u'railway_track', 1)
(u'railway_car', 1)
(u'railway', 1)
(u'rails', 1)
(u'railroad_train', 1)
(u'railroad_car', 1)
(u'railroad', 1)
(u'railcar', 1)
(u'rail_transportation', 1)
(u'rail_transport', 1)
(u'rail_track', 1)
(u'rail_line', 1)
(u'rail', 1)
(u'radioactivity', 1)
(u'radio_station', 1)
(u'radio_show', 1)
(u'radio_programme', 1)
(u'radio_program', 1)
(u'radio_frequency', 1)
(u'radiation', 1)
(u'racing_boat', 1)
(u'raceway', 1)
(u'racetrack', 

(u'buzz_word', 1)
(u'businesswoman', 1)
(u'business_model', 1)
(u'business_activity', 1)
(u'bus_service', 1)
(u'bus_company', 1)
(u'burning', 1)
(u'bunk', 1)
(u'bulge', 1)
(u'building_block', 1)
(u'brown', 1)
(u'broadcasting_station', 1)
(u'broadcast_station', 1)
(u'broadcast_programming', 1)
(u'bridge', 1)
(u'breeding', 1)
(u'breathing_out', 1)
(u'breathing', 1)
(u'breathe_out', 1)
(u'breath', 1)
(u'breaking', 1)
(u'breakfast_food', 1)
(u'breakfast_cereal', 1)
(u'breakdown', 1)
(u'breakdancing', 1)
(u'breakdance', 1)
(u'break_dance', 1)
(u'breadth', 1)
(u'breadbasket', 1)
(u'bravery', 1)
(u'braveness', 1)
(u'bramble_bush', 1)
(u'brace', 1)
(u'boy_band', 1)
(u'boxer', 1)
(u'bovid', 1)
(u'bounds', 1)
(u'bound', 1)
(u'bottom', 1)
(u'borrowing', 1)
(u'borderline', 1)
(u'border_checkpoint', 1)
(u'booster', 1)
(u'bookstore', 1)
(u'book_series', 1)
(u'book_of_account', 1)
(u'book_design', 1)
(u'bondage', 1)
(u'bollywood', 1)
(u'body_waste', 1)
(u'body_substance', 1)
(u'body_fluid', 1)
(u'bod

## Construct synonyms for training, testing and validation terms

In [3]:
from collections import defaultdict

def get_synyonyms(hyponyms, hypernyms, n=15, embeddings=None):
    """Collect, for every query term, its nearest neighbours that are not gold hypernyms.

    Args:
        hyponyms: list of query terms (may contain repeats), aligned with ``hypernyms``.
        hypernyms: list of gold hypernyms, one per entry of ``hyponyms``.
        n: maximum number of neighbours kept per term.
        embeddings: object exposing ``most_similar(term, topn=...)`` that returns
            ``(word, score)`` pairs. Defaults to the notebook-level ``model``.

    Returns:
        Dict mapping each distinct query term to a list of at most ``n`` words,
        taken in order from its 20 most similar words after removing that term's
        own gold hypernyms.
    """
    if embeddings is None:
        embeddings = model

    # group the gold hypernyms per query in one pass instead of rescanning
    # every pair for every term
    gold = {}
    for q, h in zip(hyponyms, hypernyms):
        gold.setdefault(q, set()).add(h)

    synonyms = {}
    for term in set(hyponyms):
        term_hypernyms = gold.get(term, ())
        # unpack the words directly; zip(...)[0] only works on Python 2
        neighbours = [word for word, _ in embeddings.most_similar(term, topn=20)]
        synonyms[term] = [word for word in neighbours if word not in term_hypernyms][:n]

    return synonyms
    
#get_synyonyms(train_query + test_query + valid_query, train_hyper + test_hyper + valid_hyper)    
#get_synyonyms(valid_query, valid_hyper)    

def get_random(hyponyms, hypernyms, vocab, n = 15):
    """Draw, for every query term, random vocabulary words that are not its gold hypernyms.

    Args:
        hyponyms: list of query terms (may contain repeats), aligned with ``hypernyms``.
        hypernyms: list of gold hypernyms, one per entry of ``hyponyms``.
        vocab: sequence of candidate words to sample from.
        n: maximum number of words kept per term.

    Returns:
        Dict mapping each distinct query term to a list of at most ``n`` words,
        sampled without replacement from ``vocab`` via ``np.random.choice``.
    """
    # group the gold hypernyms per query in one pass instead of rescanning
    # every pair for every term
    gold = {}
    for q, h in zip(hyponyms, hypernyms):
        gold.setdefault(q, set()).add(h)

    # sampling 20 without replacement raises ValueError on a smaller vocabulary
    sample_size = min(20, len(vocab))

    random_words = {}
    for term in set(hyponyms):
        term_hypernyms = gold.get(term, ())
        some_words = np.random.choice(vocab, sample_size, replace=False)
        random_words[term] = [w for w in some_words if w not in term_hypernyms][:n]

    return random_words

#get_random(valid_query, valid_hyper, vocab)    

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Data class that encapsulates all the word-based data needed to train the various algorithms.
# We assume that any words missing from the embeddings have already been filtered out.
class Data:
    """Bundle of the datasets, tokenizer and embedding matrix used for training.

    Attributes set by ``__init__``:
        train_query / train_hyper, test_query / test_hyper,
        valid_query / valid_hyper: the aligned term lists passed in.
        vocab: the vocabulary list passed in.
        embeddings_dim: length of the vector stored for the word 'animal'.
        tokenizer: Keras ``Tokenizer`` fitted on all query terms plus ``vocab``.
        embedding_matrix: float32 array with one row per tokenizer index
            (row 0 is left as zeros; no word in ``word_index`` is written there).
        random_words: output of ``get_random`` over all three splits.
        synonyms: output of ``get_synyonyms`` over all three splits.
    """
    def __init__(self,
                 train_query, train_hyper,
                 test_query, test_hyper,
                 valid_query, valid_hyper,
                 vocab, embeddings):

        # encapsulate input variables so that all the data can be passed via class instance reference
        self.train_query = train_query
        self.train_hyper = train_hyper
        self.test_query = test_query
        self.test_hyper = test_hyper
        self.valid_query = valid_query
        self.valid_hyper = valid_hyper
        self.vocab = vocab

        # determine dimensionality of embeddings from a word assumed to be present
        self.embeddings_dim = embeddings['animal'].shape[0]

        print ("Tokenising words...")
        # initialise and fit tokenizer; filters='' keeps punctuation such as '_' and '-'
        self.tokenizer = Tokenizer(num_words = 300000, filters='')
        self.tokenizer.fit_on_texts(train_query + test_query + valid_query + vocab)

        print ("Creating embedding matrix...")
        word_index = self.tokenizer.word_index
        matrix_shape = (len(word_index) + 1, self.embeddings_dim)
        self.embedding_matrix = np.zeros(matrix_shape, dtype='float32')

        for word, i in word_index.items():
            # Test membership rather than comparing the lookup result to None,
            # which can never skip anything if a missing key raises. Words
            # without a vector keep an all-zero row.
            if word in embeddings:
                # vectors are stored as-is; the original code notes they are
                # already normalised
                self.embedding_matrix[i,:] = embeddings[word]
        # confirm shape
        assert self.embedding_matrix.shape == matrix_shape

        print ("Creating random words/synonyms...")
        all_queries = train_query + test_query + valid_query
        all_hypers = train_hyper + test_hyper + valid_hyper
        self.random_words = get_random(all_queries, all_hypers, vocab)
        self.synonyms = get_synyonyms(all_queries, all_hypers)

In [5]:
# The Data instance is expensive to build, so it was created once, pickled, and
# is reloaded here. Uncomment the two commented lines to regenerate it.
#data = Data(train_query, train_hyper, test_query, test_hyper, valid_query, valid_hyper, vocab, model)
import pickle
import os

dest = os.path.join('.', 'pickle')
#pickle.dump(data, open(os.path.join(dest, 'semeval_data.pkl'), 'wb'), protocol=2)
# NOTE: unpickling can execute arbitrary code -- only load a file you created yourself.
# The context manager closes the file handle, which the bare open() call never did.
with open(os.path.join(dest, 'semeval_data.pkl'), 'rb') as pkl_file:
    data = pickle.load(pkl_file)


In [7]:
# peek at the random words and the nearest-neighbour "synonyms" stored for two
# example query terms
print (data.random_words['starcraft'])
print (data.synonyms['rod_laver'])


[u'starkers', u'zefir', u'unmirrored', u'absorb', u'acushla', u'high-level_design', u'hyposensitivity', u'rumours', u'3801', u'destroyer_escort', u'demultiplex', u'dogma', u'ozonosphere', u'euthanize', u'polyhedral']
[u'grand_slam', u'davis_cup', u'semifinal', u'unbeaten', u'world_championship', u'olympic_record', u'championship', u'post-match', u'cup_final', u'quarterfinal', u'bhupathi', u'clinched', u'junior_welterweight', u'unseeded', u'olympic_champion']


In [6]:
def extend_batch_with_negatives(batch_X_term, batch_X_hyper, negative_tuples,
                                tokenizer, m):
    """Append negative samples to a batch of positive (term, hypernym) pairs.

    Args:
        batch_X_term: integer array of tokenized query terms, one row per
            positive sample (it is flattened to recover the words).
        batch_X_hyper: integer array of tokenized hypernyms aligned with
            ``batch_X_term``.
        negative_tuples: dict mapping a query word to a list of negative words.
        tokenizer: object exposing ``index_word`` (id -> word) and
            ``texts_to_sequences``.
        m: maximum number of negatives taken per positive sample.

    Returns:
        Tuple ``(batch_X_term, batch_X_hyper, batch_y_label)`` where the
        negatives are stacked below the positives and the labels are 1 for the
        positive rows followed by 0 for the negative rows.
    """
    # recover the words of the positive query terms
    positive_words = [tokenizer.index_word[term_id] for term_id in batch_X_term.flatten()]

    # pair every positive query with up to m of its negative words
    neg_terms, neg_hyper = [], []
    for word in positive_words:
        for negative in negative_tuples[word][:m]:
            neg_terms.append(word)
            neg_hyper.append(negative)

    if not neg_terms:
        # Nothing to append (m == 0 or no negatives available). Stacking an
        # empty array below the positives would raise a shape error.
        return batch_X_term, batch_X_hyper, np.ones(batch_X_term.shape[0])

    # tokenize -ve samples
    neg_terms_seq = tokenizer.texts_to_sequences(neg_terms)
    neg_hyper_seq = tokenizer.texts_to_sequences(neg_hyper)

    # the first rows are true pairs (1s), and the rest are the -ve samples (0s)
    batch_y_label = np.concatenate((
            np.ones(batch_X_term.shape[0]),
            np.zeros(len(neg_terms_seq))
    ))

    # finally, stack -ve sequences at the bottom of +ves to create the final
    # training batch (at most batch_size * (1 + m) rows)
    batch_X_term = np.vstack((batch_X_term, np.array(neg_terms_seq)))
    batch_X_hyper = np.vstack((batch_X_hyper, np.array(neg_hyper_seq)))

    return batch_X_term, batch_X_hyper, batch_y_label

# Evaluation

In [7]:
def convert_hypernyms_to_one_line(dataset):
    """Group the gold hypernyms of a dataset by query term.

    Args:
        dataset: pair ``(queries, hypernyms)`` of aligned sequences.

    Returns:
        Dict mapping each distinct query to the list of its hypernyms, in the
        order they appear in the dataset. Keys are inserted in sorted order.
    """
    # single pass over the pairs instead of rescanning them for every query
    grouped = {}
    for q, h in zip(*dataset):
        grouped.setdefault(q, []).append(h)

    one_line = {}
    for w in sorted(set(dataset[0])):
        one_line[w] = grouped.get(w, [])
    return one_line

In [8]:
# taken from task_scorer.py provided with shared task resources
def mean_reciprocal_rank(r):
    """Reciprocal rank of the first relevant item in a single ranked list.

    Ranks are 1-based and relevance is binary (any nonzero entry is relevant).

    Args:
        r: relevance scores (list or numpy array) in rank order.

    Returns:
        ``1 / rank`` of the first nonzero entry, or 0. when no entry is relevant.
    """
    relevant_positions = np.nonzero(np.asarray(r))[0]
    if relevant_positions.size == 0:
        return 0.
    first_hit = relevant_positions[0]
    return 1. / (first_hit + 1)

def precision_at_k(r, k, n):
    """Precision @ k, normalised by the number of gold items.

    Relevance is binary (nonzero is relevant). The count of relevant items in
    the first ``k`` ranks is divided by ``min(k, n)`` rather than by ``k``, so
    that the gold elements are taken into account.

    Args:
        r: relevance scores (list or numpy array) in rank order.
        k: cut-off rank, must be >= 1.
        n: number of gold items for the query.

    Returns:
        Precision @ k.

    Raises:
        ValueError: if ``r`` holds fewer than ``k`` entries.
    """
    assert k >= 1
    top_k = np.asarray(r)[:k] != 0
    if top_k.size != k:
        raise ValueError('Relevance score length < k')
    denominator = min(k, n)
    return (np.mean(top_k) * k) / denominator

def average_precision(r,n):
    """Mean of precision @ k over every cut-off k = 1 .. len(r).

    Relevance is binary (nonzero is relevant). Every rank contributes to the
    mean, whether or not the item at that rank is relevant.

    Args:
        r: relevance scores (list or numpy array) in rank order.
        n: number of gold items for the query.

    Returns:
        The averaged precision, or 0. for an empty ``r``.
    """
    relevance = np.asarray(r) != 0
    precisions = []
    for cutoff in range(1, relevance.size + 1):
        precisions.append(precision_at_k(relevance, cutoff, n))
    if len(precisions) == 0:
        return 0.
    return np.mean(precisions)

def mean_average_precision(r,n):
    """Average precision of a single ranked list.

    Thin wrapper that delegates to ``average_precision``.

    Args:
        r: relevance scores (list or numpy array) in rank order.
        n: number of gold items for the query.

    Returns:
        The value returned by ``average_precision(r, n)``.
    """
    score = average_precision(r, n)
    return score



In [9]:
def get_evaluation_scores(dataset, predictions):
    """Score ranked hypernym predictions against the gold hypernyms of a dataset.

    Args:
        dataset: pair ``(queries, hypernyms)`` of aligned sequences.
        predictions: dict mapping each query term to a list of ranked
            hypernym predictions.

    Returns:
        Tuple ``(scores_names, all_scores)``: the metric names and, for every
        distinct query, a list ``[MRR, MAP, P@1, P@5, P@10]``.

    Raises:
        KeyError: if a query of the dataset has no entry in ``predictions``.
    """
    scores_names = ['MRR', 'MAP', 'P@1', 'P@5', 'P@10']
    all_scores = []

    for query, gold_hyps in convert_hypernyms_to_one_line(dataset).items():
        pred_hyps = predictions[query]
        gold_hyps_n = len(gold_hyps)

        # binary relevance of the 15 ranked positions
        r = [0] * 15

        # Only the first gold_hyps_n ranks can score a hit. The extra len(r)
        # bound prevents an IndexError when both the predictions and the gold
        # list hold more than 15 entries.
        for j in range(min(len(pred_hyps), gold_hyps_n, len(r))):
            if pred_hyps[j] in gold_hyps:
                r[j] = 1

        all_scores.append([
            mean_reciprocal_rank(r),
            mean_average_precision(r, gold_hyps_n),
            precision_at_k(r, 1, gold_hyps_n),
            precision_at_k(r, 5, gold_hyps_n),
            precision_at_k(r, 10, gold_hyps_n),
        ])

    return scores_names, all_scores


## Helper functions to run evaluation routine

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

def crim_get_hypernym(word, tokenizer, phi, cluster_weight, bias, embeddings, top):
    """Rank candidate hypernyms for ``word`` with plain numpy operations.

    The query vector is multiplied by the Phi matrices, every embedding row
    except row 0 is scored against the resulting projections, and the scores
    are combined with ``cluster_weight`` and ``bias``.

    Args:
        word: query term; must be a key of ``tokenizer.word_index``.
        tokenizer: object exposing ``word_index`` and ``sequences_to_texts``.
        phi: array of projection matrices.
        cluster_weight: weights combining the per-projection scores.
        bias: bias added to the combined scores.
        embeddings: embedding matrix indexed by tokenizer id.
        top: number of candidates to return.

    Returns:
        ``zip`` of (word, score) pairs for the ``top`` highest scores, best first.
    """
    query_vector = embeddings[tokenizer.word_index[word]]

    # projections are used unnormalised
    projections = np.dot(query_vector, phi)

    # score every candidate row (row 0 is skipped) against each projection
    candidate_scores = np.dot(embeddings[1:], projections.T).T
    sim_matrix = np.dot(cluster_weight.T, candidate_scores) + bias

    descending = np.argsort(sim_matrix[0])[::-1]
    top_ranked_sequence = descending[:top]

    # positions are relative to embeddings[1:], so add 1 to recover tokenizer ids
    token_ids = top_ranked_sequence.reshape(-1,1) + 1
    top_scores = sim_matrix.flatten()[top_ranked_sequence]

    return zip(tokenizer.sequences_to_texts(token_ids), top_scores)

In [11]:
def cluster_get_hypernym(word, tokenizer, phi, cluster_weight, bias, embeddings, top):
    """Rank candidate hypernyms for ``word`` by the best score over all clusters.

    Each cluster contributes one projection of the query vector. Every
    embedding row except row 0 is scored against each projection, scaled by
    that cluster's weight and shifted by its bias, and the maximum over
    clusters is kept per candidate.

    Args:
        word: query term; must be a key of ``tokenizer.word_index``.
        tokenizer: object exposing ``word_index`` and ``sequences_to_texts``.
        phi: array of per-cluster projection matrices.
        cluster_weight: per-cluster weight, broadcast against the score rows.
        bias: per-cluster bias, broadcast against the score rows.
        embeddings: embedding matrix indexed by tokenizer id.
        top: number of candidates to return.

    Returns:
        ``zip`` of (word, score) pairs for the ``top`` highest scores, best first.
    """
    query_vector = embeddings[tokenizer.word_index[word]]

    projections = np.dot(query_vector, phi)
    raw_scores = np.dot(embeddings[1:], projections.T)
    linear_combination = (raw_scores.T * cluster_weight) + bias

    # keep, for every candidate, the highest score achieved by any cluster
    best_projection = np.max(linear_combination, axis=0)
    descending = np.argsort(best_projection)[::-1]
    top_words = descending[:top]

    # positions are relative to embeddings[1:], so add 1 to recover tokenizer ids
    token_ids = top_words.reshape(-1,1) + 1

    return zip(tokenizer.sequences_to_texts(token_ids),
               best_projection[top_words])

    

In [12]:
def predict_crim_hypernyms(queries, tokenizer, model, algol):
    """Generate the top 15 hypernym predictions for every distinct query term.

    Args:
        queries: iterable of query terms (repeats are ignored).
        tokenizer: tokenizer handed through to ``algol``.
        model: trained model; the kernels of its ``Dense`` layers whose name
            starts with 'Phi', and the weights of its 'Prediction' and
            'TermEmbedding' layers, are read from it.
        algol: callable ``(word, tokenizer, phi, cluster_weight, bias,
            embeddings, top)`` returning (hypernym, score) pairs.

    Returns:
        Dict mapping each query term to the list of its predicted hypernyms.
    """
    # extract the Phi matrices out of the trained model
    phi_kernels = [l.get_weights()[0] for l in model.layers
                   if type(l) == Dense and l.name.startswith('Phi')]
    dense = np.asarray(phi_kernels)

    # extract affine transform layer weights (kernel, then bias)
    prediction_weights = model.get_layer(name='Prediction').get_weights()
    cluster_weight = prediction_weights[0]
    bias = prediction_weights[1]
    embeddings = model.get_layer(name="TermEmbedding").get_weights()[0]

    results = {}
    for position, word in enumerate(sorted(set(queries)), start=1):
        # progress report every 100 queries
        if position % 100 == 0:
            print ("Done", position)

        ranked = algol(word, tokenizer, dense, cluster_weight, bias, embeddings, 15)
        results[word] = [h for h, p in ranked]

    return results


In [14]:
def predict_cluster_hypernyms(queries, tokenizer, cluster_list, knn_model=None):
    """Generate the top 15 hypernym predictions per query from a list of cluster models.

    Args:
        queries: iterable of query terms (repeats are ignored).
        tokenizer: object exposing ``word_index`` and ``sequences_to_texts``.
        cluster_list: non-empty list of objects with a ``.model`` attribute.
            Each model must expose layers named 'Phi0' and 'Prediction'; the
            first one must also contain a nested ``Model`` layer holding an
            embedding layer named 'WE'.
        knn_model: optional classifier exposing ``predict_proba``. When given,
            only the clusters it assigns a non-zero probability to are used
            for a query; otherwise every cluster is used.

    Returns:
        Dict mapping each query term to the list of its predicted hypernyms.
    """
    ordered_queries = sorted(set(queries))
    results = {}

    # embeddings are present in a "shared" model that is used as the first layer of each cluster
    shared_model = [l for l in cluster_list[0].model.layers if type(l) == Model][0]
    embeddings = shared_model.get_layer(name='WE').get_weights()[0]

    # gather the Phi matrix and the prediction-layer kernel/bias of every cluster
    dense = np.zeros((len(cluster_list), embeddings.shape[1], embeddings.shape[1]))
    lr_weights = np.zeros((len(cluster_list), 1))
    lr_bias = np.zeros((len(cluster_list), 1))

    for idx, cluster in enumerate(cluster_list):
        prediction_weights = cluster.model.get_layer(name='Prediction').get_weights()
        dense[idx] = cluster.model.get_layer(name='Phi0').get_weights()[0]
        lr_weights[idx] = prediction_weights[0]
        lr_bias[idx] = prediction_weights[1]

    for idx, word in enumerate(ordered_queries):
        if (idx + 1) % 100 == 0:
            print ("Done", idx + 1)

        # Compare against None explicitly: truth-testing an arbitrary estimator
        # object may consult its __len__/__bool__ instead of checking presence.
        if knn_model is not None:
            query_vector = embeddings[tokenizer.word_index[word]].reshape(1,-1)
            cluster_probs = knn_model.predict_proba(query_vector)
            # keep only the clusters with a non-zero probability for this query
            cluster_idx = np.where(cluster_probs > 0.)[1]
            predicted_hypers = cluster_get_hypernym(word, tokenizer,
                                                    dense[cluster_idx],
                                                    lr_weights[cluster_idx],
                                                    lr_bias[cluster_idx], embeddings, 15)
        else:
            predicted_hypers = cluster_get_hypernym(word, tokenizer,
                                                    dense, lr_weights, lr_bias,
                                                    embeddings, 15)

        results[word] = [h for h, p in predicted_hypers]

    return results

In [39]:
# NOTE(review): `clusters` is not defined anywhere in the visible part of this
# notebook -- presumably a list of trained cluster models left over from another
# cell; verify before running the notebook top to bottom.
predict_cluster_hypernyms(data.valid_query, data.tokenizer, clusters)


{u'bierstadt': [u'cultural_landscape',
  u'television_production',
  u'business_establishment',
  u'convey',
  u'musical_work',
  u'business_organization',
  u'cultural_tourism',
  u'visual_art',
  u'performing_arts',
  u'theatrical_production',
  u'commercial_agency',
  u'musical_performance',
  u'news_event',
  u'subject',
  u'artistic'],
 u'bloodguilt': [u'person',
  u'television_production',
  u'electronic_media',
  u'subject',
  u'context',
  u'musical_work',
  u'scope',
  u'convey',
  u'responsibility',
  u'relate',
  u'news_event',
  u'terms',
  u'scriptwriting',
  u'creative_work',
  u'relevant'],
 u'boatlift': [u'television_production',
  u'business_establishment',
  u'business_organization',
  u'cultural_landscape',
  u'business',
  u'convey',
  u'electronic_media',
  u'entertainment',
  u'musical_work',
  u'responsibility',
  u'performing_arts',
  u'audience',
  u'sponsorship',
  u'local_community',
  u'professional_sports'],
 u'burger_king': [u'television_production',
  u'b

In [76]:
def crim_get_top_hypernyms(query, hyper_candidates, model, data, top):
    """Return the words the model considers the most probable hypernyms of ``query``.

    Every word of ``data.vocab`` is paired with the query and scored through
    ``model.predict``. This is excruciatingly slow, which is why a
    significantly faster numpy-based algorithm was developed (the original
    note refers to it as alt_get_hypernym).

    Args:
        query: query term; must be a key of ``data.tokenizer.word_index``.
        hyper_candidates: unused.
        model: object exposing ``predict([query_ids, candidate_ids])``.
        data: object exposing ``tokenizer`` and ``vocab``.
        top: number of hypernyms to return.

    Returns:
        List of ``(word, score)`` tuples for the ``top`` highest scores, best first.
    """
    vocab_size = len(data.vocab)
    candidates = np.asarray(data.tokenizer.texts_to_sequences(data.vocab)).flatten()

    # pair the same query id with every candidate
    query_id = data.tokenizer.word_index[query]
    repeated_query = np.asarray([query_id] * vocab_size)
    predictions = model.predict([repeated_query, candidates])

    ranked = np.argsort(predictions.flatten())[::-1]
    return [(data.vocab[x], predictions[x][0]) for x in ranked[:top]]
    

In [222]:
# Score every vocabulary word as a hypernym of 'starcraft' with each cluster
# model, then keep the best score per word across clusters.
candidates = data.tokenizer.texts_to_sequences(data.vocab)
candidates = np.asarray(candidates).flatten()

# one row of scores per cluster model (the row count was hard-coded to 5)
predictions = np.zeros((len(cluster_list), len(data.vocab)))
query_id = data.tokenizer.word_index['starcraft']
for idx, c in enumerate(cluster_list):
    predictions[idx] = c.model.predict([np.asarray([query_id] * len(data.vocab)), candidates]).flatten()

best_projection = np.max(predictions, axis=0)
top_words = np.argsort(best_projection)[::-1][:15]

# materialise the pairs so the cell displays them (zip is lazy on Python 3)
list(zip(map(lambda x: data.tokenizer.index_word[candidates[x]], top_words),
         best_projection[top_words]))


# Keras Projection Learning Models

In [15]:
from tensorflow.keras import backend as K
from tensorflow.keras.constraints import Constraint

class ForceToOne (Constraint):
    """Keras weight constraint that replaces every weight with exactly 1."""
    def __call__(self, w):
        # Dividing w by itself yields NaN wherever a weight is 0 (for example
        # with a zero-initialised kernel); build the tensor of ones directly.
        return K.ones_like(w)

In [16]:
from tensorflow.keras.layers import Input, Dense, Embedding, Dot, Flatten, Concatenate, concatenate, Reshape, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import RandomNormal, Zeros, Ones
from tensorflow.keras.regularizers import l2, l1, l1_l2
from tensorflow.keras.constraints import UnitNorm
from tensorflow.keras.optimizers import Adam

from tensorflow.keras import backend as K
import tensorflow as tf

# Phi layer initialiser
def random_identity(shape, dtype="float32", partition_info=None):
    """Initialiser returning a square diagonal matrix of small random values.

    Only ``shape[-1]`` is used; ``dtype`` and ``partition_info`` are ignored.
    The diagonal holds draws from a normal distribution (mean 0, stddev 0.01)
    and every off-diagonal entry is zero.
    """
    side = shape[-1]
    noise = K.random_normal((side, side), mean=0., stddev=0.01)
    # the element-wise product with the identity keeps only the diagonal noise
    return tf.eye(side) * noise

def random_plus_identity(shape, dtype="float32", partition_info=None):
    """Initialiser returning the identity matrix plus small random noise.

    Only ``shape[-1]`` is used; ``dtype`` and ``partition_info`` are ignored.
    The noise is drawn from a normal distribution (mean 0, stddev 0.01).
    """
    side = shape[-1]
    noise = K.random_normal((side, side), mean=0., stddev=0.01)
    return tf.eye(side) + noise

def random_normal(shape, dtype="float32", partition_info=None):
    """Initialiser returning a square matrix of normal noise (mean 0, stddev 0.05).

    Only ``shape[-1]`` is used; ``dtype`` and ``partition_info`` are ignored.
    """
    side = shape[-1]
    return K.random_normal((side, side), mean=0., stddev=0.05)

def get_CRIM_model(phi_k=1, train_embeddings=False,
                   embeddings_dim=300, vocab_size=1000,
                   embeddings_matrix=None,
                   phi_init = None,
                   phi_activity_regularisation = None,
                   sigmoid_kernel_regularisation = None,
                   sigmoid_bias_regularisation = None,
                   sigmoid_kernel_constraint = None,
                   dropout_rate = 0.,
                   learning_rate = 0.001
                  ):
    """Build and compile the projection-learning model.

    A hyponym id and a hypernym id are embedded with a shared embedding layer.
    The hyponym embedding goes through ``phi_k`` bias-free linear layers
    ('Phi0', 'Phi1', ...), each projection is dotted with the hypernym
    embedding, and a sigmoid unit ('Prediction') combines the dot products.

    Args:
        phi_k: number of projection layers.
        train_embeddings: whether the 'TermEmbedding' layer is trainable.
        embeddings_dim: dimensionality of the embeddings and the projections.
        vocab_size: number of words; the embedding layer has ``vocab_size + 1`` rows.
        embeddings_matrix: pre-trained weights injected into the embedding
            layer. When None, the layer keeps its default initialisation.
        phi_init: kernel initialiser of the projection layers.
        phi_activity_regularisation: activity regulariser of the projection layers.
        sigmoid_kernel_regularisation, sigmoid_bias_regularisation,
        sigmoid_kernel_constraint: regularisers/constraint of the 'Prediction' layer.
        dropout_rate: rate of the dropout layers applied to both embeddings
            and to the projections.
        learning_rate: learning rate of the Adam optimiser.

    Returns:
        A compiled Keras ``Model`` with inputs 'Hyponym' and 'Hypernym',
        trained with binary cross-entropy.
    """
    hypo_input  = Input(shape=(1,), name='Hyponym')
    hyper_input = Input(shape=(1,), name='Hypernym')

    # one embedding layer shared by both inputs, rows constrained to unit norm
    embedding_layer = Embedding(vocab_size + 1, embeddings_dim, embeddings_constraint = UnitNorm(axis=1),
                                input_length=1, name='TermEmbedding')

    hypo_embedding = embedding_layer(hypo_input)
    hyper_embedding = embedding_layer(hyper_input)

    # Add Dropout to avoid overfit
    hypo_embedding = Dropout(dropout_rate, name='Dropout_Hypo')(hypo_embedding)
    hyper_embedding = Dropout(dropout_rate, name='Dropout_Hyper')(hyper_embedding)

    phi_layer = []
    for i in range(phi_k):
        phi_layer.append(Dense(embeddings_dim, activation=None, use_bias=False,
                               activity_regularizer=phi_activity_regularisation,
                               kernel_initializer=phi_init,
                               name='Phi%d' % (i)) (hypo_embedding))

    if phi_k == 1:
        # flatten tensors
        phi = Flatten()(phi_layer[0])
        hyper_embedding = Flatten()(hyper_embedding)
    else:
        # stack the projections along axis 1
        phi = concatenate(phi_layer, axis=1)

    phi = Dropout(dropout_rate, name='Dropout_Phi')(phi)

    # this is referred to as "s" in the "CRIM" paper
    phi_hyper = Dot(axes=-1, normalize=False, name='DotProduct')([phi, hyper_embedding])

    if phi_k > 1:
        phi_hyper = Flatten()(phi_hyper)

    predictions = Dense(1, activation="sigmoid", name='Prediction',
                        use_bias=True,
                        kernel_initializer=Zeros,
                        kernel_constraint= sigmoid_kernel_constraint,
                        bias_initializer=Zeros,
                        kernel_regularizer=sigmoid_kernel_regularisation,
                        bias_regularizer=sigmoid_bias_regularisation
                       ) (phi_hyper)

    # instantiate model
    model = Model(inputs=[hypo_input, hyper_input], outputs=predictions)

    # inject pre-trained embedding weights into Embedding layer; skipped when no
    # matrix is supplied, since set_weights([None]) would fail
    term_embedding = model.get_layer(name='TermEmbedding')
    if embeddings_matrix is not None:
        term_embedding.set_weights([embeddings_matrix])
    term_embedding.trainable = train_embeddings

    # `learning_rate` is the current argument name; `lr` is a deprecated alias
    # that recent Keras releases reject
    adam = Adam(learning_rate = learning_rate, beta_1 = 0.9, beta_2 = 0.9, clipnorm=1.)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [16]:
# The idea is to train an initial model to generate reasonable projection matrices
# that, when applied to a hyponym, get it close to the hypernym in question.

# After the initial training, the trained phi and sigmoid layer weights are injected
# into this model; the projections are frozen and the embeddings are adjusted instead.
# The prediction layer weights may also be modified.
def get_CRIM_model_freeze_phi(phi_k=1, train_phi=False,
                              phi_weights = None,
                              lr_weights = None,
                              embeddings_dim=300, vocab_size=1000,
                              embeddings_matrix=None,
                              phi_init = None,
                              phi_activity_regularisation = None,
                              sigmoid_kernel_regularisation = None,
                              sigmoid_bias_regularisation = None,
                              sigmoid_kernel_constraint = None,
                              dropout_rate = 0.,
                              learning_rate = 0.00025
                  ):
    """Build the projection model with pre-trained, optionally frozen, projections.

    The architecture mirrors ``get_CRIM_model`` except that no dropout is
    applied to the projections and the embedding layer is left trainable.

    Args:
        phi_k: number of projection layers ('Phi0', 'Phi1', ...).
        train_phi: whether the projection layers are trainable.
        phi_weights: sequence with one kernel per projection layer, in layer
            order. When None, the layers keep the ``phi_init`` initialisation.
        lr_weights: weight list for the 'Prediction' layer, as accepted by
            ``set_weights``. When None, the layer keeps its zero initialisation.
        embeddings_dim: dimensionality of the embeddings and the projections.
        vocab_size: number of words; the embedding layer has ``vocab_size + 1`` rows.
        embeddings_matrix: pre-trained weights injected into the embedding
            layer. When None, the layer keeps its default initialisation.
        phi_init: kernel initialiser of the projection layers.
        phi_activity_regularisation: activity regulariser of the projection layers.
        sigmoid_kernel_regularisation, sigmoid_bias_regularisation,
        sigmoid_kernel_constraint: regularisers/constraint of the 'Prediction' layer.
        dropout_rate: rate of the dropout layers applied to both embeddings.
        learning_rate: learning rate of the Adam optimiser.

    Returns:
        A compiled Keras ``Model`` with inputs 'Hyponym' and 'Hypernym',
        trained with binary cross-entropy.
    """
    hypo_input  = Input(shape=(1,), name='Hyponym')
    hyper_input = Input(shape=(1,), name='Hypernym')

    # one embedding layer shared by both inputs, rows constrained to unit norm
    embedding_layer = Embedding(vocab_size + 1, embeddings_dim, embeddings_constraint = UnitNorm(axis=1),
                                input_length=1, name='TermEmbedding')

    hypo_embedding = embedding_layer(hypo_input)
    hyper_embedding = embedding_layer(hyper_input)

    # Add Dropout to avoid overfit
    hypo_embedding = Dropout(dropout_rate, name='Dropout_Hypo')(hypo_embedding)
    hyper_embedding = Dropout(dropout_rate, name='Dropout_Hyper')(hyper_embedding)

    phi_layer = []
    for i in range(phi_k):
        phi_layer.append(Dense(embeddings_dim, activation=None, use_bias=False,
                               activity_regularizer=phi_activity_regularisation,
                               kernel_initializer=phi_init,
                               name='Phi%d' % (i)) (hypo_embedding))

    if phi_k == 1:
        # flatten tensors
        phi = Flatten()(phi_layer[0])
        hyper_embedding = Flatten()(hyper_embedding)
    else:
        # stack the projections along axis 1
        phi = concatenate(phi_layer, axis=1)

    # this is referred to as "s" in the "CRIM" paper
    phi_hyper = Dot(axes=-1, normalize=False, name='DotProduct')([phi, hyper_embedding])

    if phi_k > 1:
        phi_hyper = Flatten()(phi_hyper)

    predictions = Dense(1, activation="sigmoid", name='Prediction',
                        use_bias=True,
                        kernel_initializer=Zeros,
                        kernel_constraint= sigmoid_kernel_constraint,
                        bias_initializer=Zeros,
                        kernel_regularizer=sigmoid_kernel_regularisation,
                        bias_regularizer=sigmoid_bias_regularisation
                       ) (phi_hyper)

    # instantiate model
    model = Model(inputs=[hypo_input, hyper_input], outputs=predictions)

    # Inject the pre-trained weights. Each injection is skipped when the
    # corresponding argument is left at None, since indexing or setting None
    # would fail.
    if embeddings_matrix is not None:
        model.get_layer(name='TermEmbedding').set_weights([embeddings_matrix])

    phi_projections = [l for l in model.layers if l.name.startswith('Phi')]
    for idx, phi_projection in enumerate(phi_projections):
        if phi_weights is not None:
            phi_projection.set_weights([phi_weights[idx]])
        phi_projection.trainable = train_phi

    if lr_weights is not None:
        model.get_layer(name='Prediction').set_weights(lr_weights)

    # `learning_rate` is the current argument name; `lr` is a deprecated alias
    # that recent Keras releases reject
    adam = Adam(learning_rate=learning_rate, beta_1 = 0.9, beta_2 = 0.9, clipnorm=1.)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

    return model

### The training algorithm incorporates mini-batch stochastic gradient descent and negative sampling

In [17]:
def train(model,       # the model whose parameters will be learnt
          epochs,      # number of epochs to run
          batch_size,  # number of positive pairs per mini-batch
          m,           # number of negative samples
          data,        # data required for training
          neg_strategy
         ):
    """Train `model` with shuffled mini-batches extended by negative samples.

    The training and validation terms are read from `data.train_query`,
    `data.train_hyper`, `data.valid_query` and `data.valid_hyper` and turned
    into index sequences with `data.tokenizer.texts_to_sequences`.

    In every epoch the training pairs are reshuffled and consumed in slices of
    `batch_size`.  Each slice is passed through `extend_batch_with_negatives`
    (together with `neg_strategy`, `data.tokenizer` and `m`) before being fed
    to `model.train_on_batch`.  After every training step the *whole*
    validation set, likewise extended with negatives, is scored with
    `model.test_on_batch`.  The first element of each returned result is
    accumulated, and the two running sums are printed at the end of the epoch.

    Returns None; the model is updated in place.
    """
    # create sequences
    term_train_seq = data.tokenizer.texts_to_sequences(data.train_query)
    hyper_train_seq = data.tokenizer.texts_to_sequences(data.train_hyper)

    term_test_seq = data.tokenizer.texts_to_sequences(data.valid_query)
    hyper_test_seq = data.tokenizer.texts_to_sequences(data.valid_hyper)

    # Convert the sequences to arrays once: these conversions do not depend on
    # the epoch or the batch, so there is no need to repeat them inside the loops.
    X_term = np.array(term_train_seq, dtype='int32')
    X_hyper = np.array(hyper_train_seq, dtype='int32')
    valid_X_term = np.array(term_test_seq, dtype='int32')
    valid_X_hyper = np.array(hyper_test_seq, dtype='int32')

    samples = np.arange(len(term_train_seq))
    # validation indices are kept in their original order (never shuffled)
    validation_samples = np.arange(len(term_test_seq))

    # train algorithm
    for epoch in range(epochs):
        # reset the running totals reported at the end of the epoch
        loss = 0.
        test_loss = 0.

        np.random.shuffle(samples)

        # fancy indexing yields freshly shuffled copies for this epoch
        shuffled_X_term = X_term[samples]
        shuffled_X_hyper = X_hyper[samples]

        for b in range(0, len(samples), batch_size):
            # mini-batch of (up to) batch_size positive pairs
            batch_X_term = shuffled_X_term[b:b + batch_size]
            batch_X_hyper = shuffled_X_hyper[b:b + batch_size]

            # complement +ve samples with negatives
            batch_X_term, batch_X_hyper, batch_y_label =\
                extend_batch_with_negatives(batch_X_term, batch_X_hyper,
                                            neg_strategy,
                                            data.tokenizer, m
                                           )

            # take a fresh copy of the complete validation set for this step
            batch_X_test_term = valid_X_term[validation_samples]
            batch_X_test_hyper = valid_X_hyper[validation_samples]

            # distort validation data with negatives as well, to check how the
            # model fares on negative pairs
            batch_X_test_term, batch_X_test_hyper, batch_y_test_label =\
                extend_batch_with_negatives(batch_X_test_term, batch_X_test_hyper,
                                            neg_strategy,
                                            data.tokenizer, m
                                           )

            # train on batch
            loss += model.train_on_batch([batch_X_term, batch_X_hyper],
                                          batch_y_label)[0]

            # evaluate on the validation data after the update
            test_loss += model.test_on_batch([batch_X_test_term, batch_X_test_hyper],
                                              batch_y_test_label)[0]

        print('Epoch:', epoch+1, 'Loss:', loss, 'Test Loss:', test_loss)


In [25]:
from tensorflow.keras.initializers import RandomNormal, Zeros, Ones
from tensorflow.keras.regularizers import l2, l1, l1_l2

#rand_norm_m0_sd001 = RandomNormal(mean = 0.0, stddev=0.01, seed=42)
#rand_norm = RandomNormal(mean = 0.0, stddev=1., seed=42)

# negative-sampling strategies, selectable by name
neg_sampling_options = {
    'synonym': data.synonyms,
    'random': data.random_words,
}

# initialisers available for the Phi projection kernels, selectable by name
phi_init_options = {
    'random_plus_identity': random_plus_identity,
    'random_identity': random_identity,
    'random_normal': random_normal,
}

# optional constraint applied to the kernel of the final sigmoid layer
kernel_constraints = {'None': None, 'ForceToOne': ForceToOne()}

# ---- training hyper-parameters ----
batch_size = 32                       # positive pairs per mini-batch
epochs = 10                           # number of passes over the training pairs
m = 10                                # number of negative samples
phi_k = 1                             # number of projections
train_embeddings = False              # train (True) or freeze
negative_option = 'random'            # key into neg_sampling_options
phi_init_option = 'random_identity'   # key into phi_init_options
kernel_constraint_option = 'None'     # key into kernel_constraints
dropout_rate = 0.3
learning_rate = 0.001

# seed numpy's global generator before the model is built
np.random.seed(10)

# create model
crim_model = get_CRIM_model(
    phi_k = phi_k,
    train_embeddings = train_embeddings,
    embeddings_dim = data.embeddings_dim,
    vocab_size = len(data.tokenizer.word_counts),
    embeddings_matrix = data.embedding_matrix,
    phi_init = phi_init_options[phi_init_option],
    sigmoid_kernel_regularisation = None,  # l2(0.001),
    sigmoid_bias_regularisation = None,    # l2(0.001),
    sigmoid_kernel_constraint = kernel_constraints[kernel_constraint_option],
    dropout_rate = dropout_rate,
    learning_rate = learning_rate,
)

crim_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Hyponym (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
TermEmbedding (Embedding)       (None, 1, 200)       43904800    Hyponym[0][0]                    
                                                                 Hypernym[0][0]                   
__________________________________________________________________________________________________
Dropout_Hypo (Dropout)          (None, 1, 200)       0           TermEmbedding[0][0]              
__________________________________________________________________________________________________
Hypernym (InputLayer)           (None, 1)            0                                            
__________

In [25]:
# plot model
# Other cells of this notebook import from tensorflow.keras, so prefer the
# plotting helper from that package.  Importing standalone `keras` pulls in a
# separate backend (this cell previously reported "Using Theano backend.").
# The standalone import is kept only as a fallback for installations whose
# tensorflow does not ship plot_model.
try:
    from tensorflow.keras.utils import plot_model
except ImportError:
    from keras.utils.vis_utils import plot_model

# writes the architecture diagram to CRIM_alternate_model.png
plot_model(crim_model, to_file='CRIM_alternate_model.png', show_shapes=True, show_layer_names=True)

Using Theano backend.


In [27]:
import math

# switch to True to generate and score predictions once training has finished
predict = False

# log the hyper-parameters of this run next to its results
print ('Epochs: ', epochs,
       'Batch size: ', batch_size,
       'm: ', m,
       'pki_k: ', phi_k,
       'train_embeddings: ', train_embeddings,
       'Negative sampling: ', negative_option,
       'Phi Init: ', phi_init_option,
       'Dropout rate: ', dropout_rate,
       'Kernel constraint: ', kernel_constraint_option,
       'Learning rate: ', learning_rate)

train(crim_model, epochs, batch_size, m, data, neg_sampling_options[negative_option])

# evaluate
if predict:
    print ("Generating predictions...")
    crim_predictions = predict_crim_hypernyms(data, crim_model, alt_get_hypernym)

    print ("CRIM evaluation:")
    score_names, all_scores = get_evaluation_scores((data.test_query, data.test_hyper), crim_predictions)
    # report each metric averaged over all score lists
    for k, score_name in enumerate(score_names):
        mean_score = sum([score_list[k] for score_list in all_scores]) / len(all_scores)
        print (score_name + ': ' + str(round(mean_score, 5)))



('Epochs: ', 10, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 1, 'train_embeddings: ', False, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'Dropout rate: ', 0.3, 'Kernel constraint: ', 'None', 'Learning rate: ', 0.001)
('Epoch:', 1, 'Loss:', 154.7653611600399, 'Test Loss:', 154.85199791193008)
('Epoch:', 2, 'Loss:', 100.31694516539574, 'Test Loss:', 99.247761875391)
('Epoch:', 3, 'Loss:', 87.97930534183979, 'Test Loss:', 87.04231545329094)
('Epoch:', 4, 'Loss:', 78.69284763932228, 'Test Loss:', 76.97447828948498)
('Epoch:', 5, 'Loss:', 72.10867649316788, 'Test Loss:', 69.92591404914856)
('Epoch:', 6, 'Loss:', 67.23415367305279, 'Test Loss:', 64.80040496587753)
('Epoch:', 7, 'Loss:', 63.75109326094389, 'Test Loss:', 61.24854889512062)
('Epoch:', 8, 'Loss:', 59.589780896902084, 'Test Loss:', 58.83554546535015)
('Epoch:', 9, 'Loss:', 56.71147071570158, 'Test Loss:', 56.90971930325031)
('Epoch:', 10, 'Loss:', 54.37444880232215, 'Test Loss:', 55.833690986037254)


In [36]:
# run further training if required
# Second cycle: build a new model around the Phi and Prediction weights learnt
# by crim_model, then (optionally) keep training it and evaluate the result.
m = 10
negative_option = 'random'
epochs = 5
learning_rate = 0.00025

predict = True

# Collect the kernel of every Phi projection layer.  A list comprehension is
# used rather than map(): on Python 3 map() returns a lazy iterator, which
# np.asarray would wrap in a 0-d object array instead of an array of kernels.
dense = np.asarray([l.get_weights()[0] for l in crim_model.layers if l.name.startswith('Phi')])
lr_weights = crim_model.get_layer(name='Prediction').get_weights()

# train_phi=False asks for the injected Phi projections to be frozen
crim_model_2 = get_CRIM_model_freeze_phi(phi_k=phi_k,
                                         train_phi=False, phi_weights=dense,
                                         lr_weights=lr_weights,
                                         embeddings_dim = data.embeddings_dim, vocab_size = len(data.tokenizer.word_counts),
                                         embeddings_matrix = data.embedding_matrix,
                                         phi_init = phi_init_options[phi_init_option],
                                         sigmoid_kernel_regularisation = None,
                                         sigmoid_bias_regularisation = None,
                                         sigmoid_kernel_constraint = kernel_constraints[kernel_constraint_option],
                                         dropout_rate = dropout_rate,
                                         learning_rate = learning_rate
                                        )

crim_model_2.summary()

# log the hyper-parameters of this run next to its results
print ('Epochs: ', epochs, 'Batch size: ', batch_size, 'm: ', m, 'pki_k: ', phi_k, 'train_embeddings: ', train_embeddings,
      'Negative sampling: ', negative_option, 'Phi Init: ', phi_init_option, 'Dropout rate: ', dropout_rate,
      'Kernel constraint: ', kernel_constraint_option, 'Learning rate: ', learning_rate)

# NOTE: the second training cycle is currently disabled; uncomment to run it.
#train(crim_model_2, epochs, batch_size, m, data, neg_sampling_options[negative_option])
if predict:
    print ("Generating predictions...")
    crim_predictions_2 = predict_crim_hypernyms(data, crim_model_2, alt_get_hypernym)

    print ("CRIM evaluation:")
    score_names, all_scores = get_evaluation_scores((data.test_query, data.test_hyper), crim_predictions_2)
    # report each metric averaged over all score lists
    for k in range(len(score_names)):
        print (score_names[k]+': '+str(round(sum([score_list[k] for score_list in all_scores]) / len(all_scores), 5)))




('Epochs: ', 5, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 24, 'train_embeddings: ', False, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_plus_identity', 'Dropout rate: ', 0.4, 'Kernel constraint: ', 'None')
Generating predictions...
('Done', 100)
('Done', 200)
('Done', 300)
('Done', 400)
('Done', 500)
('Done', 600)
('Done', 700)
('Done', 800)
('Done', 900)
('Done', 1000)
('Done', 1100)
('Done', 1200)
('Done', 1300)
('Done', 1400)
CRIM evaluation:
MRR: 0.29684
P@1: 0.24483
P@5: 0.13789
P@10: 0.13077


In [None]:
# save models
#crim_model.save_weights('models/crim_phi2_e25_drop35.h5')
#crim_model_2.save_weights('models/crim_embeddings_e5_drop35.h5')


In [28]:
# print crim_model parameters

# weights of the final 'Prediction' layer
print (crim_model.get_layer(name='Prediction').get_weights())
# Kernels of all Phi projection layers.  Built with a list comprehension
# because on Python 3 map() is lazy and np.asarray(map(...)) would produce a
# 0-d object array with an empty shape.
dense = np.asarray([l.get_weights()[0] for l in crim_model.layers if l.name.startswith('Phi')])
print (dense.shape)

[array([[-2.0710576]], dtype=float32), array([-1.8059509], dtype=float32)]
(1, 200, 200)


In [29]:
# print phi mean and standard deviation

# One line per projection: index, mean rounded to 5 decimals, standard deviation.
# A single formatted string is printed so the cell runs, and prints the same
# text, on both Python 2 and Python 3 (the bare `print a, b` statement is
# Python 2 only).
for i in range(dense.shape[0]):
    print('%s %s %s' % (i, np.round(np.mean(dense[i]), 5), np.std(dense[i])))
    

0 -0.00509 0.23016633


## Evaluation  code

In [74]:
# generate a few example hypernyms
# NOTE(review): this cell relies on `dense` and `cluster_list` already existing
# in the kernel (the recorded run failed with "NameError: name 'dense' is not
# defined").  Run the cells that create them first - TODO confirm which `dense`
# is meant to accompany cluster_list[0].model.
#example_terms = ['suzy_favor_hamilton', 'wicketkeeper','aquamarine','tenpence','vegetarian','blackfly']
example_terms = ['cat']

for t in example_terms:
    print (t)
    for h in alt_get_hypernym(t, cluster_list[0].model, data, dense, 15):
        # print() with one argument behaves the same on Python 2 and 3
        print (h)
    print ("-"*30)


cat


NameError: name 'dense' is not defined

In [123]:
# query the model for a single (term, candidate hypernym) pair
# look both words up in the tokenizer's word index, then feed one id per model input
i, j = (data.tokenizer.word_index[term] for term in ('ebert', 'tennis_player'))
crim_model.predict([[i], [j]])


array([[0.36369914]], dtype=float32)

### Find candidate hypernyms

In [17]:
#crim_get_top_hypernyms('robert_de_niro', candidates, crim_model, data, 15)


# Some readings

* ('Epochs: ', 30, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 24, 'train_embeddings: ', False, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'Dropout 0.5: ', True)

CRIM evaluation:<br>
MRR: 0.13002<br>
P@1: 0.08806<br>
P@5: 0.06368<br>
P@10: 0.06131<br>

* ('Epochs: ', 10, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 1, 'train_embeddings: ', False, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'Dropout rate: ', 0.5, 'Kernel constraint: ', 'None')

CRIM evaluation:<br>
MRR: 0.16015<br>
P@1: 0.12675<br>
P@5: 0.07046<br>
P@10: 0.06632<br>

* ('Epochs: ', 15, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 1, 'train_embeddings: ', False, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'Dropout rate: ', 0.5)

CRIM evaluation:<br>
MRR: 0.15522<br>
P@1: 0.11741<br>
P@5: 0.07256<br>
P@10: 0.06844<br>

* ('Epochs: ', 5, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 1, 'train_embeddings: ', True, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'Dropout (0.5): ', True)

CRIM evaluation:<br>
MRR: 0.22385<br>
P@1: 0.1968<br>
P@5: 0.08815<br>
P@10: 0.08352<br>

Even though these results look superior at first glance, in reality tuning the embeddings reduces the model to predicting the Most Frequent Hypernym.


* ('Epochs: ', 10, 'Batch size: ', 32, 'm: ', 5, 'pki_k: ', 24, 'train_embeddings: ', False, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'Dropout: ', True)

Reducing the number of random samples has a negative impact.

CRIM evaluation:<br>
MRR: 0.06439<br>
P@1: 0.04536<br>
P@5: 0.02777<br>
P@10: 0.02703<br>

* ('Epochs: ', 10, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 24, 'train_embeddings: ', False, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'Dropout rate: ', 0.5, 'Kernel constraint: ', 'ForceToOne')

CRIM evaluation:<br>
MRR: 0.12434<br>
P@1: 0.09006<br>
P@5: 0.05667<br>
P@10: 0.0549<br>

* ('Epochs: ', 10, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 5, 'train_embeddings: ', False, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'Dropout rate: ', 0.5, 'Kernel constraint: ', 'None')

CRIM evaluation:<br>
MRR: 0.1346<br>
P@1: 0.0974<br>
P@5: 0.0627<br>
P@10: 0.05998<br>

* ('Epochs: ', 10, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 10, 'train_embeddings: ', False, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'Dropout rate: ', 0.5, 'Kernel constraint: ', 'None')

CRIM evaluation:<br>
MRR: 0.12567<br>
P@1: 0.08539<br>
P@5: 0.06105<br>
P@10: 0.05916<br>


* ('Epochs: ', 10, 'Batch size: ', 32, 'm: ', 5, 'pki_k: ', 12, 'train_embeddings: ', False, 'Negative sampling: ', 'synonym', 'Phi Init: ', 'random_identity', 'Dropout rate: ', 0.5, 'Kernel constraint: ', 'None')
('Epoch:', 1, 'Loss:', 155.5154161900282, 'Test Loss:', 485.83162450790405)
('Epoch:', 2, 'Loss:', 88.82768769562244, 'Test Loss:', 348.13482135534286)
('Epoch:', 3, 'Loss:', 66.63960940390825, 'Test Loss:', 258.7310974597931)
('Epoch:', 4, 'Loss:', 56.400326274335384, 'Test Loss:', 216.1373891234398)
('Epoch:', 5, 'Loss:', 51.70219925045967, 'Test Loss:', 207.98559176921844)
('Epoch:', 6, 'Loss:', 48.9724283516407, 'Test Loss:', 204.41960680484772)
('Epoch:', 7, 'Loss:', 46.962790466845036, 'Test Loss:', 195.1675413697958)
('Epoch:', 8, 'Loss:', 45.41264865081757, 'Test Loss:', 198.74169850349426)
('Epoch:', 9, 'Loss:', 45.02060864120722, 'Test Loss:', 191.37398713827133)
('Epoch:', 10, 'Loss:', 44.76244197413325, 'Test Loss:', 188.13010711967945)

CRIM evaluation:<br>
MRR: 0.01452<br>
P@1: 0.00801<br>
P@5: 0.00653<br>
P@10: 0.00623<br>

* ('Epochs: ', 15, 'Batch size: ', 32, 'm: ', 15, 'pki_k: ', 1, 'train_embeddings: ', False, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'Dropout rate: ', 0.5, 'Kernel constraint: ', 'None')
('Epoch:', 1, 'Loss:', 173.66530799865723, 'Test Loss:', 422.2526289820671)
('Epoch:', 2, 'Loss:', 89.43182794749737, 'Test Loss:', 888.1642265319824)
('Epoch:', 3, 'Loss:', 82.11764796078205, 'Test Loss:', 897.4773137569427)
('Epoch:', 4, 'Loss:', 77.61969149112701, 'Test Loss:', 871.3285944461823)
('Epoch:', 5, 'Loss:', 73.61261868476868, 'Test Loss:', 838.6195100545883)
('Epoch:', 6, 'Loss:', 70.76602215319872, 'Test Loss:', 790.632804274559)
('Epoch:', 7, 'Loss:', 68.09195621311665, 'Test Loss:', 758.9123626947403)
('Epoch:', 8, 'Loss:', 66.28909918665886, 'Test Loss:', 722.3105019330978)
('Epoch:', 9, 'Loss:', 64.0174068659544, 'Test Loss:', 701.5663638114929)
('Epoch:', 10, 'Loss:', 62.1955054551363, 'Test Loss:', 685.4047366380692)
('Epoch:', 11, 'Loss:', 61.18628938496113, 'Test Loss:', 666.7885119915009)
('Epoch:', 12, 'Loss:', 60.10102154314518, 'Test Loss:', 654.0900322198868)
('Epoch:', 13, 'Loss:', 59.095360577106476, 'Test Loss:', 653.2110993862152)
('Epoch:', 14, 'Loss:', 58.58073855936527, 'Test Loss:', 634.9157860279083)
('Epoch:', 15, 'Loss:', 57.885950952768326, 'Test Loss:', 631.2514699697495)

CRIM evaluation:
MRR: 0.15946
P@1: 0.11674
P@5: 0.07472
P@10: 0.06976

-----------------------------------------------
Tried a new technique involving a mix of ensembling and transfer learning.<br>
* Developed two models: the first is the standard CRIM model I've always used, with the embeddings frozen and populated with the word2vec vectors provided by Gabriel.
* This model was trained for an adequate number of epochs (i.e. until optimal fitting; early stopping implemented manually).
* After the first cycle of training was over, I extracted the Phi and LR weights.
* These weights were subsequently injected into a new model.  This time the embeddings layer was set to trainable and the Phi dense layer was frozen.
* The model was thereby encouraged to keep the same projection weights and fine-tune the embeddings to learn a better hypernym generation model.

#### 1 Projection; First Cycle 
('Epochs: ', 10, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 1, 'train_embeddings: ', False, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'Dropout rate: ', 0.3, 'Kernel constraint: ', 'None')
('Epoch:', 1, 'Loss:', 177.9488000869751, 'Test Loss:', 178.53641974925995)
('Epoch:', 2, 'Loss:', 103.15769128501415, 'Test Loss:', 102.5895478874445)
('Epoch:', 3, 'Loss:', 90.33858793973923, 'Test Loss:', 90.0393942296505)
('Epoch:', 4, 'Loss:', 80.81938941776752, 'Test Loss:', 79.73569859564304)
('Epoch:', 5, 'Loss:', 74.30441601574421, 'Test Loss:', 73.00559163093567)
('Epoch:', 6, 'Loss:', 68.31351159512997, 'Test Loss:', 67.29328979551792)
('Epoch:', 7, 'Loss:', 64.69703111797571, 'Test Loss:', 64.18054696917534)
('Epoch:', 8, 'Loss:', 61.36914176493883, 'Test Loss:', 60.54891411960125)
('Epoch:', 9, 'Loss:', 57.571076557040215, 'Test Loss:', 58.882518880069256)
('Epoch:', 10, 'Loss:', 54.78656556457281, 'Test Loss:', 57.673407919704914)
Generating predictions...
('Done', 100)
('Done', 200)
('Done', 300)
('Done', 400)
('Done', 500)
('Done', 600)
('Done', 700)
('Done', 800)
('Done', 900)
('Done', 1000)
('Done', 1100)
('Done', 1200)
('Done', 1300)
('Done', 1400)
CRIM evaluation:
MRR: 0.14914
P@1: 0.10874
P@5: 0.0687
P@10: 0.06505


#### 1 Projection; Second Cycle
('Epochs: ', 5, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 1, 'train_embeddings: ', True, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'Dropout rate: ', 0.3, 'Kernel constraint: ', 'None')
('Epoch:', 1, 'Loss:', 29.900435734540224, 'Test Loss:', 50.436300061643124)
('Epoch:', 2, 'Loss:', 8.763934534537839, 'Test Loss:', 46.93847519904375)
('Epoch:', 3, 'Loss:', 4.247505006627762, 'Test Loss:', 51.40926795452833)
('Epoch:', 4, 'Loss:', 2.347970368108463, 'Test Loss:', 56.144402757287025)
('Epoch:', 5, 'Loss:', 1.38899088197104, 'Test Loss:', 63.76580411195755)
Generating predictions...
('Done', 100)
('Done', 200)
('Done', 300)
('Done', 400)
('Done', 500)
('Done', 600)
('Done', 700)
('Done', 800)
('Done', 900)
('Done', 1000)
('Done', 1100)
('Done', 1200)
('Done', 1300)
('Done', 1400)
CRIM evaluation:
MRR: 0.28634
P@1: 0.24483
P@5: 0.1263
P@10: 0.11961

---------------------------------------------------------------------------------------
#### 24 Projections; First Cycle
('Epochs: ', 10, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 24, 'train_embeddings: ', False, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'Dropout rate: ', 0.3, 'Kernel constraint: ', 'None')
('Epoch:', 1, 'Loss:', 109.97628237307072, 'Test Loss:', 110.91941311955452)
('Epoch:', 2, 'Loss:', 61.31632810086012, 'Test Loss:', 65.23697778582573)
('Epoch:', 3, 'Loss:', 48.39542027562857, 'Test Loss:', 59.35354737192392)
('Epoch:', 4, 'Loss:', 41.11544480547309, 'Test Loss:', 60.305753372609615)
('Epoch:', 5, 'Loss:', 36.67992676049471, 'Test Loss:', 64.0475360751152)
('Epoch:', 6, 'Loss:', 34.28844119235873, 'Test Loss:', 63.37563705444336)
('Epoch:', 7, 'Loss:', 32.068154136650264, 'Test Loss:', 64.99528855085373)
('Epoch:', 8, 'Loss:', 31.266997564584017, 'Test Loss:', 64.47643724828959)
('Epoch:', 9, 'Loss:', 30.232711946591735, 'Test Loss:', 63.88381798565388)
('Epoch:', 10, 'Loss:', 29.51552465558052, 'Test Loss:', 65.2800731509924)
Generating predictions...
('Done', 100)
('Done', 200)
('Done', 300)
('Done', 400)
('Done', 500)
('Done', 600)
('Done', 700)
('Done', 800)
('Done', 900)
('Done', 1000)
('Done', 1100)
('Done', 1200)
('Done', 1300)
('Done', 1400)
CRIM evaluation:
MRR: 0.10304
P@1: 0.07138
P@5: 0.04817
P@10: 0.04453

#### 24 Projections; Second Cycle
('Epochs: ', 5, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 24, 'train_embeddings: ', False, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'Dropout rate: ', 0.3, 'Kernel constraint: ', 'None')

('Epoch:', 1, 'Loss:', 13.566975106863538, 'Test Loss:', 59.5253451615572)
('Epoch:', 2, 'Loss:', 2.032865798302737, 'Test Loss:', 60.16112444549799)
('Epoch:', 3, 'Loss:', 0.46449146703412225, 'Test Loss:', 64.23307839781046)
('Epoch:', 4, 'Loss:', 0.1069298386116202, 'Test Loss:', 68.53682653605938)
('Epoch:', 5, 'Loss:', 0.027284826716368116, 'Test Loss:', 73.16503396630287)
Generating predictions...
('Done', 100)
('Done', 200)
('Done', 300)
('Done', 400)
('Done', 500)
('Done', 600)
('Done', 700)
('Done', 800)
('Done', 900)
('Done', 1000)
('Done', 1100)
('Done', 1200)
('Done', 1300)
('Done', 1400)
CRIM evaluation:
MRR: 0.30386
P@1: 0.24817
P@5: 0.14445
P@10: 0.13732

---------------------------------------------------------------------------------------------------
### Experiment with a single projection, but changing: i) Random initialiser (random + identity); ii) 2nd phase keeps training Phi; iii) Change Learning Rate of 2nd phase.

#### 1 Projection; First Cycle
('Epochs: ', 10, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 1, 'train_embeddings: ', False, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'Dropout rate: ', 0.3, 'Kernel constraint: ', 'None')
('Epoch:', 1, 'Loss:', 176.2122738659382, 'Test Loss:', 177.55351921916008)
('Epoch:', 2, 'Loss:', 104.77461278438568, 'Test Loss:', 105.96388858556747)
('Epoch:', 3, 'Loss:', 93.10278041660786, 'Test Loss:', 95.61527897417545)
('Epoch:', 4, 'Loss:', 83.65695556998253, 'Test Loss:', 86.35148683190346)
('Epoch:', 5, 'Loss:', 76.63541767001152, 'Test Loss:', 80.32908068597317)
('Epoch:', 6, 'Loss:', 71.39560843259096, 'Test Loss:', 74.56421269476414)
('Epoch:', 7, 'Loss:', 67.28997546434402, 'Test Loss:', 70.91447427868843)
('Epoch:', 8, 'Loss:', 63.82748632133007, 'Test Loss:', 67.26722575724125)
('Epoch:', 9, 'Loss:', 60.12399164587259, 'Test Loss:', 65.32279951870441)
('Epoch:', 10, 'Loss:', 57.98733665794134, 'Test Loss:', 63.860010385513306)
Generating predictions...
('Done', 100)
('Done', 200)
('Done', 300)
('Done', 400)
('Done', 500)
('Done', 600)
('Done', 700)
('Done', 800)
('Done', 900)
('Done', 1000)
('Done', 1100)
('Done', 1200)
('Done', 1300)
('Done', 1400)
CRIM evaluation:
MRR: 0.15094
P@1: 0.11474
P@5: 0.0674
P@10: 0.065

#### 1 Projection; Second Cycle
('Epochs: ', 10, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 1, 'train_embeddings: ', False, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'Dropout rate: ', 0.3, 'Kernel constraint: ', 'None')
('Epoch:', 1, 'Loss:', 37.44226049259305, 'Test Loss:', 62.1908989623189)
('Epoch:', 2, 'Loss:', 14.947778060100973, 'Test Loss:', 74.91842666268349)
('Epoch:', 3, 'Loss:', 7.418942770687863, 'Test Loss:', 98.69380746781826)
('Epoch:', 4, 'Loss:', 4.618407022906467, 'Test Loss:', 120.58737960457802)
('Epoch:', 5, 'Loss:', 2.871887466167209, 'Test Loss:', 142.8387492597103)
('Epoch:', 6, 'Loss:', 1.9543396250010119, 'Test Loss:', 163.086307823658)
('Epoch:', 7, 'Loss:', 1.3212263471978076, 'Test Loss:', 183.57101076841354)
('Epoch:', 8, 'Loss:', 0.9992299735413326, 'Test Loss:', 209.59179404377937)
('Epoch:', 9, 'Loss:', 0.8104324492987871, 'Test Loss:', 228.58731454610825)
('Epoch:', 10, 'Loss:', 0.5762173827174593, 'Test Loss:', 254.7513089776039)
Generating predictions...
('Done', 100)
('Done', 200)
('Done', 300)
('Done', 400)
('Done', 500)
('Done', 600)
('Done', 700)
('Done', 800)
('Done', 900)
('Done', 1000)
('Done', 1100)
('Done', 1200)
('Done', 1300)
('Done', 1400)
CRIM evaluation:
MRR: 0.27074
P@1: 0.22148
P@5: 0.13155
P@10: 0.12557


* Overfitting clearly manifests: "person" and "work-of-art" feature heavily as hypernyms, even for query terms that are unrelated to them.

### Experiment with dual projections.  Extended the first cycle to 25 epochs.  Reduced the learning rate of the second cycle to reduce overfitting, and trained it for 5 epochs

#### First Cycle
('Epochs: ', 25, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 2, 'train_embeddings: ', False, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'Dropout rate: ', 0.35, 'Kernel constraint: ', 'None')
('Epoch:', 1, 'Loss:', 160.74599489569664, 'Test Loss:', 160.929179251194)
('Epoch:', 2, 'Loss:', 96.68699912726879, 'Test Loss:', 96.50307583808899)
('Epoch:', 3, 'Loss:', 82.84739479422569, 'Test Loss:', 82.26891721785069)
('Epoch:', 4, 'Loss:', 73.6079184487462, 'Test Loss:', 72.73883473873138)
('Epoch:', 5, 'Loss:', 67.65975984185934, 'Test Loss:', 67.05146090686321)
('Epoch:', 6, 'Loss:', 63.158750265836716, 'Test Loss:', 62.18984428048134)
('Epoch:', 7, 'Loss:', 59.12502943724394, 'Test Loss:', 59.458139173686504)
('Epoch:', 8, 'Loss:', 55.11561170220375, 'Test Loss:', 56.17893383651972)
('Epoch:', 9, 'Loss:', 52.53363721072674, 'Test Loss:', 54.554423585534096)
('Epoch:', 10, 'Loss:', 50.518984742462635, 'Test Loss:', 53.33502481132746)
('Epoch:', 11, 'Loss:', 48.320613726973534, 'Test Loss:', 51.69379674643278)
('Epoch:', 12, 'Loss:', 46.352201879024506, 'Test Loss:', 51.4660424888134)
('Epoch:', 13, 'Loss:', 45.39920901507139, 'Test Loss:', 51.60422394424677)
('Epoch:', 14, 'Loss:', 44.41912394762039, 'Test Loss:', 50.1107277572155)
('Epoch:', 15, 'Loss:', 43.92356888204813, 'Test Loss:', 50.16625649482012)
('Epoch:', 16, 'Loss:', 42.618239261209965, 'Test Loss:', 49.68374668061733)
('Epoch:', 17, 'Loss:', 41.93088800087571, 'Test Loss:', 49.52678156644106)
('Epoch:', 18, 'Loss:', 42.28301604837179, 'Test Loss:', 49.41834541410208)
('Epoch:', 19, 'Loss:', 41.13799152523279, 'Test Loss:', 48.8290878534317)
('Epoch:', 20, 'Loss:', 40.83357220888138, 'Test Loss:', 48.22099205851555)
('Epoch:', 21, 'Loss:', 40.20593152567744, 'Test Loss:', 48.0087883323431)
('Epoch:', 22, 'Loss:', 40.49513000249863, 'Test Loss:', 47.994462229311466)
('Epoch:', 23, 'Loss:', 39.6955735757947, 'Test Loss:', 48.454381965100765)
('Epoch:', 24, 'Loss:', 39.38141195476055, 'Test Loss:', 48.52395910024643)
('Epoch:', 25, 'Loss:', 39.834498304873705, 'Test Loss:', 48.30932606011629)
Generating predictions...
('Done', 100)
('Done', 200)
('Done', 300)
('Done', 400)
('Done', 500)
('Done', 600)
('Done', 700)
('Done', 800)
('Done', 900)
('Done', 1000)
('Done', 1100)
('Done', 1200)
('Done', 1300)
('Done', 1400)
CRIM evaluation:
MRR: 0.11931
P@1: 0.07872
P@5: 0.05491
P@10: 0.05259

#### Second cycle
('Epochs: ', 5, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 2, 'train_embeddings: ', False, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'Dropout rate: ', 0.35, 'Kernel constraint: ', 'None')
('Epoch:', 1, 'Loss:', 30.962760228663683, 'Test Loss:', 46.86121924221516)
('Epoch:', 2, 'Loss:', 18.49303602334112, 'Test Loss:', 43.47840115427971)
('Epoch:', 3, 'Loss:', 12.296762353973463, 'Test Loss:', 41.97538521140814)
('Epoch:', 4, 'Loss:', 8.417490374995396, 'Test Loss:', 40.455049715936184)
('Epoch:', 5, 'Loss:', 6.027016263862606, 'Test Loss:', 40.4208921790123)
Generating predictions...
('Done', 100)
('Done', 200)
('Done', 300)
('Done', 400)
('Done', 500)
('Done', 600)
('Done', 700)
('Done', 800)
('Done', 900)
('Done', 1000)
('Done', 1100)
('Done', 1200)
('Done', 1300)
('Done', 1400)
CRIM evaluation:
MRR: 0.30662
P@1: 0.2515
P@5: 0.14699
P@10: 0.13961

-------------------------------------------------------------------------------------------------
### Experiment with two Phi layers. 
#### I insert dropout after the hyponym and hypernym embeddings and after the second phi.  Training goes on for 25 epochs.  The solution converges much more slowly than if we were using a single affine layer.

('Epochs: ', 25, 'Batch size: ', 32, 'm: ', 10, 'pki_k: ', 1, 'train_embeddings: ', False, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'Dropout rate: ', 0.35, 'Kernel constraint: ', 'None')
('Epoch:', 1, 'Loss:', 132.44914108514786, 'Test Loss:', 129.82295709848404)
('Epoch:', 2, 'Loss:', 95.7211739718914, 'Test Loss:', 92.90788823366165)
('Epoch:', 3, 'Loss:', 93.52208907902241, 'Test Loss:', 90.88466887176037)
('Epoch:', 4, 'Loss:', 91.18207442760468, 'Test Loss:', 89.7815543860197)
('Epoch:', 5, 'Loss:', 88.62042617797852, 'Test Loss:', 87.20721289515495)
('Epoch:', 6, 'Loss:', 85.08720774948597, 'Test Loss:', 83.62825165688992)
('Epoch:', 7, 'Loss:', 82.6170591711998, 'Test Loss:', 79.72680546343327)
('Epoch:', 8, 'Loss:', 78.23334312438965, 'Test Loss:', 74.8099833726883)
('Epoch:', 9, 'Loss:', 76.347172498703, 'Test Loss:', 72.44513177871704)
('Epoch:', 10, 'Loss:', 74.18829296529293, 'Test Loss:', 69.96501626074314)
('Epoch:', 11, 'Loss:', 72.2932140827179, 'Test Loss:', 67.46124893426895)
('Epoch:', 12, 'Loss:', 71.2011769413948, 'Test Loss:', 65.32345585525036)
('Epoch:', 13, 'Loss:', 70.07241632044315, 'Test Loss:', 63.981889829039574)
('Epoch:', 14, 'Loss:', 68.04309992492199, 'Test Loss:', 62.387570425868034)
('Epoch:', 15, 'Loss:', 67.2650830000639, 'Test Loss:', 61.06334821879864)
('Epoch:', 16, 'Loss:', 67.25092969834805, 'Test Loss:', 60.93150553107262)
('Epoch:', 17, 'Loss:', 66.12663190811872, 'Test Loss:', 60.26040391623974)
('Epoch:', 18, 'Loss:', 64.85526475310326, 'Test Loss:', 59.32122567296028)
('Epoch:', 19, 'Loss:', 64.72251350432634, 'Test Loss:', 58.52266174554825)
('Epoch:', 20, 'Loss:', 63.96191988885403, 'Test Loss:', 57.419067934155464)
('Epoch:', 21, 'Loss:', 62.82784986868501, 'Test Loss:', 56.976065531373024)
('Epoch:', 22, 'Loss:', 62.8604651093483, 'Test Loss:', 57.06210967898369)
('Epoch:', 23, 'Loss:', 62.14057156443596, 'Test Loss:', 56.878740444779396)
('Epoch:', 24, 'Loss:', 61.323521822690964, 'Test Loss:', 56.8377401381731)
('Epoch:', 25, 'Loss:', 61.14446556568146, 'Test Loss:', 56.42692677676678)

* Having more than one linear layer does not increase the hypothesis space of the model.
* Attempting a model with 2 hidden layers, each with a non-linear activation function, prevented the model from learning anything.


# Hard Clustering

## Common Routines

In [18]:
def get_embeddings_model(dim, embedding_matrix):
    """Build a reusable two-input model that looks word ids up in frozen pre-trained vectors.

    dim              -- width of each embedding vector
    embedding_matrix -- pre-trained weights; its first dimension is used as the
                        vocabulary size and it is loaded as the weights of layer 'WE'

    Returns a Keras Model mapping [hyponym id, hypernym id] to their two embeddings.
    Both inputs go through the same, non-trainable, embedding layer.
    """
    # each input carries exactly one word id per sample
    hypo_ids, hyper_ids = Input(shape=(1,)), Input(shape=(1,))

    # single lookup table applied to both inputs
    shared_lookup = Embedding(embedding_matrix.shape[0], dim, name='WE')

    lookup_model = Model(inputs=[hypo_ids, hyper_ids],
                         outputs=[shared_lookup(hypo_ids), shared_lookup(hyper_ids)])

    # load the pre-trained vectors and freeze them so training never updates them
    we_layer = lookup_model.get_layer(name='WE')
    we_layer.set_weights([embedding_matrix])
    we_layer.trainable = False

    return lookup_model

In [39]:
def get_cluster_CRIM_model(phi_k=1,
                           embeddings_layer=None,
                           embeddings_dim = 200,
                           phi_init = None,
                           phi_activity_regularisation = None,
                           sigmoid_kernel_regularisation = None,
                           sigmoid_bias_regularisation = None,
                           sigmoid_kernel_constraint = None,
                           dropout_rate = 0.,
                           learning_rate = 0.001
                  ):
    """Assemble and compile the projection model trained for one cluster.

    The hyponym embedding is passed through `phi_k` bias-free linear projections,
    each projection is dot-multiplied with the hypernym embedding, and the resulting
    score(s) feed a single sigmoid unit.

    phi_k                         -- number of projection (Phi) layers
    embeddings_layer              -- callable returning (hyponym, hypernym) embeddings
                                     for the two id inputs; required despite the None default
    embeddings_dim                -- output width of every Phi layer
    phi_init                      -- kernel initialiser of the Phi layers
    phi_activity_regularisation   -- activity regulariser of the Phi layers
    sigmoid_kernel_regularisation -- kernel regulariser of the sigmoid layer
    sigmoid_bias_regularisation   -- bias regulariser of the sigmoid layer
    sigmoid_kernel_constraint     -- kernel constraint of the sigmoid layer
    dropout_rate                  -- rate shared by all three Dropout layers
    learning_rate                 -- Adam learning rate

    Returns the model compiled with binary cross-entropy and an accuracy metric.
    """
    hypo_input  = Input(shape=(1,), name='Hyponym')
    hyper_input = Input(shape=(1,), name='Hypernym')

    hypo_vec, hyper_vec = embeddings_layer([hypo_input, hyper_input])

    # dropout on both looked-up embeddings
    hypo_vec = Dropout(dropout_rate, name='Dropout_Hypo')(hypo_vec)
    hyper_vec = Dropout(dropout_rate, name='Dropout_Hyper')(hyper_vec)

    # one linear, bias-free projection of the hyponym per Phi layer
    projections = [
        Dense(embeddings_dim, activation=None, use_bias=False,
              activity_regularizer=phi_activity_regularisation,
              kernel_initializer=phi_init,
              name='Phi%d' % (k))(hypo_vec)
        for k in range(phi_k)
    ]

    if phi_k == 1:
        # single projection: flatten both operands before the dot product
        projected = Flatten()(projections[0])
        hyper_vec = Flatten()(hyper_vec)
    else:
        # several projections: stack them along axis 1
        projected = concatenate(projections, axis=1)

    projected = Dropout(dropout_rate, name='Dropout_Phi')(projected)

    # this is referred to as "s" in the "CRIM" paper
    score = Dot(axes=-1, normalize=False, name='DotProduct')([projected, hyper_vec])

    if phi_k > 1:
        score = Flatten()(score)

    predictions = Dense(1, activation="sigmoid", name='Prediction',
                        use_bias=True,
                        kernel_initializer='random_normal',
                        kernel_constraint=sigmoid_kernel_constraint,
                        bias_initializer='random_normal',
                        kernel_regularizer=sigmoid_kernel_regularisation,
                        bias_regularizer=sigmoid_bias_regularisation
                       )(score)

    model = Model(inputs=[hypo_input, hyper_input], outputs=predictions)

    # Adam with both betas at 0.9 and gradient-norm clipping at 1
    optimiser = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.9, clipnorm=1.)
    model.compile(optimizer=optimiser, loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [20]:
# light-weight data class pairing one cluster's model with the training samples
# assigned to that cluster and the validation data
class ClusterHybrid:
    """One hard cluster: its own CRIM model, its training pairs and loss counters.

    Attributes set by __init__:
      model                              -- result of _init_model
      cluster_id                         -- the `cluster` label this instance represents
      tokenizer                          -- orig_data.tokenizer
      train_query / train_hyper          -- training terms whose cluster prediction equals `cluster`
      valid_query / valid_hyper          -- orig_data's validation terms, unfiltered
      *_seq                              -- int32 arrays of the tokenised versions of the four lists
      loss / test_loss                   -- running totals, updated via update_loss / update_test_loss
    """

    def __init__(self, cluster, cluster_predictions,
                 orig_data,
                 phi_init, sigmoid_kernel_constraint,
                 embeddings_layer, dropout_rate, learning_rate
                 ):
        """Create the model and select the training samples belonging to `cluster`.

        cluster             -- label of this cluster
        cluster_predictions -- array with one cluster label per training sample
        orig_data           -- object exposing tokenizer, embeddings_dim, train_query,
                               train_hyper, valid_query and valid_hyper
        remaining arguments -- forwarded to _init_model
        """
        # use the embeddings layer handed to the constructor; the previous code read a
        # notebook-level global (`embedding_layer`) and ignored this argument
        self.model = self._init_model(phi_init=phi_init,
                                      embeddings_layer=embeddings_layer,
                                      embeddings_dim=orig_data.embeddings_dim,
                                      sigmoid_kernel_constraint=sigmoid_kernel_constraint,
                                      dropout_rate=dropout_rate, learning_rate=learning_rate)

        self.cluster_id = cluster
        self.tokenizer = orig_data.tokenizer

        # positions of the training samples assigned to this cluster (computed once);
        # lists rather than lazy `map` objects so they can be iterated repeatedly
        member_rows = np.where(cluster_predictions == cluster)[0]
        self.train_query = [orig_data.train_query[row] for row in member_rows]
        self.train_hyper = [orig_data.train_hyper[row] for row in member_rows]

        self.valid_query = orig_data.valid_query
        self.valid_hyper = orig_data.valid_hyper

        def to_id_array(terms):
            # tokenise the terms and pack the id sequences into an int32 array
            return np.asarray(orig_data.tokenizer.texts_to_sequences(terms), dtype='int32')

        self.train_query_seq = to_id_array(self.train_query)
        self.train_hyper_seq = to_id_array(self.train_hyper)
        self.valid_query_seq = to_id_array(self.valid_query)
        self.valid_hyper_seq = to_id_array(self.valid_hyper)

        self.loss = 0.
        self.test_loss = 0.

    def _init_model(self, phi_init, sigmoid_kernel_constraint,
                    embeddings_layer, embeddings_dim,
                    dropout_rate, learning_rate):
        """Return a compiled model from get_cluster_CRIM_model with the given settings."""
        return get_cluster_CRIM_model(phi_init=phi_init,
                                      embeddings_layer=embeddings_layer, embeddings_dim=embeddings_dim,
                                      sigmoid_kernel_constraint=sigmoid_kernel_constraint,
                                      dropout_rate=dropout_rate, learning_rate=learning_rate)

    def update_loss(self, new_loss):
        """Add `new_loss` to the running training loss."""
        self.loss += new_loss

    def update_test_loss(self, new_loss):
        """Add `new_loss` to the running test loss."""
        self.test_loss += new_loss

## Hard Clustering Specific

In [70]:
from sklearn.cluster import KMeans

# number of k-means clusters; the cluster-initialisation cell builds one ClusterHybrid per cluster
cluster_k = 5

In [34]:
# build, for every training pair, the embedding of the query and of its hypernym,
# then take their difference (hypernym - query) as the clustering feature
training_query_vector = np.zeros((len(data.train_query), data.embeddings_dim))
training_hyper_vector = np.zeros((len(data.train_hyper), data.embeddings_dim))

word_index = data.tokenizer.word_index
for row, query_term in enumerate(data.train_query):
    # word -> tokenizer id -> row of the embedding matrix
    training_query_vector[row] = data.embedding_matrix[word_index[query_term]]
    training_hyper_vector[row] = data.embedding_matrix[word_index[data.train_hyper[row]]]

train_offsets = training_hyper_vector - training_query_vector

In [71]:
# cluster the offset vectors into cluster_k groups; random_state is fixed at 42
km = KMeans(n_clusters = cluster_k, n_jobs=-1, random_state=42)
# fit and display the cluster label assigned to each training offset
km.fit_predict(train_offsets)

array([2, 3, 3, ..., 4, 3, 0], dtype=int32)

In [72]:
from collections import Counter
# number of training offsets assigned to each k-means cluster
Counter(km.predict(train_offsets))

Counter({0: 310, 1: 2762, 2: 2665, 3: 2318, 4: 3724})

In [110]:
# list the (query, hypernym) training pairs whose offset k-means assigns to cluster 4
map(lambda x: (data.train_query[x], data.train_hyper[x]), np.where(km.predict(train_offsets) == 4)[0])

[(u'turonian', u'geologic_timescale'),
 (u'turonian', u'geological_period'),
 (u'abhorrence', u'distaste'),
 (u'abhorrence', u'disgust'),
 (u'tropical_storm', u'windstorm'),
 (u'tropical_storm', u'violent_storm'),
 (u'tropical_storm', u'storm_damage'),
 (u'tropical_storm', u'storm'),
 (u'tropical_storm', u'tempest'),
 (u'tropical_storm', u'wind'),
 (u'pollution', u'dirtiness'),
 (u'pollution', u'dirtying'),
 (u'pollution', u'sanitary_condition'),
 (u'pollution', u'uncleanness'),
 (u'photomicrograph', u'picture'),
 (u'photomicrograph', u'photograph'),
 (u'photomicrograph', u'photo'),
 (u'swamp_gum', u'eucalyptus_tree'),
 (u'swamp_gum', u'gum_tree'),
 (u'wing', u'emblem'),
 (u'wing', u'air_unit'),
 (u'cumulus', u'natural_phenomenon'),
 (u'cumulus', u'phenomenon'),
 (u'cumulus', u'atmospheric_phenomenon'),
 (u'silver', u'gray'),
 (u'silver', u'badge_of_honour'),
 (u'silver', u'riches'),
 (u'silver', u'hoarded_wealth'),
 (u'silver', u'treasure'),
 (u'silver', u'cutlery'),
 (u'sand_verbena'

In [69]:
# implement training algorithm modification to deal with clusters
def train_on_clusters(models,      # the clustered models on which parameters will be learnt
                      epochs,      # number of epochs to run          
                      batch_size,  # size of mini-batch 
                      m,           # number of negative samples
                      #data,        # data required for training                              
                      neg_strategy
                     ):                                   
    
    # train algorithm
    for epoch in range(epochs):
        # train each model on their corresponding data
        for model in models:            
            print ("Doing model: ", model.cluster_id, "; epoch: ", epoch)
            # reset loss
            model.loss = 0.
            model.test_loss = 0.
            
            samples = np.arange(len(model.train_query_seq))
            validation_samples = np.arange(len(model.valid_query_seq))
                                        
            np.random.shuffle(samples)                    
                        
            for b in range(0, len(samples), batch_size):    
                if ((b + 1) % 500) == 0:
                    print ('Model: ', model.cluster_id, '; processed ', idx+1, 'samples.')
                                    
                batch_X_term, batch_X_hyper, batch_y_label =\
                    extend_batch_with_negatives(model.train_query_seq[b:b + batch_size], 
                                                model.train_hyper_seq[b:b + batch_size],
                                                neg_strategy,
                                                model.tokenizer, m
                                               )  

                #print model.cluster_id, len(batch_X_term)
                model.update_loss(model.model.train_on_batch([batch_X_term, batch_X_hyper], batch_y_label)[0])                                
                
        # validate on entire validation set after training each model
            
        test_query, test_hyper, test_y_label =\
            extend_batch_with_negatives(model.valid_query_seq, 
                                        model.valid_hyper_seq,
                                        neg_strategy,
                                        model.tokenizer, m
                                       )  
        #batch_label = [1.] * batch_query.shape[0]
        for q, h, l in zip(test_query, test_hyper, test_y_label):                                    
            test_losses = list(map(lambda c: c.model.test_on_batch([q, h], [l])[0], models))
            best_cluster = np.argmin(test_losses)
            models[best_cluster].update_test_loss(test_losses[best_cluster])
                            
        print('Epoch:', epoch+1,\
              'Loss:', np.mean([model.loss for model in models]),\
              'Test Loss:', np.mean([model.test_loss for model in models]))    

In [276]:
# frozen embedding lookup handed to every per-cluster model
embeddings_layer = get_embeddings_model(dim=data.embeddings_dim, embedding_matrix=data.embedding_matrix)

# k-means label of every training offset; decides which samples each cluster trains on
cluster_predictions = km.predict(train_offsets)

# settings common to all clusters; only the cluster label differs between instances
cluster_settings = dict(cluster_predictions=cluster_predictions,
                        orig_data=data,
                        phi_init=random_identity,
                        sigmoid_kernel_constraint=None,
                        embeddings_layer=embeddings_layer,
                        dropout_rate=0.3,
                        learning_rate=0.001)

# one ClusterHybrid per k-means cluster, in label order
cluster_list = [ClusterHybrid(cluster=c, **cluster_settings) for c in range(cluster_k)]

In [277]:
# Hard-clustering training run: declares the experiment settings, then trains cluster_list.
# NOTE(review): only cluster_list, epochs, batch_size, m and
# neg_sampling_options[negative_option] are passed to train_on_clusters below. The other
# names defined here (phi_init_options, kernel_constraints, phi_k, train_embeddings,
# phi_init_option, kernel_constraint_option, dropout_rate, learning_rate) are not read
# in this cell - confirm they match the values used when cluster_list was built.

# negative sampling options
neg_sampling_options = {'synonym':data.synonyms,                                                 
                        'random':data.random_words
                       }

# phi random init options
phi_init_options = {'random_plus_identity': random_plus_identity,
                    'random_identity': random_identity, 
                    'random_normal': random_normal}

# available constraints for the kernel of the sigmoid layer
kernel_constraints = {'None': None, 'ForceToOne': ForceToOne()}

# positive batch size
batch_size = 32

# number of passes over the training data
epochs = 10


# number of negative samples
m = 1
# number of projections
phi_k = 1
# train (True) or freeze
train_embeddings = False
# negative sample strategy (key into neg_sampling_options)
negative_option = 'random'
# initialise phi strategy (key into phi_init_options)
phi_init_option = 'random_identity'
# constrain LR parameter (key into kernel_constraints)
kernel_constraint_option = 'None'
# dropout rate
dropout_rate = 0.3
# optimiser learning rate
learning_rate = 0.001


print ("Start training")
train_on_clusters(cluster_list, epochs, batch_size, m, neg_sampling_options[negative_option])

Start training
('Doing model: ', 0, '; epoch: ', 0)
('Doing model: ', 1, '; epoch: ', 0)
('Doing model: ', 2, '; epoch: ', 0)
('Doing model: ', 3, '; epoch: ', 0)
('Doing model: ', 4, '; epoch: ', 0)
('Epoch:', 1, 'Loss:', 51.20275011062622, 'Test Loss:', 54.16907963752747)
('Doing model: ', 0, '; epoch: ', 1)
('Doing model: ', 1, '; epoch: ', 1)
('Doing model: ', 2, '; epoch: ', 1)
('Doing model: ', 3, '; epoch: ', 1)
('Doing model: ', 4, '; epoch: ', 1)
('Epoch:', 2, 'Loss:', 49.9156032204628, 'Test Loss:', 51.34425748586655)
('Doing model: ', 0, '; epoch: ', 2)
('Doing model: ', 1, '; epoch: ', 2)
('Doing model: ', 2, '; epoch: ', 2)
('Doing model: ', 3, '; epoch: ', 2)
('Doing model: ', 4, '; epoch: ', 2)
('Epoch:', 3, 'Loss:', 47.629679238796236, 'Test Loss:', 47.31288024187088)
('Doing model: ', 0, '; epoch: ', 3)
('Doing model: ', 1, '; epoch: ', 3)
('Doing model: ', 2, '; epoch: ', 3)
('Doing model: ', 3, '; epoch: ', 3)
('Doing model: ', 4, '; epoch: ', 3)
('Epoch:', 4, 'Loss:

In [278]:
print ("Generating predictions...")
# NOTE(review): this rebinds cluster_predictions, which the cluster-initialisation cell
# uses for the k-means labels. Other cells call
# predict_cluster_hypernyms(terms, tokenizer, clusters) - confirm which signature is current.
cluster_predictions = predict_cluster_hypernyms(data, cluster_list)

print ("CRIM evaluation:")
score_names, all_scores = get_evaluation_scores((data.test_query, data.test_hyper), cluster_predictions)

# report each metric as its mean over the entries of all_scores, rounded to 5 decimals
n_scored = len(all_scores)
for k, score_name in enumerate(score_names):
    metric_total = sum(score_list[k] for score_list in all_scores)
    print (score_name + ': ' + str(round(metric_total / n_scored, 5)))


Generating predictions...
('Done', 100)
('Done', 200)
('Done', 300)
('Done', 400)
('Done', 500)
('Done', 600)
('Done', 700)
('Done', 800)
('Done', 900)
('Done', 1000)
('Done', 1100)
('Done', 1200)
('Done', 1300)
('Done', 1400)
CRIM evaluation:
MRR: 0.03292
P@1: 0.02402
P@5: 0.01587
P@10: 0.01505


In [44]:
#cluster_list[0].model.get_layer(name='Phi0').get_weights()[0]

# Per-cluster summary: index, training loss, test loss and the kernel of the sigmoid layer.
# The loop variable is not called `m`: at notebook level that name holds the number of
# negative samples passed to the training functions, and rebinding it here would silently
# change any training cell re-run afterwards.
for idx, cluster in enumerate(clusters):
    print ("%s %s %s %s" % (idx, cluster.loss, cluster.test_loss,
                            cluster.model.get_layer(name='Prediction').get_weights()[0]))

# score one hand-picked (hyponym, hypernym) pair with every cluster model
hypo_id = data.tokenizer.word_index['tussle']
hyper_id = data.tokenizer.word_index['student']
for cluster in clusters:
    print (cluster.model.predict([[hypo_id], [hyper_id]])[0])

0 3763.100256009726 48.5477934544906 [[1.]]
1 1261.7794247418642 52.96302166581154 [[1.]]
[0.95837396]
[0.26617956]


## Hard Clustering Results

* batch_size = 32
* epochs = 10
* m = 1
* phi_k = 1
* train_embeddings = False
* negative_option = 'random'
* phi_init_option = 'random_identity'
* kernel_constraint_option = 'None'
* dropout_rate = 0.3
* learning_rate = 0.001

* Performed hard clustering using k = 5.  Vector offset used as basis for clustering.

Start training
('Doing model: ', 0, '; epoch: ', 0)
('Doing model: ', 1, '; epoch: ', 0)
('Doing model: ', 2, '; epoch: ', 0)
('Doing model: ', 3, '; epoch: ', 0)
('Doing model: ', 4, '; epoch: ', 0)
('Epoch:', 1, 'Loss:', 51.20275011062622, 'Test Loss:', 54.16907963752747)
('Doing model: ', 0, '; epoch: ', 1)
('Doing model: ', 1, '; epoch: ', 1)
('Doing model: ', 2, '; epoch: ', 1)
('Doing model: ', 3, '; epoch: ', 1)
('Doing model: ', 4, '; epoch: ', 1)
('Epoch:', 2, 'Loss:', 49.9156032204628, 'Test Loss:', 51.34425748586655)
('Doing model: ', 0, '; epoch: ', 2)
('Doing model: ', 1, '; epoch: ', 2)
('Doing model: ', 2, '; epoch: ', 2)
('Doing model: ', 3, '; epoch: ', 2)
('Doing model: ', 4, '; epoch: ', 2)
('Epoch:', 3, 'Loss:', 47.629679238796236, 'Test Loss:', 47.31288024187088)
('Doing model: ', 0, '; epoch: ', 3)
('Doing model: ', 1, '; epoch: ', 3)
('Doing model: ', 2, '; epoch: ', 3)
('Doing model: ', 3, '; epoch: ', 3)
('Doing model: ', 4, '; epoch: ', 3)
('Epoch:', 4, 'Loss:', 44.66189357638359, 'Test Loss:', 42.583502805233)
('Doing model: ', 0, '; epoch: ', 4)
('Doing model: ', 1, '; epoch: ', 4)
('Doing model: ', 2, '; epoch: ', 4)
('Doing model: ', 3, '; epoch: ', 4)
('Doing model: ', 4, '; epoch: ', 4)
('Epoch:', 5, 'Loss:', 41.36941378712654, 'Test Loss:', 37.747225880622864)
('Doing model: ', 0, '; epoch: ', 5)
('Doing model: ', 1, '; epoch: ', 5)
('Doing model: ', 2, '; epoch: ', 5)
('Doing model: ', 3, '; epoch: ', 5)
('Doing model: ', 4, '; epoch: ', 5)
('Epoch:', 6, 'Loss:', 38.09736560583114, 'Test Loss:', 33.176356403529645)
('Doing model: ', 0, '; epoch: ', 6)
('Doing model: ', 1, '; epoch: ', 6)
('Doing model: ', 2, '; epoch: ', 6)
('Doing model: ', 3, '; epoch: ', 6)
('Doing model: ', 4, '; epoch: ', 6)
('Epoch:', 7, 'Loss:', 34.89849599599838, 'Test Loss:', 29.117837768793105)
('Doing model: ', 0, '; epoch: ', 7)
('Doing model: ', 1, '; epoch: ', 7)
('Doing model: ', 2, '; epoch: ', 7)
('Doing model: ', 3, '; epoch: ', 7)
('Doing model: ', 4, '; epoch: ', 7)
('Epoch:', 8, 'Loss:', 32.04033596813679, 'Test Loss:', 25.581615015119315)
('Doing model: ', 0, '; epoch: ', 8)
('Doing model: ', 1, '; epoch: ', 8)
('Doing model: ', 2, '; epoch: ', 8)
('Doing model: ', 3, '; epoch: ', 8)
('Doing model: ', 4, '; epoch: ', 8)
('Epoch:', 9, 'Loss:', 29.56286123096943, 'Test Loss:', 22.568919814378024)
('Doing model: ', 0, '; epoch: ', 9)
('Doing model: ', 1, '; epoch: ', 9)
('Doing model: ', 2, '; epoch: ', 9)
('Doing model: ', 3, '; epoch: ', 9)
('Doing model: ', 4, '; epoch: ', 9)
('Epoch:', 10, 'Loss:', 27.37933742403984, 'Test Loss:', 20.065976665169)


CRIM evaluation:
MRR: 0.03292
P@1: 0.02402
P@5: 0.01587
P@10: 0.01505

---------------------------------------------------------------------------------------------


* We could try to train a simple model for a few epochs.  
* Once that is done we can refine the results by creating clusters, each initialised with the phi weights from the previous attempt

# Soft Clustering

In [21]:
class YamaneCluster(ClusterHybrid):
    """Cluster used by the soft-clustering training loop.

    The constructor takes no data and does not call ClusterHybrid.__init__; it only
    builds the model through the inherited _init_model and zeroes the bookkeeping
    fields: epoch_count, loss, test_loss and the list of MRR scores.
    """

    def __init__(self, phi_init,
                 embeddings_layer, embeddings_dim,
                 sigmoid_kernel_constraint,
                 dropout_rate, learning_rate):
        """Create the Keras model from the given settings and reset all counters."""
        model_options = dict(phi_init=phi_init,
                             embeddings_layer=embeddings_layer,
                             embeddings_dim=embeddings_dim,
                             sigmoid_kernel_constraint=sigmoid_kernel_constraint,
                             dropout_rate=dropout_rate,
                             learning_rate=learning_rate)
        self.model = self._init_model(**model_options)

        # bookkeeping
        self.epoch_count, self.loss, self.test_loss = 0, 0., 0.
        self.mrr = []

    def increment_epoch(self):
        """Record that one more epoch has been run for this cluster."""
        self.epoch_count = self.epoch_count + 1

In [67]:
def yamane_train(
    epochs,      # number of epochs to run
    m,           # number of negative samples
    data,        # class instance containing all the data required for training/testing
    embedding_layer,     # shared embeddings layer
    threshold    = 0.15,     # threshold; similarity below this score will trigger new cluster
    negative_option = 'random', # key selecting the negative-sampling source
    phi_init_option = None,     # phi dense layer initialisation strategy
    sigmoid_constraint_option = 'None',
    dropout_rate = 0.,
    learning_rate = 0.001,
    cluster_max = 5
):
    """Jointly learn projection models and a soft assignment of training pairs to them.

    Training starts with a single cluster. Samples are processed one at a time: every
    cluster model scores the (query, hypernym) pair and the pair is assigned to the
    best-scoring cluster. From the second epoch on, a new cluster is created (and the
    pair assigned to it) whenever the best score is below `threshold` and fewer than
    `cluster_max` clusters exist. The assigned cluster is then updated on the pair plus
    its negative samples, unless that cluster has already been through `epochs` epochs.
    At the end of every epoch the MRR on the validation queries is appended to
    clusters[0].mrr and every cluster's epoch counter is incremented. The loop stops
    once every cluster has been through at least `epochs` epochs.

    epochs                    -- epochs each cluster must go through
    m                         -- number of negative samples, forwarded to extend_batch_with_negatives
    data                      -- object exposing tokenizer, train_query, train_hyper,
                                 valid_query, valid_hyper, embeddings_dim, synonyms, random_words
    embedding_layer           -- embeddings layer shared by all cluster models
    threshold                 -- score below which a new cluster is spawned
    negative_option           -- 'synonym' or 'random'
    phi_init_option           -- 'random_identity', 'random_normal' or 'random_plus_identity';
                                 the default None is not a valid key and raises KeyError
    sigmoid_constraint_option -- 'ForceToOne' or 'None'
    dropout_rate              -- forwarded to every cluster model
    learning_rate             -- forwarded to every cluster model
    cluster_max               -- maximum number of clusters

    Returns (clusters, sample_clusters): the list of clusters and an int32 array holding
    the last cluster index assigned to each training sample.

    Side effect: reseeds NumPy's global random generator with 42.
    """
    phi_init_options = {'random_identity': random_identity,
                        'random_normal': random_normal,
                        'random_plus_identity': random_plus_identity}
    neg_sampling_options = {'synonym':data.synonyms, 'random':data.random_words}
    sigmoid_constraint_options = {'ForceToOne': ForceToOne(), 'None': None}

    sigmoid_kernel_constraint = sigmoid_constraint_options[sigmoid_constraint_option]

    neg_strategy = neg_sampling_options[negative_option]

    def new_cluster():
        # every cluster is built from the same settings
        return YamaneCluster(phi_init = phi_init_options[phi_init_option],
                             embeddings_layer = embedding_layer,
                             embeddings_dim = data.embeddings_dim,
                             sigmoid_kernel_constraint = sigmoid_kernel_constraint,
                             dropout_rate = dropout_rate,
                             learning_rate = learning_rate)

    # tokenise the training query and hypernym terms and pack them into int32 arrays
    term_train_seq, hyper_train_seq = [
        np.asarray(data.tokenizer.texts_to_sequences(terms), dtype='int32')
        for terms in (data.train_query, data.train_hyper)
    ]

    # cluster index of each training sample; zeros, so every sample starts in cluster 0
    sample_clusters = np.zeros(len(term_train_seq), dtype='int32')

    print ("m: ", m, "lambda: ", threshold, "max epoch per cluster: ", epochs,
           "Negative sampling: ", negative_option, "Phi Init: ", phi_init_option,
           "sigmoid_kernel_constraint: ", sigmoid_constraint_option,
           "dropout: ", dropout_rate, "learning_rate: ", learning_rate,
           "cluster_max: ", cluster_max
          )

    print ("Sample clusters size: ", len(sample_clusters))

    # list containing 1 model per cluster; training starts with a single cluster
    clusters = [new_cluster()]

    # training set indices, reshuffled at the start of every epoch
    indices = np.arange(len(term_train_seq))

    # seed random generator (after the first cluster exists, as before)
    np.random.seed(42)

    while np.min([c.epoch_count for c in clusters]) < epochs:
        # reset the loss of clusters that are still training; always reset the test loss
        for c in clusters:
            if c.epoch_count < epochs:
                c.loss = 0.
            c.test_loss = 0.

        # shuffle indices every epoch
        np.random.shuffle(indices)

        # stochastic gradient descent, one positive sample at a time
        for idx, i in enumerate(indices):
            if (idx + 1) % 1000 == 0:
                print ("Processed ", idx+1, "samples...")

            # score the pair with every cluster and pick the best one
            sim = [cluster.model.predict([term_train_seq[i], hyper_train_seq[i]])
                   for cluster in clusters]
            max_sim = np.argmax(sim)

            # spawn a cluster only after the first epoch, only when even the best
            # score is below the threshold, and only while under cluster_max
            if ((clusters[0].epoch_count > 0) and (sim[max_sim] < threshold) and (len(clusters) < cluster_max)):
                # log the pair that triggered the new cluster and its best score
                print ("%s %s" % (data.tokenizer.index_word[term_train_seq[i][0]],
                                  data.tokenizer.index_word[hyper_train_seq[i][0]]))
                print ("%s %s" % (max_sim, sim[max_sim]))
                clusters.append(new_cluster())

                # the sample goes to the newly created cluster
                z_i = len(clusters) - 1
            else:
                z_i = max_sim
            sample_clusters[i] = z_i

            # if current cluster reached/exceeded epoch count, skip current sample (i.e don't update cluster)
            if clusters[z_i].epoch_count < epochs:
                # extend the positive sample with negative samples
                batch_X_term, batch_X_hyper, batch_y_label =\
                    extend_batch_with_negatives(term_train_seq[i],
                                                hyper_train_seq[i],
                                                neg_strategy,
                                                data.tokenizer, m
                                               )

                # update parameters of cluster; train_on_batch returns the loss first
                clusters[z_i].update_loss(
                    clusters[z_i].model.train_on_batch([batch_X_term, batch_X_hyper], batch_y_label)[0]
                )

        ####################### END OF EPOCH #######################

        # MRR on the validation queries is used as the validation metric
        print ("Running evaluation on trial data set...")
        predictions = predict_cluster_hypernyms(data.valid_query, data.tokenizer, clusters)
        _, all_scores = get_evaluation_scores((data.valid_query, data.valid_hyper), predictions)
        mrr = round(sum([score_list[0] for score_list in all_scores]) / len(all_scores), 5)
        # the MRR history is kept on the first cluster only
        clusters[0].mrr.append(mrr)

        # increase epoch count for clusters
        for cluster in clusters:
            cluster.increment_epoch()

        print('Epoch:', max([c.epoch_count for c in clusters]), 'Cluster #:', len(clusters) ,
              'Loss:', np.mean([c.loss for c in clusters]),
              'Test MRR:', mrr)
    return clusters, sample_clusters

In [68]:
import datetime

# initialise the embedding layer which will be shared among all clusters
embedding_layer = get_embeddings_model(dim=data.embeddings_dim, embedding_matrix=data.embedding_matrix)
epochs = 10   # epochs each cluster is trained for
m = 10        # negative samples per positive sample

print ("Training started at: %s" % (datetime.datetime.now()))

# soft-clustering run; returns the learnt clusters and the cluster index of every training pair
clusters, sample_clusters = yamane_train(
    epochs=epochs,
    m=m,
    data=data,
    embedding_layer=embedding_layer,
    threshold=0.15,
    negative_option='random',
    phi_init_option='random_plus_identity',
    sigmoid_constraint_option='ForceToOne',
    dropout_rate=0.3,
    learning_rate=0.001,
    cluster_max=25,
)

print ("Training concluded at: %s" % (datetime.datetime.now()))



Training started at: 2018-12-12 21:48:50.279247
('m: ', 10, 'lambda: ', 0.15, 'max epoch per cluster: ', 10, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_plus_identity', 'sigmoid_kernel_constraint: ', 'ForceToOne', 'dropout: ', 0.3, 'learning_rate: ', 0.001, 'cluster_max: ', 25)
('Sample clusters size: ', 11779)
('Processed ', 1000, 'samples...')
('Processed ', 2000, 'samples...')
('Processed ', 3000, 'samples...')
('Processed ', 4000, 'samples...')
('Processed ', 5000, 'samples...')
('Processed ', 6000, 'samples...')
('Processed ', 7000, 'samples...')
('Processed ', 8000, 'samples...')
('Processed ', 9000, 'samples...')
('Processed ', 10000, 'samples...')
('Processed ', 11000, 'samples...')
Running evaluation on trial data set...
('Epoch:', 1, 'Cluster #:', 1, 'Loss:', 2645.7672815788537, 'Test MRR:', 0.10333)
coalition annotation
0 [[0.14232686]]
battersea_arts_centre constructed_structure
1 [[0.12914188]]
lyceum constructed_structure
1 [[0.14685696]]
('Processed ', 1000, '

In [70]:
# per-cluster bookkeeping: training loss, test loss, epochs run and MRR history
for c in clusters:
    print ("%s %s %s %s" % (c.loss, c.test_loss, c.epoch_count, c.mrr))


# (query, hypernym) training pairs whose last assignment is cluster 0
[(data.train_query[row], data.train_hyper[row]) for row in np.where(sample_clusters == 0)[0]]
#c15 = Counter(map(lambda x: data.train_hyper[x], np.where(sample_clusters == 18)[0]))
#sorted(c15.items(), key = lambda (k,v): v, reverse=True)
#Counter(sample_clusters)

353.48588428401854 0.0 11 [0.10333, 0.05333, 0.06417, 0.05733, 0.059, 0.078, 0.094, 0.08517, 0.08822, 0.07756, 0.06556]
38.093901408836246 0.0 10 []
13.049090251326561 0.0 10 []
47.21333260461688 0.0 10 []
19.317077357321978 0.0 10 []
30.870450855232775 0.0 10 []
12.441514313220978 0.0 10 []
12.182713583111763 0.0 10 []
15.092968666460365 0.0 10 []
20.62631884170696 0.0 10 []
16.031787231564522 0.0 10 []
7.054915741086006 0.0 10 []
9.180294819176197 0.0 10 []
21.313419584184885 0.0 10 []
65.07340764114633 0.0 10 []
11.535719199106097 0.0 10 []
13.696264486759901 0.0 10 []
47.03833513520658 0.0 10 []
26.871564440894872 0.0 10 []
12.247797943651676 0.0 10 []
68.35207805968821 0.0 10 []
8.792612202465534 0.0 10 []
13.14311645552516 0.0 10 []
63.05965260975063 0.0 10 []
27.230451131239533 0.0 10 []


[(u'blackfly', u'insect'),
 (u'turonian', u'physical_property'),
 (u'turonian', u'geological_period'),
 (u'turonian', u'geological_time'),
 (u'turonian', u'geologic_time'),
 (u'tropical_storm', u'atmosphere'),
 (u'tropical_storm', u'windstorm'),
 (u'tropical_storm', u'violent_storm'),
 (u'tropical_storm', u'atmospheric_state'),
 (u'tropical_storm', u'storm_damage'),
 (u'tropical_storm', u'atmospheric_phenomenon'),
 (u'tropical_storm', u'storm'),
 (u'tropical_storm', u'cyclone'),
 (u'tropical_storm', u'natural_phenomenon'),
 (u'tropical_storm', u'tempest'),
 (u'tropical_storm', u'wind'),
 (u'militarization', u'social_control'),
 (u'pollution', u'environmental_condition'),
 (u'photomicrograph', u'digital_image'),
 (u'swamp_gum', u'plant'),
 (u'swamp_gum', u'woody_plant'),
 (u'swamp_gum', u'eucalypt'),
 (u'song', u'sound'),
 (u'song', u'work_of_art'),
 (u'song', u'musical_composition'),
 (u'song', u'vocal_music'),
 (u'song', u'musical_work'),
 (u'song', u'piece_of_music'),
 (u'song', u'hu

### Evaluate without attempting to cluster test terms

In [71]:
print ("Generating predictions...")
# predictions for the test queries using the learnt clusters only (no classifier passed)
yamane_predictions = predict_cluster_hypernyms(data.test_query, data.tokenizer, clusters)

print ("CRIM evaluation:")
score_names, all_scores = get_evaluation_scores((data.test_query, data.test_hyper), yamane_predictions)

# report each metric as its mean over the entries of all_scores, rounded to 5 decimals
n_scored = len(all_scores)
for k, score_name in enumerate(score_names):
    metric_total = sum(score_list[k] for score_list in all_scores)
    print (score_name + ': ' + str(round(metric_total / n_scored, 5)))



Generating predictions...
('Done', 100)
('Done', 200)
('Done', 300)
('Done', 400)
('Done', 500)
('Done', 600)
('Done', 700)
('Done', 800)
('Done', 900)
('Done', 1000)
('Done', 1100)
('Done', 1200)
('Done', 1300)
('Done', 1400)
CRIM evaluation:
MRR: 0.09636
MAP: 0.04405
P@1: 0.06871
P@5: 0.04286
P@10: 0.04008


In [74]:
# spot-check: hypernyms predicted for the single query term 'blackfly'
predict_cluster_hypernyms(['blackfly'], data.tokenizer, clusters)




{'blackfly': [u'animal',
  u'insect',
  u'integrating',
  u'information_systems',
  u'information_management',
  u'implementation',
  u'agricultural_pest',
  u'systems',
  u'data_management',
  u'information_system',
  u'nonliving',
  u'waterbody',
  u'technical',
  u'integrated',
  u'enable']}

### Train KNN classifier on clustering data jointly learnt by model

In [76]:
from sklearn.neighbors import KNeighborsClassifier

# Distance-weighted 5-nearest-neighbour classifier; the class label is the cluster index.
neigh = KNeighborsClassifier(n_neighbors=5, weights='distance')

# Build the KNN training set from the learnt clusters: tokenised training
# queries, grouped by the cluster id stored in sample_clusters.
train_seq = np.array(data.tokenizer.texts_to_sequences(data.train_query))

cluster_count = len(clusters)
X_knn = {}
for cluster_idx in range(cluster_count):
    member_rows = np.where(sample_clusters == cluster_idx)
    # Duplicate terms inside a cluster collapse to one embedding row each.
    uniq_terms = np.unique(train_seq[member_rows])
    X_knn[cluster_idx] = data.embedding_matrix[uniq_terms]

# Collect the per-cluster feature blocks and label vectors, then stack once.
feature_parts = [X_knn[0]]
label_parts = [np.zeros(X_knn[0].shape[0], dtype='int16')]
for k in range(1, cluster_count):
    feature_parts.append(X_knn[k])
    label_parts.append(np.array([k] * X_knn[k].shape[0]))

X_features = np.vstack(feature_parts)
y = np.hstack(label_parts)

neigh.fit(X_features, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='distance')

In [79]:
# Same evaluation as before, but the fitted KNN classifier `neigh` is passed
# to predict_cluster_hypernyms as an extra argument.
print("Generating predictions...")
yamane_predictions = predict_cluster_hypernyms(
    data.test_query, data.tokenizer, clusters, neigh)

print("CRIM evaluation:")
gold_pairs = (data.test_query, data.test_hyper)
score_names, all_scores = get_evaluation_scores(gold_pairs, yamane_predictions)

n_scored = len(all_scores)
for metric_pos, metric_name in enumerate(score_names):
    # Average the metric at this position over every entry of all_scores.
    metric_column = [entry_scores[metric_pos] for entry_scores in all_scores]
    print(metric_name + ': ' + str(round(sum(metric_column) / n_scored, 5)))


Generating predictions...
('Done', 100)
('Done', 200)
('Done', 300)
('Done', 400)
('Done', 500)
('Done', 600)
('Done', 700)
('Done', 800)
('Done', 900)
('Done', 1000)
('Done', 1100)
('Done', 1200)
('Done', 1300)
('Done', 1400)
CRIM evaluation:
MRR: 0.09942
MAP: 0.04663
P@1: 0.07071
P@5: 0.04554
P@10: 0.04292


In [146]:
#crim_get_top_hypernyms('turonian', None, clusters[15].model, data, 15)

In [38]:
from collections import Counter
#yamane_predictions['dashi']
# NOTE: the mean of the 'Phi0' layer weights of cluster 10 is computed here but
# not displayed, because only the last expression of a cell is echoed.
np.mean(clusters[10].model.get_layer(name='Phi0').get_weights()[0])
# Number of samples assigned to each cluster id.
Counter(sample_clusters)

Counter({0: 36,
         1: 3088,
         2: 1416,
         3: 1953,
         4: 137,
         5: 119,
         6: 245,
         7: 468,
         8: 989,
         9: 275,
         10: 396,
         11: 402,
         12: 240,
         13: 102,
         14: 191,
         15: 53,
         16: 35,
         17: 92,
         18: 86,
         19: 110,
         20: 554,
         21: 335,
         22: 126,
         23: 158,
         24: 173})

# Yamane Results

Training started...
* ('m: ', 10, 'lambda: ', 0.15, 'max epoch per cluster: ', 15, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_normal')
('Epoch:', 1, 'Cluster #:', 25, 'Loss:', 185.25700701117515, 'Test Loss:', 462.11729974992573)
('Epoch:', 2, 'Cluster #:', 25, 'Loss:', 94.78926725581289, 'Test Loss:', 257.9614339789539)
('Epoch:', 3, 'Cluster #:', 25, 'Loss:', 68.510555833322, 'Test Loss:', 168.7855049063693)
('Epoch:', 4, 'Cluster #:', 25, 'Loss:', 56.637282415106895, 'Test Loss:', 131.58040585272218)
('Epoch:', 5, 'Cluster #:', 25, 'Loss:', 48.74302359417314, 'Test Loss:', 114.5913256756349)
('Epoch:', 6, 'Cluster #:', 25, 'Loss:', 43.784197641848586, 'Test Loss:', 103.29953979119527)
('Epoch:', 7, 'Cluster #:', 25, 'Loss:', 39.676348549440156, 'Test Loss:', 94.9497262534533)
('Epoch:', 8, 'Cluster #:', 25, 'Loss:', 36.70157310644514, 'Test Loss:', 88.89629495775293)
('Epoch:', 9, 'Cluster #:', 25, 'Loss:', 33.68777084685018, 'Test Loss:', 85.53309237194794)
('Epoch:', 10, 'Cluster #:', 25, 'Loss:', 31.62626134102262, 'Test Loss:', 80.67354718668425)
('Epoch:', 11, 'Cluster #:', 25, 'Loss:', 30.100850980100514, 'Test Loss:', 83.60007793995133)
('Epoch:', 12, 'Cluster #:', 25, 'Loss:', 27.972241665369946, 'Test Loss:', 78.48380810162551)
('Epoch:', 13, 'Cluster #:', 25, 'Loss:', 26.79718875462626, 'Test Loss:', 78.7881210642461)
('Epoch:', 14, 'Cluster #:', 25, 'Loss:', 25.295018676088365, 'Test Loss:', 75.88227921965843)
('Epoch:', 15, 'Cluster #:', 25, 'Loss:', 24.44286949604233, 'Test Loss:', 71.73972092286799)
2018-12-10 02:39:45.787354

CRIM evaluation:
MRR: 0.02308
P@1: 0.01201
P@5: 0.01234
P@10: 0.01186

---------------------------------------------------------------
* ('m: ', 10, 'lambda: ', 0.1, 'max epoch per cluster: ', 20, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'sigmoid_constraint: ', 'ForceToOne', 'dropout: ', 0.3, 'learning_rate: ', 0.001)


('Sample clusters size: ', 11779)
('Epoch:', 1, 'Cluster #:', 30, 'Loss:', 164.64302213018138, 'Test Loss:', 16.821501479359963)
('Epoch:', 2, 'Cluster #:', 30, 'Loss:', 85.95771886678412, 'Test Loss:', 9.479251270126163)
('Epoch:', 3, 'Cluster #:', 30, 'Loss:', 60.89030767480532, 'Test Loss:', 7.113059009361313)
('Epoch:', 4, 'Cluster #:', 30, 'Loss:', 49.35616979487046, 'Test Loss:', 5.911477896573342)
('Epoch:', 5, 'Cluster #:', 30, 'Loss:', 42.839287023517926, 'Test Loss:', 5.237530594305887)
('Epoch:', 6, 'Cluster #:', 30, 'Loss:', 37.95000084499964, 'Test Loss:', 4.801821103322982)
('Epoch:', 7, 'Cluster #:', 30, 'Loss:', 34.69708706878203, 'Test Loss:', 4.445396709869783)
('Epoch:', 8, 'Cluster #:', 30, 'Loss:', 31.344791742943926, 'Test Loss:', 4.20229308469837)
('Epoch:', 9, 'Cluster #:', 30, 'Loss:', 29.270582812105324, 'Test Loss:', 4.001659726132235)
('Epoch:', 10, 'Cluster #:', 30, 'Loss:', 27.312983008645823, 'Test Loss:', 3.8331675905460845)
('Epoch:', 11, 'Cluster #:', 30, 'Loss:', 25.86424312723296, 'Test Loss:', 3.7695775713167223)
('Epoch:', 12, 'Cluster #:', 30, 'Loss:', 24.26373833860683, 'Test Loss:', 3.643787452736217)
('Epoch:', 13, 'Cluster #:', 30, 'Loss:', 23.194776870778636, 'Test Loss:', 3.5394367815238903)
('Epoch:', 14, 'Cluster #:', 30, 'Loss:', 22.077418524692256, 'Test Loss:', 3.4249304910639884)
('Epoch:', 15, 'Cluster #:', 30, 'Loss:', 20.919567144932383, 'Test Loss:', 3.3781916145887294)
('Epoch:', 16, 'Cluster #:', 30, 'Loss:', 20.137791508837896, 'Test Loss:', 3.305138163670991)
('Epoch:', 17, 'Cluster #:', 30, 'Loss:', 19.10677393638859, 'Test Loss:', 3.261422005968933)
('Epoch:', 18, 'Cluster #:', 30, 'Loss:', 18.686860465103926, 'Test Loss:', 3.1841310813365644)
('Epoch:', 19, 'Cluster #:', 30, 'Loss:', 17.963673188673297, 'Test Loss:', 3.117555515795205)
('Epoch:', 20, 'Cluster #:', 30, 'Loss:', 16.913953626791166, 'Test Loss:', 3.1116348788942028)
2018-12-10 12:28:42.204443

* Cluster distribution
Counter({0: 68,
         1: 516,
         2: 56,
         3: 1415,
         4: 345,
         5: 30,
         6: 452,
         7: 38,
         8: 45,
         9: 693,
         10: 2442,
         11: 18,
         12: 908,
         13: 28,
         14: 141,
         15: 1306,
         16: 84,
         17: 315,
         18: 412,
         19: 394,
         20: 364,
         21: 60,
         22: 189,
         23: 105,
         24: 24,
         25: 127,
         26: 731,
         27: 119,
         28: 34,
         29: 320})
         
CRIM evaluation:
MRR: 0.02787
P@1: 0.01935
P@5: 0.01447
P@10: 0.01298


* After fitting a KNN model on the learnt clusters and using it to choose which clusters are used to compute the predictions for each query term, the results improved more than two-fold (MRR from 0.02787 to 0.06088).

CRIM evaluation:
MRR: 0.06088
P@1: 0.0447
P@5: 0.02825
P@10: 0.02616

* However, the Yamane approach is, in general, disappointing with respect to the shared task challenge.  

----------------------------------------------------------------------------------------
Training started...
('m: ', 5, 'lambda: ', 0.05, 'max epoch per cluster: ', 10, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'sigmoid_kernel_constraint: ', 'None', 'dropout: ', 0.2, 'learning_rate: ', 0.001, 'cluster_max: ', 15)
('Sample clusters size: ', 11779)
('Epoch:', 1, 'Cluster #:', 3, 'Loss:', 1109.3663462693028, 'Test Loss:', 50.47739016701659)
('Epoch:', 2, 'Cluster #:', 4, 'Loss:', 443.7235447903154, 'Test Loss:', 17.8407350068523)
('Epoch:', 3, 'Cluster #:', 4, 'Loss:', 337.1281598309441, 'Test Loss:', 20.06709446922047)
('Epoch:', 4, 'Cluster #:', 4, 'Loss:', 267.10441667238143, 'Test Loss:', 19.677748865275674)
('Epoch:', 5, 'Cluster #:', 4, 'Loss:', 229.65082203910208, 'Test Loss:', 17.97698300000083)
('Epoch:', 6, 'Cluster #:', 4, 'Loss:', 193.899041174633, 'Test Loss:', 16.751907205700498)
('Epoch:', 7, 'Cluster #:', 4, 'Loss:', 174.12937078346036, 'Test Loss:', 16.246821108061788)
('Epoch:', 8, 'Cluster #:', 5, 'Loss:', 143.5690400129045, 'Test Loss:', 8.259589462610172)
('Epoch:', 9, 'Cluster #:', 5, 'Loss:', 122.85453648527569, 'Test Loss:', 10.32649805369518)
('Epoch:', 10, 'Cluster #:', 5, 'Loss:', 120.9294774463863, 'Test Loss:', 11.019405025631883)
('Epoch:', 11, 'Cluster #:', 5, 'Loss:', 117.86586058200308, 'Test Loss:', 11.315035963446595)
('Epoch:', 12, 'Cluster #:', 5, 'Loss:', 117.66658123196073, 'Test Loss:', 11.45983252564142)
('Epoch:', 13, 'Cluster #:', 5, 'Loss:', 117.53339115441747, 'Test Loss:', 11.49364635983179)
('Epoch:', 14, 'Cluster #:', 5, 'Loss:', 117.17556676329085, 'Test Loss:', 11.589454222113588)
('Epoch:', 15, 'Cluster #:', 5, 'Loss:', 116.93114152015157, 'Test Loss:', 11.61919019261072)
('Epoch:', 16, 'Cluster #:', 5, 'Loss:', 116.64158589304395, 'Test Loss:', 11.655326009184815)
('Epoch:', 17, 'Cluster #:', 5, 'Loss:', 116.35099397958227, 'Test Loss:', 11.681808996588686)
2018-12-10 21:24:08.962568

CRIM evaluation:
MRR: 0.05966
P@1: 0.03669
P@5: 0.02672
P@10: 0.02536

* After predicting query term clusters
CRIM evaluation:
MRR: 0.06088
P@1: 0.0447
P@5: 0.02825
P@10: 0.02616

----------------------------------------------------------------------------------------------
('m: ', 5, 'lambda: ', 0.1, 'max epoch per cluster: ', 10, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'sigmoid_kernel_constraint: ', 'ForceToOne', 'dropout: ', 0.3, 'learning_rate: ', 0.001, 'cluster_max: ', 15)
('Sample clusters size: ', 11779)
('Epoch:', 1, 'Cluster #:', 9, 'Loss:', 482.0829993935509, 'Test Loss:', 27.62408774221937)
('Epoch:', 2, 'Cluster #:', 10, 'Loss:', 281.1839863856323, 'Test Loss:', 17.055058593617286)
('Epoch:', 3, 'Cluster #:', 11, 'Loss:', 208.62401276072805, 'Test Loss:', 12.390350534596523)
('Epoch:', 4, 'Cluster #:', 12, 'Loss:', 166.41537183626983, 'Test Loss:', 9.618624108000708)
('Epoch:', 5, 'Cluster #:', 13, 'Loss:', 134.7260290642522, 'Test Loss:', 7.7731040777703475)
('Epoch:', 6, 'Cluster #:', 13, 'Loss:', 119.45636204084552, 'Test Loss:', 7.063226612456044)
('Epoch:', 7, 'Cluster #:', 13, 'Loss:', 108.30129835629151, 'Test Loss:', 6.616330906035168)
('Epoch:', 8, 'Cluster #:', 13, 'Loss:', 101.58308503461907, 'Test Loss:', 6.293532006980128)
('Epoch:', 9, 'Cluster #:', 13, 'Loss:', 94.7640274246464, 'Test Loss:', 5.976787171151955)
('Epoch:', 10, 'Cluster #:', 13, 'Loss:', 88.485667872232, 'Test Loss:', 5.796518988622176)
('Epoch:', 11, 'Cluster #:', 13, 'Loss:', 88.32206717526479, 'Test Loss:', 5.7801972194946964)
('Epoch:', 12, 'Cluster #:', 13, 'Loss:', 88.03503130540048, 'Test Loss:', 5.762659813637024)
('Epoch:', 13, 'Cluster #:', 13, 'Loss:', 87.9104809983742, 'Test Loss:', 5.754546612789104)
('Epoch:', 14, 'Cluster #:', 13, 'Loss:', 87.80618179810531, 'Test Loss:', 5.747131267707188)
2018-12-11 00:15:28.930610

CRIM evaluation:
MRR: 0.05927
P@1: 0.03602
P@5: 0.03072
P@10: 0.02855

* After KNN

CRIM evaluation:
MRR: 0.06201
P@1: 0.03736
P@5: 0.03205
P@10: 0.02976

------------------------------------------------------------------------------------------------

('Sample clusters size: ', 11779)
('m: ', 1, 'lambda: ', 0.13, 'max epoch per cluster: ', 10, 'Negative sampling: ', 'random', 'Phi Init: ', 'random_identity', 'sigmoid_kernel_constraint: ', 'ForceToOne', 'dropout: ', 0.3, 'learning_rate: ', 0.001, 'cluster_max: ', 25)
('Epoch:', 1, 'Cluster #:', 2, 'Loss:', 2540.3022823217325, 'Test Loss:', 53.87453193264082)
('Epoch:', 2, 'Cluster #:', 2, 'Loss:', 1667.1898109320173, 'Test Loss:', 40.18764992605429)
('Epoch:', 3, 'Cluster #:', 2, 'Loss:', 1361.9321219512858, 'Test Loss:', 34.321806932479376)
('Epoch:', 4, 'Cluster #:', 2, 'Loss:', 1178.6240238926068, 'Test Loss:', 30.980050023383228)
('Epoch:', 5, 'Cluster #:', 2, 'Loss:', 1044.863646138525, 'Test Loss:', 27.789724176647724)
('Epoch:', 6, 'Cluster #:', 3, 'Loss:', 643.6681745337323, 'Test Loss:', 16.18063761241986)
('Epoch:', 7, 'Cluster #:', 3, 'Loss:', 585.461576843336, 'Test Loss:', 14.75499623647435)
('Epoch:', 8, 'Cluster #:', 3, 'Loss:', 543.6171705845585, 'Test Loss:', 13.808908825924542)
('Epoch:', 9, 'Cluster #:', 3, 'Loss:', 507.70913083471424, 'Test Loss:', 12.98376774797604)
('Epoch:', 10, 'Cluster #:', 3, 'Loss:', 484.66481586770334, 'Test Loss:', 12.76265995549602)
('Epoch:', 11, 'Cluster #:', 3, 'Loss:', 483.03113291140517, 'Test Loss:', 12.732977458312538)
('Epoch:', 12, 'Cluster #:', 3, 'Loss:', 481.7263510373843, 'Test Loss:', 12.668654509856728)
('Epoch:', 13, 'Cluster #:', 3, 'Loss:', 481.51302522893866, 'Test Loss:', 12.680976091299877)
('Epoch:', 14, 'Cluster #:', 3, 'Loss:', 480.39035817460336, 'Test Loss:', 12.628896979962215)
('Epoch:', 15, 'Cluster #:', 3, 'Loss:', 481.034190520407, 'Test Loss:', 12.582702009354458)
2018-12-11 01:20:21.632953

CRIM evaluation:
MRR: 0.06717
MAP: 0.03322
P@1: 0.0427
P@5: 0.03244
P@10: 0.03148

# Scratch Pad

In [268]:
# Index of the term 'dog' in the vocabulary of the loaded embedding model.
model.vocab['dog'].index


3042

In [None]:
# Sanity check for negative sampling: take one shuffled batch of 32 training
# pairs, extend it through extend_batch_with_negatives and list the resulting
# (term, hypernym, label) triples as words.
test_these_terms = np.asarray(data.tokenizer.texts_to_sequences(data.train_query))
test_these_hypers = np.asarray(data.tokenizer.texts_to_sequences(data.train_hyper))

indices = np.arange(len(test_these_terms))
# Seeding is left disabled, so a different batch is drawn on every run.
#np.random.seed(32)
np.random.shuffle(indices)

# Slice the batch once so the terms and hypernyms are guaranteed to stay aligned.
batch_indices = indices[:32]
print(batch_indices)

# The trailing 5 is presumably the number of negatives per positive pair --
# TODO confirm against extend_batch_with_negatives.
term, hyper, label = extend_batch_with_negatives(
    test_these_terms[batch_indices], test_these_hypers[batch_indices],
    data.random_words, data.tokenizer, 5)
print(len(term))
[(data.tokenizer.index_word[i[0]], data.tokenizer.index_word[j[0]], l) for i, j, l in zip(term, hyper, label)]

In [None]:
#hyper_candidates = [[data.tokenizer.word_index[hyper]] for hyper in data.vocab]
# Number of distinct entries in data.valid_hyper.
len(set(data.valid_hyper))

In [None]:

# Show the first weight array of the first layer in crim_model whose type is exactly Dense.
dense_layers = [layer for layer in crim_model.layers if type(layer) == Dense]
dense_layers[0].get_weights()[0]

In [None]:
# Shape of the sorted array of token sequences produced for data.vocab.
vocab_sequences = data.tokenizer.texts_to_sequences(data.vocab)
np.sort(vocab_sequences).shape



In [82]:
#for idx, m in enumerate(clusters):
 #   print idx, m.loss, m.test_loss, m.model.get_layer(name='Prediction').get_weights()[0]

# Score the pair ('rod_laver', 'person') with every cluster model and print
# one prediction per cluster, in cluster order.
i = data.tokenizer.word_index['rod_laver']
j = data.tokenizer.word_index['person']
for c in clusters:
    # print() call form: same output on Python 2 for a single argument, valid on Python 3.
    print(c.model.predict([[i], [j]])[0])

[0.9350891]
[0.01795901]
[0.11550742]
[0.00809616]
[0.11760344]
[0.05177224]
[0.09667916]
[0.11339272]
[0.05452686]
[0.0500305]
[0.08509561]
[0.20673454]
[0.15739459]
[0.02486995]
[0.06349776]
[0.11436869]
[0.07228256]
[0.4835079]
[0.03059227]
[0.11007205]
[0.07297403]
[0.09328794]
[0.08511002]
[0.19539028]
[0.0796214]


In [81]:
import gc
# Remove the `clusters` name from the notebook namespace; later cells that
# reference it will fail until it is rebuilt.
del clusters

# Run a garbage-collection pass; the displayed value is gc.collect()'s return
# value (the number of unreachable objects found).
gc.collect()


5585