In [1]:
from tensorflow.keras.layers import Input, Dense, Embedding, Dot, Flatten, Concatenate, concatenate, Reshape, Dropout, Lambda, Subtract
from tensorflow.keras.layers import GlobalMaxPooling1D, Conv1D

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.initializers import RandomNormal

from tensorflow.keras import backend as K
import tensorflow as tf

import numpy as np




In [2]:
# We'll pretend we're using embedding layers for our words.

# Only two "words" are fed to the models (input hypo, input hyper); each is a
# one-sample batch holding a single vocabulary index.
input_hypo_seq = [[1]]
input_hyper_seq = [[2]]

# Hand-picked 5-d vectors, each kept 2-D with shape (1, 5).
input_a = np.array([[0.1, -0.5, 0.4, 0.8, -0.7]])
input_b = np.array([[0.3, 0.2, 0.1, -0.23, 0.1]])
input_c = np.array([[-0.01, -0.053, 0.08, -0.001, 0]])

# 4 x 5 lookup table: row 0 stays all-zero, rows 1-3 hold the vectors above.
embedding_matrix = np.zeros((4, 5))
embedding_matrix[1:] = np.vstack((input_a, input_b, input_c))

embedding_matrix


array([[ 0.   ,  0.   ,  0.   ,  0.   ,  0.   ],
       [ 0.1  , -0.5  ,  0.4  ,  0.8  , -0.7  ],
       [ 0.3  ,  0.2  ,  0.1  , -0.23 ,  0.1  ],
       [-0.01 , -0.053,  0.08 , -0.001,  0.   ]])

In [None]:
# Sanity check: the hand-rolled L2 norm of row 1 should equal np.linalg.norm.
# A single-argument print(...) behaves the same on Python 2 and Python 3.
print(np.sqrt(np.sum(embedding_matrix[1] ** 2)))

np.linalg.norm(embedding_matrix[1])

In [None]:
def test_lambda():
    """Build a toy model that outputs the squared (hypernym - hyponym) difference.

    Both inputs carry one word index and go through the same ``WordEmbedding``
    layer, whose weights are loaded from the notebook-level
    ``embedding_matrix`` and frozen.  The model is returned uncompiled.
    """
    hyponym_in = Input(shape=(1,), name='Hyponym')
    hypernym_in = Input(shape=(1,), name='Hypernym')

    # One Embedding instance is applied to both inputs, so weights are shared.
    shared_embedding = Embedding(4, 5, name='WordEmbedding')
    hyponym_vec = shared_embedding(hyponym_in)
    hypernym_vec = shared_embedding(hypernym_in)

    # (hyper - hypo) ** 2 element-wise, then flattened to one row per sample.
    difference = Subtract()([hypernym_vec, hyponym_vec])
    squared = Lambda(lambda x: K.square(x))(difference)
    flat_squared = Flatten()(squared)

    model_test = Model(inputs=[hyponym_in, hypernym_in], outputs=flat_squared)

    # Inject the hand-made vectors and keep them fixed.
    embedding_layer = model_test.get_layer(name='WordEmbedding')
    embedding_layer.set_weights([embedding_matrix])
    embedding_layer.trainable = False

    return model_test

In [None]:
model_test = test_lambda()
# Feed hyponym index 1 and hypernym index 2 as explicit (1, 1) arrays rather
# than nested Python lists.
print(model_test.predict([np.array([[1]]), np.array([[2]])]))

# Expected output, computed directly from the embedding rows.  The whole
# expression sits inside print(...) so it is squared before being printed.
print((embedding_matrix[2] - embedding_matrix[1]) ** 2)

In [None]:
def build_model():
    """Build the toy two-projection model and return it uncompiled.

    The hyponym embedding goes through two bias-free linear maps, ``Phi0``
    (identity initialisation) and ``Phi1`` (identity plus Gaussian noise).
    The two projections are concatenated along axis 1 and averaged over that
    axis, so the output is the mean projection.  The hypernym input is
    declared and embedded but does not feed the output.
    """
    hyponym_in = Input(shape=(1,), name='Hyponym')
    hypernym_in = Input(shape=(1,), name='Hypernym')

    shared_embedding = Embedding(4, 5, name='WordEmbedding')
    hyponym_vec = shared_embedding(hyponym_in)
    hypernym_vec = shared_embedding(hypernym_in)  # embedded, not used below

    def identity_init(shape, dtype="float32", partition_info=None):
        # Exact identity matrix sized by the last kernel dimension (no noise).
        return K.eye(shape[-1], dtype='float32')

    def noisy_identity_init(shape, dtype="float32", partition_info=None):
        # Custom initialiser based on the CRIM paper: random normal noise
        # (stddev 0.05) applied to an identity matrix.
        size = shape[-1]
        eye = K.eye(size, dtype='float32')
        noise = K.random_normal((size, size), mean=0., stddev=0.05)
        return noise + eye

    projection_0 = Dense(5, activation=None, use_bias=False,
                         kernel_initializer=identity_init, name='Phi0')(hyponym_vec)
    projection_1 = Dense(5, activation=None, use_bias=False,
                         kernel_initializer=noisy_identity_init, name='Phi1')(hyponym_vec)

    # Stack both projections on axis 1, then collapse that axis with a mean.
    stacked = Concatenate(axis=1, name='Phi')([projection_0, projection_1])
    mean_projection = Lambda(lambda x: K.mean(x, axis=1, keepdims=True))(stacked)

    model_test = Model(inputs=[hyponym_in, hypernym_in], outputs=mean_projection)

    embedding_layer = model_test.get_layer(name='WordEmbedding')
    embedding_layer.set_weights([embedding_matrix])
    embedding_layer.trainable = False

    return model_test




In [None]:
print(embedding_matrix)

model_test = build_model()

print(model_test.predict([np.array([[1]]), np.array([[2]])]))

phi_0_W = model_test.get_layer(name='Phi0').get_weights()[0]
phi_1_W = model_test.get_layer(name='Phi1').get_weights()[0]

# Re-derive the model output by hand: project embedding row 1 with each kernel...
print("Printing hypo projections")
proj1 = np.dot(embedding_matrix[1], phi_0_W)
proj2 = np.dot(embedding_matrix[1], phi_1_W)
print(proj1)
print(proj2)

# ...then average the two projections, mirroring the model's final mean.
print("Printing mean hypo")
concat_W = np.vstack((proj1, proj2))
print(np.mean(concat_W, axis=0))
#print (embedding_matrix[2] - embedding_matrix[1]) ** 2

In [None]:
def custom_loss(hypo_tensor, phi_weights):
    """Return a Keras loss: binary cross-entropy plus a similarity penalty.

    The penalty is ``y_true * 0.01 * mean(dot(phi_n, hypo^T) ** 2)``, where
    ``phi_n`` is ``phi_weights`` L2-normalised along axis 1 and ``hypo`` is
    ``hypo_tensor`` with its axis 1 squeezed out.
    """
    def inner_product(y_true, y_pred):
        squeezed_hypo = K.squeeze(hypo_tensor, axis=1)
        normalised_phi = K.l2_normalize(phi_weights, axis=1)
        similarity = K.dot(normalised_phi, K.transpose(squeezed_hypo))
        penalty = y_true * 0.01 * K.mean(similarity ** 2)
        return K.binary_crossentropy(y_true, y_pred) + penalty

    return inner_product


def custom_loss2(hypo_phi_tensor):
    """Return a Keras loss: mean binary cross-entropy plus a squared-value penalty.

    The penalty is ``0.1 * mean(hypo_phi_tensor ** 2)`` over the last axis;
    the cross-entropy is averaged over the last axis as well.
    """
    def inner_product(y_true, y_pred):
        cross_entropy = K.mean(K.binary_crossentropy(y_true, y_pred), axis=-1)
        penalty = 0.1 * K.mean(hypo_phi_tensor ** 2, axis=-1)
        return cross_entropy + penalty

    return inner_product

def build_variant():
    """Build and compile the single-projection hypernymy classifier.

    The hyponym embedding is projected by the bias-free ``Phi0`` layer
    (identity initialisation).  The normalised dot product of the projection
    with the hypernym embedding feeds a one-unit sigmoid ``Prediction`` layer.

    The training loss is binary cross-entropy plus
    ``0.1 * mean(dot(phi, hypo) ** 2)``.  It is defined inside this function
    instead of being looked up through the notebook-level ``custom_loss2``:
    that name is re-bound to different losses by later cells, so a later call
    of this builder would otherwise compile with the wrong loss.

    Returns:
        The compiled Keras ``Model`` taking ``[hyponym, hypernym]`` indices.
    """
    hypo_input = Input(shape=(1,), name='Hyponym')
    hyper_input = Input(shape=(1,), name='Hypernym')

    word_embedding = Embedding(4, 5, name='WordEmbedding')

    hypo_embedding = word_embedding(hypo_input)
    hyper_embedding = word_embedding(hyper_input)

    def random_identity(shape, dtype="float32", partition_info=None):
        # Exact identity kernel (no noise is added), sized by the last dimension.
        return K.eye(shape[-1], dtype='float32')

    phi = Dense(5, activation=None, use_bias=False,
                kernel_initializer=random_identity, name='Phi0')(hypo_embedding)

    phi = Flatten(name='FlattenPhi')(phi)
    hyper_embedding = Flatten(name='FlattenHyper')(hyper_embedding)

    phi_hyper = Dot(axes=-1, normalize=True, name='DotProduct1')([phi, hyper_embedding])
    # Similarity between the projection and the (unflattened) hyponym itself;
    # only the loss below uses it.
    phi_hypo = Dot(axes=-1, normalize=False, name='DotProduct2')([phi, hypo_embedding])

    predictions = Dense(1, activation="sigmoid", name='Prediction',
                        kernel_initializer='random_normal',
                        bias_initializer='random_normal'
                       )(phi_hyper)

    model_test = Model(inputs=[hypo_input, hyper_input], outputs=predictions)

    def regul_loss(y_true, y_pred):
        # NOTE(review): closes over the symbolic tensor `phi_hypo`; this
        # presumably relies on graph-mode Keras -- confirm before running eagerly.
        simil = 0.1 * K.mean(phi_hypo ** 2, axis=-1)
        return K.mean(K.binary_crossentropy(y_true, y_pred), axis=-1) + simil

    model_test.get_layer(name='WordEmbedding').set_weights([embedding_matrix])
    model_test.get_layer(name='WordEmbedding').trainable = False

    model_test.compile(optimizer='rmsprop', loss=regul_loss, metrics=['accuracy'])

    return model_test

def build_classifier(feature_extractor):
    """Stack a one-unit sigmoid ``Prediction`` layer on ``feature_extractor``.

    ``feature_extractor`` is called like a layer on fresh ``Hyponym`` and
    ``Hypernym`` inputs.  The resulting model is compiled with RMSprop and
    binary cross-entropy, tracking accuracy, and returned.
    """
    hyponym_in = Input(shape=(1,), name='Hyponym')
    hypernym_in = Input(shape=(1,), name='Hypernym')

    extracted = feature_extractor([hyponym_in, hypernym_in])
    prediction_head = Dense(1, activation="sigmoid", name='Prediction',
                            kernel_initializer='random_normal',
                            bias_initializer='random_normal')
    predictions = prediction_head(extracted)

    model_test = Model(inputs=[hyponym_in, hypernym_in], outputs=predictions)
    model_test.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return model_test
    


In [None]:
# Build the compiled variant model and list its layers.
model_test = build_variant()
model_test.summary()



In [68]:
def custom_loss(hypo_tensor, phi_weights):
    """Return a Keras loss: binary cross-entropy plus a similarity penalty.

    Same body as the ``custom_loss`` declared in the earlier cell.  The
    penalty is ``y_true * 0.01 * mean(dot(phi_n, hypo^T) ** 2)`` with
    ``phi_n`` the axis-1 L2-normalised ``phi_weights`` and ``hypo`` the
    axis-1-squeezed ``hypo_tensor``.
    """
    def inner_product(y_true, y_pred):
        squeezed_hypo = K.squeeze(hypo_tensor, axis=1)
        normalised_phi = K.l2_normalize(phi_weights, axis=1)
        similarity = K.dot(normalised_phi, K.transpose(squeezed_hypo))
        penalty = y_true * 0.01 * K.mean(similarity ** 2)
        return K.binary_crossentropy(y_true, y_pred) + penalty

    return inner_product


def custom_loss2(random_sim):
    """Return a Keras loss: Euclidean distance to the target plus ``random_sim ** 2``.

    Args:
        random_sim: tensor with one similarity value per sample; in this
            notebook it is the output of a ``Dot`` layer
            (assumed shape ``(batch, 1)`` -- TODO confirm).

    Returns:
        A ``loss(y_true, y_pred)`` callable yielding one value per sample.
    """
    def mse(y_true, y_pred):
        # Despite the function name this is the L2 distance, not a mean
        # squared error.  Reducing over the last axis leaves shape (batch,).
        distance = K.sqrt(K.sum(K.square(y_pred - y_true), axis=-1))
        # Flatten the penalty to rank 1 so the sum is element-wise; adding a
        # (batch, 1) tensor to a (batch,) tensor broadcasts to (batch, batch).
        penalty = K.flatten(K.square(random_sim))
        return distance + penalty

    return mse

def build_square_error():
    """Build and compile the projection model trained with a distance loss.

    The model maps ``[hyponym, negative]`` indices to the flattened ``Phi0``
    projection of the hyponym embedding.  The loss comes from the
    notebook-level ``custom_loss2`` and is given the dot product between the
    projection and the negative word's embedding.

    Returns:
        The compiled Keras ``Model``.
    """
    hypo_input = Input(shape=(1,), name='Hyponym')
    neg_input = Input(shape=(1,), name='Negative')
    word_embedding = Embedding(4, 5, name='WordEmbedding')

    hypo_embedding = word_embedding(hypo_input)
    neg_embedding = word_embedding(neg_input)

    def random_identity(shape, dtype="float32", partition_info=None):
        # Exact identity kernel (no noise is added), sized by the last dimension.
        return K.eye(shape[-1], dtype='float32')

    phi = Dense(5, activation=None, use_bias=False,
                kernel_initializer=random_identity, name='Phi0')(hypo_embedding)

    phi = Flatten(name='FlattenPhi')(phi)
    neg_embedding = Flatten(name='FlattenNeg')(neg_embedding)

    # calculate similarity of projection with negative
    random_sim = Dot(axes=-1, normalize=False, name='DotProductRand')([phi, neg_embedding])
    # NOTE(review): the loss closes over the symbolic tensor `random_sim`;
    # this presumably relies on graph-mode Keras -- confirm before running eagerly.
    mse = custom_loss2(random_sim)

    model_test = Model(inputs=[hypo_input, neg_input], outputs=phi)

    model_test.get_layer(name='WordEmbedding').set_weights([embedding_matrix])
    model_test.get_layer(name='WordEmbedding').trainable = False

    # NOTE(review): 'accuracy' is tracked although the output is a vector;
    # the recorded run below reports 0.0 for it.
    model_test.compile(optimizer='rmsprop', loss=mse, metrics=['accuracy'])

    return model_test


In [70]:
model_test = build_square_error()
print(embedding_matrix)

# Batch of two (hyponym, negative) pairs, (1, 3) and (2, 1); the targets are
# embedding rows 2 and 3.
print(model_test.test_on_batch(
    [np.array([[1], [2]]), np.array([[3], [1]])],
    embedding_matrix[2:4].reshape(2, -1)))

# Hand-computed loss: Phi0 starts as the identity, so each projection equals
# the hyponym's own embedding row.
a = np.sqrt(np.sum((embedding_matrix[1] - embedding_matrix[2] ) ** 2))
b = np.sqrt(np.sum((embedding_matrix[2] - embedding_matrix[3] ) ** 2))

# Mean over the two samples of (distance + squared similarity to the negative).
# The division happens inside print(...), so the averaged value is printed.
print((a + b + (np.dot(embedding_matrix[1], embedding_matrix[3]) ** 2) +
       (np.dot(embedding_matrix[2], embedding_matrix[1]) ** 2)) / 2)

[[ 0.     0.     0.     0.     0.   ]
 [ 0.1   -0.5    0.4    0.8   -0.7  ]
 [ 0.3    0.2    0.1   -0.23   0.1  ]
 [-0.01  -0.053  0.08  -0.001  0.   ]]
[1.0397483, 0.0]
1.0397483683564293


In [82]:
def custom_loss2(random_sim):
    """Return a Keras loss: ``(y_pred - y_true) + random_sim ** 2`` per sample.

    Args:
        random_sim: tensor with one similarity value per sample; in this
            notebook it is the output of a ``Dot`` layer
            (assumed shape ``(batch, 1)`` -- TODO confirm).

    Returns:
        A ``loss(y_true, y_pred)`` callable yielding one value per sample.
    """
    def err_diff(y_true, y_pred):
        # Bring every term to rank 1 before combining them.  Mixing a
        # (batch,) tensor with a (batch, 1) tensor would broadcast the sum
        # to (batch, batch) instead of adding sample by sample.
        error = K.flatten(y_pred) - K.flatten(y_true)
        penalty = K.flatten(K.square(random_sim))
        return error + penalty

    return err_diff

def build_square_error2():
    """Build and compile the three-input distance model.

    Inputs are ``[hyponym, negative, hypernym]`` indices.  The output is the
    Euclidean distance between the ``Phi0`` projection of the hyponym and the
    hypernym embedding.  The loss comes from the notebook-level
    ``custom_loss2`` and is given the dot product between the projection and
    the negative word's embedding.

    Returns:
        The compiled Keras ``Model``.
    """
    hypo_input = Input(shape=(1,), name='Hyponym')
    neg_input = Input(shape=(1,), name='Negative')
    hyper_input = Input(shape=(1,), name='Hypernym')

    word_embedding = Embedding(4, 5, name='WordEmbedding')

    hypo_embedding = word_embedding(hypo_input)
    neg_embedding = word_embedding(neg_input)
    hyper_embedding = word_embedding(hyper_input)

    def random_identity(shape, dtype="float32", partition_info=None):
        # Exact identity kernel (no noise is added), sized by the last dimension.
        return K.eye(shape[-1], dtype='float32')

    phi = Dense(5, activation=None, use_bias=False,
                kernel_initializer=random_identity, name='Phi0')(hypo_embedding)

    phi = Flatten(name='FlattenPhi')(phi)
    neg_embedding = Flatten(name='FlattenNeg')(neg_embedding)
    hyper_embedding = Flatten(name='FlattenHyper')(hyper_embedding)

    # calculate similarity of projection with negative
    random_sim = Dot(axes=-1, normalize=False, name='DotProductRand')([phi, neg_embedding])
    vector_diff = Subtract()([phi, hyper_embedding])
    # Despite the variable name this is the L2 distance, one value per sample.
    mse = Lambda(lambda x: K.sqrt(K.sum(K.square(x), axis=-1)))(vector_diff)

    # NOTE(review): the loss closes over the symbolic tensor `random_sim`;
    # this presumably relies on graph-mode Keras -- confirm before running eagerly.
    diff = custom_loss2(random_sim)

    model_test = Model(inputs=[hypo_input, neg_input, hyper_input], outputs=mse)

    model_test.get_layer(name='WordEmbedding').set_weights([embedding_matrix])
    model_test.get_layer(name='WordEmbedding').trainable = False

    model_test.compile(optimizer='rmsprop', loss=diff, metrics=['accuracy'])

    return model_test


In [97]:
# List the layers of the model currently bound to `model_test`; the recorded
# output below shows Hyponym, WordEmbedding, Phi0 and FlattenPhi.
model_test.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Hyponym (InputLayer)         (None, 1)                 0         
_________________________________________________________________
WordEmbedding (Embedding)    (None, 1, 5)              20        
_________________________________________________________________
Phi0 (Dense)                 (None, 1, 5)              25        
_________________________________________________________________
FlattenPhi (Flatten)         (None, 5)                 0         
Total params: 45
Trainable params: 25
Non-trainable params: 20
_________________________________________________________________


In [98]:
# Build the three-input model in this cell so that it runs on a fresh kernel
# and does not depend on a `model_test2` left over from an earlier session.
model_test2 = build_square_error2()

# Two-input model: (hyponym, negative) pairs (1, 3) and (2, 1), targets rows 2-3.
print(model_test.test_on_batch(
    [np.array([[1], [2]]), np.array([[3], [1]])],
    embedding_matrix[2:4].reshape(2, -1)))

# Three-input model: same pairs plus hypernyms 2 and 3, with zero targets.
print(model_test2.test_on_batch(
    [np.array([[1], [2]]), np.array([[3], [1]]), np.array([[2], [3]])],
    np.array([0., 0.])))

#a = np.sqrt(np.sum((embedding_matrix[1] - embedding_matrix[2] ) ** 2))
#b = np.sqrt(np.sum((embedding_matrix[2] - embedding_matrix[3] ) ** 2))
#print a + (np.dot(embedding_matrix[1], embedding_matrix[3]) ** 2)

[1.0397483, 0.0]
[1.0397483, array([1., 1.], dtype=float32)]


In [None]:
# Inspect the weights of the 'Prediction' layer.
# NOTE(review): the build_square_error model assigned to `model_test` in the
# cells above defines no layer named 'Prediction' -- confirm which model this
# cell is meant to inspect.
model_test.get_layer(name='Prediction').get_weights()

In [None]:
def build_cnn():
    """Build an uncompiled CNN feature extractor over an embedded word pair.

    A two-index ``WordPair`` input is embedded with the frozen notebook-level
    ``embedding_matrix``, passed through two 1-D convolutions (kernel sizes 1
    then 2, two filters each), max-pooled over the sequence axis and mapped to
    32 ReLU features.
    """
    pair_in = Input(shape=(2,), name='WordPair')
    embedded_pair = Embedding(4, 5, name='WordEmbedding')(pair_in)

    features = Conv1D(2, 1, activation='relu', name='Conv1_1')(embedded_pair)
    features = Conv1D(2, 2, activation='relu', name='Conv1_2')(features)
    features = GlobalMaxPooling1D()(features)
    features = Dense(32, activation='relu', name='FullyConnected')(features)

    model = Model(inputs=pair_in, outputs=features)

    # Load the hand-made vectors and keep them fixed.
    embedding_layer = model.get_layer(name='WordEmbedding')
    embedding_layer.set_weights([embedding_matrix])
    embedding_layer.trainable = False

    return model
    

model_test = build_cnn()
model_test.summary()

# One sample holding the word pair (1, 2), reshaped to (1, 2) to match the
# model's two-index input.
word_input = np.array([1, 2])
word_input = word_input.reshape(1,-1)

print(word_input)
model_test.predict(word_input)

#print embedding_matrix


In [None]:
feature_extractor = build_variant()
feature_extractor.summary()

print(embedding_matrix)

# Phi0 kernel before any training step
print(feature_extractor.get_layer(name='Phi0').get_weights()[0])

# Two classifier heads wrapped around the same feature_extractor instance.
classifier1 = build_classifier(feature_extractor)
classifier2 = build_classifier(feature_extractor)

print("Training first classifier")
# positive pair: hyponym 1, hypernym 2
metrics = classifier1.train_on_batch([np.array([[1]]), np.array([[2]])], np.array([1]))
print(metrics)
print("Print phi weights")
# Phi0 kernel after the first classifier's update
print(feature_extractor.get_layer(name='Phi0').get_weights()[0])

print("Training second classifier")
# negative pair: hyponym 1, hypernym 3
metrics = classifier2.train_on_batch([np.array([[1]]), np.array([[3]])], np.array([0]))
print(metrics)
print("Print phi weights")
print(feature_extractor.get_layer(name='Phi0').get_weights()[0])

In [None]:
from sklearn.preprocessing import normalize

# Rescale each row (axis=1) of the embedding matrix by its L2 norm; the
# result is re-bound to the same notebook-level name.
embedding_matrix = normalize(embedding_matrix, norm='l2', axis=1)

In [None]:
# Row norms as a column vector, to check the normalisation.
print(np.linalg.norm(embedding_matrix, axis=1).reshape(-1, 1))

# Dot products of row 1 with rows 1..3, computed in both operand orders.
print(np.dot(embedding_matrix[1], embedding_matrix[1:].T))
print(np.dot(embedding_matrix[1:], embedding_matrix[1]))

In [None]:
model_test = build_model()
model_test.summary()
# The model is built from tensorflow.keras layers, so take the plotting helper
# from the same package instead of the standalone `keras` distribution.
from tensorflow.keras.utils import plot_model

plot_model(model_test, to_file='model_test.png', show_shapes=True, show_layer_names=True)

In [None]:
# Look up the output tensor of the layer named 'dropout_2'.
# NOTE(review): every Dropout line in build_model is commented out, so the
# model built in the previous cell presumably has no such layer -- confirm.
model_test.get_layer(name='dropout_2').output

In [None]:
#model_test = build_model()
# Train on the single (hyponym, hypernym) pair with a positive label.
# NOTE(review): build_model returns its model without calling compile(), so
# this cell presumably expects a different, compiled `model_test` -- confirm.
model_test.fit([input_hypo_seq, input_hyper_seq], [1], epochs=1000)


In [None]:
# Score the single (hyponym, hypernym) pair, passed as explicit arrays.
print(model_test.predict([np.array(input_hypo_seq), np.array(input_hyper_seq)]))

# NOTE(review): requires `model_test` to contain a 'Prediction' layer --
# confirm which builder this cell is meant to follow.
model_test.get_layer(name='Prediction').get_weights()

In [None]:
# simulate forward pass when predicting
import math

def sigmoid(x):
    """Return the logistic function ``1 / (1 + e**-x)`` of a scalar ``x``.

    The branch on the sign of ``x`` keeps the exponent non-positive, so
    ``math.exp`` cannot overflow: large negative inputs give 0.0 instead of
    raising ``OverflowError``.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # Equivalent form for negative x: e**x / (1 + e**x).
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

# NOTE(review): expects `model_test` to expose 'Prediction', 'Phi0' and 'Phi1'
# layers -- confirm which builder this cell is meant to follow.
features = model_test.get_layer(name='Prediction').get_weights()[0]
bias = model_test.get_layer(name='Prediction').get_weights()[1]

print("Weights:")
print("%s %s %s" % (features[0], features[1], bias))
print("-"*30)
proj1 = model_test.get_layer(name='Phi0').get_weights()[0]
proj2 = model_test.get_layer(name='Phi1').get_weights()[0]
print("Learnt projections:")
print(proj1)
print(proj2)
print("-"*30)
# Project the hyponym vector with each learnt kernel; one row per projection.
P1 = np.dot(input_a, proj1)
P2 = np.dot(input_a, proj2)

P = np.concatenate((P1,P2), axis = 0)
# Dot product of every projection with the hypernym vector.
s = np.dot(P, input_b.T)
print(s)
pred = np.sum(s.flatten() * features.flatten()) + bias

# `pred` is a one-element array; hand sigmoid a plain Python scalar.
sigmoid(pred.item())

# Original Yamane POC

In [None]:
def get_embeddings_model(dim, embedding_matrix):
    """Wrap a frozen, pre-trained embedding lookup as a two-input model.

    Args:
        dim: embedding width given to the ``Embedding`` layer.
        embedding_matrix: weights loaded into the layer; its first dimension
            sets the vocabulary size.

    Returns:
        An uncompiled ``Model`` mapping ``[hyponym, hypernym]`` index inputs
        to their two embeddings.
    """
    hyponym_in = Input(shape=(1,))
    hypernym_in = Input(shape=(1,))

    vocab_rows = embedding_matrix.shape[0]
    lookup = Embedding(vocab_rows, dim, name='WE')

    embedding_model = Model(
        inputs=[hyponym_in, hypernym_in],
        outputs=[lookup(hyponym_in), lookup(hypernym_in)],
    )

    # inject pre-trained embeddings into this mini, reusable model/layer
    lookup.set_weights([embedding_matrix])
    lookup.trainable = False
    return embedding_model

In [None]:
hypo_input = Input(shape=(1,))
hyper_input = Input(shape=(1,))

# Create a model that reuses the weights of another model.
# The embedding "model" is not a model in its own right, just a pre-trained
# embedding layer.  Wrapping it lets every cluster model share the same
# embedding weights, so fewer resources are needed to build and train the
# Yamane model.

# Employ the model as a "layer".  The embedding width is read from the matrix
# itself: a hard-coded width that differs from the matrix's column count makes
# set_weights() fail with a shape mismatch.
embedding_layer = get_embeddings_model(embedding_matrix.shape[1], embedding_matrix)
e1, e2 = embedding_layer([hypo_input, hyper_input])
hypo_flat = Flatten()(e1)
hyper_flat = Flatten()(e2)

model2 = Model([hypo_input, hyper_input], outputs=[hypo_flat, hyper_flat])


In [None]:
# Second output of model2: the flattened hypernym embedding.
print(model2.predict([np.array(input_hypo_seq), np.array(input_hyper_seq)])[1])

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

np.random.seed(10)
# hyponym cluster #1 simulation: 40 points drawn around (5, 5)
X1 = np.random.normal(5,0.5,size=(40,2))
# hypernym for cluster #1
Y1 = np.array([[5., 3.]])

# hyponym cluster #2 simulation: 40 points drawn around the origin
X2 = np.random.normal(0,0.5,size=(40,2))
# hypernym for cluster #2
Y2 = np.array([[2.5, 0.010]])

# Hyponyms are drawn as dots and hypernyms as stars of the same colour, so the
# single hypernym point can be told apart from its cluster.
fig, ax = plt.subplots()
ax.scatter(X1[:,0], X1[:,1], c='red', label='cluster 1 hyponyms')
ax.scatter(Y1[:,0], Y1[:,1], c='red', marker='*', s=200, edgecolors='black',
           label='cluster 1 hypernym')
ax.scatter(X2[:,0], X2[:,1], c='blue', label='cluster 2 hyponyms')
ax.scatter(Y2[:,0], Y2[:,1], c='blue', marker='*', s=200, edgecolors='black',
           label='cluster 2 hypernym')
ax.set(title='Synthetic hyponym clusters and their hypernyms',
       xlabel='dimension 0', ylabel='dimension 1')
ax.legend();



In [None]:
# Create the embedding matrix from the synthesised samples.
X = np.vstack((X1, X2))
Y = np.vstack((Y1, Y2))

hyponym_size, hypernym_size = X.shape[0], Y.shape[0]
dim = 2
vocab_size = hyponym_size + hypernym_size

# Row 0 is left as zeros; hyponyms fill rows 1..len(X) and the hypernyms
# occupy the remaining rows.
embedding_matrix = np.zeros((vocab_size + 1, dim))
embedding_matrix[1:] = np.vstack((X, Y))


In [None]:
# Scale every word vector (rows 1 onwards) to unit L2 norm.  Row 0 is all
# zeros and is skipped: dividing it by its zero norm would produce NaNs and a
# RuntimeWarning.
embedding_matrix[1:] /= np.linalg.norm(embedding_matrix[1:], axis=1, keepdims=True)

embedding_matrix[0] = [0., 0.]

In [None]:
# Display the embedding matrix as it stands after the normalisation cell.
embedding_matrix

In [None]:
# create sequence input data
n_per_cluster = 40
X_hyponym = X
Y_hypernym = np.vstack((np.tile(Y[0,:], (n_per_cluster,1)), np.tile(Y[1,:], (n_per_cluster,1))))

# Row indices follow from how embedding_matrix was assembled: hyponym i sits
# in row i + 1 and the two hypernyms occupy the two rows after the hyponyms.
# Searching the matrix for the raw vectors by float equality cannot work once
# the rows have been normalised in place.
first_hypernym_row = len(X) + 1
X_hyponym_seq = [[row] for row in range(1, len(X) + 1)]
Y_hypernym_seq = ([[first_hypernym_row] for _ in range(n_per_cluster)] +
                  [[first_hypernym_row + 1] for _ in range(n_per_cluster)])

# all samples are positive
y_label = [1] * len(X_hyponym_seq)

### Function that creates cluster model

In [None]:
from tensorflow.keras.layers import Input, Dense, Embedding, Dot, Flatten, Concatenate, concatenate, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.initializers import RandomNormal
from tensorflow.keras import backend as K

import tensorflow as tf

def get_new_cluster_model(embedding_layer, phi_dim):
    """Build and compile one cluster's projection classifier.

    Args:
        embedding_layer: callable returning ``(hypo_embedding, hyper_embedding)``
            for ``[hypo_input, hyper_input]``; in this notebook it is the
            frozen model from ``get_embeddings_model``.
        phi_dim: output width of the bias-free ``Phi`` projection.

    Returns:
        A ``Model`` compiled with Adam and binary cross-entropy.  Its sigmoid
        ``Prediction`` layer scores the dot product between the projected
        hyponym and the hypernym embedding.
    """
    hypo_input = Input(shape=(1,), name='Hyponym')
    hyper_input = Input(shape=(1,), name='Hypernym')

    hypo_embedding, hyper_embedding = embedding_layer([hypo_input, hyper_input])

    # The projection starts from Keras' built-in "random_normal" initialiser.
    phi = Dense(phi_dim, activation=None, use_bias=False,
                kernel_initializer="random_normal", name='Phi')(hypo_embedding)

    # flatten phi and hyper_embedding tensors
    phi = Flatten()(phi)
    hyper_embedding = Flatten()(hyper_embedding)

    phi_hyper = Dot(axes=-1, normalize=False, name='DotProduct')([phi, hyper_embedding])

    # L1 penalties on both the kernel and the bias of the output unit.
    predictions = Dense(1, activation="sigmoid", kernel_regularizer=l1(0.001), bias_regularizer=l1(0.001), name='Prediction')(phi_hyper)
    # instantiate model
    model = Model(inputs=[hypo_input, hyper_input], outputs=predictions)

    # compile using binary_crossentropy loss
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Get one instance of the embedding model, which we will inject into every
# new cluster model.  There is no sense in having individual embedding layers,
# since embedding layers are not trainable in this model.
embedding_layer = get_embeddings_model(dim=2, embedding_matrix=embedding_matrix)



In [None]:
# NOTE(review): no notebook-level `model` is assigned in the visible cells
# (the name only exists inside builder functions) -- confirm which model this
# summary is meant for.
model.summary()

### Yamane learning algorithm

In [None]:
# Turn the Python lists into int32 arrays so they can be fancy-indexed.
X_hyponym_seq = np.array(X_hyponym_seq, dtype='int32')
Y_hypernym_seq = np.array(Y_hypernym_seq, dtype='int32')
y_label = np.array(y_label, dtype='int32')

# list of clusters, starting with a single cluster model
clusters = [get_new_cluster_model(embedding_layer, phi_dim=2)]

# visiting order of the training samples, shuffled with a fixed seed
indices = np.arange(len(X_hyponym_seq))
np.random.seed(42)
np.random.shuffle(indices)

# simple structure that records the cluster id of every sample;
# every sample starts in cluster 0
sample_clusters = np.zeros(len(X_hyponym_seq), dtype='int32')


In [None]:
losses = [0.]
epochs = 50
threshold = 0.4
# indicator of "current" sample cluster index
z_i = 0

# iterate over samples
for epoch in range(epochs):
    print("Epoch ======> %d" % (epoch))
    # reset the per-cluster loss accumulators to zero
    losses = [0. for _ in losses]
    # train by stochastic gradient descent, one sample at a time
    for i in indices:
        print("Doing %s %s %s %s" % (i, X_hyponym_seq[i], Y_hypernym_seq[i], sample_clusters[i]))
        # Score the pair with every cluster model.  Slicing i:i+1 keeps a
        # batch of one sample, and each prediction is reduced to a plain float.
        # A list (not `map`) is needed because the scores are indexed below.
        hypo_batch = X_hyponym_seq[i:i + 1]
        hyper_batch = Y_hypernym_seq[i:i + 1]
        sim = [float(cluster_model.predict([hypo_batch, hyper_batch])[0, 0])
               for cluster_model in clusters]
        max_sim = int(np.argmax(sim))
        print("Max Similarity cluster: %s (sim = %0.8f)" % (max_sim, sim[max_sim]))
        if sim[max_sim] < threshold:
            # no cluster is similar enough: add a new cluster to the list
            clusters.append(get_new_cluster_model(embedding_layer, phi_dim=2))
            losses.append(0.)
            # assign current cluster index to latest model
            z_i = len(clusters) - 1
        else:
            z_i = max_sim
        sample_clusters[i] = z_i

        # get indices of elements belonging to cluster
        z_i_indices = np.argwhere(sample_clusters == z_i).flatten()
        # shuffle batch indices
        np.random.shuffle(z_i_indices)
        # update parameters of cluster; train_on_batch returns [loss, accuracy]
        batch_loss = clusters[z_i].train_on_batch(
            [X_hyponym_seq[z_i_indices], Y_hypernym_seq[z_i_indices]],
            y_label[z_i_indices])[0]
        losses[z_i] += batch_loss
        print("Loss on cluster %s %s" % (z_i, losses[z_i]))
        



In [None]:
# Cluster id currently recorded for every training sample.
sample_clusters

In [None]:
# Learnt kernel of the first cluster model's 'Phi' projection.
print(clusters[0].get_layer(name='Phi').get_weights()[0])

In [None]:
# Weights of the first cluster model's 'Prediction' layer.
clusters[0].get_layer(name='Prediction').get_weights()

In [None]:
#Y_hypernym_seq
#wrong_Y = np.zeros(80, dtype='int32').reshape(-1,1)

#wrong_Y[:40] = Y_hypernym_seq[:40] -1
#wrong_Y[40:] = Y_hypernym_seq[40:] +1

# Predictions of the first cluster model for every training pair, rounded to
# two decimals.
np.round(clusters[0].predict([X_hyponym_seq, Y_hypernym_seq]), 2)

# Rough Notes