In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import random as rn
import os
from sklearn.decomposition import PCA
from rdflib import Graph
import utils
import transE
import numpy as np

In [2]:
# fb15k_237 = np.load('./data/fb15k_237.npz', allow_pickle=True)
# fb_train = fb15k_237['train']
# fb_valid = fb15k_237['valid']
# fb_test = fb15k_237['test']

In [3]:
SEED = 123
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['TF_DETERMINISTIC_OPS'] = '1'
np.random.seed(SEED)
rn.seed(SEED)
tf.random.set_seed(SEED)

data = np.load(os.path.join('.','data','royalty.npz'))

train = data['X_train']
test = data['X_test']

train_exp = data['train_exp']
test_exp = data['test_exp']

#full_train = np.concatenate((train,train_exp), axis=0)

entities = data['entities'].tolist()
relations = data['relations'].tolist()

num_entities = len(entities)
num_relations = len(relations)

ent2idx = dict(zip(entities, range(num_entities)))
rel2idx = dict(zip(relations, range(num_relations)))

# idx2ent = dict(zip(range(num_entities),entities))
# idx2rel = dict(zip(range(num_relations),relations))

In [4]:
train2idx = utils.array2idx(train,ent2idx,rel2idx)
exp2idx = utils.array2idx(train_exp,ent2idx,rel2idx)

test2idx = utils.array2idx(test,ent2idx,rel2idx)
testexp2idx = utils.array2idx(test_exp,ent2idx,rel2idx)

In [5]:
embedding_size = 50
batch_size = 3
num_epochs = 2
margin = 2
learning_rate = .001

In [6]:
model = transE.transE(num_entities,num_relations,embedding_size,random_state=SEED)

In [7]:
train_data = tf.data.Dataset.from_tensor_slices((train2idx[:,0], train2idx[:,1], train2idx[:,2],
                                                exp2idx[:,0],exp2idx[:,1],exp2idx[:,2])).batch(batch_size)

optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)

In [8]:
epoch_loss = []

for epoch in range(num_epochs):

    for pos_head, rel, pos_tail, pos_head_exp,rel_exp, pos_tail_exp in train_data:

        neg_head, neg_tail = utils.get_negative_triples(
            head=pos_head, 
            rel=rel, 
            tail=pos_tail,
            num_entities=num_entities,
            random_state=SEED
            )

        neg_head_exp, neg_tail_exp = utils.get_negative_triples(
            head=pos_head_exp, 
            rel=rel_exp, 
            tail=pos_tail_exp,
            num_entities=num_entities,
            random_state=SEED
            )

        with tf.GradientTape() as tape:

            pos_head_e, pos_tail_e, neg_head_e, neg_tail_e, rel_e = model([
                pos_head,
                pos_tail, 
                neg_head, 
                neg_tail, 
                rel
                ]
            )

            pos_head_exp_e, pos_tail_exp_e, neg_head_exp_e, neg_tail_exp_e, rel_exp_e = model([
                pos_head_exp,
                pos_tail_exp,
                neg_head_exp,  
                neg_tail_exp, 
                rel_exp
                ]
            )

            prediction_loss = transE.pred_loss(pos_head_e,pos_tail_e,neg_head_e,neg_tail_e,rel_e)
            #explain_loss = transE.exp_loss(pos_head_exp_e,pos_tail_exp_e,neg_head_exp_e, neg_tail_exp_e, rel_exp_e)
            explain_loss = transE.exp_loss(pos_head_e,pos_tail_exp_e,pos_head_exp_e,pos_tail_exp_e,rel_e,rel_exp_e)
            
            total_loss = prediction_loss + explain_loss

        grads = tape.gradient(total_loss,model.trainable_variables)
        optimizer.apply_gradients(zip(grads,model.trainable_variables))

    #if not epoch % 10:
    print(f"Loss at epoch {epoch}: {total_loss.numpy()} ")
        
    epoch_loss.append(np.round(total_loss.numpy(),5))

Loss at epoch 0: 18.410451889038086 
Loss at epoch 1: 10.564764022827148 


In [9]:
entity_embeddings = utils.get_entity_embeddings(model)
relation_embeddings = utils.get_relation_embeddings(model)

In [10]:
test2idx[0]

array([1403,    0, 1460])

In [11]:
# def exp_score(triple,k,data,entity_embeddings,relation_embeddings):
    
#     triple_h_e = entity_embeddings[triple[0]]
#     triple_r_e = relation_embeddings[triple[1]]
#     triple_t_e = entity_embeddings[triple[2]]

#     h_e = entity_embeddings[data[:,0]]
#     r_e = relation_embeddings[data[:,1]]
#     t_e = entity_embeddings[data[:,2]]

#     squared_diff = np.square(triple_h_e - h_e) + np.square(triple_r_e-r_e) + np.square(triple_t_e-t_e)

#     l2_dist = np.sqrt(np.sum(squared_diff,axis=1))

#     closest_l2 = np.argsort(l2_dist)[:k]
    
#     return data[closest_l2]

transE.exp_score(test2idx[0],k=2,data=test2idx,entity_embeddings=entity_embeddings,relation_embeddings=relation_embeddings)

array([[1403,    0, 1460],
       [ 531,    0, 1219]])

In [None]:
#transE.exp_loss(pos_head_e,pos_tail_e,pos_head_exp_e,pos_tail_exp_e,rel_e,rel_exp_e)

# triple = (pos_head_e[0], rel_e[0], pos_tail_e[0])

# triple

In [None]:


# data = np.load('./data/human_data.npz')
# train = data['X']

# g=Graph()
# g.parse("../CORESE-DATA/human-data.rdf", format="xml")

# triples = []

# for i,j,k in g:
    
#     head = str(i).split('#')
#     rel = str(j).split('#')
#     tail = str(k).split('#')
    
#     if head[0] == 'http://www.inria.fr/2015/humans-instances':
        
#         triples.append((head[-1], rel[-1], tail[-1]))

# triples = [('Eve', 'type', 'Lecturer'),
#            #('Eve', 'type', 'Person'), 
#            ('Lecturer', 'subClassOf', 'Person'), 
#            #('David', 'type', 'Person'),
#            ('David', 'type', 'Researcher'),
#            ('Researcher', 'subClassOf', 'Person'),
#            ('Flora', 'hasSpouse', 'Gaston'),
#            ('Gaston', 'type', 'Person'),
#            #('Flora', 'type', 'Person'),
#           ]
# g=Graph()
# g.parse("/Users/nhalliwe/Desktop/CORESE-DATA/human-data.rdf", format="xml")
# triples = []

# for i,j,k in g:
    
#     head = str(i).split('#')
#     rel = str(j).split('#')
#     tail = str(k).split('#')
    
#     if head[0] == 'http://www.inria.fr/2015/humans-instances':
        
#         triples.append((head[-1], rel[-1], tail[-1]))

# train = np.array(triples)
# traces = utils.parse_traces(file_name='../traces/entailment.ttl')
# exp_triples = utils.get_exp_triples(train,traces)

# entities = np.unique(np.concatenate((train[:,0], train[:,2], exp_triples[:,0], exp_triples[:,2]), axis=0)).tolist()
# relations = np.unique(np.concatenate((train[:,1], exp_triples[:,1])), axis=0).tolist()

# num_entities = len(entities)
# num_relations = len(relations)

# ent2idx = dict(zip(entities, range(num_entities)))
# rel2idx = dict(zip(relations, range(num_relations)))

# idx2ent = {idx:ent for ent,idx in ent2idx.items()}
# idx2rel = {idx:rel for rel,idx in rel2idx.items()}

# train2idx = utils.train2idx(train,ent2idx,rel2idx)
# idx2train = utils.idx2train(train2idx,idx2ent,idx2rel)

In [None]:
# for i in train:
#     if tuple(i) in traces:
#         print(tuple(i),traces[tuple(i)])

In [None]:
# for h,r,t in traces[('Eve', 'type', 'Person')]:
#     print(ent2idx[h],rel2idx[r],ent2idx[t])


        
        #exp_triples.append(list(traces[tuple(i)]))

In [None]:
#idx = tf.convert_to_tensor([6,4])
#tf.concat([tf.gather_nd(train2idx,tf.where(train2idx[:,0] == idx)), tf.gather_nd(train2idx,tf.where(train2idx[:,2] == idx))], axis=0)

#tf.where(train2idx[:,0]==idx)

In [None]:
# A = np.zeros(shape=(num_entities,num_entities,num_relations))

# for h,r,t in train:
    
#     h_idx = entities.index(h)
#     r_idx = relations.index(r)
#     t_idx = entities.index(t)
    
#     A[h_idx,t_idx,r_idx] = 1

In [None]:
## valid2idx = []

# for head, rel, tail in valid:
    
#     head_idx = ent2idx[head]
#     tail_idx = ent2idx[tail]
#     rel_idx = rel2idx[rel]

#     valid2idx.append([head_idx, rel_idx, tail_idx])
    
# valid2idx = np.array(valid2idx)

In [None]:
# transE
# EMBEDDING_SIZE = 30
# BATCH_SIZE = 2
# NUM_EPOCHS = 200
# MARGIN = 2
# SQRT_SIZE = 6 / np.sqrt(EMBEDDING_SIZE)

# model = transE.build_model(
#     embedding_size=EMBEDDING_SIZE,
#     num_entities=num_entities,
#     num_relations=num_relations,
#     batch_size=BATCH_SIZE,
#     num_epochs=NUM_EPOCHS,
#     margin=MARGIN,
#     sqrt_size=SQRT_SIZE,
#     seed=SEED
#     )

In [None]:
# complex
# EMBEDDING_SIZE = 30
# BATCH_SIZE = 3
# NUM_EPOCHS = 200
# MARGIN = 1
# SQRT_SIZE = 6 / np.sqrt(EMBEDDING_SIZE)

# real_head_input = tf.keras.layers.Input(shape=(1,), name='real_head_input')
# img_head_input = tf.keras.layers.Input(shape=(1,), name='img_head_input')
# real_tail_input = tf.keras.layers.Input(shape=(1,), name='real_tail_input')
# img_tail_input = tf.keras.layers.Input(shape=(1,), name='img_tail_input')
# real_rel_input = tf.keras.layers.Input(shape=(1,), name='real_rel_input')
# img_rel_input = tf.keras.layers.Input(shape=(1,), name='img_rel_input')

# real_entity_embeddings = tf.keras.layers.Embedding(
#     input_dim=num_entities,
#     output_dim=EMBEDDING_SIZE,
#     name='real_entity_embeddings',
#     embeddings_initializer=tf.keras.initializers.RandomUniform(minval=-SQRT_SIZE, maxval=SQRT_SIZE, 
#                                                                seed=tf.random.set_seed(SEED))
#     )

# img_entity_embeddings = tf.keras.layers.Embedding(
#     input_dim=num_entities,
#     output_dim=EMBEDDING_SIZE,
#     name='img_entity_embeddings',
#     embeddings_initializer=tf.keras.initializers.RandomUniform(minval=-SQRT_SIZE, maxval=SQRT_SIZE, 
#                                                                seed=tf.random.set_seed(SEED))
#     )

# real_relation_embedding = tf.keras.layers.Embedding(
#     input_dim=num_relations,
#     output_dim=EMBEDDING_SIZE,
#     name='real_relation_embeddings',
#     embeddings_initializer=tf.keras.initializers.RandomUniform(minval=-SQRT_SIZE, maxval=SQRT_SIZE, 
#                                                                seed=tf.random.set_seed(SEED)),
#     )

# img_relation_embedding = tf.keras.layers.Embedding(
#     input_dim=num_relations,
#     output_dim=EMBEDDING_SIZE,
#     name='img_relation_embeddings',
#     embeddings_initializer=tf.keras.initializers.RandomUniform(minval=-SQRT_SIZE, maxval=SQRT_SIZE, 
#                                                                seed=tf.random.set_seed(SEED)),
#     )

# real_head = real_entity_embeddings(real_head_input)
# img_head = img_entity_embeddings(img_head_input)
# real_tail = real_entity_embeddings(real_tail_input)
# img_tail = img_entity_embeddings(img_tail_input)
# real_rel = real_relation_embedding(real_rel_input)
# img_rel = real_relation_embedding(img_rel_input)

# model = tf.keras.models.Model(
#     inputs=[
#         real_head_input,
#         img_head_input, 
#         real_tail_input, 
#         img_tail_input, 
#         real_rel_input,
#         img_rel_input
#         ], 
#     outputs=[
#         real_head,
#         img_head, 
#         real_tail, 
#         img_tail, 
#         real_rel,
#         img_rel
#         ]
#     )

In [None]:
# def get_negative_triples(head, rel, tail, num_entities, seed):
    
#     cond = tf.random.uniform(head.shape, 0, 2, dtype=tf.int64, seed=seed) #1 means keep entity
#     rnd = tf.random.uniform(head.shape, 0, num_entities-1, dtype=tf.int64, seed=seed)
    
#     neg_head = tf.where(cond == 1, head, rnd)
#     neg_tail = tf.where(cond == 1, rnd, tail)   
    
#     return neg_head, neg_tail

# train_data = tf.data.Dataset.from_tensor_slices((train2idx[:,0], train2idx[:,1], train2idx[:,2])).batch(BATCH_SIZE)
# #train_data = train_data.shuffle(buffer_size=50000, seed=tf.random.set_seed(SEED)).batch(BATCH_SIZE)

# #exp_decay = tf.keras.optimizers.schedules.ExponentialDecay(.01, 1000, .05)
# optimizer = tf.keras.optimizers.SGD(learning_rate=.001)

In [None]:
# def score(h,r,t):
#     return tf.reduce_sum(tf.square(h + r - t))

In [None]:
# losses = []

# for epoch in range(NUM_EPOCHS):
    
#     for head, rel, tail in train_data:
                
#         neg_head, neg_tail = get_negative_triples(head, rel, tail,seed=tf.random.set_seed(SEED))
        
#         with tf.GradientTape() as tape:
            
#             real_head_e,img_head_e, real_tail_e, img_tail_e, real_rel_e,img_rel_e = model([head, 
#                                                                            neg_head, tail, neg_tail, rel, rel])

            
#             dot1 = tf.reduce_sum(tf.multiply(real_rel_e, tf.multiply(real_head_e, real_tail_e)),1)
#             dot2 = tf.reduce_sum(tf.multiply(real_rel_e, tf.multiply(img_head_e, img_tail_e)),1)
#             dot3 = tf.reduce_sum(tf.multiply(img_rel_e, tf.multiply(real_head_e, img_tail_e)),1)
#             dot4 = tf.reduce_sum(tf.multiply(img_rel_e, tf.multiply(img_head_e, real_tail_e)),1)
            
#             embedding_loss = tf.reduce_sum(dot1+dot2+dot3-dot4)

#         grads = tape.gradient(embedding_loss, model.trainable_variables)
#         optimizer.apply_gradients(zip(grads,model.trainable_variables))
    
#     if not epoch % 10:
        
#         print('Current loss' , embedding_loss.numpy(),'at epoch', epoch)
    
#     losses.append(embedding_loss.numpy())
#train2idx

In [None]:
# for head, rel, tail in train_data:
#     print(head, rel, tail)

In [None]:
# losses = []

# for epoch in range(NUM_EPOCHS):
    
#     for head, rel, tail in train_data:
                
#         neg_head, neg_tail = utils.get_negative_triples(head, rel, tail,num_entities=num_entities,seed=SEED)
        
#         with tf.GradientTape() as tape:
                        
# #             pos_head_e,neg_head_e, pos_tail_e, neg_tail_e, rel_e= model([head, 
# #                                                                            neg_head, tail, neg_tail, rel])
            
#             pos_head_0,neg_head_0, pos_tail_0, neg_tail_0, rel_0 = model([head[0], 
#                                                                           neg_head[0], tail[0], 
#                                                                           neg_tail[0], rel[0]])
            
#             pos_head_1,neg_head_1, pos_tail_1, neg_tail_1, rel_1 = model([head[1], 
#                                                                           neg_head[1], tail[1], neg_tail[1], rel[1]])
            
# #             pos_head_2,neg_head_2, pos_tail_2, neg_tail_2, rel_2 = model([head[2], 
# #                                                                           neg_head[2], tail[2], neg_tail[2], rel[2]])
            
#             #pos = score(pos_head_0,rel_0, pos_tail_0) + \
#             #score(pos_head_1,rel_1, pos_tail_1) #+score(pos_head_2,rel_2, pos_tail_2)
            
#             #neg = score(neg_head_0,rel_0, neg_tail_0) + \
#             #score(neg_head_1,rel_1, neg_tail_1) #+\score(neg_head_2,rel_2, neg_tail_2)
            
            
#             pos = tf.reduce_sum(tf.square(pos_head_e + rel_e - pos_tail_e), axis=1)
#             neg = tf.reduce_sum(tf.square(neg_head_e + rel_e - neg_tail_e), axis=1)    
            
#             embedding_loss = tf.reduce_sum(tf.maximum(pos - neg + MARGIN, 0))
#             #pred_loss = tf.reduce_sum(tf.maximum(pos - neg + MARGIN, 0))
            
 
#         grads = tape.gradient(embedding_loss, model.trainable_variables)
#         optimizer.apply_gradients(zip(grads,model.trainable_variables))
    
#     if not epoch % 10:
        
#         print('Current loss' , embedding_loss.numpy(),'at epoch', epoch)
    
#     losses.append(embedding_loss.numpy())

In [None]:
#plt.plot(range(len(losses)), losses)

In [None]:
# all_names = np.concatenate((entities,relations))

# all_entities = model.get_layer('entity_embeddings').get_weights()[0]
# all_relations = model.get_layer('relation_embeddings').get_weights()[0]

# all_embeddings = np.concatenate((all_entities,all_relations))

In [None]:
# import numpy as np
# import os
# np.load(os.path.join('.','data','transE_embeddings.npz'))['entity_embeddings'].shape

In [None]:
# embeddings_2d = PCA(n_components=2, random_state=SEED).fit_transform(all_embeddings)
# fig, ax = plt.subplots(figsize=(12,12))
# ax.scatter(embeddings_2d[:,0], embeddings_2d[:,1])
# for i, txt in enumerate(all_names):
#     ax.annotate(txt, (embeddings_2d[i, 0], embeddings_2d[i, 1]))

In [None]:
#train

In [None]:
# all_names = list(all_names)
# all_names

In [None]:
# true_head = 'Eve'
# true_tail = 'Person'

# head_idx = all_names.index(true_head)
# tail_idx = all_names.index(true_tail)

In [None]:
# min_score = -100000000
# min_idx = -100000000

# for rel in relations:
    
#     rel_idx = all_names.index(rel)
    
#     current_score = -score(all_embeddings[head_idx],
#                            all_embeddings[rel_idx],
#                            all_embeddings[tail_idx]).numpy()
    
#     if current_score > min_score:
#         min_score = current_score
#         min_idx = rel_idx
        
#     print(all_names[rel_idx], current_score)

In [None]:
print(true_head, all_names[min_idx], true_tail)

In [None]:
# for h,r,t in train:
    
#     h_idx = all_names.index(h)
#     r_idx = all_names.index(r)
#     t_idx = all_names.index(t)
    
#     print(h,r,t,-score(all_embeddings[h_idx], all_embeddings[r_idx], all_embeddings[t_idx]).numpy())

In [None]:
# def closest_l2(source_head, source_rel, source_tail,k, data, entity_rel_names):
    
#     for h,r,t in data:
        
#         h_idx = entity_rel_names.index(h)
#         r_idx = entity_rel_names.index(r)
#         t_idx = entity_rel_names.index(t)
    
#     l2 = np.sqrt(np.sum((source_head - all_embeddings[h_idx])**2 + (rel- all_embeddings[r_idx])**2
#                 + (person-all_embeddings[t_idx])**2))

In [None]:
#find closest l2 triple
# eve = all_embeddings[head_idx]
# person = all_embeddings[tail_idx]
# rel = all_embeddings[min_idx]
# scores = []

# for h,r,t in train:
    
#     h_idx = all_names.index(h)
#     r_idx = all_names.index(r)
#     t_idx = all_names.index(t)
    
#     l2 = np.sqrt(np.sum((eve - all_embeddings[h_idx])**2 + (rel- all_embeddings[r_idx])**2
#                 + (person-all_embeddings[t_idx])**2))
#     scores.append(((h,r,t), l2))
#     print(h,r,t,l2)

In [None]:
# trace = set([('Eve', 'type', 'Lecturer'), ('Lecturer', 'subClassOf', 'Person')])
# gen = set([('Eve', 'type', 'Lecturer')])

In [None]:
#sorted(scores, key=lambda x:x[1])

In [None]:
# eve = all_embeddings[all_names.index('Eve')]
# person = all_embeddings[all_names.index('Person')]
# rel = all_embeddings[all_names.index('type')]

In [None]:
#head_idx, rel_idx, tail_idx = train2idx[0]

# def get_grad(head_idx, tail_idx, rel_idx, A):
    
#     with tf.GradientTape(persistent=True) as g:

# #         head_idx = tf.convert_to_tensor(head_idx)
# #         tail_idx = tf.convert_to_tensor(tail_idx)
# #         rel_idx = tf.convert_to_tensor(rel_idx)    
#         A = tf.convert_to_tensor(A)

#         head = tf.argmax(A[:,tail_idx, rel_idx])
#         tail = tf.argmax(A[head_idx,:, rel_idx])
#         rel = tf.argmax(A[head_idx,tail_idx,:])
 
#         head_e,_,tail_e,_,rel_e= model([head,head,tail,tail,rel])

#         get_score = score(head_e,rel_e,tail_e)
        
        
#     nabla = g.gradient(get_score, head_e)
#     return nabla
#get_grad(head_idx, tail_idx, rel_idx, A)

In [None]:
# import json

# with open('entity2wikidata.json','r') as f:
    
#     entities_dict = json.load(f)

# for k, d in entities_dict.items():
    
#     if 'France' in d['label']:
        
#         print(k)

# embeddings = model.get_layer('entity_embeddings').get_weights()[0]
# relations = model.get_layer('relation_embeddings').get_weights()[0]

# paris = '/m/05qtj'
# france = '/m/0f8l9c'

# paris_idx = ent2idx[paris]
# france_idx = ent2idx[france]
# capital = rel2idx['/location/country/capital']

# head, tail = embeddings[[paris_idx, france_idx], :]
# rel = relations[capital]

# #-np.linalg.norm((head+rel - tail),ord=2)

# scores = []

# for i in range(len(relations)):
    
#     temp_rel = relations[i]
    
#     score = -np.linalg.norm((head+temp_rel - tail),ord=2)
    
#     scores.append(score)

# idx2rel[np.argmax(scores)]

# for i in np.argsort(scores)[-10:]:
    
#     print(idx2rel[i])