In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import random as rn
import os
import utils
import transE
import numpy as np

In [2]:
# fb15k_237 = np.load('./data/fb15k_237.npz', allow_pickle=True)
# fb_train = fb15k_237['train']
# fb_valid = fb15k_237['valid']
# fb_test = fb15k_237['test']

In [3]:
# Seed Python hashing, NumPy, the stdlib RNG and TensorFlow with one shared value
# so repeated runs of the notebook start from the same random state.
SEED = 123
os.environ.update({'PYTHONHASHSEED': str(SEED), 'TF_DETERMINISTIC_OPS': '1'})
for seed_fn in (np.random.seed, rn.seed, tf.random.set_seed):
    seed_fn(SEED)

# Load the dataset archive: triples, their explanation triples, and the vocabularies.
data = np.load(os.path.join('.','data','royalty_spouse.npz'))

train, test = data['X_train'], data['X_test']
train_exp, test_exp = data['train_exp'], data['test_exp']

# All explanation triples (train followed by test) stacked along the first axis.
full_exp = np.concatenate([train_exp, test_exp], axis=0)

#full_train = np.concatenate((train,train_exp), axis=0)

entities, relations = (data[key].tolist() for key in ('entities', 'relations'))

NUM_ENTITIES, NUM_RELATIONS = len(entities), len(relations)

# name -> index lookups (position in the vocabulary list) ...
ent2idx = {name: idx for idx, name in enumerate(entities)}
rel2idx = {name: idx for idx, name in enumerate(relations)}

# ... and the inverse index -> name lookups.
idx2ent = {idx: name for name, idx in ent2idx.items()}
idx2rel = {idx: name for name, idx in rel2idx.items()}

In [4]:
# Convert the string triples (and their explanation triples) to index arrays with
# the project helper `utils.array2idx`, using the entity/relation lookup tables.
# NOTE(review): later cells index the explanation arrays with three axes
# (e.g. `trainexp2idx[:,:,0]`), so they are presumably (n, traces, 3) — confirm
# against `utils.array2idx`.
train2idx = utils.array2idx(train,ent2idx,rel2idx)
trainexp2idx = utils.array2idx(train_exp,ent2idx,rel2idx)

test2idx = utils.array2idx(test,ent2idx,rel2idx)
testexp2idx = utils.array2idx(test_exp,ent2idx,rel2idx)

In [None]:
# Hyper-parameters for the training run in the cells below.
EMBEDDING_SIZE = 50    # passed to transE.ExTransE as the embedding size
BATCH_SIZE = 128       # passed to model.fit(batch_size=...)
NUM_EPOCHS = 500       # passed to model.fit(epochs=...)
MARGIN = 2             # passed to model.compile(margin=...)
LEARNING_RATE = .001   # learning rate of the SGD optimizer

In [None]:
model = transE.ExTransE(NUM_ENTITIES,NUM_RELATIONS,EMBEDDING_SIZE,random_state=SEED)

In [None]:
# Configure training: plain SGD plus the two project loss functions
# (`transE.pred_loss` for the triples, `transE.exp_loss` for the explanations).
# NOTE(review): `num_entities`, `margin`, `pred_loss` and `exp_loss` are keyword
# arguments of the project's own `ExTransE.compile`; their exact use is defined in
# `transE`, not in this notebook.
model.compile(
        optimizer=tf.keras.optimizers.SGD(learning_rate=LEARNING_RATE),
        num_entities=NUM_ENTITIES,
        margin=MARGIN,
        pred_loss=transE.pred_loss,
        exp_loss=transE.exp_loss
        )

In [None]:
# Six model inputs, in order: the head / relation / tail index columns of the
# training triples, then the same three columns of the explanation triples with
# their first two axes flattened.
train_triple_inputs = [train2idx[:, col] for col in range(3)]
train_explanation_inputs = [trainexp2idx[:, :, col].flatten() for col in range(3)]

model.fit(
    x=train_triple_inputs + train_explanation_inputs,
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=False,
)

In [None]:
# Same input layout as for training, built from the test split: triple columns
# first, then the flattened explanation-triple columns.
test_triple_inputs = [test2idx[:, col] for col in range(3)]
test_explanation_inputs = [testexp2idx[:, :, col].flatten() for col in range(3)]

# The model returns one output per input, in the same order.
(test_head_e, test_rel_e, test_tail_e,
 test_exp_head_e, test_exp_rel_e, test_exp_tail_e) = model.predict(
    x=test_triple_inputs + test_explanation_inputs
)

In [None]:
top_k = 1
pred_exp = []

# The explanation embeddings were produced from `testexp2idx[:,:,c].flatten()`, so
# a row index into them is an index into the *flattened* list of explanation
# triples. Look candidates up in a matching (-1, 3) view; indexing the 3-D
# `testexp2idx` directly is only correct when there is exactly one trace per
# triple, and can go out of bounds or return whole trace groups otherwise.
flat_testexp2idx = testexp2idx.reshape(-1, 3)

for i in range(len(test2idx)):

    triple_h_e = test_head_e[i]
    triple_r_e = test_rel_e[i]
    triple_t_e = test_tail_e[i]

    # Element-wise squared differences between test triple i and every
    # explanation triple, summed over head, relation and tail embeddings.
    # NOTE(review): assumes the predicted embeddings are 2-D (rows, embedding dim),
    # as implied by the reduction over axis=1 — confirm against transE.ExTransE.
    squared_diff = (
        np.square(triple_h_e - test_exp_head_e)
        + np.square(triple_r_e - test_exp_rel_e)
        + np.square(triple_t_e - test_exp_tail_e)
    )

    l2_dist = np.sqrt(np.sum(squared_diff, axis=1))

    # Indices of the `top_k` smallest distances (closest explanation triples).
    closest_l2 = np.argsort(l2_dist)[:top_k]

    k_closest = flat_testexp2idx[closest_l2]

    pred_exp.append(k_closest)

In [None]:
# Stack the per-triple predictions into one array and force the shape (-1, 1, 3).
# NOTE(review): this assumes a single predicted triple per test triple (top_k = 1);
# with more predictions per triple the first axis would no longer line up with
# the test triples — confirm before raising top_k.
pred_exp = np.array(pred_exp).reshape(-1,1,3)

In [None]:
utils.jaccard_score(testexp2idx, pred_exp)

In [None]:
# # train_data = tf.data.Dataset.from_tensor_slices((train2idx[:,0], train2idx[:,1], train2idx[:,2],
# #                                                 exp2idx[:,0],exp2idx[:,1],exp2idx[:,2])).batch(batch_size)
# Slice the six index columns (triple head/relation/tail, then the flattened
# explanation head/relation/tail) in lock-step and group them into batches.
train_triple_columns = tuple(train2idx[:, col] for col in range(3))
train_exp_columns = tuple(trainexp2idx[:, :, col].reshape(-1) for col in range(3))

train_data = tf.data.Dataset.from_tensor_slices(
    train_triple_columns + train_exp_columns
).batch(BATCH_SIZE)
# optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)

In [None]:
# Sanity check: print the shape of each of the six tensors in every batch.
for batch in train_data:
    print(*(part.shape for part in batch))

In [None]:
# epoch_loss = []

# for epoch in range(num_epochs):

#     for pos_head, rel, pos_tail, pos_head_exp,rel_exp, pos_tail_exp in train_data:

#         neg_head, neg_tail = utils.get_negative_triples(
#             head=pos_head, 
#             rel=rel, 
#             tail=pos_tail,
#             num_entities=num_entities,
#             random_state=SEED
#             )

#         neg_head_exp, neg_tail_exp = utils.get_negative_triples(
#             head=pos_head_exp, 
#             rel=rel_exp, 
#             tail=pos_tail_exp,
#             num_entities=num_entities,
#             random_state=SEED
#             )

#         with tf.GradientTape() as tape:

#             pos_head_e, pos_tail_e, neg_head_e, neg_tail_e, rel_e = model([
#                 pos_head,
#                 pos_tail, 
#                 neg_head, 
#                 neg_tail, 
#                 rel
#                 ]
#             )

#             pos_head_exp_e, pos_tail_exp_e, _, _, rel_exp_e = model([
#                 pos_head_exp,
#                 pos_tail_exp, 
#                 tf.zeros_like(neg_head_exp),  
#                 tf.zeros_like(neg_tail_exp), 
#                 rel_exp
#                 ]
#             )

#             prediction_loss = transE.pred_loss(pos_head_e,pos_tail_e,neg_head_e,neg_tail_e,rel_e)
#             #explain_loss = transE.exp_loss(pos_head_exp_e,pos_tail_exp_e,neg_head_exp_e, neg_tail_exp_e, rel_exp_e)
#             explain_loss = transE.exp_loss(pos_head_e,pos_tail_exp_e,pos_head_exp_e,pos_tail_exp_e,rel_e,rel_exp_e)
#             #print(f"pred loss {prediction_loss}")
#             #print(f"explain loss {explain_loss}")
#             total_loss = prediction_loss + explain_loss

#         grads = tape.gradient(total_loss,model.trainable_variables)
#         optimizer.apply_gradients(zip(grads,model.trainable_variables))

#     #if not epoch % 10:
#     #print(f"Loss at epoch {epoch}: {total_loss.numpy()} ")
        
#     epoch_loss.append(np.round(total_loss.numpy(),5))

In [None]:
entity_embeddings = utils.get_entity_embeddings(model)
relation_embeddings = utils.get_relation_embeddings(model)

In [None]:

# model = transE.ExTransE(
#     num_entities=NUM_ENTITIES,
#     num_relations=NUM_RELATIONS,
#     embedding_size=EMBEDDING_SIZE,
#     margin=MARGIN,
#     random_state=SEED)
# optimizer=tf.keras.optimizers.SGD(learning_rate=LEARNING_RATE)
# train_data = tf.data.Dataset.from_tensor_slices((train2idx[:,0],train2idx[:,1],train2idx[:,2],
#                                                 trainexp2idx[:,:,0].reshape(-1),trainexp2idx[:,:,1].reshape(-1),
#                                                  trainexp2idx[:,:,2].reshape(-1))).batch(BATCH_SIZE)

# epoch_loss = []

# for epoch in range(NUM_EPOCHS):

#     for pos_head, pos_rel, pos_tail, pos_head_exp,pos_rel_exp, pos_tail_exp in train_data:

#         neg_head, neg_tail = utils.get_negative_triples(
#             head=pos_head, 
#             rel=pos_rel, 
#             tail=pos_tail,
#             num_entities=NUM_ENTITIES
#             )

#         neg_head_exp, neg_tail_exp = utils.get_negative_triples(
#             head=pos_head_exp, 
#             rel=pos_rel_exp, 
#             tail=pos_tail_exp,
#             num_entities=NUM_ENTITIES
#             )

#         with tf.GradientTape() as tape:

#             pos_head_e,pos_rel_e,pos_tail_e,pos_head_exp_e,pos_rel_exp_e,pos_tail_exp_e = model([
#                 pos_head,
#                 pos_rel,
#                 pos_tail,
#                 pos_head_exp,
#                 pos_rel_exp,
#                 pos_tail_exp
#                 ]
#             )

#             neg_head_e,neg_rel_e,neg_tail_e,neg_head_exp_e,neg_rel_exp_e,neg_tail_exp_e = model([
#                 neg_head,
#                 pos_rel,#pos_rel is correct, 
#                 neg_tail,
#                 neg_head_exp,
#                 pos_rel_exp,
#                 neg_tail_exp
#                 ]
#             )

#             prediction_loss = transE.pred_loss(
#                 pos_head_e,
#                 pos_rel_e,
#                 pos_tail_e,
#                 neg_head_e,
#                 neg_rel_e,
#                 neg_tail_e,
#                 margin=MARGIN
#             )

#             explain_loss = transE.exp_loss(
#                 pos_head_exp_e,
#                 pos_rel_exp_e,
#                 pos_tail_exp_e,
#                 neg_head_exp_e,
#                 neg_rel_exp_e,
#                 neg_tail_exp_e,
#                 margin=MARGIN
#             )

#             total_loss = prediction_loss + explain_loss

#         grads = tape.gradient(total_loss,model.trainable_variables)
#         optimizer.apply_gradients(zip(grads,model.trainable_variables))

#     epoch_loss.append(np.round(total_loss.numpy(),5))

# print(np.mean(epoch_loss))

In [None]:
# top_k = transE.exp_score(test2idx[0],k=2,data=test2idx,entity_embeddings=entity_embeddings,
#                          relation_embeddings=relation_embeddings)

In [None]:
# top_k

In [None]:
# top_k[:,0]

In [None]:
# def exp_score(triple,k,data,entity_embeddings,relation_embeddings):
    
#     triple_h_e = entity_embeddings[triple[0]]
#     triple_r_e = relation_embeddings[triple[1]]
#     triple_t_e = entity_embeddings[triple[2]]

#     h_e = entity_embeddings[data[:,0]]
#     r_e = relation_embeddings[data[:,1]]
#     t_e = entity_embeddings[data[:,2]]

#     squared_diff = np.square(triple_h_e - h_e) + np.square(triple_r_e-r_e) + np.square(triple_t_e-t_e)

#     l2_dist = np.sqrt(np.sum(squared_diff,axis=1))

#     closest_l2 = np.argsort(l2_dist)[:k]
    
#     return data[closest_l2]

# TODO: account for padding.
pred_exp = []

for i in range(len(test2idx)):

    # Index rows [head, relation, tail] returned by the project helper for
    # test triple i (k=1 requested).
    top_k = transE.exp_score(
        test2idx[i],
        k=1,
        data=testexp2idx,
        entity_embeddings=entity_embeddings,
        relation_embeddings=relation_embeddings,
    )

    # Translate each index column back to names.
    heads = np.array([idx2ent[ent] for ent in top_k[:, 0]])
    rels = np.array([idx2rel[rel] for rel in top_k[:, 1]])
    tails = np.array([idx2ent[ent] for ent in top_k[:, 2]])

    # One [head, relation, tail] list per returned row.
    pred = [list(row) for row in np.column_stack((heads, rels, tails))]

    pred_exp.append(pred)

#utils.jaccard_score(test_exp,pred_exp)

In [None]:
# Hand-written reference explanations, compared against `pred_exp[0:3]` below.
# The entries hold a different number of triples (2, 2 and 1), so the outer array
# is ragged: it must be created with dtype=object. Without it NumPy >= 1.24 raises
# ValueError (inhomogeneous shape) and older versions emit a deprecation warning.
true_exp = np.array([
    np.array([['John_II_of_France', 'spouse', 'Bonne_of_Bohemia'],['Nanda_Bayin', 'spouse', 'Min_Htwe_of_Toungoo']]),
    np.array([['Keelikōlani', 'spouse', 'Leleiohoku_I'],['Saliha_Sultan', 'spouse', 'Mustafa_II']]),
    np.array([['Kalanipauahi', 'spouse', 'Kamehameha_II']])
], dtype=object)

#true_exp = test_exp[0:4]
#pred_exp[0:5]
true_exp

In [None]:
#test_exp[0:5]
pred_exp[0:3]

In [None]:
# import utils
# utils.jaccard_score(true_exp,pred_exp[0:3])
def jaccard_score(true_exp,pred_exp):
    """Mean Jaccard similarity between true and predicted explanation traces.

    For every position ``i`` the triples in ``true_exp[i]`` and ``pred_exp[i]``
    are treated as sets and scored as ``|intersection| / |union|``. The result
    therefore does not depend on the order of the triples, on whether an entry
    is a list or an ndarray, or on how many triples each entry holds.

    Parameters
    ----------
    true_exp : sequence
        One entry per example; each entry is a single triple or a 2-D
        collection of triples (list or ndarray).
    pred_exp : sequence
        Predicted triples, same length and entry layout as ``true_exp``.

    Returns
    -------
    numpy.float64
        Mean of the per-example scores. The per-example scores are also
        printed, as before.

    Raises
    ------
    AssertionError
        If ``true_exp`` and ``pred_exp`` have different lengths.

    Notes
    -----
    An example where both entries are empty scores 1.0 (identical empty sets).
    NOTE(review): padding triples, if the inputs contain any, are counted like
    ordinary triples — confirm whether they should be filtered out first.
    """

    assert len(true_exp) == len(pred_exp)

    def _triple_set(traces):
        # Normalise one entry (single triple, list of triples or ndarray) to a
        # set of plain-Python tuples so membership is order independent.
        rows = np.atleast_1d(np.asarray(traces))
        if rows.size == 0:
            return set()
        return {tuple(row) for row in rows.reshape(-1, rows.shape[-1]).tolist()}

    scores = []

    for true_i, pred_i in zip(true_exp, pred_exp):

        true_set = _triple_set(true_i)
        pred_set = _triple_set(pred_i)

        union_size = len(true_set | pred_set)

        if union_size == 0:
            # Nothing expected and nothing predicted: treat as a perfect match
            # instead of dividing by zero.
            scores.append(1.0)
            continue

        scores.append(len(true_set & pred_set) / union_size)

    print(scores)
    return np.mean(scores)

jaccard_score(true_exp,pred_exp[0:3])

In [None]:
# TODO: decide which data to use for explanation-triple ranking.

# scores = []

# for i in range(len(pred_exp[0:3])):
    
#     pred_i = pred_exp[i]
#     true_i = true_exp[i]
    
#     num_true_traces = min(true_i.ndim,true_i.shape[0])
    
#     if isinstance(pred_i,np.ndarray):
#         num_pred_traces = pred_i.ndim
        
#     elif isinstance(pred_i,list):
#         num_pred_traces = len(pred_i)
    
#     bool_array = (pred_i == true_i)

#     count = 0
#     print(pred_i,true_i)
#     for row in bool_array:
#         print(row)
#         if row.all():
#             count += 1
#     score = count / (num_true_traces+num_pred_traces-count)
#     scores.append(score)
        
#     if (num_pred_traces > num_true_traces):

#         count = 0
        
#         for j in range(num_true_traces):

#             for k in range(num_pred_traces):

#                 if j != k:
       
#                     if (true_i[j]==pred_i[k]).all():

#                         count +=1

#         score = count/num_pred_traces
#     elif (num_pred_traces < num_true_traces):
#         print('still no')
#         pass
#     elif (num_pred_traces == num_true_traces):
        
#         if (num_true_traces==1) and (num_pred_traces==1):
#             if (pred_i==true_i).all():
#                 score = 1
#             else:
#                 score = 0
#         else:
#             count = 0
            
#             for j in range(num_true_traces):
                
#                 if (pred_i[j] == true_i[j]).all():
                    
#                     count += 1
                    
#             score = count/num_true_traces
            
#     scores.append(score)

#     if (num_true_traces > 1) and (num_pred_traces > 1):
        
#         count = 0
        
#         if (num_true_traces > num_pred_traces):
            
#             for j in range(num_true_traces):

#                 for k in range(num_true_traces):

#                     if j != k:
#                         print('all pred', pred_i)
#                         print('pred',pred_i[j])
#                         print('true',test_exp[i][k])
#                         if (pred_i[j]==test_exp[i][k]).all():

#                             count +=1

#             score = count/num_traces

#             scores.append(score)
            
#         elif (num_true_traces < num_pred_traces):
        
#         else:

In [None]:
# NOTE(review): no live cell above binds `scores` at notebook scope — it is only a
# local variable inside `jaccard_score` and a name used in commented-out cells —
# so this cell likely raises NameError on a fresh kernel.
scores

In [None]:
pred_exp[1]

In [None]:
# NOTE(review): `i` is whatever value the most recently executed `for i in ...`
# loop left behind, so the entry shown here depends on cell execution order.
test_exp[i]

In [None]:


# data = np.load('./data/human_data.npz')
# train = data['X']

# g=Graph()
# g.parse("../CORESE-DATA/human-data.rdf", format="xml")

# triples = []

# for i,j,k in g:
    
#     head = str(i).split('#')
#     rel = str(j).split('#')
#     tail = str(k).split('#')
    
#     if head[0] == 'http://www.inria.fr/2015/humans-instances':
        
#         triples.append((head[-1], rel[-1], tail[-1]))

# triples = [('Eve', 'type', 'Lecturer'),
#            #('Eve', 'type', 'Person'), 
#            ('Lecturer', 'subClassOf', 'Person'), 
#            #('David', 'type', 'Person'),
#            ('David', 'type', 'Researcher'),
#            ('Researcher', 'subClassOf', 'Person'),
#            ('Flora', 'hasSpouse', 'Gaston'),
#            ('Gaston', 'type', 'Person'),
#            #('Flora', 'type', 'Person'),
#           ]
# g=Graph()
# g.parse("/Users/nhalliwe/Desktop/CORESE-DATA/human-data.rdf", format="xml")
# triples = []

# for i,j,k in g:
    
#     head = str(i).split('#')
#     rel = str(j).split('#')
#     tail = str(k).split('#')
    
#     if head[0] == 'http://www.inria.fr/2015/humans-instances':
        
#         triples.append((head[-1], rel[-1], tail[-1]))

# train = np.array(triples)
# traces = utils.parse_traces(file_name='../traces/entailment.ttl')
# exp_triples = utils.get_exp_triples(train,traces)

# entities = np.unique(np.concatenate((train[:,0], train[:,2], exp_triples[:,0], exp_triples[:,2]), axis=0)).tolist()
# relations = np.unique(np.concatenate((train[:,1], exp_triples[:,1])), axis=0).tolist()

# num_entities = len(entities)
# num_relations = len(relations)

# ent2idx = dict(zip(entities, range(num_entities)))
# rel2idx = dict(zip(relations, range(num_relations)))

# idx2ent = {idx:ent for ent,idx in ent2idx.items()}
# idx2rel = {idx:rel for rel,idx in rel2idx.items()}

# train2idx = utils.train2idx(train,ent2idx,rel2idx)
# idx2train = utils.idx2train(train2idx,idx2ent,idx2rel)

# def get_neighbor_idx(A):

#     A = sparse.coo_matrix(A)

#     indices = {}

#     for i,j in zip(A.row,A.col):

#         if i in indices:
#             indices[i].append(j)

#         else:
#             indices[i] = [j]

#     return indices

In [None]:
# for i in train:
#     if tuple(i) in traces:
#         print(tuple(i),traces[tuple(i)])

In [None]:
# for h,r,t in traces[('Eve', 'type', 'Person')]:
#     print(ent2idx[h],rel2idx[r],ent2idx[t])


        
        #exp_triples.append(list(traces[tuple(i)]))

In [None]:
#idx = tf.convert_to_tensor([6,4])
#tf.concat([tf.gather_nd(train2idx,tf.where(train2idx[:,0] == idx)), tf.gather_nd(train2idx,tf.where(train2idx[:,2] == idx))], axis=0)

#tf.where(train2idx[:,0]==idx)

In [None]:
# A = np.zeros(shape=(num_entities,num_entities,num_relations))

# for h,r,t in train:
    
#     h_idx = entities.index(h)
#     r_idx = relations.index(r)
#     t_idx = entities.index(t)
    
#     A[h_idx,t_idx,r_idx] = 1

In [None]:
## valid2idx = []

# for head, rel, tail in valid:
    
#     head_idx = ent2idx[head]
#     tail_idx = ent2idx[tail]
#     rel_idx = rel2idx[rel]

#     valid2idx.append([head_idx, rel_idx, tail_idx])
    
# valid2idx = np.array(valid2idx)

In [None]:
# transE
# EMBEDDING_SIZE = 30
# BATCH_SIZE = 2
# NUM_EPOCHS = 200
# MARGIN = 2
# SQRT_SIZE = 6 / np.sqrt(EMBEDDING_SIZE)

# model = transE.build_model(
#     embedding_size=EMBEDDING_SIZE,
#     num_entities=num_entities,
#     num_relations=num_relations,
#     batch_size=BATCH_SIZE,
#     num_epochs=NUM_EPOCHS,
#     margin=MARGIN,
#     sqrt_size=SQRT_SIZE,
#     seed=SEED
#     )

In [None]:
# complex
# EMBEDDING_SIZE = 30
# BATCH_SIZE = 3
# NUM_EPOCHS = 200
# MARGIN = 1
# SQRT_SIZE = 6 / np.sqrt(EMBEDDING_SIZE)

# real_head_input = tf.keras.layers.Input(shape=(1,), name='real_head_input')
# img_head_input = tf.keras.layers.Input(shape=(1,), name='img_head_input')
# real_tail_input = tf.keras.layers.Input(shape=(1,), name='real_tail_input')
# img_tail_input = tf.keras.layers.Input(shape=(1,), name='img_tail_input')
# real_rel_input = tf.keras.layers.Input(shape=(1,), name='real_rel_input')
# img_rel_input = tf.keras.layers.Input(shape=(1,), name='img_rel_input')

# real_entity_embeddings = tf.keras.layers.Embedding(
#     input_dim=num_entities,
#     output_dim=EMBEDDING_SIZE,
#     name='real_entity_embeddings',
#     embeddings_initializer=tf.keras.initializers.RandomUniform(minval=-SQRT_SIZE, maxval=SQRT_SIZE, 
#                                                                seed=tf.random.set_seed(SEED))
#     )

# img_entity_embeddings = tf.keras.layers.Embedding(
#     input_dim=num_entities,
#     output_dim=EMBEDDING_SIZE,
#     name='img_entity_embeddings',
#     embeddings_initializer=tf.keras.initializers.RandomUniform(minval=-SQRT_SIZE, maxval=SQRT_SIZE, 
#                                                                seed=tf.random.set_seed(SEED))
#     )

# real_relation_embedding = tf.keras.layers.Embedding(
#     input_dim=num_relations,
#     output_dim=EMBEDDING_SIZE,
#     name='real_relation_embeddings',
#     embeddings_initializer=tf.keras.initializers.RandomUniform(minval=-SQRT_SIZE, maxval=SQRT_SIZE, 
#                                                                seed=tf.random.set_seed(SEED)),
#     )

# img_relation_embedding = tf.keras.layers.Embedding(
#     input_dim=num_relations,
#     output_dim=EMBEDDING_SIZE,
#     name='img_relation_embeddings',
#     embeddings_initializer=tf.keras.initializers.RandomUniform(minval=-SQRT_SIZE, maxval=SQRT_SIZE, 
#                                                                seed=tf.random.set_seed(SEED)),
#     )

# real_head = real_entity_embeddings(real_head_input)
# img_head = img_entity_embeddings(img_head_input)
# real_tail = real_entity_embeddings(real_tail_input)
# img_tail = img_entity_embeddings(img_tail_input)
# real_rel = real_relation_embedding(real_rel_input)
# img_rel = real_relation_embedding(img_rel_input)

# model = tf.keras.models.Model(
#     inputs=[
#         real_head_input,
#         img_head_input, 
#         real_tail_input, 
#         img_tail_input, 
#         real_rel_input,
#         img_rel_input
#         ], 
#     outputs=[
#         real_head,
#         img_head, 
#         real_tail, 
#         img_tail, 
#         real_rel,
#         img_rel
#         ]
#     )

In [None]:
# def get_negative_triples(head, rel, tail, num_entities, seed):
    
#     cond = tf.random.uniform(head.shape, 0, 2, dtype=tf.int64, seed=seed) #1 means keep entity
#     rnd = tf.random.uniform(head.shape, 0, num_entities-1, dtype=tf.int64, seed=seed)
    
#     neg_head = tf.where(cond == 1, head, rnd)
#     neg_tail = tf.where(cond == 1, rnd, tail)   
    
#     return neg_head, neg_tail

# train_data = tf.data.Dataset.from_tensor_slices((train2idx[:,0], train2idx[:,1], train2idx[:,2])).batch(BATCH_SIZE)
# #train_data = train_data.shuffle(buffer_size=50000, seed=tf.random.set_seed(SEED)).batch(BATCH_SIZE)

# #exp_decay = tf.keras.optimizers.schedules.ExponentialDecay(.01, 1000, .05)
# optimizer = tf.keras.optimizers.SGD(learning_rate=.001)

In [None]:
# def score(h,r,t):
#     return tf.reduce_sum(tf.square(h + r - t))

In [None]:
# losses = []

# for epoch in range(NUM_EPOCHS):
    
#     for head, rel, tail in train_data:
                
#         neg_head, neg_tail = get_negative_triples(head, rel, tail,seed=tf.random.set_seed(SEED))
        
#         with tf.GradientTape() as tape:
            
#             real_head_e,img_head_e, real_tail_e, img_tail_e, real_rel_e,img_rel_e = model([head, 
#                                                                            neg_head, tail, neg_tail, rel, rel])

            
#             dot1 = tf.reduce_sum(tf.multiply(real_rel_e, tf.multiply(real_head_e, real_tail_e)),1)
#             dot2 = tf.reduce_sum(tf.multiply(real_rel_e, tf.multiply(img_head_e, img_tail_e)),1)
#             dot3 = tf.reduce_sum(tf.multiply(img_rel_e, tf.multiply(real_head_e, img_tail_e)),1)
#             dot4 = tf.reduce_sum(tf.multiply(img_rel_e, tf.multiply(img_head_e, real_tail_e)),1)
            
#             embedding_loss = tf.reduce_sum(dot1+dot2+dot3-dot4)

#         grads = tape.gradient(embedding_loss, model.trainable_variables)
#         optimizer.apply_gradients(zip(grads,model.trainable_variables))
    
#     if not epoch % 10:
        
#         print('Current loss' , embedding_loss.numpy(),'at epoch', epoch)
    
#     losses.append(embedding_loss.numpy())
#train2idx

In [None]:
# for head, rel, tail in train_data:
#     print(head, rel, tail)

In [None]:
# losses = []

# for epoch in range(NUM_EPOCHS):
    
#     for head, rel, tail in train_data:
                
#         neg_head, neg_tail = utils.get_negative_triples(head, rel, tail,num_entities=num_entities,seed=SEED)
        
#         with tf.GradientTape() as tape:
                        
# #             pos_head_e,neg_head_e, pos_tail_e, neg_tail_e, rel_e= model([head, 
# #                                                                            neg_head, tail, neg_tail, rel])
            
#             pos_head_0,neg_head_0, pos_tail_0, neg_tail_0, rel_0 = model([head[0], 
#                                                                           neg_head[0], tail[0], 
#                                                                           neg_tail[0], rel[0]])
            
#             pos_head_1,neg_head_1, pos_tail_1, neg_tail_1, rel_1 = model([head[1], 
#                                                                           neg_head[1], tail[1], neg_tail[1], rel[1]])
            
# #             pos_head_2,neg_head_2, pos_tail_2, neg_tail_2, rel_2 = model([head[2], 
# #                                                                           neg_head[2], tail[2], neg_tail[2], rel[2]])
            
#             #pos = score(pos_head_0,rel_0, pos_tail_0) + \
#             #score(pos_head_1,rel_1, pos_tail_1) #+score(pos_head_2,rel_2, pos_tail_2)
            
#             #neg = score(neg_head_0,rel_0, neg_tail_0) + \
#             #score(neg_head_1,rel_1, neg_tail_1) #+\score(neg_head_2,rel_2, neg_tail_2)
            
            
#             pos = tf.reduce_sum(tf.square(pos_head_e + rel_e - pos_tail_e), axis=1)
#             neg = tf.reduce_sum(tf.square(neg_head_e + rel_e - neg_tail_e), axis=1)    
            
#             embedding_loss = tf.reduce_sum(tf.maximum(pos - neg + MARGIN, 0))
#             #pred_loss = tf.reduce_sum(tf.maximum(pos - neg + MARGIN, 0))
            
 
#         grads = tape.gradient(embedding_loss, model.trainable_variables)
#         optimizer.apply_gradients(zip(grads,model.trainable_variables))
    
#     if not epoch % 10:
        
#         print('Current loss' , embedding_loss.numpy(),'at epoch', epoch)
    
#     losses.append(embedding_loss.numpy())

In [None]:
#plt.plot(range(len(losses)), losses)

In [None]:
# all_names = np.concatenate((entities,relations))

# all_entities = model.get_layer('entity_embeddings').get_weights()[0]
# all_relations = model.get_layer('relation_embeddings').get_weights()[0]

# all_embeddings = np.concatenate((all_entities,all_relations))

In [None]:
# import numpy as np
# import os
# np.load(os.path.join('.','data','transE_embeddings.npz'))['entity_embeddings'].shape

In [None]:
# embeddings_2d = PCA(n_components=2, random_state=SEED).fit_transform(all_embeddings)
# fig, ax = plt.subplots(figsize=(12,12))
# ax.scatter(embeddings_2d[:,0], embeddings_2d[:,1])
# for i, txt in enumerate(all_names):
#     ax.annotate(txt, (embeddings_2d[i, 0], embeddings_2d[i, 1]))

In [None]:
#train

In [None]:
# all_names = list(all_names)
# all_names

In [None]:
# true_head = 'Eve'
# true_tail = 'Person'

# head_idx = all_names.index(true_head)
# tail_idx = all_names.index(true_tail)

In [None]:
# min_score = -100000000
# min_idx = -100000000

# for rel in relations:
    
#     rel_idx = all_names.index(rel)
    
#     current_score = -score(all_embeddings[head_idx],
#                            all_embeddings[rel_idx],
#                            all_embeddings[tail_idx]).numpy()
    
#     if current_score > min_score:
#         min_score = current_score
#         min_idx = rel_idx
        
#     print(all_names[rel_idx], current_score)

In [None]:
#print(true_head, all_names[min_idx], true_tail)

In [None]:
# for h,r,t in train:
    
#     h_idx = all_names.index(h)
#     r_idx = all_names.index(r)
#     t_idx = all_names.index(t)
    
#     print(h,r,t,-score(all_embeddings[h_idx], all_embeddings[r_idx], all_embeddings[t_idx]).numpy())

In [None]:
# def closest_l2(source_head, source_rel, source_tail,k, data, entity_rel_names):
    
#     for h,r,t in data:
        
#         h_idx = entity_rel_names.index(h)
#         r_idx = entity_rel_names.index(r)
#         t_idx = entity_rel_names.index(t)
    
#     l2 = np.sqrt(np.sum((source_head - all_embeddings[h_idx])**2 + (rel- all_embeddings[r_idx])**2
#                 + (person-all_embeddings[t_idx])**2))

In [None]:
#find closest l2 triple
# eve = all_embeddings[head_idx]
# person = all_embeddings[tail_idx]
# rel = all_embeddings[min_idx]
# scores = []

# for h,r,t in train:
    
#     h_idx = all_names.index(h)
#     r_idx = all_names.index(r)
#     t_idx = all_names.index(t)
    
#     l2 = np.sqrt(np.sum((eve - all_embeddings[h_idx])**2 + (rel- all_embeddings[r_idx])**2
#                 + (person-all_embeddings[t_idx])**2))
#     scores.append(((h,r,t), l2))
#     print(h,r,t,l2)

In [None]:
# trace = set([('Eve', 'type', 'Lecturer'), ('Lecturer', 'subClassOf', 'Person')])
# gen = set([('Eve', 'type', 'Lecturer')])

In [None]:
#sorted(scores, key=lambda x:x[1])

In [None]:
# eve = all_embeddings[all_names.index('Eve')]
# person = all_embeddings[all_names.index('Person')]
# rel = all_embeddings[all_names.index('type')]

In [None]:
#head_idx, rel_idx, tail_idx = train2idx[0]

# def get_grad(head_idx, tail_idx, rel_idx, A):
    
#     with tf.GradientTape(persistent=True) as g:

# #         head_idx = tf.convert_to_tensor(head_idx)
# #         tail_idx = tf.convert_to_tensor(tail_idx)
# #         rel_idx = tf.convert_to_tensor(rel_idx)    
#         A = tf.convert_to_tensor(A)

#         head = tf.argmax(A[:,tail_idx, rel_idx])
#         tail = tf.argmax(A[head_idx,:, rel_idx])
#         rel = tf.argmax(A[head_idx,tail_idx,:])
 
#         head_e,_,tail_e,_,rel_e= model([head,head,tail,tail,rel])

#         get_score = score(head_e,rel_e,tail_e)
        
        
#     nabla = g.gradient(get_score, head_e)
#     return nabla
#get_grad(head_idx, tail_idx, rel_idx, A)

In [None]:
# import json

# with open('entity2wikidata.json','r') as f:
    
#     entities_dict = json.load(f)

# for k, d in entities_dict.items():
    
#     if 'France' in d['label']:
        
#         print(k)

# embeddings = model.get_layer('entity_embeddings').get_weights()[0]
# relations = model.get_layer('relation_embeddings').get_weights()[0]

# paris = '/m/05qtj'
# france = '/m/0f8l9c'

# paris_idx = ent2idx[paris]
# france_idx = ent2idx[france]
# capital = rel2idx['/location/country/capital']

# head, tail = embeddings[[paris_idx, france_idx], :]
# rel = relations[capital]

# #-np.linalg.norm((head+rel - tail),ord=2)

# scores = []

# for i in range(len(relations)):
    
#     temp_rel = relations[i]
    
#     score = -np.linalg.norm((head+temp_rel - tail),ord=2)
    
#     scores.append(score)

# idx2rel[np.argmax(scores)]

# for i in np.argsort(scores)[-10:]:
    
#     print(idx2rel[i])

In [None]:
import numpy as np
import random as rn
import os
import utils
from sklearn.model_selection import train_test_split
import tensorflow as tf
import transE

# Seed every source of randomness, matching the first setup cell of this notebook:
# Python hashing, NumPy, the stdlib RNG and TensorFlow (imported just above).
# Previously TensorFlow was left unseeded here, so starting the notebook from this
# cell gave a different random state than starting from the top.
SEED = 123
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['TF_DETERMINISTIC_OPS'] = '1'
np.random.seed(SEED)
rn.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Load the dataset archive: triples, their explanation triples, and the vocabularies.
data = np.load(os.path.join('.','data','royalty_spouse.npz'))

train, test = data['X_train'], data['X_test']
train_exp, test_exp = data['train_exp'], data['test_exp']

# Training triples followed by the training explanation triples flattened to rows of 3.
full_train = np.concatenate([train, train_exp.reshape(-1,3)], axis=0)
#full_test = np.concatenate((test,test_exp.reshape(-1,3)), axis=0)
#full_data = np.concatenate((full_train,full_test), axis=0)

entities, relations = (data[key].tolist() for key in ('entities', 'relations'))

num_entities, num_relations = len(entities), len(relations)

# name -> index lookups (position in the vocabulary list).
ent2idx = {name: idx for idx, name in enumerate(entities)}
rel2idx = {name: idx for idx, name in enumerate(relations)}

#idx2ent = {idx:ent for ent,idx in ent2idx.items()}
#idx2rel = {idx:rel for rel,idx in rel2idx.items()}

# Index versions of the triples and of their explanation triples.
train2idx = utils.array2idx(train, ent2idx, rel2idx)
test2idx = utils.array2idx(test, ent2idx, rel2idx)

trainexp2idx = utils.array2idx(train_exp, ent2idx, rel2idx)
testexp2idx = utils.array2idx(test_exp, ent2idx, rel2idx)

In [None]:
# Vocabulary sizes (same values as `num_entities` / `num_relations` computed in
# the data-loading cell) and hyper-parameters for the model built below.
NUM_ENTITIES = len(entities)
NUM_RELATIONS = len(relations)
EMBEDDING_SIZE = 50    # passed to transE.ExTransE(embedding_size=...)
BATCH_SIZE = 128
NUM_EPOCHS = 10
MARGIN = 2             # passed to transE.ExTransE(margin=...)
LEARNING_RATE = .001   # learning rate of the SGD optimizer

In [None]:
# SGD optimizer instance; it is created here but not handed to the model in this cell.
optimizer = tf.keras.optimizers.SGD(learning_rate=LEARNING_RATE)

# Size the relation embedding table from the loaded vocabulary (NUM_RELATIONS)
# rather than a hard-coded 2, so every relation index produced by `rel2idx` has a
# row and the model stays in sync with the data.
model = transE.ExTransE(
    num_entities=NUM_ENTITIES,
    num_relations=NUM_RELATIONS,
    embedding_size=EMBEDDING_SIZE,
    margin=MARGIN,
    random_state=SEED)

In [None]:
# model = ExTransE(
#     num_entities=NUM_ENTITIES,
#     num_relations=NUM_RELATIONS,
#     embedding_size=EMBEDDING_SIZE,
#     margin=MARGIN,
#     random_state=SEED)
#model.compile(optimizer=optimizer)

In [None]:
# Show the summary of the ExTransE model built above.
model.summary()

In [None]:
# Six parallel components: head / relation / tail ids of the training triples,
# followed by head / relation / tail ids of the flattened explanation traces.
# NOTE(review): the batch size is hard-coded to 3 here rather than BATCH_SIZE.
train_data = tf.data.Dataset.from_tensor_slices(
    tuple(train2idx[:, col] for col in range(3))
    + tuple(trainexp2idx[:, :, col].reshape(-1) for col in range(3))
).batch(3)

train_data

In [None]:
# MAX_PADDING = 2
# spouse_triples,spouse_traces = utils.parse_ttl(os.path.join('.','data','traces','spouse.ttl'),
#                                               max_padding=MAX_PADDING)

In [None]:
# train, test, train_exp, test_exp = train_test_split(spouse_triples,
#     spouse_traces,test_size=0.30, random_state=42)

In [None]:
# exp_entities = np.array([[spouse_traces[:,i,:][:,0],
#     spouse_traces[:,i,:][:,2]] for i in range(MAX_PADDING)]).flatten()

# exp_relations = np.array([spouse_traces[:,i,:][:,1] for i in range(MAX_PADDING)]).flatten()

# entities = np.unique(np.concatenate([spouse_triples[:,0], spouse_triples[:,2], exp_entities],axis=0))
# relations = np.unique(np.concatenate([spouse_triples[:,1], exp_relations],axis=0))

In [None]:
# num_entities = len(entities)
# num_relations = len(relations)

# ent2idx = dict(zip(entities, range(num_entities)))
# rel2idx = dict(zip(relations, range(num_relations)))

# idx2ent = {idx:ent for ent,idx in ent2idx.items()}
# idx2rel = {idx:rel for rel,idx in rel2idx.items()}

# train2idx = utils.array2idx(train,ent2idx,rel2idx)
# test2idx = utils.array2idx(test,ent2idx,rel2idx)

# trainexp2idx = utils.array2idx(train_exp,ent2idx,rel2idx)
# testexp2idx = utils.array2idx(test_exp,ent2idx,rel2idx)

In [None]:
# train_data = tf.data.Dataset.from_tensor_slices((train2idx[:,0],train2idx[:,1],train2idx[:,2],
#                                                  tuple((((traces2idx[:,i,:][:,0],traces2idx[:,i,:][:,1],
#                                                           traces2idx[:,i,:][:,2]) for i in range(MAX_PADDING)))))).batch(2)

# train_data = tf.data.Dataset.from_tensor_slices((train2idx[:,0],train2idx[:,1],train2idx[:,2],
#                                                 trainexp2idx[:,:,0],trainexp2idx[:,:,1],trainexp2idx[:,:,2])).batch(3)

# for h,r,t,he,re,te in train_data:
#     print(h,r,t,he,re,te)
#     break

In [None]:
# embedding_size = 50
# batch_size = 128
# margin = 2
# learning_rate = .001

In [None]:
#optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
#model = transE.ExTransE(num_entities,num_relations,embedding_size,random_state=SEED)

In [None]:
# epoch_loss = []

# for epoch in range(num_epochs):

#     for pos_head, rel, pos_tail, pos_head_exp,rel_exp, pos_tail_exp in train_data:

#         neg_head, neg_tail = utils.get_negative_triples(
#             head=pos_head, 
#             rel=rel, 
#             tail=pos_tail,
#             num_entities=num_entities
#             )

#         neg_head_exp, neg_tail_exp = utils.get_negative_triples(
#             head=pos_head_exp, 
#             rel=rel_exp, 
#             tail=pos_tail_exp,
#             num_entities=num_entities
#             )

#         with tf.GradientTape() as tape:

#             pos_head_e, pos_tail_e, neg_head_e, neg_tail_e, rel_e = model([
#                 pos_head,
#                 pos_tail, 
#                 neg_head, 
#                 neg_tail, 
#                 rel
#                 ]
#             )

#             pos_head_exp_e, pos_tail_exp_e, neg_head_exp_e, neg_tail_exp_e, rel_exp_e = model([
#                 pos_head_exp,
#                 pos_tail_exp, 
#                 neg_head_exp,  
#                 neg_tail_exp, 
#                 rel_exp
#                 ]
#             )

#             prediction_loss = transE.pred_loss(pos_head_e,pos_tail_e,neg_head_e,neg_tail_e,rel_e)
#             #explain_loss = transE.exp_loss(pos_head_exp_e,pos_tail_exp_e,neg_head_exp_e, neg_tail_exp_e, rel_exp_e)
            
            
#             #explain_loss = transE.exp_loss(pos_head_e,pos_tail_exp_e,pos_head_exp_e,pos_tail_exp_e,rel_e,rel_exp_e)
#             explain_loss = transE.exp_loss(pos_head_exp_e,
#                                            pos_tail_exp_e,
#                                            neg_head_exp_e,
#                                            neg_tail_exp_e,
#                                            rel_exp_e,
#                                            rel_exp_e)
#             #print(f"pred loss {prediction_loss}")
#             #print(f"explain loss {explain_loss}")
#             total_loss = prediction_loss + explain_loss

#         grads = tape.gradient(total_loss,model.trainable_variables)
#         optimizer.apply_gradients(zip(grads,model.trainable_variables))

#     #if not epoch % 10:
#     #print(f"Loss at epoch {epoch}: {total_loss.numpy()} ")
        
#     epoch_loss.append(np.round(total_loss.numpy(),5))

In [None]:
# Pull the entity and relation embedding tables out of the model via the
# project helpers; they are indexed by integer id in the cells below.
# NOTE(review): no active training cell is visible between model construction
# and this point in this part of the notebook -- confirm the model is trained
# before these are read.
entity_embeddings = utils.get_entity_embeddings(model)
relation_embeddings = utils.get_relation_embeddings(model)

In [None]:
# Display the index-encoded test triples.
test2idx

In [None]:
# For every test triple, predict its explanation as the top_k candidate
# explanation triples whose (head, relation, tail) embeddings are closest in
# L2 distance to the triple's own embeddings.
top_k = 1
pred_exp = []

# Candidate pool: every explanation triple of the test set, one row per triple.
# Distances below are computed over this flattened pool, so the winners must be
# looked up in the same flattened array (indexing the 3-D testexp2idx with
# these positions picks the wrong rows or runs out of bounds).
candidate_triples = testexp2idx.reshape(-1,3)

# Candidate embeddings do not depend on the query triple: compute them once.
h_e = entity_embeddings[candidate_triples[:,0]]
r_e = relation_embeddings[candidate_triples[:,1]]
t_e = entity_embeddings[candidate_triples[:,2]]

for i in range(len(testexp2idx)):

    h_idx, r_idx, t_idx = test2idx[i]

    triple_h_e = entity_embeddings[h_idx]
    triple_r_e = relation_embeddings[r_idx]
    triple_t_e = entity_embeddings[t_idx]

    # Element-wise squared differences, broadcast against every candidate.
    squared_diff = np.square(triple_h_e - h_e) + np.square(triple_r_e-r_e) + np.square(triple_t_e-t_e)

    l2_dist = np.sqrt(np.sum(squared_diff,axis=1))

    # Positions (into candidate_triples) of the top_k smallest distances.
    closest_l2 = np.argsort(l2_dist)[:top_k]

    k_closest = candidate_triples[closest_l2]

    pred_exp.append(k_closest)

# One (top_k, 3) block of predicted explanation triples per test triple.
pred_exp = np.array(pred_exp).reshape(-1,top_k,3)

In [None]:
# Debug probe: shape of the broadcast head-embedding difference. Relies on
# triple_h_e and h_e left in the namespace by the prediction cell above.
np.square(triple_h_e - h_e).shape
#testexp2idx

In [None]:
# def parse_ttl(file_name,max_padding):
    
#     lines = []

#     with open(file_name, 'r') as f:
#         for line in f:
#             lines.append(line)

#     ground_truth = []
#     traces = []

#     for idx in range(len(lines)):

#         if "graph us:construct" in lines[idx]:

#             source_tup = utils.get_tup(lines[idx+1])

#         exp_triples = []

#         if 'graph us:where' in lines[idx]:

#             while lines[idx+1] != '} \n':

#                 exp_tup = utils.get_tup(lines[idx+1])
#                 exp_triples.append(np.array(exp_tup))

#                 idx+=1

#         if len(source_tup) != 0 and len(exp_triples) != 0:
            
#             no_name_entity = False
            
#             if ("no_name_entry" in source_tup[0]) or ("no_name_entry" in source_tup[2]):
#                 no_name_entity = True
            
#             for h,r,t in exp_triples:
#                 if ("no_name_entry" in h) or ("no_name_entry" in t):
#                     no_name_entity = True
            
#             if not no_name_entity:
                
#                 if len(exp_triples) < max_padding:
                    
#                     while len(exp_triples) != max_padding:
                        
#                         #pad = np.zeros((3))
#                         #pad[:] = None
#                         pad = np.array(['UNK_ENT', 'UNK_REL', 'UNK_ENT'])
#                         exp_triples.append(pad)
                        
#                 ground_truth.append(np.array(source_tup))
#                 traces.append(np.array(exp_triples))

#     return np.array(ground_truth), np.array(traces)

In [None]:
#spouse_triples,spouse_traces = parse_ttl(os.path.join('.','data','traces','spouse.ttl'),max_padding=2)

In [None]:
#jaccard_score(np.array([np.array(i) for i in spouse_triples]),np.array([np.array(i) for i in spouse_triples]))
# X_train, X_test, train_exp, test_exp = train_test_split(spouse_triples,
#     spouse_traces,test_size=0.30, random_state=42)

In [None]:
#np.concatenate([X_train[:,0],X_train[:,2],train_exp[:,0][:,0],train_exp[:,0][:,2]],axis=0)

In [None]:
# train_exp_entities = np.concatenate([train_exp[:,0][:,0],train_exp[:,0][:,2],train_exp[:,1][:,0],train_exp[:,1][:,2]])

# train_exp_entities = np.array([i for i in train_exp_entities if i != '0.0'])
# train_entities = np.unique(np.concatenate([X_train[:,0],X_train[:,2],train_exp_entities]))

#update 2idx functions (array2idx,idx2train)
# #update train data variable

# train_data = tf.data.Dataset.from_tensor_slices((train2idx[:,0], train2idx[:,1], train2idx[:,2],
#                                                 exp2idx[:,0],exp2idx[:,1],exp2idx[:,2])).batch(batch_size)
#train_data = tf.data.Dataset.from_tensor_slices(spouse_traces[:])

#spouse_traces[:,0]

#np.concatenate([spouse_triples[:,0], spouse_triples[:,2]],axis=0).shape

# max_traces = 2
# exp_entities = np.array([[spouse_traces[:,i,:][:,0],spouse_traces[:,i,:][:,2]] for i in range(max_traces)]).flatten()
# #exp_entities = np.array([i for i in exp_entities if i != "0.0"])

# exp_relations = np.array([spouse_traces[:,i][:,1] for i in range(max_traces)]).flatten()
# #exp_relations = np.array([i for i in exp_relations if i != "0.0"])

# entities = np.unique(np.concatenate([spouse_triples[:,0], spouse_triples[:,2], exp_entities],axis=0))
# relations = np.unique(np.concatenate([spouse_triples[:,1], exp_relations],axis=0))

In [None]:
# num_entities = len(entities)
# num_relations = len(relations)

# ent2idx = dict(zip(entities, range(num_entities)))
# rel2idx = dict(zip(relations, range(num_relations)))

# idx2ent = {idx:ent for ent,idx in ent2idx.items()}
# idx2rel = {idx:rel for rel,idx in rel2idx.items()}

In [None]:
# train2idx = array2idx(spouse_triples,ent2idx,rel2idx)
# train2idx

In [None]:

# Four components per element: head / relation / tail ids of the triple, plus a
# nested tuple holding (head, relation, tail) ids for each of the 2 trace slots.
# NOTE(review): in this part of the notebook traces2idx is only assigned in the
# cell that follows, so this cell fails on a fresh top-to-bottom run -- confirm
# the intended cell order.
train_data = tf.data.Dataset.from_tensor_slices((
    train2idx[:,0], train2idx[:,1], train2idx[:,2],
    tuple(
        tuple(traces2idx[:, slot, col] for col in range(3))
        for slot in range(2)
    ),
)).batch(2)

#train_data = tf.data.Dataset.from_tensor_slices((train2idx[:,0],train2idx[:,1],train2idx[:,2],traces2idx)).batch(2)

In [None]:
# Index-encode the explanation traces with the entity/relation id maps.
# NOTE(review): the unqualified array2idx is defined by a `def` in a later cell,
# and every assignment of spouse_traces visible in this part of the notebook is
# commented out -- this cell depends on out-of-order / leftover kernel state.
traces2idx = array2idx(spouse_traces,ent2idx,rel2idx)

In [None]:
# Display the index-encoded traces.
traces2idx

In [None]:
#data = tf.data.Dataset.from_tensor_slices((traces2idx[:,0,:][:,0],traces2idx[:,0,:][:,1],traces2idx[:,0,:][:,2])).batch(1)
#tuple(((traces2idx[:,i,:][:,0],traces2idx[:,i,:][:,1],traces2idx[:,i,:][:,2]) for i in range(2)))

# data = tf.data.Dataset.from_tensor_slices(traces2idx).batch(1)
# for d in data:
#     print(d)
# data = tf.data.Dataset.from_tensor_slices(
#     ).batch(2)

# for d in data:
#     print(d)
#     break
#     for i in d:
#         print(i)

In [None]:
# Print the first batch only (note the `break`). Unpacking into four names
# matches the train_data variant whose fourth component is the nested tuple of
# trace slots; the six-component variant built earlier would not unpack here.
for h,r,t,ex in train_data:
    print('h',h,'r',r,'t',t)
    exps = []
    # Each item of `ex` is unpacked as one (head, rel, tail) group.
    for h1,r1,t1 in ex:
        #print("h ex",h1,"r ex",r1,"t ex",t1)
        exps.append((h1,r1,t1))
    print(exps)
    break

In [None]:
# def jaccard_score(true_exp,pred_exp):

#     scores = []

#     for i in range(len(true_exp)):

#         pred_i = pred_exp[i]
#         true_i = true_exp[i]

#         if isinstance(true_i,np.ndarray):
#             num_true_traces = min(true_i.ndim,true_i.shape[0])

#         elif isinstance(true_i,list):
#             num_true_traces = len(true_i)

#         if isinstance(pred_i,np.ndarray):
#             num_pred_traces = min(pred_i.ndim,pred_i.shape[0])
        
#         elif isinstance(pred_i,list):
#             num_pred_traces = len(pred_i)
        
#         bool_array = (pred_i == true_i).reshape(num_true_traces,3)

#         count = 0

#         for row in bool_array:
#             if row.all():
#                 count +=1

#         score = count / (num_true_traces+num_pred_traces-count)

#         scores.append(score)

#     return np.mean(scores)

In [None]:
# pred_exp = np.array([[[136,0,932],[136,0,932]],[[502,0,972],[972,0,502]]])
# true_exp = traces2idx[0:2]
# def jaccard_score(true_exp,pred_exp):

#     assert len(true_exp) == len(pred_exp)

#     scores = []

#     for i in range(len(true_exp)):

#         true_i = true_exp[i][true_exp[i] != np.array([-1, -1, -1])].reshape(-1,3)    
#         pred_i = pred_exp[i]#[pred_exp[i] != np.array([-1, -1, -1])].reshape(-1,3)    

#         num_true_traces = true_i.shape[0]
#         num_pred_traces = pred_i.shape[0]

#         if num_true_traces < num_pred_traces:

#             pred_i = pred_i[:num_true_traces]
#             num_pred_traces = pred_i.shape[0]

#         bool_array = (pred_i == true_i)

#         count = 0
#         for row in bool_array:
#             if row.all():
#                 count += 1

#         score = count / (num_true_traces + num_pred_traces-count)

#         scores.append(score)
        
#     return np.mean(scores)

In [None]:


    #print(pred_i)
    
#     pred_i = pred_exp[i][pred_exp[i] != np.array([-1, -1, -1])].reshape(-1,3)

#     num_true_traces = true_i.shape[0]
#     num_pred_traces = pred_i.shape[0]
    
#     print(pred_i == true_i)
    
#     count = 0
    
#     for row in bool_array:
#         if row.all():
#             count +=1
#     print(count, num_true_traces, num_pred_traces)        
#     score = count / (num_true_traces+num_pred_traces - count)
    
#     scores.append(score)


In [None]:
def array2idx(dataset, ent2idx,rel2idx):
    """Replace the labels of (head, relation, tail) triples by integer ids.

    Parameters
    ----------
    dataset : numpy.ndarray
        Either a 2-D array of triples, shape (n, 3), or a 3-D array of
        triple groups, shape (n, k, 3).
    ent2idx : dict
        Maps an entity label (head or tail) to its integer id.
    rel2idx : dict
        Maps a relation label to its integer id.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as `dataset` in which every label has been
        replaced by its id, in (head, relation, tail) order.

    Raises
    ------
    ValueError
        If `dataset` is not 2-D or 3-D, or its last axis is not of length 3.
    KeyError
        If a label is missing from `ent2idx` / `rel2idx`.
    """
    # Reject unsupported layouts explicitly instead of failing later with an
    # unbound local variable.
    if dataset.ndim not in (2, 3):
        raise ValueError(
            f"dataset must be 2-D or 3-D, got {dataset.ndim} dimension(s)")

    if dataset.shape[-1] != 3:
        raise ValueError(
            f"last axis must hold (head, rel, tail), got length {dataset.shape[-1]}")

    # Both layouts are just a list of triples once flattened, so a single
    # lookup pass serves the 2-D and the 3-D case.
    indexed = [
        (ent2idx[head], rel2idx[rel], ent2idx[tail])
        for head, rel, tail in dataset.reshape(-1, 3)
    ]

    # Empty input: keep the input shape and an integer dtype so the result is
    # still usable as an index array.
    if not indexed:
        return np.empty(dataset.shape, dtype=int)

    data = np.array(indexed).reshape(dataset.shape)

    return data

In [None]:
# num_traces = spouse_traces.shape[1]

# data = []
# #spouse_traces[-1,:,:]

# #for i in range(len(spouse_traces)):
# for i in range(2):
#     temp_array = []
#     for head,rel,tail in spouse_traces[i,:,:]:
        
#         if (head == '0.0') or (tail == '0.0') or (rel == '0.0'):
#             temp_array.append((-1,-1,-1))
#             continue
            
#         head_idx = ent2idx[head]
#         tail_idx = ent2idx[tail]
#         rel_idx = rel2idx[rel]
        
#         temp_array.append((head_idx,rel_idx,tail_idx))
        
#     data.append(temp_array)

# for i in range(num_traces):

#     temp_array = []

#     for head,rel,tail in spouse_traces[:,i]:

#         if head == '0.0' or tail == '0.0':
#             temp_array.append((-1,-1,-1))
#             continue

#         head_idx = ent2idx[head]
#         tail_idx = ent2idx[tail]
#         rel_idx = rel2idx[rel]

#         temp_array.append((head_idx,rel_idx,tail_idx))

#     data.append(temp_array)

In [None]:
### lines = []

# with open(os.path.join('.' ,'data','traces','spouse.ttl'), 'r') as f:
#     for line in f:
#         lines.append(line)
        
# ground_truth = []
# traces = []

# for idx in range(len(lines)):
    
#     if "graph us:construct" in lines[idx]:

#         source_tup = utils.get_tup(lines[idx+1])            
    
#     exp_triples = []
    
#     if 'graph us:where' in lines[idx]:
        
#         while lines[idx+1] != '} \n':
#             exp_tup = utils.get_tup(lines[idx+1])
#             exp_triples.append(exp_tup)

#             idx+=1
        
#     if len(source_tup) != 0 and len(exp_triples) != 0:
        
#         no_name_entity = False
        
#         if ("no_name_entry" in source_tup[0]) or ("no_name_entry" in source_tup[2]):
#             no_name_entity = True
        
#         for h,r,t in exp_triples:
#             if ("no_name_entry" in h) or ("no_name_entry" in t):
#                 no_name_entity = True
        
#         if not no_name_entity:
#             ground_truth.append(source_tup)
#             traces.append(exp_triples)



In [None]:


#[tf.convert_to_tensor(i) for i in spouse_traces]
#tf.convert_to_tensor(spouse_traces[0])
# for i in spouse_traces:
#     print(i.shape)
# for i in range(2):
    
#     if (tf.convert_to_tensor(spouse_traces)[1][i]).all():
#         print(tf.convert_to_tensor(spouse_traces)[1][i])

# for t1,t2 in spouse_traces:
    
    
#     e1,r1,e2 = t1
#     e3,r2,e4 = t2

#     print(e1,r1,e2)
#     print(e3,r2,e4)

In [None]:
# from collections import defaultdict

# traces = defaultdict(list)

# for idx in range(len(lines)):
        
#     if "graph us:construct" in lines[idx] and 'dbe' in lines[idx+1]:

#         source_tup = utils.get_tup(lines[idx+1])
                 
#         assert len(source_tup) == 3
        
#         traces[source_tup] = []

# for i,j in traces.items():
    
#     i = eval(i)

#     if (i[0] != j[0][2]) or (i[2] != j[0][0]):
#         print(i,j)


In [None]:
# for i in truth:
#     if i not in all_spouses:
#         print(i)
# for i,j,k in all_spouses:
#     if i == 'Elena_Cuza' or k =='Elena_Cuza':
#         print(i,j,k)

In [None]:
# from collections import defaultdict

# traces = defaultdict(list)

# for idx,line in enumerate(lines[0:100]):
#     print(line)
#     if ('dbe' in line or 'dbo' in line):
#         #print(line)
#         source_tup = utils.get_tup(line)
        
#         traces[source_tup] = []