In [1]:
import numpy as np
import utils
import os
import pandas as pd

In [3]:
# Dataset and rule identifiers. In the part of the notebook visible here they
# are only referenced by the commented-out loading/model cells further down
# (the .npz path, utils.get_data and the weights path), not by the live
# preprocessing cells.
DATASET = 'paul_dataset'
RULE = 'brother'

In [4]:
# Base name (without the '.ttl' extension) of the trace file parsed below.
rule_file = 'brother_sister'
# Forwarded to utils.parse_ttl as max_padding.
# NOTE: the 'brother_sister' branch later in the notebook rebinds this to 2,
# so the value seen by later cells depends on which cells have been run.
MAX_PADDING = 3

In [5]:
# Parse ../data/traces/<rule_file>.ttl. The three returned arrays are indexed
# with the same row selections in the following cells, i.e. they are treated
# as aligned along axis 0.
triples, traces, weights = utils.parse_ttl(
    max_padding=MAX_PADDING,
    file_name=os.path.join('..', 'data', 'traces', rule_file + '.ttl'),
)

In [6]:
# Row index of the first occurrence of every distinct entry of `traces`
# along axis 0. np.unique orders these by sorted unique value, not by the
# order in which the rows appear in the input.
# NOTE(review): uniqueness is decided on `traces` alone, so two rows with
# different triples but identical traces collapse into one - confirm intended.
_, unique_traces_idx = np.unique(
    traces,
    axis=0,
    return_index=True,
)

In [7]:
# Keep one row per distinct trace, applying the same selection to all three
# arrays so they stay index-aligned.
# Not idempotent: re-running this cell on its own re-indexes the already
# reduced arrays with the old index set.
triples, traces, weights = (
    rows[unique_traces_idx] for rows in (triples, traces, weights)
)

In [8]:
if rule_file == 'brother_sister':

    # True for every row whose trace contains at least one triple with
    # 'gender' in position 1 (the relation slot, going by the
    # UNK_ENT / UNK_REL / UNK_ENT placeholder layout used in this notebook).
    gender_indices = (traces[:, :, 1] == 'gender').any(axis=1)
    keep_rows = ~gender_indices

    # Drop those rows from all three aligned arrays.
    triples, traces, weights = triples[keep_rows], traces[keep_rows], weights[keep_rows]

    # From here on every trace is limited to two triples.
    MAX_PADDING = 2

    # Keep only the first MAX_PADDING triples of each trace.
    # NOTE(review): presumably only UNK padding is cut off here once the
    # gender traces are gone - TODO confirm no real triples are dropped.
    traces = traces[:, :MAX_PADDING, :]

In [9]:
# Placeholder triple, shape (1, 3); compared against trace rows to detect padding.
UNK = np.array([['UNK_ENT', 'UNK_REL', 'UNK_ENT']])

# Placeholder weight used for padded positions.
UNK_WEIGHT_STR = 'UNK_WEIGHT'

# One row of MAX_PADDING placeholder weights, shape (1, MAX_PADDING); uses the
# MAX_PADDING value in effect when this cell runs.
UNK_WEIGHT = np.array([[UNK_WEIGHT_STR for _ in range(MAX_PADDING)]])

In [10]:
# Group the rows of `triples` by value.
#   unique_triples_idx : row index of the first occurrence of each distinct triple
#   triple_lookup      : str(triple) -> boolean mask over the rows of `triples`
#                        selecting every row equal to that triple
#   longest_trace      : largest number of rows sharing one triple (-1 when
#                        `triples` is empty, matching the previous initial value)
_, unique_triples_idx, triple_labels, triple_counts = np.unique(
    triples,
    axis=0,
    return_index=True,
    return_inverse=True,
    return_counts=True,
)

# Flatten defensively: the shape of the inverse for axis-wise unique has
# varied between NumPy releases.
triple_labels = np.ravel(triple_labels)

# Label k belongs to the k-th distinct triple, whose first occurrence is
# unique_triples_idx[k]. Comparing integer labels replaces re-comparing every
# string row against every distinct triple.
triple_lookup = {
    str(triples[first_idx]): triple_labels == label
    for label, first_idx in enumerate(unique_triples_idx)
}

# The group sizes come straight from np.unique; no manual running maximum.
longest_trace = int(triple_counts.max()) if triple_counts.size else -1

In [11]:
# For every distinct triple, collect all of its traces and build a weight
# array aligned with them:
#   processed_triples : the distinct triple itself
#   processed_traces  : its traces as returned by utils.pad_trace
#   processed_weights : array with longest_trace rows and one entry per trace
#                       position; padded positions and padded rows hold
#                       UNK_WEIGHT_STR
processed_triples = []
processed_weights = []
processed_traces = []

for idx in unique_triples_idx:

    triple = triples[idx]

    # Boolean mask over the rows that belong to this triple.
    trace_indices = triple_lookup[str(triple)]
    trace = traces[trace_indices]
    # Select the matching weights once per triple instead of once per trace.
    trace_weights = weights[trace_indices]

    per_trace_weights = []

    for single_trace, current_weight in zip(trace, trace_weights):

        num_triples = single_trace.shape[0]
        # Rows equal to the UNK placeholder are padding; int() keeps the list
        # repetition below on plain Python integers.
        num_unk = int((single_trace == UNK).all(axis=1).sum())

        # The trace's weight for each real triple, then the placeholder for
        # each padded position.
        per_trace_weights.append(
            [current_weight] * (num_triples - num_unk) + [UNK_WEIGHT_STR] * num_unk
        )

    per_trace_weights = np.array(per_trace_weights)

    # Pad with placeholder rows up to longest_trace. A negative count means
    # longest_trace does not belong to the current data (e.g. it was computed
    # before an earlier cell was re-run); fail loudly instead of looping forever.
    missing_rows = longest_trace - per_trace_weights.shape[0]
    if missing_rows < 0:
        raise ValueError(
            'triple has %d traces but longest_trace is %d; re-run the cell '
            'that computes longest_trace' % (per_trace_weights.shape[0], longest_trace)
        )
    if missing_rows > 0:
        per_trace_weights = np.concatenate(
            [per_trace_weights, np.repeat(UNK_WEIGHT, missing_rows, axis=0)],
            axis=0,
        )

    padded_trace = utils.pad_trace(trace,max_padding=MAX_PADDING,longest_trace=longest_trace,unk=UNK)

    processed_triples.append(triple)
    processed_traces.append(padded_trace)
    processed_weights.append(per_trace_weights)

In [12]:
# Unbind the per-row arrays before building the grouped ones (presumably to
# release memory early).
del triples, traces, weights

# Stack the per-triple results; the same names now refer to the grouped data.
# NOTE(review): going by the shape comments in the commented-out cells below,
# traces is expected to be (NUM_TRIPLES, LONGEST_TRACE, MAX_PADDING, 3) and
# weights (NUM_TRIPLES, LONGEST_TRACE, MAX_PADDING) - confirm.
triples = np.array(processed_triples)
traces = np.array(processed_traces)
weights = np.array(processed_weights)

# The intermediate Python lists are no longer needed.
del processed_triples, processed_traces, processed_weights

In [47]:
# Ad-hoc inspection: pad the first triple's trace group to 8 traces of
# 2 triples each and display the result.
utils.pad_trace(traces[0], max_padding=2, longest_trace=8, unk=UNK)

array([[['<http://example.org/data#AuntPaul1>', 'sister',
         '<http://example.org/data#UnclePaul1>'],
        ['<http://example.org/data#UnclePaul1>', 'brother',
         '<http://example.org/data#FatherPaul>']],

       [['<http://example.org/data#FatherPaul>', 'sister',
         '<http://example.org/data#UnclePaul1>'],
        ['<http://example.org/data#AuntPaul1>', 'sister',
         '<http://example.org/data#UnclePaul1>']],

       [['UNK_ENT', 'UNK_REL', 'UNK_ENT'],
        ['UNK_ENT', 'UNK_REL', 'UNK_ENT']],

       [['UNK_ENT', 'UNK_REL', 'UNK_ENT'],
        ['UNK_ENT', 'UNK_REL', 'UNK_ENT']],

       [['UNK_ENT', 'UNK_REL', 'UNK_ENT'],
        ['UNK_ENT', 'UNK_REL', 'UNK_ENT']],

       [['UNK_ENT', 'UNK_REL', 'UNK_ENT'],
        ['UNK_ENT', 'UNK_REL', 'UNK_ENT']],

       [['UNK_ENT', 'UNK_REL', 'UNK_ENT'],
        ['UNK_ENT', 'UNK_REL', 'UNK_ENT']],

       [['UNK_ENT', 'UNK_REL', 'UNK_ENT'],
        ['UNK_ENT', 'UNK_REL', 'UNK_ENT']]], dtype='<U42')

In [None]:
# data = np.load(os.path.join('..','data',DATASET+'.npz'))

# triples,traces,weights,entities,relations = utils.get_data(data,RULE)

# NUM_ENTITIES = len(entities)
# NUM_RELATIONS = len(relations)

# ent2idx = dict(zip(entities, range(NUM_ENTITIES)))
# rel2idx = dict(zip(relations, range(NUM_RELATIONS)))

# idx2ent = dict(zip(range(NUM_ENTITIES),entities))
# idx2rel = dict(zip(range(NUM_RELATIONS),relations))

# unk_ent_id = ent2idx['UNK_ENT']
# unk_rel_id = rel2idx['UNK_REL']

In [None]:
#weights = np.array([.99,.99,.5,.5,.5,.99,.5,.99,.5,.99,.5,.99,.5,.99,.5,.5,.5,.99,.5,.5,.99,.99,.99,.99])

In [None]:
#_,unique_idx = np.unique(triples,axis=0,return_index=True)

In [None]:
# triple_lookup = {}
# longest_trace = -1
# max_padding = 3

# for i in unique_idx:
    
#     triple = triples[i]
    
#     indices = (triples == triple).all(axis=1)
        
#     triple_lookup[str(triple)] = indices
    
#     sum_indices = indices.sum()
    
#     if sum_indices > longest_trace:
        
#         longest_trace = sum_indices

In [None]:
# processed_triples = []
# processed_weights = []
# processed_traces = []
# unk = np.array([['UNK_ENT','UNK_REL','UNK_ENT']])
# unk_weight_str = 'UNK_WEIGHT'
# unk_weight = np.array([[unk_weight_str] * max_padding])

# for idx in unique_idx:
    
#     triple = triples[idx]
    
#     trace_indices = triple_lookup[str(triple)]
#     trace = traces[trace_indices]
#     weight = weights[trace_indices]
    
#     per_trace_weights = []

#     for i in range(len(trace)):

#         num_triples = trace[i].shape[0]
#         current_weight = weights[trace_indices][i]

#         num_unk = (trace[i] == unk).all(axis=1).sum()

#         current_weights = [current_weight] * (num_triples-num_unk)

#         while len(current_weights) != num_triples:

#             current_weights.append(unk_weight_str)
            
#         per_trace_weights.append(current_weights)
          
#     per_trace_weights = np.array(per_trace_weights)
    
#     while per_trace_weights.shape[0] != longest_trace:
#         per_trace_weights = np.concatenate([per_trace_weights,unk_weight],axis=0)
        
#     padded_trace = utils.pad_trace(trace,max_padding=max_padding,longest_trace=longest_trace,unk=unk)
    
#     processed_triples.append(triple)
#     processed_traces.append(padded_trace)
#     processed_weights.append(per_trace_weights)

In [None]:
# all_triples = np.array(processed_triples)
# all_traces = np.array(processed_traces)
# all_weights = np.array(processed_weights)
#traces: (NUM_TRIPLES,LONGEST_TRACE,MAX_PADDING,3)
#weights: (NUM_TRIPLES,LONGEST_TRACE,MAX_PADDING)

In [None]:
# idx = 2
# current_traces = all_traces[idx]
# current_weights = all_weights[idx]

### pred_exp = np.array([['<http://example.org/data#MotherPaul>', 'child',
#          '<http://example.org/data#BrotherPaul>'],
#         ['<http://example.org/data#FatherPaul>', 'child',
#          '<http://example.org/data#BrotherPaul>']
#         ])

# pred_exp = np.array(
#     [['<http://example.org/data#MotherPaul>', 'spouse',
#          '<http://example.org/data#FatherPaul>']])
# def precision_recall(pred_exp,current_traces,current_weights):
    
#     n = len(pred_exp)

#     relevance_scores = np.zeros(longest_trace) #numerator of graded recall

#     for i in range(n):

#         current_pred = pred_exp[i]

#         for j in range(len(current_traces)):

#             unpadded_traces = remove_padding_np(current_traces[j],'UNK_ENT','UNK_REL')
#             unpadded_weights = current_weights[j][current_weights[j] != 'UNK_WEIGHT']

#             indices = (unpadded_traces == current_pred).all(axis=1)

#             sum_weights = sum([float(num) for num in unpadded_weights[indices]])

#             relevance_scores[j] += sum_weights

#     max_relevance_score = max(relevance_scores)
#     max_idx = np.argmax(relevance_scores)

#     total_sum = sum([float(weight) for weight in current_weights[max_idx] if weight != 'UNK_WEIGHT'])

#     precision = max_relevance_score/n
#     recall = max_relevance_score/total_sum
    
#     return precision, recall

In [None]:
#d1,d2,_ = (all_traces[2] != ['UNK_ENT','UNK_REL','UNK_ENT']).nonzero()
#all_traces[2][d1,d2]

In [None]:
#import tensorflow as tf

In [None]:
#tf_pred_exp = tf.convert_to_tensor(pred_exp)

In [None]:
#tf_traces_i = tf.convert_to_tensor(all_traces[2])

In [None]:
#triples2idx = utils.array2idx(all_triples,ent2idx,rel2idx)

In [None]:
#traces2idx = utils.array2idx(all_traces,ent2idx,rel2idx)

In [None]:
#import RGCN

In [None]:
# model = RGCN.get_RGCN_Model(
#     num_entities=NUM_ENTITIES,
#     num_relations=NUM_RELATIONS,
#     embedding_dim=10,
#     output_dim=10,
#     seed=123
# )

# model.load_weights(os.path.join('..','data','weights',DATASET,DATASET+'_'+RULE+'.h5'))

In [None]:
# ALL_INDICES = tf.reshape(tf.range(0,NUM_ENTITIES,1,dtype=tf.int64), (1,-1))

# ADJACENCY_DATA = tf.concat([triples2idx,traces2idx.reshape(-1,3)],axis=0)
# adj_mats = utils.get_adj_mats(ADJACENCY_DATA,NUM_ENTITIES,NUM_RELATIONS)