In [1]:
import numpy as np
import utils
import os
import pandas as pd

In [2]:
# Dataset identifier; it is used to build the trace directory name
# ('<DATASET>_traces') when the rule files are loaded.
DATASET = 'paul'

# Relations (rules) to process, in processing order.
RULES = [
    'brother',
    'sister',
    'spouse',
    'grandparent',
    'parent',
    'child',
    'aunt',
    'uncle',
]

In [3]:
# Value handed to the utils helpers as `max_padding`; it is also the size of
# the second-to-last axis of the stacked trace arrays.
MAX_PADDING = 2

# Placeholder string stored in weight arrays at positions that hold padding.
UNK_WEIGHT_STR = 'UNK_WEIGHT'

# Placeholder (head, relation, tail) row marking a padded triple; shape (1, 3)
# so it can be compared row-wise against a trace.
UNK = np.array([('UNK_ENT', 'UNK_REL', 'UNK_ENT')])

In [4]:
# Build the per-rule dataset.
#
# For every rule in RULES this cell:
#   1. parses the rule's .ttl trace file into aligned (triples, traces, weights),
#   2. removes duplicated traces,
#   3. groups the remaining traces by the triple they belong to,
#   4. pads traces and weights so every triple has `longest_trace` traces,
#   5. keeps only the triples whose relation equals the rule, and
#   6. stores the result in `data` under '<rule>_triples', '<rule>_traces',
#      '<rule>_weights', '<rule>_entities', '<rule>_relations' and
#      '<rule>_longest_trace'.
data = dict()

# Per-rule arrays in RULES order. NOTE: the cell that stacks all rules
# re-initialises and rebuilds these lists, so they are only kept here for
# backward compatibility with anything inspecting them right after this cell.
all_triples = []
all_traces = []
all_weights = []

# Rules that share a single trace file on disk; any rule not listed here
# is read from '<rule>.ttl'.
rule_files = {
    'uncle': 'uncle_aunt',
    'aunt': 'uncle_aunt',
    'brother': 'brother_sister',
    'sister': 'brother_sister',
}

for rule in RULES:

    rule_file = rule_files.get(rule, rule)

    # The brother/sister file is parsed with a padding of 3; once the traces
    # mentioning 'gender' are dropped below, the rest are cut back to
    # MAX_PADDING triples per trace. A loop-local value is used so the
    # notebook-level MAX_PADDING constant is never modified.
    parse_padding = 3 if rule_file == 'brother_sister' else MAX_PADDING

    triples, traces, weights = utils.parse_ttl(
        file_name=os.path.join('..','data',f'{DATASET}_traces',f'{rule_file}.ttl'),
        max_padding=parse_padding
    )

    # Drop duplicated traces while keeping triples/weights aligned with them.
    _, unique_traces_idx = np.unique(traces, axis=0, return_index=True)

    triples = triples[unique_traces_idx]
    traces = traces[unique_traces_idx]
    weights = weights[unique_traces_idx]

    if rule_file == 'brother_sister':

        # True for every trace containing at least one triple whose
        # relation (column 1) is 'gender'.
        gender_indices = (traces[:,:,1] == 'gender').any(axis=1)

        triples = triples[~gender_indices]
        traces = traces[~gender_indices]
        weights = weights[~gender_indices]

        # Keep only the first MAX_PADDING triples of each remaining trace.
        traces = traces[:, 0:MAX_PADDING, :]

    _, unique_triples_idx = np.unique(triples, axis=0, return_index=True)

    # One boolean mask per distinct triple (aligned with unique_triples_idx)
    # selecting every trace that belongs to that triple.
    group_masks = [(triples == triples[i]).all(axis=1) for i in unique_triples_idx]

    # Largest number of traces attached to a single triple (-1 if none).
    longest_trace = max((mask.sum() for mask in group_masks), default=-1)

    processed_triples = []
    processed_weights = []
    processed_traces = []

    for idx, mask in zip(unique_triples_idx, group_masks):

        triple = triples[idx]

        # Select this triple's traces and weights once, outside the inner loop.
        trace = traces[mask]
        weight = weights[mask]

        per_trace_weights = []

        for single_trace, single_weight in zip(trace, weight):

            num_triples = single_trace.shape[0]

            # Rows that are exactly the UNK placeholder are padding.
            num_unk = int((single_trace == UNK).all(axis=1).sum())

            # Each non-padding triple gets the trace's weight; the padding
            # positions are filled with the placeholder weight.
            per_trace_weights.append(
                [single_weight] * (num_triples - num_unk) + [UNK_WEIGHT_STR] * num_unk
            )

        per_trace_weights = np.array(per_trace_weights)

        # Append all-placeholder rows until there are `longest_trace` rows.
        missing_rows = int(longest_trace - per_trace_weights.shape[0])

        if missing_rows > 0:
            unk_rows = np.array([[UNK_WEIGHT_STR] * per_trace_weights.shape[1]] * missing_rows)
            per_trace_weights = np.concatenate([per_trace_weights, unk_rows], axis=0)

        padded_trace = utils.pad_trace(trace,max_padding=MAX_PADDING,longest_trace=longest_trace,unk=UNK)

        processed_triples.append(triple)
        processed_traces.append(padded_trace)
        processed_weights.append(per_trace_weights)

    triples = np.array(processed_triples)
    traces = np.array(processed_traces)
    weights = np.array(processed_weights)

    del processed_triples
    del processed_traces
    del processed_weights

    # A shared file holds several relations; keep only the current rule's triples.
    idx = triples[:,1] == rule

    triples = triples[idx]
    traces = traces[idx]
    weights = weights[idx]

    # Heads/tails and relations appearing anywhere in the kept traces. They
    # are only fed to np.unique below, so element order is irrelevant.
    used_traces = traces[:, :longest_trace, :MAX_PADDING]

    exp_entities = np.concatenate([used_traces[..., 0].ravel(), used_traces[..., 2].ravel()], axis=0)
    exp_relations = used_traces[..., 1].ravel()

    all_triples.append(triples)
    all_traces.append(traces)
    all_weights.append(weights)

    data[rule + '_triples'] = triples
    data[rule + '_traces'] = traces
    data[rule + '_weights'] = weights
    data[rule + '_entities'] = np.unique(np.concatenate([triples[:,0], triples[:,2], exp_entities],axis=0))
    data[rule + '_relations'] = np.unique(np.concatenate([triples[:,1], exp_relations],axis=0))
    data[rule + '_longest_trace'] = longest_trace

In [5]:
# Single row of placeholder weights, shape (1, MAX_PADDING); handed to
# utils.pad_weight when the per-rule weights are padded.
UNK_WEIGHT = np.array([UNK_WEIGHT_STR] * MAX_PADDING).reshape(1, MAX_PADDING)

In [6]:
# Pad every rule's traces and weights to the largest per-rule trace count,
# stack all rules into single arrays and record them in `data`.
MAX_TRACE = max(data[f'{rule}_longest_trace'] for rule in RULES)

all_triples = []
all_traces = []
all_weights = []

for rule in RULES:

    rule_traces = data[f'{rule}_traces']
    rule_weights = data[f'{rule}_weights']

    padded_traces = []
    padded_weights = []

    # Traces and weights of a rule are aligned, so walk them in lockstep.
    for trace_i, weight_i in zip(rule_traces, rule_weights):

        padded_traces.append(utils.pad_trace(trace_i, MAX_TRACE, MAX_PADDING, UNK))
        padded_weights.append(utils.pad_weight(weight_i, MAX_TRACE, UNK_WEIGHT))

    all_triples.append(data[f'{rule}_triples'])
    all_traces.append(np.array(padded_traces))
    all_weights.append(np.array(padded_weights))

# Stack the per-rule arrays along the first axis, in RULES order.
all_triples = np.concatenate(all_triples, axis=0)
all_traces = np.concatenate(all_traces, axis=0)
all_weights = np.concatenate(all_weights, axis=0)

data['all_triples'] = all_triples
data['all_traces'] = all_traces
data['all_weights'] = all_weights

data['max_trace'] = MAX_TRACE

print(f"all_triples shape: {all_triples.shape}")
print(f"all_traces shape: {all_traces.shape}")
print(f"all_weights shape: {all_weights.shape}")

# Collect the heads/tails (columns 0 and 2) and relations (column 1) found in
# the traces. The transposes put the triple axis last so the flattened order
# is (trace position, padding position, column, triple).
used_traces = all_traces[:, :MAX_TRACE, :MAX_PADDING]

all_exp_entities = used_traces[..., [0, 2]].transpose(1, 2, 3, 0).reshape(-1)
all_exp_relations = used_traces[..., 1].transpose(1, 2, 0).reshape(-1)

entity_pool = np.concatenate([all_triples[:,0], all_triples[:,2], all_exp_entities], axis=0)
relation_pool = np.concatenate([all_triples[:,1], all_exp_relations], axis=0)

all_entities = np.unique(entity_pool)
all_relations = np.unique(relation_pool)

data['all_entities'] = all_entities
data['all_relations'] = all_relations
data['rules'] = RULES

all_triples shape: (106, 3)
all_traces shape: (106, 7, 2, 3)
all_weights shape: (106, 7, 2)


In [7]:
# Take the first trace of the first triple as a stand-in prediction; the
# helper presumably strips its UNK padding rows (confirm in utils).
first_trace = all_traces[0][0]
pred_exp = utils.remove_padding_np(first_trace, 'UNK_ENT', 'UNK_REL')

In [8]:
# Score the stand-in prediction against every trace (and its weights) of the
# first triple; the call is the cell's last expression so its result is shown.
utils.graded_precision_recall(
    pred_exp,
    all_traces[0],
    all_weights[0],
    MAX_TRACE,
    'UNK_ENT',
    'UNK_REL',
    'UNK_WEIGHT',
)

(0.75, 1.0)

In [10]:
# data = np.load(os.path.join('..','data',DATASET+'.npz'))

# triples,traces,weights,entities,relations = utils.get_data(data,RULE)

# NUM_ENTITIES = len(entities)
# NUM_RELATIONS = len(relations)

# ent2idx = dict(zip(entities, range(NUM_ENTITIES)))
# rel2idx = dict(zip(relations, range(NUM_RELATIONS)))

# idx2ent = dict(zip(range(NUM_ENTITIES),entities))
# idx2rel = dict(zip(range(NUM_RELATIONS),relations))

# unk_ent_id = ent2idx['UNK_ENT']
# unk_rel_id = rel2idx['UNK_REL']

In [11]:
#weights = np.array([.99,.99,.5,.5,.5,.99,.5,.99,.5,.99,.5,.99,.5,.99,.5,.5,.5,.99,.5,.5,.99,.99,.99,.99])

In [12]:
#_,unique_idx = np.unique(triples,axis=0,return_index=True)

In [13]:
# triple_lookup = {}
# longest_trace = -1
# max_padding = 3

# for i in unique_idx:
    
#     triple = triples[i]
    
#     indices = (triples == triple).all(axis=1)
        
#     triple_lookup[str(triple)] = indices
    
#     sum_indices = indices.sum()
    
#     if sum_indices > longest_trace:
        
#         longest_trace = sum_indices

In [14]:
# processed_triples = []
# processed_weights = []
# processed_traces = []
# unk = np.array([['UNK_ENT','UNK_REL','UNK_ENT']])
# unk_weight_str = 'UNK_WEIGHT'
# unk_weight = np.array([[unk_weight_str] * max_padding])

# for idx in unique_idx:
    
#     triple = triples[idx]
    
#     trace_indices = triple_lookup[str(triple)]
#     trace = traces[trace_indices]
#     weight = weights[trace_indices]
    
#     per_trace_weights = []

#     for i in range(len(trace)):

#         num_triples = trace[i].shape[0]
#         current_weight = weights[trace_indices][i]

#         num_unk = (trace[i] == unk).all(axis=1).sum()

#         current_weights = [current_weight] * (num_triples-num_unk)

#         while len(current_weights) != num_triples:

#             current_weights.append(unk_weight_str)
            
#         per_trace_weights.append(current_weights)
          
#     per_trace_weights = np.array(per_trace_weights)
    
#     while per_trace_weights.shape[0] != longest_trace:
#         per_trace_weights = np.concatenate([per_trace_weights,unk_weight],axis=0)
        
#     padded_trace = utils.pad_trace(trace,max_padding=max_padding,longest_trace=longest_trace,unk=unk)
    
#     processed_triples.append(triple)
#     processed_traces.append(padded_trace)
#     processed_weights.append(per_trace_weights)

In [15]:
# all_triples = np.array(processed_triples)
# all_traces = np.array(processed_traces)
# all_weights = np.array(processed_weights)
#traces: (NUM_TRIPLES,LONGEST_TRACE,MAX_PADDING,3)
#weights: (NUM_TRIPLES,LONGEST_TRACE,MAX_PADDING)

In [16]:
# idx = 2
# current_traces = all_traces[idx]
# current_weights = all_weights[idx]

### pred_exp = np.array([['<http://example.org/data#MotherPaul>', 'child',
#          '<http://example.org/data#BrotherPaul>'],
#         ['<http://example.org/data#FatherPaul>', 'child',
#          '<http://example.org/data#BrotherPaul>']
#         ])

# pred_exp = np.array(
#     [['<http://example.org/data#MotherPaul>', 'spouse',
#          '<http://example.org/data#FatherPaul>']])
# def precision_recall(pred_exp,current_traces,current_weights):
    
#     n = len(pred_exp)

#     relevance_scores = np.zeros(longest_trace) #numerator of graded recall

#     for i in range(n):

#         current_pred = pred_exp[i]

#         for j in range(len(current_traces)):

#             unpadded_traces = remove_padding_np(current_traces[j],'UNK_ENT','UNK_REL')
#             unpadded_weights = current_weights[j][current_weights[j] != 'UNK_WEIGHT']

#             indices = (unpadded_traces == current_pred).all(axis=1)

#             sum_weights = sum([float(num) for num in unpadded_weights[indices]])

#             relevance_scores[j] += sum_weights

#     max_relevance_score = max(relevance_scores)
#     max_idx = np.argmax(relevance_scores)

#     total_sum = sum([float(weight) for weight in current_weights[max_idx] if weight != 'UNK_WEIGHT'])

#     precision = max_relevance_score/n
#     recall = max_relevance_score/total_sum
    
#     return precision, recall

In [17]:
#d1,d2,_ = (all_traces[2] != ['UNK_ENT','UNK_REL','UNK_ENT']).nonzero()
#all_traces[2][d1,d2]

In [18]:
#import tensorflow as tf

In [19]:
#tf_pred_exp = tf.convert_to_tensor(pred_exp)

In [20]:
#tf_traces_i = tf.convert_to_tensor(all_traces[2])

In [21]:
#triples2idx = utils.array2idx(all_triples,ent2idx,rel2idx)

In [22]:
#traces2idx = utils.array2idx(all_traces,ent2idx,rel2idx)

In [23]:
#import RGCN

In [24]:
# model = RGCN.get_RGCN_Model(
#     num_entities=NUM_ENTITIES,
#     num_relations=NUM_RELATIONS,
#     embedding_dim=10,
#     output_dim=10,
#     seed=123
# )

# model.load_weights(os.path.join('..','data','weights',DATASET,DATASET+'_'+RULE+'.h5'))

In [25]:
# ALL_INDICES = tf.reshape(tf.range(0,NUM_ENTITIES,1,dtype=tf.int64), (1,-1))

# ADJACENCY_DATA = tf.concat([triples2idx,traces2idx.reshape(-1,3)],axis=0)
# adj_mats = utils.get_adj_mats(ADJACENCY_DATA,NUM_ENTITIES,NUM_RELATIONS)