In [None]:
import os
import random
import time

import numpy as np
import pandas as pd
import rdflib
import torch
from sklearn.metrics import precision_score, recall_score, f1_score
from tqdm import tqdm

from src.utils import *
from src.gnn import *
from src.sparql_queries import *

# 1. Data

In [None]:
# Parse the family ontology into an in-memory RDF graph.
g = rdflib.Graph()
g.parse('datasets/family.owl')

# Report the number of triples held by the parsed graph.
print('Triplets found: %d' % len(g))

In [None]:
# Enumerate relations and nodes in a stable order.
# Iteration order of a set of RDF terms can change between interpreter
# sessions, which would silently change every integer id below; a model saved
# in one session (models/RGCN) would then be reloaded against different ids.
# Sorting by the N3 form makes the numbering a function of the graph content.
# NOTE(review): blank-node labels may still differ between parses - confirm
# whether the ontology contains blank nodes.
relations = sorted(set(g.predicates()), key=lambda term: term.n3())
nodes = sorted(set(g.subjects()).union(set(g.objects())), key=lambda term: term.n3())

# RDF term -> integer id.
relations_dict = {rel: i for i, rel in enumerate(relations)}
nodes_dict = {node: i for i, node in enumerate(nodes)}

# Integer id -> RDF term.
nodes_dict_rev = {value: key for key, value in nodes_dict.items()}
relations_dict_rev = {value: key for key, value in relations_dict.items()}

In [None]:
# Build the data object from the graph and the id mappings, then pass it
# through split_edges in one step.
data = split_edges(get_data(g, nodes_dict, relations_dict))

In [None]:
# Display the data object returned by split_edges as the cell output.
data

# 2. GNN

**Train**

In [None]:
# Train the GNN, report Hits@1 / Hits@10 periodically and persist the model.
N_EPOCHS = 300      # the loop runs N_EPOCHS + 1 iterations (epochs 0..N_EPOCHS)
EVAL_EVERY = 100    # evaluate and print metrics every EVAL_EVERY epochs
MODEL_PATH = 'models/RGCN'

# Create the output directory before training so that a missing folder cannot
# make torch.save fail after all epochs have already been spent.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

st = time.time()
model = GNN()

for epoch in range(N_EPOCHS + 1):
    loss = model._train(data, len(nodes), len(relations))
    if (epoch % EVAL_EVERY) == 0:
        hits1, hits10 = model._eval(data)
        print(f'Epoch: {epoch}, Loss: {loss:.4f}, Hits@1: {hits1:.3f}, Hits@10: {hits10:.3f}')

# The whole model object is pickled (not just a state dict).
torch.save(model, MODEL_PATH)
et = time.time()
elapsed_time = et - st
print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')

**Eval**

In [None]:
# Reload the persisted model and print its Hits@1 / Hits@10 on `data`.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
model = torch.load('models/RGCN')
hits1, hits10 = model._eval(data)
print(f'Hits@1: {hits1:.3f}, Hits@10: {hits10:.3f}')

# 3. Generate New Links

In [None]:
# Relations of the family-tree ontology for which new links are generated.
FAMILY_NS = 'http://www.co-ode.org/roberts/family-tree.owl#'
RELATION_NAMES = [
    'isBrotherOf',
    'isFatherOf',
    'isSonOf',
    'isMalePartnerIn',
    'isSisterOf',
    'isMotherOf',
    'isDaughterOf',
    'isFemalePartnerIn',
]

# URIRef is reached through the explicitly imported rdflib module, so this cell
# does not depend on a wildcard import happening to expose the bare name.
keys = [rdflib.URIRef(FAMILY_NS + name) for name in RELATION_NAMES]

# Integer relation ids, in the same order as `keys`.
edge_types = [relations_dict[key] for key in keys]

In [None]:
# Collect every subject typed owl:NamedIndividual; these are the only nodes
# allowed as subject/object of generated links.
qres = g.query("""
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX fo: <http://www.co-ode.org/roberts/family-tree.owl#>
SELECT ?s WHERE {
 ?s rdf:type owl:NamedIndividual .
}
""")
persons = [row.s for row in qres]

# Node id of each person, in query-result order.
person_id = [nodes_dict[person] for person in persons]
# RDF term of each person, aligned position-by-position with `person_id`.
person_names = [nodes_dict_rev[node_id] for node_id in person_id]
# Position within `person_id` -> RDF term. Built with enumerate so the pairing
# is explicit instead of relying on the iteration order of a set of integers.
person_names_dict = dict(enumerate(person_names))

### GNN: we add new links with a low prediction score to the ontology

In [None]:
def add_triples_gnn(g, data, edge_types, k):
    """Add, per relation, the person pairs the trained GNN scores lowest.

    For every relation id in ``edge_types`` the encoder is run on the edges of
    that relation, the pairwise cosine similarity of its output rows is
    computed, and the ``k`` lowest-scoring person pairs that are not already
    linked by that relation become new triples.

    Reads the notebook-level names ``model``, ``person_id``,
    ``person_names_dict`` and ``relations_dict_rev``.

    Args:
        g: Source graph; passed to ``copy_graph`` to create the augmented graph.
        data: Object exposing ``edge_index`` (2 x E) and ``edge_type`` (E) tensors.
        edge_types: Iterable of integer relation ids to generate links for.
        k: Maximum number of triples to add per relation. Fewer are added when
            fewer candidate pairs exist.

    Returns:
        Tuple ``(noisy_g_gnn, new_g_gnn)``: a graph holding only the generated
        triples, and the copy of ``g`` extended with them.
    """
    noisy_g_gnn = rdflib.Graph()
    new_g_gnn = copy_graph(g)
    for etype in tqdm(edge_types):
        mask = data.edge_type == etype
        # Slice directly: keeps an integer (2, E) tensor even when E == 0,
        # which a round trip through Python lists does not.
        edge_index = data.edge_index[:, mask]
        edge_type = data.edge_type[mask]

        # Pure inference: no autograd graph is needed for the scores.
        with torch.no_grad():
            output = model.model.encode(edge_index, edge_type)
            link_pred_scores = torch.matmul(output, output.T)
            output_norm = torch.norm(output, dim=1, keepdim=True)
            link_pred_scores_norm = link_pred_scores / (output_norm * output_norm.T)

        # We do not want to generate links that already exist: track them in an
        # explicit boolean mask rather than overwriting their score.
        already_linked = torch.zeros_like(link_pred_scores_norm, dtype=torch.bool)
        already_linked[edge_index[0], edge_index[1]] = True

        # We want the subject and object to be persons.
        subset = link_pred_scores_norm[person_id][:, person_id]
        subset_linked = already_linked[person_id][:, person_id]

        # The similarity matrix is symmetric, so only entries with row < col are
        # candidates; this also excludes pairing a person with itself.
        free_rows, free_cols = torch.nonzero(~subset_linked, as_tuple=True)
        upper = free_rows < free_cols
        cand_rows = free_rows[upper]
        cand_cols = free_cols[upper]
        cand_scores = subset[cand_rows, cand_cols]

        # Never ask topk for more entries than there are candidates.
        n_new = max(0, min(k, cand_scores.numel()))
        if n_new == 0:
            continue
        _, lowest = torch.topk(cand_scores, n_new, largest=False)
        row_indices = cand_rows[lowest]
        col_indices = cand_cols[lowest]

        # Add generated triples (positions in `person_id` -> RDF terms).
        node1_lst = [person_names_dict[key] for key in row_indices.tolist()]
        node2_lst = [person_names_dict[key] for key in col_indices.tolist()]
        edge_type_uri = relations_dict_rev[etype]
        noisy_g_gnn = add_links(noisy_g_gnn, node1_lst, node2_lst, edge_type_uri)
        new_g_gnn = add_links(new_g_gnn, node1_lst, node2_lst, edge_type_uri)

    return noisy_g_gnn, new_g_gnn

### Random: we add random links to the ontology

In [None]:
def add_triples_random(g, data, edge_types, k):
    """Add, per relation, ``k`` uniformly sampled person-to-person links.

    For every relation id in ``edge_types`` random (head, tail) person pairs
    are drawn until ``k`` distinct pairs are found that are not already linked
    by that relation and do not pair a person with itself.

    Reads the notebook-level names ``person_id``, ``nodes_dict_rev`` and
    ``relations_dict_rev``.

    Args:
        g: Source graph; passed to ``copy_graph`` to create the augmented graph.
        data: Object exposing ``edge_index`` (2 x E) and ``edge_type`` (E) tensors.
        edge_types: Iterable of integer relation ids to generate links for.
        k: Number of triples to add per relation. Fewer are added when fewer
            valid pairs exist.

    Returns:
        Tuple ``(noisy_g_random, new_g_random)``: a graph holding only the
        generated triples, and the copy of ``g`` extended with them.
    """
    noisy_g_random = rdflib.Graph()
    new_g_random = copy_graph(g)

    # De-duplicate (order preserved) so every person is equally likely.
    candidates = list(dict.fromkeys(person_id))
    candidate_set = set(candidates)
    n_persons = len(candidates)

    for etype in tqdm(edge_types):
        mask = data.edge_type == etype
        edge_index = data.edge_index[:, mask]

        # We do not want to generate links that already exist: compare whole
        # (head, tail) pairs, not heads and tails independently.
        existing = set(zip(edge_index[0].tolist(), edge_index[1].tolist()))

        # Number of valid ordered pairs = all person pairs without self-pairs,
        # minus those already linked. Capping the target by it guarantees that
        # the rejection-sampling loop below terminates.
        blocked = sum(
            1 for h, t in existing
            if h != t and h in candidate_set and t in candidate_set
        )
        n_new = max(0, min(k, n_persons * (n_persons - 1) - blocked))

        candidate_heads = []
        candidate_tails = []
        chosen = set()
        while len(chosen) < n_new:
            h = random.choice(candidates)
            t = random.choice(candidates)
            pair = (h, t)
            # Reject self-links, existing links and duplicates; only accepted
            # pairs count towards the target.
            if h == t or pair in existing or pair in chosen:
                continue
            chosen.add(pair)
            candidate_heads.append(h)
            candidate_tails.append(t)

        # Add generated triples (node ids -> RDF terms).
        node1_lst = [nodes_dict_rev[key] for key in candidate_heads]
        node2_lst = [nodes_dict_rev[key] for key in candidate_tails]
        edge_type_uri = relations_dict_rev[etype]
        noisy_g_random = add_links(noisy_g_random, node1_lst, node2_lst, edge_type_uri)
        new_g_random = add_links(new_g_random, node1_lst, node2_lst, edge_type_uri)

    return noisy_g_random, new_g_random

# 4. Experiments

In [None]:
# Add k triples per edge_type
k = 100
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
model = torch.load('models/RGCN')

# The output files use the .owl extension, which rdflib parses as RDF/XML.
# The serialisation format is therefore pinned to "xml": without it, rdflib
# versions whose default format is Turtle write files that cannot be parsed
# back under that extension.
noisy_g_gnn, new_g_gnn = add_triples_gnn(g, data, edge_types, k)
noisy_g_gnn.serialize(destination=f"datasets/family_noisy_gnn_{k}.owl", format="xml")
noisy_g_random, new_g_random = add_triples_random(g, data, edge_types, k)
noisy_g_random.serialize(destination=f"datasets/family_noisy_random_{k}.owl", format="xml")

In [None]:
# Fetch the seven contradiction queries once and run them against both
# augmented graphs (GNN-generated first, random second).
query1, query2, query3, query4, query5, query6, query7 = get_queries()
contradiction_queries = [query1, query2, query3, query4, query5, query6, query7]

for augmented_graph in (new_g_gnn, new_g_random):
    print('Triplets found: %d' % len(augmented_graph))
    print('Contradictions:')
    for q in contradiction_queries:
        print_result(augmented_graph, q)