In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import torch
from torch_geometric.utils import negative_sampling
import rdflib
from rdflib import RDF, Namespace, BNode
from owlready2 import get_ontology, Thing, AllDisjoint, FunctionalProperty
from sklearn.metrics import precision_score, recall_score, f1_score

from src.utils import *
from src.gnn import *
from src.sparql_queries import *





In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
#dataset_name = 'family'
#dataset_name = 'pizza'
dataset_name = 'OWL2DL-1'

In [4]:
if dataset_name == 'family':
    uri = Namespace("http://www.co-ode.org/roberts/family-tree.owl#")
elif dataset_name == 'pizza':
    uri = Namespace("http://www.co-ode.org/ontologies/pizza/pizza.owl#")
elif dataset_name.startswith('OWL2DL-'):
    uri = Namespace("http://benchmark/OWL2Bench#")

# 1. Data

In [5]:
g = rdflib.Graph()
g.parse(f'datasets/{dataset_name}.owl')
num_triples = len(g)
print(f'Triplets found in {dataset_name}.owl: %d' % num_triples)

onto = get_ontology(f'datasets/{dataset_name}.owl').load()

Triplets found in OWL2DL-1.owl: 55215


In [6]:
relations = list(set(g.predicates()))
nodes = list(set(g.subjects()).union(set(g.objects())))

relations_dict = {rel: i for i, rel in enumerate(relations)}
nodes_dict = {node: i for i, node in enumerate(nodes)}

nodes_dict_rev = {value: key for key, value in nodes_dict.items()}
relations_dict_rev = {value: key for key, value in relations_dict.items()}

In [7]:
data = get_data(g, nodes_dict, relations_dict)
data = split_edges(data)

In [8]:
data

HeteroData(
  edge_index=[2, 55215],
  edge_type=[55215],
  val_pos_edge_index=[2, 0],
  val_edge_type=[0],
  test_pos_edge_index=[2, 11043],
  test_edge_type=[11043],
  train_pos_edge_index=[2, 44172],
  train_edge_type=[44172]
)

# 2. GNN

**Train**

In [32]:
st = time.time()
model = GNN(device, len(nodes), len(relations))

for epoch in range(10+1):
    loss = model._train(data.to(device))
    print(f'Epoch: {epoch}, Loss: {loss:.4f}')

torch.save(model, f'models/RGCN_{dataset_name}')
et = time.time()
elapsed_time = et - st
print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')

Epoch: 0, Loss: 0.6932
Epoch: 1, Loss: 0.6921
Epoch: 2, Loss: 0.6787
Epoch: 3, Loss: 0.6240
Epoch: 4, Loss: 0.4950
Epoch: 5, Loss: 0.3812
Epoch: 6, Loss: 0.2562
Epoch: 7, Loss: 0.1760
Epoch: 8, Loss: 0.3110
Epoch: 9, Loss: 0.1692
Epoch: 10, Loss: 0.1697
Run time: 33 seconds, 1 minutes


**Eval**

In [33]:
#model = torch.load(f'models/RGCN_{dataset_name}')
#mrr, mean_rank, median_rank, hits_at_5, hits_at_10 = model._eval(data.to(device))
#print(f'MRR: {mrr:.3f}, Mean Rank: {mean_rank:.3f}, Median Rank: {median_rank:.3f}, Hits@5: {hits_at_5:.3f}, Hits@10: {hits_at_10:.3f}')

KeyboardInterrupt: 

# 3. Noise Generation

In [9]:
keys = keys = [rel for rel in relations if not rel.startswith('http://www.w3.org/')]
edge_types = [relations_dict[key] for key in keys]

In [10]:
individuals = []

qres = g.query("""
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?s WHERE {
 ?s rdf:type owl:NamedIndividual .
}
""")

for row in qres:
    individuals.append(row.s)

individual_id = [nodes_dict[individual] for individual in individuals]
individual_names = [nodes_dict_rev[individual_id] for individual_id in individual_id]
individual_names_dict = dict(zip(list(set(np.arange(len(individual_id)))), individual_names))

## 3.1. GNN: we add k triples with a low prediction score to the ontology

In [11]:
def add_triples_gnn(g, data, edge_types, noise_percentage):
    k = int((noise_percentage * num_triples) / len(edge_types))
    noisy_g_gnn = rdflib.Graph()
    new_g_gnn = copy_graph(g)
    for etype in tqdm(edge_types):   
        mask = data.edge_type == etype
        edge_index = torch.tensor([data.edge_index[0,mask].tolist(),data.edge_index[1,mask].tolist()])
        edge_type = data.edge_type[mask]

        output = model.model.encode(edge_index.to(device), edge_type.to(device))

        link_pred_scores = torch.matmul(output, output.T)
        output_norm = torch.norm(output, dim=1, keepdim=True)
        link_pred_scores_norm = link_pred_scores / (output_norm * output_norm.T)
        
        # We do not want to generate links that already exists
        # We want the subject and object to be an individual 
        link_pred_scores_norm[edge_index[0,:],edge_index[1,:]] = 1
        subset = link_pred_scores_norm[individual_id][:, individual_id]

        # Find the indices of the top k smallest elements
        _, topk_indices = torch.topk(subset.flatten(), k*2, largest=False)
        row_indices = topk_indices // subset.size(1)
        col_indices = topk_indices % subset.size(1)

        # Filter out indices where row index is greater than column index
        valid_indices_mask = row_indices < col_indices
        row_indices = row_indices[valid_indices_mask]
        col_indices = col_indices[valid_indices_mask]
        
        # Add generated triples
        node1_lst = [individual_names_dict[key] for key in row_indices.tolist()]
        node2_lst = [individual_names_dict[key] for key in col_indices.tolist()]
        edge_type_uri = relations_dict_rev[etype]
        noisy_g_gnn = add_links(noisy_g_gnn, node1_lst, node2_lst, edge_type_uri)
        new_g_gnn = add_links(new_g_gnn, node1_lst, node2_lst, edge_type_uri)
        
    return noisy_g_gnn, new_g_gnn

## 3.2. Random: we add k random triples to the ontology

In [12]:
def add_triples_random(g, data, edge_types, noise_percentage):
    k = int((noise_percentage * num_triples) / len(edge_types))
    noisy_g_random = rdflib.Graph()
    new_g_random = copy_graph(g)
    for etype in tqdm(edge_types):   
        mask = data.edge_type == etype
        edge_index = torch.tensor([data.edge_index[0,mask].tolist(),data.edge_index[1,mask].tolist()])
        neg_edge_index = negative_sampling(edge_index, num_neg_samples=k)
        
        # Add generated triples
        node1_lst = [nodes_dict_rev[key.item()] for key in neg_edge_index[0]]
        node2_lst = [nodes_dict_rev[key.item()] for key in neg_edge_index[1]]
        edge_type_uri = relations_dict_rev[etype]
        noisy_g_random = add_links(noisy_g_random, node1_lst, node2_lst, edge_type_uri)
        new_g_random = add_links(new_g_random, node1_lst, node2_lst, edge_type_uri)

    return noisy_g_random, new_g_random

## 3.3. DL: we add individuals to the ontology that belong to disjoint classes/properties

In [13]:
def add_disjoint_axioms(g, noise_percentage):
    if len(all_disjoint_classes) > 0 and len(all_disjoint_properties) > 0: 
        k = int((noise_percentage * num_triples)/2)
    else: 
        k = int(noise_percentage * num_triples)
    noisy_g_disjoint = rdflib.Graph()
    new_g_disjoint = copy_graph(g)
    
    #classes
    for i in range(int((k/len(all_disjoint_classes)))):
        try: individual = individual_names[i]
        except IndexError: individual = uri[f'I{i}']
        for axiom_ in range(len(all_disjoint_classes)):
            for class_ in all_disjoint_classes[axiom_]:
                noisy_g_disjoint.add((individual, RDF.type, uri[class_]))
                new_g_disjoint.add((individual, RDF.type, uri[class_]))
                
    #properties
    for i in range(len(all_disjoint_properties)):
        prop1, prop2 = all_disjoint_properties[i]
        prop1_lst = list(g.triples((None, uri[prop1], None)))
        prop2_lst = list(g.triples((None, uri[prop2], None)))
        
        j=0
        while j < int((k/len(all_disjoint_properties))):
            try: 
                noisy_g_disjoint.add((prop1_lst[j][0], uri[prop2], prop1_lst[j][2]))
                noisy_g_disjoint.add((prop2_lst[j][0], uri[prop1], prop2_lst[j][2]))
                new_g_disjoint.add((prop1_lst[j][0], uri[prop2], prop1_lst[j][2]))
                new_g_disjoint.add((prop2_lst[j][0], uri[prop1], prop2_lst[j][2]))
            except IndexError:
                noisy_g_disjoint.add((uri[f'S{j}P1'], uri[prop2], uri[f'O{j}P1']))
                noisy_g_disjoint.add((uri[f'S{j}P2'], uri[prop1], uri[f'O{j}P2']))
                new_g_disjoint.add((uri[f'S{j}P1'], uri[prop2], uri[f'O{j}P1']))
                new_g_disjoint.add((uri[f'S{j}P2'], uri[prop1], uri[f'O{j}P2']))
            j+=1

    return noisy_g_disjoint, new_g_disjoint

**Extracting Disjoint Class Axioms**

In [14]:
disjoint_classes = []
for disjoint in onto.disjoint_classes():
    if isinstance(disjoint, AllDisjoint):
        disjoint_classes.append(disjoint)

all_disjoint_classes = []
for disjoint in disjoint_classes:
    all_disjoint_classes.append([cls.name for cls in disjoint.entities])




































**Extracting Disjoint Property Axioms**

In [15]:
disjoint_properties = []
for disjoint in onto.disjoint_properties():
    if isinstance(disjoint, AllDisjoint):
        disjoint_properties.append(disjoint)

all_disjoint_properties = []
for disjoint in disjoint_properties:
    all_disjoint_properties.append([cls.name for cls in disjoint.entities])

# 4. Results

In [16]:
model = torch.load(f'models/RGCN_{dataset_name}')

for noise_percentage in [0.25, 0.5, 0.75, 1.0]:

    noisy_g_gnn, new_g_gnn = add_triples_gnn(g, data, edge_types, noise_percentage)
    noisy_g_gnn.serialize(destination=f"datasets/noise/{dataset_name}_noisy_gnn_{noise_percentage}.owl", format='xml')
    
    noisy_g_random, new_g_random = add_triples_random(g, data, edge_types, noise_percentage)
    noisy_g_random.serialize(destination=f"datasets/noise/{dataset_name}_noisy_random_{noise_percentage}.owl", format='xml')
    
    noisy_g_disjoint, new_g_disjoint = add_disjoint_axioms(g, noise_percentage)
    noisy_g_disjoint.serialize(destination=f"datasets/noise/{dataset_name}_noisy_disjoint_{noise_percentage}.owl", format='xml')

  0%|                                                                                           | 0/48 [00:04<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.03 GiB (GPU 0; 4.00 GiB total capacity; 2.25 GiB already allocated; 142.65 MiB free; 2.49 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF