In [1]:
from tqdm import tqdm
import time
import torch
import rdflib
from rdflib import Namespace, Literal
from owlready2 import get_ontology

from src.utils import *
from src.gnn import *
from src.sparql_queries import *
from src.noise import *





In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'
print(device)

cpu


In [None]:
dataset_name = 'family'
# dataset_name = 'pizza'
# dataset_name = 'OWL2DL-1'

In [5]:
if dataset_name == 'family':
    uri = Namespace("http://www.co-ode.org/roberts/family-tree.owl#")
elif dataset_name == 'pizza':
    uri = Namespace("http://www.co-ode.org/ontologies/pizza/pizza.owl#")
elif dataset_name.startswith('OWL2DL-'):
    uri = Namespace("http://benchmark/OWL2Bench#")

# 1. Data

In [6]:
g = rdflib.Graph()
g.parse(f'datasets/{dataset_name}.owl')
num_triples = len(g)
print(f'Triplets found in {dataset_name}.owl: %d' % num_triples)

g_no_noise = rdflib.Graph()
g_no_noise.parse(f'datasets/bin/{dataset_name}_train.owl')
num_triples_train = len(g_no_noise)
print(f'Triplets found in {dataset_name}_train.owl: %d' % num_triples_train)

ontology = get_ontology(f'datasets/{dataset_name}.owl').load()

Triplets found in OWL2DL-1.owl: 55215
Triplets found in OWL2DL-1_train.owl: 40588


# 2. GNN

In [None]:
# relations_dict = {rel: i for i, rel in enumerate(relations)}
# nodes_dict = {node: i for i, node in enumerate(nodes)}

# nodes_dict_rev = {value: key for key, value in nodes_dict.items()}
# relations_dict_rev = {value: key for key, value in relations_dict.items()}

In [7]:
data = get_data(g_no_noise)
data = split_edges(data)

In [8]:
data

HeteroData(
  edge_index=[2, 55215],
  edge_type=[55215],
  val_pos_edge_index=[2, 0],
  val_edge_type=[0],
  test_pos_edge_index=[2, 11043],
  test_edge_type=[11043],
  train_pos_edge_index=[2, 44172],
  train_edge_type=[44172]
)

**Train**

In [None]:
relations = list(set(g_no_noise.predicates()))
nodes = list(set(g_no_noise.subjects()).union(set(g_no_noise.objects())))

In [9]:
st = time.time()
model = GNN(device, len(nodes), len(relations))

for epoch in range(10+1):
    loss = model._train(data.to(device))
    print(f'Epoch: {epoch}, Loss: {loss:.4f}')

torch.save(model, f'models/RGCN_{dataset_name}_train')
et = time.time()
elapsed_time = et - st
print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')

**Eval**

In [10]:
# model = torch.load(f'models/RGCN_{dataset_name}_train')
# mrr, mean_rank, median_rank, hits_at_5, hits_at_10 = model._eval(data.to(device))
# print(f'MRR: {mrr:.3f}, Mean Rank: {mean_rank:.3f}, Median Rank: {median_rank:.3f}, Hits@5: {hits_at_5:.3f}, Hits@10: {hits_at_10:.3f}')

# 3. Noise Generation

## 3.1. GNN: we add k triples with a low prediction score to the ontology

In [14]:
def add_triples_gnn(g, data, edge_types, noise_percentage):
    k = int((noise_percentage * num_triples) / len(edge_types))
    noisy_g_gnn = rdflib.Graph()
    new_g_gnn = copy_graph(g)
    for etype in tqdm(edge_types):   
        mask = data.edge_type == etype
        edge_index = torch.tensor([data.edge_index[0,mask].tolist(),data.edge_index[1,mask].tolist()])
        edge_type = data.edge_type[mask]

        output = model.model.encode(edge_index.to(device), edge_type.to(device))

        link_pred_scores = torch.matmul(output, output.T)
        output_norm = torch.norm(output, dim=1, keepdim=True)
        link_pred_scores_norm = link_pred_scores / (output_norm * output_norm.T)
        
        # We do not want to generate links that already exists
        # We want the subject and object to be an individual 
        link_pred_scores_norm[edge_index[0,:],edge_index[1,:]] = 1
        subset = link_pred_scores_norm[individual_id][:, individual_id]

        # Find the indices of the top k smallest elements
        _, topk_indices = torch.topk(subset.flatten(), k*2, largest=False)
        row_indices = topk_indices // subset.size(1)
        col_indices = topk_indices % subset.size(1)

        # Filter out indices where row index is greater than column index
        valid_indices_mask = row_indices < col_indices
        row_indices = row_indices[valid_indices_mask]
        col_indices = col_indices[valid_indices_mask]
        
        # Add generated triples
        node1_lst = [individual_names_dict[key] for key in row_indices.tolist()]
        node2_lst = [individual_names_dict[key] for key in col_indices.tolist()]
        edge_type_uri = relations_dict_rev[etype]
        noisy_g_gnn = add_links(noisy_g_gnn, node1_lst, node2_lst, edge_type_uri)
        new_g_gnn = add_links(new_g_gnn, node1_lst, node2_lst, edge_type_uri)
        
    return noisy_g_gnn, new_g_gnn

## 3.2. Random: we add k random triples to the ontology

In [7]:
def add_triples_random(g_no_noise, uri, noise_percentage):
    max_triples = int(noise_percentage * len(g_no_noise)) 

    noisy_g_random = rdflib.Graph()
    new_g_random = copy_graph(g_no_noise)
    num_triples = 0

    possible_predicates = get_possible_predicates(g_no_noise)
    subjects, objects = get_subjects_objects_given_predicate(g_no_noise, [str(uri).split("#")[-1] for uri in possible_predicates], uri)

    while num_triples < max_triples:
        s = random.choice(subjects)
        p = random.choice(possible_predicates)
        o = random.choice(objects)

        triple = (s, URIRef(uri + p), o)

        if triple not in g_no_noise:
            noisy_g_random.add(triple)
            new_g_random.add(triple)
            num_triples += 1
    return noisy_g_random, new_g_random

In [None]:
### TEST ###
noisy_g_random, new_g_random = add_triples_random(g_no_noise, uri, 0.25)
print(0.25 * len(g_no_noise))
print(len(noisy_g_random))

## 3.3. DL: we add individuals to the ontology that belong to disjoint classes/properties

In [12]:
all_disjoint_classes = get_disjoint_classes(ontology)
all_disjoint_properties = get_disjoint_properties(ontology)

In [1]:
# len()

In [None]:
def add_disjoint_axioms(g, g_no_noise, all_disjoint_classes, all_disjoint_properties, uri, noise_percentage):    
    max_triples = int((noise_percentage * len(g_no_noise))/2)

    noisy_g_disjoint = rdflib.Graph()
    noisy_g_disjoint += add_noise_disjoint_classes(g_no_noise, max_triples, all_disjoint_classes, uri)
    noisy_g_disjoint += add_noise_disjoint_properties(g, g_no_noise, max_triples, all_disjoint_properties, uri)

    new_g_disjoint = copy_graph(g_no_noise)
    new_g_disjoint += add_noise_disjoint_classes(g_no_noise, max_triples, all_disjoint_classes, uri)
    new_g_disjoint += add_noise_disjoint_properties(g, g_no_noise, max_triples, all_disjoint_properties, uri)
    return noisy_g_disjoint, new_g_disjoint

In [None]:
### TEST ###
noisy_g_disjoint, new_g_disjoint = add_disjoint_axioms(g, g_no_noise, all_disjoint_classes, all_disjoint_properties, uri, 0.25)
print(0.25 * len(g_no_noise))
print(len(noisy_g_disjoint))

# 4. Results

In [None]:
model = torch.load(f'models/RGCN_{dataset_name}')

for noise_percentage in [0.25, 0.5, 0.75, 1.0]:

    noisy_g_gnn, new_g_gnn = add_triples_gnn(g, data, edge_types, noise_percentage)
    noisy_g_gnn.serialize(destination=f"datasets/noise/{dataset_name}_gnn_{noise_percentage}.owl", format='xml')
    
    noisy_g_random, new_g_random = add_triples_random(g_no_noise, uri, noise_percentage)
    noisy_g_random.serialize(destination=f"datasets/noise/{dataset_name}_random_{noise_percentage}.owl", format='xml')
    
    noisy_g_disjoint, new_g_disjoint = add_disjoint_axioms(g, g_no_noise, all_disjoint_classes, all_disjoint_properties, uri, noise_percentage)
    noisy_g_disjoint.serialize(destination=f"datasets/noise/{dataset_name}_disjoint_{noise_percentage}.owl", format='xml')

  model = torch.load(f'models/RGCN_{dataset_name}')
100%|██████████| 38/38 [01:09<00:00,  1.84s/it]
100%|██████████| 38/38 [00:00<00:00, 58.56it/s]
100%|██████████| 38/38 [01:14<00:00,  1.97s/it]
100%|██████████| 38/38 [00:01<00:00, 32.87it/s]
100%|██████████| 38/38 [01:11<00:00,  1.88s/it]
100%|██████████| 38/38 [00:02<00:00, 15.92it/s]
100%|██████████| 38/38 [01:32<00:00,  2.42s/it]
100%|██████████| 38/38 [00:02<00:00, 16.31it/s]
