In [1]:
from tqdm import tqdm
import time
import torch
import rdflib
from rdflib import Namespace, Literal
from owlready2 import get_ontology

from src.utils import *
from src.gnn import *
from src.sparql_queries import *
from src.noise import *





In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'
print(device)

cpu


In [3]:
# dataset_name = 'family'
# dataset_name = 'pizza'
dataset_name = 'OWL2DL-1'
# dataset_name = 'lubm_filtered'

In [4]:
if dataset_name == 'family':
    uri = Namespace("http://www.co-ode.org/roberts/family-tree.owl#")
elif dataset_name == 'pizza':
    uri = Namespace("http://www.co-ode.org/ontologies/pizza/pizza.owl#")
elif dataset_name.startswith('OWL2DL-'):
    uri = Namespace("https://kracr.iiitd.edu.in/OWL2Bench#")
elif dataset_name.startswith('lubm'):
    uri = Namespace("http://swat.cse.lehigh.edu/onto/univ-bench.owl#")

# 1. Data

In [5]:
g = rdflib.Graph()
g.parse(f'datasets/{dataset_name}.owl')
num_triples = len(g)
print(f'Triplets found in {dataset_name}.owl: %d' % num_triples)

g_no_noise = rdflib.Graph()
g_no_noise.parse(f'datasets/{dataset_name}_train.owl', format='turtle')
num_triples_train = len(g_no_noise)
print(f'Triplets found in {dataset_name}.owl: %d' % num_triples_train)

ontology = get_ontology(f'datasets/{dataset_name}.owl').load()

Triplets found in OWL2DL-1.owl: 55101
Triplets found in OWL2DL-1.owl: 87662


# 2. GNN

In [6]:
data, nodes, nodes_dict, relations, relations_dict = get_data(g_no_noise)
nodes_dict_rev = {value: key for key, value in nodes_dict.items()}
relations_dict_rev = {value: key for key, value in relations_dict.items()}
data = split_edges(data)

In [7]:
data

HeteroData(
  edge_index=[2, 87662],
  edge_type=[87662],
  val_pos_edge_index=[2, 0],
  val_edge_type=[0],
  test_pos_edge_index=[2, 17532],
  test_edge_type=[17532],
  train_pos_edge_index=[2, 70130],
  train_edge_type=[70130]
)

**Train**

In [8]:
st = time.time()
model = GNN(device, len(nodes), len(relations))

for epoch in range(10+1):
    loss = model._train(data.to(device))
    print(f'Epoch: {epoch}, Loss: {loss:.4f}')

torch.save(model, f'models/RGCN_{dataset_name}')
et = time.time()
elapsed_time = et - st
print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')

Epoch: 0, Loss: 0.6932
Epoch: 1, Loss: 0.6778
Epoch: 2, Loss: 0.5739
Epoch: 3, Loss: 0.5082
Epoch: 4, Loss: 0.4547
Epoch: 5, Loss: 0.3819
Epoch: 6, Loss: 0.5966
Epoch: 7, Loss: 0.3212
Epoch: 8, Loss: 0.3092
Epoch: 9, Loss: 0.2815
Epoch: 10, Loss: 0.2302
Run time: 55 seconds, 1 minutes


**Eval**

In [9]:
# model = torch.load(f'models/RGCN_{dataset_name}')
# mrr, mean_rank, median_rank, hits_at_5, hits_at_10 = model._eval(data.to(device))
# print(f'MRR: {mrr:.3f}, Mean Rank: {mean_rank:.3f}, Median Rank: {median_rank:.3f}, Hits@5: {hits_at_5:.3f}, Hits@10: {hits_at_10:.3f}')

# 3. Noise Generation

## 3.1. GNN: we add k triples with a low prediction score to the ontology

In [10]:
# Create new dictionary of nodes that only contains individuals
_, individual_names, _ = get_individuals(g_no_noise)

nodes_dict_rev_new = {}
lst = []
i = 0
for k,v in nodes_dict_rev.items():
    if v in individual_names:
        nodes_dict_rev_new[i] = v
        lst.append(k)
        i += 1

In [11]:
def add_triples_gnn(g_no_noise, data, noise_percentage):

    possible_predicates = get_possible_predicates(g_no_noise)
    max_triples = int((noise_percentage * len(g_no_noise)) / len(possible_predicates))
    
    noisy_g_gnn = rdflib.Graph()
    new_g_gnn = copy_graph(g_no_noise)
    
    for key, value in tqdm(relations_dict_rev.items()): 
        if str(value) in possible_predicates: 
            mask = data.edge_type == key
            edge_index = torch.tensor([data.edge_index[0, mask].tolist(), data.edge_index[1, mask].tolist()])
            edge_type = data.edge_type[mask]

            output = model.model.encode(edge_index.to(model.device), edge_type.to(model.device))

            link_pred_scores = torch.matmul(output, output.T)
            output_norm = torch.norm(output, dim=1, keepdim=True)
            link_pred_scores_norm = link_pred_scores / (output_norm * output_norm.T)

            # We do not want to generate links that already exists
            link_pred_scores_norm[edge_index[0,:],edge_index[1,:]] = 1

            # We want the subject and object to be an individual 
            subset = link_pred_scores_norm[lst][:, lst]

            # Find the indices of the top k smallest elements
            _, topk_indices = torch.topk(subset.flatten(), max_triples*2, largest=False)
            row_indices = topk_indices // subset.size(1)
            col_indices = topk_indices % subset.size(1)

            # Filter out indices where row index is greater than column index
            valid_indices_mask = row_indices < col_indices
            row_indices = row_indices[valid_indices_mask]
            col_indices = col_indices[valid_indices_mask]

            # Add generated triples
            node1_lst = [nodes_dict_rev_new[row] for row in row_indices.tolist()]
            node2_lst = [nodes_dict_rev_new[col] for col in col_indices.tolist()]
            edge_type_uri = relations_dict_rev[key]
            noisy_g_gnn = add_links(noisy_g_gnn, node1_lst, node2_lst, edge_type_uri)
            new_g_gnn = add_links(new_g_gnn, node1_lst, node2_lst, edge_type_uri)
            
    return noisy_g_gnn, new_g_gnn

## 3.2. Random: we add k random triples to the ontology

In [12]:
def add_triples_random(g_no_noise, uri, noise_percentage):
    max_triples = int(noise_percentage * len(g_no_noise)) 

    noisy_g_random = rdflib.Graph()
    new_g_random = copy_graph(g_no_noise)
    num_triples = 0

    possible_predicates = get_possible_predicates(g_no_noise)
    subjects, objects = get_subjects_objects_given_predicate(g_no_noise, [str(uri).split("#")[-1] for uri in possible_predicates], uri)

    while num_triples < max_triples:
        s = random.choice(subjects)
        p = random.choice(possible_predicates)
        o = random.choice(objects)

        triple = (s, URIRef(uri + p), o)

        if triple not in g_no_noise:
            noisy_g_random.add(triple)
            new_g_random.add(triple)
            num_triples += 1
    return noisy_g_random, new_g_random

## 3.3. DL: we add individuals to the ontology that belong to disjoint classes/properties

In [13]:
all_disjoint_classes = get_disjoint_classes(ontology)
all_disjoint_properties = get_disjoint_properties(ontology)




































In [14]:
def add_disjoint_axioms(g, g_no_noise, all_disjoint_classes, all_disjoint_properties, uri, noise_percentage):    
    max_triples = int((noise_percentage * len(g_no_noise))/2)

    noisy_g_disjoint = rdflib.Graph()
    noisy_g_disjoint += add_noise_disjoint_classes(g_no_noise, max_triples, all_disjoint_classes, uri)
    noisy_g_disjoint += add_noise_disjoint_properties(g, g_no_noise, max_triples, all_disjoint_properties, uri)

    new_g_disjoint = copy_graph(g_no_noise)
    new_g_disjoint += add_noise_disjoint_classes(g_no_noise, max_triples, all_disjoint_classes, uri)
    new_g_disjoint += add_noise_disjoint_properties(g, g_no_noise, max_triples, all_disjoint_properties, uri)
    return noisy_g_disjoint, new_g_disjoint

# 4. Get files

In [15]:
for noise_percentage in [0.25, 0.5, 0.75, 1.0]:    
    noisy_g_random, new_g_random = add_triples_random(g_no_noise, uri, noise_percentage)
    noisy_g_random.serialize(destination=f"datasets/noise/{dataset_name}_random_{noise_percentage}.owl", format='xml')

In [17]:
for noise_percentage in [0.25, 0.5, 0.75, 1.0]:      
    noisy_g_disjoint, new_g_disjoint = add_disjoint_axioms(g, g_no_noise, all_disjoint_classes, all_disjoint_properties, uri, noise_percentage)
    noisy_g_disjoint.serialize(destination=f"datasets/noise/{dataset_name}_disjoint_{noise_percentage}.owl", format='xml')

We created new individuals...
We created new individuals...
We created new individuals...
We created new individuals...
We created new individuals...
We created new individuals...
We created new individuals...
We created new individuals...


In [18]:
model = torch.load(f'models/RGCN_{dataset_name}')

for noise_percentage in [0.25, 0.5, 0.75, 1.0]:
    noisy_g_gnn, new_g_gnn = add_triples_gnn(g_no_noise, data, noise_percentage)
    noisy_g_gnn.serialize(destination=f"datasets/noise/{dataset_name}_gnn_{noise_percentage}.owl", format='xml')

100%|██████████| 94/94 [03:11<00:00,  2.04s/it]
100%|██████████| 94/94 [03:34<00:00,  2.28s/it]
100%|██████████| 94/94 [03:33<00:00,  2.27s/it]
100%|██████████| 94/94 [03:32<00:00,  2.26s/it]


In [19]:
experiments = get_experimets(dataset_name)

In [20]:
for experiment in experiments[1:]: 
    dataset_name = experiment['dataset_name']
    file_name = experiment['file_name']

    g_train = rdflib.Graph()
    g_train.parse(f'datasets/{dataset_name}_train.owl', format='turtle')
    print(f'# G_train: {len(g_train)}')

    g_noise = rdflib.Graph()
    g_noise.parse(f'datasets/noise/{file_name}.owl')
    print(f'# G_noise: {len(g_noise)}')

    g_train += g_noise
    g_train.serialize(destination=f'datasets/{file_name}_train.owl')
    print(f'# G_train + G_noise: {len(g_train)}')

    print()

# G_train: 87662
# G_noise: 21900
# G_train + G_noise: 109562

# G_train: 87662
# G_noise: 21913
# G_train + G_noise: 109575

# G_train: 87662
# G_noise: 21278
# G_train + G_noise: 108526

# G_train: 87662
# G_noise: 43803
# G_train + G_noise: 131465

# G_train: 87662
# G_noise: 43831
# G_train + G_noise: 131493

# G_train: 87662
# G_noise: 41532
# G_train + G_noise: 128441

# G_train: 87662
# G_noise: 65751
# G_train + G_noise: 153413

# G_train: 87662
# G_noise: 65741
# G_train + G_noise: 153403

# G_train: 87662
# G_noise: 60592
# G_train + G_noise: 147230

# G_train: 87662
# G_noise: 87650
# G_train + G_noise: 175312

# G_train: 87662
# G_noise: 87650
# G_train + G_noise: 175312

# G_train: 87662
# G_noise: 79169
# G_train + G_noise: 165633

