In [222]:
import os
import random
import time

import rdflib
import torch
from rdflib import URIRef, Literal, Namespace, RDFS, BNode
from tqdm import tqdm

from src.utils import *
from src.gnn import *
from src.sparql_queries import *
from src.noise import *

In [None]:
# Auto-detect the compute device: CUDA when available, CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): the detected device is overridden right away, so everything
# below runs on the CPU regardless of GPU availability. Remove this line to
# re-enable the auto-detected device.
device = 'cpu'
print(device)

In [224]:
# Name of the dataset to process; it is interpolated into every dataset,
# model and noise file path used in this notebook.
dataset_name = 'OWL2DL-1'
# dataset_name = 'family'

In [225]:
# Base namespace of the selected ontology.
if dataset_name == 'family':
    uri = Namespace("http://www.example.com/genealogy.owl#")
elif dataset_name.startswith('OWL2DL-'):
    uri = Namespace("https://kracr.iiitd.edu.in/OWL2Bench#")
else:
    # Fail here with a clear message instead of leaving `uri` undefined and
    # hitting a NameError in some later cell.
    raise ValueError(
        f"Unsupported dataset_name {dataset_name!r}: expected 'family' "
        "or a name starting with 'OWL2DL-'"
    )

# 1. Data

In [None]:
# Graph parsed from '<dataset_name>_modified.owl'.
g = rdflib.Graph()
g.parse(f'datasets/{dataset_name}_modified.owl', format='turtle')
num_triples = len(g)
# Single f-string instead of mixing f-string and %-formatting (the mixed form
# breaks if dataset_name ever contains a '%').
print(f'Triplets found in {dataset_name}.owl: {num_triples}')

# Training graph without injected noise, parsed from
# '<dataset_name>_train_modified.owl'.
g_no_noise = rdflib.Graph()
g_no_noise.parse(f'datasets/{dataset_name}_train_modified.owl', format='turtle')
num_triples_train = len(g_no_noise)

# Training graph parsed from '<dataset_name>_train.owl'; this is the graph
# the GNN data and the random/GNN noise are built from further below.
g_no_noise_unmodified = rdflib.Graph()
g_no_noise_unmodified.parse(f'datasets/{dataset_name}_train.owl', format='turtle')

# 2. GNN

In [227]:
# Build the GNN input from the unmodified training graph.
data, nodes, nodes_dict, relations, relations_dict = get_data(g_no_noise_unmodified)

# Inverse lookups: swap keys and values of the node / relation dictionaries.
nodes_dict_rev = {index: node for node, index in nodes_dict.items()}
relations_dict_rev = {index: relation for relation, index in relations_dict.items()}

# Replace `data` with the result of the project's edge split.
data = split_edges(data)

In [None]:
# Show the data object (after split_edges) as this cell's output.
data

**Train**

In [None]:
# Last epoch index; the loop below runs epochs 0..NUM_EPOCHS inclusive.
NUM_EPOCHS = 10

# Make sure the checkpoint directory exists *before* training, so a missing
# folder cannot discard a finished training run at save time.
os.makedirs('models', exist_ok=True)

st = time.time()
model = GNN(device, len(nodes), len(relations))

for epoch in range(NUM_EPOCHS + 1):
    loss = model._train(data.to(device))
    print(f'Epoch: {epoch}, Loss: {loss:.4f}')

# The whole model object is saved (not just a state dict); it is loaded back
# with torch.load in a later cell.
torch.save(model, f'models/RGCN_{dataset_name}')
et = time.time()
elapsed_time = et - st
print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')

**Eval**

In [230]:
# model = torch.load(f'models/RGCN_{dataset_name}')
# mrr, mean_rank, median_rank, hits_at_5, hits_at_10 = model._eval(data.to(device))
# print(f'MRR: {mrr:.3f}, Mean Rank: {mean_rank:.3f}, Median Rank: {median_rank:.3f}, Hits@5: {hits_at_5:.3f}, Hits@10: {hits_at_10:.3f}')

# 3. Noise Generation

## 3.1. GNN: we add k triples with a low prediction score to the ontology

In [231]:
def add_triples_gnn(g_no_noise, data, noise_percentage):
    """Create noisy triples guided by the trained GNN's node embeddings.

    For each relation, the edges of that relation are encoded with the
    notebook-level ``model``; the pairwise cosine similarity of the resulting
    embeddings is computed and the least similar node pairs are selected.
    For every selected pair, a randomly chosen existing triple of that
    relation has either its subject or its object replaced by a node of the
    pair. Corruptions that are not already in ``g_no_noise`` are kept.

    Relies on the notebook globals ``model``, ``relations_dict_rev`` and
    ``nodes_dict_rev``.

    Args:
        g_no_noise: rdflib graph holding the clean triples.
        data: object exposing ``edge_index`` and ``edge_type`` tensors.
        noise_percentage: fraction of ``len(g_no_noise)`` used to size the
            per-relation budget.

    Returns:
        Tuple ``(noisy_g_gnn, new_g_gnn)``: a graph containing only the
        generated triples, and the graph returned by ``copy_graph(g_no_noise)``
        with the generated triples added to it.
    """
    # Per-relation budget of candidate pairs.
    max_triples = int((noise_percentage * len(g_no_noise)) / len(relations_dict_rev))

    noisy_g_gnn = rdflib.Graph()
    new_g_gnn = copy_graph(g_no_noise)

    for key, edge_type_uri in tqdm(relations_dict_rev.items()):
        # The set of existing triples for this relation does not change while
        # candidates are processed, so query the graph once per relation
        # instead of once per candidate pair.
        existing_triples = list(g_no_noise.triples((None, URIRef(edge_type_uri), None)))
        if not existing_triples:
            # Nothing to corrupt for this relation.
            continue

        mask = data.edge_type == key
        edge_index = data.edge_index[:, mask].to(model.device)
        edge_type = data.edge_type[mask].to(model.device)

        # Inference only: no autograd graph is needed for the N x N scores.
        with torch.no_grad():
            output = model.model.encode(edge_index, edge_type)

            # Cosine similarity between every pair of node embeddings.
            link_pred_scores = torch.matmul(output, output.T)
            output_norm = torch.norm(output, dim=1, keepdim=True)
            link_pred_scores_norm = link_pred_scores / (output_norm * output_norm.T)

        # Give existing edges the maximum cosine value so they are not picked
        # among the lowest-scoring pairs.
        link_pred_scores_norm[edge_index[0], edge_index[1]] = 1

        # topk raises if k exceeds the number of scores, so clamp it.
        k = min(max_triples * 2, link_pred_scores_norm.numel())
        _, topk_indices = torch.topk(link_pred_scores_norm.flatten(), k, largest=False)
        num_cols = link_pred_scores_norm.size(1)
        row_indices = topk_indices // num_cols
        col_indices = topk_indices % num_cols

        # Keep each unordered pair once and drop self-pairs.
        valid_indices_mask = row_indices < col_indices
        row_indices = row_indices[valid_indices_mask]
        col_indices = col_indices[valid_indices_mask]

        # NOTE(review): assumes embedding row i corresponds to node index i of
        # nodes_dict_rev - confirm against get_data / the encoder.
        node1_lst = [nodes_dict_rev[row] for row in row_indices.tolist()]
        node2_lst = [nodes_dict_rev[col] for col in col_indices.tolist()]

        for s, o in zip(node1_lst, node2_lst):
            subject, predicate, obj = random.choice(existing_triples)

            # Replace either the subject or the object, chosen at random.
            if random.choice([True, False]):
                corrupted_triple = (s, predicate, obj)
            else:
                corrupted_triple = (subject, predicate, o)

            if corrupted_triple not in g_no_noise:
                noisy_g_gnn.add(corrupted_triple)
                new_g_gnn.add(corrupted_triple)

    return noisy_g_gnn, new_g_gnn

## 3.2. Random: we add k random triples to the ontology

In [232]:
def add_triples_random(g_no_noise, noise_percentage):
    """Create random noisy triples by corrupting existing ones.

    Repeatedly picks a random triple of ``g_no_noise`` and replaces either
    its subject (with a random subject of the graph) or its object (with a
    random object of the graph). A corruption is kept only if it is neither
    in ``g_no_noise`` nor already generated, so the result holds distinct
    new triples.

    Args:
        g_no_noise: rdflib graph holding the clean triples.
        noise_percentage: fraction of ``len(g_no_noise)`` giving the number
            of noisy triples to generate.

    Returns:
        Tuple ``(noisy_g_random, new_g_random)``: a graph containing only the
        generated triples, and the graph returned by ``copy_graph(g_no_noise)``
        with the generated triples added to it.
    """
    max_triples = int(noise_percentage * len(g_no_noise))

    noisy_g_random = rdflib.Graph()
    new_g_random = copy_graph(g_no_noise)
    num_triples = 0

    subjects = list(set(g_no_noise.subjects()))
    objects = list(set(g_no_noise.objects()))
    triples_list = list(g_no_noise)

    # Bound the number of draws so the loop terminates even when the graph
    # cannot yield `max_triples` distinct new corruptions.
    max_attempts = max(1000, 100 * max_triples)
    attempts = 0

    while num_triples < max_triples and attempts < max_attempts:
        attempts += 1
        s, p, o = random.choice(triples_list)

        if random.choice([True, False]):
            corrupted_triple = (random.choice(subjects), p, o)
        else:
            corrupted_triple = (s, p, random.choice(objects))

        # Skip triples that are already true or were already generated;
        # counting a repeated corruption would leave the noisy graph with
        # fewer than `max_triples` distinct triples.
        if corrupted_triple in g_no_noise or corrupted_triple in noisy_g_random:
            continue

        noisy_g_random.add(corrupted_triple)
        new_g_random.add(corrupted_triple)
        num_triples += 1

    if num_triples < max_triples:
        print(f'Warning: generated only {num_triples} of {max_triples} '
              f'random noisy triples after {attempts} attempts')

    return noisy_g_random, new_g_random

## 3.3. DL: we add individuals to the ontology that belong to disjoint classes/properties + violate range

In [233]:
# Disjointness information extracted from the '_train_modified' graph by the
# project helpers; both results are passed to add_triples_logical below.
all_disjoint_classes = get_disjoint_classes(g_no_noise)
all_disjoint_properties = get_disjoint_properties(g_no_noise)

# 4. Get files

In [234]:
# Random noise: for each noise level, generate the noisy triples from the
# unmodified training graph and write only those triples as RDF/XML.
# NOTE(review): no random seed is set in the visible cells, so the generated
# files differ between runs.
for noise_percentage in [0.25, 0.5, 0.75, 1.0]:    
    noisy_g_random, new_g_random = add_triples_random(g_no_noise_unmodified, noise_percentage)
    noisy_g_random.serialize(destination=f"datasets/noise/{dataset_name}_random_{noise_percentage}.owl", format='xml')

In [None]:
# Reload the model object saved by the training cell; add_triples_gnn reads
# it through the notebook-global name `model`.
# NOTE(review): torch.load unpickles the file, so only load checkpoints you
# trust. Recent PyTorch releases default to weights_only=True, which may
# reject a fully pickled model - confirm against the installed version.
model = torch.load(f'models/RGCN_{dataset_name}')

# GNN-guided noise: one RDF/XML file of noisy triples per noise level.
for noise_percentage in [0.25, 0.5, 0.75, 1.0]:
    noisy_g_gnn, new_g_gnn = add_triples_gnn(g_no_noise_unmodified, data, noise_percentage)
    noisy_g_gnn.serialize(destination=f"datasets/noise/{dataset_name}_gnn_{noise_percentage}.owl", format='xml')

In [236]:
# Logical noise: one RDF/XML file of noisy triples per noise level, using the
# disjoint classes / properties collected earlier.
# NOTE(review): this cell passes g_no_noise ('_train_modified'), whereas the
# random and GNN cells pass g_no_noise_unmodified - confirm this is intended.
for noise_percentage in [0.25, 0.5, 0.75, 1.0]:
    noisy_triples, new_graph = add_triples_logical(g_no_noise, noise_percentage, all_disjoint_classes, all_disjoint_properties)
    noisy_triples.serialize(destination=f"datasets/noise/{dataset_name}_logical_{noise_percentage}.owl", format='xml')

In [237]:
# Experiment descriptors for this dataset; the next cell reads the
# 'dataset_name' and 'file_name' keys of each entry.
# (`get_experimets` is spelled this way where it is defined, outside this notebook.)
experiments = get_experimets(dataset_name)

In [None]:
# Build one noisy training file per experiment: training graph + noise graph.
# NOTE(review): the first experiment is skipped - presumably the noise-free
# baseline; confirm against get_experimets.
for experiment in experiments[1:]:

    # Use loop-local names so the notebook-level `dataset_name` is not
    # silently rebound by this cell.
    experiment_dataset_name = experiment['dataset_name']
    file_name = experiment['file_name']
    print(file_name)

    g_train = rdflib.Graph()
    g_train.parse(f'datasets/{experiment_dataset_name}_train.owl', format='turtle')
    print(f'# G_train: {len(g_train)}')

    g_noise = rdflib.Graph()
    g_noise.parse(f'datasets/noise/{file_name}.owl', format='xml')
    print(f'# G_noise: {len(g_noise)}')

    # Merge the noise triples into the training graph and write the result.
    # No format is given, so rdflib's default serialization format is used.
    g_train += g_noise
    g_train.serialize(destination=f'datasets/{file_name}_train.owl')
    print(f'# G_train + G_noise: {len(g_train)}')

    print()