In [1]:
import logging
import os
from glob import glob

import pandas as pd
import rdflib
from owlready2 import get_ontology
from rdflib import URIRef, OWL, Literal, RDF, RDFS, BNode
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from src.utils import *
from src.noise import *





In [2]:
# Base name of the dataset; it is interpolated into the ontology, TBox and
# per-split file names under `datasets/` in the cells below.
dataset_name = 'OWL2DL-1'

In [3]:
# Load the dataset ontology with owlready2; it is queried below for its object properties.
ontology = get_ontology(f'datasets/{dataset_name}.owl').load()

**Filter unnecessary triples from inferred graphs**

In [None]:
# IRIs of every object property declared in the ontology, wrapped as rdflib
# URIRefs so they can be compared against triple predicates.
object_properties = [URIRef(prop.iri) for prop in ontology.object_properties()]

In [None]:
def filter_inferred_triples(input_path, inferred_path, filtered_inferred_path):
    """Write the subset of an inferred graph that is kept for the dataset.

    A triple from the graph at ``inferred_path`` is kept when all of the
    following hold:

    * its predicate is one of the module-level ``object_properties`` or is
      ``rdfs:subClassOf`` / ``rdf:type``;
    * its object is not a ``Literal``;
    * its object is not ``owl:Thing``;
    * it is not already present in the graph at ``input_path``.

    Args:
        input_path: Path of the input graph (format guessed by rdflib).
        inferred_path: Path of the inferred graph (format guessed by rdflib).
        filtered_inferred_path: Destination of the filtered graph, written
            as Turtle.
    """
    g = rdflib.Graph()
    g.parse(input_path)
    g_inferred = rdflib.Graph()
    g_inferred.parse(inferred_path)

    # Build the allowed-predicate lookup once per call: hashing each predicate
    # is much cheaper than scanning the `object_properties` list per triple.
    kept_predicates = set(object_properties) | {RDFS.subClassOf, RDF.type}

    g_inferred_filtered = rdflib.Graph()
    for triple in g_inferred:
        _, predicate, obj = triple
        if (predicate in kept_predicates
                and not isinstance(obj, Literal)
                and obj != OWL.Thing
                and triple not in g):
            g_inferred_filtered.add(triple)
    g_inferred_filtered.serialize(filtered_inferred_path, format="turtle")

In [None]:
# Make sure the destination folder exists before any graph is written to it.
os.makedirs('datasets/inferred_graphs_filtered/', exist_ok=True)

for input_graph_path in tqdm(sorted(glob('datasets/inferred_graphs/' + "*"))):
    # Take the file stem with os.path so the directory is dropped whichever
    # separator glob returns; stripping a hard-coded backslash prefix only
    # worked on Windows.
    file = os.path.basename(input_graph_path).replace('.xml', '')
    filter_inferred_triples('datasets/input_graphs/' + file + '.ttl',
                            input_graph_path,
                            'datasets/inferred_graphs_filtered/' + file + '.ttl')

**Create train, test, val sets**

In [None]:
def get_graph_type(s):
    """Return the graph type encoded in a graph file path.

    The type is the file name with its last ``_``-separated component
    removed, e.g. ``'dir/Person_Student_s1.ttl'`` -> ``'Person_Student'``.
    A file name without any underscore yields ``''``.

    Both ``/`` and ``\\`` are treated as directory separators, so the result
    does not depend on the operating system that produced the path.

    Args:
        s: Path (or bare file name) of a graph file.

    Returns:
        The graph-type prefix of the file name.
    """
    # Normalise separators first: splitting on '\\' alone left the whole
    # directory part (which itself contains underscores) in the name on POSIX.
    file_name = s.replace('\\', '/').rsplit('/', 1)[-1]
    # Everything before the last underscore; '' when there is no underscore.
    return file_name.rpartition('_')[0]

In [None]:
def get_files_df(INPUT_GRAPHS_FOLDER, INFERENCE_GRAPHS_FOLDER):
    """Build a dataframe pairing each input graph with its inference file.

    Every entry matched by ``INPUT_GRAPHS_FOLDER + "*"`` (in sorted order)
    becomes one row with the columns:

    * ``input_graph_file`` - the matched path;
    * ``inference_file`` - ``INFERENCE_GRAPHS_FOLDER`` followed by the same
      file name;
    * ``graph_type`` - the result of ``get_graph_type`` on the matched path.

    Args:
        INPUT_GRAPHS_FOLDER: Folder prefix of the input graphs.
        INFERENCE_GRAPHS_FOLDER: Folder prefix of the inference graphs.

    Returns:
        A ``pandas.DataFrame`` with one row per input graph.
    """
    logging.info(f"Creating dataframe for {dataset_name} input/inference pairs")
    input_paths = sorted(glob(INPUT_GRAPHS_FOLDER + "*"))
    records = [
        {
            "input_graph_file": path,
            "inference_file": INFERENCE_GRAPHS_FOLDER + os.path.basename(path),
            "graph_type": get_graph_type(path),
        }
        for path in tqdm(input_paths)
    ]
    return pd.DataFrame(records)

In [None]:
# One row per 1-hop-filtered input graph, paired with the filtered inferred
# graph of the same file name.
files_df = get_files_df('datasets/input_graphs_filtered_1hop/', 'datasets/inferred_graphs_filtered/')

In [None]:
# Remove graph types for which only one instance exists.
# The count column is named explicitly: `value_counts()` labels its result
# `graph_type` on pandas < 2.0 but `count` on pandas >= 2.0, so indexing the
# frame by 'graph_type' raised KeyError on current pandas.
# NOTE(review): a type with very few files may still end up with a single
# member in the val+test partition, which a stratified split rejects -
# confirm that a threshold of > 1 is sufficient for this dataset.
df_count = files_df['graph_type'].value_counts().rename('count').to_frame()
graph_type_2_keep = df_count[df_count['count'] > 1].index
files_df = files_df[files_df['graph_type'].isin(graph_type_2_keep)]

In [None]:
def train_validate_test_split(df, train_percent=0.6, validate_percent=0.2, stratify=None, seed=1):
    """Split ``df`` into train, validation and test frames in two passes.

    The first pass holds out ``1 - train_percent`` of the rows; the second
    pass divides that hold-out into validation and test so that the test
    part gets the share left over after ``train_percent`` and
    ``validate_percent``.

    Args:
        df: Dataframe to split.
        train_percent: Fraction of rows assigned to the training frame.
        validate_percent: Fraction of rows assigned to the validation frame.
        stratify: Optional column selector; when truthy, ``df[stratify]`` is
            used as the stratification labels in both passes.
        seed: ``random_state`` passed to both ``train_test_split`` calls.

    Returns:
        A tuple ``(df_train, df_val, df_test)``.
    """
    holdout_share = 1 - train_percent
    remainder = (1 - (train_percent + validate_percent))
    test_share_of_holdout = remainder / (remainder + validate_percent)

    def _labels(frame):
        # None is train_test_split's own default, i.e. no stratification.
        return frame[stratify] if stratify else None

    df_train, df_holdout = train_test_split(
        df, test_size=holdout_share, random_state=seed, stratify=_labels(df))
    df_val, df_test = train_test_split(
        df_holdout, test_size=test_share_of_holdout, random_state=seed, stratify=_labels(df_holdout))
    return df_train, df_val, df_test

In [None]:
# 60/20/20 split of the file pairs, stratified by graph type.
rdf_data_train, rdf_data_val, rdf_data_test = train_validate_test_split(
    files_df,
    train_percent=0.6,
    validate_percent=0.2,
    stratify="graph_type",
    seed=1,
)

In [None]:
def merge_nt_files(data, output_file):
    """Merge every graph file listed in ``data`` into one Turtle file.

    The paths are taken from the ``input_graph_file`` and ``inference_file``
    columns of ``data`` (a missing column contributes nothing). Paths
    containing ``'TBOX'`` are parsed with rdflib's format guessing, all
    others as Turtle. A file that fails to parse is reported with a warning
    and skipped, so the merge is best-effort.

    Args:
        data: Dataframe holding the path columns described above.
        output_file: Destination of the merged graph.
    """
    merged_graph = rdflib.Graph()

    nt_files_orig = data.get('input_graph_file', pd.Series(dtype=str))
    nt_files_inferred = data.get('inference_file', pd.Series(dtype=str))

    nt_files = pd.concat([nt_files_orig, nt_files_inferred], ignore_index=True)

    for nt_file in nt_files:
        try:
            # Parse into a scratch graph so a file that fails half-way does
            # not leave partial triples in the merged result.
            graph = rdflib.Graph()
            if 'TBOX' in nt_file:
                graph.parse(nt_file)
            else:
                graph.parse(nt_file, format="turtle")
            merged_graph += graph
        except Exception as e:
            print(f"Warning: Could not parse {nt_file} - {e}")

    # The merged files are read back with format='turtle' further down, so
    # pin the output format instead of relying on rdflib's default, which
    # depends on the installed rdflib version.
    merged_graph.serialize(destination=output_file, format="turtle")
    print(f"Merged file created at {output_file}")

In [None]:
# Write one merged graph file per split.
for split_frame, merged_path in (
    (rdf_data_train, f'datasets/{dataset_name}_train_complete.owl'),
    (rdf_data_val, f'datasets/{dataset_name}_val_complete.owl'),
    (rdf_data_test, f'datasets/{dataset_name}_test_complete.owl'),
):
    merge_nt_files(split_frame, merged_path)

**Manage duplicates and drop BNodes in train, test and val sets**

In [None]:
# Add a `resource` column to each split: the last underscore-separated
# component of the input file name, without the `.ttl` extension.
for split_frame in (rdf_data_train, rdf_data_test, rdf_data_val):
    split_frame['resource'] = split_frame['input_graph_file'].str.extract(r'_([^_]+)\.ttl$')

In [None]:
# Create the four graphs up front, then populate each one and report its size.
G_train, G_test, G_val, G_tbox = (rdflib.Graph() for _ in range(4))

# Merged per-split graphs, read as Turtle.
G_train.parse(f'datasets/{dataset_name}_train_complete.owl', format='turtle')
print(f'# triples in G_train: {len(G_train)}')

G_test.parse(f'datasets/{dataset_name}_test_complete.owl', format='turtle')
print(f'# triples in G_test: {len(G_test)}')

G_val.parse(f'datasets/{dataset_name}_val_complete.owl', format='turtle')
print(f'# triples in G_val: {len(G_val)}')

# TBox: format left to rdflib's guessing.
G_tbox.parse(f'datasets/{dataset_name}_TBOX.owl')
print(f'# triples in TBox: {len(G_tbox)}')

In [None]:
# Triples that occur in exactly one of the three split graphs.
G_train_unique = set(G_train).difference(G_test, G_val)
G_test_unique = set(G_test).difference(G_train, G_val)
G_val_unique = set(G_val).difference(G_train, G_test)

In [None]:
def add_duplicates_to_set(G1, G2):
    """Assign the triples shared by two graphs to a single split each.

    For every triple in ``G1 & G2`` the subject, with the OWL2Bench
    namespace prefix removed, is looked up in the ``resource`` column of the
    module-level ``rdf_data_train``, ``rdf_data_test`` and ``rdf_data_val``
    frames:

    * found in exactly one split -> the triple goes to that split;
    * found in no split -> the triple goes to ``test``;
    * found in more than one split -> the triple is returned in none of the
      sets. NOTE(review): this mirrors the previous behaviour (the joined
      label, e.g. ``'train, test'``, matched no split); confirm that
      dropping such triples is intended.

    The work is done on plain Python sets rather than a DataFrame, so the
    rdflib terms are returned untouched and an empty selection always
    yields an empty set.

    Args:
        G1: First graph.
        G2: Second graph.

    Returns:
        A tuple ``(set_train, set_test, set_val)`` of ``(s, p, o)`` triples.
    """
    namespace = 'https://kracr.iiitd.edu.in/OWL2Bench#'

    # Hash-based lookups instead of scanning each `resource` array per triple.
    split_resources = [
        ('train', set(rdf_data_train['resource'].values)),
        ('test', set(rdf_data_test['resource'].values)),
        ('val', set(rdf_data_val['resource'].values)),
    ]
    assigned = {'train': set(), 'test': set(), 'val': set()}

    for subject, predicate, obj in G1 & G2:
        subject_name = subject.replace(namespace, '')
        source = ', '.join(name for name, resources in split_resources
                           if subject_name in resources)
        if source == '':
            # Subjects that belong to no split default to the test set.
            source = 'test'
        if source in assigned:
            assigned[source].add((subject, predicate, obj))

    return assigned['train'], assigned['test'], assigned['val']

In [None]:
# Distribute the triples shared by each pair of split graphs back onto the
# per-split sets.
for graph_a, graph_b in ((G_train, G_test), (G_train, G_val), (G_test, G_val)):
    set_train, set_test, set_val = add_duplicates_to_set(graph_a, graph_b)
    G_train_unique.update(set_train)
    G_test_unique.update(set_test)
    G_val_unique.update(set_val)

In [None]:
def remove_bnodes(graph):
    """Return a new rdflib graph holding the triples of ``graph`` without blank nodes.

    Args:
        graph: Iterable of ``(s, p, o)`` triples.

    Returns:
        A fresh ``rdflib.Graph`` containing every triple in which neither the
        subject, the predicate nor the object is a ``BNode``.
    """
    kept = rdflib.Graph()
    for s, p, o in graph:
        if not any(isinstance(term, BNode) for term in (s, p, o)):
            kept.add((s, p, o))
    return kept

In [None]:
# Turn each per-split triple set into a graph, dropping triples with blank nodes.
filtered_G_train, filtered_G_test, filtered_G_val = (
    remove_bnodes(triples) for triples in (G_train_unique, G_test_unique, G_val_unique)
)

In [None]:
# Add the TBox triples to every split graph (in-place graph union).
for split_graph in (filtered_G_train, filtered_G_test, filtered_G_val):
    split_graph += G_tbox

In [None]:
# Write the final train / test / val graphs.
for split_graph, output_path in (
    (filtered_G_train, f'datasets/{dataset_name}_train.owl'),
    (filtered_G_test, f'datasets/{dataset_name}_test.owl'),
    (filtered_G_val, f'datasets/{dataset_name}_val.owl'),
):
    split_graph.serialize(output_path)