In [4]:
import pandas as pd
import rdflib
from owlready2 import get_ontology
from tqdm import tqdm
from glob import glob
import logging
import os
import re
from sklearn.model_selection import train_test_split
import xml.etree.ElementTree as ET

In [39]:
def get_graph_type(s):
    """Derive the graph-type label from a graph file path.

    The label is the final ``/``-separated path component with its last
    ``_``-separated token removed. A component that contains no underscore
    yields an empty string.

    Args:
        s: Path (or bare file name) of a graph file, using ``/`` separators.

    Returns:
        The graph-type string, possibly empty.
    """
    # rpartition returns ('', '', s) when the separator is absent, so index 2
    # falls back to the whole string for the path split ...
    file_name = s.rpartition('/')[2]
    # ... and index 0 falls back to '' when there is no underscore to cut at.
    return file_name.rpartition('_')[0]

In [40]:
def get_files_df(INPUT_GRAPHS_FOLDER, INFERENCE_GRAPHS_FOLDER):
    """Build a dataframe pairing every input graph with its inference file.

    Every entry found directly inside ``INPUT_GRAPHS_FOLDER`` is paired with
    a path inside ``INFERENCE_GRAPHS_FOLDER`` whose name is the input file
    name with ``.owl`` replaced by ``_inferred.owl``. The inference path is
    only computed; it is not checked for existence here.

    Args:
        INPUT_GRAPHS_FOLDER: Folder containing the input graphs. A trailing
            path separator is optional.
        INFERENCE_GRAPHS_FOLDER: Folder containing the inferred graphs. A
            trailing path separator is optional.

    Returns:
        A ``pandas.DataFrame`` with the columns ``input_graph_file``,
        ``inference_file`` and ``graph_type``, one row per input graph,
        ordered by input path. The columns are present even when the input
        folder is empty.
    """
    logging.info("Creating dataframe for OWL2DL-1 input/inference pairs")
    columns = ["input_graph_file", "inference_file", "graph_type"]
    rdf_files = []
    # os.path.join keeps the pattern correct whether or not the caller passed
    # a trailing separator; plain concatenation would glob "folder*" instead.
    input_pattern = os.path.join(INPUT_GRAPHS_FOLDER, "*")
    for input_graph_path in tqdm(sorted(glob(input_pattern))):
        input_graph_file = os.path.basename(input_graph_path)
        inference_path = os.path.join(
            INFERENCE_GRAPHS_FOLDER,
            input_graph_file.replace('.owl', '_inferred.owl'),
        )
        graph_type = get_graph_type(input_graph_path)
        rdf_pair = {"input_graph_file": input_graph_path, "inference_file": inference_path, "graph_type": graph_type}
        rdf_files.append(rdf_pair)
    # Explicit columns so downstream column lookups still work on an empty result.
    files_df = pd.DataFrame(rdf_files, columns=columns)
    return files_df

In [41]:
# Pair every input graph with the inference file produced for it.
files_df = get_files_df(
    INPUT_GRAPHS_FOLDER='MyJenaProject/input/',
    INFERENCE_GRAPHS_FOLDER='MyJenaProject/output/',
)

100%|██████████| 3662/3662 [00:00<00:00, 129995.95it/s]


In [42]:
# Remove classes for which only one instance exists
# (presumably because the stratified split further down cannot place a
# singleton class on both sides of a split -- TODO confirm).
#
# Filter on the value_counts() Series itself: since pandas 2.0 that Series is
# named "count" rather than after the source column, so wrapping it in a
# DataFrame and indexing it by 'graph_type' raises KeyError.
graph_type_counts = files_df['graph_type'].value_counts()
# Kept as a one-column frame with an explicit column name, so it has the same
# layout on every pandas version.
df_count = graph_type_counts.to_frame(name='graph_type')
graph_type_2_keep = graph_type_counts[graph_type_counts > 1].index
files_df = files_df[files_df['graph_type'].isin(graph_type_2_keep)]

In [43]:
def train_validate_test_split(df, train_percent=0.6, validate_percent=0.2, stratify=None, seed=1):
    """Split ``df`` into train, validation and test frames.

    Two chained ``train_test_split`` calls are used: the first holds out
    ``1 - train_percent`` of ``df``; the second divides that hold-out between
    validation and test so that test receives the share left over after
    ``train_percent`` and ``validate_percent``.

    Args:
        df: DataFrame to split.
        train_percent: Fraction of ``df`` assigned to the training frame.
        validate_percent: Fraction of ``df`` assigned to the validation frame.
        stratify: Optional column name; when truthy, both splits are
            stratified on that column.
        seed: ``random_state`` passed to both splits.

    Returns:
        A ``(df_train, df_val, df_test)`` tuple.
    """
    holdout_fraction = 1 - train_percent
    leftover_fraction = (1 - (train_percent + validate_percent))
    # Express the test share relative to the hold-out, not to the full frame.
    test_share_of_holdout = leftover_fraction / (leftover_fraction + validate_percent)

    # stratify=None is train_test_split's default, so one call covers both modes.
    labels = df[stratify] if stratify else None
    df_train, df_holdout = train_test_split(
        df, test_size=holdout_fraction, random_state=seed, stratify=labels
    )

    holdout_labels = df_holdout[stratify] if stratify else None
    df_val, df_test = train_test_split(
        df_holdout, test_size=test_share_of_holdout, random_state=seed, stratify=holdout_labels
    )
    return df_train, df_val, df_test

In [44]:
# 60/20/20 train/validation/test split, stratified on graph type, fixed seed.
rdf_data_train, rdf_data_val, rdf_data_test = train_validate_test_split(
    files_df,
    train_percent=0.6,
    validate_percent=0.2,
    stratify="graph_type",
    seed=1,
)

In [45]:
def _parse_with_namespaces(path):
    """Parse an XML file and return ``(root, declared_namespaces)``.

    ``declared_namespaces`` is a list of ``(prefix, uri)`` pairs. ElementTree
    consumes ``xmlns`` declarations while parsing and never exposes them via
    ``Element.attrib``, so they are captured from ``start-ns`` parser events.
    """
    events = ET.iterparse(path, events=("start-ns",))
    declared = [namespace for _event, namespace in events]
    # ``root`` is only populated once the iterator has been exhausted.
    return events.root, declared


def merge_owl_files(data, output_file):
    """Merge the input and inferred XML graphs listed in ``data`` into one file.

    All files in ``data['input_graph_file']`` followed by all files in
    ``data['inference_file']`` are parsed, and the children of each file's
    root element are appended, in order, under one new root element. The new
    root reuses the tag of the first file's root. Attributes of the source
    root elements are not copied.

    Namespace prefixes declared in the inputs are registered with ElementTree
    so the output keeps them (e.g. ``owl:``) instead of auto-generated
    ``ns0:``-style prefixes. This registration is process-wide.

    Args:
        data: Mapping/DataFrame with ``input_graph_file`` and
            ``inference_file`` columns holding file paths.
        output_file: Destination passed to ``ElementTree.write``.

    Raises:
        ValueError: If ``data`` lists no files at all.
    """
    owl_files = pd.concat([data['input_graph_file'], data['inference_file']])

    rdf_root = None
    for file in owl_files:
        root, declared = _parse_with_namespaces(file)

        for prefix, uri in declared:
            if not prefix:
                # Default namespaces are left to ElementTree's generated
                # prefixes: registering '' would also strip the prefix from
                # attributes in that namespace, changing their meaning.
                continue
            try:
                ET.register_namespace(prefix, uri)
            except ValueError:
                # register_namespace rejects reserved "ns<digits>" prefixes;
                # the serializer generates such prefixes on its own.
                pass

        if rdf_root is None:
            # First file: create the merged root with the same tag.
            rdf_root = ET.Element(root.tag)

        # Move this file's top-level elements under the merged root.
        rdf_root.extend(list(root))

    if rdf_root is None:
        # Fail before touching output_file rather than leaving it truncated.
        raise ValueError("merge_owl_files: no input files to merge")

    ET.ElementTree(rdf_root).write(output_file)

In [46]:
# Write one merged OWL file per split (train, validation, test).
for split_frame, merged_path in (
    (rdf_data_train, 'datasets/bin/OWL2DL-1_train.owl'),
    (rdf_data_val, 'datasets/bin/OWL2DL-1_val.owl'),
    (rdf_data_test, 'datasets/bin/OWL2DL-1_test.owl'),
):
    merge_owl_files(split_frame, merged_path)

In [2]:
def change_iri(file_path, new_iri):
    """Rewrite the base IRI of the ontology stored at ``file_path``.

    The ontology is loaded with owlready2's ``get_ontology``, its
    ``base_iri`` is set to ``new_iri``, and it is saved back to the same
    path, overwriting the original file.

    Args:
        file_path: Location of the ontology file; also the save target.
        new_iri: Value assigned to the ontology's ``base_iri``.
    """
    ontology = get_ontology(file_path).load()
    ontology.base_iri = new_iri
    # Saved to the path it was loaded from, so the file is replaced in place.
    ontology.save(file_path)

In [5]:
# Give the train, validation, and test datasets their own base IRIs.
for merged_owl_path, split_iri in (
    ('datasets/bin/OWL2DL-1_train.owl', 'http://new-iri/train'),
    ('datasets/bin/OWL2DL-1_val.owl', 'http://new-iri/val'),
    ('datasets/bin/OWL2DL-1_test.owl', 'http://new-iri/test'),
):
    change_iri(merged_owl_path, split_iri)

: 