In [66]:
import pandas as pd
import rdflib
from rdflib import URIRef, RDF
from owlready2 import get_ontology
from tqdm import tqdm
from glob import glob
import logging
import os
from sklearn.model_selection import train_test_split
from src.utils import *


In [67]:
# Name of the dataset; used below to build the ontology, TBox and output file paths.
dataset_name = "OWL2DL-1"

**Filter unnecessary triples from the input graphs (g) and the inferred graphs (i)**

In [68]:
# Load the dataset ontology and collect its individuals, both as owlready2
# objects and as rdflib URIRefs (the latter are what filter_triples matches on).
ontology_path = f'datasets/{dataset_name}.owl'
ontology = get_ontology(ontology_path).load()
subject_resources = [*ontology.individuals()]
named_individuals = [URIRef(individual.iri) for individual in subject_resources]
print(f'# Subject-Resources: {len(subject_resources)}')

# Subject-Resources: 3668


In [69]:
def filter_triples(inferred_path, filtered_inferred_path):
    """Write the triples of a Turtle file that mention a named individual to a new file.

    The graph at ``inferred_path`` is parsed as Turtle. A triple is kept when
    its subject or its object is one of the notebook-level ``named_individuals``;
    the kept triples are serialized as Turtle to ``filtered_inferred_path``.
    Despite the parameter names, the notebook calls this for both the input
    and the inferred graphs.

    Args:
        inferred_path: Path of the Turtle file to read.
        filtered_inferred_path: Path of the Turtle file to write. Its parent
            folder is created if it does not exist yet.
    """
    # `named_individuals` is a list, so `x in named_individuals` is a linear
    # scan for every triple; a set gives the same answer with hashed lookups.
    individuals = set(named_individuals)

    g_inferred = rdflib.Graph()
    g_inferred.parse(inferred_path, format="ttl")

    g_inferred_filtered = rdflib.Graph()
    for subj, pred, obj in g_inferred:
        if subj in individuals or obj in individuals:
            g_inferred_filtered.add((subj, pred, obj))

    # Make sure the destination folder exists before writing into it.
    target_dir = os.path.dirname(filtered_inferred_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    g_inferred_filtered.serialize(filtered_inferred_path, format="ttl")

In [None]:
# Filter every raw input graph into MyJenaProject/input_filtered/.
# The target path is built from the file name alone: str.replace('input', ...)
# would also rewrite any "input" that occurs inside a file name.
for input_graph_path in tqdm(sorted(glob('MyJenaProject/input/' + "*"))):
    filter_triples(input_graph_path,
                   os.path.join('MyJenaProject/input_filtered', os.path.basename(input_graph_path)))

In [None]:
# Filter every inferred graph into MyJenaProject/output_filtered/.
# The target path is built from the file name alone: str.replace('output', ...)
# would also rewrite any "output" that occurs inside a file name.
for input_graph_path in tqdm(sorted(glob('MyJenaProject/output/' + "*"))):
    filter_triples(input_graph_path,
                   os.path.join('MyJenaProject/output_filtered', os.path.basename(input_graph_path)))

**Create train, test, val sets**

In [70]:
def get_graph_type(s):
    """Return the graph type encoded in a graph file path.

    The graph type is the file name with everything from its last underscore
    onwards removed, e.g. ``'dir/A_B_12.ttl'`` gives ``'A_B'``. A file name
    without an underscore yields an empty string.

    Args:
        s: Path (or bare file name) of a graph file.

    Returns:
        The part of the file name that precedes its last ``'_'``.
    """
    # os.path.basename matches how get_files_df extracts file names and also
    # handles the platform's native separator, unlike a hard-coded '/' split.
    file_name = os.path.basename(s)
    graph_type, _, _ = file_name.rpartition('_')
    return graph_type

In [71]:
def get_files_df(INPUT_GRAPHS_FOLDER, INFERENCE_GRAPHS_FOLDER):
    """Build a DataFrame pairing each input graph with its inference graph.

    Every path matched by ``INPUT_GRAPHS_FOLDER + "*"`` (in sorted order)
    becomes one row. The inference path is ``INFERENCE_GRAPHS_FOLDER`` followed
    by the input file name with ``'.ttl'`` replaced by ``'_inferred.ttl'``;
    the folders are concatenated as plain strings. The inference file is not
    checked for existence.

    Args:
        INPUT_GRAPHS_FOLDER: Folder prefix of the input graphs.
        INFERENCE_GRAPHS_FOLDER: Folder prefix of the inference graphs.

    Returns:
        A DataFrame with the columns ``input_graph_file``, ``inference_file``
        and ``graph_type``.
    """
    logging.info(f"Creating dataframe for {dataset_name} input/inference pairs")
    input_paths = sorted(glob(INPUT_GRAPHS_FOLDER + "*"))
    records = [
        {
            "input_graph_file": path,
            "inference_file": INFERENCE_GRAPHS_FOLDER
            + os.path.basename(path).replace('.ttl', '_inferred.ttl'),
            "graph_type": get_graph_type(path),
        }
        for path in tqdm(input_paths)
    ]
    return pd.DataFrame(records)

In [72]:
# Pair every filtered input graph with its filtered inference graph.
files_df = get_files_df(
    INPUT_GRAPHS_FOLDER='MyJenaProject/input_filtered/',
    INFERENCE_GRAPHS_FOLDER='MyJenaProject/output_filtered/',
)

100%|██████████| 3662/3662 [00:00<00:00, 197289.01it/s]


In [73]:
# Remove classes for which only one instance exists
# (the count column is named explicitly so this also works on pandas < 2.0,
# where the value_counts() result is named after the source column, not 'count').
df_count = files_df['graph_type'].value_counts().to_frame(name='count')
graph_type_2_keep = df_count[df_count['count'] > 1].index
files_df = files_df[files_df['graph_type'].isin(graph_type_2_keep)]

In [74]:
def train_validate_test_split(df, train_percent=0.6, validate_percent=0.2, stratify=None, seed=1):
    """Split a DataFrame into train, validation and test parts.

    The split is done in two steps with ``train_test_split``: first the train
    part is separated from a combined validation/test hold-out, then the
    hold-out is divided into validation and test. The test fraction is
    whatever remains after ``train_percent`` and ``validate_percent``.

    Args:
        df: DataFrame to split.
        train_percent: Fraction of rows for the train part.
        validate_percent: Fraction of rows for the validation part.
        stratify: Column name to stratify both splits on; a falsy value
            disables stratification.
        seed: ``random_state`` passed to both splits.

    Returns:
        The tuple ``(df_train, df_val, df_test)``.
    """
    holdout_fraction = 1 - train_percent
    test_fraction = (1 - (train_percent + validate_percent))
    # Share of the hold-out that goes to the test part in the second split.
    test_within_holdout = test_fraction / (test_fraction + validate_percent)

    def _labels(frame):
        # Stratification labels for `frame`, or None when stratification is off.
        return frame[stratify] if stratify else None

    df_train, df_holdout = train_test_split(
        df, test_size=holdout_fraction, random_state=seed, stratify=_labels(df)
    )
    df_val, df_test = train_test_split(
        df_holdout, test_size=test_within_holdout, random_state=seed, stratify=_labels(df_holdout)
    )
    return df_train, df_val, df_test

In [75]:
# 60 % train / 20 % validation / remaining 20 % test, stratified by graph type.
rdf_data_train, rdf_data_val, rdf_data_test = train_validate_test_split(
    files_df,
    train_percent=0.6,
    validate_percent=0.2,
    stratify="graph_type",
    seed=1,
)

In [76]:
def _rewrite_iri(term, old_iri, new_iri):
    """Return ``term`` with ``old_iri`` replaced by ``new_iri`` if it is a URIRef, else unchanged."""
    if isinstance(term, URIRef):
        return URIRef(str(term).replace(old_iri, new_iri))
    return term


def merge_nt_files_with_rdflib(data, output_file, old_iri, new_iri):
    """Merge input graphs, inference graphs and the TBox into one Turtle file.

    The files listed in the ``input_graph_file`` and ``inference_file`` columns
    of ``data`` are parsed as Turtle (despite the "nt" in the function name);
    the TBox ``datasets/{dataset_name}_TBOX.owl`` is parsed with rdflib's
    format detection. A file that fails to load is reported with a warning
    and skipped. When both ``old_iri`` and ``new_iri`` are given, every
    occurrence of ``old_iri`` inside a URIRef (subject, predicate or object,
    TBox included) is replaced by ``new_iri``; literals and blank nodes are
    left untouched.

    Args:
        data: DataFrame with the file-path columns; a missing column is
            treated as empty.
        output_file: Destination path of the merged graph.
        old_iri: IRI fragment to replace, or a falsy value to skip rewriting.
        new_iri: Replacement IRI fragment, or a falsy value to skip rewriting.
    """
    merged_graph = rdflib.Graph()

    nt_files_orig = data.get('input_graph_file', pd.Series(dtype=str))
    nt_files_inferred = data.get('inference_file', pd.Series(dtype=str))
    abox_files = pd.concat([nt_files_orig, nt_files_inferred], ignore_index=True)

    # Track the parse format per source instead of testing for the substring
    # 'TBOX' in each path, which would misclassify any graph file whose path
    # happens to contain it. None lets rdflib detect the TBox format itself.
    sources = [(nt_file, "turtle") for nt_file in abox_files]
    sources.append((f'datasets/{dataset_name}_TBOX.owl', None))

    rewrite_iris = bool(old_iri and new_iri)
    for nt_file, parse_format in sources:
        try:
            graph = rdflib.Graph()
            graph.parse(nt_file, format=parse_format)
            if rewrite_iris:
                for s, p, o in graph:
                    merged_graph.add((
                        _rewrite_iri(s, old_iri, new_iri),
                        _rewrite_iri(p, old_iri, new_iri),
                        _rewrite_iri(o, old_iri, new_iri),
                    ))
            else:
                merged_graph += graph
        except Exception as e:
            # Best effort: report the file and continue with the remaining ones.
            print(f"Warning: Could not parse {nt_file} - {e}")

    # The merged file is read back with format='turtle' later in the notebook,
    # so the output format is pinned rather than left to the rdflib default.
    merged_graph.serialize(destination=output_file, format="turtle")
    print(f"Merged file created at {output_file}")

In [77]:
# Write one merged graph per split; each split gets its own IRI suffix.
base_iri = 'https://kracr.iiitd.edu.in/OWL2Bench'
for split_name, split_data in (('train', rdf_data_train),
                               ('val', rdf_data_val),
                               ('test', rdf_data_test)):
    merge_nt_files_with_rdflib(split_data,
                               f'datasets/bin/{dataset_name}_{split_name}.owl',
                               base_iri,
                               f'{base_iri}_{split_name}')

Merged file created at datasets/bin/OWL2DL-1_train.owl
Merged file created at datasets/bin/OWL2DL-1_val.owl
Merged file created at datasets/bin/OWL2DL-1_test.owl


In [78]:
# # The validation and test sets must not contain individuals or classes that are absent from the training set
# train_graph = rdflib.Graph()
# train_graph.parse(f'datasets/bin/{dataset_name}_train.owl', format='turtle')

# valid_graph = rdflib.Graph()
# valid_graph.parse(f'datasets/bin/{dataset_name}_val.owl', format='turtle')

# test_graph = rdflib.Graph()
# test_graph.parse(f'datasets/bin/{dataset_name}_test.owl', format='turtle')

# train_individuals = set(train_graph.subjects())
# train_classes = set(train_graph.objects(predicate=URIRef("http://www.w3.org/2002/07/owl#type")))

# def filter_graph(original_graph, train_individuals, train_classes):
#     filtered_graph = rdflib.Graph()
#     for s, p, o in original_graph:
#         if s in train_classes or o in train_classes:
#             filtered_graph.add((s, p, o))
#         if s in train_individuals or o in train_individuals:
#             filtered_graph.add((s, p, o))
#         if p == URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf'):
#             filtered_graph.add((s, p, o))
#     return filtered_graph

# valid_filtered_graph = filter_graph(valid_graph, train_individuals, train_classes)
# valid_filtered_graph.serialize(f'datasets/bin/{dataset_name}_val.owl', format='turtle')
# print(f'Valid graph before filtering: {len(valid_graph)}')
# print(f'Valid graph after filtering: {len(valid_filtered_graph)}')

# test_filtered_graph = filter_graph(test_graph, train_individuals, train_classes)
# test_filtered_graph.serialize(f'datasets/bin/{dataset_name}_test.owl', format='turtle')
# print(f'Test graph before filtering: {len(test_graph)}')
# print(f'Test graph after filtering: {len(test_filtered_graph)}')

**Add noise to training set**

In [79]:
# `get_experimets` is not defined in this notebook; presumably it is provided by
# the `from src.utils import *` star import - TODO confirm.
# The next cell slices the result and reads the keys 'dataset_name',
# 'file_name', 'format_' and 'add_noise' from each entry.
experiments = get_experimets(dataset_name)

In [80]:
# For each experiment, merge its noise graph into the clean training graph and
# write the result to datasets/bin/{file_name}_train.owl.
# experiments[1:] skips the first entry - presumably the noise-free baseline,
# TODO confirm against get_experimets.
for experiment in experiments[1:]:
    # The experiment's dataset name is read inline rather than rebinding the
    # notebook-level `dataset_name`, which get_files_df and
    # merge_nt_files_with_rdflib read when earlier cells are re-run.
    file_name = experiment['file_name']
    format_ = experiment['format_']      # not used in this cell
    add_noise = experiment['add_noise']  # not used in this cell

    # Reload the clean training graph for every experiment so that noise from
    # a previous iteration never leaks into the next one.
    g_train = rdflib.Graph()
    g_train.parse(f"datasets/bin/{experiment['dataset_name']}_train.owl", format='turtle')
    print(f'# G_train: {len(g_train)}')

    g_noise = rdflib.Graph()
    g_noise.parse(f'datasets/noise/{file_name}.owl')
    print(f'# G_noise: {len(g_noise)}')

    # Graph union: triples present in both graphs are stored only once, so the
    # merged size can be smaller than the sum of the two sizes.
    g_train += g_noise
    # The clean train file is read as Turtle above; write the noisy one in the
    # same format explicitly instead of relying on the rdflib default.
    g_train.serialize(destination=f'datasets/bin/{file_name}_train.owl', format='turtle')
    print(f'# G_train + G_noise: {len(g_train)}')

    print()

# G_train: 50230
# G_noise: 13801
# G_train + G_noise: 64031

# G_train: 50230
# G_noise: 27604
# G_train + G_noise: 77834

# G_train: 50230
# G_noise: 41405
# G_train + G_noise: 91635

# G_train: 50230
# G_noise: 55242
# G_train + G_noise: 105472

# G_train: 50230
# G_noise: 13683
# G_train + G_noise: 63913

# G_train: 50230
# G_noise: 27362
# G_train + G_noise: 77592

# G_train: 50230
# G_noise: 41013
# G_train + G_noise: 91243

# G_train: 50230
# G_noise: 54711
# G_train + G_noise: 104941

# G_train: 50230
# G_noise: 13873
# G_train + G_noise: 62853

# G_train: 50230
# G_noise: 27673
# G_train + G_noise: 76653

# G_train: 50230
# G_noise: 41477
# G_train + G_noise: 90457

# G_train: 50230
# G_noise: 55277
# G_train + G_noise: 104257

