In [None]:
import pandas as pd
from rdflib import Graph, RDF
from src.gnn import get_namespace

In [None]:
def get_statistics(dataset_name: str, data_dir: str = "datasets", verbose: bool = True) -> pd.DataFrame:
    """Summarise the triple composition of a dataset's train/test/val graphs.

    For each split the file ``{data_dir}/{dataset_name}_{split}.owl`` is parsed
    with rdflib and its triples are counted in three buckets:

    * **Membership** -- triples whose predicate is ``rdf:type``.
    * **Object Property Assertions** -- triples whose predicate IRI starts
      with the namespace returned by ``get_namespace(dataset_name)``.
    * **Remaining Triples** -- the total minus the two counts above.

    NOTE(review): the namespace-prefix test counts *every* predicate in the
    dataset namespace, so e.g. data properties declared there would also be
    reported as object property assertions -- confirm this is intended.

    Args:
        dataset_name: Dataset identifier used both to look up the namespace
            and to build the split file names.
        data_dir: Directory containing the split files. Defaults to
            ``"datasets"``, the location that was previously hard-coded.
        verbose: When True (the default) print the per-split summary while
            computing it; when False only the DataFrame is produced.

    Returns:
        A DataFrame with one row per split, in the order Train, Test, Val,
        holding the absolute counts and their share of all triples rounded
        to three decimals.

    Errors raised while parsing a split file (for instance a missing file)
    are not caught and propagate to the caller.
    """

    # Loop-invariant: convert the namespace to its string prefix only once
    # instead of once per triple.
    ns_prefix = str(get_namespace(dataset_name))

    def stats_for_split(name: str, path: str) -> dict:
        """Parse one split file and return its statistics as a row dict."""
        g = Graph()
        g.parse(path)

        triples = len(g)
        # Count lazily; there is no need to materialise the matching triples.
        membership = sum(1 for _ in g.triples((None, RDF.type, None)))
        obj_prop = sum(1 for _, p, _ in g if str(p).startswith(ns_prefix))
        remaining = triples - (membership + obj_prop)

        def share(count: int) -> float:
            # An empty graph yields 0 rather than a ZeroDivisionError.
            return count / triples if triples else 0

        prop_membership = share(membership)
        prop_obj_prop = share(obj_prop)
        prop_remaining = share(remaining)

        if verbose:
            print(f"--- {name} ---")
            print(f"Triples: {triples}")
            print(f"Membership: {membership} ({prop_membership:.3f})")
            print(f"Object Property Assertions: {obj_prop} ({prop_obj_prop:.3f})")
            print(f"Remaining Triples: {remaining} ({prop_remaining:.3f})")
            print()

        return {
            "Split": name,
            "Triples": triples,
            "Membership": membership,
            "Membership %": round(prop_membership, 3),
            "Object Property Assertions": obj_prop,
            "Object Property Assertions %": round(prop_obj_prop, 3),
            "Remaining Triples": remaining,
            "Remaining %": round(prop_remaining, 3)
        }

    if verbose:
        print(f"\nDataset: {dataset_name}\n")

    # (display label, file-name suffix) -- the order fixes the row order.
    splits = (("Train", "train"), ("Test", "test"), ("Val", "val"))
    stats = [
        stats_for_split(label, f"{data_dir}/{dataset_name}_{suffix}.owl")
        for label, suffix in splits
    ]

    return pd.DataFrame(stats)

In [None]:
# Compute the Train/Test/Val triple statistics for the "family" dataset
# and store the resulting table as CSV (without the index column).
df = get_statistics(dataset_name='family')
df.to_csv(path_or_buf='models/results/statistics_family.csv', index=False)

In [None]:
# Compute the Train/Test/Val triple statistics for the "pizza_100" dataset
# and store the resulting table as CSV (without the index column).
df = get_statistics(dataset_name='pizza_100')
df.to_csv(path_or_buf='models/results/statistics_pizza_100.csv', index=False)

In [None]:
# Compute the Train/Test/Val triple statistics for the "pizza_250" dataset
# and store the resulting table as CSV (without the index column).
df = get_statistics(dataset_name='pizza_250')
df.to_csv(path_or_buf='models/results/statistics_pizza_250.csv', index=False)

In [None]:
# Compute the Train/Test/Val triple statistics for the "OWL2DL-1" dataset
# and store the resulting table as CSV (without the index column).
df = get_statistics(dataset_name='OWL2DL-1')
df.to_csv(path_or_buf='models/results/statistics_OWL2DL-1.csv', index=False)