# **Return of the Schema** for *Apulia Travel KG* 

## Path Definition Basic Elements

In [None]:
import ast
import csv
import json
import pickle
import sys
from pathlib import Path
from urllib.parse import quote

from rdflib import Graph, RDF, RDFS, OWL, Namespace
from rdflib.namespace import split_uri
from rdflib.term import URIRef

# Project-local helpers are imported relative to the parent of the working directory.
sys.path.append(str(Path.cwd().parent))
from utils.conventions.builtins import BUILTIN_URIS

def serialize(graph, path):
    graph.serialize(path.with_suffix(".xml"), format="xml")
    !/home/navis/robot/robot merge --input {path.with_suffix(".xml")} --output {path.with_suffix(".owl")}
    path.with_suffix(".xml").unlink()

In [None]:
# Selects the dataset variant: materialized when True, base when False.
MATERIALIZE = True
DATASET_NAME = "APULIATRAVEL"
# The inner quotes differ from the enclosing f-string quotes on purpose:
# reusing the same quote character inside a replacement field is only
# accepted from Python 3.12 onwards.
DATASET_NAME += f"-{'MATERIALIZE' if MATERIALIZE else 'BASE'}"

# The project root is two levels above the current working directory.
home_path = Path.cwd().absolute().parent.parent
dataset_path = (
    home_path
    / "kgsaf_data"
    / ("materialize" if MATERIALIZE else "base")
    / "unpack"
    / DATASET_NAME
)
onto_path = home_path / "kgsaf_data" / "ontologies" / "unpack" / "APULIATRAVEL"

print("Base Path", home_path)
print("Ontology", onto_path)
print("Dataset", dataset_path)

print("")

# Pick the merged ontology file that matches the selected variant.
if MATERIALIZE:
    print("Loading MATERIALIZED Ontology")
    onto_file = onto_path / "apulia_travel_merged_materialized.owl"
else:
    print("Loading BASE Ontology")
    onto_file = onto_path / "apulia_travel_merged.owl"

print("\tLoading Ontology")

# The rest of the notebook looks the ontology up as `apulia_ontology`;
# `apulia_onto` is kept as an alias of the same Graph so both names resolve.
apulia_ontology = Graph()
apulia_ontology.parse(onto_file)
apulia_onto = apulia_ontology

print("\tOntology Loaded")

# [O] ABOX Triple Cleaning

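Removal of triples whose predicate is not declared as an object property in the ontology, and of triples whose subject or object is also declared as a class.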

In [None]:
# Load the raw ABox from the Turtle file in the ontology folder.
data_triples = Graph()
data_triples.parse(onto_path / "ApuliaTravelABox.ttl")

In [None]:
# Distinct predicates used anywhere in the raw ABox.
predicates = set(data_triples.predicates())
# Bare expression: shown as the cell output in the notebook.
len(predicates)

In [None]:
# Keep only the predicates that the ontology declares as owl:ObjectProperty;
# every other predicate is reported and left out.
final_pred = set()

for pred in predicates:
    if (pred, RDF.type, OWL.ObjectProperty) not in apulia_ontology:
        print(f"Removing {pred}")
        continue
    final_pred.add(pred)


In [None]:
# Number of predicates that survived the object-property filter.
print(len(final_pred))

In [None]:
# Write the cleaned ABox twice: as a tab-separated file (read back by PyKEEN
# further down) and as an N-Triples graph.
out_graph = Graph()

# Collect the ontology's classes once instead of querying the graph twice per triple.
ontology_classes = set(apulia_ontology.subjects(RDF.type, OWL.Class))

# Explicit UTF-8 so the file content does not depend on the platform's default encoding.
with open(onto_path / "tsv_triples.tsv", "w", encoding="utf-8") as f:
    for s, p, o in data_triples:
        # Only predicates that passed the object-property filter are kept.
        if p not in final_pred:
            continue
        # Drop triples in which either end is itself declared as a class.
        if s in ontology_classes or o in ontology_classes:
            continue
        out_graph.add((s, p, o))
        f.write(f"{str(s)}\t{str(p)}\t{str(o)}\n")

out_graph.serialize(onto_path / "apulia_clean_abox.nt", format="nt")

In [None]:
import pykeen
from pykeen.triples import TriplesFactory
from pykeen.triples.splitting import CoverageSplitter
import numpy as np


# Load the tab-separated triples written by the cleaning cell into a PyKEEN factory.
triples = TriplesFactory.from_path(onto_path / "tsv_triples.tsv")

# Bare expression: shown as the cell output in the notebook.
triples

In [None]:
# Invert the factory's id -> label tables into label -> id lookups so the
# split factories below can be rebuilt against the same id space.
entity_mappings = {
    label: idx for idx, label in triples.entity_id_to_label.items()
}
relation_mappings = {
    label: idx for idx, label in triples.relation_id_to_label.items()
}

In [None]:
# Split the triples into three parts with ratios 0.85 / 0.05 / 0.1, unpacked
# in that order as train, valid and test. The seed is fixed for reproducibility.
# NOTE(review): CoverageSplitter is expected to keep every entity and relation
# represented in the training part - confirm against the PyKEEN documentation.
train, valid, test = triples.split(
    ratios=[0.85, 0.05, 0.1],
    random_state=42,
    method=CoverageSplitter(),      
)

In [None]:
# Rebuild each split from its labeled triples, reusing the label -> id
# mappings of the full factory for all three parts.
train_clean, valid_clean, test_clean = (
    TriplesFactory.from_labeled_triples(
        triples=part.triples,
        entity_to_id=entity_mappings,
        relation_to_id=relation_mappings,
    )
    for part in (train, valid, test)
)

In [None]:
# Summaries of the rebuilt splits (printed in the order train, test, valid).
print(train_clean)
print(test_clean)
print(valid_clean)

In [None]:
from pykeen.triples.leakage import unleak

# Run PyKEEN's leakage removal over the three splits; the training factory
# goes first, followed by the validation and test factories.
# NOTE(review): the exact meaning of n=None and minimum_frequency=0.97 is
# defined by pykeen.triples.leakage.unleak - confirm against its documentation.
train_unleak, valid_unleak, test_unleak = unleak(
    train_clean,
    valid_clean,
    test_clean,
    n=None,
    minimum_frequency=0.97,
)

In [None]:
# Summaries of the splits after leakage removal (order: train, test, valid).
print(train_unleak)
print(test_unleak)
print(valid_unleak)

In [None]:
targets = [
    (dataset_path / "abox/splits/train", train_unleak.triples),
    (dataset_path / "abox/splits/valid", valid_unleak.triples),
    (dataset_path / "abox/splits/test", test_unleak.triples)
]


for path, split in targets:
    out_graph = Graph()
    for triple in split:
        s = URIRef(triple[0])
        p = URIRef(triple[1])
        o = URIRef(triple[2])
        out_graph.add((URIRef(s), URIRef(p), URIRef(o)))

    out_graph.serialize(path.with_suffix(".nt"), format="nt")

!cat {dataset_path}/abox/splits/*.nt > {dataset_path}/abox/triples.nt

In [None]:

# Drop the references to the graphs that are no longer needed.
del out_graph
del data_triples

# ABOX Individuals and Class Assertions

In [None]:
# Reload the final ABox triples and collect every term that occurs as a
# subject or as an object.
data_triples = Graph()
data_triples.parse(dataset_path / "abox" / "triples.nt")

individuals = set(data_triples.subjects()).union(data_triples.objects())

print("Len Individuals", len(individuals))
# The graph itself is not needed once the individuals are collected.
del data_triples

In [None]:
# Declare every collected individual as an owl:NamedIndividual and write the
# declarations to abox/individuals.owl through the serialize() helper.
out_graph = Graph()

for ind in individuals:
    out_graph.add((ind, RDF.type, OWL.NamedIndividual))

serialize(out_graph, dataset_path / "abox" / "individuals")
del out_graph

In [None]:
# Reload the raw ABox; the cells below read the rdf:type assertions from it.
ca_graph = Graph()
ca_graph.parse(onto_path / "ApuliaTravelABox.ttl")

### [BASE] RDFLib Class Assertions

In [None]:
# Copy the asserted rdf:type statements of every individual, keeping only
# types that the ontology declares as owl:Class and skipping built-in URIs.
out_graph = Graph()

for ind in individuals:
    # BUILTIN_URIS is the name the project module exports; difference()
    # accepts any iterable, so the container type does not matter.
    for ca in set(ca_graph.objects(ind, RDF.type)).difference(BUILTIN_URIS):
        if (ca, RDF.type, OWL.Class) in apulia_ontology:
            out_graph.add((ind, RDF.type, ca))
        else:
            print(f"Not a Class {ca}")

serialize(out_graph, dataset_path / "abox" / "class_assertions")
del out_graph
     

### [REASONED] Reasoner Class Assertions

In [None]:
# Asserted (not yet reasoned) class assertions: the same filtering as the
# BASE variant, written to a separate file that is fed to the reasoner.
out_graph = Graph()


for ind in individuals:
    # BUILTIN_URIS is the name the project module exports; difference()
    # accepts any iterable, so the container type does not matter.
    for ca in set(ca_graph.objects(ind, RDF.type)).difference(BUILTIN_URIS):
        if (ca, RDF.type, OWL.Class) in apulia_ontology:
            out_graph.add((ind, RDF.type, ca))
        else:
            print(f"Not a Class {ca}")

serialize(out_graph, dataset_path / "abox" / "unreasoned_class_assertions")
del out_graph
     

In [None]:
!java -Xmx16G -jar /home/navis/robot/robot.jar merge -vvv \
    --input {dataset_path / "abox" / "unreasoned_class_assertions.owl"} \
    --input {dataset_path / "abox" / "individuals.owl"} \
    --input {dataset_path / "abox" / "triples.nt"} \
    --input {apulia_path / "apulia_travel_merged_materialized.owl"} \
    --output {dataset_path / "abox" / "intermediate_abox_tbox.owl"}

!java -Xmx16G -jar /home/navis/robot/robot.jar reason -vvv \
  --reasoner HermiT \
  --create-new-ontology true \
  --input {dataset_path / "abox" / "intermediate_abox_tbox.owl"} \
  --output {dataset_path / "abox" / "inferred_class_assertions.owl"} \
  --axiom-generators "ClassAssertion" \
  --remove-redundant-subclass-axioms false \
  --exclude-tautologies structural \
  --include-indirect true \
  -D {dataset_path / "class_assertions_debug.owl"}

In [None]:
# Load the asserted and the inferred class assertions into one graph.
ca = Graph()
ca.parse(dataset_path / "abox" / "unreasoned_class_assertions.owl")
ca.parse(dataset_path / "abox" / "inferred_class_assertions.owl")

In [None]:
# Final class assertions: every asserted or inferred rdf:type of each
# individual, with built-in URIs skipped.
out_graph = Graph()

for ind in individuals:
    # BUILTIN_URIS is the name the project module exports; difference()
    # accepts any iterable, so the container type does not matter.
    for o in set(ca.objects(ind, RDF.type)).difference(BUILTIN_URIS):
        out_graph.add((ind, RDF.type, o))

serialize(out_graph, dataset_path / "abox" / "class_assertions")

In [None]:
# Delete the intermediate files produced while reasoning, in the same order
# as they are listed here.
for leftover in (
    "inferred_class_assertions.owl",
    "unreasoned_class_assertions.owl",
    "intermediate_abox_tbox.owl",
):
    (dataset_path / "abox" / leftover).unlink()

In [None]:
# Drop the references to the graphs that are no longer needed.
del out_graph
del ca

# TBOX and RBOX Extraction

In [None]:
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))
from rdflib import Graph, URIRef, BNode, RDF, RDFS, OWL
import  utils.conventions.paths as pc
from utils.conventions.builtins import BUILTIN_URIS

class SignatureModularizer:
    """Extract the part of a schema graph reachable from a set of seed terms."""

    def __init__(self, schema, seed):
        # schema: graph that is queried; seed: iterable of starting terms.
        self.schema = schema
        self.seed = seed

    def modularize(self):
        """Return the extracted module as a new graph."""
        return self._extract_recursive_description()

    def _extract_recursive_description(self) -> Graph:
        """Collect the outgoing triples of the seed and of every term followed from it.

        Every triple whose subject is being processed is copied. The object of
        such a triple is queued for processing when it is not a built-in URI,
        has not been processed yet, and is either a blank node or typed in the
        schema as an OWL class, object property or datatype property.
        """
        module = Graph()
        pending = set(self.seed)
        visited = set()
        followed_types = (OWL.Class, OWL.ObjectProperty, OWL.DatatypeProperty)

        while pending:
            current = pending.pop()
            visited.add(current)

            print(f"Processing {current}")

            for triple in self.schema.triples((current, None, None)):
                module.add(triple)
                target = triple[2]

                # Built-ins and already processed terms are never followed.
                if target in BUILTIN_URIS or target in visited:
                    continue

                if isinstance(target, BNode) or any(
                    (target, RDF.type, kind) in self.schema for kind in followed_types
                ):
                    pending.add(target)

        return module
    

class SchemaDecomposition:
    """Split an ontology graph into role (RBox), taxonomy and schema sub-graphs."""

    def __init__(self, input_graph):
        # Graph that every decomposition step reads from.
        self.onto_graph = input_graph

    def decompose(self):
        """Return the ``(rbox, taxonomy, schema)`` graphs, in that order."""
        return self._rbox_decompose(), self._taxonomy_decompose(), self._schema_decompose()

    def _declared(self, owl_type):
        """Return the subjects typed as ``owl_type``, without built-in URIs.

        ``BUILTIN_URIS`` is the name imported from the project's conventions
        module; ``difference`` accepts any iterable, so the container type of
        that constant does not matter.
        """
        return set(self.onto_graph.subjects(RDF.type, owl_type)).difference(BUILTIN_URIS)

    def _rbox_decompose(self):
        """Collect the description of every object and datatype property."""
        rbox_graph = Graph()
        for property_type in (OWL.ObjectProperty, OWL.DatatypeProperty):
            for prop in self._declared(property_type):
                rbox_graph += self._extract_description(prop)
        return rbox_graph

    def _taxonomy_decompose(self):
        """Collect the ``rdfs:subClassOf`` triples of every class.

        When the superclass is a blank node, its description is added as well.
        """
        taxonomy_graph = Graph()

        for c in self._declared(OWL.Class):
            for s, p, o in self.onto_graph.triples((c, None, None)):
                if p == RDFS.subClassOf:
                    taxonomy_graph.add((s, p, o))
                    if isinstance(o, BNode):
                        taxonomy_graph += self._extract_description(o)

        return taxonomy_graph

    def _schema_decompose(self):
        """Collect every non-``rdfs:subClassOf`` triple of the non-blank classes.

        The ``rdf:type`` statements of each object are copied too, and
        blank-node objects are expanded through ``_extract_description``.
        """
        schema_graph = Graph()

        for c in self._declared(OWL.Class):
            if isinstance(c, BNode):
                continue
            for s, p, o in self.onto_graph.triples((c, None, None)):
                if p == RDFS.subClassOf:
                    continue

                schema_graph.add((s, p, o))

                for elem in self.onto_graph.objects(o, RDF.type):
                    schema_graph.add((o, RDF.type, elem))

                if isinstance(o, BNode):
                    print(f"Found BNODE in Triple {s, p, o}")
                    schema_graph += self._extract_description(o)

        return schema_graph

    def _extract_description(self, elem: URIRef) -> Graph:
        """Return the triples describing ``elem``, following blank nodes only.

        Starting from ``elem`` (call sites in this class also pass blank
        nodes), every outgoing triple is copied. Blank-node objects are
        expanded in turn; other objects are not followed, but the OWL class /
        object property / datatype property typing they have in the ontology
        is copied.
        """
        extracted_graph = Graph()
        elem_to_process = {elem}
        processed = set()

        while elem_to_process:

            e = elem_to_process.pop()
            processed.add(e)

            print(f"Processing {e}")

            for s, p, o in self.onto_graph.triples((e, None, None)):
                extracted_graph.add((s, p, o))

                if (o not in BUILTIN_URIS) and (o not in processed):
                    if isinstance(o, BNode):
                        elem_to_process.add(o)

                    for owl_type in (OWL.Class, OWL.ObjectProperty, OWL.DatatypeProperty):
                        if (o, RDF.type, owl_type) in self.onto_graph:
                            extracted_graph.add((o, RDF.type, owl_type))

        return extracted_graph


In [None]:
# Reload the ABox triples written by the split cell.
data_triples = Graph()
data_triples.parse(dataset_path / "abox" / "triples.nt")

# Reload the class assertions written by one of the class-assertion cells.
class_assertions = Graph()
class_assertions.parse(dataset_path / "abox" / "class_assertions.owl")

In [None]:
# Seed properties: every predicate used in the ABox triples.
seed_obj_props = set(data_triples.predicates())
print("Seed Object Properties", len(seed_obj_props))

# Seed classes: every subject typed owl:Class in the class-assertion file.
# NOTE(review): this relies on class_assertions.owl carrying owl:Class
# declarations for the asserted classes - confirm the ROBOT output has them.
seed_classes =  set(class_assertions.subjects(RDF.type, OWL.Class))
print("Seed Classes", len(seed_classes))

In [None]:

# Extract the ontology module reachable from the seed classes and properties
# and write it to ontology.owl through the serialize() helper.
modularizer = SignatureModularizer(apulia_ontology, seed_classes | seed_obj_props)
out_graph = modularizer.modularize()

serialize(out_graph, dataset_path / "ontology")

In [None]:
# Reload the extracted module and split it into RBox, taxonomy and schema.
onto_graph = Graph()
onto_graph.parse(dataset_path / "ontology.owl")

decomposer = SchemaDecomposition(onto_graph)
rbox_graph, taxonomy_graph, schema_graph = decomposer.decompose()

# Each part is written to its own .owl file under rbox/ or tbox/.
serialize(rbox_graph, dataset_path / "rbox" / "roles")
serialize(taxonomy_graph, dataset_path / "tbox" / "taxonomy")
serialize(schema_graph, dataset_path / "tbox" / "schema")


In [None]:
from rdflib import BNode





def extract_description(graph: Graph, elem: URIRef) -> Graph:
    """Return the triples of ``graph`` describing ``elem``, following blank nodes only.

    Starting from ``elem`` (call sites in this notebook also pass blank
    nodes), every outgoing triple is copied. Blank-node objects are expanded
    in turn; other objects are not followed, but the OWL class / object
    property / datatype property typing they have in ``graph`` is copied.

    Args:
        graph: graph that is queried.
        elem: starting term.

    Returns:
        A new graph holding the collected triples.
    """
    extracted_graph = Graph()
    elem_to_process = {elem}
    processed = set()

    while elem_to_process:

        e = elem_to_process.pop()
        processed.add(e)

        print(f"Processing {e}")

        for s, p, o in graph.triples((e, None, None)):
            extracted_graph.add((s, p, o))

            # BUILTIN_URIS is the name imported from the project's conventions module.
            if (o not in BUILTIN_URIS) and (o not in processed):
                if isinstance(o, BNode):
                    elem_to_process.add(o)

                for owl_type in (OWL.Class, OWL.ObjectProperty, OWL.DatatypeProperty):
                    if (o, RDF.type, owl_type) in graph:
                        extracted_graph.add((o, RDF.type, owl_type))

    return extracted_graph


# RBox: the description of every object property, then of every datatype
# property, with built-in URIs skipped.
rbox_graph = Graph()
# BUILTIN_URIS is the name imported from the project's conventions module;
# difference() accepts any iterable, so its container type does not matter.
for prop in set(onto_graph.subjects(RDF.type, OWL.ObjectProperty)).difference(BUILTIN_URIS):
    rbox_graph += extract_description(onto_graph, prop)

for prop in set(onto_graph.subjects(RDF.type, OWL.DatatypeProperty)).difference(BUILTIN_URIS):
    rbox_graph += extract_description(onto_graph, prop)


serialize(rbox_graph, dataset_path / "rbox" / "roles")

In [None]:
# Taxonomy: the rdfs:subClassOf triples of every class; blank-node
# superclasses are expanded through extract_description.
taxonomy_graph = Graph()

# BUILTIN_URIS is the name imported from the project's conventions module;
# difference() accepts any iterable, so its container type does not matter.
for c in set(onto_graph.subjects(RDF.type, OWL.Class)).difference(BUILTIN_URIS):
    for s, p, o in onto_graph.triples((c, None, None)):
        if p == RDFS.subClassOf:
            taxonomy_graph.add((s, p, o))
            if isinstance(o, BNode):
                taxonomy_graph += extract_description(onto_graph, o)

serialize(taxonomy_graph, dataset_path / "tbox" / "taxonomy")

In [None]:
# Schema: every non-subClassOf triple of the non-blank classes, the rdf:type
# statements of each object, and the expansion of blank-node objects.
schema_graph = Graph()


# BUILTIN_URIS is the name imported from the project's conventions module;
# difference() accepts any iterable, so its container type does not matter.
for c in set(onto_graph.subjects(RDF.type, OWL.Class)).difference(BUILTIN_URIS):
    if isinstance(c, BNode):
        continue
    for s, p, o in onto_graph.triples((c, None, None)):
        if p == RDFS.subClassOf:
            continue

        schema_graph.add((s, p, o))

        for elem in onto_graph.objects(o, RDF.type):
            schema_graph.add((o, RDF.type, elem))

        if isinstance(o, BNode):
            print(f"Found BNODE in Triple {s, p, o}")
            schema_graph += extract_description(onto_graph, o)


serialize(schema_graph, dataset_path / "tbox" / "schema")

# Final Ontology and Knowledge Graph

In [None]:
# Merge the extracted ontology, the individual declarations, the ABox triples
# and the class assertions into the final knowledge_graph.owl.
!/home/navis/robot/robot merge \
--input  {dataset_path / "ontology.owl"} \
--input  {dataset_path / "abox" / "individuals.owl"} \
--input {dataset_path / "abox" / "triples.nt"} \
--input {dataset_path / "abox" / "class_assertions.owl"} \
--output {dataset_path / "knowledge_graph.owl"}