# **Return of the Schema** for *WHOW* 


## Path Definition Basic Elements

In [None]:
from rdflib import Graph, RDF, RDFS, OWL, Namespace
from urllib.parse import quote
from rdflib.namespace import split_uri
from rdflib.term import URIRef
from pathlib import Path
import pickle
import csv
import ast
import json

def serialize(graph, path):
    graph.serialize(path.with_suffix(".xml"), format="xml")
    !/home/navis/robot/robot merge --input {path.with_suffix(".xml")} --output {path.with_suffix(".owl")}
    path.with_suffix(".xml").unlink()

In [None]:
# Switch between the materialized and the base variant of ontology and dataset.
MATERIALIZE = True
DBPEDIA_RESOURCE = "http://dbpedia.org/resource/"
DBPEDIA_ONTOLOGY = "http://dbpedia.org/ontology/"

# Plain conditional expressions are used instead of f-strings that reuse the
# enclosing quote character, which only parses on Python >= 3.12.
DATASET_NAME = "WHOW-5"
DATASET_NAME += ("-MATERIALIZE" if MATERIALIZE else "-BASE")

# The data folders live two levels above the notebook's working directory.
home_path = Path.cwd().absolute().parent.parent
variant_dir = "materialize" if MATERIALIZE else "base"
dataset_path = home_path / "kgsaf_data" / variant_dir / "unpack" / DATASET_NAME
onto_path = home_path / "kgsaf_data" / "ontologies" / "unpack" / "WHOW"

print("Base Path", home_path)
print("Ontology", onto_path)
print("Dataset", dataset_path)

print("")

# Pick the ontology file that matches the selected variant.
if MATERIALIZE:
    print("Loading MATERIALIZED Ontology")
    onto_file = onto_path / "whow_merged_repaired_materialized.owl"
else:
    print("Loading BASE Ontology")
    onto_file = onto_path / "whow_merged_repaired.owl"

print("\tLoading Ontology")

whow_ontology = Graph()
whow_ontology.parse(onto_file, format="xml")

print("\tOntology Loaded")

# Triple Cleaning and Splitting, Deprecated Check

In [None]:
import pykeen
from pykeen.triples import TriplesFactory
from pykeen.triples.splitting import CoverageSplitter
import numpy as np
from rdflib import Graph, OWL, Literal
from rdflib.namespace import XSD    

# Load the filtered ABox triples file into a PyKEEN TriplesFactory.
triples = TriplesFactory.from_path(
    onto_path / "WHOW_ABOX" / "filtered5_giorno.txt",
)

In [None]:
# IRIs (as plain strings) of every object property declared in the ontology,
# minus the built-in vocabulary terms.
# NOTE: BUILTIN_URI is assigned in a later cell ("TBOX and RBOX Extraction");
# that cell has to be executed before this one.
real_obj_props = set(
    map(
        str,
        set(whow_ontology.subjects(RDF.type, OWL.ObjectProperty)).difference(BUILTIN_URI),
    )
)

print(len(real_obj_props))

In [None]:
# Relation labels known to the loaded triples factory.
obj_props = {*triples.relation_id_to_label.values()}
print(len(obj_props))
print(obj_props)

# Relations that occur in the data and are also declared in the ontology.
reals = obj_props.intersection(real_obj_props)

print(len(reals))


In [None]:
import pykeen
from pykeen.triples import TriplesFactory
from pykeen.triples.splitting import CoverageSplitter
import numpy as np

# Relations with fewer triples than this threshold are discarded.
MIN_TRIPLES_RELATION = 5

# Count how many triples use each relation id (column 1 of the mapped triples).
rels, counts = np.unique(triples.mapped_triples[:, 1], return_counts=True)
rel_counts = dict(zip(rels, counts))

keep_relations = [
    rel_id
    for rel_id, n_triples in rel_counts.items()
    if n_triples >= MIN_TRIPLES_RELATION
]

# Restrict the factory to the sufficiently frequent relations.
triples_clean = triples.new_with_restriction(relations=keep_relations)

print("Original triples:", triples.num_triples)
print("Cleaned triples:", triples_clean.num_triples)

In [None]:
# Invert the factory's id -> label dictionaries into label -> id lookups.
entity_mappings = {
    label: idx for idx, label in triples_clean.entity_id_to_label.items()
}
relation_mappings = {
    label: idx for idx, label in triples_clean.relation_id_to_label.items()
}

In [None]:
# Split the cleaned triples into three partitions with ratios 0.7 / 0.1 / 0.2.
train, valid, test = triples_clean.split(
    ratios=[0.7, 0.1, 0.2],
    method=CoverageSplitter(),
    random_state=42,
)

# Rebuild every partition from its labeled triples so that all three share the
# same label -> id mappings.
train_clean, valid_clean, test_clean = (
    TriplesFactory.from_labeled_triples(
        triples=part.triples,
        entity_to_id=entity_mappings,
        relation_to_id=relation_mappings,
    )
    for part in (train, valid, test)
)

print(train_clean, test_clean, valid_clean, sep="\n")

In [None]:
from pykeen.triples.leakage import unleak

# Run PyKEEN's unleak over the three partitions (training set first).
train_unleak, valid_unleak, test_unleak = unleak(
    train_clean,
    valid_clean,
    test_clean,
    n=None,
    minimum_frequency=0.97,
)

print(train_unleak, test_unleak, valid_unleak, sep="\n")

In [None]:
(dataset_path / "abox" / "splits").mkdir(exist_ok=True, parents=True)


targets = [
    (dataset_path / "abox/splits/train", train_unleak.triples),
    (dataset_path / "abox/splits/valid", valid_unleak.triples),
    (dataset_path / "abox/splits/test", test_unleak.triples)
]


for path, split in targets:
    out_graph = Graph()
    for triple in split:
        s = URIRef(triple[0].strip())
        p = URIRef(triple[1].strip())
        o = URIRef(triple[2].strip())
        out_graph.add((URIRef(s), URIRef(p), URIRef(o)))

    out_graph.serialize(path.with_suffix(".nt"), format="nt")

!cat {dataset_path}/abox/splits/*.nt > {dataset_path}/abox/triples.nt

# [R] ABOX Individuals and Class Assertions

In [None]:
data_triples = Graph()
data_triples.parse(source=dataset_path / "abox" / "triples.nt")

# Every node occurring as subject or object of an ABox triple is an individual.
individuals = set(data_triples.subjects()).union(data_triples.objects())

print("Len Individuals", len(individuals))
# The graph itself is no longer needed once the individuals are collected.
del data_triples

In [None]:
out_graph = Graph()

# Declare each collected individual as an owl:NamedIndividual.
for individual in individuals:
    declaration = (individual, RDF.type, OWL.NamedIndividual)
    out_graph.add(declaration)

serialize(out_graph, dataset_path / "abox" / "individuals")
del out_graph

### [BASE] RDF Lib Class Assertions

In [None]:
out_graph = Graph()

# Classes declared in the ontology, collected once instead of querying the
# graph again for every single assertion.
declared_classes = set(whow_ontology.subjects(RDF.type, OWL.Class))

# JSON text is UTF-8, so do not rely on the platform's default encoding.
with open(
    onto_path / "WHOW_ABOX" / "class_assertions_filtered5.json",
    "r",
    encoding="utf-8",
) as ca_file:
    ca_data = json.load(ca_file)

# Each key is an individual IRI; its value holds the IRIs of its classes.
for key, class_iris in ca_data.items():
    ind = URIRef(key.strip())
    for c in class_iris:
        cls = URIRef(c.strip())
        # Only keep assertions whose class is declared in the ontology.
        if cls in declared_classes:
            out_graph.add((ind, RDF.type, cls))

serialize(out_graph, dataset_path / "abox" / "class_assertions")

### [REASONED] Reasoner Class Assertions

In [None]:
out_graph = Graph()

# Classes declared in the ontology, collected once instead of querying the
# graph again for every single assertion.
declared_classes = set(whow_ontology.subjects(RDF.type, OWL.Class))

# JSON text is UTF-8, so do not rely on the platform's default encoding.
with open(
    onto_path / "WHOW_ABOX" / "class_assertions_filtered5.json",
    "r",
    encoding="utf-8",
) as ca_file:
    ca_data = json.load(ca_file)

# Each key is an individual IRI; its value holds the IRIs of its classes.
for key, class_iris in ca_data.items():
    ind = URIRef(key.strip())
    for c in class_iris:
        cls = URIRef(c.strip())
        # Only keep assertions whose class is declared in the ontology.
        if cls in declared_classes:
            out_graph.add((ind, RDF.type, cls))

# Written under a separate name: this file is only an input for the reasoner.
serialize(out_graph, dataset_path / "abox" / "unreasoned_class_assertions")

In [None]:
!java -Xmx16G -jar /home/navis/robot/robot.jar merge -vvv \
    --input {dataset_path / "abox" / "unreasoned_class_assertions.owl"} \
    --input {dataset_path / "abox" / "individuals.owl"} \
    --input {dataset_path / "abox" / "triples.nt"} \
    --input {apulia_path / "apulia_travel_merged_materialized.owl"} \
    --output {dataset_path / "abox" / "intermediate_abox_tbox.owl"}


!java -Xmx16G -jar /home/navis/robot/robot.jar reason -vvv \
  --reasoner HermiT \
  --create-new-ontology true \
  --input {dataset_path / "abox" / "intermediate_abox_tbox.owl"} \
  --output {dataset_path / "abox" / "inferred_class_assertions.owl"} \
  --axiom-generators "ClassAssertion" \
  --remove-redundant-subclass-axioms false \
  --exclude-tautologies structural \
  --include-indirect true \
  -D {dataset_path / "class_assertions_debug.owl"}

In [None]:
# Load the asserted and the inferred class assertions into one graph.
ca = Graph()
for owl_name in ("unreasoned_class_assertions.owl", "inferred_class_assertions.owl"):
    ca.parse(dataset_path / "abox" / owl_name)

In [None]:
out_graph = Graph()

# For every individual keep only the types that are not built-in terms.
# NOTE: BUILTIN_URI is assigned in a later cell ("TBOX and RBOX Extraction");
# that cell has to be executed before this one.
for ind in individuals:
    kept_types = set(ca.objects(ind, RDF.type)).difference(BUILTIN_URI)
    for o in kept_types:
        out_graph.add((ind, RDF.type, o))

serialize(out_graph, dataset_path / "abox" / "class_assertions")

In [None]:
# Delete the intermediate files written by the merge / reason cells.
for leftover in (
    "inferred_class_assertions.owl",
    "unreasoned_class_assertions.owl",
    "intermediate_abox_tbox.owl",
):
    (dataset_path / "abox" / leftover).unlink()

In [None]:
# Drop the references to the graphs that are no longer needed.
del out_graph, ca

### [REASONED] Server Reasoned Class Assertions

In [None]:
# Load the class assertions that were reasoned on the server.
ca_graph = Graph()
ca_graph.parse(
    source=onto_path / "WHOW_ABOX" / "server_reasoned_class_assertions.owl",
)

In [None]:
out_graph = Graph()

# For every individual keep only the types that are not built-in terms.
# NOTE: BUILTIN_URI is assigned in a later cell ("TBOX and RBOX Extraction");
# that cell has to be executed before this one.
for ind in individuals:
    kept_types = set(ca_graph.objects(ind, RDF.type)).difference(BUILTIN_URI)
    for o in kept_types:
        out_graph.add((ind, RDF.type, o))

serialize(out_graph, dataset_path / "abox" / "class_assertions")

# TBOX and RBOX Extraction

In [None]:
# ABox relation triples.
data_triples = Graph()
data_triples.parse(source=dataset_path / "abox" / "triples.nt")

# ABox class assertions.
class_assertions = Graph()
class_assertions.parse(source=dataset_path / "abox" / "class_assertions.owl")

In [None]:
# Every predicate used in the ABox triples.
seed_obj_props = {*data_triples.predicates()}
print("Seed Object Properties", len(seed_obj_props))

# Every subject typed as owl:Class in the class-assertion graph.
seed_classes = {*class_assertions.subjects(RDF.type, OWL.Class)}
print("Seed Classes", len(seed_classes))

In [None]:
from rdflib import Graph, URIRef, BNode, RDF, RDFS, OWL

# Vocabulary terms that are never followed or kept as dataset elements.
BUILTIN_URI = {
    # Top / bottom entities.
    OWL.Thing,
    OWL.Nothing,
    URIRef("http://schema.org/Thing"),
    OWL.topObjectProperty,
    OWL.bottomObjectProperty,
    # Meta-classes used in declarations.
    OWL.Class,
    OWL.NamedIndividual,
    OWL.ObjectProperty,
    OWL.DatatypeProperty,
    OWL.Restriction,
    RDFS.Literal,
    # Structural predicates.
    RDF.type,
    RDFS.domain,
    RDFS.range,
}

# Provenance / definition predicates (not referenced by the visible cells).
removal = [
    URIRef("http://www.w3.org/ns/prov#wasDerivedFrom"),
    RDFS.isDefinedBy,
    URIRef("http://www.w3.org/ns/prov#wasInfluencedBy"),
]



from collections.abc import Iterable


def extract_recursive_description(graph: Graph, elements: Iterable[URIRef]) -> Graph:
    """Collect the description of ``elements`` and of everything they reference.

    Starting from ``elements``, every triple whose subject is the current node
    is copied from ``graph``. An object is queued for the same treatment when
    it is not in ``BUILTIN_URI``, has not been processed yet, and is either a
    blank node or typed in ``graph`` as ``owl:Class``, ``owl:ObjectProperty``
    or ``owl:DatatypeProperty``.

    Args:
        graph: Source graph the triples are read from.
        elements: Seed nodes to start from (an iterable, not a single URI).

    Returns:
        A new graph containing all collected triples.
    """
    followed_types = (OWL.Class, OWL.ObjectProperty, OWL.DatatypeProperty)

    extracted_graph = Graph()
    elem_to_process = set(elements)
    processed = set()

    while elem_to_process:
        e = elem_to_process.pop()
        processed.add(e)

        print(f"Processing {e}")

        for s, p, o in graph.triples((e, None, None)):
            extracted_graph.add((s, p, o))

            # Built-in terms and already visited nodes are never expanded.
            if o in BUILTIN_URI or o in processed:
                continue

            if isinstance(o, BNode) or any(
                (o, RDF.type, followed) in graph for followed in followed_types
            ):
                elem_to_process.add(o)

    return extracted_graph

# Extract the ontology fragment reachable from the seed classes and properties.
out_graph = extract_recursive_description(
    whow_ontology,
    seed_classes.union(seed_obj_props),
)

serialize(out_graph, dataset_path / "ontology")

In [None]:
from rdflib import BNode

# Reload the extracted ontology fragment written by serialize().
onto_graph = Graph()
onto_graph.parse(source=dataset_path / "ontology.owl")



def extract_description(graph: Graph, elem: URIRef) -> Graph:
    """Return the triples describing ``elem``, expanding nested blank nodes.

    All triples with ``elem`` as subject are copied. Blank-node objects are
    expanded in the same way; other objects are not followed, but their
    ``owl:Class`` / ``owl:ObjectProperty`` / ``owl:DatatypeProperty``
    declaration is copied when ``graph`` contains one. Objects listed in
    ``BUILTIN_URI`` or already visited are skipped.

    Args:
        graph: Source graph the triples are read from.
        elem: Node to describe (callers in this notebook also pass blank nodes).

    Returns:
        A new graph holding the collected triples.
    """
    declared_kinds = (OWL.Class, OWL.ObjectProperty, OWL.DatatypeProperty)

    extracted_graph = Graph()
    pending = {elem}
    visited = set()

    while pending:
        node = pending.pop()
        visited.add(node)

        print(f"Processing {node}")

        for s, p, o in graph.triples((node, None, None)):
            extracted_graph.add((s, p, o))

            if o in BUILTIN_URI or o in visited:
                continue

            # Only blank nodes are followed further.
            if isinstance(o, BNode):
                pending.add(o)

            # Named objects just get their declaration copied over.
            for kind in declared_kinds:
                if (o, RDF.type, kind) in graph:
                    extracted_graph.add((o, RDF.type, kind))

    return extracted_graph


rbox_graph = Graph()

# Describe every non-built-in object property first, then every datatype property.
for prop_kind in (OWL.ObjectProperty, OWL.DatatypeProperty):
    for prop in set(onto_graph.subjects(RDF.type, prop_kind)).difference(BUILTIN_URI):
        rbox_graph += extract_description(onto_graph, prop)

serialize(rbox_graph, dataset_path / "rbox" / "roles")

In [None]:
taxonomy_graph = Graph()

# Keep only the rdfs:subClassOf axioms of every non-built-in class.
for c in set(onto_graph.subjects(RDF.type, OWL.Class)).difference(BUILTIN_URI):
    for s, p, o in onto_graph.triples((c, None, None)):
        if p != RDFS.subClassOf:
            continue

        taxonomy_graph.add((s, p, o))
        # A blank-node superclass needs its own description to be meaningful.
        if isinstance(o, BNode):
            taxonomy_graph += extract_description(onto_graph, o)

serialize(taxonomy_graph, dataset_path / "tbox" / "taxonomy")

In [None]:
schema_graph = Graph()

# Everything stated about a named class except its rdfs:subClassOf axioms.
for c in set(onto_graph.subjects(RDF.type, OWL.Class)).difference(BUILTIN_URI):
    if isinstance(c, BNode):
        continue

    for s, p, o in onto_graph.triples((c, None, None)):
        if p == RDFS.subClassOf:
            continue

        schema_graph.add((s, p, o))

        # Carry over the type declarations of the referenced object.
        for o_type in onto_graph.objects(o, RDF.type):
            schema_graph.add((o, RDF.type, o_type))

        if isinstance(o, BNode):
            print(f"Found BNODE in Triple {s, p, o}")
            schema_graph += extract_description(onto_graph, o)

serialize(schema_graph, dataset_path / "tbox" / "schema")

# Final Ontology and Knowledge Graph

In [None]:
# Merge the extracted ontology, the individual declarations, the ABox triples
# and the class assertions into the final knowledge_graph.owl with ROBOT.
!/home/navis/robot/robot merge \
--input  {dataset_path / "ontology.owl"} \
--input  {dataset_path / "abox" / "individuals.owl"} \
--input {dataset_path / "abox" / "triples.nt"} \
--input {dataset_path / "abox" / "class_assertions.owl"} \
--output {dataset_path / "knowledge_graph.owl"}