# **Return of the Schema** for *YAGO4*

## Path Definition Basic Elements


In [None]:
from rdflib import Graph, RDF, RDFS, OWL, Namespace
from urllib.parse import quote
from rdflib.namespace import split_uri
from rdflib.term import URIRef
from pathlib import Path
import pickle
import csv
import ast
import json

def serialize(graph, path, robot_bin="/home/navis/robot/robot"):
    """Write ``graph`` as ``<path>.owl`` by round-tripping through ROBOT.

    The graph is first serialized as RDF/XML to ``<path>.xml``; ROBOT's
    ``merge`` command then rewrites it to ``<path>.owl`` and the intermediate
    XML file is deleted.

    Args:
        graph: Object exposing ``serialize(destination, format=...)``
            (an rdflib ``Graph`` in this notebook).
        path: ``pathlib.Path`` of the output file; its suffix is replaced.
        robot_bin: Path of the ROBOT launcher to execute.

    Raises:
        subprocess.CalledProcessError: If ROBOT exits with a non-zero status.
    """
    # Local import so the notebook's import cell does not need to change.
    import subprocess

    xml_path = path.with_suffix(".xml")
    owl_path = path.with_suffix(".owl")

    graph.serialize(xml_path, format="xml")
    try:
        # An argument list (no shell) keeps paths with spaces intact, and
        # check=True surfaces a failed conversion instead of continuing.
        subprocess.run(
            [
                str(robot_bin),
                "merge",
                "--input",
                str(xml_path),
                "--output",
                str(owl_path),
            ],
            check=True,
        )
    finally:
        # The intermediate file is always removed, as before.
        xml_path.unlink()

In [None]:
# Toggle between the materialized and the base variant of the dataset.
MATERIALIZE = True
YAGO_IRI = "http://yago-knowledge.org/resource/"

# The variant tag is upper-case in the dataset name and lower-case in the
# directory layout. Computing it once avoids nesting the same quote character
# inside an f-string, which is a SyntaxError before Python 3.12.
variant_tag = "MATERIALIZE" if MATERIALIZE else "BASE"
DATASET_NAME = f"YAGO4-20-C-{variant_tag}"

# The project root is two levels above the current working directory.
home_path = Path.cwd().absolute().parent.parent
dataset_path = home_path / "kgsaf_data" / variant_tag.lower() / "unpack" / DATASET_NAME
onto_path = home_path / "kgsaf_data" / "ontologies" / "unpack" / "YAGO4"

print("Base Path", home_path)
print("Ontology", onto_path)
print("Dataset", dataset_path)

print("")

# Pick the ontology file that matches the selected dataset variant.
print("Loading MATERIALIZED Ontology" if MATERIALIZE else "Loading BASE Ontology")
onto_file = onto_path / (
    "yago_repaired_materialized.owl" if MATERIALIZE else "yago_repaired.owl"
)

print("\tLoading Ontology")

# Parse the chosen RDF/XML ontology into an in-memory graph.
yago_ontology = Graph()
yago_ontology.parse(onto_file, format="xml")

print("\tOntology Loaded")

# [O] ABOX Triple Cleaning

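Remove every triple whose subject or object is an individual that is also declared as a class in the ontology, as well as every triple whose predicate is not a declared object property.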

In [None]:
# Load the full ABox triple set.
triples_file = dataset_path / "abox" / "triples.nt"
data_triples = Graph()
data_triples.parse(triples_file)

# Every node appearing in subject or object position counts as an individual.
individuals = set(data_triples.subjects()).union(data_triples.objects())

print("Len Individuals", len(individuals))


In [None]:
# Individuals that the ontology also declares as owl:Class.
ind_classes = {
    node for node in individuals if (node, RDF.type, OWL.Class) in yago_ontology
}

print("Individuals that are Classes:", len(ind_classes), ind_classes)

In [None]:
# Distinct predicates used in the ABox triples.
preds = set(data_triples.predicates())

print("Predicates", len(preds))

# Report predicates whose ontology declaration does not match their use in
# the ABox: declared as datatype property, or not declared as object property.
for pred in preds:
    is_data_prop = (pred, RDF.type, OWL.DatatypeProperty) in yago_ontology
    is_obj_prop = (pred, RDF.type, OWL.ObjectProperty) in yago_ontology

    if is_data_prop:
        print("This property is incorreclty defined as DATAPROP", pred)

    if not is_obj_prop:
        print("Missing OBJPROP definition", pred)

In [None]:
# Vocabulary terms that must never be treated as dataset classes/properties.
# Declared here because this cell runs before the later cell that defines
# BUILTIN_URI; without it a top-to-bottom run stops with a NameError.
# Keep both definitions identical.
BUILTIN_URI = {
    URIRef("http://schema.org/Thing"),
    OWL.Thing,
    OWL.Nothing,
    OWL.NamedIndividual,
    OWL.Class,
    OWL.topObjectProperty,
    OWL.bottomObjectProperty,
    RDF.type,
    RDFS.domain,
    RDFS.range,
    OWL.ObjectProperty,
    OWL.Restriction,
    OWL.DatatypeProperty,
    RDFS.Literal,
}

# Object properties declared in the ontology, minus the built-in terms.
yago_obj_props = set(yago_ontology.subjects(RDF.type, OWL.ObjectProperty)) - BUILTIN_URI

print("Valid Obj Props:", len(yago_obj_props))

In [None]:
# Clean every ABox file: drop triples that touch an individual which is also a
# class, and triples whose predicate is not a declared object property.
for file in [
    dataset_path / "abox" / "splits" / "train.nt",
    dataset_path / "abox" / "splits" / "test.nt",
    dataset_path / "abox" / "triples.nt",
    dataset_path / "abox" / "splits" / "valid.nt",
]:
    out_graph = Graph()
    out_graph.parse(file)

    # Keep the untouched input next to the output as "<stem>_legacy.nt".
    new_file = file.with_name(file.stem + '_legacy' + file.suffix)
    file.rename(new_file)

    # Iterate over a snapshot: removing triples from a graph while iterating
    # over that same graph mutates the store under the active iterator.
    for s, p, o in list(out_graph):
        if s in ind_classes or o in ind_classes or p not in yago_obj_props:
            print("Removing", s, p, o)
            out_graph.remove((s, p, o))

    # NOTE(review): the result is written to "<stem>_cleaned.nt" while later
    # cells read "<stem>.nt" -- confirm whether a manual rename is expected.
    out_graph.serialize(file.with_name(file.stem + "_cleaned" + file.suffix), format="nt")

In [None]:
# Release the large objects created by the cleaning step.
del individuals, out_graph, ind_classes, data_triples

# [O] Machine Learning Ready Compatibility Check

In [None]:
# Export each split as a tab-separated file (subject, predicate, object).
for file in [
    dataset_path / "abox" / "splits" / "train.nt",
    dataset_path / "abox" / "splits" / "test.nt",
    dataset_path / "abox" / "splits" / "valid.nt",
]:
    # Parse first so a failing parse does not leave a truncated TSV behind.
    graph = Graph()
    graph.parse(file)

    # Explicit UTF-8: IRIs may contain non-ASCII characters and the platform
    # default encoding is not guaranteed to handle them.
    with open(file.with_suffix(".tsv"), "w", encoding="utf-8") as tsv_out:
        tsv_out.writelines(f"{str(s)}\t{str(p)}\t{str(o)}\n" for s, p, o in graph)


In [None]:
from pykeen.triples import TriplesFactory
import numpy as np

splits_dir = dataset_path / "abox" / "splits"
train_path = splits_dir / "train.tsv"
valid_path = splits_dir / "valid.tsv"
test_path = splits_dir / "test.tsv"

train = TriplesFactory.from_path(train_path)

# Validation and test reuse the id mappings built from the training split.
shared_id_maps = {
    "entity_to_id": train.entity_to_id,
    "relation_to_id": train.relation_to_id,
}
valid = TriplesFactory.from_path(valid_path, **shared_id_maps)
test = TriplesFactory.from_path(test_path, **shared_id_maps)

print(train, test, valid, sep="\n")

In [None]:
from pykeen.triples.leakage import unleak

# Run pykeen's leakage removal over the three splits (training split first).
train_unleak, valid_unleak, test_unleak = unleak(
    train,
    valid,
    test,
    n=None,
    minimum_frequency=0.97,
)

In [None]:
# Summaries of the splits after leakage removal, one per line.
print(train_unleak, test_unleak, valid_unleak, sep="\n")


In [None]:
targets = [
    (dataset_path / "abox/splits/train", train_unleak.triples),
    (dataset_path / "abox/splits/valid", valid_unleak.triples),
    (dataset_path / "abox/splits/test", test_unleak.triples)
]


for path, split in targets:
    out_graph = Graph()
    for triple in split:
        s = URIRef(triple[0])
        p = URIRef(triple[1])
        o = URIRef(triple[2])
        out_graph.add((URIRef(s), URIRef(p), URIRef(o)))

    out_graph.serialize(path.with_suffix(".nt"), format="nt")

!cat {dataset_path}/abox/splits/*.nt > {dataset_path}/abox/triples.nt

# [R] ABOX Individuals and Class Assertions

In [None]:
# Reload the ABox triples to collect the individuals they mention.
data_triples = Graph()
data_triples.parse(dataset_path / "abox" / "triples.nt")

# Union of all subject and object nodes.
individuals = {node for subj, _, obj in data_triples for node in (subj, obj)}

print("Len Individuals", len(individuals))
del data_triples

In [None]:
# Declare every collected individual as an owl:NamedIndividual.
out_graph = Graph()

for individual in individuals:
    declaration = (individual, RDF.type, OWL.NamedIndividual)
    out_graph.add(declaration)

individuals_target = dataset_path / "abox" / "individuals"
serialize(out_graph, individuals_target)
del out_graph

In [None]:
# Load the YAGO type statements used as the source of class assertions.
types_file = onto_path / "yago-wd-full-types.nt"
ca_graph = Graph()
ca_graph.parse(types_file)

### [BASE] RDF Lib Class Assertions

In [None]:
out_graph = Graph()

# `REASONED` is not defined anywhere in this notebook, so referencing it
# directly raises a NameError. Honour it when present; otherwise fall back to
# MATERIALIZE.
# NOTE(review): MATERIALIZE as the fallback is an assumption -- confirm.
add_superclasses = globals().get("REASONED", MATERIALIZE)

for ind in individuals:
    # Asserted types of the individual, without built-in vocabulary terms.
    for ca in set(ca_graph.objects(ind, RDF.type)) - BUILTIN_URI:
        if (ca, RDF.type, OWL.Class) not in yago_ontology:
            print(f"Not a class {ca}")
            continue

        out_graph.add((ind, RDF.type, ca))
        if add_superclasses:
            # Also assert the superclasses the ontology states for `ca`.
            for sup_c in set(yago_ontology.objects(ca, RDFS.subClassOf)) - BUILTIN_URI:
                out_graph.add((ind, RDF.type, sup_c))

serialize(out_graph, dataset_path / "abox" / "class_assertions")

### [REASONED] Reasoning Class Assertions

In [None]:
# Collect the asserted (not yet reasoned) class memberships of the individuals.
out_graph = Graph()

for ind in individuals:
    asserted_types = set(ca_graph.objects(ind, RDF.type)) - BUILTIN_URI
    for ca in asserted_types:
        # Only types the ontology declares as owl:Class are kept.
        if (ca, RDF.type, OWL.Class) not in yago_ontology:
            print(f"Not a class {ca}")
            continue
        out_graph.add((ind, RDF.type, ca))

serialize(out_graph, dataset_path / "abox" / "unreasoned_class_assertions")

del out_graph, ca_graph

In [None]:
!java -Xmx16G -jar /home/navis/robot/robot.jar merge -vvv \
    --input {dataset_path / "abox" / "unreasoned_class_assertions.owl"} \
    --input {yago4_path / "yago_satisfiable_reasoned.owl"} \
    --output {dataset_path / "abox" / "intermediate_abox_tbox.owl"}

In [None]:
properties = [
    #"SubClass",
    #"EquivalentClass",
    #"DisjointClasses",
    #"DataPropertyCharacteristic",
    #"EquivalentDataProperties",
    #"SubDataProperty",
    "ClassAssertion",
    #"PropertyAssertion",
    #"EquivalentObjectProperty",
    #"InverseObjectProperties",
    #"ObjectPropertyCharacteristic",
    #"SubObjectProperty",
    #"ObjectPropertyRange",
    #"ObjectPropertyDomain"
]

prop_string = ""
for p in properties:
    prop_string += " " + p


!java -Xmx20G -jar /home/navis/robot/robot.jar reason -vvv \
  --reasoner HermiT \
  --create-new-ontology true \
  --input {dataset_path / "abox" / "intermediate_abox_tbox.owl"} \
  --output {dataset_path / "abox" / "inferred_class_assertions.owl"} \
  --axiom-generators "{prop_string}" \
  --remove-redundant-subclass-axioms false \
  --exclude-tautologies structural \
  --include-indirect true \
  -D {dataset_path / "class_assertions_debug.owl"}

In [None]:
# Combine asserted and inferred class assertions in a single graph.
ca = Graph()
abox_dir = dataset_path / "abox"
ca.parse(abox_dir / "unreasoned_class_assertions.owl")
ca.parse(abox_dir / "inferred_class_assertions.owl")

In [None]:
# Keep only the rdf:type statements whose subject is a dataset individual.
out_graph = Graph()

for type_triple in ca.triples((None, RDF.type, None)):
    if type_triple[0] not in individuals:
        continue
    out_graph.add(type_triple)

serialize(out_graph, dataset_path / "abox" / "class_assertions")

In [None]:
# Delete the intermediate files produced by the reasoning step.
abox_dir = dataset_path / "abox"
(abox_dir / "inferred_class_assertions.owl").unlink()
(abox_dir / "unreasoned_class_assertions.owl").unlink()
(abox_dir / "intermediate_abox_tbox.owl").unlink()

In [None]:
# Drop the graphs that are no longer needed.
del out_graph, ca

# RBOX Roles Definition and Properties

In [None]:
# Concatenate the split files into the global ABox triple file.
# NOTE(review): the glob matches every .nt file in splits/, not only
# train/valid/test -- confirm no other .nt files are left in that directory.
!cat {dataset_path}/abox/splits/*.nt > {dataset_path}/abox/triples.nt

In [None]:
abox_dir = dataset_path / "abox"

# Property assertions of the dataset.
data_triples = Graph()
data_triples.parse(abox_dir / "triples.nt")

# Class assertions produced by the previous section.
class_assertions = Graph()
class_assertions.parse(abox_dir / "class_assertions.owl")

In [None]:
# NOTE(review): the two lines below are bare filesystem paths that were pasted
# into this code cell. They are not Python statements, so they are kept as
# comments for reference only.
# /home/navis/dev/semantic-web-datasets/datasets-base-backups/YAGO4-20-C-BASE/abox/class_assertions.owl
# /home/navis/dev/semantic-web-datasets/datasets-base-backup/YAGO4-20-C-BASE/abox/class_assertions.owl

In [None]:
# Predicates used in the ABox seed the property part of the extracted schema.
seed_obj_props = {pred for _, pred, _ in data_triples}
print("Seed Object Properties", len(seed_obj_props))

# Subjects typed as owl:Class in the class-assertion file seed the class part.
seed_classes = {
    subj for subj, _, _ in class_assertions.triples((None, RDF.type, OWL.Class))
}
print("Seed Classes", len(seed_classes))

In [None]:
from rdflib import Graph, URIRef, BNode, RDF, RDFS, OWL

# Vocabulary terms excluded when walking or filtering the ontology.
BUILTIN_URI = {
    # top-level / generic classes
    URIRef("http://schema.org/Thing"),
    OWL.Thing,
    OWL.Nothing,
    RDFS.Literal,
    # OWL meta-classes
    OWL.NamedIndividual,
    OWL.Class,
    OWL.Restriction,
    OWL.ObjectProperty,
    OWL.DatatypeProperty,
    # built-in properties
    OWL.topObjectProperty,
    OWL.bottomObjectProperty,
    RDF.type,
    RDFS.domain,
    RDFS.range,
}

# Provenance / annotation predicates.
# NOTE(review): this list is not referenced by the code visible in this
# notebook -- confirm whether it is still needed.
removal = [
    URIRef("http://www.w3.org/ns/prov#wasDerivedFrom"),
    RDFS.isDefinedBy,
    URIRef("http://www.w3.org/ns/prov#wasInfluencedBy"),
]



def extract_recursive_description(graph: Graph, elements: URIRef) -> Graph:
    """Copy the outgoing triples of the seeds and of everything reachable from them.

    Starting from ``elements`` (iterated and copied into a working set), every
    triple with a pending node as subject is added to the result. An object is
    queued for the same treatment when it is not a built-in term, has not been
    processed yet, and is either a blank node or declared in ``graph`` as an
    owl:Class, owl:ObjectProperty or owl:DatatypeProperty.

    Args:
        graph: Source graph to read triples from.
        elements: Iterable of seed nodes.

    Returns:
        A new graph holding all collected triples.
    """
    collected = Graph()
    pending = set(elements)
    visited = set()
    followed_types = (OWL.Class, OWL.ObjectProperty, OWL.DatatypeProperty)

    while pending:
        current = pending.pop()
        visited.add(current)

        print(f"Processing {current}")

        for triple in graph.triples((current, None, None)):
            collected.add(triple)
            target = triple[2]

            # Built-in terms and already processed nodes are never expanded.
            if target in BUILTIN_URI or target in visited:
                continue

            if isinstance(target, BNode) or any(
                (target, RDF.type, kind) in graph for kind in followed_types
            ):
                pending.add(target)

    return collected

# Extract the part of the ontology reachable from the seed classes and properties.
seed_elements = seed_classes | seed_obj_props
out_graph = extract_recursive_description(yago_ontology, seed_elements)

ontology_target = dataset_path / "ontology"
serialize(out_graph, ontology_target)


In [None]:
from rdflib import BNode

# Load the extracted ontology written by the previous cell.
ontology_file = dataset_path / "ontology.owl"
onto_graph = Graph()
onto_graph.parse(ontology_file)



def extract_description(graph: Graph, elem: URIRef) -> Graph:
    """Collect the description of one element, following blank nodes only.

    All triples with ``elem`` as subject are copied. Blank-node objects are
    expanded in the same way. For any object that is not a built-in term and
    has not been processed, its owl:Class / owl:ObjectProperty /
    owl:DatatypeProperty declaration is copied when ``graph`` contains it;
    named objects themselves are not expanded.

    Args:
        graph: Source graph to read triples from.
        elem: Node whose description is extracted.

    Returns:
        A new graph holding the collected triples.
    """
    description = Graph()
    pending = {elem}
    visited = set()
    declared_types = (OWL.Class, OWL.ObjectProperty, OWL.DatatypeProperty)

    while pending:
        current = pending.pop()
        visited.add(current)

        print(f"Processing {current}")

        for triple in graph.triples((current, None, None)):
            description.add(triple)
            target = triple[2]

            if target in BUILTIN_URI or target in visited:
                continue

            # Only anonymous nodes are walked further.
            if isinstance(target, BNode):
                pending.add(target)

            # Carry over the type declaration of the referenced entity.
            for kind in declared_types:
                declaration = (target, RDF.type, kind)
                if declaration in graph:
                    description.add(declaration)

    return description


# RBox: descriptions of all object properties, then of all datatype properties.
rbox_graph = Graph()
for property_kind in (OWL.ObjectProperty, OWL.DatatypeProperty):
    declared_props = set(onto_graph.subjects(RDF.type, property_kind)) - BUILTIN_URI
    for prop in declared_props:
        rbox_graph += extract_description(onto_graph, prop)


serialize(rbox_graph, dataset_path / "rbox" / "roles")


In [None]:
# Taxonomy: rdfs:subClassOf statements between non-built-in classes.
taxonomy_graph = Graph()

for c in set(onto_graph.subjects(RDF.type, OWL.Class)) - BUILTIN_URI:
    for s, p, o in onto_graph.triples((c, None, None)):
        if p != RDFS.subClassOf or o in BUILTIN_URI:
            continue

        taxonomy_graph.add((s, p, o))
        # An anonymous superclass needs its own description to be meaningful.
        if isinstance(o, BNode):
            taxonomy_graph += extract_description(onto_graph, o)

serialize(taxonomy_graph, dataset_path / "tbox" / "taxonomy")

In [None]:
# Schema: every statement about a named class except rdfs:subClassOf.
schema_graph = Graph()

for c in set(onto_graph.subjects(RDF.type, OWL.Class)) - BUILTIN_URI:
    if isinstance(c, BNode):
        continue

    for s, p, o in onto_graph.triples((c, None, None)):
        if p == RDFS.subClassOf:
            continue

        schema_graph.add((s, p, o))

        # Copy the rdf:type statements of the object as well.
        for elem in onto_graph.objects(o, RDF.type):
            schema_graph.add((o, RDF.type, elem))

        # Anonymous objects are expanded into their full description.
        if isinstance(o, BNode):
            print(f"Found BNODE in Triple {s, p, o}")
            schema_graph += extract_description(onto_graph, o)

serialize(schema_graph, dataset_path / "tbox" / "schema")

# Final Ontology and Knowledge Graph

In [None]:
# Merge the extracted ontology, the individual declarations, the property
# assertions and the class assertions into one knowledge-graph file.
!/home/navis/robot/robot merge \
--input  {dataset_path / "ontology.owl"} \
--input  {dataset_path / "abox" / "individuals.owl"} \
--input {dataset_path / "abox" / "triples.nt"} \
--input {dataset_path / "abox" / "class_assertions.owl"} \
--output {dataset_path / "knowledge_graph.owl"}