In [None]:
import os
import shutil
import subprocess
import sys
from pathlib import Path

# Make the repository root importable before the project-local imports below.
sys.path.append(str(Path.cwd().parent.parent))

from rdflib import Graph

import kgsaf_jdex.utils.conventions.paths as pc
from kgsaf_jdex.utils.conversion import OWLConverter

# UNZIP Compressed Datasets

In [None]:
# Unpack every dataset archive found directly under datasets/base and
# datasets/materialize into a sibling "unpack/<archive stem>" folder.
home_path = Path.cwd().parent.parent.resolve()
data_path = home_path / "kgsaf_data" / "datasets"

# Filename suffixes that shutil knows how to unpack (e.g. ".zip", ".tar.gz").
# Files without one of these suffixes would make unpack_archive raise
# ReadError("Unknown archive format") and abort the whole loop.
archive_suffixes = tuple(
    suffix for _, suffixes, _ in shutil.get_unpack_formats() for suffix in suffixes
)

for d_type in [data_path / "base", data_path / "materialize"]:
    print(f"Unpacking Datasets in {d_type}")
    unpack_root = d_type / "unpack"
    unpack_root.mkdir(exist_ok=True, parents=True)
    for elem in d_type.iterdir():
        # Sub-directories (including "unpack" itself) are not archives.
        if not elem.is_file():
            continue
        if not elem.name.endswith(archive_suffixes):
            print(f"\tSkipping {elem.name}: not a supported archive")
            continue

        print(f"\tUnpacking Dataset {elem.name}")
        target_folder = unpack_root / elem.stem

        # Start from a clean folder so files from a previous run do not linger.
        if target_folder.exists():
            shutil.rmtree(target_folder)

        shutil.unpack_archive(str(elem), str(target_folder))

# UNZIP Compressed Ontologies (Schema)

In [None]:
# Unpack every ontology archive found directly under kgsaf_data/ontologies
# into the shared "unpack" folder next to the archives.
home_path = Path.cwd().parent.parent.resolve()
data_path = home_path / "kgsaf_data" / "ontologies"

# Filename suffixes that shutil knows how to unpack (e.g. ".zip", ".tar.gz").
# Files without one of these suffixes would make unpack_archive raise
# ReadError("Unknown archive format") and abort the whole loop.
archive_suffixes = tuple(
    suffix for _, suffixes, _ in shutil.get_unpack_formats() for suffix in suffixes
)

print(f"Unpacking Ontologies in {data_path}")
target_folder = data_path / "unpack"
target_folder.mkdir(exist_ok=True, parents=True)

for elem in data_path.iterdir():
    # Sub-directories (including "unpack" itself) are not archives.
    if not elem.is_file():
        continue
    if not elem.name.endswith(archive_suffixes):
        print(f"\tSkipping {elem.name}: not a supported archive")
        continue

    print(f"\tUnpacking Ontology {elem.name}")

    # Remove only the stale entry belonging to THIS archive. Deleting the
    # whole shared "unpack" folder here would also erase ontologies that
    # were extracted earlier in this same loop.
    previous = target_folder / elem.stem
    if previous.is_dir():
        shutil.rmtree(previous)
    elif previous.exists():
        previous.unlink()

    shutil.unpack_archive(str(elem), str(target_folder))

# Object Property Assertion Re-Merge

In [None]:
# Rebuild abox/obj_prop_assertions.nt for every unpacked dataset by
# concatenating the N-Triples files stored in abox/splits/.
home_path = Path.cwd().parent.parent.resolve()
data_path = home_path / "kgsaf_data" / "datasets"

for d_type in [data_path / "base" / "unpack", data_path / "materialize" / "unpack"]:
    print(f"Merging Object Assertions for Datasets in {d_type}")
    for data_folder in d_type.iterdir():
        if not data_folder.is_dir():
            continue
        print(f"\t Merging Dataset {data_folder.name}")

        abox_dir = data_folder / "abox"
        if not abox_dir.is_dir():
            print(f"\t\tSkipping {data_folder.name}: no abox directory found")
            continue

        # Same selection a shell "*.nt" glob makes: regular, non-hidden files,
        # processed in sorted order so the output is reproducible.
        split_files = sorted(
            candidate
            for candidate in (abox_dir / "splits").glob("*.nt")
            if candidate.is_file() and not candidate.name.startswith(".")
        )
        if not split_files:
            print(f"\t\tNo split files found in {abox_dir / 'splits'}")

        # Binary mode keeps the triples byte-for-byte, independent of locale.
        with (abox_dir / "obj_prop_assertions.nt").open("wb") as merged:
            for split_file in split_files:
                last_line = b""
                with split_file.open("rb") as source:
                    for line in source:
                        merged.write(line)
                        last_line = line
                # N-Triples is line based: make sure the last triple of one
                # file is not fused with the first triple of the next one.
                if last_line and not last_line.endswith(b"\n"):
                    merged.write(b"\n")

# Full Knowledge Graph Re-Merge

In [None]:
# Merge the schema and the ABox files of every unpacked dataset into a single
# knowledge_graph.owl using the ROBOT command line tool.
#
# The ROBOT executable can be overridden through the ROBOT_PATH environment
# variable; the previous hard-coded location is kept as the default.
robot_path = os.environ.get("ROBOT_PATH", "/home/navis/robot/robot")
home_path = Path.cwd().parent.parent.resolve()
data_path = home_path / "kgsaf_data" / "datasets"

for d_type in [data_path / "base" / "unpack", data_path / "materialize" / "unpack"]:
    print(f"Merging Full KG for Datasets in {d_type}")
    for data_folder in d_type.iterdir():
        if not data_folder.is_dir():
            continue
        print(f"\t Merging Dataset {data_folder.name}")

        abox_dir = data_folder / "abox"
        # Argument list (no shell): paths containing spaces or shell
        # metacharacters are passed to ROBOT unchanged.
        command = [
            str(robot_path),
            "merge",
            "--input", str(data_folder / "ontology.owl"),
            "--input", str(abox_dir / "individuals.owl"),
            "--input", str(abox_dir / "obj_prop_assertions.nt"),
            "--input", str(abox_dir / "class_assertions.owl"),
            "--output", str(data_folder / "knowledge_graph.owl"),
        ]

        try:
            result = subprocess.run(command, capture_output=True, text=True)
        except OSError as exc:
            # Executable missing or not runnable: report and keep going,
            # as the shell escape did.
            print(f"\t\tCould not launch ROBOT ({robot_path}): {exc}")
            continue

        # Re-emit the tool output so it shows up in the notebook cell.
        if result.stdout:
            print(result.stdout, end="")
        if result.stderr:
            print(result.stderr, end="")
        if result.returncode != 0:
            print(
                f"\t\tROBOT merge failed for {data_folder.name} "
                f"(exit code {result.returncode})"
            )

# NTriples to PyKEEN TSV Conversion

In [None]:
class TSVConverter:
    """Convert the RDF object-property assertions of one dataset folder to TSV.

    Usage is two-step: ``convert()`` parses the RDF files and keeps the
    resulting TSV text in memory, ``serialize()`` writes that text to disk.

    Attributes:
        base_path: Absolute path of the dataset folder.
        p_data: Maps a key (``"triples"``, ``"train"``, ``"test"``,
            ``"valid"``) to a ``(tsv_text, output_path)`` tuple.
    """

    # Keys of ``p_data`` that ``serialize`` knows how to write.
    _TEXT_KEYS = ("triples", "train", "valid", "test")

    def __init__(
        self,
        path: "str | Path",
    ):
        """Store the dataset folder.

        Args:
            path: Dataset folder; anything accepted by ``pathlib.Path``.
        """
        self.p_data = dict()
        self.base_path = Path(path).resolve().absolute()

    def convert(
        self,
        triples: bool = True,
        splits: bool = True,
    ):
        """Parse the RDF inputs and stage their TSV form in ``p_data``.

        Args:
            triples: Convert ``abox/obj_prop_assertions.nt`` (full triple set).
            splits: Convert the train/test/valid files whose locations are
                taken from ``pc.RDF_TRAIN``, ``pc.RDF_TEST`` and
                ``pc.RDF_VALID``; outputs go to ``pc.TRAIN``, ``pc.TEST`` and
                ``pc.VALID``. All of them are joined onto ``base_path``.
        """
        if triples:
            self.p_data["triples"] = (
                self.preprocess_triples(self.base_path / "abox/obj_prop_assertions.nt"),
                self.base_path / "abox/obj_prop_assertions.tsv",
            )

        if splits:
            split_locations = (
                ("train", pc.RDF_TRAIN, pc.TRAIN),
                ("test", pc.RDF_TEST, pc.TEST),
                ("valid", pc.RDF_VALID, pc.VALID),
            )
            for key, rdf_location, tsv_location in split_locations:
                self.p_data[key] = (
                    self.preprocess_triples(self.base_path / rdf_location),
                    self.base_path / tsv_location,
                )

    def serialize(self):
        """Write every staged TSV text in ``p_data`` to its output path.

        Entries with an unrecognised key are skipped without touching their
        output file. Files are written as UTF-8 so that non-ASCII IRIs do not
        depend on the platform default encoding.
        """
        for key, (tsv_text, out_path) in self.p_data.items():
            if key not in self._TEXT_KEYS:
                continue
            with open(out_path, "w", encoding="utf-8") as f:
                f.write(tsv_text)

    def preprocess_triples(self, path):
        """Parse an RDF file and return it as tab-separated text.

        Args:
            path: Location of the RDF file, handed to ``rdflib.Graph.parse``.

        Returns:
            One ``subject<TAB>predicate<TAB>object`` line per triple, each
            terminated by a newline, in the graph's iteration order.
        """
        triples = Graph()
        triples.parse(path)
        # Single join instead of repeated string concatenation, which is
        # quadratic in the number of triples.
        return "".join(f"{str(s)}\t{str(p)}\t{str(o)}\n" for s, p, o in triples)

In [None]:
# Convert the N-Triples files of every unpacked dataset to PyKEEN-style TSV
# using the TSVConverter class defined in the previous cell.
home_path = Path.cwd().parent.parent.resolve()
data_path = home_path / "kgsaf_data" / "datasets"

for d_type in [data_path / "base" / "unpack", data_path / "materialize" / "unpack"]:
    print(f"Converting NTriples Datasets in {d_type}")
    for data_folder in d_type.iterdir():
        if not data_folder.is_dir():
            continue
        print(f"\t Converting Dataset {data_folder.name}")
        processor = TSVConverter(data_folder)
        # convert() stages the TSV text in memory, serialize() writes it out.
        processor.convert()
        processor.serialize()
    

# JSON OWL Conversion

In [None]:
# Run the project's OWLConverter over every unpacked dataset folder of both
# the "base" and the "materialize" variants.
home_path = Path.cwd().parent.parent.resolve()
data_path = home_path / "kgsaf_data" / "datasets"

unpack_roots = [data_path / variant / "unpack" for variant in ("base", "materialize")]

for d_type in unpack_roots:
    print(f"Conversion to JSON for Datasets in {d_type}")
    for data_folder in d_type.iterdir():
        # Only directories are dataset folders; plain files are ignored.
        if not data_folder.is_dir():
            continue
        print(f"\tProcessing Dataset {data_folder.name}")
        processor = OWLConverter(data_folder)
        processor.preprocess(verbose=False)
        processor.serialize()