# ODIS / OIH Property Graph Explorer

In process graph OLAP with KuzuDB



## Installs, imports and definitions

For all the OLAP approaches, connecting with DuckDB might be useful, as might
IBIS (https://ibis-project.org/).  

### Try
* https://kuzudb.com/docusaurus/blog/llms-graphs-part-1/
* https://kuzudb.com/docusaurus/blog/transforming-your-data-to-graphs-2/

In [1]:
# %%capture
# !pip install -q minio
# !pip install -q oxrdflib
# !pip install -q ipysigma
# !pip install -q kuzu
# !pip install -q chocolate >= 0.0.2
# !pip install -q icecream >= 2.1
# !pip install -q pandas >= 2.2
# !pip install -q pyshacl >= 0.25
# !pip install -q rdflib >= 7.0

In [12]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)  ## remove pandas future warning
from minio import Minio
from urllib.request import urlopen
import kuzu
from ipysigma import Sigma
import os
import requests
from concurrent.futures import ThreadPoolExecutor
import networkx as nx
from lxml import etree

In [13]:
def publicurls(client, bucket, prefix):
    urls = []
    objects = client.list_objects(bucket, prefix=prefix, recursive=True)
    for obj in objects:
        result = client.stat_object(bucket, obj.object_name)

        if result.size > 0:  #  how to tell if an objet   obj.is_public  ?????
            url = client.presigned_get_object(bucket, obj.object_name)
            # print(f"Public URL for object: {url}")
            urls.append(url)

    return urls

def download_file(url):
    """Downloads a remote file and handles potential errors."""
    try:
        response = requests.get(url, stream=True)  # Stream for efficient memory usage
        response.raise_for_status()  # Raise exception for error codes

        filename = url.split("/")[-1]
        local_filename = f"./data/{filename.replace('.nq', '.nt')}"  # Convert extension

        with open(local_filename, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024):  # Download in chunks
                f.write(chunk)

        print(f"Downloaded: {filename}")

    except requests.exceptions.RequestException as e:
        print(f"Download failed for {url}: {e}")

In [14]:
client = Minio("ossapi.oceaninfohub.org:80",  secure=False) # Create client with anonymous access.
urls = publicurls(client, "commons", "OIH-KG/21022024/nt")
# for u in urls:
#   print(u)

## Copy files local for use

In [5]:
# make the directory ./data if it doesn't exist
# Define the directory path
directory_path = "./data"

# Check if the directory exists
if not os.path.exists(directory_path):
    # Create the directory
    os.makedirs(directory_path)

In [6]:
with ThreadPoolExecutor() as executor:
    for url in urls:
        if "_prov" not in url:
            executor.submit(download_file, url)

Downloaded: inanodc_release.nt
Downloaded: invemarvessels_release.nt
Downloaded: invemarinstitutions_release.nt
Downloaded: africaioc_release.nt
Downloaded: invemarexperts_release.nt
Downloaded: bmdc_release.nt
Downloaded: emodnet_release.nt
Downloaded: edmo_release.nt
Downloaded: invemartraining_release.nt
Downloaded: nmdis_release.nt
Downloaded: oceanscape_release.nt
Downloaded: obps_release.nt
Downloaded: pedp_release.nt
Downloaded: edmerp_release.nt
Downloaded: oceanexpert_release.nt
Downloaded: rda_release.nt
Downloaded: marinetraining_release.nt
Downloaded: cioos_release.nt
Downloaded: invemardocuments_release.nt
Downloaded: obis_release.nt
Downloaded: medin_release.nt
Downloaded: pdh_release.nt


## Load all in ./data to Kuzu


In [7]:
db = kuzu.Database('./kuzu')
conn = kuzu.Connection(db)
conn.execute("CREATE RDFGraph UniKG;")

<kuzu.query_result.QueryResult at 0x7a72493daa10>

In [8]:
file_names = os.listdir("./data")
file_paths = [("./data/{}".format(file_name)) for file_name in file_names]
conn.execute("COPY UniKG FROM {};".format(file_paths))

<kuzu.query_result.QueryResult at 0x7a72493db130>

In [9]:
r = conn.execute("MATCH (s)-[p:UniKG]->(o) RETURN count(*);")
while r.has_next():
    print(r.get_next())

[7320341]


## Subset load

In [10]:
db2 = kuzu.Database('./kuzu')
conn2 = kuzu.Connection(db2)
conn2.execute("CREATE RDFGraph UniKG;")

RuntimeError: Binder exception: UniKG already exists in catalog.

In [None]:
ss = ['./data/obis_release.nt', './data/obps_release.nt', "./data/oceanexpert_release.nt"]
conn2.execute("COPY UniKG FROM {};".format(ss))

In [None]:
r = conn2.execute("MATCH (s)-[p:UniKG]->(o) RETURN count(*);")
while r.has_next():
    print(r.get_next())

## Kuzu to Visilization

I can same results to networkx, which can same to graphml for use with sigm

In [None]:
from google.colab import output
output.enable_custom_widget_manager()

In [None]:
c1 = """MATCH (s)-[p:UniKG {iri: "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"}]-> (st),
(t)-[p2:UniKG {iri: "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"}]-> (tt),
(s)-[p3:UniKG_rt]->(t)

RETURN s.iri, t.iri, p3.iri  LIMIT 10000
"""

c2 = """MATCH (s)-[p:UniKG {iri: "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"}]-> (st),
(t)-[p2:UniKG {iri: "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"}]-> (tt),
(s)-[p3:UniKG_rt]->(t)
RETURN COUNT(*)
"""

c3 = """MATCH (s)-[p:UniKG {iri: "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"}]-> (st),
(t)-[p2:UniKG {iri: "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"}]-> (tt),
(s)-[p3:UniKG_rt]->(t)
RETURN s, t, p3 LIMIT 100000
"""

In [None]:
r = conn.execute(c3)

In [None]:
g = r.get_as_networkx(directed=False)

In [None]:
# sigma can work with a networkx graph to begin with
Sigma(
    g,
    node_size=g.degree,
    default_edge_type='curve',
    node_border_color_from='node',
    node_metrics=['louvain'],
    node_color='louvain',
    start_layout=5,
    edge_size=lambda u, v: g.degree(u) + g.degree(v),
    edge_size_range=(0.5, 5),
    label_font='cursive',
    node_label_size=g.degree,
    label_density=0
)