# ODIS / OIH RDF Graph Explorer

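In-process RDF: download the OIH release graphs, load them into an in-memory Oxigraph store, query them with SPARQL, and visualize the result as a network.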


## Installs, imports and definitions

### Resources to try
* https://kuzudb.com/docusaurus/blog/llms-graphs-part-1/
* https://kuzudb.com/docusaurus/blog/transforming-your-data-to-graphs-2/

In [1]:
%%capture
# Install the third-party packages this notebook relies on.
# %%capture (which must stay the first line of the cell) hides pip's output.
!pip install -q minio
!pip install -q oxrdflib
!pip install -q ipysigma
# kuzu: upgrade to the newest release, pre-releases included (--pre).
!pip install --upgrade --pre kuzu
!pip install -q pyoxigraph
!pip install -q pygraphml

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)  ## remove pandas future warning
from minio import Minio
from urllib.request import urlopen
import kuzu
from ipysigma import Sigma
import os
import requests
from concurrent.futures import ThreadPoolExecutor
import networkx as nx
from lxml import etree
import io
import pyoxigraph
from rdflib import Graph

from pygraphml import GraphMLParser
from pygraphml import Graph as GraphML

In [3]:
def publicurls(client, bucket, prefix):
    """Return a GET URL for every non-empty object stored under ``prefix``.

    Args:
        client: MinIO client (or any object exposing ``list_objects``,
            ``stat_object`` and ``presigned_get_object`` with the same
            call signatures).
        bucket: Name of the bucket to list.
        prefix: Key prefix to list; the listing is recursive.

    Returns:
        A list of URLs, in listing order, one per object whose size is
        greater than zero.

    Note:
        Nothing here checks that an object is publicly readable; each URL is
        simply what ``client.presigned_get_object`` returns for the object.
    """
    urls = []
    for obj in client.list_objects(bucket, prefix=prefix, recursive=True):
        # The listing entry normally carries the object's size already, so
        # only pay for a per-object stat round trip when it is missing.
        size = getattr(obj, "size", None)
        if size is None:
            size = client.stat_object(bucket, obj.object_name).size

        if size > 0:
            urls.append(client.presigned_get_object(bucket, obj.object_name))

    return urls

def download_file(url, timeout=60):
    """Download ``url`` into ``./data`` and report the outcome on stdout.

    The local file name is the last path segment of the URL (any query string
    or fragment is ignored) with every ``.nq`` replaced by ``.nt``. The
    ``./data`` directory must already exist.

    Args:
        url: HTTP(S) URL of the file to fetch.
        timeout: Seconds to wait for the connection and for each read before
            giving up; passed straight to ``requests.get``.

    Returns:
        None. Network and file-system failures are printed rather than
        raised, so one bad URL does not abort a batch of downloads.
    """
    # A signed URL carries its signature in the query string; keep only the
    # path so none of that ends up in the local file name.
    path = url.split("#", 1)[0].split("?", 1)[0]
    filename = path.split("/")[-1]
    # Only the name is changed here; the downloaded bytes are written as-is.
    local_filename = f"./data/{filename.replace('.nq', '.nt')}"

    try:
        # Stream so the body never has to fit in memory; the context manager
        # releases the connection even when an error interrupts the transfer.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise exception for error codes

            with open(local_filename, "wb") as f:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
    except (requests.exceptions.RequestException, OSError) as e:
        print(f"Download failed for {url}: {e}")
    else:
        print(f"Downloaded: {filename}")

In [4]:
# Anonymous MinIO client: no credentials are supplied, and secure=False
# means the endpoint is contacted without TLS.
client = Minio("ossapi.oceaninfohub.org:80", secure=False)

# URLs of the non-empty objects under the 21022024 "nt" release prefix.
urls = publicurls(client, bucket="commons", prefix="OIH-KG/21022024/nt")
# Uncomment to inspect them:
# for u in urls:
#   print(u)

## Copy files locally for use

In [5]:
# Local directory that receives the downloaded release files.
directory_path = "./data"

# exist_ok=True makes the cell safe to re-run and removes the window between
# "check whether it exists" and "create it".
os.makedirs(directory_path, exist_ok=True)

In [6]:
# Fetch the release files in parallel, skipping any URL that contains "_prov"
# (presumably the provenance graphs; verify against the bucket layout).
with ThreadPoolExecutor() as executor:
    pending = {
        executor.submit(download_file, url): url
        for url in urls
        if "_prov" not in url
    }

    # A Future holds on to an exception raised in its worker until someone
    # asks for it, so ask: otherwise any error download_file does not handle
    # itself would disappear without a trace.
    for future, url in pending.items():
        error = future.exception()
        if error is not None:
            print(f"Download failed for {url}: {error}")

Downloaded: inanodc_release.nt
Downloaded: invemarvessels_release.nt
Downloaded: invemarinstitutions_release.nt
Downloaded: invemartraining_release.nt
Downloaded: africaioc_release.nt
Downloaded: bmdc_release.nt
Downloaded: invemarexperts_release.nt
Downloaded: emodnet_release.nt
Downloaded: edmo_release.nt
Downloaded: nmdis_release.ntDownloaded: marinetraining_release.nt

Downloaded: oceanscape_release.nt
Downloaded: edmerp_release.nt
Downloaded: cioos_release.nt
Downloaded: pedp_release.nt
Downloaded: obps_release.nt
Downloaded: obis_release.nt
Downloaded: invemardocuments_release.nt
Downloaded: oceanexpert_release.nt
Downloaded: rda_release.nt
Downloaded: medin_release.nt
Downloaded: pdh_release.nt


## Load to Oxigraph

In [7]:
# Serialization of the files loaded into the store (N-Triples).
mime_type = "application/n-triples"

# Store created without a path; pyoxigraph.Store(path="./store") is the
# alternative when the data should be kept on disk.
store = pyoxigraph.Store()

In [8]:
# Load every downloaded file into the store.
data_dir = "./data"  # deliberately not `dir`, which would shadow the builtin
for name in os.listdir(data_dir):
    fp = os.path.join(data_dir, name)
    if not os.path.isfile(fp):
        continue  # e.g. a sub-directory; open() would fail on it

    # Pass the open binary stream to the store so the file does not have to be
    # read into one Python string first; `with` closes the handle afterwards.
    with open(fp, "rb") as src:
        store.load(src, mime_type, base_iri=None, to_graph=None)

In [9]:
# rq1: every distinct (source, predicate, target) link in which both ends
# have at least one rdf:type (`a` is SPARQL shorthand for rdf:type), together
# with those types. The predicate is bound to ?type.
rq1 = """	PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX schema: <https://schema.org/>

SELECT DISTINCT ?source ?type ?target ?sType ?tType
WHERE {
    ?source a ?sType .
    ?target a ?tType .
    ?source ?type ?target .
}

"""

# rq2: subjects of the first 100 triples matched.
rq2 = 'SELECT ?s WHERE { ?s ?p ?o } LIMIT 100'

# rq3: the schema:contentUrl of every schema:distribution.
rq3 = """	PREFIX schema: <https://schema.org/>
SELECT ?o
WHERE {
     ?s schema:distribution ?d .
     ?d schema:contentUrl ?o .
 }
"""


In [10]:
# Run the node/link query (rq1) and materialize all result rows in a list.
qr = list(store.query(rq1))

In [11]:
# print(len(qr))
# for r in qr[0:20]:
#   # h = is_url_reachable(r['o'].value)
#   # d = is_url_downloadable(r['o'].value)
#   h = "test;"
#   d = "test;"
#   print("{} {} {}".format(r['source'].value, r['target'].value, r['type'].value))

In [None]:
# Build an undirected pygraphml graph from the query result: one edge per
# result row, linking the row's ?source and ?target.
g = GraphML()
g.directed = False

# One node per distinct resource. The result rows repeat the same source and
# target many times, and calling add_node for each row would add a node per
# row instead of per resource.
nodes_by_label = {}

for r in qr:
    endpoints = []
    for var in ("source", "target"):
        # Use the term's string value as the label rather than the pyoxigraph
        # term object itself.
        label = r[var].value
        node = nodes_by_label.get(label)
        if node is None:
            node = nodes_by_label[label] = g.add_node(label)
        endpoints.append(node)

    # The predicate (r['type']) is not recorded on the edge.
    g.add_edge(*endpoints)

In [None]:
# The custom widget manager only exists, and is only needed, on Google Colab.
# Anywhere else the import fails, and the notebook should simply carry on.
try:
    from google.colab import output
except ImportError:
    pass
else:
    output.enable_custom_widget_manager()

In [None]:
# Serialize the pygraphml graph `g` to a GraphML file in the working directory.
fname = "testGraphML.graphml"
parser = GraphMLParser()
parser.write(g, fname)

In [None]:
# Read the GraphML file back with networkx. Note that `g` is rebound here:
# from this point on it is the networkx graph, not the pygraphml one.
# The file name must match the one used when the file was written.
g = nx.read_graphml("testGraphML.graphml")

In [None]:
# Show the networkx graph as an ipysigma widget. This call is the last
# expression of the cell, so its result is what the notebook displays.
Sigma(
    g,
    node_size=g.degree,  # node size follows node degree
    default_edge_type='curve',
    node_border_color_from='node',
    node_metrics=['louvain'],  # presumably has ipysigma compute Louvain communities; confirm in its docs
    node_color='louvain',  # color nodes by the metric requested above
    start_layout=5,  # NOTE(review): looks like "run the layout for 5 seconds"; confirm in ipysigma docs
    edge_size=lambda u, v: g.degree(u) + g.degree(v),  # sum of the two endpoint degrees
    edge_size_range=(0.5, 5),
    label_font='cursive',
    node_label_size=g.degree,  # label size follows node degree as well
    label_density=0
)