# Ocean InfoHub Graph Explorer

## Oxigraph explorer

<a href="https://githubtocolab.com/gleanerio/archetype/blob/master/networks/commons/notebooks/networkViz.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.png" alt="Open in Colab"/></a>

At present the test with the available graphs represents just over 10 million triples.

## Installs, imports and definitions


In [1]:
%%capture
!pip install -q minio
!pip install -q oxrdflib
!pip install -q ipysigma
!pip install -q kuzu
!pip install -q pyoxigraph
!pip install -q pygraphml

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)  ## remove pandas future warning
from minio import Minio
from urllib.request import urlopen
import kuzu
from ipysigma import Sigma
import os
import requests
from concurrent.futures import ThreadPoolExecutor
import networkx as nx
from lxml import etree
import io
import pyoxigraph
import re
from rdflib import Graph

from pygraphml import GraphMLParser
from pygraphml import Graph as GraphML

In [3]:
from ast import mod

# popper will convert nq to nt (via a simple hack), it will also convert http to https for schema.org prefixes
def popper(input):
    lines = input.splitlines()
    modified_lines = []

    for line in lines:
        newline = line.replace("http://schema.org", "https://schema.org")
        segments = newline.split(' ')

        if len(segments) > 3:
            segments.pop()   # Remove the last two segment
            segments.pop()
            new_line = ' '.join(segments) + ' .'
            modified_lines.append(new_line)


    # print(len(modified_lines))
    result_string = '\n'.join(modified_lines)
    # print(len(result_string))

    return(result_string)

# prefalign will convert http to https for schema.org prefixes
def prefalign(input):
    lines = input.splitlines()
    modified_lines = []

    regex = re.compile(r'[\r\n\t]+')

    for line in lines:
        new_line = line.replace("http://schema.org", "https://schema.org")
        new_string = re.sub(regex, ' ', new_line)
        modified_lines.append(new_string)


    # print(len(modified_lines))
    result_string = '\n'.join(modified_lines)
    # print(len(result_string))

    return(result_string)


def publicurls(client, bucket, prefix):
    urls = []
    objects = client.list_objects(bucket, prefix=prefix, recursive=True)
    for obj in objects:
        result = client.stat_object(bucket, obj.object_name)

        if result.size > 0:  #  how to tell if an objet   obj.is_public  ?????
            url = client.presigned_get_object(bucket, obj.object_name)
            # print(f"Public URL for object: {url}")
            urls.append(url)

    return urls

def download_file(url):
    """Downloads a remote file and handles potential errors."""
    try:
        response = requests.get(url, stream=True)  # Stream for efficient memory usage
        response.raise_for_status()  # Raise exception for error codes

        filename = url.split("/")[-1]
        local_filename = f"./data/{filename.replace('.nq', '.nt')}"  # Convert extension

        with open(local_filename, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024):  # Download in chunks
                f.write(chunk)

        print(f"Downloaded: {filename}")

    except requests.exceptions.RequestException as e:
        print(f"Download failed for {url}: {e}")

In [4]:
client = Minio("ossapi.oceaninfohub.org:80",  secure=False) # Create client with anonymous access.
urls = publicurls(client, "commons", "ODIS-KG-MAIN/18042024")
# for u in urls:
#   print(u)

## Copy files local for use

You could load over the network directory into the triplestore.  For other tools like Kuzu this is not currently an option so I have the code here to pull down the files first, then load them.

In [5]:
# Define the directory path
directory_path = "./data"

# Check if the directory exists and make it if it doesn't
if not os.path.exists(directory_path):
    os.makedirs(directory_path)

In [6]:
with ThreadPoolExecutor() as executor:
    for url in urls:
        if "_prov" not in url:
            executor.submit(download_file, url)

Downloaded: inanodc_release.nq
Downloaded: invemarvessels_release.nq
Downloaded: invemarinstitutions_release.nq
Downloaded: africaioc_release.nq
Downloaded: invemartraining_release.nq
Downloaded: marinetraining_release.nq
Downloaded: invemarexperts_release.nq
Downloaded: oceanscape_release.nq
Downloaded: oceanexperts_release.nq
Downloaded: medin_release.nq
Downloaded: euroceanevents_release.nq
Downloaded: wod_release.nq
Downloaded: pedp_release.nq
Downloaded: oceanexpert_release.nq
Downloaded: rda_release.nq
Downloaded: edmerp_release.nq
Downloaded: edmo_release.nq
Downloaded: euroceanexperts_release.nq
Downloaded: obis_release.nq
Downloaded: obps_release.nq
Downloaded: cioos_release.nq
Downloaded: pdh_release.nq
Downloaded: aquadocs_release.nq


## Conversion section if needed

In this section you can call code to either convert nq to nt and or align the various schema.org prefix values.

In [7]:
# Define the directory path
directory_path = "./converted"

# Check if the directory exists and make it if it doesn't
if not os.path.exists(directory_path):
    os.makedirs(directory_path)

In [8]:
dir = "./data"
output_dir = "./converted"
for f in os.listdir(dir):
  fp = os.path.join(dir, f)

  # prefalign
  pa = prefalign(open(fp, 'r').read())
  open(os.path.join(output_dir, f), 'w').write(pa)

## Load to Oxigraph

In [9]:
# store = pyoxigraph.Store()  #    store = pyoxigraph.Store(path="./store") # use path for disk store
diskstore = pyoxigraph.Store(path="./store")
mime_type = "application/n-quads"   # application/n-triples or application/n-quads if you are loading those from data raw

## Disk or Memory Store

In [10]:
dir = "./data"  # either of data or converted depending on what you did above

# Use either or memory or disk store below.
# Memory store is faster, but needs 24+ GB of memory
# Disk store is slow, but runs in 16+ GB of memory (aquadocs spike?)

# MEMORY STORE
# for f in os.listdir(dir):
#   fp = os.path.join(dir, f)
#   print(f"Loading: {fp}")
#   store.load(open(fp, 'rb'), mime_type, base_iri=None, to_graph=None)

# DISK STORE
for f in os.listdir(dir):
  fp = os.path.join(dir, f)
  print(f"Loading: {fp}")
  diskstore.load(open(fp, 'rb'), mime_type, base_iri=None, to_graph=None)

Loading: ./data/marinetraining_release.nt
Loading: ./data/euroceanexperts_release.nt
Loading: ./data/invemarinstitutions_release.nt
Loading: ./data/obps_release.nt
Loading: ./data/oceanexpert_release.nt
Loading: ./data/invemarexperts_release.nt
Loading: ./data/wod_release.nt
Loading: ./data/invemartraining_release.nt
Loading: ./data/edmo_release.nt
Loading: ./data/cioos_release.nt
Loading: ./data/euroceanevents_release.nt
Loading: ./data/africaioc_release.nt
Loading: ./data/pedp_release.nt
Loading: ./data/inanodc_release.nt
Loading: ./data/oceanexperts_release.nt
Loading: ./data/aquadocs_release.nt
Loading: ./data/oceanscape_release.nt
Loading: ./data/rda_release.nt
Loading: ./data/edmerp_release.nt
Loading: ./data/pdh_release.nt
Loading: ./data/obis_release.nt
Loading: ./data/medin_release.nt
Loading: ./data/invemarvessels_release.nt


## SPARQL Query sections

In [11]:
qtest = """	PREFIX schema: <https://schema.org/>
SELECT ?s
WHERE {
  graph ?g {
     ?s ?p ?o  .
     }
 } LIMIT 10
"""

q1 = list(diskstore.query(qtest))

In [12]:
print(qr)

NameError: name 'qr' is not defined

In [None]:
qtypetype = """	PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX schema: <https://schema.org/>

SELECT DISTINCT ?source ?type ?target ?sType ?tType
WHERE {
    ?source a ?sType .
    ?target a ?tType .
    ?source ?type ?target .
}
"""

q2 = list(storeobdisk.query(qtypetype))