# Graph Playground

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)  ## remove pandas future warning

import s3fs
import kglab
from minio import Minio
import rdflib
from rdflib import ConjunctiveGraph  #  needed for nquads

In [2]:
def publicurls(client, bucket, prefix):
    urls = []
    objects = client.list_objects(bucket, prefix=prefix, recursive=True)
    for obj in objects:
        result = client.stat_object(bucket, obj.object_name)

        if result.size > 0:  #  how to tell if an objet   obj.is_public  ?????
            url = client.presigned_get_object(bucket, obj.object_name)
            # print(f"Public URL for object: {url}")
            urls.append(url)

    return urls

In [3]:
# Check for using GPU, in case you want to ensure your GPU is used
gc = kglab.get_gpu_count()
print(gc)

0


In [4]:
client = Minio("ossapi.oceaninfohub.org:80",  secure=False) # Create client with anonymous access.
urls = publicurls(client, "public", "graph")

In [5]:
print(urls)

['http://ossapi.oceaninfohub.org/public/graphs/summonedafricaioc_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedaquadocs_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedcioos_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonededmerp_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonededmo_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedemodnet_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedinanodc_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedinvemardocuments_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedinvemarexperts_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedinvemarinstitutions_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedinvemartraining_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedinvemarvessels_v1_release.nq', 'http://ossapi.oceaninf

## URLs

At this point we have the URLs and we could either loop load all of them or pull one out manually and use.  This code could
be used as a basis for any of these approaches.


In [6]:
# load quad graph
g = ConjunctiveGraph()
g.parse("http://ossapi.oceaninfohub.org/public/graphs/summonedobis_v1_release.nq", format="nquads")
print(len(g))

161165


In [7]:
namespaces = {
    "shacl":   "http://www.w3.org/ns/shacl#" ,
    "schmea":   "https://schema.org/" ,
    "geo":      "http://www.opengis.net/ont/geosparql#",
}

kg = kglab.KnowledgeGraph(name = "OIH test", base_uri = "https://oceaninfohub.org/id/", namespaces = namespaces, import_graph = g)

In [8]:
sparql = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>


SELECT ?p (COUNT(?p) as ?count)
WHERE
{
  ?s ?p ?o .
}
GROUP BY ?p ORDER BY DESC(?count)
"""

pdf = kg.query_as_df(sparql)
# df = pdf   # .to_pandas()  #  breaks with papermill for reasons unknown at this time if to_pandas() is used, needed in my kglab conda env


In [9]:
pdf.head()

Unnamed: 0,p,count
0,rdf:type,28294
1,schmea:keywords,20101
2,schmea:name,11010
3,schmea:description,10321
4,schmea:url,9826


In [10]:
sparql = """
PREFIX schema: <https://schema.org/>


SELECT ?s ?desc ?name
WHERE
{
 ?s rdf:type ?type
   FILTER ( ?type IN (schema:ResearchProject, schema:Project, schema:Organization, 
   schema:Dataset, schema:CreativeWork, schema:Person, schema:Map, schema:Course,
   schema:CourseInstance, schema:Event, schema:Vehicle) )
   ?s schema:description ?desc .
   ?s schema:name ?name

}
"""

pdf = kg.query_as_df(sparql)
# df = pdf   # .to_pandas()  #  breaks with papermill for reasons unknown at this time if to_pandas() is used, needed in my kglab conda env


In [11]:
pdf.head(20)

Unnamed: 0,s,desc,name
0,<https://obis.org/dataset/1057a007-c31c-48a3-a...,"In Australia, it is thought that up to 26 Aust...",Census of annual pup production by Australian ...
1,<https://obis.org/dataset/d64477cf-491f-4de5-8...,Original provider:\nObservatorio Ambiental Gra...,Canary Islands - OAG (aggregated per 1-degree ...
2,<https://obis.org/dataset/e71d452f-615e-4654-b...,Original provider:\nVirginia Aquarium and Mari...,Virginia and Maryland Sea Turtle Research and ...
3,<https://obis.org/dataset/49f74e10-b23b-4aca-a...,Tow video and epibenthic sled collections were...,"Species assemblages, biomass and regional habi..."
4,<https://obis.org/dataset/30fd5e3c-d729-41bb-b...,"Data on the distribution, reproductive and sur...",PELD-ELPA Ecology of Lahille's bottlenose dolp...
5,<https://obis.org/dataset/9a37fc55-1fc8-4c19-b...,Original provider:\nCanadian Wildlife Service\...,PIROP Northwest Atlantic 1965-1992
6,<https://obis.org/dataset/1d81a51c-66fc-46bf-a...,Original provider:\nUniversity of North Caroli...,USWTR JAX Aerial Survey -Left side- 2011-2012
7,<https://obis.org/dataset/a595a9a0-642a-473f-8...,The EAISSNA database contains information on l...,Electronic Atlas of Ichthyoplankton on the Sco...
8,<https://obis.org/dataset/32948ff1-6f03-4877-b...,Original provider:\nHappywhale\n\nDataset cred...,Happywhale - Bryde's whale in South Atlantic O...
9,<https://obis.org/dataset/0abb8cc1-8651-4213-a...,El phylum Nemertea está formado por un pequeño...,Colección de Gusanos Cinta (Nemertea) de la re...


In [14]:
rq_pcount = """SELECT ?p (COUNT(?p) as ?pCount)
WHERE
{
  ?s ?p ?o .
}
GROUP BY ?p 
ORDER BY DESC(?count)
"""

pdf = kg.query_as_df(rq_pcount)
pdf.head()

Unnamed: 0,p,pCount
0,schmea:sameAs,5396
1,rdf:type,28294
2,schmea:keywords,20101
3,schmea:citation,4060
4,schmea:additionalProperty,3998


## All OIh-Graph load

The following will load all the graphs of the providers in the OIH-Graph.

In [12]:
bg = ConjunctiveGraph()

for u in urls:
    print("Loading {}".format(u))
    bg.parse(u, format="nquads")

print(len(bg))

Loading http://ossapi.oceaninfohub.org/public/graphs/summonedafricaioc_v1_release.nq
Loading http://ossapi.oceaninfohub.org/public/graphs/summonedaquadocs_v1_release.nq
Loading http://ossapi.oceaninfohub.org/public/graphs/summonedcioos_v1_release.nq
Loading http://ossapi.oceaninfohub.org/public/graphs/summonededmerp_v1_release.nq
Loading http://ossapi.oceaninfohub.org/public/graphs/summonededmo_v1_release.nq
Loading http://ossapi.oceaninfohub.org/public/graphs/summonedemodnet_v1_release.nq
Loading http://ossapi.oceaninfohub.org/public/graphs/summonedinanodc_v1_release.nq
Loading http://ossapi.oceaninfohub.org/public/graphs/summonedinvemardocuments_v1_release.nq
Loading http://ossapi.oceaninfohub.org/public/graphs/summonedinvemarexperts_v1_release.nq
Loading http://ossapi.oceaninfohub.org/public/graphs/summonedinvemarinstitutions_v1_release.nq
Loading http://ossapi.oceaninfohub.org/public/graphs/summonedinvemartraining_v1_release.nq
Loading http://ossapi.oceaninfohub.org/public/graphs/s

In [13]:
namespaces = {
    "shacl":   "http://www.w3.org/ns/shacl#" ,
    "schmea":   "https://schema.org/" ,
    "geo":      "http://www.opengis.net/ont/geosparql#",
}

bkg = kglab.KnowledgeGraph(name = "OIH test", base_uri = "https://oceaninfohub.org/id/", namespaces = namespaces, import_graph = bg)

In [14]:
sparql = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>


SELECT ?p (COUNT(?p) as ?count)
WHERE
{
  ?s ?p ?o .
}
GROUP BY ?p ORDER BY DESC(?count)
"""

pdf = bkg.query_as_df(sparql)
# df = pdf   # .to_pandas()  #  breaks with papermill for reasons unknown at this time if to_pandas() is used, needed in my kglab conda env


In [15]:
pdf.head()

Unnamed: 0,p,count
0,rdf:type,579669
1,schmea:name,316802
2,schmea:keywords,306246
3,schmea:url,189304
4,schmea:description,134921


In [32]:
bkg.save_parquet("OIHGraph_25032023.parquet")
