# OIH Graph Pre-Processor - Solr

## About
This notebook demonstrates some approaches for processing the release graphs into a format that
is useful for the Solr index


### Imports and definitions

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)  ## remove pandas future warning
import pandas as pd
import geopandas as gpd
from shapely import wkt
# import s3fs
import pyarrow.parquet as pq
import shapely
import os
import re
import json, io
from pyld import jsonld
import kglab
from minio import Minio
import rdflib
from rdflib import ConjunctiveGraph  #  needed for nquads

In [2]:
def publicurls(client, bucket, prefix):
    urls = []
    objects = client.list_objects(bucket, prefix=prefix, recursive=True)
    for obj in objects:
        result = client.stat_object(bucket, obj.object_name)

        if result.size > 0:  #  how to tell if an objet   obj.is_public  ?????
            url = client.presigned_get_object(bucket, obj.object_name)
            # print(f"Public URL for object: {url}")
            urls.append(url)

    return urls

In [3]:
def to_wkt(polygon_string):
    # split the input string into pairs
    pairs = polygon_string.split(',')

    # transform each pair into 'y x' format
    # transformed_pairs = [' '.join(reversed(pair.split())) for pair in pairs]
    transformed_pairs = [' '.join(pair.split()) for pair in pairs]


    # join the transformed pairs with a comma and a space
    transformed_string = ', '.join(transformed_pairs)

    # return the final WKT string
    return f"POLYGON (({transformed_string}))"

In [4]:
# Check for using GPU, in case you want to ensure your GPU is used
# gc = kglab.get_gpu_count()
# print(gc)

In [5]:
client = Minio("ossapi.oceaninfohub.org:80",  secure=False) # Create client with anonymous access.
urls = publicurls(client, "public", "graph")

In [6]:
print(urls)

['http://ossapi.oceaninfohub.org/public/graphs/summonedafricaioc_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedaquadocs_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedcioos_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonededmerp_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonededmo_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedemodnet_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedinanodc_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedinvemardocuments_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedinvemarexperts_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedinvemarinstitutions_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedinvemartraining_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedinvemarvessels_v1_release.nq', 'http://ossapi.oceaninf

## Single Graph Test

At this point we have the URLs, and we could either loop load all of them or pull one out manually and use.  This section dmonstrates loading and working with one


In [7]:
# load quad graph
g = ConjunctiveGraph()
g.parse("http://ossapi.oceaninfohub.org/public/graphs/summonedobis_v1_release.nq", format="nquads")
print(len(g))

161187


In [8]:
namespaces = {
    "shacl":   "http://www.w3.org/ns/shacl#" ,
    "schema":   "https://schema.org/" ,
    "schemawrong": "http://schema.org/",
    "geo":      "http://www.opengis.net/ont/geosparql#",
}

kg = kglab.KnowledgeGraph(name = "OIH test", base_uri = "https://oceaninfohub.org/id/", namespaces = namespaces, use_gpus=True, import_graph = g)

In [9]:
solrquery2 = """
PREFIX schema: <https://schema.org/>
PREFIX schemawrong: <http://schema.org/>

SELECT (?s as ?id) ?type 
WHERE
{
    ?s rdf:type ?type ?description
    FILTER ( ?type IN (schema:ResearchProject, schema:Project, schema:Organization,
    schema:Dataset, schema:CreativeWork, schema:Person, schema:Map, schema:Course,
    schema:CourseInstance, schema:Event, schema:Vehicle,   schemawrong:ResearchProject, schemawrong:Project, schemawrong:Organization,
    schemawrong:Dataset, schemawrong:CreativeWork, schemawrong:Person, schemawrong:Map, schemawrong:Course,
    schemawrong:CourseInstance, schemawrong:Event, schemawrong:Vehicle  ) )
    ?s schema:name | schemawrong:name ?name
    OPTIONAL { ?s schema:keywords | schemawrong:keywords ?keywords }
    OPTIONAL { ?s schema:description | schemawrong:description  ?desc . }
    OPTIONAL { ?s schema:name | schemawrong:description  ?name . }
    OPTIONAL { ?s schema:url | schemawrong:url ?url .   }
    OPTIONAL {?s schema:keywords | schemawrong:keywords ?keywords}
    OPTIONAL { ?s schema:spatialCoverage ?sc .
      ?sc a  schema:Place .
      ?sc schema:geo ?geo .
      ?geo a ?geotype .
      ?geo schema:polygon ?geom
    }
}
"""

solrquery1 = """
PREFIX schema: <https://schema.org/>
PREFIX schemawrong: <http://schema.org/>

SELECT (?s as ?id) ?type ?description ?name ?geom
WHERE
{
    ?s rdf:type ?type
    FILTER ( ?type IN (schema:ResearchProject, schema:Project, schema:Organization,
    schema:Dataset, schema:CreativeWork, schema:Person, schema:Map, schema:Course,
    schema:CourseInstance, schema:Event, schema:Vehicle,   schemawrong:ResearchProject, schemawrong:Project, schemawrong:Organization,
    schemawrong:Dataset, schemawrong:CreativeWork, schemawrong:Person, schemawrong:Map, schemawrong:Course,
    schemawrong:CourseInstance, schemawrong:Event, schemawrong:Vehicle  ) )
    ?s schema:name | schemawrong:name ?name
        OPTIONAL { ?s schema:name | schemawrong:description  ?name . }

    OPTIONAL { ?s schema:keywords | schemawrong:keywords ?keywords }
    OPTIONAL { ?s schema:description | schemawrong:description  ?description . }
    OPTIONAL { ?s schema:url | schemawrong:url ?url .   }
    OPTIONAL {?s schema:keywords | schemawrong:keywords ?keywords}
    ?s schema:spatialCoverage ?sc .
    ?sc a  schema:Place .
    ?sc schema:geo ?geo .
    ?geo a ?geotype .
    ?geo ?geompred ?geom .
    FILTER(!isIRI(?geom))

}
"""


In [10]:
pdf = kg.query_as_df(solrquery1)
# df = pdf   # .to_pandas()  #  breaks with papermill for reasons unknown at this time if to_pandas() is used, needed in my kglab conda env
# pdf.info()

In [11]:
pdf.head(20)


Unnamed: 0,id,type,description,name,geom
0,<https://obis.org/dataset/1057a007-c31c-48a3-a...,schema:Dataset,"In Australia, it is thought that up to 26 Aust...",Census of annual pup production by Australian ...,"135.96667 -43.63333,135.96667 -35.01667,150.23..."
1,<https://obis.org/dataset/1057a007-c31c-48a3-a...,schema:Dataset,"In Australia, it is thought that up to 26 Aust...",Census of annual pup production by Australian ...,"135.96667 -43.63333,135.96667 -35.01667,150.23..."
2,<https://obis.org/dataset/d64477cf-491f-4de5-8...,schema:Dataset,Original provider:\nObservatorio Ambiental Gra...,Canary Islands - OAG (aggregated per 1-degree ...,"-74.5 5.5,-74.5 45.5,32.5 45.5,32.5 5.5,-74.5 5.5"
3,<https://obis.org/dataset/d64477cf-491f-4de5-8...,schema:Dataset,Original provider:\nObservatorio Ambiental Gra...,Canary Islands - OAG (aggregated per 1-degree ...,"-74.5 5.5,-74.5 45.5,32.5 45.5,32.5 5.5,-74.5 5.5"
4,<https://obis.org/dataset/d64477cf-491f-4de5-8...,schema:Dataset,Original provider:\nObservatorio Ambiental Gra...,Canary Islands - OAG (aggregated per 1-degree ...,"-74.5 5.5,-74.5 45.5,32.5 45.5,32.5 5.5,-74.5 5.5"
5,<https://obis.org/dataset/e71d452f-615e-4654-b...,schema:Dataset,Original provider:\nVirginia Aquarium and Mari...,Virginia and Maryland Sea Turtle Research and ...,"-76.39647 36.58278,-76.39647 38.52142,-74.3984..."
6,<https://obis.org/dataset/e71d452f-615e-4654-b...,schema:Dataset,Original provider:\nVirginia Aquarium and Mari...,Virginia and Maryland Sea Turtle Research and ...,"-76.39647 36.58278,-76.39647 38.52142,-74.3984..."
7,<https://obis.org/dataset/e71d452f-615e-4654-b...,schema:Dataset,Original provider:\nVirginia Aquarium and Mari...,Virginia and Maryland Sea Turtle Research and ...,"-76.39647 36.58278,-76.39647 38.52142,-74.3984..."
8,<https://obis.org/dataset/49f74e10-b23b-4aca-a...,schema:Dataset,Tow video and epibenthic sled collections were...,"Species assemblages, biomass and regional habi...","124.05919 -15.94544,124.05919 -15.22044,124.69..."
9,<https://obis.org/dataset/49f74e10-b23b-4aca-a...,schema:Dataset,Tow video and epibenthic sled collections were...,"Species assemblages, biomass and regional habi...","124.05919 -15.94544,124.05919 -15.22044,124.69..."


In [13]:
pdf.to_parquet('solr_obis.parquet')  

# Frame testing


In [14]:
def simple_frame(dg, frame):
    try:
        framed = jsonld.frame(dg, frame)
        return framed  # ['citation']
    except:
        return ""

In [15]:
with open('./assets/frame.json', 'rb') as f:
    fr = json.loads(f.read().decode("utf-8", "ignore").replace('\n',' '))
    # fr =   f.read()

with open('./solrInputData.json', 'rb') as f:
    dg = json.loads(f.read().decode("utf-8", "ignore").replace('\n',' '))
    # dg =   file.read()
    
# print(fr)
# print(dg)

f = simple_frame(dg, fr)
json_formatted_str = json.dumps(f, indent=2)
print(json_formatted_str)

{
  "@context": {
    "@vocab": "http://schema.org/"
  },
  "@graph": [
    {
      "@id": "http://ckan.onc.uvic.ca",
      "@type": "DataCatalog",
      "description": ""
    },
    {
      "@id": "https://catalogue.cioos.ca",
      "@type": "DataCatalog",
      "description": ""
    },
    {
      "@id": "https://catalogue.cioos.ca/dataset/ff0232d8-34bd-4456-be28-20d4f8b2937c.jsonld",
      "@type": "Dataset",
      "description": [
        {
          "@language": "en",
          "@value": "The WET Labs ECO FLNTUS 894 was deployed on 2010-09-13 at Barkley Upper Slope. Upper Slope is a location within Barkley Canyon, which is located on the upper continental slope. This device is a Fluorometer Turbidity. Fluorometer Turbidity instruments measure chlorophyll fluorescence and turbidity within the same volume of seawater. The instrument uses a light emitting diode (LED) to provide an excitation source. The fluoresced light is received by a detector at a particular angle from the LED sou