# OIH Graph Pre-Processor - Solr

## About
This notebook demonstrates some approaches for processing the release graphs into a format that
is useful for the Solr index.


### Imports and definitions

In [55]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)  ## remove pandas future warning
import pandas as pd
import geopandas as gpd
from shapely import wkt
import s3fs
import pyarrow.parquet as pq
import shapely
import os
import re
import json, io
from pyld import jsonld
import kglab
from minio import Minio
import rdflib
from rdflib import ConjunctiveGraph  #  needed for nquads

In [56]:
def publicurls(client, bucket, prefix):
    """Collect download URLs for every non-empty object under a bucket prefix.

    Args:
        client: MinIO-style client exposing ``list_objects``, ``stat_object``
            and ``presigned_get_object``.
        bucket: Name of the bucket to scan.
        prefix: Key prefix to list; the listing is recursive.

    Returns:
        list: One URL per object whose stat reports a size greater than zero,
        in listing order.
    """
    # NOTE(review): a positive size is used as the filter here; whether an
    # object is actually publicly readable is not checked.
    return [
        client.presigned_get_object(bucket, entry.object_name)
        for entry in client.list_objects(bucket, prefix=prefix, recursive=True)
        if client.stat_object(bucket, entry.object_name).size > 0
    ]

In [57]:
def to_wkt(polygon_string):
    """Format a comma-separated coordinate string as a WKT POLYGON.

    Each comma-delimited chunk is whitespace-normalised (runs of whitespace
    collapse to a single space, leading/trailing whitespace is dropped) and
    the chunks are emitted in their original order. The values inside a pair
    are NOT swapped, and the ring is not checked for closure.

    Args:
        polygon_string: Text such as ``"0 0,0 1,1 1,0 0"``.

    Returns:
        str: ``"POLYGON ((...))"`` with the pairs separated by ``", "``.
    """
    ring = ', '.join(
        ' '.join(chunk.split()) for chunk in polygon_string.split(',')
    )
    return f"POLYGON (({ring}))"

In [20]:
# Check for using GPU, in case you want to ensure your GPU is used
# gc = kglab.get_gpu_count()
# print(gc)

In [21]:
# Build the object-store client; host and port are hard-coded, and secure=False
# presumably selects plain HTTP rather than TLS (confirm against the minio client docs).
client = Minio("ossapi.oceaninfohub.org:80",  secure=False) # Create client with anonymous access.
# Gather URLs for the non-empty objects in the "public" bucket whose key starts with "graph".
urls = publicurls(client, "public", "graph")

In [22]:
# Show the full list of collected release-graph URLs.
print(urls)

['http://ossapi.oceaninfohub.org/public/graphs/summonedafricaioc_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedaquadocs_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedcioos_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonededmerp_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonededmo_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedemodnet_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedinanodc_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedinvemardocuments_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedinvemarexperts_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedinvemarinstitutions_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedinvemartraining_v1_release.nq', 'http://ossapi.oceaninfohub.org/public/graphs/summonedinvemarvessels_v1_release.nq', 'http://ossapi.oceaninf

## Single Graph Test

At this point we have the URLs, and we could either load all of them in a loop or pull one out manually and use it.  This section demonstrates loading and working with a single graph.


In [23]:
# load quad graph
# Load a single release graph (N-Quads) directly from its URL into an in-memory
# ConjunctiveGraph, then report how many statements were parsed.
g = ConjunctiveGraph()
g.parse("http://ossapi.oceaninfohub.org/public/graphs/summonedobis_v1_release.nq", format="nquads")
print(len(g))

161187


In [24]:
# Prefix -> namespace IRI map handed to kglab.
# "schema" is the https:// form of schema.org and "schemawrong" is the http:// form;
# presumably both appear in the harvested data, which is why the queries test both.
namespaces = {
    "shacl":   "http://www.w3.org/ns/shacl#" ,
    "schema":   "https://schema.org/" ,
    "schemawrong": "http://schema.org/",
    "geo":      "http://www.opengis.net/ont/geosparql#",
}

# Wrap the already-parsed rdflib graph `g` in a kglab KnowledgeGraph.
# NOTE(review): use_gpus=True is requested although the GPU check cell is commented out — confirm a GPU is expected.
kg = kglab.KnowledgeGraph(name = "OIH test", base_uri = "https://oceaninfohub.org/id/", namespaces = namespaces, use_gpus=True, import_graph = g)

In [53]:
# Variant query: returns only the id and type of each matching resource; the
# spatial-coverage chain is OPTIONAL here, so resources without geometry still match.
# Fix: the first triple pattern previously read `?s rdf:type ?type ?description`,
# which has four terms and is not valid SPARQL; the stray variable is removed
# and the pattern is terminated with '.'.
# NOTE(review): the OPTIONAL pairing schema:name with schemawrong:description
# looks like a typo for schemawrong:name, and the keywords OPTIONAL appears
# twice — left unchanged pending confirmation.
solrquery2 = """
PREFIX schema: <https://schema.org/>
PREFIX schemawrong: <http://schema.org/>

SELECT (?s as ?id) ?type 
WHERE
{
    ?s rdf:type ?type .
    FILTER ( ?type IN (schema:ResearchProject, schema:Project, schema:Organization,
    schema:Dataset, schema:CreativeWork, schema:Person, schema:Map, schema:Course,
    schema:CourseInstance, schema:Event, schema:Vehicle,   schemawrong:ResearchProject, schemawrong:Project, schemawrong:Organization,
    schemawrong:Dataset, schemawrong:CreativeWork, schemawrong:Person, schemawrong:Map, schemawrong:Course,
    schemawrong:CourseInstance, schemawrong:Event, schemawrong:Vehicle  ) )
    ?s schema:name | schemawrong:name ?name
    OPTIONAL {  ?s schema:keywords | schemawrong:keywords ?keywords }
    OPTIONAL { ?s schema:description | schemawrong:description  ?desc . }
    OPTIONAL { ?s schema:name | schemawrong:description  ?name . }
    OPTIONAL { ?s schema:url | schemawrong:url ?url .   }
    OPTIONAL {?s schema:keywords | schemawrong:keywords ?keywords}
    OPTIONAL { ?s schema:spatialCoverage ?sc .
      ?sc a  schema:Place .
      ?sc schema:geo ?geo .
      ?geo a ?geotype .
      ?geo schema:polygon ?geom
    }
}
"""

# Query run by the next cell: selects id, type, description, name and geometry
# for resources of the listed schema.org types (both https:// and http:// forms).
# Unlike solrquery2, the spatialCoverage -> Place -> geo chain is mandatory here,
# and ?geom is any non-IRI value attached to the geo node by any predicate.
# NOTE(review): ?keywords and ?url are matched but not projected, the keywords
# OPTIONAL appears twice, and the OPTIONAL pairing schema:name with
# schemawrong:description looks like a typo for schemawrong:name — confirm
# before trimming, since these patterns may add to query cost.
solrquery1 = """
PREFIX schema: <https://schema.org/>
PREFIX schemawrong: <http://schema.org/>

SELECT (?s as ?id) ?type ?description ?name ?geom
WHERE
{
    ?s rdf:type ?type
    FILTER ( ?type IN (schema:ResearchProject, schema:Project, schema:Organization,
    schema:Dataset, schema:CreativeWork, schema:Person, schema:Map, schema:Course,
    schema:CourseInstance, schema:Event, schema:Vehicle,   schemawrong:ResearchProject, schemawrong:Project, schemawrong:Organization,
    schemawrong:Dataset, schemawrong:CreativeWork, schemawrong:Person, schemawrong:Map, schemawrong:Course,
    schemawrong:CourseInstance, schemawrong:Event, schemawrong:Vehicle  ) )
    ?s schema:name | schemawrong:name ?name
        OPTIONAL { ?s schema:name | schemawrong:description  ?name . }

    OPTIONAL {  ?s schema:keywords | schemawrong:keywords ?keywords }
    OPTIONAL { ?s schema:description | schemawrong:description  ?description . }
    OPTIONAL { ?s schema:url | schemawrong:url ?url .   }
    OPTIONAL {?s schema:keywords | schemawrong:keywords ?keywords}
    ?s schema:spatialCoverage ?sc .
    ?sc a  schema:Place .
    ?sc schema:geo ?geo .
    ?geo a ?geotype .
    ?geo ?geompred ?geom .
    FILTER(!isIRI(?geom))

}
"""


In [54]:
# Run the spatial query against the knowledge graph and keep the result as `pdf`.
# NOTE(review): the saved output for this cell is a KeyboardInterrupt, so the query
# was interrupted in the recorded run; `pdf` may be undefined for the cells below.
pdf = kg.query_as_df(solrquery1)
# df = pdf   # .to_pandas()  #  breaks with papermill for reasons unknown at this time if to_pandas() is used, needed in my kglab conda env
# pdf.info()

KeyboardInterrupt: 

In [None]:
# Preview the first 20 rows of the query result.
pdf.head(20)


# Frame testing


In [74]:
def simple_frame(dg, frame):
    """Apply a JSON-LD frame to a document, falling back to "" on failure.

    Args:
        dg: JSON-LD document handed to ``jsonld.frame``.
        frame: JSON-LD frame handed to ``jsonld.frame``.

    Returns:
        The framed result from ``jsonld.frame``, or the empty string if
        framing raises an ``Exception``.
    """
    try:
        return jsonld.frame(dg, frame)
    except Exception as err:
        # Best-effort contract is kept (callers still receive ""), but the
        # failure is surfaced as a warning instead of vanishing silently.
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit propagate.
        warnings.warn(f"JSON-LD framing failed: {err!r}", RuntimeWarning, stacklevel=2)
        return ""

In [94]:
# Read the JSON-LD frame. The file is read as bytes and decoded as UTF-8 with
# errors="ignore" (undecodable bytes are dropped), and newlines are replaced by
# spaces before the text is parsed as JSON.
with open('./assets/frame.json', 'rb') as f:
    fr = json.loads(f.read().decode("utf-8", "ignore").replace('\n',' '))
    # fr =   f.read()

# Read the document to be framed, decoded and parsed the same way.
with open('./solrInputData.json', 'rb') as f:
    dg = json.loads(f.read().decode("utf-8", "ignore").replace('\n',' '))
    # dg =   file.read()
    
# print(fr)
# print(dg)

# NOTE: `f` is rebound here from the (already closed) file handle to the framed result.
# simple_frame returns "" when framing fails, in which case the printed output is just "".
f = simple_frame(dg, fr)
json_formatted_str = json.dumps(f, indent=2)
print(json_formatted_str)

{
  "@context": {
    "@vocab": "http://schema.org/"
  },
  "@graph": [
    {
      "@id": "http://ckan.onc.uvic.ca",
      "@type": "DataCatalog",
      "description": ""
    },
    {
      "@id": "https://catalogue.cioos.ca",
      "@type": "DataCatalog",
      "description": ""
    },
    {
      "@id": "https://catalogue.cioos.ca/dataset/ff0232d8-34bd-4456-be28-20d4f8b2937c.jsonld",
      "@type": "Dataset",
      "description": [
        {
          "@language": "en",
          "@value": "The WET Labs ECO FLNTUS 894 was deployed on 2010-09-13 at Barkley Upper Slope. Upper Slope is a location within Barkley Canyon, which is located on the upper continental slope. This device is a Fluorometer Turbidity. Fluorometer Turbidity instruments measure chlorophyll fluorescence and turbidity within the same volume of seawater. The instrument uses a light emitting diode (LED) to provide an excitation source. The fluoresced light is received by a detector at a particular angle from the LED sou