# GleanerIO JSON-LD Framing Processor

## About

- convert this from doing SHACL to doing framing


### Imports


In [1]:
import dask
import pandas as pd
import json, io
import pathlib
import s3fs
import kglab
from rdflib import Graph  #, plugin
from pyld import jsonld

### Setting and options

In [2]:
# Set up the S3 File system
oss = s3fs.S3FileSystem(
    anon=True,
    client_kwargs = {"endpoint_url":"https://oss.geodex.org"}
)

### Set up Dask

In [3]:
from dask.distributed import Client    #, progress
client = Client(threads_per_worker=10, n_workers=4)
client

0,1
Connection method: Cluster object,Cluster type: LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Status: running,Using processes: True
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads:  20,Total memory:  31.17 GiB

0,1
Comm: tcp://127.0.0.1:44995,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads:  20
Started:  Just now,Total memory:  31.17 GiB

0,1
Comm: tcp://127.0.0.1:32775,Total threads: 5
Dashboard: http://127.0.0.1:39487/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:40709,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-67gou4bo,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-67gou4bo
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 3.94 GiB

0,1
Comm: tcp://127.0.0.1:33069,Total threads: 5
Dashboard: http://127.0.0.1:45745/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:33095,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-eyvzoi_2,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-eyvzoi_2
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 3.94 GiB

0,1
Comm: tcp://127.0.0.1:35719,Total threads: 5
Dashboard: http://127.0.0.1:36337/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:41063,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-riqdfldo,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-riqdfldo
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 3.94 GiB

0,1
Comm: tcp://127.0.0.1:34593,Total threads: 5
Dashboard: http://127.0.0.1:34785/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:41805,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-g2_6gard,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-g2_6gard
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 3.94 GiB


## Framing Playground

A little playground to test frames

In [3]:
myframe =  {"@context":{"@vocab": "https://schema.org/"}, "@type": "Dataset", "@explicit": "true", "keywords": {}};

In [5]:
dg = './datagraphs/datagraph.json'
with open(dg,"r") as f:
    jld = json.loads(f.read().replace('\n',' '))
    myframed = jsonld.frame(jld, myframe)

print(myframed)


FileNotFoundError: [Errno 2] No such file or directory: './datagraphs/datagraph.json'

## Dask processing

### Define Delayed Definitions

In [6]:
# Simple JSON-LD framing inside Dask function
@dask.delayed()
def citation_frame(fn):
    # or preferably open in text mode and json.load from the file
    citationframe = {"@context":{"@vocab": "https://schema.org/"}, "@type": "Dataset", "@explicit": "true", "keywords": {}};
    #   citationframe = {"@context":{"@vocab": ctx}, "@type": "Dataset", "@explicit": "true", "citation": {}};
    with oss.open(fn, 'rb') as f:
        #return json.loads(f.read().replace('\n',' '))
        jld = json.loads(f.read().decode("utf-8", "ignore").replace('\n',' '))
        framed = jsonld.frame(jld, citationframe)
        return framed   # ['citation']

### Process files


In [7]:
%%time

fns = oss.ls('gleaner/summoned/hydroshare')
o = [citation_frame(f) for f in fns]

results = dask.compute(*o)  ## Wait..  can I just dask.compute(o)  ????
print(len(results))

9533
CPU times: user 1min 32s, sys: 2.43 s, total: 1min 34s
Wall time: 2min 26s


### Loop on results and load to graph

In [8]:
# You likely don't want to do this..   prints a lot of data
# for r in results:
#     print(r)
print(json.dumps(results[0]))

{"@context": {"@vocab": "https://schema.org/"}, "@id": "https://doi.org/10.4211/hs.d2bab32e7c1d4d55b8cba7221e51b02d", "@type": "Dataset", "keywords": ["DPS Regions", "DPS Districts", "Addresses", "Address Points", "TDEM", "Height Above Nearest Drainage", "Emergency Response", "9-1-1", "HAND", "Texas", "Harvey2017"]}


In [9]:
rnamespaces = {
    "schema":  "https://schema.org/",
    "shacl":   "http://www.w3.org/ns/shacl#" ,
}

rkg = kglab.KnowledgeGraph(
    name = "Schema.org shacl eval datagraph",
    base_uri = "https://example.org/id/",
    namespaces = rnamespaces,
)

g = rkg.rdf_graph()

for r in results:
    rkg.load_rdf_text(data=json.dumps(r), format="json-ld") 
        
   

In [10]:
sparql = """
PREFIX schema: <https://schema.org/>
SELECT *
  WHERE {
    ?s schema:keywords ?o .
  }
"""

df = rkg.query_as_df(sparql)

In [13]:
pdf = df.to_pandas()
# pdf.style.apply(change_color_group, axis=None)
# pdf.info()
pdf.head(10)

Unnamed: 0,s,o
0,<https://doi.org/10.4211/hs.d2bab32e7c1d4d55b8...,DPS Regions
1,<https://doi.org/10.4211/hs.d2bab32e7c1d4d55b8...,DPS Districts
2,<https://doi.org/10.4211/hs.d2bab32e7c1d4d55b8...,Addresses
3,<https://doi.org/10.4211/hs.7661752c688a4f3ebc...,Addresses
4,<https://doi.org/10.4211/hs.d2bab32e7c1d4d55b8...,Address Points
5,<https://doi.org/10.4211/hs.d2bab32e7c1d4d55b8...,TDEM
6,<https://doi.org/10.4211/hs.d2bab32e7c1d4d55b8...,Height Above Nearest Drainage
7,<https://www.hydroshare.org/resource/b35f259d7...,Height Above Nearest Drainage
8,<https://doi.org/10.4211/hs.7235a0d6a18343078b...,Height Above Nearest Drainage
9,<https://doi.org/10.4211/hs.d2bab32e7c1d4d55b8...,Emergency Response


## Keyword value counts

We can look at the keywords now and see if anything can be done to align them to something like WikiData.  We can inspect
that USACE Corps Water Management System (CWMS) is likely part of the  US Army Corp of Engineers
which in WikiData is at: [https://www.wikidata.org/wiki/Q1049334](https://www.wikidata.org/wiki/Q1049334).

The question is there a reliable approach to connecting these?

In [14]:
pdf.o.value_counts()

mmw                                           564
model-my-watershed                            564
cbf                                           276
USACE Corps Water Management System (CWMS)    251
Pequea                                        234
                                             ... 
administrative division codes                   1
capitals                                        1
international codes                             1
port identification numbers                     1
Total Solids                                    1
Name: o, Length: 6605, dtype: int64

In [12]:
# Optional export to parquet and or CSV
df.to_parquet("./output/hydro_kw_frame.parquet")

## ----  scratch below

Looking at [http://abstractsearch.agu.org:8890/sparql](http://abstractsearch.agu.org:8890/sparql)

with

```sparql
select  ?o
where {

?s <http://swrc.ontoware.org/ontology#abstract> ?o

}
LIMIT 100
```

show ssome nice abstracts.  However, there are no keywords?



In [3]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import shapely

cor = "http://cor.esipfed.org/sparql"

In [4]:
#@title
def get_sparql_dataframe(service, query):
    """
    Helper function to convert SPARQL results into a Pandas data frame.
    """
    sparql = SPARQLWrapper(service)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    result = sparql.query()

    processed_results = json.load(result.response)
    cols = processed_results['head']['vars']

    out = []
    for row in processed_results['results']['bindings']:
        item = []
        for c in cols:
            item.append(row.get(c, {}).get('value'))
        out.append(item)

    return pd.DataFrame(out, columns=cols)

In [5]:
from SPARQLWrapper import SPARQLWrapper, JSON

swt1 = """
PREFIX ufokn: <http://schema.ufokn.org/core/v1/>
SELECT *
where
{
  ?sub rdfs:label ?text
}
"""

swtdf = get_sparql_dataframe(cor, swt1)