# GleanerIO JSON-LD Framing Processor

## TODO

- convert this from doing SHACL to doing framing


## Imports


In [17]:
import dask
import pandas as pd
import json, io
import pathlib
import s3fs
import kglab
from rdflib import Graph  #, plugin
from pyld import jsonld

### Setting and options

In [18]:
# Some pandas df options (not needed)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Set the shapegraph to use
sg = './shapes/eco_general1.ttl'

# Set up the S3 File system
oss = s3fs.S3FileSystem(
    anon=True,
    client_kwargs = {"endpoint_url":"https://oss.geodex.org"}
)

## Framing Playground

A little playground to test frames

In [19]:
# work up some examples for people to play with a simple frame and data graph and get some output.  

### Definitions

In [20]:
# Simple JSON-LD framing inside Dask function
@dask.delayed()
def citation_frame(fn):
    # or preferably open in text mode and json.load from the file
    citationframe = {"@context":{"@vocab": "https://schema.org/"}, "@type": "Dataset", "@explicit": "true", "citation": {}};
    #   citationframe = {"@context":{"@vocab": ctx}, "@type": "Dataset", "@explicit": "true", "citation": {}};
    with oss.open(fn, 'rb') as f:
        #return json.loads(f.read().replace('\n',' '))
        jld = json.loads(f.read().decode("utf-8", "ignore").replace('\n',' '))
        framed = jsonld.frame(jld, citationframe)
        return framed   # ['citation']

## Set up Dask

In [5]:
from dask.distributed import Client    #, progress
client = Client(threads_per_worker=5, n_workers=4)
client

0,1
Connection method: Cluster object,Cluster type: LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Status: running,Using processes: True
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads:  20,Total memory:  31.17 GiB

0,1
Comm: tcp://127.0.0.1:44253,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads:  20
Started:  Just now,Total memory:  31.17 GiB

0,1
Comm: tcp://127.0.0.1:40761,Total threads: 5
Dashboard: http://127.0.0.1:43667/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:36153,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-p4z5wzr2,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-p4z5wzr2
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 3.94 GiB

0,1
Comm: tcp://127.0.0.1:37025,Total threads: 5
Dashboard: http://127.0.0.1:41751/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:45239,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-rg7txaw4,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-rg7txaw4
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 3.94 GiB

0,1
Comm: tcp://127.0.0.1:33441,Total threads: 5
Dashboard: http://127.0.0.1:35927/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:33067,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-ml1nmh3f,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-ml1nmh3f
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 3.94 GiB

0,1
Comm: tcp://127.0.0.1:34981,Total threads: 5
Dashboard: http://127.0.0.1:46395/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:44409,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-m3l4i3jn,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-m3l4i3jn
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 3.94 GiB


## Get files


In [42]:
%%time

fns = oss.ls('gleaner/summoned/lipdverse')
o = [citation_frame(f) for f in fns]

results = dask.compute(*o)  ## Wait..  can I just dask.compute(o)  ????
print(len(results))

697
CPU times: user 1.66 s, sys: 149 ms, total: 1.8 s
Wall time: 6.07 s


### Loop on results and load to graph

In [22]:
# You likely don't want to do this..   prints a lot of data
# for r in results:
#     print(r)

In [41]:
rnamespaces = {
    "schema":  "https://schema.org/",
    "shacl":   "http://www.w3.org/ns/shacl#" ,
}

rkg = kglab.KnowledgeGraph(
    name = "Schema.org shacl eval datagraph",
    base_uri = "https://example.org/id/",
    namespaces = rnamespaces,
)

for r in results:
    # write to a file for hack input patter
    with open("/tmp/data.jsonld","w") as f:
        f.write(json.dumps(r))
        f.close()
    
    path = pathlib.Path("/tmp/data.jsonld")
    rkg.load_jsonld(path)  # need to load as JSON-LD
        
        
   
    # try a file "like" object
#     with io.StringIO() as f:  
#         f.write(json.dumps(r))
        
#     rkg.load_jsonld(f) 
        
   

In [36]:
sparql = """
SELECT *
  WHERE {
    ?s ?p ?o
  }
"""

df = rkg.query_as_df(sparql)

In [37]:
pdf = df.to_pandas()
# pdf.style.apply(change_color_group, axis=None)
pdf.info()
pdf.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 697 entries, 0 to 696
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   p       697 non-null    object
 1   o       697 non-null    object
 2   s       697 non-null    object
dtypes: object(3)
memory usage: 16.5+ KB


Unnamed: 0,p,o,s
0,rdf:type,:Dataset,<http://lipdverse.org/Temp12k/1_0_2/Penny.Fisher.1998.html>
1,rdf:type,:Dataset,<http://lipdverse.org/Temp12k/1_0_2/Radtke.Webb.1983.html>
2,rdf:type,:Dataset,<http://lipdverse.org/Temp12k/1_0_2/LakeStowell.Lemmen.2018.html>
3,rdf:type,:Dataset,<http://lipdverse.org/Temp12k/1_0_2/PolevaCave.Constantin.2007.html>
4,rdf:type,:Dataset,<http://lipdverse.org/Temp12k/1_0_2/HumberPond3.McAndrews.1989.html>
5,rdf:type,:Dataset,<http://lipdverse.org/Temp12k/1_0_2/Agassiz.Lecavalier.2017.html>
6,rdf:type,:Dataset,<http://lipdverse.org/Temp12k/1_0_2/klotjarnen.Seppa.2009.html>
7,rdf:type,:Dataset,<http://lipdverse.org/Temp12k/1_0_2/Gass.Webb.1983.html>
8,rdf:type,:Dataset,<http://lipdverse.org/Temp12k/1_0_2/MD99-2227.deVernal.2013.html>
9,rdf:type,:Dataset,<http://lipdverse.org/Temp12k/1_0_2/Naujg1.Willemse.1999.html>


In [14]:
pdf["severity"].value_counts()
pdf["focus"].value_counts()
pdf["message"].value_counts()

A provider must be noted    666
Name: message, dtype: int64

In [15]:
# TODO export to parquet and or CSV
df.to_parquet("./output/eco_opentopo_SHACL.parquet")
df.to_csv("./output/eco_opentopo_SHACL.csv")