# GleanerIO JSON-LD Framing Processor

## TODO

- convert this from doing SHACL to doing framing


## Imports


In [1]:
import dask
import pandas as pd
import json, io
import pathlib
import s3fs
import kglab
from rdflib import Graph  #, plugin
from pyld import jsonld

### Setting and options

In [2]:
# Some pandas df options (not needed)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Set the shapegraph to use
sg = './shapes/eco_general1.ttl'

# Set up the S3 File system
oss = s3fs.S3FileSystem(
    anon=True,
    client_kwargs = {"endpoint_url":"https://oss.geodex.org"}
)

## Framing Playground

A little playground to test frames

In [3]:
myframe =  {"@context":{"@vocab": "https://schema.org/"}, "@type": "Course", "@explicit": "true", "description": {}};

In [4]:
dg = './datagraphs/datagraph.json'
with open(dg,"r") as f:
    jld = json.loads(f.read().replace('\n',' '))
    myframed = jsonld.frame(jld, myframe)

print(myframed)


{'@context': {'@vocab': 'https://schema.org/'}, '@type': 'Course', 'description': "In this course you will get an introduction to the main tools and ideas in the data scientist's toolbox..."}


### Definitions

In [5]:
# Simple JSON-LD framing inside Dask function
@dask.delayed()
def citation_frame(fn):
    # or preferably open in text mode and json.load from the file
    citationframe = {"@context":{"@vocab": "https://schema.org/"}, "@type": "Dataset", "@explicit": "true", "citation": {}};
    #   citationframe = {"@context":{"@vocab": ctx}, "@type": "Dataset", "@explicit": "true", "citation": {}};
    with oss.open(fn, 'rb') as f:
        #return json.loads(f.read().replace('\n',' '))
        jld = json.loads(f.read().decode("utf-8", "ignore").replace('\n',' '))
        framed = jsonld.frame(jld, citationframe)
        return framed   # ['citation']

## Set up Dask

In [5]:
from dask.distributed import Client    #, progress
client = Client(threads_per_worker=5, n_workers=4)
client

0,1
Connection method: Cluster object,Cluster type: LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Status: running,Using processes: True
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads:  20,Total memory:  31.17 GiB

0,1
Comm: tcp://127.0.0.1:44253,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads:  20
Started:  Just now,Total memory:  31.17 GiB

0,1
Comm: tcp://127.0.0.1:40761,Total threads: 5
Dashboard: http://127.0.0.1:43667/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:36153,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-p4z5wzr2,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-p4z5wzr2
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 3.94 GiB

0,1
Comm: tcp://127.0.0.1:37025,Total threads: 5
Dashboard: http://127.0.0.1:41751/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:45239,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-rg7txaw4,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-rg7txaw4
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 3.94 GiB

0,1
Comm: tcp://127.0.0.1:33441,Total threads: 5
Dashboard: http://127.0.0.1:35927/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:33067,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-ml1nmh3f,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-ml1nmh3f
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 3.94 GiB

0,1
Comm: tcp://127.0.0.1:34981,Total threads: 5
Dashboard: http://127.0.0.1:46395/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:44409,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-m3l4i3jn,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-m3l4i3jn
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 3.94 GiB


## Get files


In [12]:
%%time

fns = oss.ls('gleaner/summoned/lipdverse')
o = [citation_frame(f) for f in fns]

results = dask.compute(*o)  ## Wait..  can I just dask.compute(o)  ????
print(len(results))

697
CPU times: user 5.35 s, sys: 273 ms, total: 5.63 s
Wall time: 7.43 s


### Loop on results and load to graph

In [7]:
# You likely don't want to do this..   prints a lot of data
# for r in results:
#     print(r)

In [16]:
rnamespaces = {
    "schema":  "https://schema.org/",
    "shacl":   "http://www.w3.org/ns/shacl#" ,
}

rkg = kglab.KnowledgeGraph(
    name = "Schema.org shacl eval datagraph",
    base_uri = "https://example.org/id/",
    namespaces = rnamespaces,
)

g = rkg.rdf_graph()

for r in results:
    # WORKS: write to a file for hack input patter
#     with open("/tmp/data.jsonld","w") as f:
#         f.write(json.dumps(r))
#         f.close()
    
#     path = pathlib.Path("/tmp/data.jsonld")
#     rkg.load_jsonld(path)  # need to load as JSON-LD
        
        
   
    # try a file "like" object
    with io.StringIO() as f:  
        f.write(json.dumps(r))
    g.parse(file=f, format="json-ld")  
    # rkg.load_jsonld(f) 
        
   

AttributeError: '_io.StringIO' object has no attribute 'name'

In [9]:
sparql = """
SELECT *
  WHERE {
    ?s ?p ?o
  }
"""

df = rkg.query_as_df(sparql)

In [10]:
pdf = df.to_pandas()
# pdf.style.apply(change_color_group, axis=None)
pdf.info()
pdf.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 697 entries, 0 to 696
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   p       697 non-null    object
 1   s       697 non-null    object
 2   o       697 non-null    object
dtypes: object(3)
memory usage: 16.5+ KB


Unnamed: 0,p,s,o
0,rdf:type,<http://lipdverse.org/Temp12k/1_0_2/LacGras.King.1986.html>,:Dataset
1,rdf:type,<http://lipdverse.org/Temp12k/1_0_2/MD98_2195.Ijiri.2005.html>,:Dataset
2,rdf:type,<http://lipdverse.org/Temp12k/1_0_2/SO189_039KL.Mohtadi.2014.html>,:Dataset
3,rdf:type,<http://lipdverse.org/Temp12k/1_0_2/Toskaljavri.Seppa.2002.html>,:Dataset
4,rdf:type,<http://lipdverse.org/Temp12k/1_0_2/GIK17051_3.Jung.1996.html>,:Dataset
5,rdf:type,<http://lipdverse.org/Temp12k/1_0_2/NussbaumerSeen.Rsch.1995.html>,:Dataset
6,rdf:type,<http://lipdverse.org/Temp12k/1_0_2/PuertodeLosTornos.Penalba.1989.html>,:Dataset
7,rdf:type,<http://lipdverse.org/Temp12k/1_0_2/MR003-K03-PC01.Harada.2004.html>,:Dataset
8,rdf:type,<http://lipdverse.org/Temp12k/1_0_2/MD07-3088.Montade.2019.html>,:Dataset
9,rdf:type,<http://lipdverse.org/Temp12k/1_0_2/SP02.Adams.2010.html>,:Dataset


In [14]:
pdf["severity"].value_counts()
pdf["focus"].value_counts()
pdf["message"].value_counts()

A provider must be noted    666
Name: message, dtype: int64

In [15]:
# TODO export to parquet and or CSV
df.to_parquet("./output/eco_opentopo_SHACL.parquet")
df.to_csv("./output/eco_opentopo_SHACL.csv")