# GleanerIO JSON-LD Framing Processor

## About

- TODO: convert this notebook from doing SHACL validation to doing JSON-LD framing.


### Imports


In [2]:
import io
import json
import pathlib

import boto3
import dask
import kglab
import pandas as pd
import s3fs
from dask.distributed import Client    #, progress
from pyld import jsonld
from rdflib import Graph  #, plugin

### File Access

In [3]:
# S3 file system handle used by every read below: anonymous (credential-free)
# access against the custom endpoint.
oss = s3fs.S3FileSystem(anon=True, client_kwargs=dict(endpoint_url="https://oss.geodex.org"))

## Access controlled s3
# session = boto3.Session(profile_name='default' ,   region_name="us-east-1")
# s3 = session.client('s3')  # needed later for listing objects
# s3r = session.resource('s3')
# oss = s3fs.S3FileSystem( profile="default")


## Manual code access
# ACCESS_CODE = getpass.getpass()
# SECRET_CODE = getpass.getpass()

# oss = s3fs.S3FileSystem(
#     anon=False,
#     key=ACCESS_CODE,
#     secret=SECRET_CODE,
#     client_kwargs = {"endpoint_url":"http://192.168.86.45:49159"}
# )

### Set up Dask

In [3]:
# Start a local Dask cluster with 4 workers x 5 threads each and connect to it.
# `Client` comes from the imports cell at the top of the notebook, so every
# dependency is declared in one place.
client = Client(threads_per_worker=5, n_workers=4)
# Bare last expression: renders the cluster summary (dashboard link, workers).
client

0,1
Connection method: Cluster object,Cluster type: LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Status: running,Using processes: True
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads:  20,Total memory:  31.17 GiB

0,1
Comm: tcp://127.0.0.1:34009,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads:  20
Started:  Just now,Total memory:  31.17 GiB

0,1
Comm: tcp://127.0.0.1:33951,Total threads: 5
Dashboard: http://127.0.0.1:41341/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:36369,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-aqmpxzl1,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-aqmpxzl1
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 3.94 GiB

0,1
Comm: tcp://127.0.0.1:43467,Total threads: 5
Dashboard: http://127.0.0.1:44161/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:38221,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-25kqm_xq,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-25kqm_xq
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 3.94 GiB

0,1
Comm: tcp://127.0.0.1:33297,Total threads: 5
Dashboard: http://127.0.0.1:40843/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:35991,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-wd4bkl1v,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-wd4bkl1v
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 3.94 GiB

0,1
Comm: tcp://127.0.0.1:41433,Total threads: 5
Dashboard: http://127.0.0.1:41819/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:38777,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-_6f9dhqc,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-_6f9dhqc
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 3.94 GiB


## Framing Playground

A little playground to test frames

In [4]:
# Frame that selects schema.org Course nodes and asks only for "description".
# "@explicit" must be a real boolean per the JSON-LD framing spec, not the
# string "true" (which only worked because a non-empty string is truthy).
myframe = {
    "@context": {"@vocab": "https://schema.org/"},
    "@type": "Course",
    "@explicit": True,
    "description": {},
}

In [5]:
# Read the local sample data graph, flatten newlines to spaces so the text
# parses as JSON, then apply the playground frame defined above.
dg = './datagraphs/datagraph.json'
with open(dg, "r") as f:
    raw_text = f.read()

jld = json.loads(raw_text.replace('\n', ' '))
myframed = jsonld.frame(jld, myframe)

print(myframed)


{'@context': {'@vocab': 'https://schema.org/'}, '@type': 'Course', 'description': "In this course you will get an introduction to the main tools and ideas in the data scientist's toolbox..."}


## Dask processing

### Define Delayed Definitions

In [6]:
# Simple JSON-LD framing inside Dask function
@dask.delayed
def citation_frame(fn):
    """Frame one JSON-LD object from the S3 store as a schema.org Dataset.

    Despite the name, the frame currently requests only ``description``;
    swap that key for ``"citation"`` in the frame to extract citations instead.

    Parameters
    ----------
    fn : str
        Object path opened through the notebook-level ``oss`` file system.

    Returns
    -------
    dict
        The value returned by ``jsonld.frame`` for this document.

    Raises
    ------
    json.JSONDecodeError
        If the object body is not valid JSON after newline flattening.
    """
    # "@explicit" must be a boolean per the JSON-LD framing spec; the string
    # "true" only worked because a non-empty string is truthy.
    citationframe = {
        "@context": {"@vocab": "https://schema.org/"},
        "@type": "Dataset",
        "@explicit": True,
        "description": {},
    }

    # Read the raw bytes and release the S3 handle before parsing/framing.
    with oss.open(fn, 'rb') as f:
        raw = f.read()

    # Best-effort decoding: undecodable bytes are dropped and newlines are
    # flattened to spaces so the text parses as JSON.
    jld = json.loads(raw.decode("utf-8", "ignore").replace('\n', ' '))
    return jsonld.frame(jld, citationframe)

### Process files


In [7]:
%%time

fns = oss.ls('gleaner/summoned/lipdverse')
o = [citation_frame(f) for f in fns]

results = dask.compute(*o)  ## Wait..  can I just dask.compute(o)  ????
print(len(results))

697
CPU times: user 710 ms, sys: 79.3 ms, total: 790 ms
Wall time: 6.06 s


### Loop on results and load to graph

In [13]:
# Inspect a single framed document only; printing every entry of `results`
# would flood the cell output.
sample = results[100]
print(json.dumps(sample))

{"@context": {"@vocab": "https://schema.org/"}, "@id": "http://lipdverse.org/Temp12k/1_0_2/Topptjonna.Paus.2011.html", "@type": "Dataset", "description": "This dataset from Topptjonna (Europe>Northern Europe>Scandinavia>Norway) is derived from a LakeSediment archive, and includes data on OriginalSampleID, temperature, and ReliabIeYN1. The data are relevant to the time interval from 11824 to 6133 (Calibrated)."}


In [14]:
# Prefixes registered with the knowledge graph.
rnamespaces = dict(
    schema="https://schema.org/",
    shacl="http://www.w3.org/ns/shacl#",
)

# kglab graph that collects every framed document.
rkg = kglab.KnowledgeGraph(
    name="Schema.org shacl eval datagraph",
    base_uri="https://example.org/id/",
    namespaces=rnamespaces,
)

g = rkg.rdf_graph()

# Serialise each framed document back to JSON-LD text and parse it into the graph.
for r in results:
    serialized = json.dumps(r)
    rkg.load_rdf_text(data=serialized, format="json-ld")
        
   

In [15]:
# SPARQL query selecting every (?s, ?o) pair linked by schema:description,
# i.e. each subject in the graph together with its description value.
sparql = """
PREFIX schema: <https://schema.org/>
SELECT *
  WHERE {
    ?s schema:description ?o .
  }
"""

# Run the query against the kglab graph; the tabular result is bound to `df`.
df = rkg.query_as_df(sparql)

In [16]:
# Convert the query result to a pandas DataFrame via its to_pandas() method.
pdf = df.to_pandas()
# NOTE(review): `change_color_group` is not defined anywhere in the visible
# notebook, so the styling call below stays disabled.
# pdf.style.apply(change_color_group, axis=None)
# Print the column/dtype summary, then display the first 10 rows.
pdf.info()
pdf.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 697 entries, 0 to 696
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   s       697 non-null    object
 1   o       697 non-null    object
dtypes: object(2)
memory usage: 11.0+ KB


Unnamed: 0,s,o
0,<http://lipdverse.org/Temp12k/1_0_2/Wonderkrat...,This dataset from Wonderkrater (Africa>Souther...
1,<http://lipdverse.org/Temp12k/1_0_2/GeoB5844_2...,This dataset from GeoB5844_2 (Indian Ocean>Red...
2,<http://lipdverse.org/Temp12k/1_0_2/BuntesMoor...,This dataset from Buntes Moor (Europe>Western ...
3,<http://lipdverse.org/Temp12k/1_0_2/Mohawk.Web...,This dataset from Mohawk (North America>United...
4,<http://lipdverse.org/Temp12k/1_0_2/MD98_2195....,This dataset from MD98_2195 (Pacific Ocean>Wes...
5,<http://lipdverse.org/Temp12k/1_0_2/Churruca.C...,This dataset from Churruca (South America>Chil...
6,<http://lipdverse.org/Temp12k/1_0_2/LagunaChap...,This dataset from Laguna Chaplin (South Americ...
7,<http://lipdverse.org/Temp12k/1_0_2/Sfl4-1.Wil...,This dataset from sfl4-1 (North America>Greenl...
8,<http://lipdverse.org/Temp12k/1_0_2/HiddenLake...,This dataset from Hidden Lake CA (North Americ...
9,<http://lipdverse.org/Temp12k/1_0_2/Laihalampi...,This dataset from Laihalampi (Europe>Northern ...


In [17]:
# Optional export to parquet and or CSV
OUTPUT_FILE = "./output/frameresults.parquet"
# Create the target directory first so the write does not fail on a fresh
# checkout where ./output/ does not exist yet; a no-op when it already exists.
pathlib.Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(OUTPUT_FILE)