# GleanerIO JSON-LD Framing Processor

## About

- **TODO:** convert this notebook from doing SHACL validation to doing JSON-LD framing.


### Imports


In [1]:
import io
import json
import pathlib

import dask
import kglab
import pandas as pd
import s3fs
from dask.distributed import Client    #, progress
from pyld import jsonld
from rdflib import Graph  #, plugin

### Settings and options

In [2]:
# Anonymous S3 file system handle pointed at the https://oss.geodex.org endpoint.
# Every later cell that lists or opens remote objects goes through `oss`.
s3_client_options = {"endpoint_url": "https://oss.geodex.org"}
oss = s3fs.S3FileSystem(anon=True, client_kwargs=s3_client_options)

### Set up Dask

In [6]:
# Start a local Dask cluster with 4 workers of 5 threads each and attach a client to it.
# `Client` is imported together with the notebook's other imports in the first code cell,
# so that all dependencies are visible in one place.
client = Client(n_workers=4, threads_per_worker=5)

# Display the client summary (dashboard link, workers, threads, memory) as the cell output.
client

0,1
Connection method: Cluster object,Cluster type: LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Status: running,Using processes: True
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads:  20,Total memory:  31.17 GiB

0,1
Comm: tcp://127.0.0.1:43261,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads:  20
Started:  Just now,Total memory:  31.17 GiB

0,1
Comm: tcp://127.0.0.1:43823,Total threads: 5
Dashboard: http://127.0.0.1:37475/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:35915,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-8ljx660e,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-8ljx660e
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 3.94 GiB

0,1
Comm: tcp://127.0.0.1:38989,Total threads: 5
Dashboard: http://127.0.0.1:35135/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:38403,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-g1n7y4xx,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-g1n7y4xx
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 3.94 GiB

0,1
Comm: tcp://127.0.0.1:39957,Total threads: 5
Dashboard: http://127.0.0.1:37989/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:40327,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-ndg4wahb,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-ndg4wahb
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 3.94 GiB

0,1
Comm: tcp://127.0.0.1:42133,Total threads: 5
Dashboard: http://127.0.0.1:44803/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:42523,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-y5yv_7pk,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/validation/dask-worker-space/worker-y5yv_7pk
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 3.94 GiB


## Framing Playground

A little playground to test frames

In [3]:
# Playground frame: select schema.org Course nodes and keep only their description.
# "@explicit" is a framing flag and is given as a real boolean; the former string "true"
# only behaved correctly because any non-empty string (even "false") is truthy.
myframe = {
    "@context": {"@vocab": "https://schema.org/"},
    "@type": "Course",
    "@explicit": True,
    "description": {},
}

In [4]:
# Apply the playground frame to a local sample data graph and print the framed result.
dg = './datagraphs/datagraph.json'

# Newlines are flattened to spaces before parsing -- presumably so that documents with
# raw line breaks inside string values still load; json.loads rejects those by default.
jld = json.loads(pathlib.Path(dg).read_text().replace('\n', ' '))
myframed = jsonld.frame(jld, myframe)

print(myframed)


{'@context': {'@vocab': 'https://schema.org/'}, '@type': 'Course', 'description': "In this course you will get an introduction to the main tools and ideas in the data scientist's toolbox..."}


## Dask processing

### Define Delayed Definitions

In [19]:
# Simple JSON-LD framing wrapped as a lazy Dask task
@dask.delayed
def citation_frame(fn):
    """Frame one JSON-LD object from the object store as a schema.org Dataset.

    Despite the name, the frame currently keeps the ``description`` property;
    replace the ``"description"`` key with ``"citation"`` to extract citations.

    Args:
        fn: Path of the object to read, in the form returned by ``oss.ls``.
            It is opened through the module-level ``oss`` file system.

    Returns:
        Whatever ``jsonld.frame`` produces for the document and the frame
        defined below.

    Raises:
        Any error raised while opening the object, parsing its JSON, or
        framing it propagates to the caller unchanged.
    """
    # "@explicit" is a framing flag and is given as a real boolean; the former string
    # "true" only behaved correctly because any non-empty string (even "false") is truthy.
    citationframe = {
        "@context": {"@vocab": "https://schema.org/"},
        "@type": "Dataset",
        "@explicit": True,
        "description": {},
    }

    # Read the raw bytes and release the remote handle before doing any parsing or framing.
    with oss.open(fn, 'rb') as f:
        raw = f.read()

    # Undecodable bytes are dropped and newlines are flattened to spaces -- presumably so
    # that harvested documents with stray bytes or raw line breaks inside strings still parse.
    text = raw.decode("utf-8", "ignore").replace('\n', ' ')
    jld = json.loads(text)
    return jsonld.frame(jld, citationframe)

### Process files


In [20]:
%%time

# List every object under the lipdverse prefix and build one lazy framing task per object.
fns = oss.ls('gleaner/summoned/lipdverse')
o = list(map(citation_frame, fns))

# Unpacking the task list makes dask.compute return a flat tuple with one framed
# document per object. dask.compute(o) also works, but it returns a 1-tuple that
# wraps the whole list, so the results would need one more level of indexing.
results = dask.compute(*o)
print(len(results))

697
CPU times: user 1.13 s, sys: 119 ms, total: 1.25 s
Wall time: 5.63 s


### Loop over the results and load them into the graph

In [30]:
# Show only the first framed document as a sample; printing every entry of
# `results` would produce a very large amount of output.
first_result = results[0]
print(json.dumps(first_result))

{"@context": {"@vocab": "https://schema.org/"}, "@id": "http://lipdverse.org/Temp12k/1_0_2/Wonderkrater.Truc.2013.html", "@type": "Dataset", "description": "This dataset from Wonderkrater (Africa>Southern Africa>South Africa) is derived from a Peat archive, and includes data on ageMin, AgeOld, temperature, ReliabIeYN1, uncertaintyHigh, uncertaintyLow, ReliabIeYN2, and temperatureComposite. The data are relevant to the time interval from 19825 to -29 (Calibrated)."}


In [22]:
# Prefixes registered on the knowledge graph. Only `schema` is used by the query
# cell that follows; `shacl` is carried along unchanged.
rnamespaces = dict(
    schema="https://schema.org/",
    shacl="http://www.w3.org/ns/shacl#",
)

# Knowledge graph that collects all framed documents.
rkg = kglab.KnowledgeGraph(
    base_uri="https://example.org/id/",
    name="Schema.org shacl eval datagraph",
    namespaces=rnamespaces,
)

# Handle on the graph object returned by kglab; not used by the cells shown here.
g = rkg.rdf_graph()

# Serialize each framed document back to JSON-LD text and parse it into the graph.
for framed_doc in results:
    jsonld_text = json.dumps(framed_doc)
    rkg.load_rdf_text(data=jsonld_text, format="json-ld")
        
   

In [33]:
# Select every subject (?s) together with its schema:description value (?o)
# from the graph that the framed documents were loaded into.
sparql = """
PREFIX schema: <https://schema.org/>
SELECT *
  WHERE {
    ?s schema:description ?o .
  }
"""

# Run the query through kglab and keep the bindings as a data frame for the cells below.
df = rkg.query_as_df(sparql)

In [34]:
# The query result needs `.to_pandas()` in this environment, which suggests a non-pandas
# (presumably GPU-backed) frame -- TODO confirm. A plain pandas DataFrame has no such
# method, so convert only when it exists; that keeps this cell working in both cases.
pdf = df.to_pandas() if hasattr(df, "to_pandas") else df

# Summarize the columns, then preview the first ten rows as the cell output.
pdf.info()
pdf.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 697 entries, 0 to 696
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   o       697 non-null    object
 1   s       697 non-null    object
dtypes: object(2)
memory usage: 11.0+ KB


Unnamed: 0,o,s
0,"This dataset from Wonderkrater (Africa>Southern Africa>South Africa) is derived from a Peat archive, and includes data on ageMin, AgeOld, temperature, ReliabIeYN1, uncertaintyHigh, uncertaintyLow, ReliabIeYN2, and temperatureComposite. The data are relevant to the time interval from 19825 to -29 (Calibrated).",<http://lipdverse.org/Temp12k/1_0_2/Wonderkrater.Truc.2013.html>
1,"This dataset from GeoB5844_2 (Indian Ocean>Red Sea) is derived from a MarineSediment archive, and includes data on ageDuplicate, ageOriginal, d18o_ruber, d18o_acicula, d18o_inflata, d18o_mabahethi, d18o_marginata, Uk37, temperature, and ageMedianBacon. The data are relevant to the time interval from 21014 to 516 (BP).",<http://lipdverse.org/Temp12k/1_0_2/GeoB5844_2.Arz.2003.html>
2,"This dataset from Buntes Moor (Europe>Western Europe>Austria) is derived from a Peat archive, and includes data on GDD5, temperature, temperatureComposite, and precipitation. The data are relevant to the time interval from 6521 to 330 (BP).",<http://lipdverse.org/Temp12k/1_0_2/BuntesMoor.Weirich.1980.html>
3,"This dataset from Mohawk (North America>United States Of America>Connecticut) is derived from a LakeSediment archive, and includes data on temperature and precipitation. The data are relevant to the time interval from 16337 to 215 (yr 14C BP).",<http://lipdverse.org/Temp12k/1_0_2/Mohawk.Webb.1986.html>
4,"This dataset from MD98_2195 (Pacific Ocean>Western Pacific Ocean>East China Sea) is derived from a MarineSediment archive, and includes data on ageDuplicate, ageOriginal, Uk37, TEX86, temperature, and ageMedianBacon. The data are relevant to the time interval from 23203 to 194 (BP).",<http://lipdverse.org/Temp12k/1_0_2/MD98_2195.Ijiri.2005.html>
5,"This dataset from Churruca (South America>Chile) is derived from a MarineSediment archive, and includes data on core and temperature. The data are relevant to the time interval from 11570 to 170 (BP).",<http://lipdverse.org/Temp12k/1_0_2/Churruca.Caniupan.2014.html>
6,"This dataset from Laguna Chaplin (South America>Bolivia) is derived from a LakeSediment archive, and includes data on ageMin, AgeOld, temperature, uncertaintyHigh, uncertaintyLow, and ReliabIeYN1. The data are relevant to the time interval from 15144 to -44 (Calibrated).",<http://lipdverse.org/Temp12k/1_0_2/LagunaChaplin.Punyasena.2008.html>
7,"This dataset from sfl4-1 (North America>Greenland) is derived from a LakeSediment archive, and includes data on OM. The data are relevant to the time interval from 7400 to 12 (BP).",<http://lipdverse.org/Temp12k/1_0_2/Sfl4-1.Willemse.1999.html>
8,"This dataset from Hidden Lake CA (North America>United States Of America>California) is derived from a LakeSediment archive, and includes data on temperature, uncertainty, and reliable. The data are relevant to the time interval from 15314 to 16 (BP).",<http://lipdverse.org/Temp12k/1_0_2/HiddenLakeCA.Potito.2006.html>
9,"This dataset from Laihalampi (Europe>Northern Europe>Scandinavia>Finland) is derived from a LakeSediment archive, and includes data on temperature. The data are relevant to the time interval from 8995 to 49 (BP).",<http://lipdverse.org/Temp12k/1_0_2/Laihalampi.Heikkila.2003.html>


In [15]:
# Optional export to parquet and or CSV
# The parquet writer does not create missing directories, so make sure ./output exists first.
pathlib.Path("./output").mkdir(parents=True, exist_ok=True)
df.to_parquet("./output/frameresults.parquet")