# GleanerIO JSON-LD Framing Processor

## About

- convert this from doing SHACL to doing framing


### Imports


In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import dask
import boto3
import pandas as pd
import json, io
import getpass
import pathlib
import s3fs
import kglab
from rdflib import Graph  #, plugin
from pyld import jsonld

### Set up Dask client

In [2]:
from dask.distributed import Client    #, progress
client = Client(threads_per_worker=5, n_workers=4)
client

0,1
Connection method: Cluster object,Cluster type: LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Status: running,Using processes: True
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads:  20,Total memory:  31.18 GiB

0,1
Comm: tcp://127.0.0.1:37081,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads:  20
Started:  Just now,Total memory:  31.18 GiB

0,1
Comm: tcp://127.0.0.1:38455,Total threads: 5
Dashboard: http://127.0.0.1:35041/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:33185,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/framing/dask-worker-space/worker-7prhuwiy,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/framing/dask-worker-space/worker-7prhuwiy
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 4.00 GiB

0,1
Comm: tcp://127.0.0.1:41791,Total threads: 5
Dashboard: http://127.0.0.1:32915/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:33767,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/framing/dask-worker-space/worker-n_nqjuqe,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/framing/dask-worker-space/worker-n_nqjuqe
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 4.00 GiB

0,1
Comm: tcp://127.0.0.1:36003,Total threads: 5
Dashboard: http://127.0.0.1:36573/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:35273,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/framing/dask-worker-space/worker-a_fqxk0f,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/framing/dask-worker-space/worker-a_fqxk0f
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 4.00 GiB

0,1
Comm: tcp://127.0.0.1:33285,Total threads: 5
Dashboard: http://127.0.0.1:37445/status,Memory: 7.79 GiB
Nanny: tcp://127.0.0.1:38487,
Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/framing/dask-worker-space/worker-jc5xrd69,Local directory: /home/fils/Containers/dvols/jupyter/work/Gleaner/notebooks/framing/dask-worker-space/worker-jc5xrd69
GPU: NVIDIA GeForce GTX 1050 Ti,GPU memory: 4.00 GiB


### File Access

In [3]:
## Set up the S3 File system
# oss = s3fs.S3FileSystem(
#     anon=True,
#     client_kwargs = {"endpoint_url":"https://oss.geocodes-dev.earthcube.org"}
# )

## Access controlled s3
session = boto3.Session(profile_name='default' ,   region_name="us-east-1")
s3 = session.client('s3')  # needed later for listing objects
s3r = session.resource('s3')
oss = s3fs.S3FileSystem( profile="default")


## Manual code access
# ACCESS_CODE = getpass.getpass()
# SECRET_CODE = getpass.getpass()

# oss = s3fs.S3FileSystem(
#     anon=False,
#     key=ACCESS_CODE,
#     secret=SECRET_CODE,
#     client_kwargs = {"endpoint_url":"https://oss.geocodes-dev.earthcube.org"}
# )

## Dask processing

### Define Delayed Definitions

In [4]:
# Simple JSON-LD framing inside Dask function
@dask.delayed()
def citation_frame(fn):
    # or preferably open in text mode and json.load from the file
    frame = {"@context":{"@vocab": "https://schema.org/"}, "@type": "Organization", "@explicit": "true", "description": {}};
    #   citationframe = {"@context":{"@vocab": ctx}, "@type": "Dataset", "@explicit": "true", "citation": {}};
    with oss.open(fn, 'rb') as f:
        #return json.loads(f.read().replace('\n',' '))
        try:
            jld = json.loads(f.read().decode("utf-8", "ignore").replace('\n',' '))
            framed = jsonld.frame(jld, frame)
            return framed   # ['citation']
        except:
            return ""

In [5]:
%%time
# process the files

fns = oss.ls('gleaner.oih/summoned/edmo')
o = [citation_frame(f) for f in fns]

results = dask.compute(*o)  ## Wait..  can I just dask.compute(o)  ????
print(len(results))

4527
CPU times: user 4.47 s, sys: 360 ms, total: 4.83 s
Wall time: 35.2 s


### Loop on results and load to graph

In [6]:
### Single print
print(json.dumps(results[10]))
### Loop on results and load to graph
# You likely don't want to do this..   prints a lot of data
# for r in range(len(results)):
#     print(r)
#     print(json.dumps(results[r]))

{"@context": {"@vocab": "https://schema.org/"}, "@id": "https://edmo.seadatanet.org/report/343", "@type": "Organization", "description": "The Department of Geology of the Faculty of Science of the Sea is carrying out sediment research in diverse coastal sedimentary environments. These studies have been performed by professors and postgraduate and postdoctoral researchers. The research covers the production of process-answer models for the studied environment."}


In [7]:
rnamespaces = {
    "schema":  "https://schema.org/",
    "shacl":   "http://www.w3.org/ns/shacl#" ,
}

rkg = kglab.KnowledgeGraph(
    name = "Schema.org shacl eval datagraph",
    base_uri = "https://example.org/id/",
    namespaces = rnamespaces,
)

g = rkg.rdf_graph()

for r in results:
    rkg.load_rdf_text(data=json.dumps(r), format="json-ld") 
        
   

In [8]:
sparql = """
PREFIX schema: <https://schema.org/>
SELECT *
  WHERE {
    ?s schema:description ?o .
  }
"""

df = rkg.query_as_df(sparql)

In [9]:
pdf = df.to_pandas()
# pdf.style.apply(change_color_group, axis=None)
pdf.info()
pdf.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3430 entries, 0 to 3429
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   s       3430 non-null   object
 1   o       3430 non-null   object
dtypes: object(2)
memory usage: 53.7+ KB


Unnamed: 0,s,o
0,<https://edmo.seadatanet.org/report/3600>,Oceaneering is a global oilfield provider of e...
1,<https://edmo.seadatanet.org/report/714>,Estonian Marine Institute is one of many Tartu...
2,<https://edmo.seadatanet.org/report/5102>,Oxford Archaeology (OA) has a 40-year history ...
3,<https://edmo.seadatanet.org/report/3516>,Appalachian State University is located in Boo...
4,<https://edmo.seadatanet.org/report/3963>,Woodside is Australia&rsquo;s largest independ...
5,<https://edmo.seadatanet.org/report/75>,For information relating to PowerGen Plc conta...
6,<https://edmo.seadatanet.org/report/2106>,The Norwegian Coastal Administration (Kystverk...
7,<https://edmo.seadatanet.org/report/773>,Galway Campus is the largest of GMIT's five ca...
8,<https://edmo.seadatanet.org/report/343>,The Department of Geology of the Faculty of Sc...
9,<https://edmo.seadatanet.org/report/2402>,The Max Planck Institute for Meteorology is an...


In [10]:
# Optional export to parquet and or CSV
df.to_parquet("./output/frameresults.parquet")

In [11]:
import marqo

mq = marqo.Client(url='http://ghost.lan:8882')


for index, row in pdf.iterrows():
    # print(row['s'], row['o'])
    mq.index("my-first-index").add_documents([{"Title": row['s'],"Description": row['o'] }])

2022-12-29 21:56:48,650 logger:'marqo' INFO add_documents pre-processing: took 0.000s for 1 docs, for an average of 0.000s per doc.
2022-12-29 21:56:48,955 logger:'marqo' INFO add_documents roundtrip: took 0.304s to send 1 docs to Marqo (roundtrip, unbatched), for an average of 0.304s per doc.
2022-12-29 21:56:48,955 logger:'marqo' INFO add_documents Marqo index: took 0.289s for Marqo to process & index 1 docs (server unbatched), for an average of 0.289s per doc.
2022-12-29 21:56:48,955 logger:'marqo' INFO add_documents completed. total time taken: 0.305s.
2022-12-29 21:56:48,956 logger:'marqo' INFO add_documents pre-processing: took 0.000s for 1 docs, for an average of 0.000s per doc.
2022-12-29 21:56:49,070 logger:'marqo' INFO add_documents roundtrip: took 0.114s to send 1 docs to Marqo (roundtrip, unbatched), for an average of 0.114s per doc.
2022-12-29 21:56:49,071 logger:'marqo' INFO add_documents Marqo index: took 0.111s for Marqo to process & index 1 docs (server unbatched), for