# Ingestion Testing

In [6]:
import datetime, json, os, rdflib, redis
# Redis client created with the library's default connection settings.
ldfs_cache = redis.StrictRedis()
# Directory the Lua script files are read from.
LUA_LOCATION = "/Users/jeremynelson/2016/linked-data-fragments/lib/"
# Script name -> value returned by Redis when the script is loaded.
LUA_SCRIPTS = {}
def setup():
    """Load the Lua scripts into Redis and record what Redis returns for each.

    For every script name, the resolved file path and whether that path
    exists are printed; the file is then read and passed to
    ``ldfs_cache.script_load``. The returned value is stored in
    ``LUA_SCRIPTS`` under the script name.
    """
    script_names = ("add_get_triple", "triple_pattern_search", "get_triple")
    for script_name in script_names:
        script_path = os.path.join(LUA_LOCATION, "{}.lua".format(script_name))
        print(script_path, os.path.exists(script_path))
        with open(script_path) as script_file:
            script_source = script_file.read()
        LUA_SCRIPTS[script_name] = ldfs_cache.script_load(script_source)


# Populate LUA_SCRIPTS as soon as this cell runs.
setup()

/Users/jeremynelson/2016/linked-data-fragments/lib/add_get_triple.lua True
/Users/jeremynelson/2016/linked-data-fragments/lib/triple_pattern_search.lua True
/Users/jeremynelson/2016/linked-data-fragments/lib/get_triple.lua True


In [7]:
def process_source(source):
    """Parse one record as JSON-LD and push each resulting triple to Redis.

    Args:
        source: A JSON-serialisable record. It is dumped to a string and
            parsed by rdflib with ``format='json-ld'``.

    Returns:
        The number of triples iterated from the parsed graph (0 when the
        graph is empty).
    """
    graph = rdflib.Graph()
    graph.namespace_manager.bind("", "http://dp.la/dataurl")
    graph.parse(data=json.dumps(source), format='json-ld')
    count = 0
    for count, (subj, pred, obj) in enumerate(graph, start=1):
        # Three keys are passed to the script: subject, predicate, object.
        ldfs_cache.evalsha(
            LUA_SCRIPTS["add_get_triple"],
            3,
            str(subj),
            str(pred),
            str(obj))
    return count

def process_dpla_json(dpla_filepath):
    """Ingest every record of a JSON file and print progress and totals.

    Args:
        dpla_filepath: Path to a JSON file whose top-level value is a
            sequence of records. Each record is passed to
            ``process_source``.

    Output (stdout): a start line, a "." every 100 records, the record
    index every 1000 records, the running triple count every 5000 records,
    and a final summary line with the elapsed minutes.
    """
    # The with-block closes the file handle as soon as the JSON is loaded.
    with open(dpla_filepath, errors='ignore') as dpla_file:
        dpla_json = json.load(dpla_file)
    start = datetime.datetime.utcnow()
    total_triples = 0
    print("Started processing {} records at {}".format(len(dpla_json), start.isoformat()))
    for i, row in enumerate(dpla_json):
        total_triples += process_source(row)
        if not i % 100 and i > 0:
            print(".", end="")
        if not i % 1000:
            print(i, end="")
        if not i % 5000 and i > 0:
            print(" triples={} ".format(total_triples))
    end = datetime.datetime.utcnow()
    # total_seconds() covers the whole interval; timedelta.seconds alone
    # drops whole days and the sub-second part.
    elapsed_minutes = (end - start).total_seconds() / 60.0
    print("Finished processing at {}, total time {} mins. Records {} Triples {}".format(
        end.isoformat(),
        elapsed_minutes,
        len(dpla_json),
        total_triples))

In [11]:
# Run the ingestion over the uiuc.js JSON file; progress and totals are printed.
process_dpla_json("/Users/jeremynelson/2016/dplafest-2016/static/json/uiuc.js")

Started processing 18231 records at 2016-04-13T17:10:20.939783
0..........1000..........2000..........3000..........4000..........5000 triples=0 
..........6000..........7000..........8000..........9000..........10000 triples=0 
..........11000..........12000..........13000..........14000..........15000 triples=0 
..........16000..........17000..........18000..Finished processing at 2016-04-13T17:10:30.167289, total time 0.15 mins. Records 18231 Triples 0


In [13]:
# Load the records from uiuc.js; the with-block closes the file after loading.
with open("/Users/jeremynelson/2016/dplafest-2016/static/json/uiuc.js") as uiuc_file:
    uiuc = json.load(uiuc_file)


In [14]:
# Number of records loaded from uiuc.js.
len(uiuc)

18231

In [19]:
# Parse a single record (index 3) as JSON-LD into a fresh graph so the
# result can be inspected in isolation.
g = rdflib.Graph()
g.parse(data=json.dumps(uiuc[3]), format="json-ld")

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [16]:
# Size of graph g.
# NOTE(review): this cell's execution count (In [16]) is lower than the
# parse cell before it (In [19]), so the recorded value may come from an
# earlier run — re-run in order to confirm.
len(g)

0

In [21]:
# Display the JSON string for record 3 — the same value handed to g.parse.
json.dumps(uiuc[3])

'{"subject": [{"name": "Legislation--Illinois--Periodicals"}], "format": "Language material", "specType": ["Serial", "Government Document"], "title": ["Legislative synopsis and digest ... General Assembly, State of Illinois. 1988: 1"], "publisher": ["Springfield, Ill. : [s.n.]"], "spatial": [{"country": "United States", "state": "Illinois", "name": "Illinois", "coordinates": "40.1139373779, -89.1587677002"}], "creator": ["Illinois. General Assembly"], "contributor": ["Illinois. General Assembly. Senate", "Illinois. General Assembly. House of Representatives", "Illinois. General Assembly. Legislative Reference Bureau"], "date": {"displayDate": "1923-", "end": null, "begin": "1923"}, "extent": ["v. ; 22-24 cm."], "identifier": ["(OCoLC)ocm04039666", "LC call number: J87 .I3 date K"], "@context": "http://dp.la/vocab", "relation": ["Illinois. General Assembly. Legislative digest. (DLC) 92640603. (OCoLC)26019719"], "type": "text", "description": ["Action on all bills and resolutions receive

In [32]:
# Parse record 3 as JSON-LD into a new, empty graph (replaces any earlier g).
g = rdflib.Graph()
g.parse(data=json.dumps(uiuc[3]), format="json-ld")

<Graph identifier=N13a00bb470074a6ebe62dcd036f6367b (<class 'rdflib.graph.Graph'>)>

In [33]:
# Print every triple held in g, one per line.
for subject, predicate, obj in g:
    print(subject, predicate, obj)