In [22]:
import os, json, pprint, pickle
from rdflib import Graph, URIRef
from rdflib.namespace import OWL
# TODO: persist the auxiliary entity-page dictionaries as str -> set mappings (e.g. with pickle)
# instead of str -> list, which is what the JSON files currently store.

# Testing the update of entity pages.
# No DELETE, SPLIT or MERGE operations are handled here, only UPDATE.
# An UPDATE operation consists of either 1) adding a URI from Judaica Link (a new URI added to JL through a new dataset)
# to an existing Entity Page, or 2) creating a new Entity Page from a sameAs statement when NEITHER its subject
# NOR its object has an Entity Page yet.
# To do this we always use two auxiliary indexes, ep2resources and resources2ep, which must always be
# up to date and consistent with the EP dataset: the consistency of the dataset itself depends on the
# consistency of these two indexes.

In [31]:
# Load the serialized Entity Pages dataset (Turtle) into an in-memory rdflib graph.
# The parse call is kept as the last expression so the cell still displays the graph.
ep = Graph()
ep.parse(
    source='/home/rovera/jl/output/entity_pages/entity_pages.ttl',
    format='ttl',
)

<Graph identifier=N239a47db61c740769d2db9d2893a25ea (<class 'rdflib.graph.Graph'>)>

In [32]:
# Load the two auxiliary indexes that accompany the Entity Pages graph.
# Context managers make sure both file handles are closed once the JSON is read.

# ep2resources: Entity Page URI -> collection of resource URIs.
with open("/home/rovera/jl/output/entity_pages/entity_pages_dictionary.json", 'r', encoding="utf-8") as ep2resources_file:
    # JSON has no set type, so the values are stored as lists; convert them to sets
    # because update_entity_pages relies on set.add().
    ep2resources = {k: set(v) for k, v in json.load(ep2resources_file).items()}

# resources2ep: inverted index, resource URI -> Entity Page URI.
with open("/home/rovera/jl/output/entity_pages/ep_inverted_index.json", 'r', encoding="utf-8") as resources2ep_file:
    resources2ep = json.load(resources2ep_file)

In [33]:
def _attach_resource_to_entity_page(ep_uri, new_resource, ep_graph, ep2r, r2ep):
    """Link ``new_resource`` to the existing Entity Page ``ep_uri`` in both indexes and in the graph."""
    # Forward index first: if ep_uri is missing from ep2r (inconsistent indexes) this
    # raises KeyError before anything else has been modified.
    ep2r[ep_uri].add(new_resource)
    # Inverted index.
    r2ep[new_resource] = ep_uri
    # Entity Page graph.
    ep_graph.add((URIRef(ep_uri), OWL.sameAs, URIRef(new_resource)))


def update_entity_pages(resource1, resource2, ep_graph, ep2r, r2ep, first_index=1000000):
    """Apply one sameAs statement (resource1, resource2) to the Entity Page data structures.

    Exactly one of four things happens:

    * both resources are already in ``r2ep``: nothing is changed;
    * only one of them is in ``r2ep``: the other one is added to that resource's
      Entity Page;
    * neither is in ``r2ep``: a new Entity Page is created for the pair, numbered
      one above the highest Entity Page number found in ``ep2r``.

    All three structures are modified in place and a summary is printed.

    Parameters
    ----------
    resource1, resource2 : str
        URIs of the two resources declared equivalent.
    ep_graph : rdflib.Graph
        Entity Page graph; receives ``(entity page, owl:sameAs, resource)`` triples.
    ep2r : dict
        Entity Page URI -> set of resource URIs.
    r2ep : dict
        Resource URI -> Entity Page URI (inverted index of ``ep2r``).
    first_index : int, optional
        Number given to the very first Entity Page when ``ep2r`` is empty.
        The default is the number of the first Entity Page shown in this notebook's
        output; it is assumed to be the dataset's starting number -- TODO confirm.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``r2ep`` points to an Entity Page that is missing from ``ep2r``.
    IndexError, ValueError
        If a key of ``ep2r`` is not an Entity Page URI ending in an integer.
    """
    ep_base = 'http://data.judaicalink.org/data/ep/'

    resource1_known = resource1 in r2ep
    resource2_known = resource2 in r2ep

    if resource1_known and resource2_known:
        # NOTE(review): this branch is also taken when the two resources belong to
        # DIFFERENT Entity Pages; no merge is attempted and the message is the same.
        print("No update needed: {} and {} are known resources.".format(resource1, resource2))
    elif resource1_known:
        # resource2 is new: attach it to resource1's Entity Page.
        _attach_resource_to_entity_page(r2ep[resource1], resource2, ep_graph, ep2r, r2ep)
        print("Updated existing data structures with {}.".format(resource2))
    elif resource2_known:
        # resource1 is new: attach it to resource2's Entity Page.
        _attach_resource_to_entity_page(r2ep[resource2], resource1, ep_graph, ep2r, r2ep)
        print("Updated existing data structures with {}.".format(resource1))
    else:
        # Both resources are unknown --> create a new Entity Page.
        # Its number is the highest existing one plus 1; with an empty ep2r the
        # default makes the first page get exactly ``first_index`` instead of
        # max() failing on an empty sequence.
        last_index = max(
            (int(ep_uri.split('/data.judaicalink.org/data/ep/')[1]) for ep_uri in ep2r),
            default=first_index - 1,
        )
        new_ep = ep_base + str(last_index + 1)
        # Forward index.
        ep2r[new_ep] = {resource1, resource2}
        # Inverted index.
        r2ep[resource1] = new_ep
        r2ep[resource2] = new_ep
        # Entity Page graph.
        new_ep_uri = URIRef(new_ep)
        ep_graph.add((new_ep_uri, OWL.sameAs, URIRef(resource1)))
        ep_graph.add((new_ep_uri, OWL.sameAs, URIRef(resource2)))

        print("Created new Entity Page {} for resources {} and {}.".format(new_ep, resource1, resource2))
        print("Updated accordingly existing data structures.")

In [14]:
# Peek at a single entry of the forward index: the Entity Page URI on one line,
# its set of resources on the next. Prints nothing if the index is empty.
for page_uri, linked_resources in ep2resources.items():
    print(page_uri, linked_resources, sep='\n')
    break

http://data.judaicalink.org/data/ep/1000000
{'https://www.deutsche-digitale-bibliothek.de/entity/124739709', 'http://id.loc.gov/authorities/nr96041780', 'http://www.wikidata.org/entity/Q46997964', 'http://www.isni.org/0000000114803494', 'http://hub.culturegraph.org/entityfacts/124739709', 'http://data.judaicalink.org/data/gnd/124739709', 'http://d-nb.info/gnd/124739709', 'http://viaf.org/viaf/115428728', 'http://d-nb.info/gnd/124739709/about', 'http://catalogue.bnf.fr/ark:/12148/cb12545882q'}


In [34]:
# Manual test of update_entity_pages: one sameAs pair per branch of the function.
# The calls modify ep, ep2resources and resources2ep in place.
sample_pairs = [
    # test case 1: both resources already known
    ('http://d-nb.info/gnd/124739709/about', 'http://www.isni.org/0000000114803494'),
    # test case 2: first resource known, second one new
    ('http://d-nb.info/gnd/124739709/about', 'http://www.isni.org/blablablablabla'),
    # test case 3: second resource known, first one new
    ('http://d-nb.info/gnd/sticazzi/about', 'http://www.isni.org/0000000114803494'),
    # test case 4: both resources new
    ('http://www.pincopallo.net', 'http://www.pincopallo.com'),
]
for r1, r2 in sample_pairs:
    update_entity_pages(r1, r2, ep, ep2resources, resources2ep)

No update needed: http://d-nb.info/gnd/124739709/about and http://www.isni.org/0000000114803494 are known resources.
Updated existing data structures with http://www.isni.org/blablablablabla.
Updated existing data structures with http://d-nb.info/gnd/sticazzi/about.
Created new Entity Page http://data.judaicalink.org/data/ep/1049606 for resources http://www.pincopallo.net and http://www.pincopallo.com.
Updated accordingly existing data structures.


In [16]:
# Show the resources currently linked to one Entity Page, to inspect the effect of the updates.
inspected_page = 'http://data.judaicalink.org/data/ep/1000000'
print(ep2resources[inspected_page])

{'https://www.deutsche-digitale-bibliothek.de/entity/124739709', 'http://id.loc.gov/authorities/nr96041780', 'http://d-nb.info/gnd/sticazzi/about', 'http://www.wikidata.org/entity/Q46997964', 'http://www.isni.org/0000000114803494', 'http://www.isni.org/blablablablabla', 'http://hub.culturegraph.org/entityfacts/124739709', 'http://data.judaicalink.org/data/gnd/124739709', 'http://d-nb.info/gnd/124739709', 'http://viaf.org/viaf/115428728', 'http://d-nb.info/gnd/124739709/about', 'http://catalogue.bnf.fr/ark:/12148/cb12545882q'}
