In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pprint, json, pickle
# URIs excluded from pooling: generate_pools skips every owl:sameAs statement
# whose subject or object appears in this list.
blacklist = ['http://www.dnb.de/DE/Service/DigitaleDienste/EntityFacts/entityfacts_node.html']

In [2]:
def generate_pools(query_results, blacklist):
    """
    BEHAVIOR
    Generates sets of URIs connected via an owl:sameAs statement.
    Statements are processed one by one: a statement with one already-known URI
    extends that URI's pool, a statement with two unknown URIs opens a new pool,
    and a statement linking two different existing pools fuses them into one.
    The returned pools are therefore pairwise disjoint.
    INPUT
    query_results: SPARQL JSON result (dict); every binding under
                   ['results']['bindings'] must provide 's' and 'same' values
    blacklist: list of URIs to discard from the input; a statement is skipped
               as soon as one of its two URIs is blacklisted
    OUTPUT
    A list of sets, each set is a pool of URIs connected via an owl:sameAs statement.
    Pools are ordered by the position of the statement that first created them.
    """
    print("Generating pools...", end=" ")
    blocked = frozenset(blacklist)  # constant-time membership tests
    e_index = {}  # dict {resource_uri: index of its pool in `pools`}
    pools = []    # list of sets; a pool absorbed by another one is set to None

    # Read the bindings from the argument, not from a notebook-level global.
    for res in query_results['results']['bindings']:
        s = res['s']['value']
        same = res['same']['value']

        if s in blocked or same in blocked:
            continue

        s_pool = e_index.get(s)
        same_pool = e_index.get(same)

        if s_pool is None and same_pool is None:  # create new pool
            e_index[s] = e_index[same] = len(pools)
            pools.append({s, same})
        elif same_pool is None:  # extend the pool of s
            pools[s_pool].add(same)
            e_index[same] = s_pool
        elif s_pool is None:  # extend the pool of same
            pools[same_pool].add(s)
            e_index[s] = same_pool
        elif s_pool != same_pool:
            # The statement bridges two existing pools: fold the later pool into
            # the earlier one so that the output order stays stable.
            keep, drop = sorted((s_pool, same_pool))
            for uri in pools[drop]:
                e_index[uri] = keep
            pools[keep] |= pools[drop]
            pools[drop] = None  # placeholder keeps the other indexes valid

    pools = [pool for pool in pools if pool is not None]
    print("Done! Generated {} pools of resource URIs from Judaica Link.".format(len(pools)))
    return pools

In [3]:
def get_pools_to_be_merged(pools):
    """
    BEHAVIOR
    Checks for overlapping pools of entities in the generated pools.
    Two pools overlap when at least one URI is a member of both.
    INPUT
    pools: a list of sets of URIs
    OUTPUT
    to_be_merged: set of tuples, each tuple contains the sorted indexes of the pools
                  that share one particular URI
    touched_pools: set of integers, indexes of all pools that are concerned by overlapping
    """

    print("Processing {} pools of URIs for extracting overlaps...".format(len(pools)))
    uri2pool = {}  # dict {uri: set of indexes of the pools containing it}

    # enumerate() yields the real position of each pool; looking the position up
    # with pools.index() is quadratic and returns the first match only, which
    # hides overlaps between pools that have equal content.
    for pool_index, pool in enumerate(pools):
        for uri in pool:
            uri2pool.setdefault(uri, set()).add(pool_index)

    to_be_merged = set()
    touched_pools = set()
    for pool_indexes in uri2pool.values():
        if len(pool_indexes) > 1:  # means that an URI belongs to more than one pool
            to_be_merged.add(tuple(sorted(pool_indexes)))
            touched_pools.update(pool_indexes)
    print("Found {} sets of pools to merge...".format(len(to_be_merged)))
    return (to_be_merged, touched_pools)

In [4]:
def merge_pools(pools, to_be_merged, touched_pools):
    """
    BEHAVIOR
    Merges pools of URIs that overlap (that have some URI in common).
    Index groups that share a pool index are chained together, so pools linked
    only indirectly (0 with 1, 1 with 2) end up in one single merged pool.
    INPUT
    pools: list of sets, each is a pool of URIs
    to_be_merged: collection of tuples, each tuple contains indexes of pools that belong together
    touched_pools: set of integers, indexes of all pools that are concerned by overlapping
    OUTPUT
    pools_merged: list of sets; first the untouched pools in their original order,
                  then one new set per group of connected pools (ordered by the
                  smallest pool index of the group). The input sets are not modified.
                  When to_be_merged is empty, the input list itself is returned.
    """

    if len(to_be_merged) == 0:
        print("No pools to merge! Returning pools.")
        return pools

    print("Merging pools...")
    print("Initial number of pools: {}".format(len(pools)))

    parent = {}  # union-find forest over pool indexes

    def find(index):
        # Follow parent links up to the representative, halving the path on the way.
        while parent[index] != index:
            parent[index] = parent[parent[index]]
            index = parent[index]
        return index

    for pool_indexes in to_be_merged:
        roots = set()
        for i in pool_indexes:
            parent.setdefault(i, i)
            roots.add(find(i))
        if roots:
            # The smallest index always becomes the representative of the group.
            anchor = min(roots)
            for root in roots:
                parent[root] = anchor

    # A pool is left out of the untouched part when it is flagged as touched or
    # when it takes part in a merge group (so it can never be emitted twice).
    skipped = set(touched_pools) | set(parent)
    pools_merged = [pool for i, pool in enumerate(pools) if i not in skipped]

    groups = {}  # dict {representative index: union of the member pools}
    for i in sorted(parent):
        groups.setdefault(find(i), set()).update(pools[i])
    pools_merged.extend(groups[root] for root in sorted(groups))

    print("Final number of pools: {}".format(len(pools_merged)))
    return pools_merged

In [5]:
def generate_entity_pages(pools, save=False, pathout=""):
    """
    BEHAVIOR
    Generates URIs for entity pages: every pool receives a consecutive numeric
    identifier, starting at 1000000, appended to the entity page base URI.
    INPUT
    pools: list of sets
    save: when true, the resulting dictionary is pickled to pathout
    pathout: target file of the pickle; a leading '~' is expanded to the home directory
    OUTPUT:
    entity_pages: dict, {entity_page_URI: [resource_URI_1, resource_URI_2, etc]}
    """
    import os  # function-scope import, only needed to resolve '~' in pathout

    print("Generating entity pages out of {} pools of entities...".format(len(pools)))
    base = 'http://data.judaicalink.org/data/ep/'
    entity_pages = {
        base + str(page_id): list(pool)
        for page_id, pool in enumerate(pools, start=1000000)
    }
    print("...Done! Generated {} entity pages!".format(len(entity_pages)))
    if save:
        # open() does not expand '~' by itself, so resolve it explicitly.
        target = os.path.expanduser(pathout)
        print("Saving entity pages to {}...".format(target))
        with open(target, 'wb') as outfile:
            pickle.dump(entity_pages, outfile)

    return entity_pages

In [6]:
def get_ep_inverted_index(entity_pages, save=False, pathout=""):
    """
    BEHAVIOR
    Generates an inverted index {resource_URI: entity_page_URI}
    Checks the consistency of the entity pages, each resource_URI must have exactly one entity_page_URI.
    A resource URI listed under several entity pages is counted as an inconsistency
    and keeps the entity page under which it was encountered first, so the result
    is the same on every run.
    INPUT
    entity_pages: entity page dictionary, {entity_page_URI: iterable of resource URIs}
    save: when true, the inverted index is pickled to pathout
    pathout: target file of the pickle; a leading '~' is expanded to the home directory
    OUTPUT
    uri2ep: dict, {resource_URI: entity_page_URI}
    """
    import os  # function-scope import, only needed to resolve '~' in pathout

    print("Generating inverted index resource-to-entity_page...")
    uri2ep = {}
    conflicting = set()  # resource URIs found under more than one entity page
    for ep, uris in entity_pages.items():
        for uri in uris:
            # setdefault keeps the first entity page seen for this URI.
            if uri2ep.setdefault(uri, ep) != ep:
                conflicting.add(uri)

    print("Checking consistency of inverted index...")
    if conflicting:
        print("WARNING! Found {} inconsistent entity pages!".format(len(conflicting)))
    else:
        print("The index is consistent! :)")

    if save:
        # open() does not expand '~' by itself, so resolve it explicitly.
        target = os.path.expanduser(pathout)
        print("Saving inverted index to {}...".format(target))
        with open(target, 'wb') as outfile:
            pickle.dump(uri2ep, outfile)

    print("Done!")
    return uri2ep

In [7]:
# Fetch every owl:sameAs statement from the public JudaicaLink SPARQL endpoint.
sparql = SPARQLWrapper("http://data.judaicalink.org/sparql/query") # changed from old:http://localhost:3040/judaicalink/query
# The query selects subject (?s) and object (?same) of each owl:sameAs triple
# found in any named graph (?g). The skos prefix is declared but not used.
sparql.setQuery("""
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    SELECT ?s ?same
    WHERE { GRAPH ?g {
        ?s owl:sameAs ?same
        }}
""")
# Request JSON; the converted response is indexed downstream as
# results['results']['bindings'] (presumably nested dicts/lists; confirm against SPARQLWrapper).
sparql.setReturnFormat(JSON)
# Network call: executes the query and keeps the converted response in `results`.
results = sparql.query().convert()

In [None]:
# Pipeline: sameAs statements -> URI pools -> merged pools -> entity pages -> inverted index.
pools = generate_pools(results, blacklist)
# Show a small sample only; printing every pool floods the notebook output.
pprint.pprint(pools[:5])
to_merge, touched_pools = get_pools_to_be_merged(pools)
merged = merge_pools(pools, to_merge, touched_pools)
entity_pages = generate_entity_pages(merged, save=False, pathout='~/jl/output/entity_pages/ep_data.pickle')
# Same here: preview the first few entity pages instead of the whole dictionary.
pprint.pprint(dict(list(entity_pages.items())[:5]))
ep_inverted_index = get_ep_inverted_index(entity_pages, save=False, pathout='~/jl/output/entity_pages/ep_inverted_index.pickle')