In [33]:
import pprint, os

from SPARQLWrapper import SPARQLWrapper, JSON
from rdflib import Graph, URIRef, Namespace, Literal
from langdetect import DetectorFactory
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

In [25]:
# Ask the JudaicaLink SPARQL endpoint for every entity page (?ep) listed in
# the `ep` named graph together with the abstracts attached to the resources
# it is linked to via owl:sameAs. Results are requested as JSON.
sparql = SPARQLWrapper("http://data.judaicalink.org/sparql/query")
sparql.setReturnFormat(JSON)
sparql.setQuery("""
    PREFIX jl: <http://data.judaicalink.org/ontology/>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    SELECT ?ep ?abstract
    WHERE {
        GRAPH <http://data.judaicalink.org/data/ep> {
            ?ep owl:sameAs ?same 
        }
        ?same jl:hasAbstract ?abstract
        }
        
""")
# Run the query and parse the JSON response into nested dicts/lists.
results = sparql.query().convert()

In [26]:
# Sanity check: number of (entity page, abstract) rows returned by the query.
n_bindings = len(results["results"]["bindings"])
print(n_bindings)

51978


In [34]:
# Group the query results by entity page, keeping only the abstracts that
# langdetect classifies as German or English.
#
# langdetect is randomised by default, so the same (short or mixed-language)
# text may be classified differently from one run to the next. Pinning the
# seed before detecting makes this filter reproducible across kernel restarts.
DetectorFactory.seed = 0

abstracts = {}
for i, res in enumerate(results['results']['bindings'], start=1):
    if i % 1000 == 0:
        print(i, end=" ")  # lightweight progress indicator, one tick per 1000 rows
    ep = res['ep']['value']
    ab = res['abstract']['value']

    # Register the page even when none of its abstracts qualify, so pages
    # without a German/English abstract end up with an empty list.
    candidates = abstracts.setdefault(ep, [])
    try:
        if detect(ab) in ('de', 'en'):  # select only abstracts in German or English
            candidates.append(ab)
    except LangDetectException:
        # Abstracts that langdetect cannot classify are deliberately skipped.
        continue

1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 12000 13000 14000 15000 16000 17000 18000 19000 20000 21000 22000 23000 24000 25000 26000 27000 28000 29000 30000 31000 32000 33000 34000 35000 36000 37000 38000 39000 40000 41000 42000 43000 44000 45000 46000 47000 48000 49000 50000 51000 

In [36]:
# Drop entity pages for which no abstract was kept.
abstracts = {
    page: texts
    for page, texts in abstracts.items()
    if texts
}

In [37]:
# When an entity page has several abstracts, keep only the longest one
# (on equal length, max() returns the first candidate in list order).
abstracts_final = {
    page: max(candidates, key=len)
    for page, candidates in abstracts.items()
}

In [41]:
# Build an RDF graph with one jl:hasAbstract triple per entity page and
# write it to ep_abstracts.ttl in Turtle format.
g = Graph()
jl = Namespace('http://data.judaicalink.org/ontology/')
g.bind('jl', jl)  # serialise the ontology namespace with the `jl:` prefix
for page_uri, chosen_abstract in abstracts_final.items():
    # Subject is the entity page URI; object is the abstract as a plain literal.
    g.add((URIRef(page_uri), jl.hasAbstract, Literal(chosen_abstract)))
g.serialize('ep_abstracts.ttl', format='ttl')