In [39]:
from SPARQLWrapper import SPARQLWrapper, JSON
from elasticsearch import Elasticsearch
import json, csv, requests, time, pickle
from urllib.parse import unquote
es = Elasticsearch()

In [6]:
# Load the TagMe linker output: a tab-separated file read fully into a list of
# rows (each row is a list of strings).
# newline='' is what the csv module asks for, so that newlines embedded in
# quoted fields are handed to the reader unmodified.
with open('/data/cm/output/linker/cm_entities_tagme.tsv', newline='') as csv_file:
    tagme_output = list(csv.reader(csv_file, delimiter='\t', quotechar='"'))

In [22]:
def wiki2wikidata(title, lang):
    """Ask the Wikidata API which entity belongs to a Wikipedia page title.

    Issues a ``wbgetentities`` request against ``https://www.wikidata.org``
    for the page ``title`` on the site ``lang + 'wiki'``.

    Args:
        title: Page title to look up on the given wiki.
        lang: Language code; selects the site (``<lang>wiki``) and is also
            sent as the ``languages`` filter.

    Returns:
        The ``requests.Response`` of the GET request. The body is requested
        as JSON (``format=json``); decoding is left to the caller.
    """
    params = {
        'action': 'wbgetentities',
        'sites': lang + 'wiki',
        'titles': title,
        'languages': lang,
        'format': 'json'
    }
    base = 'https://www.wikidata.org/w/api.php'

    # Let requests build the query string: it percent-encodes the values, so
    # titles containing '&', '+', '#', '%' or '=' no longer corrupt the
    # request the way plain string concatenation did.
    return requests.get(base, params=params)

In [19]:
# Query the local JudaicaLink SPARQL endpoint for every resource typed as
# foaf:Person, in any named graph, returning each skos:prefLabel together with
# each owl:sameAs target (one result row per label/sameAs combination).
sparql = SPARQLWrapper("http://localhost:3040/judaicalink/query")
sparql.setQuery("""
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    SELECT DISTINCT ?label ?same
    WHERE { GRAPH ?g { ?s skos:prefLabel ?label .
                        ?s owl:sameAs ?same.
                        ?s a foaf:Person
    
    }}
""")
# Ask for JSON so that the converted result can be indexed as
# results["results"]["bindings"] by the following cell.
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

In [20]:
# Map each person label to the set of owl:sameAs URIs found for it.
# Labels containing commas are rewritten by reversing the comma-separated
# parts and joining them with single spaces ("Last, First" -> "First Last").
jl_persons = {}
hits = 0
counter = 0
for binding in results["results"]["bindings"]:
    counter += 1
    label = binding["label"]["value"]
    same = binding['same']['value']
    if "," in label:
        parts = [part.strip() for part in label.split(",")]
        label = " ".join(parts[::-1])
    # Create the set on first sight of a label, then record the link.
    jl_persons.setdefault(label, set()).add(same)

In [25]:
# Number of distinct (normalised) labels collected in jl_persons.
len(jl_persons)

45192

In [26]:
# Eyeball a sample: print label / URI pairs whose URI mentions 'wikidata',
# stopping after the first person that pushes the running count above 10.
c = 0
for name, uris in jl_persons.items():
    wikidata_uris = [uri for uri in uris if 'wikidata' in uri]
    for uri in wikidata_uris:
        print(name, uri)
    c += len(wikidata_uris)
    if c > 10:
        break

Philippe Mesnard http://www.wikidata.org/entity/Q46997964
Reinhard Piechocki http://www.wikidata.org/entity/Q1382611
William Douglas Morrison http://www.wikidata.org/entity/Q51573121
Leone Modena http://www.wikidata.org/entity/Q1652176
Peter Herde http://www.wikidata.org/entity/Q2075646
Natan ShahÌ£am http://www.wikidata.org/entity/Q2910365
Nancy Huston http://www.wikidata.org/entity/Q237684
Boleslaus V. Wicherkiewicz http://www.wikidata.org/entity/Q9176983
Marie-Louise Roth http://www.wikidata.org/entity/Q1897168
Hans Lenk http://www.wikidata.org/entity/Q70521
Friedhelm Rathjen http://www.wikidata.org/entity/Q1457253


In [36]:
# get overlap between jl_persons and tagme output
per_overlap = set([])
for match in tagme_output:
    if match[0] in jl_persons:
        per_overlap.add((match[0], match[6].replace(" ", "_")))
print(len(per_overlap))

3915


In [42]:
# Resolve every (label, title) pair in per_overlap to a Wikidata entity URL
# via the German Wikipedia site link. One request per pair, throttled to two
# requests per second.
wikidata_enrichment = {}
c = 0
for el in per_overlap:
    response = wiki2wikidata(el[1], 'de')
    try:
        res = response.json()
    except ValueError:
        # Non-JSON body (e.g. an HTML error page): treat as "no result"
        # instead of aborting the whole run.
        res = {}

    entities = res.get('entities') if isinstance(res, dict) else None
    if entities:
        # The response is keyed by entity id; take the first (only one title
        # was requested). NOTE(review): titles without a Wikidata item appear
        # to come back under the id '-1' -- those are kept here and removed
        # by the filtering cell that follows.
        wikidata_id = next(iter(entities))
        wikidata_url = 'http://www.wikidata.org/entity/' + wikidata_id

        wikidata_enrichment[el] = wikidata_url
        c += 1
        if c % 100 == 0:
            print(c, end=" ")
            print(wikidata_url)
    else:
        # API error payloads carry no 'entities' key; report and move on so a
        # single bad title cannot kill a long-running loop.
        print("No Wikidata entities returned for {!r}; skipping.".format(el[1]))
    time.sleep(0.5)

100 http://www.wikidata.org/entity/Q20657936
200 http://www.wikidata.org/entity/Q535396
300 http://www.wikidata.org/entity/Q1608348
400 http://www.wikidata.org/entity/Q1551290
500 http://www.wikidata.org/entity/Q84904
600 http://www.wikidata.org/entity/Q506760
700 http://www.wikidata.org/entity/Q1177029
800 http://www.wikidata.org/entity/Q101542
900 http://www.wikidata.org/entity/Q1594243
1000 http://www.wikidata.org/entity/Q93444
1100 http://www.wikidata.org/entity/Q711755
1200 http://www.wikidata.org/entity/Q1610507
1300 http://www.wikidata.org/entity/Q1330285
1400 http://www.wikidata.org/entity/Q15992069
1500 http://www.wikidata.org/entity/Q2177746
1600 http://www.wikidata.org/entity/Q2799328
1700 http://www.wikidata.org/entity/Q22691903
1800 http://www.wikidata.org/entity/Q1065635
1900 http://www.wikidata.org/entity/Q103140
2000 http://www.wikidata.org/entity/Q76998
2100 http://www.wikidata.org/entity/Q1293846
2200 http://www.wikidata.org/entity/Q30875
2300 http://www.wikidata.org/

In [47]:
# Drop every entry whose URL ends in '-1'.
wikidata_enrichment = {
    key: url
    for key, url in wikidata_enrichment.items()
    if not url.endswith('-1')
}

In [48]:
# Persist the enrichment mapping so the slow API loop need not be repeated.
with open('/data/cm/output/linker/wikidata_enrichment.pickle', 'wb') as pickle_file:
    pickle.dump(wikidata_enrichment, pickle_file)

In [49]:
# Sanity check: list any entries whose URL still ends in '-1'
# (prints nothing when there are none).
c = 0
leftovers = [(key, url) for key, url in wikidata_enrichment.items() if url.endswith('-1')]
for key, url in leftovers:
    print(key, url)

In [50]:
# Re-key the enrichment by its first key component only; when several keys
# share that component, the entry iterated last wins.
wikidata_pers = {}
for pair, url in wikidata_enrichment.items():
    wikidata_pers[pair[0]] = url

In [51]:
# For every TagMe row whose first column is a known JudaicaLink person label,
# build the candidate URLs derived from the TagMe title (German Wikipedia,
# German DBpedia, DBpedia, plus the Wikidata URL when one was resolved) and
# keep the label only if at least one candidate also occurs among the
# person's (percent-decoded) owl:sameAs links.
overlap = {}
for match in tagme_output:
    person = match[0]
    if person not in jl_persons:
        continue

    title = match[6].replace(" ", "_")
    tagme_urls = {
        'https://de.wikipedia.org/wiki/' + title,
        'http://de.dbpedia.org/resource/' + title,
        'http://dbpedia.org/resource/' + title,
    }

    # Only add a Wikidata URL when one is known; the empty-string placeholder
    # must not become a candidate URL.
    wikidata_url = wikidata_pers.get(person, '')
    if wikidata_url:
        tagme_urls.add(wikidata_url)

    # Was `persons[...]`, a name defined nowhere in this notebook (NameError
    # on a fresh kernel); the sameAs sets live in jl_persons.
    jl_urls = {unquote(jl_uri) for jl_uri in jl_persons[person]}

    common_urls = jl_urls & tagme_urls
    if common_urls:
        overlap[person] = common_urls

In [52]:
# Number of person labels with at least one URL shared between both sources.
print(len(overlap))

3018


In [53]:
# Spot-check a window of the results: print entries 551 through 649 (in dict
# iteration order), each followed by a blank line, then stop.
i = 0
for i, (person, urls) in enumerate(overlap.items(), start=1):
    if 550 < i < 650:
        print(person)
        print(urls)
        print()
    if i > 650:
        break

Ernst Cohn-Wiener
{'https://de.wikipedia.org/wiki/Ernst_Cohn-Wiener', 'http://www.wikidata.org/entity/Q113179'}

Albrecht Alt
{'https://de.wikipedia.org/wiki/Albrecht_Alt', 'http://www.wikidata.org/entity/Q84895', 'http://dbpedia.org/resource/Albrecht_Alt'}

Clemens Brentano
{'https://de.wikipedia.org/wiki/Clemens_Brentano', 'http://www.wikidata.org/entity/Q57235', 'http://dbpedia.org/resource/Clemens_Brentano'}

Theodor Reik
{'https://de.wikipedia.org/wiki/Theodor_Reik', 'http://www.wikidata.org/entity/Q78734', 'http://dbpedia.org/resource/Theodor_Reik'}

Charles Baudelaire
{'http://www.wikidata.org/entity/Q501', 'https://de.wikipedia.org/wiki/Charles_Baudelaire', 'http://dbpedia.org/resource/Charles_Baudelaire'}

Julius Wellhausen
{'http://www.wikidata.org/entity/Q76897', 'https://de.wikipedia.org/wiki/Julius_Wellhausen', 'http://dbpedia.org/resource/Julius_Wellhausen'}

Sigmund Mowinckel
{'http://dbpedia.org/resource/Sigmund_Mowinckel', 'https://de.wikipedia.org/wiki/Sigmund_Mowinck

In [58]:
# Count the TagMe rows (mentions) whose first column is a label in overlap.
mentions_overlap = sum(1 for mention in tagme_output if mention[0] in overlap)
print("Found {} JL person mentions in CM.".format(mentions_overlap))

Found 282613 JL persons in CM.
