In [1]:
import os, json, pprint, pickle
from SPARQLWrapper import SPARQLWrapper, JSON
from rdflib import Graph, URIRef
from rdflib.namespace import RDF, OWL, RDFS

In [2]:
# Fetch from the JudaicaLink SPARQL endpoint every (subject, object) pair in
# which either side mentions a GND URI. All named graphs are scanned.
gnd_query = """
    PREFIX gndo: <http://d-nb.info/standards/elementset/gnd#>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    SELECT ?s ?o
    WHERE { GRAPH ?g {
        ?s ?p ?o .
        FILTER(CONTAINS(str(?s), 'd-nb.info/gnd/') || CONTAINS(str(?o), 'd-nb.info/gnd/'))
        }}
"""
sparql = SPARQLWrapper("http://data.judaicalink.org/sparql/query")
sparql.setReturnFormat(JSON)
sparql.setQuery(gnd_query)
# The converted response is indexed below as results['results']['bindings'].
results = sparql.query().convert()

In [3]:
# Gather every distinct GND URI that occurs as subject or object of a result row.
gnd_ids = set()
bindings = results['results']['bindings']
print(len(bindings))
for row in bindings:
    # Read both values up front, then keep whichever ones are GND URIs.
    candidates = (row['s']['value'], row['o']['value'])
    gnd_ids.update(uri for uri in candidates if 'd-nb.info/gnd/' in uri)
print(len(gnd_ids))
# Discard the ".../about" URIs and rewrite the scheme to https, so the http
# and https spellings of one identifier collapse into a single entry.
gnd_ids = {uri.replace('http:', 'https:') for uri in gnd_ids if not uri.endswith('/about')}
print(len(gnd_ids))

194127
90098
54021


In [38]:
# Eyeball up to ten of the normalised GND URIs (set order is arbitrary).
sample_uris = list(gnd_ids)[:10]
for uri in sample_uris:
    print(uri)

https://d-nb.info/gnd/1145377033
https://d-nb.info/gnd/120854473
https://d-nb.info/gnd/120037734
https://d-nb.info/gnd/118942328
https://d-nb.info/gnd/1017050511
https://d-nb.info/gnd/119078694
https://d-nb.info/gnd/1032591528
https://d-nb.info/gnd/1090421753
https://d-nb.info/gnd/175194114
https://d-nb.info/gnd/133054233


In [7]:
# Scan the GND authority dump (N-Triples, one triple per line) and record for
# every subject URI:
#   'id'   - the object of its gndIdentifier triple (None if never seen)
#   'depr' - the set of objects of its deprecatedUri triples
gnd_depr_id = {}
# N-Triples documents are UTF-8; decode explicitly instead of relying on the
# platform's locale default.
with open('/data/gnd/authorities_lds.nt', 'r', encoding='utf-8') as gnd_auth:
    for line in gnd_auth:
        # Comment lines carry no triple.
        if line.startswith('#'):
            continue
        # Only the first three space-separated tokens (s, p, o) are needed, so
        # cap the split instead of tokenising the whole line.
        parts = line.split(' ', 3)
        if len(parts) < 3:
            # Blank or truncated line: skip it rather than fail on unpacking.
            continue
        s, p, o = parts[:3]
        # Drop the angle brackets around the subject URI.
        s = s.lstrip('<').rstrip('>')
        if s not in gnd_depr_id:
            gnd_depr_id[s] = {'id': None, 'depr': set()}
        # Literal objects look like "value"^^<datatype>: keep only the bare value.
        if p.endswith('gndIdentifier>'):
            gnd_depr_id[s]['id'] = o.split('^^')[0].strip('"')
        if p.endswith('deprecatedUri>'):
            gnd_depr_id[s]['depr'].add(o.split('^^')[0].strip('"'))
print(len(gnd_depr_id))

28607557


In [9]:
# Show the first ten subjects that have at least one deprecated URI.
shown = 0
for uri, record in gnd_depr_id.items():
    if shown >= 10:
        break
    if record['depr']:
        print(uri, record)
        shown += 1

https://d-nb.info/gnd/111-9 {'id': '111-9', 'depr': {'https://d-nb.info/gnd/109176140X'}}
https://d-nb.info/gnd/275-6 {'id': '275-6', 'depr': {'https://d-nb.info/gnd/1088112544', 'https://d-nb.info/gnd/1088112064', 'https://d-nb.info/gnd/16286598-3', 'https://d-nb.info/gnd/16294577-2', 'https://d-nb.info/gnd/4223553-4'}}
https://d-nb.info/gnd/278-1 {'id': '278-1', 'depr': {'https://d-nb.info/gnd/1087263476'}}
https://d-nb.info/gnd/291-4 {'id': '291-4', 'depr': {'https://d-nb.info/gnd/16123413-6', 'https://d-nb.info/gnd/4011410-7'}}
https://d-nb.info/gnd/298-7 {'id': '298-7', 'depr': {'https://d-nb.info/gnd/1089028156'}}
https://d-nb.info/gnd/299-9 {'id': '299-9', 'depr': {'https://d-nb.info/gnd/1092086471', 'https://d-nb.info/gnd/1088199755', 'https://d-nb.info/gnd/4463123-6'}}
https://d-nb.info/gnd/301-3 {'id': '301-3', 'depr': {'https://d-nb.info/gnd/1092122931'}}
https://d-nb.info/gnd/304-9 {'id': '304-9', 'depr': {'https://d-nb.info/gnd/1088208487', 'https://d-nb.info/gnd/108640343

In [11]:
# Map every deprecated GND URI that also occurs in gnd_ids to the URI of the
# record that lists it as deprecated (deprecated URI -> current URI). If a
# deprecated URI is listed by several records, the last one visited wins.
jl_gnd_deprecated = {
    old_uri: current_uri
    for current_uri, record in gnd_depr_id.items()
    for old_uri in record['depr']
    if old_uri in gnd_ids
}
print(len(jl_gnd_deprecated))

13554


In [13]:
# Re-key the mapping with the http: scheme; the values are left untouched.
# NOTE(review): presumably this restores the URI form stored in JudaicaLink - confirm.
http_keyed = {}
for old_uri, current_uri in jl_gnd_deprecated.items():
    http_keyed[old_uri.replace('https:', 'http:')] = current_uri
jl_gnd_deprecated = http_keyed

In [15]:
# Persist the deprecated -> current GND URI mapping as indented JSON.
# The context manager ensures the file is flushed and closed once the dump
# finishes, instead of leaving the handle to the garbage collector.
with open('jl_gnd_deprecated.json', 'w') as out_file:
    json.dump(jl_gnd_deprecated, out_file, indent=2)