In [16]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pprint

In [17]:
# Ask the local endpoint for every (subject, owl:sameAs target, skos:prefLabel)
# combination that occurs inside a named graph, and fetch the answer as JSON.
SAMEAS_PREFLABEL_QUERY = """
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    SELECT DISTINCT ?s ?same ?label
    WHERE { GRAPH ?g {
        ?s owl:sameAs ?same.
        ?s skos:prefLabel ?label.
        }}
"""

sparql = SPARQLWrapper("http://localhost:3040/judaicalink/query")
sparql.setQuery(SAMEAS_PREFLABEL_QUERY)
sparql.setReturnFormat(JSON)

# `results` is the converted JSON response; later cells read
# results['results']['bindings'].
results = sparql.query().convert()

In [39]:
# Show the first nine bindings, then the total number of bindings.
total = 0
for total, binding in enumerate(results['results']['bindings'], start=1):
    if total < 10:
        print(binding)
print(total)

{'s': {'type': 'uri', 'value': 'http://data.judaicalink.org/data/gnd/124739709'}, 'same': {'type': 'uri', 'value': 'http://id.loc.gov/authorities/nr96041780'}, 'label': {'type': 'literal', 'value': 'Mesnard, Philippe'}, 'g': {'type': 'uri', 'value': 'http://data.judaicalink.org/data/gnd-persons'}}
{'s': {'type': 'uri', 'value': 'http://data.judaicalink.org/data/gnd/124739709'}, 'same': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q46997964'}, 'label': {'type': 'literal', 'value': 'Mesnard, Philippe'}, 'g': {'type': 'uri', 'value': 'http://data.judaicalink.org/data/gnd-persons'}}
{'s': {'type': 'uri', 'value': 'http://data.judaicalink.org/data/gnd/124739709'}, 'same': {'type': 'uri', 'value': 'https://www.deutsche-digitale-bibliothek.de/entity/124739709'}, 'label': {'type': 'literal', 'value': 'Mesnard, Philippe'}, 'g': {'type': 'uri', 'value': 'http://data.judaicalink.org/data/gnd-persons'}}
{'s': {'type': 'uri', 'value': 'http://data.judaicalink.org/data/gnd/124739709'}, '

In [25]:
# extract preferred labels
# Maps subject IRI -> preferred label. When a subject shows up in several
# bindings, the label from the first binding encountered is the one kept.
preflabs = {}
for binding in results['results']['bindings']:
    subject = binding['s']['value']
    label = binding['label']['value']
    # setdefault only stores the label if the subject has none yet
    preflabs.setdefault(subject, label)

In [6]:
# unique IRIs
# Collect every IRI that appears on either side of an owl:sameAs binding
# and report how many distinct ones there are.
iris = set()
for binding in results['results']['bindings']:
    subject = binding['s']['value']
    target = binding['same']['value']
    iris.update((subject, target))
print(len(iris))

416271


In [9]:
# create sameas pools
#
# Group the IRIs into pools (sets) such that two IRIs connected by a chain of
# owl:sameAs bindings end up in the same pool.
#
# `pool_of` maps every IRI seen so far to the pool object that contains it, so
# locating the pool of an IRI is a single dict lookup instead of a scan over
# all pools. When one binding links two *different* existing pools, the two
# pools are merged; without that step the same equivalence class would stay
# split across several pools.

sameas = []    # list of pools; each pool is a set of IRIs
pool_of = {}   # IRI -> the pool (set object) currently holding that IRI

c = 0
for res in results['results']['bindings']:
    s = res['s']['value']
    same = res['same']['value']

    pool_s = pool_of.get(s)
    pool_same = pool_of.get(same)

    if pool_s is None and pool_same is None:
        # neither IRI is known yet: open a new pool
        pool = {s, same}
        sameas.append(pool)
        pool_of[s] = pool
        pool_of[same] = pool
    elif pool_same is None:
        # only the subject is known: the alias joins the subject's pool
        pool_s.add(same)
        pool_of[same] = pool_s
    elif pool_s is None:
        # only the alias is known: the subject joins the alias' pool
        pool_same.add(s)
        pool_of[s] = pool_same
    elif pool_s is not pool_same:
        # both are known but live in different pools: this binding bridges
        # them, so fold the smaller pool into the larger one
        if len(pool_s) < len(pool_same):
            pool_s, pool_same = pool_same, pool_s
        pool_s.update(pool_same)
        for iri in pool_same:
            pool_of[iri] = pool_s
        # leave the absorbed pool empty; empty pools are dropped after the loop
        pool_same.clear()
    # else: both IRIs already share a pool, nothing to do

    # progress indicator: one number per 1000 processed bindings
    c += 1
    if c % 1000 == 0:
        print(c // 1000, end=" ")

# discard the pools that were emptied by a merge
sameas = [pool for pool in sameas if pool]

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 

In [18]:
# merge sameas pools
#
# Builds two dictionaries in one pass over the bindings:
#   sameas    : root IRI -> set of IRIs attached to that root
#   same_to_s : attached IRI -> the root it was attached to
# For each binding the root is chosen by the first rule that applies:
#   1. the subject is already a root,
#   2. the alias is already a root,
#   3. the subject is already attached to some root,
#   4. the alias is already attached to some root,
#   5. otherwise the subject becomes a new root.

same_to_s = {}

sameas = {}

for binding in results['results']['bindings']:
    subject = binding['s']['value']
    alias = binding['same']['value']

    if subject in sameas:
        root, member = subject, alias
    elif alias in sameas:
        root, member = alias, subject
    elif subject in same_to_s:
        # the subject is already an alias of some other subject
        root, member = same_to_s[subject], alias
    elif alias in same_to_s:
        # the alias is already an alias of some other subject
        root, member = same_to_s[alias], subject
    else:
        # neither IRI is known in any role: start a new pool rooted at the subject
        sameas[subject] = set([alias])
        same_to_s[alias] = subject
        continue

    # attach the member to the chosen root and remember where it went
    sameas[root].add(member)
    same_to_s[member] = root

print(len(sameas))

47117


In [22]:
# Print the pool count and the double-entry count for the raw pools, merge the
# pools, then print the same two numbers for the merged result.
# NOTE(review): check_double_entries and merge_same_as_pools are defined in
# cells that appear further down in this notebook; on a fresh kernel those
# cells have to be executed before this one.
check_double_entries(sameas)
merged = merge_same_as_pools(sameas)
check_double_entries(merged)

47117
14
47110
0


In [20]:
def check_double_entries(same_as_dict, verbose=False):
    """Print how many aliases are themselves keys of the dictionary.

    Two lines are always printed: first the number of keys in
    ``same_as_dict``, then the number of (key, alias) pairs where the alias
    differs from the key and also occurs as a key.

    Args:
        same_as_dict: mapping from a subject to an iterable of its aliases.
        verbose: when exactly ``True``, each offending pair is printed as the
            subject, the alias and an empty line before the final count.

    Returns:
        None; the function only prints.
    """
    print(len(same_as_dict))

    double_entries = 0
    for subject, alias_pool in same_as_dict.items():
        for candidate in alias_pool:
            # a self-reference or an alias that is not a key is not a double entry
            if candidate == subject or candidate not in same_as_dict:
                continue
            if verbose is True:
                print(subject)
                print(candidate)
                print()
            double_entries += 1

    print(double_entries)

In [21]:
def merge_same_as_pools(same_as_dict):
    """Fold pools into an earlier pool whose key appears among their aliases.

    The entries of ``same_as_dict`` are visited in order. An entry is folded
    into an already accepted pool when one of its aliases is the key of that
    pool, the alias is not the entry's own key, and neither the alias nor the
    entry's key ends with ``'-1'``. Folding adds the entry's key and all of
    its aliases to the accepted pool. Entries that cannot be folded are
    accepted under their own key.

    The alias sets are not copied: an accepted pool is the same set object as
    in ``same_as_dict``, so folding also changes the input's sets.

    Args:
        same_as_dict: mapping from a subject string to a set of alias strings.

    Returns:
        A new dict mapping the accepted subjects to their alias sets.
    """
    merged = {}

    for subject, alias_pool in same_as_dict.items():
        for candidate in alias_pool:
            # only the key of an already accepted pool can absorb this entry
            if candidate not in merged or candidate == subject:
                continue
            # identifiers ending in '-1' are never merged, on either side
            if candidate.endswith('-1') or subject.endswith('-1'):
                continue
            target_pool = merged[candidate]
            target_pool.add(subject)
            target_pool.update(alias_pool)
            break
        else:
            # no alias matched an accepted pool: keep this entry on its own
            merged[subject] = alias_pool

    return merged

In [23]:
# Wrap every pool in a record that also has room for labels:
#   'same' -> the alias set (the same set object that is stored in `sameas`)
#   'pref' -> preferred label, starts as ''
#   'alt'  -> alternative labels, starts as an empty set
# NOTE(review): the records are built from `sameas`, not from the dict that
# merge_same_as_pools returned, and this assignment replaces that dict --
# confirm this is intended.
merged = {}
for subject, alias_pool in sameas.items():
    merged[subject] = {'same': alias_pool, 'pref': '', 'alt': set()}

In [26]:
# add pref labels to sameas dict
# Only subjects that are keys of `merged` receive a label; labels for any
# other subject in `preflabs` are ignored here.
for subject, record in merged.items():
    if subject in preflabs:
        record['pref'] = preflabs[subject]

In [27]:
# visualize current status
# Pretty-print the first five (key, record) pairs of `merged`.
for shown, (subject, record) in enumerate(merged.items(), start=1):
    pprint.pprint(subject)
    pprint.pprint(record)
    print()
    if shown == 5:
        break

'http://data.judaicalink.org/data/gnd/124739709'
{'alt': set(),
 'pref': 'Mesnard, Philippe',
 'same': {'http://catalogue.bnf.fr/ark:/12148/cb12545882q',
          'http://d-nb.info/gnd/124739709',
          'http://d-nb.info/gnd/124739709/about',
          'http://hub.culturegraph.org/entityfacts/124739709',
          'http://id.loc.gov/authorities/nr96041780',
          'http://viaf.org/viaf/115428728',
          'http://www.isni.org/0000000114803494',
          'http://www.wikidata.org/entity/Q46997964',
          'https://www.deutsche-digitale-bibliothek.de/entity/124739709'}}

'http://data.judaicalink.org/data/gnd/132587475'
{'alt': set(),
 'pref': 'Piechocki, Reinhard',
 'same': {'http://catalogue.bnf.fr/ark:/12148/cb16511286d',
          'http://d-nb.info/gnd/132587475',
          'http://d-nb.info/gnd/132587475/about',
          'http://d-nb.info/gnd/176998837',
          'http://hub.culturegraph.org/entityfacts/132587475',
          'http://id.loc.gov/authorities/no2006127000'

In [28]:
# query for alternative labels
# Fetch every (subject, skos:altLabel) pair that occurs inside a named graph.
ALT_LABEL_QUERY = """
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    SELECT ?s ?alt
    WHERE { GRAPH ?g {
        ?s skos:altLabel ?alt.
        }}
"""

sparql_1 = SPARQLWrapper("http://localhost:3040/judaicalink/query")
sparql_1.setQuery(ALT_LABEL_QUERY)
sparql_1.setReturnFormat(JSON)

# `results_1` is the converted JSON response; later cells read
# results_1['results']['bindings'].
results_1 = sparql_1.query().convert()

In [175]:
# Show the first nine alt-label bindings, then the total number of bindings.
total = 0
for total, binding in enumerate(results_1['results']['bindings'], start=1):
    if total < 10:
        print(binding)
print(total)

{'s': {'type': 'uri', 'value': 'http://data.judaicalink.org/data/djh/aschkenasi-zwi-hirsch-ben-jacob'}, 'alt': {'type': 'literal', 'value': 'Zwi Aschkenasi'}}
{'s': {'type': 'uri', 'value': 'http://data.judaicalink.org/data/djh/aschkenasi-zwi-hirsch-ben-jacob'}, 'alt': {'type': 'literal', 'value': 'Aschkenasi'}}
{'s': {'type': 'uri', 'value': 'http://data.judaicalink.org/data/djh/müller-hartmann-robert'}, 'alt': {'type': 'literal', 'value': 'Robert Müller-Hartmann'}}
{'s': {'type': 'uri', 'value': 'http://data.judaicalink.org/data/djh/nordheim-marcus'}, 'alt': {'type': 'literal', 'value': 'Marcus Nordheim'}}
{'s': {'type': 'uri', 'value': 'http://data.judaicalink.org/data/djh/amulettenstreit'}, 'alt': {'type': 'literal', 'value': 'Amulettenstreit'}}
{'s': {'type': 'uri', 'value': 'http://data.judaicalink.org/data/djh/verein-selbständiger-jüdischer-handwerker-und-gewerbetreibender-groß-hamburg'}, 'alt': {'type': 'literal', 'value': 'Verein selbständiger jüdischer Handwerker und Gewerbet

In [29]:
# extract alt labs
# Maps subject IRI -> set of all alternative labels found for that subject.
altlabs = {}
for binding in results_1['results']['bindings']:
    subject = binding['s']['value']
    label = binding['alt']['value']
    # create the label set on first sight of the subject, then add to it
    altlabs.setdefault(subject, set()).add(label)

In [30]:
# add alt labs to main dictionary
# A record collects the alternative labels of its own key and of every IRI
# in its 'same' set.
for subject, record in merged.items():
    for iri in (subject, *record['same']):
        if iri in altlabs:
            record['alt'].update(altlabs[iri])

In [31]:
# visualize current status
# Pretty-print the entries at positions 151 to 154 (0-based) of `merged`.
for position, (subject, record) in enumerate(merged.items()):
    if 150 < position < 155:
        pprint.pprint(subject)
        pprint.pprint(record)
        print()

'http://data.judaicalink.org/data/gnd/108950549'
{'alt': {'Chotjewitz Häfner, Renate',
         'Chotjewitz, Renate',
         'Chotjewitz, Renate Häfner-',
         'Häfner, Renate',
         'Häfner, Renate C.-',
         'Häfner, Renate Chotjewitz-',
         'Häfner-Chotjewitz, Renate'},
 'pref': 'Chotjewitz-Häfner, Renate',
 'same': {'http://d-nb.info/gnd/108950549',
          'http://d-nb.info/gnd/108950549/about',
          'http://d-nb.info/gnd/1116289741',
          'http://hub.culturegraph.org/entityfacts/108950549',
          'http://id.loc.gov/authorities/n78027694',
          'http://kalliope-verbund.info/gnd/108950549',
          'http://viaf.org/viaf/39935389',
          'http://www.isni.org/000000002485583X',
          'http://www.wikidata.org/entity/Q2143355',
          'https://de.wikipedia.org/wiki/Renate_Chotjewitz-H%C3%A4fner',
          'https://www.deutsche-digitale-bibliothek.de/entity/108950549'}}

'http://data.judaicalink.org/data/gnd/136285279'
{'alt':

In [32]:
# count surface form occurrence
# For every record, its 'pref' value and each of its 'alt' labels count once.
# A record whose 'pref' is still '' contributes to the count of ''.
names = {}
for record in merged.values():
    for surface_form in (record['pref'], *record['alt']):
        names[surface_form] = names.get(surface_form, 0) + 1

In [33]:
# (name, count) pairs ordered from the most to the least frequent name;
# pairs with equal counts keep the order they have in `names`.
sorted_names = sorted(names.items(), key=lambda pair: pair[1], reverse=True)

In [40]:
# Peek at positions 200-219 of the (name, count) list.
pprint.pprint(sorted_names[200:220])

[('Rapoport, Nathan', 3),
 ('Sforno, Obadiah ben Jacob', 3),
 ('Yohanan', 3),
 ('Benet, Mordecai', 3),
 ('ישראל', 3),
 ('Rand, Ayn', 3),
 ('Garb, Jonathan', 3),
 ('Samuel, Herbert', 3),
 ('Jaakov Joseph, of Polonoye', 3),
 ('Goldberg, Leah', 3),
 ('Dayan, Moshe', 3),
 ('Simon, Paul', 3),
 ('Bick, Abraham', 3),
 ('Weinreb, Tzvi Hersh', 3),
 ('Albeck, Shalom', 3),
 ('Stone, I. F.', 3),
 ('Aboab, Isaac', 3),
 ('Rosenberg, J.', 3),
 ('Rosenberg, I.', 3),
 ('Tam, Jacob ben Meir', 3)]


In [34]:
# create a name index
# iri -> set of names: the record's 'pref' value together with all of its
# 'alt' labels.
iri_to_name = {
    subject: {record['pref'], *record['alt']}
    for subject, record in merged.items()
}
print(len(iri_to_name))

47117


In [35]:
# name to iri
# Inverse of `iri_to_name`: maps each name to the set of IRIs that carry it.
# The loop variable is called `iri_names` rather than `names` so that it does
# not overwrite the notebook-level `names` counter dictionary.
name_to_iri = {}
for iri, iri_names in iri_to_name.items():
    for name in iri_names:
        # create the IRI set on first sight of the name, then add to it
        name_to_iri.setdefault(name, set()).add(iri)
print(len(name_to_iri))

198523


In [42]:
# Look up the set of IRIs recorded for one surface form.
name_to_iri['Rosenberg, J.']

{'http://data.judaicalink.org/data/gnd/1049177371',
 'http://data.judaicalink.org/data/gnd/127913637',
 'http://data.judaicalink.org/data/gnd/131721461'}

In [37]:
# Pretty-print the records stored under two specific keys of `merged`,
# separated by an empty line.
pprint.pprint(merged['http://data.judaicalink.org/data/dbpedia/Paul_Simon'])
print()
pprint.pprint(merged['http://data.judaicalink.org/data/gnd/131692658'])

{'alt': {'Horovittsu, D.',
         'Horowitz, D.',
         'Horowitz, D. J.',
         'Horowitz, David J.',
         'Horowitz, David Joel'},
 'pref': 'Horowitz, David',
 'same': {'http://catalogue.bnf.fr/ark:/12148/cb11907737t',
          'http://d-nb.info/gnd/1090427395',
          'http://d-nb.info/gnd/119542072',
          'http://d-nb.info/gnd/119542072/about',
          'http://hub.culturegraph.org/entityfacts/119542072',
          'http://id.loc.gov/authorities/n50030821',
          'http://kalliope-verbund.info/gnd/119542072',
          'http://viaf.org/viaf/292202505',
          'http://www.isni.org/0000000397128416',
          'http://www.wikidata.org/entity/Q722462',
          'https://de.wikipedia.org/wiki/David_Horowitz',
          'https://en.wikipedia.org/wiki/David_Horowitz'}}

{'alt': {'Horovits, Daṿid',
         'Horovits, Daṿid',
         'Horoṿits, D.',
         'Horoṿits, Daṿid',
         'Horowitz, D.',
         'Horowitz, David',
         'Horoṿits, D.',
 

In [182]:
# Print the IRI and its name set for positions 3101 to 3119 (0-based) of
# `iri_to_name`.
for position, (subject, name_set) in enumerate(iri_to_name.items()):
    if 3100 < position < 3120:
        print(subject)
        pprint.pprint(name_set)
        print()

http://data.judaicalink.org/data/gnd/121509605
{'Wallace, Bert',
 'Wallace, Bert H.',
 'Wallach, Bert',
 'Wallach, Bert H.',
 'Wallach, Berthold',
 'Wallach, Berthold H.'}

http://data.judaicalink.org/data/gnd/173208371
{'Goldštein, Moiše', 'Goldshteyn, Moishes', 'Goldstein, Moises Z.'}

http://data.judaicalink.org/data/gnd/133897931
{'Carmona, Antonio Rodríguez',
 'Rodríguez Carmona, Antonio',
 'Rodríguez-Carmona, Antonio'}

http://data.judaicalink.org/data/gnd/109133889
{'Schembs', 'Schembs, Hans-Otto', 'Schembs, H. O.', 'Schembs, H.-O.'}

http://data.judaicalink.org/data/gnd/118779680
{'Leskien, A.',
 'Leskien, August',
 'Leskien, Joh. Heinr. Aug.',
 'Leskien, Johann Heinrich August',
 'Leskin, Avgust'}

http://data.judaicalink.org/data/gnd/136840388
{'Huttner, Ulrich', 'Huttner, U.'}

http://data.judaicalink.org/data/gnd/1073974103
{'Goshen-Gottstein, Alon',
 'Goshen-Goṭshṭain, Alon',
 'Gottstein, Alon Goshen-',
 'Goṭshṭain, Alon Goshen-',
 'גוטשטיין, אלון גושן-'}

http://