In [1]:
import pickle
import time
import warnings
from pprint import pprint
from urllib.error import HTTPError

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import wikidata_plain_sparql as wikidata
from rdflib import Graph
from SPARQLWrapper import SPARQLWrapper, JSON, N3
from tqdm import trange



## Language editions

In [2]:
# Load the table of names; the cells below read its 'Wikidata_Q' column.
name_list_path = 'data/name_list.csv'
df = pd.read_csv(name_list_path)

In [3]:
# For every row that has a Wikidata Q-id, ask the Wikidata query service which
# Wikipedia language editions have an article about that entity.
#   'lang'   -> list of language codes returned by the query
#   'n_lang' -> number of entries in that list
# Rows whose Q-id is not a string keep the '' placeholders assigned here.
df['lang'] = ''
df['n_lang'] = ''

max_attempts = 5  # stop retrying a row after this many failed requests

for i in trange(len(df)):
    wiki = df.loc[i, 'Wikidata_Q']
    if not isinstance(wiki, str):
        continue  # no Q-id for this row, nothing to query

    lang_query = f'''
SELECT DISTINCT ?lang ?name 
WHERE {{
  ?article schema:about wd:{wiki} . hint:Prior hint:runFirst true.
  ?article schema:inLanguage ?lang ;
    schema:name ?name ;
    schema:isPartOf [ wikibase:wikiGroup "wikipedia" ] .
  FILTER (!CONTAINS(?name, ':')) .
}}
'''
    for attempt in range(1, max_attempts + 1):
        try:
            lang_rows = wikidata.query(lang_query)
            break
        except HTTPError:
            # NOTE(review): HTTPError is urllib.error.HTTPError (imported at the
            # top of the notebook). Confirm this is the type wikidata.query
            # raises on a failed request; otherwise the retry never triggers.
            if attempt == max_attempts:
                raise  # surface the error instead of retrying forever
            time.sleep(20)  # back off before the next attempt
    time.sleep(0.5)  # short pause between consecutive requests

    langs = lang_rows['lang'].to_list()
    df.at[i, 'lang'] = langs
    df.at[i, 'n_lang'] = len(langs)

100%|███████████████████████████████████████████████████| 594/594 [08:29<00:00,  1.17it/s]


In [4]:
# Preview the first three rows, including the new 'lang' and 'n_lang' columns.
df.head(n=3)

Unnamed: 0,Name,wikidata,Wikipedia_URL,Wikidata_URL,Wikidata_Q,lang,n_lang
0,Ceolwulf,Y,https://en.wikipedia.org/wiki/Ceolwulf_of_Nort...,http://www.wikidata.org/entity/Q729480,Q729480,"[bar, ca, de, el, en, eo, es, fr, gl, it, la, ...",15
1,Bede,Y,https://en.wikipedia.org/wiki/Bede,http://www.wikidata.org/entity/Q154938,Q154938,"[yue, ast, ca, cy, da, en, es, eu, gl, nl, nn,...",68
2,Albinus,Y,https://en.wikipedia.org/wiki/Albinus_(abbot),http://www.wikidata.org/entity/Q16002399,Q16002399,"[en, es, fr, pt]",4


In [5]:
# Checkpoint: write the frame to disk and read it straight back
# (presumably so a later session can resume from this point).
# Context managers make sure both file handles are closed.
with open("data/df.p", "wb") as checkpoint_file:
    pickle.dump(df, checkpoint_file)

# pickle.load can execute arbitrary code: only load files this notebook wrote.
with open("data/df.p", "rb") as checkpoint_file:
    df = pickle.load(checkpoint_file)

## Outlinks + Inlinks

In [6]:
# For every row with a Wikidata Q-id, ask DBpedia which dbo:Person pages the
# person's page links to. The result is stored in 'Outlinks' as the string
# form of a {Wikidata Q-id: English label} dict.
max_attempts = 5  # stop retrying a row after this many failed requests

for i in trange(len(df)):
    wiki_q = df.loc[i, 'Wikidata_Q']
    if not (isinstance(wiki_q, str) and len(wiki_q) > 1):
        continue  # no usable Q-id for this row

    sparql = SPARQLWrapper('https://dbpedia.org/sparql')
    sparql.setQuery(f"""
PREFIX wd: <http://www.wikidata.org/entity/>
SELECT DISTINCT ?pLabel ?wikidata 
WHERE {{
  ?person_db owl:sameAs wd:{wiki_q}. 
  ?person_db dbo:wikiPageWikiLink ?outlinks_to_person.
  ?outlinks_to_person rdf:type dbo:Person.
  ?outlinks_to_person rdfs:label ?pLabel.
  FILTER(LANG(?pLabel) = "en").
  ?outlinks_to_person owl:sameAs ?wikidata.
  FILTER(regex(str(?wikidata), "www.wikidata.org" ) )
}}
""")
    sparql.setReturnFormat(JSON)

    # The network request happens in query(), so that is the call that must
    # sit inside the retry; building the wrapper above does no I/O.
    for attempt in range(1, max_attempts + 1):
        try:
            qres = sparql.query().convert()
            break
        except HTTPError:
            if attempt == max_attempts:
                raise  # surface the error instead of retrying forever
            time.sleep(20)  # back off before the next attempt
    time.sleep(0.5)  # short pause between consecutive requests

    outlinks = {
        binding['wikidata']['value'].split('/')[-1]: binding['pLabel']['value']
        for binding in qres['results']['bindings']
    }
    df.loc[i, 'Outlinks'] = str(outlinks)

100%|███████████████████████████████████████████████████| 594/594 [10:03<00:00,  1.02s/it]


In [7]:
# For every row with a Wikidata Q-id, ask DBpedia which dbo:Person pages link
# to the person's page. The result is stored in 'Inlinks' as the string form
# of a {Wikidata Q-id: English label} dict.
max_attempts = 5  # stop retrying a row after this many failed requests

for i in trange(len(df)):
    wiki_q = df.loc[i, 'Wikidata_Q']
    if not (isinstance(wiki_q, str) and len(wiki_q) > 1):
        continue  # no usable Q-id for this row

    sparql = SPARQLWrapper('https://dbpedia.org/sparql')
    sparql.setQuery(f"""
PREFIX wd: <http://www.wikidata.org/entity/>
SELECT DISTINCT ?pLabel ?wikidata
WHERE {{
  ?person_db owl:sameAs wd:{wiki_q}.
  ?inlinks_from_person dbo:wikiPageWikiLink ?person_db.
  ?inlinks_from_person rdf:type dbo:Person.
  ?inlinks_from_person rdfs:label ?pLabel.
  FILTER(LANG(?pLabel) = "en").
  ?inlinks_from_person owl:sameAs ?wikidata.
  FILTER(regex(str(?wikidata), "www.wikidata.org" ) )
}}
""")
    sparql.setReturnFormat(JSON)

    # The network request happens in query(), so that is the call that must
    # sit inside the retry; building the wrapper above does no I/O.
    for attempt in range(1, max_attempts + 1):
        try:
            qres = sparql.query().convert()
            break
        except HTTPError:
            if attempt == max_attempts:
                raise  # surface the error instead of retrying forever
            time.sleep(20)  # back off before the next attempt
    time.sleep(0.5)  # short pause between consecutive requests

    inlinks = {
        binding['wikidata']['value'].split('/')[-1]: binding['pLabel']['value']
        for binding in qres['results']['bindings']
    }
    df.loc[i, 'Inlinks'] = str(inlinks)

100%|███████████████████████████████████████████████████| 594/594 [10:08<00:00,  1.02s/it]


In [8]:
# Preview the first three rows, now with the 'Outlinks' and 'Inlinks' columns.
df.head(n=3)

Unnamed: 0,Name,wikidata,Wikipedia_URL,Wikidata_URL,Wikidata_Q,lang,n_lang,Outlinks,Inlinks
0,Ceolwulf,Y,https://en.wikipedia.org/wiki/Ceolwulf_of_Nort...,http://www.wikidata.org/entity/Q729480,Q729480,"[bar, ca, de, el, en, eo, es, fr, gl, it, la, ...",15,"{'Q154938': 'Bede', 'Q338268': 'Acca of Hexham...","{'Q737618': 'Aldfrith of Northumbria', 'Q15493..."
1,Bede,Y,https://en.wikipedia.org/wiki/Bede,http://www.wikidata.org/entity/Q154938,Q154938,"[yue, ast, ca, cy, da, en, es, eu, gl, nl, nn,...",68,"{'Q43689': 'Ambrose', 'Q254896': 'Anastasius o...","{'Q671299': 'Amphibalus', 'Q254896': 'Anastasi..."
2,Albinus,Y,https://en.wikipedia.org/wiki/Albinus_(abbot),http://www.wikidata.org/entity/Q16002399,Q16002399,"[en, es, fr, pt]",4,"{'Q1274084': 'Adrian of Canterbury', 'Q154938'...",{'Q154938': 'Bede'}


In [9]:
# Checkpoint: write the frame to disk and read it straight back
# (presumably so a later session can resume from this point).
# Context managers make sure both file handles are closed.
with open("data/df.p", "wb") as checkpoint_file:
    pickle.dump(df, checkpoint_file)

# pickle.load can execute arbitrary code: only load files this notebook wrote.
with open("data/df.p", "rb") as checkpoint_file:
    df = pickle.load(checkpoint_file)

## Properties

In [10]:
# Biographical properties fetched per person. Each property takes two columns:
# '<name>_Q' holds the raw value returned by the query and '<name>' holds its
# label. The order must match the SELECT clause below, because the result
# columns are renamed positionally.
columns = ['gender_Q', 'gender',
           'occupation_Q', 'occupation',
           'religion_Q', 'religion',
           'date_of_birth_Q', 'date_of_birth',
           'date_of_death_Q', 'date_of_death',
           'place_of_birth_Q', 'place_of_birth',
           'place_of_death_Q', 'place_of_death',
           'spouse_Q', 'spouse', 'child_Q', 'child',
           'father_Q', 'father', 'mother_Q', 'mother']
# Add the (still empty) property columns to the frame.
df = pd.concat([df, pd.DataFrame(columns=columns)])

max_attempts = 5  # stop retrying a row after this many failed requests

for i in trange(len(df)):
    wiki = df.loc[i, 'Wikidata_Q']
    if not isinstance(wiki, str) or wiki == '':
        continue  # no Q-id for this row, nothing to query

    prop_query = f'''
select ?gender ?genderLabel \
?occupation ?occupationLabel \
?religion ?religionLabel \
?date_of_birth ?date_of_birthLabel \
?date_of_death ?date_of_deathLabel \
?place_of_birth ?place_of_birthLabel \
?place_of_death ?place_of_deathLabel \
?spouse ?spouseLabel ?child ?childLabel \
?father ?fatherLabel ?mother ?motherLabel
where {{
  Optional {{wd:{wiki} wdt:P21 ?gender}}
  Optional {{wd:{wiki} wdt:P569 ?date_of_birth}}
  Optional {{wd:{wiki} wdt:P19 ?place_of_birth}}
  Optional {{wd:{wiki} wdt:P570 ?date_of_death}}
  Optional {{wd:{wiki} wdt:P20 ?place_of_death}}
  Optional {{wd:{wiki} wdt:P106 ?occupation}}
  Optional {{wd:{wiki} wdt:P140 ?religion}}
  Optional {{wd:{wiki} wdt:P26 ?spouse}}
  Optional {{wd:{wiki} wdt:P40 ?child}}
  Optional {{wd:{wiki} wdt:P22 ?father}}
  Optional {{wd:{wiki} wdt:P25 ?mother}}
  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en".}}
  }}'''

    for attempt in range(1, max_attempts + 1):
        try:
            props = wikidata.query(prop_query)
            break
        except HTTPError:
            # NOTE(review): HTTPError is urllib.error.HTTPError (imported at the
            # top of the notebook). Confirm this is the type wikidata.query
            # raises on a failed request; otherwise the retry never triggers.
            if attempt == max_attempts:
                raise  # surface the error instead of retrying forever
            time.sleep(20)  # back off before the next attempt
    time.sleep(0.5)  # short pause between consecutive requests

    # Rename positionally. Plain assignment works on every pandas version,
    # unlike set_axis(..., inplace=True), which pandas 2.0 removed.
    props.columns = columns

    for col in columns:
        # drop_duplicates keeps first-seen order, so multi-valued properties
        # are stored in a reproducible order (a set would shuffle them).
        values = props[col].drop_duplicates().tolist()
        if not values:
            continue  # query returned no rows; leave the cell empty
        df.at[i, col] = values if len(values) > 1 else values[0]

100%|███████████████████████████████████████████████████| 594/594 [07:56<00:00,  1.25it/s]


In [11]:
# Preview the first three rows, now with the property columns added.
df.head(n=3)

Unnamed: 0,Name,wikidata,Wikipedia_URL,Wikidata_URL,Wikidata_Q,lang,n_lang,Outlinks,Inlinks,gender_Q,...,place_of_death_Q,place_of_death,spouse_Q,spouse,child_Q,child,father_Q,father,mother_Q,mother
0,Ceolwulf,Y,https://en.wikipedia.org/wiki/Ceolwulf_of_Nort...,http://www.wikidata.org/entity/Q729480,Q729480,"[bar, ca, de, el, en, eo, es, fr, gl, it, la, ...",15,"{'Q154938': 'Bede', 'Q338268': 'Acca of Hexham...","{'Q737618': 'Aldfrith of Northumbria', 'Q15493...",http://www.wikidata.org/entity/Q6581097,...,http://www.wikidata.org/entity/Q213804,Lindisfarne,,,,,http://www.wikidata.org/entity/Q75451525,Cuðwin (?),,
1,Bede,Y,https://en.wikipedia.org/wiki/Bede,http://www.wikidata.org/entity/Q154938,Q154938,"[yue, ast, ca, cy, da, en, es, eu, gl, nl, nn,...",68,"{'Q43689': 'Ambrose', 'Q254896': 'Anastasius o...","{'Q671299': 'Amphibalus', 'Q254896': 'Anastasi...",http://www.wikidata.org/entity/Q6581097,...,http://www.wikidata.org/entity/Q782155,Jarrow,,,,,,,,
2,Albinus,Y,https://en.wikipedia.org/wiki/Albinus_(abbot),http://www.wikidata.org/entity/Q16002399,Q16002399,"[en, es, fr, pt]",4,"{'Q1274084': 'Adrian of Canterbury', 'Q154938'...",{'Q154938': 'Bede'},http://www.wikidata.org/entity/Q6581097,...,,,,,,,,,,


## Notability

In [12]:
# Collect the labels of up to 2000 items whose "instance of" (P31) value is
# Q62589316 or one of its subclasses (P279*).
# NOTE(review): the later membership test compares these labels with property
# ids such as 'P227', so the label service presumably falls back to the bare
# id here -- confirm.
notability_query = f'''
SELECT DISTINCT ?itemLabel WHERE {{
  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }}
  {{
    SELECT DISTINCT ?item WHERE {{
      ?item p:P31 ?statement0.
      ?statement0 (ps:P31/(wdt:P279*)) wd:Q62589316.
    }}
    LIMIT 2000
  }}
}}
        '''
notability_items = wikidata.query(notability_query)
notability_ids = notability_items['itemLabel'].to_list()

In [13]:
# For every row with a Wikidata Q-id, list the property id of each statement
# row the query returns for the entity, and the subset of those ids that also
# appear in `notability_ids`.
#   'prop_list'               -> property ids, one per returned row
#   'prop_notability_list'    -> the ids from prop_list found in notability_ids
#   'n_prop_list' / 'n_prop_notability_list' -> lengths of the two lists
# Rows whose Q-id is not a string keep the '' placeholders assigned here.
df['prop_list'] = ''
df['prop_notability_list'] = ''
df['n_prop_list'] = ''
df['n_prop_notability_list'] = ''

notability_lookup = set(notability_ids)  # O(1) membership tests in the loop
max_attempts = 5  # stop retrying a row after this many failed requests

for i in trange(len(df)):
    wiki = df.loc[i, 'Wikidata_Q']
    if not isinstance(wiki, str):
        continue  # no Q-id for this row, nothing to query

    statement_query = f'''
SELECT  ?p ?wdLabel ?ps ?ps_ ?ps_Label ?ps_Description ?pq_unitLabel 
WHERE {{
  VALUES ?item {{
    wd:{wiki}
  }}
  ?item ?p ?statement.   
  ?statement ?ps ?ps_.   
  ?wd wikibase:claim ?p;     
      wikibase:statementProperty ?ps.   
OPTIONAL {{
  ?statement ?pq ?pq_.
  ?wdpq wikibase:qualifier ?pq.
  ?statement ?pqv [wikibase:quantityUnit ?pq_unit]
}}
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}}}
        '''
    for attempt in range(1, max_attempts + 1):
        try:
            statements = wikidata.query(statement_query)
            break
        except HTTPError:
            # NOTE(review): HTTPError is urllib.error.HTTPError (imported at the
            # top of the notebook). Confirm this is the type wikidata.query
            # raises on a failed request; otherwise the retry never triggers.
            if attempt == max_attempts:
                raise  # surface the error instead of retrying forever
            time.sleep(20)  # back off before the next attempt
    time.sleep(0.5)  # short pause between consecutive requests

    # ?p comes back as a URI; its last path segment is the property id.
    prop_ids = [url.split("/")[-1] for url in statements['p']]
    notable_ids = [pid for pid in prop_ids if pid in notability_lookup]

    df.at[i, 'prop_list'] = prop_ids
    df.at[i, 'prop_notability_list'] = notable_ids
    df.at[i, 'n_prop_list'] = len(prop_ids)
    df.at[i, 'n_prop_notability_list'] = len(notable_ids)

100%|███████████████████████████████████████████████████| 594/594 [09:58<00:00,  1.01s/it]


In [14]:
# Checkpoint: write the frame to disk and read it straight back
# (presumably so a later session can resume from this point).
# Context managers make sure both file handles are closed.
with open("data/df.p", "wb") as checkpoint_file:
    pickle.dump(df, checkpoint_file)

# pickle.load can execute arbitrary code: only load files this notebook wrote.
with open("data/df.p", "rb") as checkpoint_file:
    df = pickle.load(checkpoint_file)

In [15]:
# Export the enriched table to CSV.
df.to_csv(path_or_buf='data/data.csv')

In [16]:
# Preview the first five rows of the final table.
df.head(n=5)

Unnamed: 0,Name,wikidata,Wikipedia_URL,Wikidata_URL,Wikidata_Q,lang,n_lang,Outlinks,Inlinks,gender_Q,...,child_Q,child,father_Q,father,mother_Q,mother,prop_list,prop_notability_list,n_prop_list,n_prop_notability_list
0,Ceolwulf,Y,https://en.wikipedia.org/wiki/Ceolwulf_of_Nort...,http://www.wikidata.org/entity/Q729480,Q729480,"[bar, ca, de, el, en, eo, es, fr, gl, it, la, ...",15,"{'Q154938': 'Bede', 'Q338268': 'Acca of Hexham...","{'Q737618': 'Aldfrith of Northumbria', 'Q15493...",http://www.wikidata.org/entity/Q6581097,...,,,http://www.wikidata.org/entity/Q75451525,Cuðwin (?),,,"[P27, P646, P569, P1185, P1343, P227, P97, P57...","[P227, P6126, P7902, P1415]",29,4
1,Bede,Y,https://en.wikipedia.org/wiki/Bede,http://www.wikidata.org/entity/Q154938,Q154938,"[yue, ast, ca, cy, da, en, es, eu, gl, nl, nn,...",68,"{'Q43689': 'Ambrose', 'Q254896': 'Anastasius o...","{'Q671299': 'Amphibalus', 'Q254896': 'Anastasi...",http://www.wikidata.org/entity/Q6581097,...,,,,,,,"[P1066, P8849, P6868, P8065, P106, P1889, P688...","[P4342, P7796, P5034, P227, P6126, P1263, P147...",242,21
2,Albinus,Y,https://en.wikipedia.org/wiki/Albinus_(abbot),http://www.wikidata.org/entity/Q16002399,Q16002399,"[en, es, fr, pt]",4,"{'Q1274084': 'Adrian of Canterbury', 'Q154938'...",{'Q154938': 'Bede'},http://www.wikidata.org/entity/Q6581097,...,,,,,,,"[P39, P4223, P31, P646, P214, P21, P570, P3241...","[P227, P1415]",14,2
3,Theodore of Canterbury,Y,https://en.wikipedia.org/wiki/Theodore_of_Tarsus,http://www.wikidata.org/entity/Q504529,Q504529,"[uk, ko, ang, arz, de, el, en, eo, es, fi, fr,...",23,"{'Q1274084': 'Adrian of Canterbury', 'Q154938'...","{'Q1274084': 'Adrian of Canterbury', 'Q225857'...",http://www.wikidata.org/entity/Q6581097,...,,,,,,,"[P10242, P10141, P18, P10553, P10799, P1317, P...","[P6126, P244, P4286, P1417, P227, P691, P1415,...",81,8
4,Hadrian,Y,https://en.wikipedia.org/wiki/Adrian_of_Canter...,http://www.wikidata.org/entity/Q1274084,Q1274084,"[sv, de, en, eo, es, fr, hr, it, la, lt, nb, p...",15,"{'Q154938': 'Bede', 'Q552201': 'Benedict Bisco...","{'Q16002399': 'Albinus (abbot)', 'Q709718': 'A...",http://www.wikidata.org/entity/Q6581097,...,,,,,,,"[P10553, P10799, P8080, P4342, P8366, P3241, P...","[P4342, P4286, P1415, P6126, P227, P7902]",33,6
