# Wiki Data Retrieve
An example of using python to retrieve and display data from the wikidata SPARQL endpoint

In [38]:
import requests
import os 
import pandas as pd

In [7]:
from string import Template
def counts_query(begin,end):
    tmpl =  Template("""
        SELECT ?year (count(?person) AS ?personCount) WHERE {
          ?person wdt:P31 wd:Q5;
                  wdt:P569 ?birth.
          hint:Prior hint:rangeSafe "true"^^xsd:boolean.
          FILTER((?birth > "$begin-01-01"^^xsd:dateTime) && (?birth < "$end-01-01"^^xsd:dateTime))
          BIND (year(?birth) as ?year)
          # 
        }
        group by ?year
        order by ?year
        """)
    return tmpl.substitute(dict(begin=begin, end=end))

In [60]:
import logging
def pull_from_wikidata(query):
    url = 'https://query.wikidata.org/sparql'
    r = requests.get(url, params = {'format': 'json', 'query': query})
    if r.status_code == requests.codes.too_many_requests:
        logging.warning("timeout")
        return None
    else:
        data = r.json()    
        return data

In [9]:
ranges = [(-3000,0),(0,1500),(1500,1700),(1700,1800),(1800,1850),
          (1850,1900),(1900,1925),(1925,1950),(1950,1960),(1960,1970),
          (1970,1980),(1980,1990),(1990,2000),(2000,2010)]

In [10]:
results = {}

In [11]:
def fill_results(results, ranges):
    empty_ranges = [r for r in ranges if r not in results]
    if empty_ranges:
        r = empty_ranges[0]
        data = pull_from_wikidata(counts_query(r[0],r[1]))
        if data:
            results[r] = data

In [77]:
results.keys()

dict_keys([(-3000, 0), (0, 1500), (1500, 1700), (1700, 1800), (1800, 1850), (1850, 1900), (1900, 1925), (1925, 1950), (1950, 1960), (1960, 1970), (1970, 1980)])

In [31]:
fill_results(results,ranges)
results.keys()

dict_keys([(-3000, 0), (0, 1500), (1500, 1700), (1700, 1800), (1800, 1850), (1850, 1900), (1900, 1925), (1925, 1950), (1950, 1960), (1960, 1970), (1970, 1980)])

In [32]:
import pickle
with open('results.pickle','wb') as fout:
    pickle.dump(results, fout)

***********************************************************

In [36]:
data = []
for rng,result in results.items():
    for item in result['results']['bindings']:
        data.append({'year':int(item['year']['value']), 'count':int(item['personCount']['value'])})

In [42]:
all_counts = pd.DataFrame.from_records(data, index="year")

In [91]:
import math
all_counts["group"] = all_counts["count"].cumsum().apply(lambda x: math.floor(x / 5000))

In [153]:
all_counts[all_counts.group < 120].tail()

Unnamed: 0_level_0,count,group
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1843,5596,115
1844,6017,116
1845,5892,117
1846,5700,118
1847,5732,119


In [47]:
with open('all_counts.pickle','wb') as fout:
    pickle.dump(all_counts,fout)

*************

In [61]:
def people_query(begin,end):
    tmpl = Template("""
SELECT ?person ?personLabel ?personDescription ?birth ?placeOfBirth ?placeOfBirthLabel ?birthCoords ?death ?placeOfDeath ?placeOfDeathLabel ?deathCoords ?image WHERE {
  ?person wdt:P31 wd:Q5;
    wdt:P19 ?placeOfBirth;
    wdt:P569 ?birth.
  hint:Prior hint:rangeSafe "true"^^xsd:boolean.
  FILTER((?birth > "$begin-01-01"^^xsd:dateTime) && (?birth < "$end-01-01"^^xsd:dateTime))
  ?placeOfBirth wdt:P625 ?birthCoords.
  OPTIONAL {
    ?person wdt:P18 ?image;
      wdt:P570 ?death;
      wdt:P20 ?placeOfDeath.
    ?placeOfDeath wdt:P625 ?deathCoords.
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
    """)
    return tmpl.substitute(begin=begin, end=end)

In [154]:
all_dates = [-3000,1149,1459,1572,1632,1684,1728,1754,1774,1791,1804,1814,1823,1832,1840,1847]
dates = (all_dates[-2], all_dates[-1])
data = pull_from_wikidata(people_query(dates[0],dates[1]))
people_data[dates] = data

In [155]:
len(data['results']['bindings'])

23786

In [156]:
import pickle
with open('people_data.pickle','wb') as fout:
    pickle.dump(people_data, fout)

In [157]:
people_data.keys()

dict_keys([(-3000, 1149), (1149, 1459), (1459, 1572), (1572, 1632), (1632, 1684), (1684, 1728), (1728, 1754), (1754, 1774), (1774, 1791), (1791, 1804), (1804, 1814), (1814, 1823), (1823, 1832), (1832, 1840), (1840, 1847)])

In [71]:
 def deepquery(item, path, default = None):
        keys = path.split("/")
        val = None

        for key in keys:
            if val:
                if isinstance(val, list):
                    val = [ v.get(key, default) if v else None for v in val]
                else:
                    val = val.get(key, default)
            else:
                val = dict.get(item, key, default)

            if not val:
                break;

        return val

In [74]:
def to_pandas(data):
    results = []
    vars = data['head']['vars']
    for item in data['results']['bindings']:
        row = {}
        for var in vars:
            row[var] = deepquery(item, var + '/value')
            results.append(row)
    return pd.DataFrame.from_records(results)

In [158]:
arrays = []
for key in people_data:
    arrays.append(to_pandas(people_data[key]))
    
people_frame = pd.concat(arrays).drop_duplicates()

In [162]:
people_frame = people_frame.drop_duplicates()

In [163]:
people_frame.shape

(291712, 12)

In [165]:
with open('people_frame.pickle','wb') as fout:
    pickle.dump(people_frame, fout)

In [164]:
len(people_frame.person.unique())

263959

In [16]:

countries = []
for item in data['results']['bindings']:
    countries.append(OrderedDict({
        'wikidataUrl' : item['person']['value'],
        'birthCoord'  : item['birthCoords']['value'],
        'image'       : deepquery(item, 'image/value', ''),
        'label'       : deepquery(item, "personLabel/value",""),
        'description' : deepquery(item, "personDescription/value")
    }))
df = pd.DataFrame.from_records(countries)
df.head()

Unnamed: 0,wikidataUrl,birthCoord,image,label,description
0,http://www.wikidata.org/entity/Q17892,Point(26.333333333 39.166666666),http://commons.wikimedia.org/wiki/Special:File...,Sappho,ancient Greek lyric poet
1,http://www.wikidata.org/entity/Q17892,Point(25.933055555 39.166666666),http://commons.wikimedia.org/wiki/Special:File...,Sappho,ancient Greek lyric poet
2,http://www.wikidata.org/entity/Q28988,Point(77.0 22.0),http://commons.wikimedia.org/wiki/Special:File...,Ananda,one of the principal disciples and a devout at...
3,http://www.wikidata.org/entity/Q36303,Point(27.275556 37.531111),http://commons.wikimedia.org/wiki/Special:File...,Thales,ancient Greek philosopher and mathematician
4,http://www.wikidata.org/entity/Q41328,Point(47.0 37.0),http://commons.wikimedia.org/wiki/Special:File...,Datis,


In [18]:
data['results']['bindings'][0]

{'person': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q17892'},
 'birthCoords': {'datatype': 'http://www.opengis.net/ont/geosparql#wktLiteral',
  'type': 'literal',
  'value': 'Point(26.333333333 39.166666666)'},
 'image': {'type': 'uri',
  'value': 'http://commons.wikimedia.org/wiki/Special:FilePath/Bust%20Sappho%20Musei%20Capitolini%20MC1164.jpg'},
 'personLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Sappho'},
 'personDescription': {'xml:lang': 'en',
  'type': 'literal',
  'value': 'ancient Greek lyric poet'}}

In [51]:
from geomet import wkt
def parse(items):
    result = {}
    for key,item in items.items():
        typ = item['type']
        if typ == 'literal' and item.get('datatype',None) == 'http://www.opengis.net/ont/geosparql#wktLiteral':
            result[key] = wkt.loads(str.upper(item['value']))
        elif typ in ['uri','literal'] : 
            result[key] = item['value']
    return result            

In [54]:
people = pd.DataFrame.from_records([parse(item) for item in data['results']['bindings']])

In [55]:
people.head()

Unnamed: 0,birthCoords,image,person,personDescription,personLabel
0,"{'type': 'Point', 'coordinates': [26.333333333...",http://commons.wikimedia.org/wiki/Special:File...,http://www.wikidata.org/entity/Q17892,ancient Greek lyric poet,Sappho
1,"{'type': 'Point', 'coordinates': [25.933055555...",http://commons.wikimedia.org/wiki/Special:File...,http://www.wikidata.org/entity/Q17892,ancient Greek lyric poet,Sappho
2,"{'type': 'Point', 'coordinates': [77.0, 22.0]}",http://commons.wikimedia.org/wiki/Special:File...,http://www.wikidata.org/entity/Q28988,one of the principal disciples and a devout at...,Ananda
3,"{'type': 'Point', 'coordinates': [27.275556, 3...",http://commons.wikimedia.org/wiki/Special:File...,http://www.wikidata.org/entity/Q36303,ancient Greek philosopher and mathematician,Thales
4,"{'type': 'Point', 'coordinates': [47.0, 37.0]}",http://commons.wikimedia.org/wiki/Special:File...,http://www.wikidata.org/entity/Q41328,,Datis


In [60]:
def to_pandas(data):
    return pd.DataFrame.from_records([parse(item) for item in data['results']['bindings']])

In [93]:
def collect(ranges):
    results = {}
    for r in ranges:
        data = pull_from_wikidata(counts_query(r[0],r[1]))
        results[r] = data
        yield results

In [101]:
collector = collect(ranges)

In [110]:
result = next(collector)

StopIteration: 

In [111]:
result.keys()

dict_keys([(-3000, 0), (0, 1500), (1500, 1700)])

In [88]:
x = pull_from_wikidata(counts_query(1800,1850))


<Response [200]>


In [89]:
xp = to_pandas(x)
xp

Unnamed: 0,personCount,year
0,2052,1800
1,4555,1801
2,3465,1802
3,3383,1803
4,3553,1804
5,3596,1805
6,3470,1806
7,3544,1807
8,3841,1808
9,3775,1809


In [62]:
counts_frame  = to_pandas(counts)

In [63]:
counts_frame.head()

Unnamed: 0,personCount,year
0,6,-2999
1,1,-2950
2,1,-2914
3,13,-2900
4,1,-2889


In [112]:
requests.Timeout

requests.exceptions.Timeout