In [2]:
import pandas as pd
import json
from SPARQLWrapper import SPARQLWrapper, JSON

In [3]:
def get_sparql_dataframe(service, query):
    """
    Run a SPARQL query against an endpoint and return the result as a
    Pandas data frame.

    Parameters
    ----------
    service : str
        URL of the SPARQL endpoint handed to ``SPARQLWrapper``.
    query : str
        SPARQL query text to execute.

    Returns
    -------
    pandas.DataFrame
        One column per variable listed under ``head.vars`` of the JSON
        response (in that order) and one row per entry of
        ``results.bindings``. Each cell is the binding's ``'value'``
        entry, or ``None`` when the variable is absent from that binding.
    """
    client = SPARQLWrapper(service)
    client.setQuery(query)
    client.setReturnFormat(JSON)

    # Decode the JSON document directly from the underlying response object.
    payload = json.load(client.query().response)
    columns = payload['head']['vars']

    # A binding only contains the variables that were actually bound, so a
    # missing variable falls back to an empty dict and yields None.
    records = [
        [binding.get(name, {}).get('value') for name in columns]
        for binding in payload['results']['bindings']
    ]

    return pd.DataFrame(records, columns=columns)

In [4]:
# SPARQL endpoint URL passed to get_sparql_dataframe for every query below.
wds = 'https://query.wikidata.org/sparql'

Query Wikidata for all individuals with an [ORCID](http://orcid.org) and retrieve the short English description if available.

In [4]:
# SPARQL query: every item that has an ORCID iD (wdt:P496), together with its
# English label and, when one exists, its English description.
#
# Every prefix used in the query body is declared explicitly (including
# `schema:`), so the query does not depend on prefixes the endpoint happens to
# predefine. Only the four selected variables are computed; with
# SELECT DISTINCT on an explicit projection, helper bindings that are never
# projected would not change the result.
rq = """
PREFIX bd: <http://www.bigdata.com/rdf#>
PREFIX schema: <http://schema.org/>
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>

SELECT DISTINCT
   ?item
   ?itemLabel
   ?orcid
   ?description
WHERE {
  ?item wdt:P496 ?orcid .
  OPTIONAL {
    ?item schema:description ?description .
    FILTER (lang(?description) = "en")
  }
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en" .
  }
}
"""

In [5]:
# Execute the query against the endpoint and keep the result as a data frame.
df = get_sparql_dataframe(service=wds, query=rq)

In [6]:
# Preview the first five rows of the result.
df.head(n=5)

Unnamed: 0,item,itemLabel,orcid,description
0,http://www.wikidata.org/entity/Q505876,Francisco J. Ayala,0000-0001-6311-920X,Spanish-American philosopher and biologist
1,http://www.wikidata.org/entity/Q214119,Luciano Floridi,0000-0002-5444-2280,Italian philosopher
2,http://www.wikidata.org/entity/Q951559,Giulio Superti-Furga,0000-0002-0570-1768,Italian molecular and systems biologist
3,http://www.wikidata.org/entity/Q135180,Thomas Cech,0000-0001-7338-3389,Nobel laureate in chemistry
4,http://www.wikidata.org/entity/Q994491,Michael Jursa,0000-0003-2682-8933,Austrian assyriologist


Describe the data returned. At the time of this query, 44795 rows were returned, covering 44791 distinct Wikidata entities.

In [7]:
# Per-column summary of the query result (count, unique, top, freq).
df.describe()

Unnamed: 0,item,itemLabel,orcid,description
count,44795,44795,44795,5453
unique,44791,44619,44793,1792
top,http://www.wikidata.org/entity/Q38304648,Jun Wang,0000-0003-4842-1667,researcher
freq,2,5,2,2835


Get a value count of the description field. This will show common titles for entities with ORCIDs.
Note that no text normalization has been applied here. These are the raw values returned by Wikidata.

In [8]:
# Ten most frequent description strings, most common first.
df["description"].value_counts().head(10)

researcher                     2835
German physicist                 27
American computer scientist      20
German computer scientist        19
American mathematician           18
American chemist                 16
German economist                 16
British computer scientist       15
American scientist               15
British physicist                15
Name: description, dtype: int64