# Wikidata Simple Questions


In [1]:
%%capture
!pip3 install sparqlwrapper

In [2]:
import pandas as pd
# Read the tab-separated question file; it has no header row, so the four
# column names are supplied explicitly.
df = pd.read_csv(
    'https://raw.githubusercontent.com/askplatypus/wikidata-simplequestions/master/annotated_wd_data_test_answerable.txt',
    sep="\t",
    header=None,
    names=['subject', 'predicate', 'object', 'question'],
    index_col=False,
)
df.head()

Unnamed: 0,subject,predicate,object,question
0,Q7358590,P20,Q1637790,Where did roger marquis die
1,Q154335,P509,Q12152,what was the cause of death of yves klein
2,Q2747238,P413,Q5059480,What position does carlos gomez play?
3,Q62498,P21,Q6581097,how does engelbert zaschka identify
4,Q182485,P413,Q1143358,what position does pee wee reese play in baseball


In [25]:
import time
from SPARQLWrapper import SPARQLWrapper, JSON

# SPARQL endpoint that all label lookups are sent to.
WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# One shared client for every lookup, configured to return JSON results.
sparql = SPARQLWrapper(WIKIDATA_SPARQL_ENDPOINT)
sparql.setReturnFormat(JSON)

# Memo of entity ID -> label, filled in by get_wikidata_label.
cache = dict()
def get_wikidata_label(entity, max_retries=None, retry_delay=5.0):
    """Return an English label for a Wikidata ID, memoised in ``cache``.

    Args:
        entity: Wikidata identifier as a string (e.g. ``"Q42"`` or ``"P20"``);
            it is substituted verbatim into the SPARQL query.
        max_retries: How many times a failed query is retried before giving
            up. ``None`` (the default) retries forever.
        retry_delay: Seconds to wait between attempts after a failure.

    Returns:
        The label text, or ``"-"`` when the query returns no English label.

    Raises:
        RuntimeError: Only when ``max_retries`` is set and every attempt
            failed. Nothing is cached in that case.
    """
    if entity in cache:
        print("use of cache!")
        return cache[entity]
    # langMatches(..., "EN") also accepts regional variants (en-gb, en-ca, ...).
    # Sorting on `lang != "en"` puts the plain "en" label first (false < true),
    # so LIMIT 1 is deterministic while variants remain a fallback.
    query = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX wd: <http://www.wikidata.org/entity/>
        SELECT  *
        WHERE {
            wd:ENTITY rdfs:label ?label .
            FILTER (langMatches( lang(?label), "EN" ) )
          }
        ORDER BY (lang(?label) != "en")
        LIMIT 1
        """
    sparql.setQuery(query.replace('ENTITY', entity))
    # None means "no answer yet"; an empty string is a valid (if unusual) label
    # and must not trigger another round trip.
    result = None
    failures = 0
    while result is None:
        try:
            bindings = sparql.queryAndConvert()["results"]["bindings"]
            # "-" is the placeholder for entities without an English label.
            result = bindings[-1]['label']['value'] if bindings else "-"
        except Exception as e:
            failures += 1
            print("Error on wikidata query:", e)
            if max_retries is not None and failures > max_retries:
                raise RuntimeError(
                    "Wikidata label lookup for %s failed after %d attempts"
                    % (entity, failures)
                ) from e
            print("Waiting for", retry_delay, "seconds..")
            time.sleep(retry_delay)
    cache[entity] = result
    return result

In [15]:
# Three parallel accumulators, one label per dataset row, filled by the lookup loop.
subject_labels, predicate_labels, object_labels = [], [], []


In [None]:
rows = df.to_numpy().tolist()
# Resume from the first row that does not yet have all three labels. On a fresh
# kernel this is 0; after an interrupt or an accidental re-run it prevents
# duplicated or misaligned entries in the three parallel lists.
index = min(len(rows), len(subject_labels), len(predicate_labels), len(object_labels))
for labels in (subject_labels, predicate_labels, object_labels):
    del labels[index:]  # drop any partial or surplus entries past the resume point
for row in rows[index:]:
    print(index,":",row)
    # Look up all three labels before appending any of them, so an interrupt
    # in the middle of a row cannot leave the lists with different lengths.
    subject_label = get_wikidata_label(row[0]) #subject
    # NOTE(review): an "R" prefix presumably marks an inverse property in this
    # dataset; it is mapped to the regular "P" ID to look up the label - confirm.
    predicate_label = get_wikidata_label(row[1].replace("R","P")) #predicate
    object_label = get_wikidata_label(row[2]) #object
    subject_labels.append(subject_label)
    predicate_labels.append(predicate_label)
    object_labels.append(object_label)
    if (index % 100 == 0 ):
        print("Labels Identified:",index,"Cache Size:",len(cache))
    index += 1
print(len(object_labels),"labels retrieved!")

In [28]:
# Add the looked-up labels as three new columns, in subject/predicate/object order.
label_columns = {
    'subject_label': subject_labels,
    'predicate_label': predicate_labels,
    'object_label': object_labels,
}
for column_name, labels in label_columns.items():
    df[column_name] = labels
# Write the enriched table to the current working directory, then preview it.
df.to_csv('wsq-labels.csv')
df.head()

Unnamed: 0,subject,predicate,object,question,subject_label,predicate_label,object_label
0,Q7358590,P20,Q1637790,Where did roger marquis die,Roger Marquis,place of death,Holyoke
1,Q154335,P509,Q12152,what was the cause of death of yves klein,Yves Klein,cause of death,myocardial infarction
2,Q2747238,P413,Q5059480,What position does carlos gomez play?,Carlos Gómez,position played on team / speciality,center fielder
3,Q62498,P21,Q6581097,how does engelbert zaschka identify,Engelbert Zaschka,sex or gender,male
4,Q182485,P413,Q1143358,what position does pee wee reese play in baseball,Pee Wee Reese,position played on team / speciality,shortstop
