# Pre-Processing Methods

In [None]:
%%capture
!pip3 install sparqlwrapper

In [1]:
# Common methods to retrieve data from Wikidata

import time
from SPARQLWrapper import SPARQLWrapper, JSON  
import pandas as pd
import urllib.request as url
import json
from SPARQLWrapper import SPARQLWrapper

# Shared SPARQL client for the public Wikidata query service. Requests are
# given up after 25 seconds and responses are converted from JSON.
wiki_sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
wiki_sparql.setTimeout(25)
wiki_sparql.setReturnFormat(JSON)

# Module-level dict meant to hold memoised Wikidata lookup results.
wiki_cache = dict()

In [2]:
def get_wikidata_label(entity):
    """Return the English ``rdfs:label`` of a Wikidata entity or property.

    Parameters
    ----------
    entity : str
        Bare Wikidata identifier (e.g. ``"Q42"`` or ``"P31"``); it is placed
        after the ``wd:`` prefix in the SPARQL query.

    Returns
    -------
    str
        The label; ``"-"`` when no English label exists or the request timed
        out; ``""`` when the request failed for any other reason.

    Every outcome, including the failure markers, is memoised in
    ``wiki_cache`` so the same identifier is only queried once.
    """
    if entity in wiki_cache:
        return wiki_cache[entity]
    query = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX wd: <http://www.wikidata.org/entity/>
        SELECT  *
        WHERE {
            wd:ENTITY rdfs:label ?label .
            FILTER (langMatches( lang(?label), "EN" ) )
          }
        LIMIT 1
        """
    wiki_sparql.setQuery(query.replace('ENTITY', entity))
    result = ""
    try:
        bindings = wiki_sparql.queryAndConvert()["results"]["bindings"]
        # LIMIT 1 yields at most one row; "-" marks "no English label".
        result = bindings[-1]['label']['value'] if bindings else "-"
    except Exception as e:
        print("Error on wikidata query:", e)
        # Timeouts are reported as "no label"; other errors leave "".
        if "timed out" in str(e):
            result = "-"
    wiki_cache[entity] = result
    return result

def get_wikidata(query, max_attempts=5):
    """Run a SPARQL query against Wikidata and return its values as strings.

    Parameters
    ----------
    query : str
        SPARQL text. ``" LIMIT 10"`` is appended when the text contains
        neither ``"ASK"`` nor ``"LIMIT"``.
    max_attempts : int, optional
        Upper bound on how many times the request is sent when it keeps
        failing with a non-timeout error (default 5).

    Returns
    -------
    list of str
        * ASK query: one element, ``"True"`` or ``"False"``.
        * SELECT query: for every bound variable of every row, the text after
          the last ``/`` of its value (so entity URIs become bare ids);
          ``["-"]`` when there are no rows.
        * ``["-"]`` when a request timed out, ``[]`` when every attempt failed.

    The outcome is memoised in ``wiki_cache`` under the query text with spaces
    replaced by underscores.
    """
    if ("ASK" not in query) and ("LIMIT" not in query):
        query += " LIMIT 10"
    key = query.replace(" ", "_")
    if key in wiki_cache:
        return wiki_cache[key]
    wiki_sparql.setQuery(query)
    is_ask = "ASK" in query
    result = []
    # A bounded for-loop guarantees termination even when a successful
    # response yields no values at all.
    for _attempt in range(max_attempts):
        try:
            ret = wiki_sparql.queryAndConvert()
            if is_ask:
                result = [str(ret['boolean'])]
            else:
                bindings = ret["results"]["bindings"]
                if len(bindings) == 0:
                    result = ["-"]
                else:
                    result = [cell['value'].split("/")[-1]
                              for row in bindings
                              for cell in row.values()]
            break
        except Exception as e:
            print("Error on wikidata query:", e)
            if "timed out" in str(e):
                # A timeout is not retried; it is reported as "no answer".
                result = ["-"]
                break
            # Any other error: fall through and try again.
    wiki_cache[key] = result
    return result

def preprocess_questions(questions):
    """Build a DataFrame of ids and labels for a question dataset.

    Parameters
    ----------
    questions : dict
        Parsed dataset whose ``'questions'`` entry is a list of items. Each
        item provides ``'natural_language_question'`` and ``'query_answer'``;
        only the first element of ``'query_answer'`` is used, and it must
        provide ``'sparql_template'``, ``'sparql_query'`` and ``'entities'``
        (each entity having ``'label'`` and a prefixed ``'entity'`` id).

    Returns
    -------
    pandas.DataFrame
        One row per question with the list-valued columns ``subjects``,
        ``predicates``, ``objects``, ``subject_labels``, ``predicate_labels``,
        ``object_labels`` and the string column ``question``.
    """
    rows = []
    for counter, question in enumerate(questions['questions']):
        if counter % 1000 == 0:
            print("Queries processed:", counter, "Cache Size:", len(wiki_cache))
        answer = question['query_answer'][0]

        # Every whitespace-separated token holding a ":" is treated as a
        # prefixed name; the part after the first ":" is kept.
        predicates = [token.split(":")[1]
                      for token in answer['sparql_template'].split(" ")
                      if ":" in token]
        # Drop path modifiers ("*") and keep the first step of a "/" path
        # before asking for the label.
        predicate_labels = [get_wikidata_label(p.replace("*", "").split("/")[0])
                            for p in predicates]

        objects = get_wikidata(answer['sparql_query'])
        # Only values that look like entity ids ("Q...") are resolved to a
        # label; anything else is used as its own label.
        object_labels = [get_wikidata_label(o) if o.startswith("Q") else o
                         for o in objects]

        subject_labels = [entity['label'] for entity in answer['entities']]
        subjects = [entity['entity'].split(":")[1] for entity in answer['entities']]

        rows.append({
            'subjects': subjects,
            'predicates': predicates,
            'objects': objects,
            'question': question['natural_language_question'],
            'subject_labels': subject_labels,
            'predicate_labels': predicate_labels,
            'object_labels': object_labels,
        })
    return pd.DataFrame(rows)

In [5]:
# Common methods to retrieve data from DBpedia

import time
from SPARQLWrapper import SPARQLWrapper, JSON  
import pandas as pd
import urllib.request as url
import json
from SPARQLWrapper import SPARQLWrapper

# Shared SPARQL client for the public DBpedia endpoint. Requests are given up
# after 60 seconds and responses are converted from JSON.
dbpedia_sparql = SPARQLWrapper("https://dbpedia.org/sparql/")
dbpedia_sparql.setTimeout(60)
dbpedia_sparql.setReturnFormat(JSON)

# Memo of DBpedia lookups, shared by the label and property-value helpers.
dbpedia_cache = dict()

In [20]:
import hashlib

def hash_text(text):
    """Return the hexadecimal MD5 digest of ``text``.

    The string is encoded with ``str.encode()``'s default codec before being
    hashed. The digest serves as a compact key, not as a security measure.
    """
    return hashlib.md5(text.encode()).hexdigest()

def get_dbpedia_label(entity, use_cache=True, verbose=False):
    """Fetch the English ``rdfs:label`` values of a DBpedia resource.

    Parameters
    ----------
    entity : str
        Full URI of the resource; it is wrapped in ``<...>`` in the query.
    use_cache : bool, optional
        When true, a previously stored answer for this entity is returned
        without querying. The fresh answer is stored either way.
    verbose : bool, optional
        Print the query text and the raw response.

    Returns
    -------
    list of dict
        ``{'id': entity, 'value': label}`` entries; the entity itself stands
        in for a row without a label value. Values containing ``' id '``,
        ``' link '``, ``'has abstract'``, ``'wiki'`` or ``'instance of'``
        (case-insensitive) are dropped. The list is empty when the query
        fails. A new list is returned on every call (shallow copy), so
        appending to or clearing it never alters the cached entry.
    """
    key = entity + "_label"
    if use_cache and (key in dbpedia_cache):
        return dbpedia_cache[key].copy()
    query = """
      PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
      PREFIX dbr: <http://dbpedia.org/resource/>
      select distinct ?label {

            <ENTITY> rdfs:label ?label .
            filter langMatches(lang(?label), 'en')

      }
      LIMIT 250
      """
    query_text = query.replace('ENTITY', entity)
    dbpedia_sparql.setQuery(query_text)
    noise_markers = (' id ', ' link ', 'has abstract', 'wiki', 'instance of')
    result = []
    try:
        if verbose:
            print("SPARQL Query:", query_text)
        ret = dbpedia_sparql.queryAndConvert()
        if verbose:
            print("SPARQL Response:", ret)
        for r in ret["results"]["bindings"]:
            value = entity
            if ('label' in r) and ('value' in r['label']):
                value = r['label']['value']
            lowered = value.lower()
            if not any(marker in lowered for marker in noise_markers):
                result.append({'id': entity, 'value': value})
    except Exception as e:
        # Best effort: a failed request yields whatever was collected so far.
        print("Error on SPARQL query:", e)
    dbpedia_cache[key] = result
    # Hand out a copy, exactly like the cache-hit path above.
    return result.copy()

def get_dbpedia_property_value(filter, use_cache=True, verbose=False):
    """Run a DBpedia graph pattern and return the matching objects with labels.

    Parameters
    ----------
    filter : str
        SPARQL graph pattern that binds ``?object``; it is inserted between
        braces in the query. The result is cached under ``hash_text(filter)``.
    use_cache : bool, optional
        When true, a previously stored answer for this pattern is returned
        without querying. The fresh answer is stored either way.
    verbose : bool, optional
        Print the query text and the raw response.

    Returns
    -------
    list of dict
        ``{'id': object_value, 'value': label}`` entries; the object value
        itself stands in when no English label is bound. Values containing
        ``' id '``, ``' link '``, ``'has abstract'``, ``'wiki'`` or
        ``'instance of'`` (case-insensitive) are dropped. A new list is
        returned on every call (shallow copy), so appending to or clearing it
        never alters the cached entry.
    """
    key = hash_text(filter)
    if use_cache and (key in dbpedia_cache):
        return dbpedia_cache[key].copy()
    query = """
      PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
      PREFIX dbr: <http://dbpedia.org/resource/>
      select distinct ?object ?label {
          { FILTER }

          optional {
            ?object rdfs:label ?label .
            filter langMatches(lang(?label), 'en')
          }
      }
      LIMIT 250
      """
    # str.replace is case-sensitive, so only the upper-case placeholder is
    # substituted, not the lower-case SPARQL "filter" keyword.
    query_text = query.replace('FILTER', filter)
    dbpedia_sparql.setQuery(query_text)
    noise_markers = (' id ', ' link ', 'has abstract', 'wiki', 'instance of')
    result = []
    try:
        if verbose:
            print("SPARQL Query:", query_text)
        ret = dbpedia_sparql.queryAndConvert()
        if verbose:
            print("SPARQL Response:", ret)
        for r in ret["results"]["bindings"]:
            object_id = r['object']['value']
            value = object_id
            if ('label' in r) and ('value' in r['label']):
                value = r['label']['value']
            lowered = value.lower()
            if not any(marker in lowered for marker in noise_markers):
                result.append({'id': object_id, 'value': value})
    except Exception as e:
        # Best effort: a failed request yields whatever was collected so far.
        print("Error on SPARQL query:", e)
    dbpedia_cache[key] = result
    # Hand out a copy, exactly like the cache-hit path above.
    return result.copy()


def get_forward_dbpedia_property_value(entity, property, use_cache=True, verbose=False):
    """Return the objects of ``<entity> <property> ?object`` in DBpedia.

    ``entity`` and ``property`` are full URIs. ``use_cache`` and ``verbose``
    are forwarded to ``get_dbpedia_property_value``, whose result is returned
    unchanged.
    """
    # Build the pattern in a single step: chained str.replace calls would also
    # rewrite a literal "PROPERTY" that happens to occur inside the entity URI.
    query_filter = "<" + entity + "> <" + property + "> ?object"
    return get_dbpedia_property_value(query_filter, use_cache, verbose)

def get_backward_dbpedia_property_value(entity, property, use_cache=True, verbose=False):
    """Return the subjects of ``?object <property> <entity>`` in DBpedia.

    ``entity`` and ``property`` are full URIs. ``use_cache`` and ``verbose``
    are forwarded to ``get_dbpedia_property_value``, whose result is returned
    unchanged; the matched subjects are reported under the ``?object``
    variable.
    """
    # Build the pattern in a single step: chained str.replace calls would also
    # rewrite a literal "PROPERTY" that happens to occur inside the entity URI.
    query_filter = "?object <" + property + "> <" + entity + ">"
    return get_dbpedia_property_value(query_filter, use_cache, verbose)


# Datasets

## SimpleQuestions Dataset



### Wikidata SimpleQuestions

In [1]:
import pandas as pd
# The file is tab-separated and has no header row, so the four column names
# are supplied here.
df = pd.read_csv(
    'https://raw.githubusercontent.com/askplatypus/wikidata-simplequestions/master/annotated_wd_data_test_answerable.txt',
    sep="\t",
    header=None,
    names=['subject', 'predicate', 'object', 'question'],
    index_col=False,
)
df.head()

Unnamed: 0,subject,predicate,object,question
0,Q7358590,P20,Q1637790,Where did roger marquis die
1,Q154335,P509,Q12152,what was the cause of death of yves klein
2,Q2747238,P413,Q5059480,What position does carlos gomez play?
3,Q62498,P21,Q6581097,how does engelbert zaschka identify
4,Q182485,P413,Q1143358,what position does pee wee reese play in baseball


Retrieve the Wikidata labels for the subject, predicate and object of each question:

In [None]:
# Look up the English label of every subject, predicate and object id, add
# them as new columns and persist the enriched frame as 'wsq-labels.csv'.
subject_labels = []
predicate_labels = []
object_labels = []
for index, row in df.iterrows():
    subject_labels.append(get_wikidata_label(row['subject']))
    # Predicate ids may carry an "R" in place of "P" (presumably marking the
    # reverse direction - TODO confirm); the label is looked up on the P id.
    predicate_labels.append(get_wikidata_label(row['predicate'].replace("R", "P")))
    object_labels.append(get_wikidata_label(row['object']))
    if index % 100 == 0:
        print("Labels Identified:", index, "Cache Size:", len(wiki_cache))
print(len(object_labels), "labels retrieved!")
df['subject_label'] = subject_labels
df['predicate_label'] = predicate_labels
df['object_label'] = object_labels
df.to_csv('wsq-labels.csv')
df.head()

### SimpleDBpediaQuestions

In [23]:
# read dbpedia compatible SimpleQuestions
import urllib.request as url
import json
import unidecode
import pandas as pd


def normalize(label):
    """Return ``label`` stripped, passed through ``unidecode.unidecode`` and lower-cased."""
    trimmed = label.strip()
    transliterated = unidecode.unidecode(trimmed)
    return transliterated.lower()


import os

# Download the SimpleDBpediaQA test split; the context manager closes the
# HTTP response once the body has been read.
with url.urlopen("https://raw.githubusercontent.com/castorini/SimpleDBpediaQA/master/V1/test.json") as stream:
    content = stream.read()
data = json.loads(content)

# The Wikidata SimpleQuestions cell writes 'wsq-labels.csv' to the working
# directory, whereas this cell has always read it from 'data/'. Keep
# preferring the 'data/' copy and fall back to the freshly written file so a
# clean top-to-bottom run works.
wsq_labels_path = 'data/wsq-labels.csv'
if not os.path.exists(wsq_labels_path):
    wsq_labels_path = 'wsq-labels.csv'
ref_questions = [e.lower().strip() for e in pd.read_csv(wsq_labels_path, index_col=0)['question'].tolist()]
# Set for constant-time membership tests inside the loop below.
ref_question_set = set(ref_questions)

counter = 0
total = 0
rows = []
dbpedia_questions = []
for total, question in enumerate(data['Questions'], start=1):
    if total % 100 == 0:
        print(total)
    question_query = question['Query']
    # Keep only the questions that also occur in the Wikidata label file.
    if question_query.lower().strip() not in ref_question_set:
        continue
    counter += 1

    subject_val = question['Subject']
    subject_matches = get_dbpedia_label(subject_val)
    subject_label = subject_matches[0]['value'] if len(subject_matches) > 0 else ''

    # Only the first predicate of the question is considered.
    predicate = question['PredicateList'][0]
    property_val = predicate['Predicate']
    property_matches = get_dbpedia_label(property_val)
    property_label = property_matches[0]['value'] if len(property_matches) > 0 else ''

    if predicate['Direction'] == 'forward':
        object_val = get_forward_dbpedia_property_value(subject_val, property_val)
    else:
        object_val = get_backward_dbpedia_property_value(subject_val, property_val)
    # The first returned value is taken as the answer; empty strings mark
    # "nothing found".
    object_id = ''
    object_label = ''
    if len(object_val) > 0:
        object_id = object_val[0]['id']
        object_label = object_val[0]['value']

    row = {'subject': subject_val, 'predicate': property_val, 'object': object_id, 'question': question_query, 'subject_label': subject_label, 'property_label': property_label, 'object_label': object_label}
    rows.append(row)
print("Total:", len(rows))
df = pd.DataFrame(rows)
df.to_csv('dsq-labels.csv')
df.head(10)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
Total: 3688


Unnamed: 0,subject,predicate,object,question,subject_label,property_label,object_label
0,http://dbpedia.org/resource/Roger_Marquis,http://dbpedia.org/ontology/deathPlace,"http://dbpedia.org/resource/Holyoke,_Massachus...",Where did roger marquis die,Roger Marquis,death place,"Holyoke, Massachusetts"
1,http://dbpedia.org/resource/Carlos_Gómez,http://dbpedia.org/ontology/position,http://dbpedia.org/resource/Center_fielder,What position does carlos gomez play?,Carlos Gómez,position,Center fielder
2,http://dbpedia.org/resource/Engelbert_Zaschka,http://xmlns.com/foaf/0.1/gender,,how does engelbert zaschka identify,Engelbert Zaschka,,
3,http://dbpedia.org/resource/Pee_Wee_Reese,http://dbpedia.org/ontology/position,http://dbpedia.org/resource/Shortstop,what position does pee wee reese play in baseball,Pee Wee Reese,position,Shortstop
4,http://dbpedia.org/resource/Myocardial_infarction,http://dbpedia.org/ontology/deathCause,http://dbpedia.org/resource/Moses_J._Epstein,Which Swiss conductor's cause of death is myoc...,Myocardial infarction,death cause,Moses J. Epstein
5,http://dbpedia.org/resource/Padraic_McGuinness,http://dbpedia.org/ontology/deathPlace,http://dbpedia.org/resource/Australia,where was padraic mcguinness's place of death,Padraic McGuinness,death place,Australia
6,http://dbpedia.org/resource/Sam_Edwards_(physi...,http://dbpedia.org/ontology/birthPlace,http://dbpedia.org/resource/Swansea,what is the place of birth of sam edwards?,Sam Edwards (physicist),birth place,Swansea
7,http://dbpedia.org/resource/Italianate_archite...,http://dbpedia.org/ontology/architecturalStyle,http://dbpedia.org/resource/280_Broadway,Which home is an example of italianate archite...,Italianate architecture,architectural style,280 Broadway
8,http://dbpedia.org/resource/Neo_Contra,http://dbpedia.org/ontology/publisher,http://dbpedia.org/resource/Konami,who published neo contra,Neo Contra,publisher,Konami
9,http://dbpedia.org/resource/Angie_Estes,http://purl.org/linguistics/gold/hypernym,http://dbpedia.org/resource/Poet,what is angie estes's profession,Angie Estes,,Poet


## Wikidata QA Dataset

From paper: https://arxiv.org/pdf/2107.02865v1.pdf 

In [None]:
import urllib.request as url
import json

# Download the dataset file; the context manager closes the HTTP response
# once the body has been read.
with url.urlopen("https://raw.githubusercontent.com/thesemanticwebhero/ElNeuKGQA/main/data/dataset_wikisparql.json") as stream:
    content = stream.read()
data = json.loads(content)
# Resolve ids and labels for every question and persist the result.
df = preprocess_questions(data)
df.to_csv('wqa-labels.csv')
df.head()

In [6]:
# Summary statistics for the current `df`; include='all' asks pandas to
# summarise the non-numeric columns as well.
df.describe(include='all')

Unnamed: 0,subjects,question,subject_labels
count,101,101,101
unique,95,101,95
top,[],Who is the president of Poland?,[]
freq,4,1,4


## LC-QuAD 2.0 Dataset

From paper: https://arxiv.org/pdf/2107.02865v1.pdf 

In [None]:
import urllib.request as url
import json

# Download the dataset file; the context manager closes the HTTP response
# once the body has been read.
with url.urlopen("https://raw.githubusercontent.com/thesemanticwebhero/ElNeuKGQA/main/data/dataset_lcquad2.json") as stream:
    content = stream.read()
data = json.loads(content)
# Resolve ids and labels for every question and persist the result.
df = preprocess_questions(data)
df.to_csv('lcquad2-labels.csv')
df.head()

Queries processed: 0 Cache Size: 7439
Queries processed: 1000 Cache Size: 7439
Queries processed: 2000 Cache Size: 7439
Queries processed: 3000 Cache Size: 7439


## COVID-QA Dataset 

From paper: https://aclanthology.org/2020.nlpcovid19-acl.18.pdf

In [16]:
import urllib.request as url
import json
import pandas as pd

# Download the COVID-QA test split; the context manager closes the HTTP
# response once the body has been read.
with url.urlopen("https://raw.githubusercontent.com/sharonlevy/Open_Domain_COVIDQA/main/data/qa_test.json") as stream:
    content = stream.read()
data = json.loads(content)

# Flatten each item to one row, keeping only the text of its first answer.
rows = []
counter = 0
for counter, item in enumerate(data['data'], start=1):
    rows.append({
        'article': item['title'],
        'text': item['context'],
        'question': item['question'],
        'answer': item['answers'][0]['text'],
    })
    if counter % 100 == 0:
        print("Questions processed:", counter)
df = pd.DataFrame(rows)
df.to_csv('covidqa-labels.csv')
df.head()

Questions processed: 100
Questions processed: 200
Questions processed: 300


Unnamed: 0,article,text,question,answer
0,Improved Pharmacological and Structural Proper...,Although T20 is an indispensable anti-HIV drug...,What mutations have been typically associated ...,GIV motif (residues 36-45: GIVQQQNNLL) in the ...
1,Frontiers in antiviral therapy and immunothera...,These activities take place in various subcell...,What does the author anticipate that continued...,Another mRNA under post-transcriptional regula...
2,"MERS coronavirus: diagnostics, epidemiology an...",Detection of MERS-CoV infection using ELISA or...,What does the confirmatory process aim to ens...,the antibodies detected are able to specifical...
3,First cases of coronavirus disease 2019 (COVID...,Understanding the infection-severity is critic...,Why are serological tests vital?,to understand the proportion of cases who are ...
4,"MERS coronavirus: diagnostics, epidemiology an...","However, as is well described, cell culture is...",Which are the preferred method for MERS-CoV de...,PCR-based techniques
