# Approach

Use of official APIs from Wikidata and DBpedia to retrieve candidates from the entities discovered. Then, apply a ranking algorithm to select the resource as the best candidate

In [None]:
%%capture
!pip install -U sentence-transformers

In [7]:
import nltk
nltk.download('omw-1.4')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import time
from SPARQLWrapper import SPARQLWrapper, JSON  
import pandas as pd
import urllib.request as url
import json
import requests

sentence_language_model = "sentence-transformers/all-distilroberta-v1"
sentence_model = SentenceTransformer(sentence_language_model)
lemmatizer = WordNetLemmatizer()

wiki_cache = {}
dbpedia_cache = {}

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/cbadenes/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cbadenes/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
def lemmatize(text):
  result = []
  for token in text.split(" "):
    result.append(lemmatizer.lemmatize(token))
  return " ".join(result)

print("Lemma:",lemmatize("Turing awards"))

def sort_by_similar(text,texts):
  sentences = [text]
  sentences.extend(texts)
  embeddings = sentence_model.encode(sentences)
  sim_list = []
  index=0
  for e in embeddings[1:]:
    ref = embeddings[0]
    score = cosine_similarity([ref], [e])
    score_val = round(score[0][0], 1)
    sim_list.append({'id':index, 'text':texts[index], 'score':score_val})  
    index+=1
  sim_list.sort(key=lambda x: x.get('score'),reverse=True)
  return sim_list

def get_top_candidates(ref_text,candidates,max=-1):
  top_candidates = []
  if (len(candidates) == 0):
        return top_candidates
  sorted_candidates = sort_by_similar(ref_text,[c['text'] for c in candidates])  
  best_score = sorted_candidates[0]['score']
  for index, c in enumerate(sorted_candidates):
    if (index < max) or (c['score'] == best_score):
        candidate = candidates[c['id']]
        candidate['score'] = c['score']
        top_candidates.append(candidate)
  return top_candidates  

def print_candidates(criteria,candidates):
    print("## Sorted Candidates by",criteria,": ",[{'name':c['label'],'description':c['description'],'score':c['score']} for c in candidates])

def get_resources_by_candidates(context,label,candidates,max=-1):
  if (len(candidates) == 0):
    return []
  # sort candidates by name/label
  candidates_by_name = []  
  top_candidates_by_name = get_top_candidates(label,[ {'id':i, 'text':c['label'] } for i,c in enumerate(candidates)],10)
  for t in top_candidates_by_name:
    candidate = candidates[t['id']]
    candidate['score'] = t['score']
    candidates_by_name.append(candidate)
  #print_candidates("By Name",candidates_by_name)
  
  # sort candidates by properties
  candidates_by_properties = []
  candidate_properties = []
  for i,c in enumerate(candidates_by_name):
     for p in c['properties']:
        candidate_properties.append({'id':i, 'text':p['value'] })
  top_candidates_by_prop = get_top_candidates(context.replace(c['label'],""),candidate_properties,10) 
  for t in top_candidates_by_prop:
     candidate = candidates_by_name[t['id']]
     if (candidate not in candidates_by_properties):
        candidate['score'] = (2*candidate['score'] + 4*t['score']) / 6.0
        candidates_by_properties.append(candidate)
  #print_candidates("By Properties",candidates_by_properties)
  
        
  # sort candidates by description
  candidates_by_description = []
  top_candidates_by_desc = get_top_candidates(context.replace(c['label'],""),[ {'id':i, 'text':c['description'] } for i,c in enumerate(candidates_by_properties)],max)    
  for t in top_candidates_by_desc:
     candidate = candidates_by_properties[t['id']]
     if (candidate not in candidates_by_description):
        candidate['score'] = (2*candidate['score'] + 1*t['score']) / 3.0
        candidates_by_description.append(candidate)  
  #print_candidates("By Description",candidates_by_description)
  return candidates_by_description


Lemma: Turing award


## Wikidata Searching based on MediaWiki API

The MediaWiki Action API is a web service that allows access to some wiki-features like authentication, page operations, and search. It can provide meta information about the wiki and the logged-in user.

action=wbsearchentities

Searches for entities using labels and aliases.

Returns a label and description for the entity in the user language if possible. Returns details of the matched term. The matched term text is also present in the aliases key if different from the display label.

In [3]:
%%capture
!pip3 install sparqlwrapper

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:
wiki_sparql = SPARQLWrapper("https://query.wikidata.org/sparql",agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36')
wiki_sparql.setReturnFormat(JSON)
wiki_sparql.setTimeout(timeout=60)

def get_wikidata_properties(entity,use_cache=False):
  if (use_cache) and (entity in wiki_cache):
    #print("use of cache!")
    return wiki_cache[entity].copy()
  query = """
      PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
      PREFIX wd: <http://www.wikidata.org/entity/> 
      SELECT distinct ?prop ?propLabel
      WHERE
      {
        { wd:ENTITY ?a ?b }
              union
              { ?s ?a wd:ENTITY } .

        SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } 
        ?prop wikibase:directClaim ?a .
      } 
      LIMIT 250
      """
  query_text = query.replace('ENTITY',entity)
  wiki_sparql.setQuery(query_text)
  result = []
  while (len(result) == 0):
    try:
        ret = wiki_sparql.queryAndConvert()
        for r in ret["results"]["bindings"]:
            if ('propLabel' in r) and ('value' in r['propLabel']):
                    value = r['propLabel']['value']
                    id = r['prop']['value'].split("http://www.wikidata.org/entity/")[1]
                    if ('id' not in value.lower()) and ('link' not in value.lower()) and ('has abstract' not in value.lower()) and ('wiki' not in value.lower()) and ('instance of' not in value.lower()):
                        result.append({'id':id, 'value':value})
    except Exception as e:
        print("Error on wikidata property query:",e,"->",query_text)
    break           
  wiki_cache[entity] = result
  return result

def get_wikidata_candidates(label,use_cache=True,verbose=False):
    if (use_cache) and (label in wiki_cache):
      #print("use of cache for label:",label)
      return wiki_cache[label].copy()
    candidates = []
    if (label==""):
        return candidates
    # type: One of the following values: form, form, item, lexeme, property, sense, sense
    headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    query_path = "https://www.wikidata.org/w/api.php?action=wbsearchentities&search=QUERY_TEXT&language=en&limit=10&type=item&format=json"
    request = query_path.replace("QUERY_TEXT",label)
    if (verbose):
        print("Request:",request)
    r = requests.get(request,headers = headers)
    if (len(r.json()['search']) == 0):
      if (verbose):
        print("search by lemma:",lemmatize(label))
      r = requests.get(query_path.replace("QUERY_TEXT",lemmatize(label)))
      size = len(label.split(" "))
      index = 1
      while(('search' in r.json()) and (len(r.json()['search']) == 0) and (index<size)):
        query_label = " ".join(label.split(" ")[index:])
        index += 1  
        if (verbose):
          print("search by Partial Label:",query_label)
        r = requests.get(query_path.replace("QUERY_TEXT",query_label)) 
    if (verbose):
        print("Response:",r.json())
    for answer in r.json()['search']:
        description = ""
        if ('description' in answer['display']):
          description = answer['display']['description']['value']
          if 'disambiguation' in description:
                continue
        candidate = {
            'label': answer['display']['label']['value'],
            'id':answer['id'],
            'description' : description,
            'properties' : get_wikidata_properties(answer['id'],use_cache)
        }
        candidates.append(candidate)
    wiki_cache[label]=candidates
    #print("cache '",label,"' updated with:'",[c['id'] for c in candidates])
    return candidates

def get_wikidata_resource(context,entity,max=-1,use_cache=True):
    candidates = get_wikidata_candidates(entity,use_cache)
    lema_entity = lemmatize(entity)
    if (entity != lema_entity):
        candidate_ids = [c['id'] for c in candidates]
        for ac in get_wikidata_candidates(lema_entity,use_cache):
            if (ac['id'] not in candidate_ids):
                candidates.append(ac)
    resources = get_resources_by_candidates(context, entity, candidates,max)
    return resources


## DBpedia Searching based on the Lookup Service

The DBpedia Lookup is an entity retrieval service for Linked Data. It provides a straightforward solution for the frequent use case of resolving keywords and natural language to related resource identifiers in the DBpedia knowledge graph. Related means that either the label or abstract of a resource matches, or an anchor text that was frequently used in Wikipedia to refer to a specific resource matches (e.g. the resource http://dbpedia.org/resource/United_States can be looked up by the string “USA”). 

So whether you need an auto-complete service for your RDF application, Linked Data enhancements for your CSV tables or simply a way to retrieve specific DBpedia identifiers – the DBpedia Lookup is for you!

As a part of the DBpedia Technology Stack the DBpedia Lookup can be deployed conveniently via Docker and works well with DBpedia Databus Collections. The DBpedia Lookup uses an Apache Lucene Index for resource indexing and retrieval and provides a web interface for querying.

In [10]:
dbpedia_sparql = SPARQLWrapper("https://dbpedia.org/sparql/")
dbpedia_sparql.setReturnFormat(JSON)
dbpedia_sparql.setTimeout(timeout=60)

def get_dbpedia_properties(entity,use_cache=True):
    #select distinct ?property ?label {
      key = entity+"_props"
      if (use_cache) and (key in dbpedia_cache):
          #print("use of cache!")
          return dbpedia_cache[key].copy()
      query = """
          PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
          PREFIX dbr: <http://dbpedia.org/resource/> 
          select distinct ?property ?label {
              { <http://dbpedia.org/resource/ENTITY> ?property ?o }
              union
              { ?s ?property <http://dbpedia.org/resource/ENTITY> }

              optional { 
                ?property rdfs:label ?label .
                filter langMatches(lang(?label), 'en')
              }
              filter(regex(?property, "property", "i" )) 
          }
          LIMIT 250
          """
      query_text = query.replace('ENTITY',entity)
      dbpedia_sparql.setQuery(query_text)
      result = []
      while (len(result) == 0):
        try:
            ret = dbpedia_sparql.queryAndConvert()
            for r in ret["results"]["bindings"]:
                if ('label' in r) and ('value' in r['label']):
                    value = r['label']['value']
                    id = r['property']['value']
                    if ('id' not in value.lower()) and ('link' not in value.lower()) and ('has abstract' not in value.lower()) and ('wiki' not in value.lower()) and ('instance of' not in value.lower()):
                        result.append({'id':id, 'value':value})
        except Exception as e:
            print("Error on SPARQL query:",e)
        break           
      dbpedia_cache[key] = result
      #print(len(result),"properties found")
      return result

def get_dbpedia_candidates(label,use_cache=True,verbose=False):
    if (use_cache) and (label in dbpedia_cache):
      #print("use of cache!")
      return dbpedia_cache[label].copy()
    candidates = []
    if (label==""):
        return candidates
    # type: One of the following values: form, form, item, lexeme, property, sense, sense
    #query_path = "https://www.wikidata.org/w/api.php?action=wbsearchentities&search=QUERY_TEXT&language=en&limit=10&type=item&format=json"
    query_path = "https://lookup.dbpedia.org/api/search?format=JSON&query=QUERY_TEXT&maxResults=10"
    url = query_path.replace("QUERY_TEXT",label)
    #print("->",url)
    r = requests.get(url)
    if (len(r.json()['docs']) == 0):
        if (verbose):
            print("Use of lemmatize literal:",lemmatize(label))
        r = requests.get(query_path.replace("QUERY_TEXT",lemmatize(label)))
        size = len(label.split(" "))
        index = 1  
        while(('search' in r.json()) and (len(r.json()['search']) == 0) and (index<size)):
            query_label = " ".join(label.split(" ")[index:])
            index += 1  
            if (verbose):
              print("search by Partial Label:",query_label)
            r = requests.get(query_path.replace("QUERY_TEXT",query_label)) 
    for answer in r.json()['docs']:
        description,label,id = "","",""
        properties = []
        if ('comment' in answer) and (len(answer['comment']) > 0):
          description = answer['comment'][0].replace("<B>","").replace("</B>","")
        if ('resource' in answer) and (len(answer['resource']) > 0):
          id = answer['resource'][0].split("http://dbpedia.org/resource/")[1]
          properties = get_dbpedia_properties(id,use_cache)
        if ('label' in answer) and (len(answer['label']) > 0):
          label = answer['label'][0].replace("<B>","").replace("</B>","")
        else:
          label = id        
        candidate = {
            'label': label,
            'id':id,
            'description' : description,
            'properties' : properties
        }
        candidates.append(candidate)
    dbpedia_cache[label]=candidates
    return candidates

def get_dbpedia_resource(context,entity,max=-1,use_cache=True):
    candidates = get_dbpedia_candidates(entity,use_cache)
    lema_entity = lemmatize(entity)
    if (entity != lema_entity):
        candidate_ids = [c['id'] for c in candidates]
        for ac in get_dbpedia_candidates(lema_entity,use_cache):
            if (ac['id'] not in candidate_ids):
                candidates.append(ac)
    resources = get_resources_by_candidates(context, entity, candidates,max)
    return resources   

# Evaluation

## Datasets

### SimpleQuestions Dataset

#### Wikidata SimpleQuestions

In [12]:
import pandas as pd

df = pd.read_csv('data/wsq-labels.csv', index_col=0)
wsq_df = df.drop(['predicate','object','predicate_label','object_label'], axis=1)
wsq_df = wsq_df.rename(columns = {'subject':'resources','subject_label':'entities'})
wsq_df = wsq_df[['question','entities','resources']]
wsq_df['entities'] = [ [e] for e in wsq_df['entities'].tolist()]
wsq_df['resources'] = [ [e] for e in wsq_df['resources'].tolist()]
wsq_df.head()

Unnamed: 0,question,entities,resources
0,Where did roger marquis die,[Roger Marquis],[Q7358590]
1,what was the cause of death of yves klein,[Yves Klein],[Q154335]
2,What position does carlos gomez play?,[Carlos Gómez],[Q2747238]
3,how does engelbert zaschka identify,[Engelbert Zaschka],[Q62498]
4,what position does pee wee reese play in baseball,[Pee Wee Reese],[Q182485]


In [13]:
wsq_df.describe()

Unnamed: 0,question,entities,resources
count,5622,5622,5622
unique,5605,5161,5189
top,Name an actor.,[drama film],[Q130232]
freq,6,25,25


#### DBpedia SimpleQuestions

From paper: 

In [116]:
!pip install unidecode

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting unidecode
  Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)
     |████████████████████████████████| 235 kB 1.3 MB/s            
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.4


In [15]:
import urllib.request as url
import json
import unidecode


def normalize(label):
  return unidecode.unidecode(label.strip())


stream = url.urlopen("https://raw.githubusercontent.com/castorini/SimpleDBpediaQA/master/V1/test.json")
content = stream.read()
data = json.loads(content)
ref_questions = [e.lower().strip() for e in pd.read_csv('data/wsq-labels.csv', index_col=0)['question'].tolist()]
counter = 0
total = 0
rows = []
for question in data['Questions']:
  total += 1
  if (question['Query'].lower().strip() in ref_questions):
    counter += 1
    row = {
        'question':question['Query'],
        'entities':[normalize(question['Subject'].split("http://dbpedia.org/resource/")[1].replace("_"," "))],
        'resources':[question['Subject'].split("http://dbpedia.org/resource/")[1]],
    }
    rows.append(row)
print("Total:",total,"Counter:",counter,"Ref Size:",len(ref_questions))
dsq_df = pd.DataFrame(rows)
dsq_df.head()

Total: 8595 Counter: 3688 Ref Size: 5622


Unnamed: 0,question,entities,resources
0,Where did roger marquis die,[Roger Marquis],[Roger_Marquis]
1,What position does carlos gomez play?,[Carlos Gomez],[Carlos_Gómez]
2,how does engelbert zaschka identify,[Engelbert Zaschka],[Engelbert_Zaschka]
3,what position does pee wee reese play in baseball,[Pee Wee Reese],[Pee_Wee_Reese]
4,Which Swiss conductor's cause of death is myoc...,[Myocardial infarction],[Myocardial_infarction]


In [16]:
dsq_df.describe()

Unnamed: 0,question,entities,resources
count,3688,3688,3688
unique,3656,3420,3420
top,Name an actor.,[Actor],[Actor]
freq,12,25,25


### WikidataQA Dataset

In [17]:
import pandas as pd

def split_values(text):
  return text.replace('\'',"").replace("[","").replace("]","").split(",")

df = pd.read_csv('data/wqa-labels.csv', index_col=0)
wqa_df = df.drop(['predicates','objects','predicate_labels','object_labels'], axis=1)
wqa_df = wqa_df.rename(columns = {'subjects':'resources','subject_labels':'entities'})
wqa_df = wqa_df[['question','entities','resources']]
wqa_df['entities'] = [ split_values(e) for e in wqa_df['entities'].tolist()]
wqa_df['resources'] = [ split_values(e) for e in wqa_df['resources'].tolist()]
wqa_df.head()

Unnamed: 0,question,entities,resources
0,Who is the president of Poland?,[president of Poland],[Q1054799]
1,How many Turing awards have people from Austri...,"[Turing awards, Austria]","[Q185667, Q40]"
2,Give me all countries that have won a FIFA Wor...,[FIFA World Cup],[Q19317]
3,What is the population of Chile?,[Chile],[Q298]
4,Who is the author of One Piece?,[One Piece],[Q673]


In [18]:
wqa_df.describe()

Unnamed: 0,question,entities,resources
count,101,101,101
unique,101,95,95
top,Who is the president of Poland?,[],[]
freq,1,4,4


## Metrics

In [19]:
def normalize(label):
  return label.strip()

def precision(tp,fp):
  if (fp+tp == 0):
    return 0.0
  return tp / (fp + tp)

def recall(tp,fn):
  if (fn+tp == 0):
    return 0.0
  return tp / (fn + tp)

def f1(tp,fp,fn):
  p = precision(tp,fp)
  r = recall(tp,fn)
  if (p+r == 0):
    return 0.0
  return 2 * ((p*r)/(p+r))

def average(values):
  return sum(values) / len(values) 

# lists of entity lists
def evaluate_labels(true_list,pred_list):
  tp, tn, fp, fn = 0, 0, 0, 0
  precision_list, recall_list, f1_list = [], [], []
  empty_values = 0
  for index in range(len(true_list)):
    # normalize entities
    valid_entities = [normalize(e) for e in true_list[index] if e != '']
    predicted_entities = [normalize(e) for e in pred_list[index]]
    ptp, ptn, pfp, pfn = 0, 0, 0, 0
    if (len(valid_entities)==0):
      empty_values += 1
    for entity in valid_entities:
      if (entity not in predicted_entities):
        pfn += 1
    for entity in predicted_entities:
      if (entity in valid_entities):
        ptp += 1
      else:
        pfp += 1    
    precision_list.append(precision(ptp,pfp))
    recall_list.append(recall(ptp,pfn))
    f1_list.append(f1(ptp,pfp,pfn))
    tp += ptp
    tn += ptn
    fp += pfp
    fn += pfn  
  return  {
      'total': index,
      'empty': empty_values,
      'tp': tp,
      'tn': tn, 
      'fp': fp,
      'fn':fn,
      'micro-precision': precision(tp,fp),
      'micro-recall': recall(tp,fn),
      'micro-f1': f1(tp,fp,fn),
      'macro-precision': average(precision_list),
      'macro-recall': average(recall_list),
      'macro-f1': average(f1_list)
  }  

## SOTA Methods

### DBpedia Spotlight


In [17]:
%%capture
!pip install spacy-dbpedia-spotlight

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [120]:
!pip install --upgrade numpy

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting numpy
  Using cached numpy-1.22.3-cp39-cp39-macosx_11_0_arm64.whl (12.8 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.21.4
    Uninstalling numpy-1.21.4:
      Successfully uninstalled numpy-1.21.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
en-core-web-sm 3.2.0 requires spacy<3.3.0,>=3.2.0, but you have spacy 3.0.6 which is incompatible.[0m
Successfully installed numpy-1.22.3


In [20]:
import spacy

lang = "en"
spacy_nlp = spacy.blank(lang)
spacy_nlp.add_pipe('dbpedia_spotlight', config={'confidence': 0.4, 'overwrite_ents':False, 'language_code': lang})


spotlight_cache = {}
def get_resources_by_dbpedia_spotlight(query,entities=[]):
  key = query.replace(" ","_")  
  if (key in spotlight_cache):
        print("use of spotlight cache!")
        return spotlight_cache[key]
  doc = spacy_nlp(query)
  resources = []
  for ent in doc.spans['dbpedia_spotlight']:
    uri = ent.kb_id_.split("/")
    resources.append({
        'label': ent.text,
        'id': uri[len(uri)-1],
        'score': ent._.dbpedia_raw_result['@similarityScore']
    })
  spotlight_cache[key]=resources
  return resources

def retrieve_resources_by_dbpedia_spotlight(query,entities=[]):
  return [ r['id'] for r in  get_resources_by_dbpedia_spotlight(query,entities)]

print(get_resources_by_dbpedia_spotlight("Which Swiss conductor's cause of death is myocardial infarction?"))
print(retrieve_resources_by_dbpedia_spotlight("Which Swiss conductor's cause of death is myocardial infarction?"))

[{'label': 'Swiss', 'id': 'Switzerland', 'score': '0.9724504598633273'}, {'label': 'conductor', 'id': 'Conducting', 'score': '0.9962600187612703'}, {'label': 'infarction', 'id': 'Infarction', 'score': '0.983796696254073'}]
use of spotlight cache!
['Switzerland', 'Conducting', 'Infarction']


### Wikipedia Entity Linker

In [20]:
%%capture
!pip install spacy-entity-linker
!python -m spacy_entity_linker "download_knowledge_base"
!pip install spacy==3.0.6
!python -m spacy download en_core_web_sm
# require restart runtime!!!!

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [21]:
import spacy  # version 3.0.6'

# initialize language model
nlp = spacy.load("en_core_web_sm")

# add pipeline (declared through entry_points in setup.py)
nlp.add_pipe("entityLinker", last=True)

wel_cache = {}
def get_resources_by_wikidata_entity_linking(query,entities=[]):
  key = query.replace(" ","_")  
  if (key in wel_cache):
        print("use of wel cache!")
        return wel_cache[key]
  doc = nlp(query)  
  # returns all entities in the whole document
  all_linked_entities = doc._.linkedEntities
  # iterates over sentences and prints linked entities
  resources = []
  for sent in doc.sents:
    for i in sent._.linkedEntities:
      resources.append({
          'label':i.label,
          'id':"Q"+str(i.identifier),
          'score':-1
      })      
    #sent._.linkedEntities.pretty_print()
  wel_cache[key] = resources
  return resources

def retrieve_resources_by_wikidata_entity_linking(query,entities=[]):
  return [ r['id'] for r in  get_resources_by_wikidata_entity_linking(query,entities)]

print(get_resources_by_wikidata_entity_linking("I watched the Pirates of the Caribbean last silvester"))
print(retrieve_resources_by_wikidata_entity_linking("I watched the Pirates of the Caribbean last silvester"))

[{'label': 'Pirates of the Caribbean', 'id': 'Q194318', 'score': -1}, {'label': 'Caribbean', 'id': 'Q664609', 'score': -1}, {'label': 'Silvester', 'id': 'Q12525597', 'score': -1}]
use of wel cache!
['Q194318', 'Q664609', 'Q12525597']


# Results

In [22]:
from IPython.display import clear_output
import json
import pandas as pd

def json_file(name):
  return name+"-resources.json"

def csv_file(name):
  return name+"-resources.csv"

def retrieve_wikidata_candidates(query,entities,use_cache=True):
  resources = []
  for e in entities:
    for r in get_wikidata_candidates(e,use_cache):
      resources.append(r['id'])
  return resources

def retrieve_wikidata_resources(query,entities,max=-1,use_cache=True):
  resources = []
  for e in entities:
    for r in get_wikidata_resource(query,e,max,use_cache):
      resources.append(r['id'])
  return resources

def evaluate_wiki_data(name,dataframe, max=-1):
  l1, l2, l3,l4, l5, l6 = [], [], [], [], [], []
  total = 0
  for index, row in dataframe.iterrows():
      if (max != -1) and (total > max):
        break
      question = row['question']
      entities = row['entities']
      print(index,":",question)
      l1.append(retrieve_wikidata_candidates(question,entities))
      l2.append(retrieve_wikidata_resources(question,entities))
      l3.append(retrieve_wikidata_resources(question,entities,1))
      l4.append(retrieve_wikidata_resources(question,entities,2))
      l5.append(retrieve_wikidata_resources(question,entities,3))  
      l6.append(retrieve_resources_by_wikidata_entity_linking(question,entities))      
      total += 1
  dataframe['MuHeQA_Cand']=l1
  dataframe['MuHeQA_Rank']=l2
  dataframe['MuHeQA_Top1']=l3
  dataframe['MuHeQA_Top2']=l4
  dataframe['MuHeQA_Top3']=l5
  dataframe['WEL']=l6  
  clear_output(wait=True)
  print(total,"questions analyzed!")
  dataframe.to_json(json_file(name), orient='split')
  dataframe.to_csv(csv_file(name))
  return dataframe

def retrieve_dbpedia_candidates(query,entities,use_cache=True):
  resources = []
  for e in entities:
    for r in get_dbpedia_candidates(e,use_cache):
      resources.append(r['id'])
  return resources

def retrieve_dbpedia_resources(query,entities,max=-1,use_cache=True):
  resources = []
  for e in entities:
    for r in get_dbpedia_resource(query,e,max,use_cache):
      resources.append(r['id'])
  return resources


def evaluate_dbpedia_data(name,dataframe,max=-1):
  l1, l2, l3, l4, l5, l6 = [], [], [], [], [], []
  total = 0
  for index, row in dataframe.iterrows():
      if (max != -1) and (total > max):
        break
      question = row['question']
      entities = row['entities']
      print(index,":",question)
      l1.append(retrieve_dbpedia_candidates(question,entities))
      l2.append(retrieve_dbpedia_resources(question,entities))
      l3.append(retrieve_dbpedia_resources(question,entities,1))
      l4.append(retrieve_dbpedia_resources(question,entities,2))
      l5.append(retrieve_dbpedia_resources(question,entities,3))    
      l6.append(retrieve_resources_by_dbpedia_spotlight(question,entities))      
      total += 1
  dataframe['MuHeQA_Cand']=l1
  dataframe['MuHeQA_Rank']=l2
  dataframe['MuHeQA_Top1']=l3
  dataframe['MuHeQA_Top2']=l4
  dataframe['MuHeQA_Top3']=l5
  dataframe['Spotlight']=l6
  clear_output(wait=True)
  print(total,"questions analyzed!")
  dataframe.to_json(json_file(name), orient='split')
  dataframe.to_csv(csv_file(name))
  return dataframe

def make_report(name,additional=[]):
  
  df = pd.read_json(json_file(name), orient='split')
  y_true =df['resources'].tolist()
  results = []
  for col in df.columns:
    if (col == 'question') or (col == 'entities') or (col == 'resources'):
      continue
    y_pred = df[col].tolist()
    result = evaluate_labels(y_true,y_pred)
    result['model']=col
    results.append(result)

  for row in additional:
    results.append(row)

  df_results = pd.DataFrame(results)
  return df_results

In [26]:
question = "what language is spoken in the medic"
entities = ["the medic"]
    
print("Wikidata Candidates:",retrieve_wikidata_candidates(question,entities))
print("Wikidata Resources:",retrieve_wikidata_resources(question,entities))
print("Wikidata Linking:",retrieve_resources_by_wikidata_entity_linking(question,entities))
print("DBpedia Candidates:",retrieve_dbpedia_candidates(question,entities,use_cache=False))
print("DBpedia Resources:",retrieve_dbpedia_resources(question,entities,use_cache=False))
print("DBpedia Spotlight:",retrieve_resources_by_dbpedia_spotlight(question,entities))


Wikidata Candidates: ['Q1517084', 'Q3631400', 'Q7750866', 'Q26842045', 'Q7750863', 'Q27711957', 'Q27711998', 'Q27709884', 'Q22122164']
Wikidata Resources: ['Q7750866']
Wikidata Linking: ['Q315', 'Q1201260']
DBpedia Candidates: ['Doctor_of_Medicine', 'Combat_medic', 'SS_Medic', 'Medic', 'Chaplain–Medic_massacre', 'Street_medic', 'Mass_media', 'The_Medic', 'Medic_(TV_series)', 'The_Medic_Droid']
DBpedia Resources: ['Medic_(TV_series)']
DBpedia Spotlight: ['Language', 'Medic']


In [139]:
import sys
import logging

so = open("out.log", 'w', 10)
sys.stdout.echo = so
sys.stderr.echo = so

get_ipython().log.handlers[0].stream = so
get_ipython().log.setLevel(logging.INFO)

## SimpleQuestions Dataset

### Wikidata SimpleQuestions Dataset

In [None]:
evaluate_wiki_data('wsq',wsq_df)

In [None]:
make_report('wsq')

### DBpedia SimpleQuestions Dataset

In [141]:
evaluate_dbpedia_data('dsq',dsq_df)

3688 questions analyzed!


Unnamed: 0,question,entities,resources,MuHeQA_Cand,MuHeQA_Rank,MuHeQA_Top1,MuHeQA_Top2,MuHeQA_Top3,Spotlight
0,Where did roger marquis die,[Roger Marquis],[Roger_Marquis],"[Roger_Marquis,_2nd_Earl_of_Woolton, Ça_Ira_(o...",[Roger_Marquis],[Roger_Marquis],[Roger_Marquis],[Roger_Marquis],[Roger_Marquis]
1,What position does carlos gomez play?,[Carlos Gomez],[Carlos_Gómez],"[Carlos_Gomes_Júnior, Juan_Carlos_Gómez, Carlo...",[Carlos_Gómez],[],[Scott_Gomez],[Scott_Gomez],[Carlos_Gómez]
2,how does engelbert zaschka identify,[Engelbert Zaschka],[Engelbert_Zaschka],[Engelbert_Zaschka],[Engelbert_Zaschka],[Engelbert_Zaschka],[Engelbert_Zaschka],[Engelbert_Zaschka],[Engelbert_Zaschka]
3,what position does pee wee reese play in baseball,[Pee Wee Reese],[Pee_Wee_Reese],"[Pee_Wee_Reese, American_Amateur_Baseball_Cong...",[Pee_Wee_Reese],[Pee_Wee_Reese],[Pee_Wee_Reese],[Pee_Wee_Reese],"[Pee_Wee_Reese, Urine]"
4,Which Swiss conductor's cause of death is myoc...,[Myocardial infarction],[Myocardial_infarction],"[Myocardial_infarction, Myocardial_infarction_...",[Myocardial_infarction],[Myocardial_infarction],[Myocardial_infarction],[Myocardial_infarction],"[Switzerland, Conducting, Infarction]"
...,...,...,...,...,...,...,...,...,...
3683,"What country was malouf abraham, sr. born to","[Malouf Abraham, Sr.]","[Malouf_Abraham,_Sr.]",[],[],[],[],[],"[Abraham, Serbian_language]"
3684,is zhang ziyi female or male,[Zhang Ziyi],[Zhang_Ziyi],"[Zhang_Ziyi, Ziyu_Zhang, List_of_awards_and_no...",[Zhang_Ziyi],[Zhang_Ziyi],[Zhang_Ziyi],[Zhang_Ziyi],[Zhang_Ziyi]
3685,What genre is the book circle of friends?,[Circle of Friends (novel)],[Circle_of_Friends_(novel)],"[Circle_of_Friends_(novel), Dōjin, Neil_LaBute...",[Circle_of_Friends_(novel)],[Circle_of_Friends_(novel)],[Circle_of_Friends_(novel)],[Circle_of_Friends_(novel)],[Circle]
3686,Who is a notable figure that was born in barce...,[Barcelona],[Barcelona],"[Barcelona, FC_Barcelona, FC_Barcelona_B, Prov...",[Barcelona],[Barcelona],[Barcelona],[Barcelona],[Barcelona]


In [142]:
make_report('dsq')

Unnamed: 0,total,empty,tp,tn,fp,fn,micro-precision,micro-recall,micro-f1,macro-precision,macro-recall,macro-f1,model
0,3687,0,3506,0,25380,183,0.121374,0.950393,0.215257,0.210764,0.95038,0.296229,MuHeQA_Cand
1,3687,0,3367,0,198,321,0.94446,0.912961,0.928443,0.912012,0.912961,0.912328,MuHeQA_Rank
2,3687,0,3369,0,169,319,0.952233,0.913503,0.932466,0.913503,0.913503,0.913503,MuHeQA_Top1
3,3687,0,3351,0,467,337,0.877685,0.908623,0.892886,0.888964,0.908623,0.895517,MuHeQA_Top2
4,3687,0,3356,0,718,332,0.82376,0.909978,0.864726,0.86438,0.909978,0.879338,MuHeQA_Top3
5,3687,0,2345,0,1930,1344,0.548538,0.635674,0.5889,0.548717,0.635575,0.576762,Spotlight


## WikidataQA

In [89]:
evaluate_wiki_data('wqa',wqa_df)

101 questions analyzed!


Unnamed: 0,question,entities,resources,MuHeQA_Cand,MuHeQA_Rank,MuHeQA_Top1,MuHeQA_Top2,MuHeQA_Top3,WEL
0,Who is the president of Poland?,[president of Poland],[Q1054799],"[Q1054799, Q7241287]",[Q1054799],[Q1054799],[Q1054799],[Q1054799],"[Q1054799, Q36]"
1,How many Turing awards have people from Austri...,"[Turing awards, Austria]","[Q185667, Q40]","[Q185667, Q56067342, Q185667, Q40, Q131964, Q2...","[Q185667, Q185667, Q40]","[Q185667, Q40]","[Q185667, Q185667, Q40]","[Q185667, Q185667, Q40]","[Q185667, Q5, Q40]"
2,Give me all countries that have won a FIFA Wor...,[FIFA World Cup],[Q19317],"[Q19317, Q864001, Q176883, Q170645, Q150933, Q...",[Q19317],[Q19317],[Q19317],[Q19317],"[Q6256, Q19317]"
3,What is the population of Chile?,[Chile],[Q298],"[Q298, Q1045129, Q396324, Q18418541, Q5490088,...",[Q298],[Q298],[Q298],[Q298],"[Q2625603, Q298]"
4,Who is the author of One Piece?,[One Piece],[Q673],"[Q200539, Q673, Q710324, Q28667972, Q4431905, ...",[Q28667972],[Q673],"[Q673, Q710324]","[Q673, Q28667972, Q710324]","[Q482980, Q1048718, Q27953041]"
...,...,...,...,...,...,...,...,...,...
96,Who wrote The Old Man and the Sea?,[The Old Man and the Sea],[Q26505],"[Q26505, Q1198269, Q177145, Q387241, Q7754883,...",[Q26505],[Q26505],[Q26505],[Q26505],"[Q1055469, Q498805]"
97,Which YouTube channels talk about maths?,"[YouTube channels, maths]","[Q17558136, Q395]","[Q108932203, Q108931581, Q17558136, Q63185508,...","[Q110991190, Q6786758]","[Q110991190, Q21148294]","[Q17558136, Q21148294]","[Q17558136, Q21148294]","[Q866, Q395]"
98,List Italian sauces.,"[sauce, Italy]","[Q178359, Q38]","[Q178359, Q249114, Q1242466, Q429855, Q1015993...","[Q178359, Q38]","[Q178359, Q38]","[Q178359, Q38]","[Q178359, Q38]","[Q178359, Q52715628]"
99,What diseases are associated with the gene FGF14?,[FGF14],[Q17928040],"[Q17928040, Q18250567, Q24785004, Q24396191, Q...",[Q17928040],[Q17928040],[Q17928040],[Q17928040],"[Q12136, Q7187, Q17928040]"


In [90]:
make_report('wqa')

Unnamed: 0,total,empty,tp,tn,fp,fn,micro-precision,micro-recall,micro-f1,macro-precision,macro-recall,macro-f1,model
0,100,4,136,0,1513,22,0.082474,0.860759,0.150526,0.122803,0.828383,0.18893,MuHeQA_Cand
1,100,4,83,0,107,71,0.436842,0.538961,0.482558,0.496445,0.520627,0.503386,MuHeQA_Rank
2,100,4,84,0,62,66,0.575342,0.56,0.567568,0.575083,0.570132,0.571782,MuHeQA_Top1
3,100,4,90,0,76,62,0.542169,0.592105,0.566038,0.558251,0.585809,0.568725,MuHeQA_Top2
4,100,4,93,0,106,59,0.467337,0.611842,0.529915,0.518152,0.605611,0.54989,MuHeQA_Top3
5,100,4,95,0,112,55,0.458937,0.633333,0.532213,0.521452,0.636964,0.546582,WEL
