# Entity Recognition

In [None]:
%%capture
!pip install --upgrade pip
!pip install flair
!pip install unidecode

In [None]:
import logging
#logging.basicConfig(filename='myfirstlog.log',level=logging.INFO,filemode='w',format='%(asctime)s | %(name)s | %(levelname)s | %(message)s')
logging.basicConfig(level=logging.INFO,format='%(asctime)s | %(name)s | %(levelname)s | %(message)s')
logging.info('This message will be logged')
logging.debug('This message will not be logged')

In [2]:
import unidecode
import hashlib

# PoS tagger
from flair.data import Sentence
from flair.models import SequenceTagger

# uncased NER model
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

def hash_text(text):
    hash_object = hashlib.md5(text.encode())
    md5_hash = hash_object.hexdigest()
    return str(md5_hash) 

pos_language_model = "flair/pos-english"
ner_language_model = "dslim/bert-base-NER-uncased"

ner_tagger = SequenceTagger.load(pos_language_model)
ner_tokenizer = AutoTokenizer.from_pretrained(ner_language_model)
ner_model = AutoModelForTokenClassification.from_pretrained(ner_language_model)
ner_nlp = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)

entities_cache = {}

def unicase(label):
  return unidecode.unidecode(label.strip()).lower()

def get_pos_entities(text,main_categories=['CD','NN','NNS','NNP','NNPS'],additional_categories=['JJ','NNS','CC'],drop_categories=['IN']):
    # make sentence
    sentence = Sentence(text)

    # predict NER tags
    ner_tagger.predict(sentence)

    # iterate over entities 
    entities = []
    current_entity = ""
    partial_entity = ""
    for t in sentence.tokens:
        for label in t.annotation_layers.keys():
            text = t.text
            label = t.get_labels(label)[0].value   
            if (label in main_categories):
                if (len(partial_entity)>0):
                  current_entity = partial_entity + " " + text
                  partial_entity = ""
                elif (current_entity == ""):
                  current_entity += text
                else:
                    current_entity += " " + text
            elif(label in additional_categories):
              if (len(current_entity)>0):
                current_entity += " " + text
              elif (len(partial_entity)>0):
                partial_entity += " " + text
              else:
                partial_entity += text
            elif(label in drop_categories):
               current_entity = ""
               partial_entity = ""
            elif len(current_entity) > 0:
                entities.append(current_entity)
                current_entity = ""
                partial_entity = ""
          
    if (len(current_entity)>0):
        entities.append(current_entity)
    return entities

def get_entities(text,additional=True):
    key = hash_text(text)
    if (key in entities_cache):
        #print("use of entities_cache!")
        return entities_cache[key]
    entities = []
    entity = ""
    index = -1
    offset = -1
    for token in ner_nlp(text):
        if (index == -1):
            index = token['index']
            offset = token['start']
        word = token['word']
        if (word[0] == '#'):
            word = token['word'].replace("#","")
            
        if (token['start']== offset):
            entity += word
        elif (token['index']-index < 2):
            entity += " " + word
        else:
            entities.append(entity)
            entity = word
        index = token['index']
        offset = token['end']
        
    if (len(entity) > 0):    
        entities.append(entity)
    if additional:      
      pos_entities = [unicase(e) for e in get_pos_entities(text)]      
      if (len(entities) == 0):
        return pos_entities
      final_entities = [] 
      for e in entities:
          final_entity = e
          for pe in pos_entities:
            if (e in pe):
              final_entity = pe
          final_entities.append(final_entity)
      if (len(final_entities)>0):
        return list(set(final_entities))
    entities_cache[key] = entities
    return entities


2022-05-03 11:35:20,037 loading file /Users/cbadenes/.flair/models/pos-english/a9a73f6cd878edce8a0fa518db76f441f1cc49c2525b2b4557af278ec2f0659e.121306ea62993d04cd1978398b68396931a39eb47754c8a06a87f325ea70ac63
2022-05-03 11:35:20,820 SequenceTagger predicts: Dictionary with 53 tags: <unk>, O, UH, ,, VBD, PRP, VB, PRP$, NN, RB, ., DT, JJ, VBP, VBG, IN, CD, NNS, NNP, WRB, VBZ, WDT, CC, TO, MD, VBN, WP, :, RP, EX, JJR, FW, XX, HYPH, POS, RBR, JJS, PDT, NNPS, RBS, AFX, WP$, -LRB-, -RRB-, ``, '', LS, $, SYM, ADD


In [3]:

text = "How many Turing awards have people from Austria won?"
print("PoS entities:",get_pos_entities(text))
print("Entities:", get_entities(text))

PoS entities: ['many Turing awards', 'Austria']
Entities: ['austria', 'many turing awards']


# Entity Linking

In [None]:
%%capture
!pip install sentence-transformers
!pip3 install sparqlwrapper

In [4]:
import nltk
nltk.download('omw-1.4')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import time
from SPARQLWrapper import SPARQLWrapper, JSON  
import pandas as pd
import urllib.request as url
import json
import requests

sentence_language_model = "sentence-transformers/all-distilroberta-v1"
sentence_model = SentenceTransformer(sentence_language_model)
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/cbadenes/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cbadenes/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
wiki_cache = {}
dbpedia_cache = {}

In [47]:
def lemmatize(text):
  result = []
  for token in text.split(" "):
    result.append(lemmatizer.lemmatize(token))
  return " ".join(result)

print("Lemma:",lemmatize("Turing awards"))

def sort_by_similar(text,texts):
  sentences = [text]
  sentences.extend(texts)
  embeddings = sentence_model.encode(sentences)
  sim_list = []
  index=0
  for e in embeddings[1:]:
    ref = embeddings[0]
    score = cosine_similarity([ref], [e])
    score_val = round(score[0][0], 1)
    sim_list.append({'id':index, 'text':texts[index], 'score':score_val})  
    index+=1
  sim_list.sort(key=lambda x: x.get('score'),reverse=True)
  return sim_list

def get_top_candidates(ref_text,candidates,max=-1):
  top_candidates = []
  if (len(candidates) == 0):
        return top_candidates
  sorted_candidates = sort_by_similar(ref_text,[c['text'] for c in candidates])  
  best_score = sorted_candidates[0]['score']
  for index, c in enumerate(sorted_candidates):
    if (index < max) or (c['score'] == best_score):
        candidate = candidates[c['id']]
        candidate['score'] = c['score']
        top_candidates.append(candidate)
  return top_candidates  

def print_candidates(criteria,candidates):
    print("## Sorted Candidates by",criteria,": ",[{'name':c['label'], 'id': c['id'], 'description':c['description'],'score':c['score']} for c in candidates])

def get_resources_by_candidates(context,label,candidates,max=-1):
  if (len(candidates) == 0):
    return []
  # sort candidates by name/label
  candidates_by_name = []  
  top_candidates_by_name = get_top_candidates(label,[ {'id':i, 'text':c['label'] } for i,c in enumerate(candidates)],10)
  for t in top_candidates_by_name:
    candidate = candidates[t['id']]
    candidate['score'] = t['score']
    candidates_by_name.append(candidate)
  #print_candidates("By Name",candidates_by_name)
  
  # sort candidates by properties
  candidates_by_properties = []
  candidate_properties = []
  for i,c in enumerate(candidates_by_name):
     for p in c['properties']:
        candidate_properties.append({'id':i, 'text':p['value'] })
  top_candidates_by_prop = get_top_candidates(context.replace(c['label'],""),candidate_properties,10) 
  for t in top_candidates_by_prop:
     candidate = candidates_by_name[t['id']]
     if (candidate not in candidates_by_properties):
        candidate['score'] = (2*candidate['score'] + 4*t['score']) / 6.0
        candidates_by_properties.append(candidate)
  #print_candidates("By Properties",candidates_by_properties)
  
        
  # sort candidates by description
  candidates_by_description = []
  top_candidates_by_desc = get_top_candidates(context.replace(c['label'],""),[ {'id':i, 'text':c['description'] } for i,c in enumerate(candidates_by_properties)],max)    
  for t in top_candidates_by_desc:
     candidate = candidates_by_properties[t['id']]
     if (candidate not in candidates_by_description):
        candidate['score'] = (2*candidate['score'] + 1*t['score']) / 3.0
        candidates_by_description.append(candidate)  
  #print_candidates("By Description",candidates_by_description)
  return candidates_by_description

##########################################################################################
# Wikidata Linker
##########################################################################################
wiki_sparql = SPARQLWrapper("https://query.wikidata.org/sparql",agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36')
wiki_sparql.setReturnFormat(JSON)
wiki_sparql.setTimeout(timeout=60)

def get_wikidata_properties(entity,use_cache=False):
  if (use_cache) and (entity in wiki_cache):
    #print("use of cache!")
    return wiki_cache[entity].copy()
  query = """
      PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
      PREFIX wd: <http://www.wikidata.org/entity/> 
      SELECT distinct ?prop ?propLabel
      WHERE
      {
        { wd:ENTITY ?a ?b }
              union
              { ?s ?a wd:ENTITY } .

        SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } 
        ?prop wikibase:directClaim ?a .
      } 
      LIMIT 250
      """
  query_text = query.replace('ENTITY',entity)
  wiki_sparql.setQuery(query_text)
  result = []
  while (len(result) == 0):
    try:
        ret = wiki_sparql.queryAndConvert()
        for r in ret["results"]["bindings"]:
            if ('propLabel' in r) and ('value' in r['propLabel']):
                    value = r['propLabel']['value']
                    id = r['prop']['value'].split("http://www.wikidata.org/entity/")[1]
                    if ('id' not in value.lower()) and ('link' not in value.lower()) and ('has abstract' not in value.lower()) and ('wiki' not in value.lower()) and ('instance of' not in value.lower()):
                        result.append({'id':id, 'value':value})
    except Exception as e:
        print("Error on wikidata property query:",e,"->",query_text)
    break           
  wiki_cache[entity] = result
  return result

def get_wikidata_candidates(label,use_cache=True,verbose=False):
    if (use_cache) and (label in wiki_cache):
      #print("use of cache for label:",label)
      return wiki_cache[label].copy()
    candidates = []
    if (label==""):
        return candidates
    # type: One of the following values: form, form, item, lexeme, property, sense, sense
    headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    query_path = "https://www.wikidata.org/w/api.php?action=wbsearchentities&search=QUERY_TEXT&language=en&limit=10&type=item&format=json"
    request = query_path.replace("QUERY_TEXT",label)
    if (verbose):
        print("Request:",request)
    r = requests.get(request,headers = headers)
    if (len(r.json()['search']) == 0):
      if (verbose):
        print("search by lemma:",lemmatize(label))
      r = requests.get(query_path.replace("QUERY_TEXT",lemmatize(label)))
      size = len(label.split(" "))
      index = 1
      while(('search' in r.json()) and (len(r.json()['search']) == 0) and (index<size)):
        query_label = " ".join(label.split(" ")[index:])
        index += 1  
        if (verbose):
          print("search by Partial Label:",query_label)
        r = requests.get(query_path.replace("QUERY_TEXT",query_label)) 
    if (verbose):
        print("Response:",r.json())
    for answer in r.json()['search']:
        description = ""
        if ('description' in answer['display']):
          description = answer['display']['description']['value']
          if 'disambiguation' in description:
                continue
        candidate = {
            'label': answer['display']['label']['value'],
            'id':answer['id'],
            'description' : description,
            'properties' : get_wikidata_properties(answer['id'],use_cache)
        }
        candidates.append(candidate)
    wiki_cache[label]=candidates
    #print("cache '",label,"' updated with:'",[c['id'] for c in candidates])
    return candidates

def get_wikidata_resource(context,entity,max=-1,use_cache=True):
    candidates = get_wikidata_candidates(entity,use_cache)
    lema_entity = lemmatize(entity)
    if (entity != lema_entity):
        candidate_ids = [c['id'] for c in candidates]
        for ac in get_wikidata_candidates(lema_entity,use_cache):
            if (ac['id'] not in candidate_ids):
                candidates.append(ac)
    resources = get_resources_by_candidates(context, entity, candidates,max)
    return resources

##########################################################################################
# DBpedia Linker
##########################################################################################

dbpedia_sparql = SPARQLWrapper("https://dbpedia.org/sparql/")
dbpedia_sparql.setReturnFormat(JSON)
dbpedia_sparql.setTimeout(timeout=60)

def get_dbpedia_properties(entity,use_cache=True):
    #select distinct ?property ?label {
      key = entity+"_props"
      if (use_cache) and (key in dbpedia_cache):
          #print("use of cache!")
          return dbpedia_cache[key].copy()
      query = """
          PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
          PREFIX dbr: <http://dbpedia.org/resource/> 
          select distinct ?property ?label {
              { <http://dbpedia.org/resource/ENTITY> ?property ?o }
              union
              { ?s ?property <http://dbpedia.org/resource/ENTITY> }

              optional { 
                ?property rdfs:label ?label .
                filter langMatches(lang(?label), 'en')
              }
              filter(regex(?property, "property", "i" )) 
          }
          LIMIT 250
          """
      query_text = query.replace('ENTITY',entity)
      dbpedia_sparql.setQuery(query_text)
      result = []
      while (len(result) == 0):
        try:
            ret = dbpedia_sparql.queryAndConvert()
            for r in ret["results"]["bindings"]:
                if ('label' in r) and ('value' in r['label']):
                    value = r['label']['value']
                    id = r['property']['value']
                    if ('id' not in value.lower()) and ('link' not in value.lower()) and ('has abstract' not in value.lower()) and ('wiki' not in value.lower()) and ('instance of' not in value.lower()):
                        result.append({'id':id, 'value':value})
        except Exception as e:
            print("Error on SPARQL query:",e)
        break           
      dbpedia_cache[key] = result
      #print(len(result),"properties found")
      return result

def get_dbpedia_candidates(label,use_cache=True,verbose=False):
    if (use_cache) and (label in dbpedia_cache):
      #print("use of cache!")
      return dbpedia_cache[label].copy()
    candidates = []
    if (label==""):
        return candidates
    # type: One of the following values: form, form, item, lexeme, property, sense, sense
    #query_path = "https://www.wikidata.org/w/api.php?action=wbsearchentities&search=QUERY_TEXT&language=en&limit=10&type=item&format=json"
    query_path = "https://lookup.dbpedia.org/api/search?format=JSON&query=QUERY_TEXT&maxResults=10"
    url = query_path.replace("QUERY_TEXT",label)
    #print("->",url)
    r = requests.get(url)
    if (len(r.json()['docs']) == 0):
        if (verbose):
            print("Use of lemmatize literal:",lemmatize(label))
        r = requests.get(query_path.replace("QUERY_TEXT",lemmatize(label)))
        size = len(label.split(" "))
        index = 1  
        while(('search' in r.json()) and (len(r.json()['search']) == 0) and (index<size)):
            query_label = " ".join(label.split(" ")[index:])
            index += 1  
            if (verbose):
              print("search by Partial Label:",query_label)
            r = requests.get(query_path.replace("QUERY_TEXT",query_label)) 
    for answer in r.json()['docs']:
        description,label,id = "","",""
        properties = []
        if ('comment' in answer) and (len(answer['comment']) > 0):
          description = answer['comment'][0].replace("<B>","").replace("</B>","")
        if ('resource' in answer) and (len(answer['resource']) > 0):
          id = answer['resource'][0].split("http://dbpedia.org/resource/")[1]
          properties = get_dbpedia_properties(id,use_cache)
        if ('label' in answer) and (len(answer['label']) > 0):
          label = answer['label'][0].replace("<B>","").replace("</B>","")
        else:
          label = id        
        candidate = {
            'label': label,
            'id':id,
            'description' : description,
            'properties' : properties
        }
        candidates.append(candidate)
    dbpedia_cache[label]=candidates
    return candidates

def get_dbpedia_resource(context,entity,max=-1,use_cache=True):
    candidates = get_dbpedia_candidates(entity,use_cache)
    lema_entity = lemmatize(entity)
    if (entity != lema_entity):
        candidate_ids = [c['id'] for c in candidates]
        for ac in get_dbpedia_candidates(lema_entity,use_cache):
            if (ac['id'] not in candidate_ids):
                candidates.append(ac)
    resources = get_resources_by_candidates(context, entity, candidates,max)
    return resources    


Lemma: Turing award


In [12]:
for r in get_wikidata_resource("What position does carlos gomez play?","Carlos Gomez",2,use_cache=True):
    print(r['id'],r['label'],r['description'])


## Sorted Candidates by By Name :  [{'name': 'Carlos Gomez', 'id': 'Q89898891', 'description': 'researcher', 'score': 1.0}, {'name': 'Carlos A Gomez', 'id': 'Q91676432', 'description': 'researcher (ORCID 0000-0001-5486-5710)', 'score': 0.9}, {'name': 'Carlos Gómez', 'id': 'Q949506', 'description': 'American actor', 'score': 0.8}, {'name': 'Carlos Gómez', 'id': 'Q203210', 'description': 'Mexican footballer (1952-2017)', 'score': 0.8}, {'name': 'Carlos Gómez', 'id': 'Q51944192', 'description': 'researcher', 'score': 0.8}, {'name': 'Carlos Gómez', 'id': 'Q2747238', 'description': 'Dominican Republic baseball player', 'score': 0.8}, {'name': 'Carlos Gómez', 'id': 'Q3660067', 'description': 'Argentinian comics artist', 'score': 0.8}, {'name': 'Carlos Gómez', 'id': 'Q5750557', 'description': 'Chilean association football player born 1992', 'score': 0.8}, {'name': 'Carlos M Gómez', 'id': 'Q40124092', 'description': 'researcher', 'score': 0.7}]
## Sorted Candidates by By Properties :  [{'name'

In [629]:
for r in get_dbpedia_candidates("roger marquis",use_cache=False,verbose=False):
    print("[",r['id'],"]",r['label'],":",r['properties'])

[ Roger_Marquis,_2nd_Earl_of_Woolton ] Roger Marquis, 2nd Earl of Woolton : [{'id': 'http://dbpedia.org/property/after', 'value': 'after'}, {'id': 'http://dbpedia.org/property/before', 'value': 'before'}, {'id': 'http://dbpedia.org/property/coronet', 'value': 'coronet'}, {'id': 'http://dbpedia.org/property/creationDate', 'value': 'creation date'}, {'id': 'http://dbpedia.org/property/crest', 'value': 'crest'}, {'id': 'http://dbpedia.org/property/escutcheon', 'value': 'escutcheon'}, {'id': 'http://dbpedia.org/property/motto', 'value': 'motto'}, {'id': 'http://dbpedia.org/property/supporters', 'value': 'supporters'}, {'id': 'http://dbpedia.org/property/title', 'value': 'title'}, {'id': 'http://dbpedia.org/property/years', 'value': 'years'}, {'id': 'http://dbpedia.org/ontology/thumbnail', 'value': 'thumbnail'}, {'id': 'http://dbpedia.org/property/parents', 'value': 'parents'}, {'id': 'http://dbpedia.org/ontology/parent', 'value': 'parent'}]
[ Ça_Ira_(opera) ] Ça Ira (opera) : [{'id': 'http

In [None]:
for r in get_wikidata_candidates("sam edward",use_cache=True):
    print(r['id'],r['label'],r['description'])

In [633]:
for p in get_dbpedia_properties("Roger_Marquis",use_cache=False):
    print(p)

{'id': 'http://dbpedia.org/property/name', 'value': 'name'}
{'id': 'http://dbpedia.org/property/deathPlace', 'value': 'death place'}
{'id': 'http://dbpedia.org/property/birthPlace', 'value': 'birth place'}
{'id': 'http://dbpedia.org/property/bats', 'value': 'bats'}
{'id': 'http://dbpedia.org/property/birthDate', 'value': 'birth date'}
{'id': 'http://dbpedia.org/property/deathDate', 'value': 'death date'}
{'id': 'http://dbpedia.org/property/debutdate', 'value': 'debutdate'}
{'id': 'http://dbpedia.org/property/debutleague', 'value': 'debutleague'}
{'id': 'http://dbpedia.org/property/debutteam', 'value': 'debutteam'}
{'id': 'http://dbpedia.org/property/debutyear', 'value': 'debutyear'}
{'id': 'http://dbpedia.org/property/finaldate', 'value': 'finaldate'}
{'id': 'http://dbpedia.org/property/finalleague', 'value': 'finalleague'}
{'id': 'http://dbpedia.org/property/finalteam', 'value': 'finalteam'}
{'id': 'http://dbpedia.org/property/finalyear', 'value': 'finalyear'}
{'id': 'http://dbpedia.o

In [None]:
#wiki_cache['sam edward']=[]
#wiki_cache['sam edwards']=[]
print(lemmatize('sam edwards'))
for r in wiki_cache['sam edwards']:
    print(r['id'],r['label'],r['description'])

# Entity Summary

In [31]:
import hashlib

def verbalize(entity,property,value,verbose=False):
    tokens = ["The",property,"of",entity,"is",value]
    if (verbose):
        print("Tokens:",tokens)
    return " ".join(tokens)

def hash_text(text):
    hash_object = hashlib.md5(text.encode())
    md5_hash = hash_object.hexdigest()
    return str(md5_hash) 

##########################################################################################
# Wikidata Summarizer
##########################################################################################

def get_wikidata_property_value(filter,use_cache=True,verbose=False):
  key = hash_text(filter)
  if (use_cache) and (key in wiki_cache):
    #print("use of cache!")
    return wiki_cache[key].copy()
  query = """
      PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
      PREFIX bd: <http://www.bigdata.com/rdf#>
      PREFIX wd: <http://www.wikidata.org/entity/> 
      PREFIX wdt: <http://www.wikidata.org/prop/direct/>
      PREFIX wikibase: <http://wikiba.se/ontology#>
      SELECT distinct ?obj ?objLabel
      WHERE
      FILTER 
      LIMIT 250
      """
  query_text = query.replace('FILTER',filter)
  wiki_sparql.setQuery(query_text)
  result = []
  while (len(result) == 0):
    try:
        #if (verbose):
        #    print("SPARQL Query:",query_text)
        ret = wiki_sparql.queryAndConvert() 
        #if (verbose):
        #    print("SPARQL Response:",ret)
        for r in ret["results"]["bindings"]:
            id = r['obj']['value']
            value = id
            if ('objLabel' in r) and ('value' in r['objLabel']):
                    value = r['objLabel']['value']                
            if (' id ' not in value.lower()) and (' link ' not in value.lower()) and ('has abstract' not in value.lower()) and ('wiki' not in value.lower()) and ('instance of' not in value.lower()):
                    result.append({'id':id, 'value':value})
    except Exception as e:
        print("Error on wikidata property value query:",e,"->",query_text)
    break           
  wiki_cache[key] = result
  return result

def get_forward_wikidata_property_value(entity,property,use_cache=True,verbose=False): 
  #query_filter = """
  #            {
  #              wd:ENTITY p:PROPERTY ?data .
  #              ?data ps:PROPERTY ?obj .
  #              ?data pq:585 ?time .
  #              SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
  #            }
  #            ORDER BY DESC ( ?time )
  #            """
  query_filter = """
              {
                wd:ENTITY wdt:PROPERTY ?obj .
                SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
              }                            
              """
  return get_wikidata_property_value(query_filter.replace("ENTITY",entity).replace("PROPERTY",property),use_cache,verbose)

def get_backward_wikidata_property_value(entity,property,use_cache=True,verbose=False):
  query_filter = """
              {
                ?obj wdt:PROPERTY wd:ENTITY .
                SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
              }                            
              """
  return get_wikidata_property_value(query_filter.replace("ENTITY",entity).replace("PROPERTY",property),use_cache,verbose)

def get_wikidata_summary(candidate,context,max=5,use_cache=True,verbose=False):
    candidate_id = candidate['id']
    properties = candidate['properties']
    top_properties = get_top_candidates(context.lower().replace(candidate['label'].lower(),""),[ {'id':i, 'text':p['value'] } for i,p in enumerate(properties)],max)
    if (verbose):
        print("Top Properties:", top_properties)    
    sentences = []
    for p in top_properties:
        property = properties[p['id']]
        if (verbose):
            print(property)
        for v in get_forward_wikidata_property_value(candidate_id,property['id'],verbose):
            if (verbose):
                print("Forward Value:",v)
            sentences.append(verbalize(candidate['label'],property['value'],v['value'],verbose)+".")
        for v in get_backward_wikidata_property_value(candidate_id,property['id'],verbose):
            if (verbose):
                print("Backward Value:",v)
            sentences.append(verbalize(candidate['label'],property['value'],v['value'],verbose)+".")
    return " ".join(sentences)
    
##########################################################################################
# DBpedia Summarizer
##########################################################################################

def get_dbpedia_property_value(filter,use_cache=True,verbose=False):
  key = hash_text(filter)
  if (use_cache) and (key in dbpedia_cache):
    #print("use of cache!")
    return dbpedia_cache[key].copy()
  query = """
      PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
      PREFIX dbr: <http://dbpedia.org/resource/> 
      select distinct ?object ?label {
          { FILTER }

          optional { 
            ?object rdfs:label ?label .
            filter langMatches(lang(?label), 'en')
          }
      }
      LIMIT 250
      """
  query_text = query.replace('FILTER',filter) 
  dbpedia_sparql.setQuery(query_text)
  result = []
  while (len(result) == 0):
    try:
        if (verbose):
            print("SPARQL Query:",query_text)
        ret = dbpedia_sparql.queryAndConvert()
        if (verbose):
            print("SPARQL Response:",ret)
        for r in ret["results"]["bindings"]:
            id = r['object']['value']
            value = id
            if ('label' in r) and ('value' in r['label']):
                value = r['label']['value']            
            if (' id ' not in value.lower()) and (' link ' not in value.lower()) and ('has abstract' not in value.lower()) and ('wiki' not in value.lower()) and ('instance of' not in value.lower()):
                result.append({'id':id, 'value':value})
    except Exception as e:
        print("Error on SPARQL query:",e)
    break           
  dbpedia_cache[key] = result
  #print(len(result),"properties found")
  return result


def get_forward_dbpedia_property_value(entity,property,use_cache=True,verbose=False):
  query_filter ="<http://dbpedia.org/resource/ENTITY> <PROPERTY> ?object" 
  return get_dbpedia_property_value(query_filter.replace("ENTITY",entity).replace("PROPERTY",property),use_cache,verbose) 

def get_backward_dbpedia_property_value(entity,property,use_cache=True,verbose=False):
  query_filter ="?object <PROPERTY> <http://dbpedia.org/resource/ENTITY>"
  return get_dbpedia_property_value(query_filter.replace("ENTITY",entity).replace("PROPERTY",property),use_cache,verbose) 

def get_dbpedia_summary(candidate,context,max=5,use_cache=True,verbose=False):
    candidate_id = candidate['id']
    properties = candidate['properties']
    top_properties = get_top_candidates(context.lower().replace(candidate['label'].lower(),""),[ {'id':i, 'text':p['value'] } for i,p in enumerate(properties)],max)
    if (verbose):
        print("Top Properties:", top_properties)    
    sentences = []
    for p in top_properties:
        property = properties[p['id']]
        if (verbose):
            print(property)
        for v in get_forward_dbpedia_property_value(candidate_id,property['id'],use_cache,verbose):
            if (verbose):
                print("Forward Value:",v)
            sentences.append(verbalize(candidate['label'],property['value'],v['value'],verbose)+".")
        for v in get_backward_dbpedia_property_value(candidate_id,property['id'],use_cache,verbose):
            if (verbose):
                print("Backward Value:",v)
            sentences.append(verbalize(candidate['label'],property['value'],v['value'],verbose)+".")
    return " ".join(sentences)
    

In [27]:
sample_question = "What is the medication of schizophrenia?"
sample_entity = "schizophrenia"
top = 2


print("Summaries for question:",sample_question,"and entity:", sample_entity)
for index, candidate in enumerate(get_wikidata_resource(sample_question,sample_entity,top)):
    print(candidate['id'],"->",get_wikidata_summary(candidate,sample_question,use_cache=True,verbose=False))

Summaries for question: What is the medication of schizophrenia? and entity: schizophrenia
## Sorted Candidates by By Name :  [{'name': 'schizophrenia', 'id': 'Q41112', 'description': 'psychotic disorder characterized by emotional responsiveness and disintegration of thought process', 'score': 1.0}, {'name': 'Schizophrenia', 'id': 'Q55868331', 'description': 'scholarly article about schizophrenia', 'score': 1.0}, {'name': 'Schizophrenia', 'id': 'Q28267772', 'description': 'scientific article (publication date: 19 June 2004)', 'score': 1.0}, {'name': 'Schizophrenia', 'id': 'Q72240986', 'description': 'scientific article published on 01 March 1994', 'score': 1.0}, {'name': 'Schizophrenia', 'id': 'Q79226206', 'description': 'scientific article published on 01 October 2003', 'score': 1.0}, {'name': 'Schizophrenia', 'id': 'Q56959104', 'description': 'article', 'score': 1.0}, {'name': 'Schizophrenia', 'id': 'Q71574804', 'description': 'scientific article published on 01 January 1996', 'score

In [26]:
# sample code
    
sample_question = "What drug is used to treat schizophrenia?"
sample_entity = "schizophrenia"
top = 2


print("Summaries for question:",sample_question,"and entity:", sample_entity)
for index, candidate in enumerate(get_dbpedia_resource(sample_question,sample_entity,top)):
    print(candidate['id'],"->",get_dbpedia_summary(candidate,sample_question,use_cache=True,verbose=False))

Summaries for question: What drug is used to treat schizophrenia? and entity: schizophrenia
## Sorted Candidates by By Name :  [{'name': 'Schizophrenia', 'id': 'Schizophrenia', 'description': 'Schizophrenia is a mental illness characterized by relapsing episodes of psychosis. Major symptoms', 'score': 1.0}, {'name': 'Paranoid schizophrenia', 'id': 'Paranoid_schizophrenia', 'description': 'Paranoid schizophrenia was long diagnosed as the most common type of schizophrenia, but this sub', 'score': 0.8}, {'name': 'Schizophrenia (disambiguation)', 'id': 'Schizophrenia_(disambiguation)', 'description': '', 'score': 0.8}, {'name': 'Childhood schizophrenia', 'id': 'Childhood_schizophrenia', 'description': 'Childhood schizophrenia (also known as childhood-onset schizophrenia, and very early-onset', 'score': 0.7}, {'name': 'Disorganized schizophrenia', 'id': 'Disorganized_schizophrenia', 'description': 'Disorganized schizophrenia, or hebephrenia, is a subtype of schizophrenia, although', 'score'

In [None]:
print(get_forward_wikidata_property_value("Q439893","P413",use_cache=True,verbose=True))


In [None]:
print("DBpedia summary for question:",sample_question,"and entity:", sample_entity)
for index, candidate in enumerate(get_dbpedia_resource(sample_question,sample_entity,top)):
    print(candidate['id'],"->",get_dbpedia_summary(candidate,sample_question))

# Answer Extraction

In [32]:
from transformers import pipeline

#qa_language_model = "deepset/roberta-base-squad2-covid" #roberta-covid
qa_language_model = "deepset/roberta-base-squad2" #roberta

question_answerer = pipeline("question-answering", model=qa_language_model, tokenizer=qa_language_model)

In [33]:
import spacy
sent_nlp = spacy.load('en_core_web_sm')


def chunks(lst,n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def get_answers_in_context(question,text,verbose=False):
    doc = sent_nlp(text)
    answers = []
    for s in doc.sents:
      context = s.text
      #context = s  
      try:
        if (len(s) == 0):
            continue
        #print("num tokens:", len(text.split(" ")), "num_characters:", len(text))
        result = question_answerer(question=question, context=context, min_answer_len=1, max_answer_len=100)
        #print(f"Partial Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
        score = round(result['score'], 1)
        if (score == 0.0):
            continue
        if (verbose):
            print("[",score,"]",context)        
        answer = {}
        answer['value']=result['answer'].replace(","," ")
        answer['score']=score
        answer['summary']=context
        answer['start']=result['start']
        answer['end']=result['end']
        answers.append(answer)
      except Exception as e:
        print("Error extracting answer:",e)
    return answers


In [None]:
sample_question = "what is the place of birth of sam edwards?"
sample_summary = "The place of birth of Sam Edwards is Swansea. The date of birth of Sam Edwards is 1928-02-01T00:00:00Z. The family name of Sam Edwards is Edwards. The given name of Sam Edwards is Samuel."
for a in get_answers_in_context(sample_question,sample_summary):
    print(a)

In [None]:
sample_question = "what is the place of birth of sam edwards?"
sample_summary = "The place of birth of Sam Edwards is Macon. The date of birth of Sam Edwards is 1915-05-26T00:00:00Z. The place of death of Sam Edwards is Durango. The family name of Sam Edwards is Edwards. The given name of Sam Edwards is Sam."
print("Answer:",get_answer_in_context(sample_question,sample_summary))

# Response Creation

In [29]:
from statistics import mean

def get_answers(question,max=-1,wiki=True,dbpedia=True,verbose=False):
    if (verbose):
        print("Question:",question)
    # identify entities
    entities = get_entities(question)
    if (verbose):
        print("Entities:",entities)
    summaries = []
    for e in entities:
        if (wiki):
            for r in get_wikidata_resource(question,e):
                if (verbose):
                    print("- Wiki Resources:",{'id':r['id'],'label':r['label'],'description':r['description']})
                summaries.append(get_wikidata_summary(r,question))
        if (dbpedia):            
            for r in get_dbpedia_resource(question,e):
                if (verbose):
                    print("- DBpedia Resources:",{'id':r['id'],'label':r['label'],'description':r['description']})
                summaries.append(get_dbpedia_summary(r,question))
    answers_by_value = {}
    if (verbose):
        print(len(summaries),"summaries created")
    for s in summaries:
        for a in get_answers_in_context(question,s):
            if (a['score']>0.0):
                if (a['value'] not in answers_by_value):
                    answers_by_value[a['value']]=[]
                answers_by_value[a['value']].append(a)
                
    # merge duplicates and update scores
    answers = []
    for k in answers_by_value:
        partial_answers = answers_by_value[k]
        partial_answers.sort(key=lambda x: x.get('score'),reverse=True)
        best_answer = partial_answers[0]
        best_answer['relevance'] = len(partial_answers)
        best_answer['score']=mean([a['score'] for a in partial_answers])
        answers.append(best_answer)
    
    # sort by score
    answers.sort(key=lambda x: (x.get('score'),x.get('relevance')),reverse=True)
    if (max>0):
        return answers[:max]
    elif len(answers)>0:
        best_score = answers[0]['score']
        return [a for a in answers if a['score']==best_score]
    else:
        return answers

In [657]:
for i,a in enumerate(get_answers("what is the place of birth of sam edwards?",max=-1,wiki=True,dbpedia=True,verbose=True)):
    print(i,":",a)

Question: what is the place of birth of sam edwards?
Entities: ['sam edwards']
- Wiki Resources: {'id': 'Q472382', 'label': 'Sam Edwards', 'description': 'Welsh physicist (1928-2015)'}
- Wiki Resources: {'id': 'Q3470479', 'label': 'Sam Edwards', 'description': '1915-2004 American actor'}
- DBpedia Resources: {'id': 'Sam_Edwards', 'label': 'Sam Edwards', 'description': 'Sam Edwards (May 26, 1915 – July 28, 2004) was an American actor. His most famous role'}
- DBpedia Resources: {'id': 'Sam_Edwards_(physicist)', 'label': 'Sam Edwards (physicist)', 'description': 'Sir Samuel Frederick Edwards FLSW FRS (1 February 1928 – 7 May 2015), "universally known as \'Sam'}
4 summaries created
0 : {'value': 'Swansea', 'score': 1.0, 'summary': 'The place of birth of Sam Edwards is Swansea.', 'start': 37, 'end': 44, 'relevance': 2}


# Evaluation

## Metrics

In [43]:
import unidecode

def normalize(label):
  return unidecode.unidecode(label.strip()).lower()

def precision(tp,fp):
  if (fp+tp == 0):
    return 0.0
  return tp / (fp + tp)

def recall(tp,fn):
  if (fn+tp == 0):
    return 0.0
  return tp / (fn + tp)

def f1(tp,fp,fn):
  p = precision(tp,fp)
  r = recall(tp,fn)
  if (p+r == 0):
    return 0.0
  return 2 * ((p*r)/(p+r))

def average(values):
  return sum(values) / len(values) 

# lists of entity lists
def evaluate_labels(true_list,pred_list):
  tp, tn, fp, fn = 0, 0, 0, 0
  precision_list, recall_list, f1_list = [], [], []
  empty_values = 0
  for index in range(len(true_list)):
    # normalize entities
    valid_entities = [normalize(e) for e in true_list[index] if e != '']
    predicted_entities = [normalize(e) for e in pred_list[index]]
    ptp, ptn, pfp, pfn = 0, 0, 0, 0
    if (len(valid_entities)==0):
      empty_values += 1
    for entity in valid_entities:
      if (entity not in predicted_entities):
        pfn += 1
    for entity in predicted_entities:
      if (entity in valid_entities):
        ptp += 1
      else:
        pfp += 1    
    precision_list.append(precision(ptp,pfp))
    recall_list.append(recall(ptp,pfn))
    f1_list.append(f1(ptp,pfp,pfn))
    tp += ptp
    tn += ptn
    fp += pfp
    fn += pfn  
  return  {
      'total': index,
      'empty': empty_values,
      'tp': tp,
      'tn': tn, 
      'fp': fp,
      'fn':fn,
      'micro-precision': precision(tp,fp),
      'micro-recall': recall(tp,fn),
      'micro-f1': f1(tp,fp,fn),
      'macro-precision': average(precision_list),
      'macro-recall': average(recall_list),
      'macro-f1': average(f1_list)
  }

## Tests

In [58]:
from IPython.display import clear_output
import json
import pandas as pd
import collections

def json_file(name):
  return name+"-answers.json"

def csv_file(name):
  return name+"-answers.csv"

def evaluate_data(name,dataframe,max=-1):
  l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12 = [], [], [], [], [], [], [], [], [], [], [], []
  total = 0
  for index, row in dataframe.iterrows():
      if (max>0) and (total>max):
          l1.append("")
          l2.append("")
          l3.append("")
          l4.append("")
          l5.append("")
          l6.append("")
          l7.append("")
          l8.append("")
          l9.append("")
          l10.append("")
          l11.append("")
          l12.append("")
      else:
          question = row['question']
          print(index,":",question)
          answers = get_answers(question,-1,wiki=True,dbpedia=True)
          l1.append([a['value'] for a in answers])  
          answers = get_answers(question,3,wiki=True,dbpedia=True)  
          l2.append([a['value'] for a in answers[:1]])
          l3.append([a['value'] for a in answers[:2]])
          l4.append([a['value'] for a in answers[:3]])
          answers = get_answers(question,-1,wiki=True,dbpedia=False)
          l5.append([a['value'] for a in answers])           
          answers = get_answers(question,3,wiki=True,dbpedia=False)
          l6.append([a['value'] for a in answers[:1]])
          l7.append([a['value'] for a in answers[:2]])
          l8.append([a['value'] for a in answers[:3]])
          answers = get_answers(question,-1,wiki=False,dbpedia=True)
          l9.append([a['value'] for a in answers])           
          answers = get_answers(question,3,wiki=False,dbpedia=True)
          l10.append([a['value'] for a in answers[:1]])
          l11.append([a['value'] for a in answers[:2]])
          l12.append([a['value'] for a in answers[:3]])    
      total += 1
  dataframe['MuHeQA_Rank']=l1
  dataframe['MuHeQA_Top1']=l2
  dataframe['MuHeQA_Top2']=l3
  dataframe['MuHeQA_Top3']=l4
  dataframe['Wiki_Rank']=l5                     
  dataframe['Wiki_Top1']=l6
  dataframe['Wiki_Top2']=l7
  dataframe['Wiki_Top3']=l8
  dataframe['DBpedia_Rank']=l9                     
  dataframe['DBpedia_Top1']=l10
  dataframe['DBpedia_Top2']=l11
  dataframe['DBpedia_Top3']=l12
  clear_output(wait=True)  
  print(total,"questions analyzed!")
  dataframe.to_json(json_file(name), orient='split')
  dataframe.to_csv(csv_file(name))
  return dataframe

def make_report(name,additional=[],filter=None,max=-1):
  
  df = pd.read_json(json_file(name), orient='split')
  types = [] 
  if ('types' in df):
        types = df['type'].tolist()
  responses = df['responses'].tolist()  
  limit = len(responses)
  if (max>0):
    limit = max
  y_true =[]
  for i,r in enumerate(responses[:limit]):
    partial_response = []
    if (filter) and (types[i] != filter):
        continue
    for response in r:
        resp_value = ""
        if (response):
            resp_value = normalize(response)
        partial_response.append(resp_value)
    y_true.append(partial_response)
  results = []
  for col in df.columns:
    if (col == 'question') or (col == 'responses') or (col == 'type'):
      continue
    predictions = df[col].tolist()
    y_pred = []
    for i,p in enumerate(predictions[:limit]):
        if (filter) and (types[i] != filter):
            continue
        partial_prediction = []
        for prediction in p:
            partial_prediction.append(normalize(prediction))
        y_pred.append(partial_prediction)        
    result = evaluate_labels(y_true,y_pred)
    result['model']=col
    results.append(result)

  for row in additional:
    results.append(row)

  df_results = pd.DataFrame(results)
  return df_results

def visualize_results(name,max=10):
    df = pd.read_json(json_file(name), orient='split')
    return df.head(max)

def show_errors(name,column,max=-1):
    df = pd.read_json(json_file(name), orient='split')
    types = df['type'].tolist()
    questions = df['question'].tolist()
    responses = df['responses'].tolist()  
    limit = len(responses)
    if (max>0):
        limit = max
    y_true =[]
    for r in responses[:limit]:
        partial_response = []
        for response in r:
            partial_response.append(normalize(response))
        y_true.append(partial_response)
    predictions = df[column].tolist()
    y_pred = []
    for p in predictions[:limit]:
        partial_prediction = []
        for prediction in p:
            partial_prediction.append(normalize(prediction))
        y_pred.append(partial_prediction)
    for i,r in enumerate(y_true):
        p = y_pred[i]
        if collections.Counter(r) == collections.Counter(p):
            continue
        else:
            print("Question:",questions[i],"[",types[i],"]","Expected:",r,"Predicted:",p)
        
    

# Results

## SimpleQuestions Dataset

In [37]:
# Read wikipedia compatible SimpleQuestions Dataset
import pandas as pd
df = pd.read_csv('data/wsq-labels.csv', index_col=0)
# inverse predicates contain no entity. In all other cases, the entity corresponds to the subject.
#my_df = df[df.predicate.str.contains('P',case=False)]
#my_df.head()
types = []
responses = []
for index, row in df.iterrows():
  response = row['object_label'] 
  q_type = "F"
  if row['predicate'][0] == 'R':
    q_type = "B"
  responses.append([response])
  types.append(q_type)
df['type'] = types
df['responses'] = responses
sq_df = df.drop(['subject','predicate','object','subject_label','predicate_label','object_label'], axis=1)
sq_df.head(10)

Unnamed: 0,question,type,responses
0,Where did roger marquis die,F,[Holyoke]
1,what was the cause of death of yves klein,F,[myocardial infarction]
2,What position does carlos gomez play?,F,[center fielder]
3,how does engelbert zaschka identify,F,[male]
4,what position does pee wee reese play in baseball,F,[shortstop]
5,Which Swiss conductor's cause of death is myoc...,B,[Karl Anton Rickenbacher]
6,where was padraic mcguinness's place of death,F,[Sydney]
7,what is the place of birth of sam edwards?,F,[Swansea]
8,Which home is an example of italianate archite...,B,[John and Maria Adams House]
9,who published neo contra,F,[Konami]


In [39]:
# read dbpedia compatible SimpleQuestions
import pandas as pd
df = pd.read_csv('data/dsq-labels.csv', index_col=0)
types = []
responses = []
for index, row in df.iterrows():
  response = row['object_label']   
  responses.append([response])
df['responses'] = responses
dsq_df = df.drop(['subject','predicate','object','subject_label','property_label','object_label'], axis=1)
dsq_df.head(10)

Unnamed: 0,question,responses
0,Where did roger marquis die,"[Holyoke, Massachusetts]"
1,What position does carlos gomez play?,[Center fielder]
2,how does engelbert zaschka identify,[nan]
3,what position does pee wee reese play in baseball,[Shortstop]
4,Which Swiss conductor's cause of death is myoc...,[Moses J. Epstein]
5,where was padraic mcguinness's place of death,[Australia]
6,what is the place of birth of sam edwards?,[Swansea]
7,Which home is an example of italianate archite...,[280 Broadway]
8,who published neo contra,[Konami]
9,what is angie estes's profession,[Poet]


In [683]:
# Merge questions into a unique dataframe
counter = 0
valid_rows = []
for index, r in sq_df.iterrows():
    question = r['question']
    if (normalize(question) in dbpedia_questions):
        counter += 1
        valid_rows.append(r)
print("Valid Questions:",counter)
squ_df = pd.DataFrame(columns=sq_df.columns, data=valid_rows)
squ_df.head(10)

Valid Questions: 3667


Unnamed: 0,question,type,responses
0,Where did roger marquis die,F,[Holyoke]
2,What position does carlos gomez play?,F,[center fielder]
3,how does engelbert zaschka identify,F,[male]
4,what position does pee wee reese play in baseball,F,[shortstop]
5,Which Swiss conductor's cause of death is myoc...,B,[Karl Anton Rickenbacher]
6,where was padraic mcguinness's place of death,F,[Sydney]
7,what is the place of birth of sam edwards?,F,[Swansea]
8,Which home is an example of italianate archite...,B,[John and Maria Adams House]
9,who published neo contra,F,[Konami]
10,what is angie estes's profession,F,[poet]


In [40]:
dsq_df.describe()

Unnamed: 0,question,responses
count,3688,3688
unique,3656,1741
top,Name an actor.,[nan]
freq,12,1076


In [48]:
#wiki_cache = {}
#dbpedia_cache = {}
#entities_cache = {}
df = evaluate_data('dsq',dsq_df,100)
dsq_df.head(10)

3688 questions analyzed!


Unnamed: 0,question,responses,MuHeQA_Rank,MuHeQA_Top1,MuHeQA_Top2,MuHeQA_Top3,Wiki_Rank,Wiki_Top1,Wiki_Top2,Wiki_Top3,DBpedia_Rank,DBpedia_Top1,DBpedia_Top2,DBpedia_Top3
0,Where did roger marquis die,"[Holyoke, Massachusetts]","[Holyoke, Holyoke Massachusetts]",[Holyoke],"[Holyoke, Holyoke Massachusetts]","[Holyoke, Holyoke Massachusetts, 2004-07-19]",[Holyoke],[Holyoke],"[Holyoke, 2004-07-19T00:00:00]","[Holyoke, 2004-07-19T00:00:00, 1969]",[Holyoke Massachusetts],[Holyoke Massachusetts],"[Holyoke Massachusetts, 2004-07-19]","[Holyoke Massachusetts, 2004-07-19]"
1,What position does carlos gomez play?,[Center fielder],"[defender, Center fielder]",[defender],"[defender, Center fielder]","[defender, Center fielder, association football]",[defender],[defender],"[defender, association football]","[defender, association football, Atlético Poto...",[Center fielder],[Center fielder],"[Center fielder, New York Mets]","[Center fielder, New York Mets, MLB]"
2,how does engelbert zaschka identify,[nan],[German],[German],"[German, male]","[German, male, Engelbert]",[male],[male],"[male, Engelbert]","[male, Engelbert, Freiburg im Breisgau]",[German],[German],"[German, Orionette]","[German, Orionette, Unterschrift von Engelbert..."
3,what position does pee wee reese play in baseball,[Shortstop],"[shortstop, Shortstop]",[shortstop],"[shortstop, Shortstop]","[shortstop, Shortstop, Los Angeles Dodgers]",[shortstop],[shortstop],"[shortstop, Los Angeles Dodgers]","[shortstop, Los Angeles Dodgers, Major League ...",[Shortstop],[Shortstop],"[Shortstop, Right]","[Shortstop, Right, MLB]"
4,Which Swiss conductor's cause of death is myoc...,[Moses J. Epstein],"[Thure Widlund, Helge Bäcklund, Ángel José Ram...",[Thure Widlund],"[Thure Widlund, Helge Bäcklund]","[Thure Widlund, Helge Bäcklund, Ángel José Ram...","[Thure Widlund, Helge Bäcklund, Ángel José Ram...",[Thure Widlund],"[Thure Widlund, Helge Bäcklund]","[Thure Widlund, Helge Bäcklund, Ángel José Ram...",[],[],[],[]
5,where was padraic mcguinness's place of death,[Australia],[Sydney],[Sydney],"[Sydney, Sydney New South Wales Australia]","[Sydney, Sydney New South Wales Australia, 2...",[Sydney],[Sydney],"[Sydney, skin cancer]","[Sydney, skin cancer, 2008-01-26]",[Sydney New South Wales Australia],[Sydney New South Wales Australia],"[Sydney New South Wales Australia, 2008-01-26]","[Sydney New South Wales Australia, 2008-01-26]"
6,what is the place of birth of sam edwards?,[Swansea],"[Swansea, Macon]",[Swansea],"[Swansea, Macon]","[Swansea, Macon, Swansea Wales]","[Swansea, Macon]",[Swansea],"[Swansea, Macon]","[Swansea, Macon, 1928]",[Swansea Wales],[Swansea Wales],"[Swansea Wales, Macon Georgia]","[Swansea Wales, Macon Georgia, 1928-02-01]"
7,Which home is an example of italianate archite...,[280 Broadway],"[Brown Township Building, Chatsworth House, 35...",[Francis D. Alling House],"[Francis D. Alling House, George B. Cox House]","[Francis D. Alling House, George B. Cox House,...","[Arlington Hotel, East 78th Street Houses, Bro...",[Arlington Hotel],"[Arlington Hotel, East 78th Street Houses]","[Arlington Hotel, East 78th Street Houses, Bro...","[Baston Lodge, Piper-Price House, Townhill Par...",[Baston Lodge],"[Baston Lodge, Piper-Price House]","[Baston Lodge, Piper-Price House, Townhill Par..."
8,who published neo contra,[Konami],[Konami],[Konami],"[Konami, Jim Lee]","[Konami, Jim Lee, Sota Fujimori]",[Konami],[Konami],"[Konami, 2004-10-19T00:00:00Z]","[Konami, 2004-10-19T00:00:00Z]",[Konami],[Konami],"[Konami, Jim Lee]","[Konami, Jim Lee, Sota Fujimori]"
9,what is angie estes's profession,[Poet],"[writer, poet]",[writer],"[writer, poet]","[writer, poet, California Polytechnic State Un...","[writer, poet]",[writer],"[writer, poet]","[writer, poet, California Polytechnic State Un...","[The name of Angie Estes is Angie Estes., Univ...",[The name of Angie Estes is Angie Estes.],"[The name of Angie Estes is Angie Estes., Univ...","[The name of Angie Estes is Angie Estes., Univ..."


In [59]:
make_report('dsq',max=100)

Unnamed: 0,total,empty,tp,tn,fp,fn,micro-precision,micro-recall,micro-f1,macro-precision,macro-recall,macro-f1,model
0,99,23,25,0,882,54,0.027563,0.316456,0.05071,0.157884,0.23,0.173624,MuHeQA_Rank
1,99,23,17,0,80,60,0.175258,0.220779,0.195402,0.17,0.17,0.17,MuHeQA_Top1
2,99,23,26,0,164,53,0.136842,0.329114,0.193309,0.13,0.24,0.166667,MuHeQA_Top2
3,99,23,29,0,250,50,0.103943,0.367089,0.162011,0.101667,0.27,0.146,MuHeQA_Top3
4,99,23,18,0,498,59,0.034884,0.233766,0.060708,0.145387,0.18,0.151774,Wiki_Rank
5,99,23,16,0,77,61,0.172043,0.207792,0.188235,0.16,0.16,0.16,Wiki_Top1
6,99,23,21,0,157,56,0.117978,0.272727,0.164706,0.105,0.21,0.14,Wiki_Top2
7,99,23,24,0,228,53,0.095238,0.311688,0.145897,0.086667,0.24,0.126667,Wiki_Top3
8,99,23,17,0,588,60,0.028099,0.220779,0.049853,0.12005,0.17,0.13129,DBpedia_Rank
9,99,23,12,0,74,65,0.139535,0.155844,0.147239,0.12,0.12,0.12,DBpedia_Top1


In [697]:
#wiki_cache = {}
#dbpedia_cache = {}
#entities_cache = {}
df = evaluate_data('squ',squ_df,1000)
squ_df.head(10)

0 : Where did roger marquis die
2 : What position does carlos gomez play?
3 : how does engelbert zaschka identify 
4 : what position does pee wee reese play in baseball
5 : Which Swiss conductor's cause of death is myocardial infarction?
6 : where was padraic mcguinness's place of death
7 : what is the place of birth of sam edwards?
8 : Which home is an example of italianate architecture?
9 : who published neo contra
10 : what is angie estes's profession 
11 : what position does josé francisco torres play?
12 : what male actor was born in  warsaw
13 : who was also born in jakarta
14 : Who was born in prague
15 : where was guy pnini born
16 : what is an album recorded by scott grimes
17 : what was the country of origin of the tv show sidewalks entertainment
18 : who was the architect of the structure tour perret
19 : What is an album by guy clark?
20 : what's the name of an Australian rock and roll
21 : what is the category of the celestial object 1241 dysona
22 : What is a type of game

KeyboardInterrupt: 

In [692]:
make_report('squ',max=100)

Unnamed: 0,total,empty,tp,tn,fp,fn,micro-precision,micro-recall,micro-f1,macro-precision,macro-recall,macro-f1,model
0,99,0,44,0,680,59,0.060773,0.427184,0.106409,0.327833,0.41,0.352,MuHeQA_Rank
1,99,0,38,0,59,62,0.391753,0.38,0.385787,0.38,0.38,0.38,MuHeQA_Top1
2,99,0,52,0,138,50,0.273684,0.509804,0.356164,0.27,0.5,0.346667,MuHeQA_Top2
3,99,0,54,0,224,49,0.194245,0.524272,0.283465,0.201667,0.51,0.282333,MuHeQA_Top3
4,99,0,47,0,296,53,0.137026,0.47,0.21219,0.407566,0.47,0.422121,Wiki_Rank
5,99,0,42,0,51,58,0.451613,0.42,0.435233,0.42,0.42,0.42,Wiki_Top1
6,99,0,50,0,128,50,0.280899,0.5,0.359712,0.275,0.5,0.35,Wiki_Top2
7,99,0,53,0,199,47,0.210317,0.53,0.301136,0.226667,0.53,0.306667,Wiki_Top3
8,99,0,17,0,562,83,0.029361,0.17,0.050074,0.135,0.17,0.146667,DBpedia_Rank
9,99,0,13,0,73,87,0.151163,0.13,0.139785,0.13,0.13,0.13,DBpedia_Top1


In [693]:
make_report('squ',filter="F",max=100)

Unnamed: 0,total,empty,tp,tn,fp,fn,micro-precision,micro-recall,micro-f1,macro-precision,macro-recall,macro-f1,model
0,70,0,43,0,74,31,0.367521,0.581081,0.450262,0.447653,0.56338,0.48169,MuHeQA_Rank
1,70,0,37,0,31,34,0.544118,0.521127,0.532374,0.521127,0.521127,0.521127,MuHeQA_Top1
2,70,0,50,0,83,23,0.37594,0.684932,0.485437,0.366197,0.676056,0.469484,MuHeQA_Top2
3,70,0,52,0,142,22,0.268041,0.702703,0.38806,0.2723,0.690141,0.381221,MuHeQA_Top3
4,70,0,43,0,43,28,0.5,0.605634,0.547771,0.542723,0.605634,0.561033,Wiki_Rank
5,70,0,40,0,27,31,0.597015,0.56338,0.57971,0.56338,0.56338,0.56338,Wiki_Top1
6,70,0,47,0,80,24,0.370079,0.661972,0.474747,0.366197,0.661972,0.464789,Wiki_Top2
7,70,0,49,0,128,22,0.276836,0.690141,0.395161,0.298122,0.690141,0.401408,Wiki_Top3
8,70,0,15,0,77,56,0.163043,0.211268,0.184049,0.161972,0.211268,0.178404,DBpedia_Rank
9,70,0,11,0,47,60,0.189655,0.15493,0.170543,0.15493,0.15493,0.15493,DBpedia_Top1


In [694]:
make_report('squ',filter="B",max=100)

Unnamed: 0,total,empty,tp,tn,fp,fn,micro-precision,micro-recall,micro-f1,macro-precision,macro-recall,macro-f1,model
0,28,0,1,0,606,28,0.001647,0.034483,0.003145,0.034483,0.034483,0.034483,MuHeQA_Rank
1,28,0,1,0,28,28,0.034483,0.034483,0.034483,0.034483,0.034483,0.034483,MuHeQA_Top1
2,28,0,2,0,55,27,0.035088,0.068966,0.046512,0.034483,0.068966,0.045977,MuHeQA_Top2
3,28,0,2,0,82,27,0.02381,0.068966,0.035398,0.028736,0.068966,0.04023,MuHeQA_Top3
4,28,0,4,0,253,25,0.015564,0.137931,0.027972,0.076664,0.137931,0.082027,Wiki_Rank
5,28,0,2,0,24,27,0.076923,0.068966,0.072727,0.068966,0.068966,0.068966,Wiki_Top1
6,28,0,3,0,48,26,0.058824,0.103448,0.075,0.051724,0.103448,0.068966,Wiki_Top2
7,28,0,4,0,71,25,0.053333,0.137931,0.076923,0.051724,0.137931,0.074713,Wiki_Top3
8,28,0,2,0,485,27,0.004107,0.068966,0.007752,0.068966,0.068966,0.068966,DBpedia_Rank
9,28,0,2,0,26,27,0.071429,0.068966,0.070175,0.068966,0.068966,0.068966,DBpedia_Top1


In [None]:
show_errors('sq',"Wiki_Top1",max=20)

## Wikidata QA Dataset

In [None]:
import pandas as pd
df = pd.read_csv('wqa-labels.csv', index_col=0)
responses = []
for index, row in df.iterrows():
  responses.append(row['object_labels'].replace("[","").replace("]","").replace("\'","").split(","))
df['responses'] = responses
wqa_df = df.drop(['subjects','predicates','objects','subject_labels','predicate_labels','object_labels'], axis=1)
wqa_df.head()

In [665]:
evaluate_data('wqa',wqa_df)

0 : Who is the president of Poland?
1 : How many Turing awards have people from Austria won?
2 : Give me all countries that have won a FIFA World Cup
3 : What is the population of Chile?
4 : Who is the author of One Piece?
5 : Which musicians are from Sweden?
6 : How many movies has Tarantino directed?
7 : Is New York bigger than Beijing?
8 : what countries are part of the European Union?
9 : What computer scientists were born after 1970?
10 : Is Leonardo Dicaprio older than Joaquin Phoenix?
11 : In which country is the Museo del Prado located?
12 : When did Caesars Palace open?
13 : Which political party was Abraham Lincoln a member of?
14 : Which paintings feature a banana?
15 : Do any national flags use pink?
16 : When did Ian Curtis commit suicide?
17 : Where can I see paintings by El Greco?
18 : In which movies did Al Pacino and Robert de Niro appear together?
19 : Who is the youngest head of state in the world?
20 : Which mountains are on Mars?
21 : What alcoholic drinks are made

TypeError: unsupported operand type(s) for +: 'timeout' and 'str'

In [None]:
make_report('wqa')

## Free Tests

In [None]:
wiki_cache = {}
dbpedia_cache = {}
entities_cache = {}

In [None]:
sample_question = "what was the country of origin of the tv show sidewalks entertainment"
sample_entity = "tv show sidewalks entertainment"
for index, candidate in enumerate(get_wikidata_candidates(sample_question,sample_entity)):
    print(index,":",candidate['id'], candidate['label'])

In [None]:
sample_question = "what was the country of origin of the tv show sidewalks entertainment"
sample_entity = "tv show sidewalks entertainment"
for index, candidate in enumerate(get_wikidata_resource(sample_question,sample_entity)):
    print(index,":",candidate['id'], candidate['label'])
    summary = get_wikidata_summary(candidate,sample_question,3)
    print("Summary:",summary)
    answer = get_answer_in_context(sample_question,summary,verbose=True)
    print("Answer:",answer)

In [699]:
sample_question = "What drug is used to treat schizophrenia?"
sample_entity = "roger marquis"
print("DBpedia Resource for question:", sample_question, "and entity:",sample_entity)
for index, candidate in enumerate(get_dbpedia_resource(sample_question,sample_entity)):
    print(index,":",candidate['id'], candidate['label'])
    print("Summary:",get_dbpedia_summary(candidate,sample_question,3))

DBpedia Resource for question: Where did roger marquis die and entity: roger marquis
0 : Roger_Marquis Roger Marquis
Summary: The death place of Roger Marquis is Holyoke, Massachusetts. The death date of Roger Marquis is 2004-07-19. The name of Roger Marquis is Roger Marquis.


In [34]:
for i,a in enumerate(get_answers("What position does carlos gomez play?",max=-1,wiki=True,dbpedia=True,verbose=True)):
    print(i,":",a)

Question: What position does carlos gomez play?
Entities: ['carlos gomez']
## Sorted Candidates by By Name :  [{'name': 'Carlos Gomez', 'id': 'Q89898891', 'description': 'researcher', 'score': 0.9}, {'name': 'Carlos A Gomez', 'id': 'Q91676432', 'description': 'researcher (ORCID 0000-0001-5486-5710)', 'score': 0.9}, {'name': 'Carlos Gómez', 'id': 'Q949506', 'description': 'American actor', 'score': 0.7}, {'name': 'Carlos Gómez', 'id': 'Q203210', 'description': 'Mexican footballer (1952-2017)', 'score': 0.7}, {'name': 'Carlos Gómez', 'id': 'Q51944192', 'description': 'researcher', 'score': 0.7}, {'name': 'Carlos Gómez', 'id': 'Q2747238', 'description': 'Dominican Republic baseball player', 'score': 0.7}, {'name': 'Carlos Gómez', 'id': 'Q3660067', 'description': 'Argentinian comics artist', 'score': 0.7}, {'name': 'Carlos Gómez', 'id': 'Q5750557', 'description': 'Chilean association football player born 1992', 'score': 0.7}, {'name': 'Carlos M Gómez', 'id': 'Q40124092', 'description': 're

In [702]:
sample_question = "What drug is used to treat schizophrenia?"
for e in get_entities(sample_question):
    print("Entity:",e)
    print("Wikidata Resources:")
    for r in get_wikidata_resource(sample_question,e):
        print(r)
    print("DBpedia Resources:")
    for r in get_wikidata_resource(sample_question,e):
        print(r)

Entity: schizophrenia
Wikidata Resources:
{'label': 'schizophrenia', 'id': 'Q41112', 'description': 'psychotic disorder characterized by emotional responsiveness and disintegration of thought process', 'properties': [{'id': 'P101', 'value': 'field of work'}, {'id': 'P138', 'value': 'named after'}, {'id': 'P373', 'value': 'Commons category'}, {'id': 'P494', 'value': 'ICD-10'}, {'id': 'P557', 'value': 'DiseasesDB'}, {'id': 'P493', 'value': 'ICD-9'}, {'id': 'P672', 'value': 'MeSH tree code'}, {'id': 'P279', 'value': 'subclass of'}, {'id': 'P301', 'value': "category's main topic"}, {'id': 'P180', 'value': 'depicts'}, {'id': 'P780', 'value': 'symptoms and signs'}, {'id': 'P828', 'value': 'has cause'}, {'id': 'P924', 'value': 'possible treatment'}, {'id': 'P910', 'value': "topic's main category"}, {'id': 'P921', 'value': 'main subject'}, {'id': 'P989', 'value': 'spoken text audio'}, {'id': 'P1050', 'value': 'medical condition'}, {'id': 'P971', 'value': 'category combines topics'}, {'id': 'P1