# Wikidata Simple-Question with Answers

We consider only questions with a single answer, i.e. those whose predicate is a Wikidata property identifier beginning with 'P' (for example `P20`).

In [5]:
import pandas as pd
# Load the question/answer table; the first CSV column is used as the row index.
df = pd.read_csv('wikidata-sqa2.csv', index_col=0)
# Preview the first rows.
df.head()

Unnamed: 0,subject,predicate,object,question,Answer
0,Q7358590,P20,Q1637790,Where did roger marquis die,Holyoke
1,Q154335,P509,Q12152,what was the cause of death of yves klein,myocardial infarction
2,Q2747238,P413,Q5059480,What position does carlos gomez play?,center fielder
3,Q62498,P21,Q6581097,how does engelbert zaschka identify,male
4,Q182485,P413,Q1143358,what position does pee wee reese play in baseball,shortstop


### Named Entity Recognition based on Language Models, PoS tagging and Subwords

In [6]:
#!pip install --upgrade --user pip
#!pip install --user flair
from flair.data import Sentence
from flair.models import SequenceTagger

# Load flair's pre-trained English part-of-speech tagger; get_pos_entities() reads this
# module-level `tagger`.
tagger = SequenceTagger.load("flair/pos-english")


2022-04-12 11:50:33,823 loading file /home/jupyter-cbadenes/.flair/models/pos-english/a9a73f6cd878edce8a0fa518db76f441f1cc49c2525b2b4557af278ec2f0659e.121306ea62993d04cd1978398b68396931a39eb47754c8a06a87f325ea70ac63
2022-04-12 11:50:34,397 SequenceTagger predicts: Dictionary with 53 tags: <unk>, O, UH, ,, VBD, PRP, VB, PRP$, NN, RB, ., DT, JJ, VBP, VBG, IN, CD, NNS, NNP, WRB, VBZ, WDT, CC, TO, MD, VBN, WP, :, RP, EX, JJR, FW, XX, HYPH, POS, RBR, JJS, PDT, NNPS, RBS, AFX, WP$, -LRB-, -RRB-, ``, '', LS, $, SYM, ADD


In [7]:
def get_pos_entities(text,category):
    """Return runs of consecutive tokens whose tag equals `category`.

    The text is wrapped in a flair ``Sentence`` and tagged with the
    module-level ``tagger``. Adjacent tokens carrying exactly the requested
    tag are joined with single spaces into one chunk; any other tag closes
    the chunk in progress.

    Args:
        text: the sentence to analyse.
        category: the tag to collect (compared with ``==``, e.g. "NN" or "CD").

    Returns:
        A list of chunk strings in order of appearance (possibly empty).
    """
    sentence = Sentence(text)
    tagger.predict(sentence)

    chunks = []
    pending = ""
    for token in sentence.tokens:
        # Every annotation layer present on the token is inspected.
        for layer in token.annotation_layers:
            word = token.text
            tag = token.get_labels(layer)[0].value
            if tag != category:
                # A non-matching tag ends the chunk being built, if any.
                if len(pending) > 0:
                    chunks.append(pending)
                    pending = ""
                continue
            pending = word if pending == "" else pending + " " + word

    # Flush a chunk that runs up to the end of the sentence.
    if len(pending) > 0:
        chunks.append(pending)
    return chunks

# Quick check: collect the chunks tagged "NN" in a sample question.
r = get_pos_entities("who's a kung fu star from hong kong","NN")
print(r)

['kung fu star']


In [8]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load the tokenizer and token-classification weights of the uncased BERT NER checkpoint.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER-uncased")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER-uncased")

# `nlp(text)` is the NER pipeline used by get_entities(), which reads the
# 'word', 'index', 'start' and 'end' fields of each item it yields.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

In [9]:
def get_entities(text):
    """Extract entity mentions from `text`.

    The items yielded by the module-level ``nlp`` pipeline are merged into
    mentions using only their position:

    * an item starting exactly where the previous one ended is glued on
      without a space (word pieces, hyphenated names);
    * an item whose ``index`` is less than 2 above the previous one is
      appended after a space;
    * anything else closes the current mention and starts a new one.

    The predicted entity type is not consulted. When the pipeline yields
    nothing, the function falls back to ``get_pos_entities`` with the "CD"
    tag and then the "NN" tag, returning the first non-empty result.

    Args:
        text: the question to analyse.

    Returns:
        A list of mention strings (possibly empty).
    """
    mentions = []
    current = ""
    last_index, last_end = -1, -1
    for tok in nlp(text):
        if last_index == -1:
            # First item: align the offset so it lands in the "glue" branch.
            last_index, last_end = tok['index'], tok['start']

        piece = tok['word']
        if piece[0] == '#':
            # Drop the word-piece continuation marker.
            piece = piece.replace("#","")

        if tok['start'] == last_end:
            current += piece
        elif tok['index'] - last_index < 2:
            current += " " + piece
        else:
            mentions.append(current)
            current = piece

        last_index, last_end = tok['index'], tok['end']

    if len(current) > 0:
        mentions.append(current)

    if len(mentions) == 0:
        # Nothing from the NER model: try cardinal numbers first, then nouns.
        for pos_tag in ("CD", "NN"):
            fallback = get_pos_entities(text,pos_tag)
            if len(fallback) > 0:
                return fallback

    return mentions

# Quick check on a question containing a hyphenated person name.
r = get_entities("which city did carl-alfred schumacher die")
print(r)

['carl-alfred schumacher']


## Identification of entities

In [None]:
# Run entity extraction over every question, logging each result.
# `entities` ends up with one item per row: a single string when exactly one
# mention was found, the full list when several were found, "" when none.
entities = []
for index, row in df.iterrows():
    question = row['question']
    print(index,":",question)
    q_entities = get_entities(question)
    print("\t entities:",q_entities)
    if len(q_entities) == 1:
        # Unambiguous case: keep the bare mention.
        entities.append(q_entities[0])
    elif len(q_entities) > 1:
        print("More than one entity found!")
        entities.append(q_entities)
    else:
        print("No entities found!")
        entities.append("")

0 : Where did roger marquis die
	 entities: ['roger marquis']
1 : what was the cause of death of yves klein
	 entities: ['yves klein']
2 : What position does carlos gomez play?
	 entities: ['carlos gomez']
3 : how does engelbert zaschka identify 
	 entities: ['engelbert zaschka']
4 : what position does pee wee reese play in baseball
	 entities: ['pee wee reese']
5 : Which Swiss conductor's cause of death is myocardial infarction?
	 entities: ['swiss']
6 : where was padraic mcguinness's place of death
	 entities: ['padraic mcguinness']
7 : what is the place of birth of sam edwards?
	 entities: ['sam edwards']
8 : Which home is an example of italianate architecture?
	 entities: ['italianate']
9 : who published neo contra
	 entities: ['neo contra']
10 : what is angie estes's profession 
	 entities: ['angie estes']
11 : what position does josé francisco torres play?
	 entities: ['jose francisco torres']
12 : what male actor was born in  warsaw
	 entities: ['warsaw']
13 : who was also born 

In [None]:
# Attach the extracted mentions to the table (one item per row; an item may be a
# string or a list of strings) and persist the result.
df['entity']=entities
df.to_csv('wikidata-sqa-e.csv')
df.head()

### Wikidata Entity Linking based on MediaWiki API

The MediaWiki Action API is a web service that allows access to some wiki-features like authentication, page operations, and search. It can provide meta information about the wiki and the logged-in user.

action=wbsearchentities

Searches for entities using labels and aliases.

Returns a label and description for the entity in the user language if possible. Returns details of the matched term. The matched term text is also present in the aliases key if different from the display label.

In [None]:
import requests
def get_wikidata_candidates(label):
    """Look up Wikidata items matching `label` via ``wbsearchentities``.

    The search term is passed through ``params`` so that it is URL-encoded
    by ``requests``; splicing it into the URL by hand breaks the query for
    labels containing characters such as ``&``, ``#`` or ``+``.

    Args:
        label: surface form to search for. An empty string short-circuits
            to an empty result without touching the network.

    Returns:
        A list of at most 10 dicts ``{'label': <display label>, 'id': <QID>}``
        in the order returned by the API.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
    """
    candidates = []
    if (label==""):
        return candidates

    response = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbsearchentities",
            "search": label,
            "language": "en",
            "limit": 10,
            # Restrict the search to items (not properties, lexemes, ...).
            "type": "item",
            "format": "json",
        },
        # Avoid hanging forever on a stalled connection.
        timeout=30,
    )
    response.raise_for_status()

    # A response without a 'search' key yields no candidates instead of a KeyError.
    for answer in response.json().get('search', []):
        candidates.append({
            'label': answer['display']['label']['value'],
            'id': answer['id'],
        })
    return candidates

In [None]:
# Extract the mentions of every question and link them to Wikidata candidates.
# Both lists receive exactly one item per row so they can be assigned as
# DataFrame columns afterwards.
entities = []
wikidata_items = []
for index,row in df.iterrows():
    question = row['question']
    print(index,":",question)
    q_entities = get_entities(question)
    print("\t entities:",q_entities)
    if (len(q_entities)<1):
        print("No entities found!")
        entities.append("")
        # No extra append to the candidate list here: the unconditional append
        # below already records an empty list for this row. The original line
        # referenced an undefined name (`wikidata_item`) and would otherwise
        # have added two items for the same row.
    elif (len(q_entities)>1):
        print("More than one entity found!")
        entities.append(q_entities)
    else:
        entities.append(q_entities[0])
    # Pool the candidate ids of all mentions found in this question.
    q_wiki_entities = []
    for entity in q_entities:
        for item in get_wikidata_candidates(entity):
            q_wiki_entities.append(item['id'])
    print("\t wiki:",q_wiki_entities)
    wikidata_items.append(q_wiki_entities)
    print("\t reference:",row['subject'])


In [None]:
# Store the per-question list of candidate Wikidata ids and persist the table.
# `wikidata_items` must hold exactly one item per row of `df` for this assignment.
df['wikidata']=wikidata_items
df.to_csv('wikidata-sqa-ew.csv')
df.head()

In [None]:
from sklearn.metrics import confusion_matrix
# Compare the gold subject ids against the linked Wikidata candidates.
# NOTE(review): each df['wikidata'] cell is a *list* of candidate ids (built by the
# linking loop), while confusion_matrix is normally given one label per sample; this
# call probably fails as written. Confirm, and reduce each list to a single
# prediction first.
# NOTE(review): `labels` is df['entity'] (mention strings taken from the questions),
# whereas y_true holds ids such as Q7358590; presumably not the intended label set.
# Verify.
y_true = df['subject']
y_pred = df['wikidata']
confusion_matrix(y_true, y_pred, labels=df['entity'])


### DBpedia Entity Linking based on DBpedia Lookup service

The DBpedia Lookup is an entity retrieval service for Linked Data. It provides a straightforward solution for the frequent use case of resolving keywords and natural language to related resource identifiers in the DBpedia knowledge graph. Related means that either the label or abstract of a resource matches, or an anchor text that was frequently used in Wikipedia to refer to a specific resource matches (e.g. the resource http://dbpedia.org/resource/United_States can be looked up by the string “USA”). 

So whether you need an auto-complete service for your RDF application, Linked Data enhancements for your CSV tables or simply a way to retrieve specific DBpedia identifiers – the DBpedia Lookup is for you!

As a part of the DBpedia Technology Stack the DBpedia Lookup can be deployed conveniently via Docker and works well with DBpedia Databus Collections. The DBpedia Lookup uses an Apache Lucene Index for resource indexing and retrieval and provides a web interface for querying.

In [4]:
import requests
def get_dbpedia_candidates(label):
    """Look up DBpedia resources matching `label` via the DBpedia Lookup service.

    The search term is passed through ``params`` so that it is URL-encoded
    by ``requests``; splicing it into the URL by hand breaks the query for
    labels containing characters such as ``&``, ``#`` or ``+``.

    Args:
        label: surface form to search for. An empty string short-circuits
            to an empty result without touching the network.

    Returns:
        A list of at most 10 dicts ``{'label': ..., 'id': <resource URI>}``
        in the order returned by the service. Labels are returned verbatim
        and may contain ``<B>...</B>`` match highlighting.

    Raises:
        requests.HTTPError: if the service answers with a 4xx/5xx status.
    """
    candidates = []
    if (label==""):
        return candidates

    response = requests.get(
        "http://lookup.dbpedia.org/api/search",
        params={
            "query": label,
            "maxResults": 10,
            # NOTE(review): kept from the original query string; confirm that the
            # Lookup service actually recognises a `type` parameter.
            "type": "class",
            "format": "json",
        },
        # Avoid hanging forever on a stalled connection.
        timeout=30,
    )
    response.raise_for_status()

    # A response without a 'docs' key yields no candidates instead of a KeyError.
    for answer in response.json().get('docs', []):
        candidates.append({
            # Each field is a list; the first value is used.
            'label': answer['label'][0],
            'id': answer['resource'][0],
        })
    return candidates

# Quick check with an accented query; in the recorded output the returned labels
# carry <B>...</B> match highlighting.
r = get_dbpedia_candidates("Berlín")
print(r)

[{'label': '<B>Berlin</B>', 'id': 'http://dbpedia.org/resource/Berlin'}, {'label': '<B>Bergen</B>', 'id': 'http://dbpedia.org/resource/Bergen'}, {'label': 'Tennis Borussia <B>Berlin</B>', 'id': 'http://dbpedia.org/resource/Tennis_Borussia_Berlin'}, {'label': '1. FC Union <B>Berlin</B>', 'id': 'http://dbpedia.org/resource/1._FC_Union_Berlin'}, {'label': 'West <B>Berlin</B>', 'id': 'http://dbpedia.org/resource/West_Berlin'}, {'label': 'East <B>Berlin</B>', 'id': 'http://dbpedia.org/resource/East_Berlin'}, {'label': '<B>Berlín</B>, Usulután', 'id': 'http://dbpedia.org/resource/Berlín,_Usulután'}, {'label': 'Humboldt University of <B>Berlin</B>', 'id': 'http://dbpedia.org/resource/Humboldt_University_of_Berlin'}, {'label': 'Hertha BSC', 'id': 'http://dbpedia.org/resource/Hertha_BSC'}, {'label': 'Carlos <B>Berlín</B> Montero', 'id': 'http://dbpedia.org/resource/Carlos_Berlín_Montero'}]
