# Wikidata Simple-Question with Answers

We consider only questions with a single answer (predicate = 'P').

In [None]:
import pandas as pd
# Load the question set; the first CSV column is used as the row index.
df = pd.read_csv('wikidata-sqa2.csv', index_col=0)
# Preview the first rows to check that the file was parsed as expected.
df.head()

### Named Entity Recognition based on Language Models, PoS tagging and Subwords

In [None]:
#!pip install --upgrade --user pip
#!pip install --user flair
from flair.data import Sentence
from flair.models import SequenceTagger

# Load the "flair/pos-english" sequence tagger once; get_pos_entities reads it
# through the global name `tagger`.
tagger = SequenceTagger.load("flair/pos-english")


In [None]:
def get_pos_entities(text,category):
    """Return the maximal runs of consecutive tokens tagged with `category`.

    The text is wrapped in a flair ``Sentence`` and annotated by the global
    ``tagger``. Adjacent tokens whose tag equals `category` are joined with a
    single space into one entry; any other token closes the current run.

    Parameters
    ----------
    text : str
        Text to analyse.
    category : str
        Tag value to collect, e.g. "NN" or "CD". The comparison is an exact
        string equality, so "NN" does not match "NNS".

    Returns
    -------
    list of str
        One string per run, in order of appearance; empty if nothing matches.
    """
    sentence = Sentence(text)

    # Run the tagger; it annotates the tokens of `sentence` in place.
    tagger.predict(sentence)

    entities = []
    run = []  # texts of the tokens in the run currently being collected
    for token in sentence.tokens:
        # Decide once per token, whatever the number of annotation layers it
        # carries. Looping over the layers and updating the run per layer
        # would split or duplicate tokens as soon as a second layer exists.
        layer_labels = (token.get_labels(layer) for layer in token.annotation_layers)
        matches = any(labels and labels[0].value == category for labels in layer_labels)
        if matches:
            run.append(token.text)
        elif run:
            entities.append(" ".join(run))
            run = []

    # Flush a run that extends to the end of the sentence.
    if run:
        entities.append(" ".join(run))
    return entities

# Quick check: collect the runs of tokens tagged "NN" in a sample question.
r = get_pos_entities("who's a kung fu star from hong kong","NN")
print(r)

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load the tokenizer and the token-classification model of the
# "dslim/bert-base-NER-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER-uncased")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER-uncased")

# Wrap both in a "ner" pipeline; get_entities calls it through the global `nlp`.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

In [None]:
def get_entities(text):
    """Extract entity mentions from `text`.

    The tokens returned by the global ``nlp`` pipeline are merged into
    mentions using their 'index', 'start', 'end' and 'word' fields:

    * a token that starts exactly where the previous one ended is glued on
      without a space (pieces that begin with '#' have every '#' removed);
    * a token whose 'index' directly follows the previous one is appended
      after a space;
    * any other token starts a new mention.

    If the pipeline yields no mention at all, the result of
    ``get_pos_entities(text, "CD")`` is returned when non-empty, otherwise the
    result of ``get_pos_entities(text, "NN")`` when non-empty.

    Returns a list of strings (possibly empty).
    """
    found = []
    pieces = []            # fragments of the mention being assembled
    last_index = -1        # 'index' of the previous token; -1 means "no token yet"
    expected_start = -1    # character offset at which a glued-on piece would start

    for tok in nlp(text):
        if last_index == -1:
            # first token: make it count as contiguous with itself
            last_index, expected_start = tok['index'], tok['start']

        piece = tok['word']
        if piece[0] == '#':
            piece = piece.replace("#", "")

        if tok['start'] == expected_start:
            pieces.append(piece)
        elif tok['index'] - last_index < 2:
            pieces.append(" " + piece)
        else:
            found.append("".join(pieces))
            pieces = [piece]

        last_index, expected_start = tok['index'], tok['end']

    mention = "".join(pieces)
    if mention:
        found.append(mention)

    if not found:
        # fall back to PoS tags: cardinal numbers first, then nouns
        for tag in ("CD", "NN"):
            fallback = get_pos_entities(text, tag)
            if fallback:
                return fallback

    return found

# Quick check on a question with a hyphenated, multi-token person name.
r = get_entities("which city did carl-alfred schumacher die")
print(r)

### Wikidata Entity Linking based on MediaWiki API

The MediaWiki Action API is a web service that allows access to some wiki-features like authentication, page operations, and search. It can provide meta information about the wiki and the logged-in user.

action=wbsearchentities

Searches for entities using labels and aliases.

Returns a label and description for the entity in the user language if possible. Returns details of the matched term. The matched term text is also present in the aliases key if different from the display label.

In [None]:
import requests
def get_wikidata_candidates(label, language="en", limit=10, timeout=10):
    """Search Wikidata items whose label or alias matches `label`.

    Calls the MediaWiki ``wbsearchentities`` action on www.wikidata.org and
    returns the hits in the order the API returns them.

    Parameters
    ----------
    label : str
        Text to search for. Empty or whitespace-only input returns ``[]``
        without contacting the API.
    language : str, optional
        Value of the API ``language`` parameter (default "en").
    limit : int, optional
        Maximum number of hits requested (default 10).
    timeout : float, optional
        Seconds to wait for the HTTP response (default 10).

    Returns
    -------
    list of dict
        One ``{'label': ..., 'id': ...}`` dictionary per hit.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    KeyError
        If the JSON payload has no 'search' entry (e.g. an API error object).
    """
    candidates = []
    if not label or not label.strip():
        return candidates

    # Pass the query as `params` so that requests URL-encodes it; splicing the
    # raw label into the URL breaks on characters such as '&', '#' or '+'.
    # type: one of form, item, lexeme, property, sense
    params = {
        "action": "wbsearchentities",
        "search": label,
        "language": language,
        "limit": limit,
        "type": "item",
        "format": "json",
    }
    r = requests.get("https://www.wikidata.org/w/api.php", params=params, timeout=timeout)
    r.raise_for_status()

    for answer in r.json()['search']:
        # The display label is only present "if possible"; fall back to the
        # top-level label, then to an empty string, instead of raising.
        display_label = answer.get('display', {}).get('label', {}).get('value')
        if display_label is None:
            display_label = answer.get('label', "")
        candidate = {
            'label': display_label,
            'id': answer['id']
        }
        candidates.append(candidate)
    return candidates

## Identification of entities

In [None]:
# Run entity detection on every question and keep one result per row:
# "" when nothing was found, the mention itself when exactly one was found,
# and the whole list of mentions otherwise.
entities = []
for index, row in df.iterrows():
    question = row['question']
    print(index,":",question)
    q_entities = get_entities(question)
    print("\t entities:",q_entities)
    if not q_entities:
        print("No entities found!")
        entities.append("")
    elif len(q_entities) == 1:
        entities.append(q_entities[0])
    else:
        print("More than one entity found!")
        entities.append(q_entities)

In [None]:
# Store the per-question result of the identification loop as a new column.
# Entries are "" (nothing found), a str (one mention) or a list (several mentions).
df['entity']=entities
df.head()

## Wikidata linking

In [None]:
# Link every question to Wikidata: detect its entity mentions, then collect the
# ids of all search candidates returned for each mention.
entities = []        # per row: "" (no mention), str (one mention) or list (several)
wikidata_items = []  # per row: list of candidate ids (empty when nothing was found)
for index,row in df.iterrows():
    question = row['question']
    print(index,":",question)
    q_entities = get_entities(question)
    print("\t entities:",q_entities)
    if (len(q_entities)<1):
        print("No entities found!")
        entities.append("")
        # Nothing is appended to wikidata_items here: the (empty) candidate
        # list appended once per row further down already covers this case and
        # keeps both lists aligned with the rows of df.
    elif (len(q_entities)>1):
        print("More than one entity found!")
        entities.append(q_entities)
    else:
        entities.append(q_entities[0])
    q_wiki_entities = []
    for entity in q_entities:
        for item in get_wikidata_candidates(entity):
            q_wiki_entities.append(item['id'])
    print("\t wiki:",q_wiki_entities)
    wikidata_items.append(q_wiki_entities)
    print("\t reference:",row['subject'])



In [None]:
# Attach the detected mentions and the Wikidata candidate ids to the frame and
# write it to disk. Both lists need exactly one entry per row of df, otherwise
# the column assignment fails.
df['entity']=entities
df['wikidata']=wikidata_items
df.to_csv('wikidata-sqa-ew.csv')

In [None]:
from sklearn.metrics import confusion_matrix

# confusion_matrix needs exactly one hashable label per row, but df['wikidata']
# holds a list of candidate ids per question. Reduce each list to its first
# candidate ("" when there is none).
# NOTE(review): assumes df['subject'] holds Wikidata ids comparable to the
# candidate ids, and that the first returned candidate is the best one - confirm.
y_true = df['subject'].astype(str)
y_pred = df['wikidata'].apply(
    lambda ids: ids[0] if isinstance(ids, (list, tuple)) and len(ids) > 0 else ""
)
# `labels` must list the class values that occur in y_true / y_pred (ids), not
# the surface strings stored in df['entity']. The matrix has one row and one
# column per distinct id, so it grows quadratically with the number of ids.
labels = sorted(set(y_true) | set(y_pred))
confusion_matrix(y_true, y_pred, labels=labels)


In [None]:
# Spot check on a question whose subject is a numeric designation.
e = get_entities("what type of celestial object is (101180) 1998 sh9")
print(e)