In [1]:
# Switch to the project source directory before importing the local `setup`
# module (presumably so that `setup` resolves to the project's module rather
# than anything else on the path -- TODO confirm).
import os
os.chdir('/mpqa/KBQA/src')
from setup import IndexSearch, Mongo_Connector

# Index handles used later to map URIs to IDs: `e_index` is queried for
# entity URIs and `p_index` for predicate URIs (see annotate_hop_ids).
# NOTE(review): the index names look like DBpedia 2016-04 entity/predicate
# indexes -- confirm against the IndexSearch configuration.
e_index = IndexSearch('dbpedia201604e')
p_index = IndexSearch('dbpedia201604p')

import json
from collections import Counter

### LC-QUAD

In [2]:
# Connection used by every following cell.
# NOTE(review): arguments are presumably (database, collection) -- confirm in
# setup.Mongo_Connector.
mongo = Mongo_Connector('kbqa', 'lcquad')
# Relative file names below (and in later cells) are resolved against the
# dataset directory from here on.
os.chdir("/mpqa/KBQA/data/lcquad")
lcquad_path = "lcquad_answers.json"
# Import the JSON file into the collection, then report the collection size
# (the recorded output shows "Inserting 4998 new docs" / "4998 docs").
mongo.load_json(lcquad_path)
mongo.count_all_docs()
# Fetch a single document and leave it as the cell's last expression so the
# notebook displays its structure.
doc = mongo.get_sample(limit=1).next()
doc

Inserting 4998 new docs
4998 docs


{'_id': ObjectId('6113d58560089a940b8a2516'),
 'sparql_id': '301',
 'question': 'Which comic characters are painted by Bill Finger?',
 'SerialNumber': '1',
 'sparql_query': 'SELECT DISTINCT ?uri WHERE {?uri <http://dbpedia.org/ontology/creator> <http://dbpedia.org/resource/Bill_Finger>  . ?uri <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/ComicsCharacter>}',
 'checked': 'true',
 'entity mapping': [{'label': 'Bill Finger',
   'matchedBy': 'spotlight',
   'uri': 'http://dbpedia.org/resource/Bill_Finger',
   'seq': '38,49'}],
 'id': 'f0a9f1ca14764095ae089b152e0e7f12',
 'predicate mapping': [{'label': 'painted by',
   'uri': 'http://dbpedia.org/ontology/creator',
   'seq': '27,37',
   'mappedBy': 'manual corrections'},
  {'label': 'comic characters',
   'uri': 'http://dbpedia.org/ontology/ComicsCharacter',
   'seq': '6,22',
   'mappedBy': 'manual corrections'}],
 'answers': ['http://dbpedia.org/resource/Batman',
  'http://dbpedia.org/resource/Alfred_Pennywo

In [3]:
# Reproduce the original LC-QuAD train/test split: every document whose
# 'SerialNumber' appears in the published train file is flagged train=True,
# all others train=False, and the flag is written back to MongoDB.
limit = None  # None -> annotate every document in the collection
lcquad_train_path = 'train-data.json'  # wget https://raw.githubusercontent.com/AskNowQA/LC-QuAD/data/train-data.json

X_train = []

with open(lcquad_train_path, "r") as json_file:
    docs = json.load(json_file)
    for doc in docs:
        # NOTE(review): the +1 presumably converts the train file's '_id'
        # into the collection's 'SerialNumber' numbering -- confirm.
        X_train.append(str(int(doc['_id']) + 1))
    print("%d docs loaded"%len(docs))

print("%d train samples" % (len(X_train)))
# A set gives constant-time membership tests; X_train itself stays a list so
# the count printed above is unchanged.
train_ids = set(X_train)
# annotate
samples = mongo.get_sample(limit=limit)
train_count = 0
total_count = 0
for doc in samples:
    total_count += 1
    if doc['SerialNumber'] in train_ids:
        doc['train'] = True
        train_count += 1
    else:
        doc['train'] = False
    mongo.col.update_one({'_id': doc['_id']}, {"$set": doc}, upsert=True)
# Count the documents actually visited instead of calling Cursor.count(),
# which is gone from current PyMongo releases and ignores `limit`.
print("Train: {}, Test: {}".format(train_count, total_count - train_count))
# Spot-check the flag on the element at index 100 of a fresh query result.
sample = mongo.get_sample(limit=1)[100]
print(sample['train'])

4000 docs loaded
4000 train samples
Train: 3999, Test: 999
True




In [4]:
# Annotate gold-standard (GS) entities and predicates per hop.
# The triple patterns of each document's SPARQL query are split into
# subject/predicate/object terms, grouped by hop, and the result is written
# back to the MongoDB collection.
verbose = False
limit = None

samples = mongo.get_sample(limit=limit)
count = 0
for doc in samples:
    sparql_query = doc['sparql_query']
    # the graph pattern is the text between the first '{' and the first '}'
    pattern_start = sparql_query.find("{") + 1
    pattern_end = sparql_query.find("}")
    tripples = sparql_query[pattern_start:pattern_end].split('. ')

    # URIs are collected separately for the two hops
    correct_intermediate_predicates, correct_intermediate_entities = [], []
    correct_question_predicates, correct_question_entities = [], []

    for tripple in tripples:
        if not tripple:
            continue
        s, p, o = tripple.strip().split()
        # terms starting with '?' are variables; anything else is a URI whose
        # first and last characters (the enclosing brackets) are dropped
        entities = [term[1:-1] for term in (s, o) if term[0] != '?']
        p = p[1:-1]

        # triples that mention ?uri belong to the question hop,
        # all remaining triples to the intermediate hop
        if '?uri' in tripple:
            correct_question_predicates.append(p)
            correct_question_entities.extend(entities)
        else:
            correct_intermediate_predicates.append(p)
            correct_intermediate_entities.extend(entities)

    if verbose:
        print('\n')
        print(sparql_query)
        print(correct_intermediate_entities, correct_intermediate_predicates)
        print(correct_question_entities, correct_question_predicates)

    # store the per-hop annotations on the document
    if correct_intermediate_predicates:
        # 2 hops: intermediate triples come first, ?uri triples second
        doc['1hop'] = [correct_intermediate_entities, correct_intermediate_predicates]
        doc['2hop'] = [correct_question_entities, correct_question_predicates]
    else:
        # 1 hop: only ?uri triples are present
        doc['1hop'] = [correct_question_entities, correct_question_predicates]
        doc['2hop'] = [[], []]

    # de-duplicated URIs over both hops (predicate URIs are kept for subgraph extraction)
    doc['entity_uris'] = list(set(correct_question_entities+correct_intermediate_entities))
    doc['predicate_uris'] = list(set(correct_question_predicates+correct_intermediate_predicates))

    mongo.col.update_one({'_id': doc['_id']}, {"$set": doc}, upsert=True)
    count += 1

print("%d documents annotated with entities and predicates URIs across hops"%count)

# show sample annotation
sample = mongo.get_by_id("3").next()
for key in ('1hop', '2hop', 'entity_uris', 'predicate_uris'):
    print(sample[key])

4998 documents annotated with entities and predicates URIs across hops
[['http://dbpedia.org/resource/Gestapo'], ['http://dbpedia.org/ontology/parentOrganisation']]
[[], ['http://dbpedia.org/ontology/leader']]
['http://dbpedia.org/resource/Gestapo']
['http://dbpedia.org/ontology/leader', 'http://dbpedia.org/ontology/parentOrganisation']


In [5]:
# store HDT IDs across hops
limit = None

def _lookup_ids(index, uris, catalog_name):
    """Resolve each URI to its catalog ID, reporting the ones that are missing.

    Args:
        index: object whose look_up_by_uri(uri) returns a list of hits; the
            ID is read from hits[0]['_source']['id'].
        uris: iterable of URIs to resolve.
        catalog_name: word used in the "not found" message
            ('entity' or 'predicate').

    Returns:
        (ids, n_missing): the IDs of the URIs that were found, in input
        order, and the number of URIs with no hit.
    """
    ids = []
    n_missing = 0
    for uri in uris:
        try:
            ids.append(index.look_up_by_uri(uri)[0]['_source']['id'])
        except IndexError:
            # An empty hit list means the URI is not in the catalog. Print the
            # text itself: encoding to bytes first would show a b'...' repr
            # with non-ASCII characters turned into escape sequences.
            print("{} not found in the {} catalog".format(uri, catalog_name))
            n_missing += 1
    return ids, n_missing


def annotate_hop_ids(hop):
    """Add catalog IDs for one hop's entities and predicates to every document.

    Reads doc[hop] as an (entity URIs, predicate URIs) pair, looks the URIs up
    in the module-level e_index / p_index, stores the result under
    doc[hop + '_ids'] and writes the document back through mongo.col.
    Prints one line per URI that could not be resolved, then the number of
    documents processed and the found / not-found totals.

    Args:
        hop: name of the document field to annotate, e.g. '1hop' or '2hop'.
    """
    e_found = 0
    e_not_found = 0
    p_found = 0
    p_not_found = 0

    samples = mongo.get_all(limit=limit)
    count = 0
    for doc in samples:
        e, p = doc[hop]

        e_ids, e_missing = _lookup_ids(e_index, e, 'entity')
        e_found += len(e_ids)
        e_not_found += e_missing

        p_ids, p_missing = _lookup_ids(p_index, p, 'predicate')
        p_found += len(p_ids)
        p_not_found += p_missing

        doc[hop+'_ids'] = e_ids, p_ids

        # update doc in MongoDB
        mongo.col.update_one({'_id': doc['_id']}, {"$set": doc}, upsert=True)
        count += 1

    print("%d documents annotated with ids"%count)
    print(e_found, e_not_found, p_found, p_not_found)


# annotate the first hop, then the second
for hop_name in ('1hop', '2hop'):
    annotate_hop_ids(hop_name)

# show sample annotations
sample = mongo.get_sample(limit=1)[0]
for ids_field in ('1hop_ids', '2hop_ids'):
    print(sample[ids_field])

b'http://dbpedia.org/resource/Bill_Finger' not found in the entity catalog
b'http://dbpedia.org/ontology/ComicsCharacter' not found in the entity catalog
b'http://dbpedia.org/ontology/creator' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Selwyn_Lloyd' not found in the entity catalog
b'http://dbpedia.org/resource/Winston_Churchill' not found in the entity catalog
b'http://dbpedia.org/ontology/primeMinister' not found in the predicate catalog
b'http://dbpedia.org/resource/Gestapo' not found in the entity catalog
b'http://dbpedia.org/ontology/parentOrganisation' not found in the predicate catalog
b'http://dbpedia.org/resource/Mumbai_North_(Lok_Sabha_constituency)' not found in the entity catalog
b'http://dbpedia.org/property/constituency' not found in the predicate catalog
b'http://dbpedia.org/resource/Roberto_Clemente_Bridge' not found in the entity catalog
b'http://dbpedia.org/ontol

b'http://dbpedia.org/resource/Michigan_Stadium' not found in the entity catalog
b'http://dbpedia.org/property/stadium' not found in the predicate catalog
b'http://dbpedia.org/resource/David_Johnston' not found in the entity catalog
b'http://dbpedia.org/ontology/Country' not found in the entity catalog
b'http://dbpedia.org/ontology/leader' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Fran\xc3\xa7ois-Marie_Le_Marchand_de_Lignery' not found in the entity catalog
b'http://dbpedia.org/ontology/MilitaryConflict' not found in the entity catalog
b'http://dbpedia.org/ontology/battle' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Myntdu_River' not found in the entity catalog
b'http://dbpedia.org/ontology/Town' not found in the entity catalog
b'http://dbpedia.org/ontology/origin' not found

b'http://dbpedia.org/resource/Sings_Kristofferson' not found in the entity catalog
b'http://dbpedia.org/resource/Milk_Cow_Blues' not found in the entity catalog
b'http://dbpedia.org/property/artist' not found in the predicate catalog
b'http://dbpedia.org/ontology/artist' not found in the predicate catalog
b'http://dbpedia.org/resource/Thesaban' not found in the entity catalog
b'http://dbpedia.org/ontology/Town' not found in the entity catalog
b'http://dbpedia.org/property/settlementType' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Micronesia' not found in the entity catalog
b'http://dbpedia.org/ontology/EthnicGroup' not found in the entity catalog
b'http://dbpedia.org/ontology/related' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Stephen_E._Ambrose' not found in the entity cat

b'http://dbpedia.org/resource/Levon_Ashotovich_Grigorian' not found in the entity catalog
b'http://dbpedia.org/property/relatives' not found in the predicate catalog
b'http://dbpedia.org/resource/Mediolanum' not found in the entity catalog
b'http://dbpedia.org/resource/History_of_Trier' not found in the entity catalog
b'http://dbpedia.org/ontology/capital' not found in the predicate catalog
b'http://dbpedia.org/ontology/capital' not found in the predicate catalog
b'http://dbpedia.org/resource/Joel_Schumacher' not found in the entity catalog
b'http://dbpedia.org/ontology/Film' not found in the entity catalog
b'http://dbpedia.org/ontology/director' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Metro_Manila' not found in the entity catalog
b'http://dbpedia.org/property/state' not found in the predicate catalog
b'http://dbpedia.org/resource/Saumarez_(horse)' not found in the entity cata

b'http://dbpedia.org/resource/John_Betts_(Connecticut_politician)' not found in the entity catalog
b'http://dbpedia.org/ontology/associate' not found in the predicate catalog
b'http://dbpedia.org/resource/Samuel_Isham' not found in the entity catalog
b'http://dbpedia.org/property/education' not found in the predicate catalog
b'http://dbpedia.org/resource/Canidae' not found in the entity catalog
b'http://dbpedia.org/ontology/Mammal' not found in the entity catalog
b'http://dbpedia.org/ontology/family' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Kharghar' not found in the entity catalog
b'http://dbpedia.org/ontology/School' not found in the entity catalog
b'http://dbpedia.org/ontology/city' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Henry_Cluney' not found in the entity catalo

b'http://dbpedia.org/resource/University_of_the_West_Indies' not found in the entity catalog
b'http://dbpedia.org/ontology/institution' not found in the predicate catalog
b'http://dbpedia.org/resource/Pope_Pius_X' not found in the entity catalog
b'http://dbpedia.org/property/beatifiedBy' not found in the predicate catalog
b'http://dbpedia.org/resource/Dolley_Madison' not found in the entity catalog
b'http://dbpedia.org/ontology/OfficeHolder' not found in the entity catalog
b'http://dbpedia.org/property/spouse' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Kakae' not found in the entity catalog
b'http://dbpedia.org/resource/Robert_William_Wilcox' not found in the entity catalog
b'http://dbpedia.org/property/title' not found in the predicate catalog
b'http://dbpedia.org/property/district' not found in the predicate catalog
b'http://dbpedia.org/resource/Richard_Taylor_(colonel)' not fo

b'http://dbpedia.org/resource/Atlant_Moscow_Oblast' not found in the entity catalog
b'http://dbpedia.org/property/playedFor' not found in the predicate catalog
b'http://dbpedia.org/resource/BBC' not found in the entity catalog
b'http://dbpedia.org/ontology/Company' not found in the entity catalog
b'http://dbpedia.org/property/parent' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Janaka_Thissakuttiarachchi' not found in the entity catalog
b'http://dbpedia.org/resource/Ati\xc5\x9ba' not found in the entity catalog
b'http://dbpedia.org/property/religion' not found in the predicate catalog
b'http://dbpedia.org/ontology/occupation' not found in the predicate catalog
b'http://dbpedia.org/resource/Maharashtra' not found in the entity catalog
b'http://dbpedia.org/property/state' not found in the predicate catalog
b'http://dbpedia.org/resource/Trade_association' not found in the entity catal

b'http://dbpedia.org/resource/Robert_Schenkkan' not found in the entity catalog
b'http://dbpedia.org/ontology/Play' not found in the entity catalog
b'http://dbpedia.org/property/writer' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Westminster_Abbey' not found in the entity catalog
b'http://dbpedia.org/ontology/Royalty' not found in the entity catalog
b'http://dbpedia.org/property/majorShrine' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Robert_Nutting' not found in the entity catalog
b'http://dbpedia.org/ontology/BaseballTeam' not found in the entity catalog
b'http://dbpedia.org/property/owner' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Private_school' not found in the en

b'http://dbpedia.org/resource/Eliza_Schneider' not found in the entity catalog
b'http://dbpedia.org/ontology/TelevisionShow' not found in the entity catalog
b'http://dbpedia.org/property/voices' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Dubai_World_Cup' not found in the entity catalog
b'http://dbpedia.org/ontology/Jockey' not found in the entity catalog
b'http://dbpedia.org/ontology/race' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Lawrence_Okoye' not found in the entity catalog
b'http://dbpedia.org/property/nationality' not found in the predicate catalog
b'http://dbpedia.org/resource/Michigan_Wolverines' not found in the entity catalog
b'http://dbpedia.org/ontology/AmericanFootballPlayer' not found in the entity catalog
b'http://dbpedia.org/property/currentteam' not found 

b'http://dbpedia.org/resource/Ernest_Rutherford' not found in the entity catalog
b'http://dbpedia.org/ontology/doctoralAdvisor' not found in the predicate catalog
b'http://dbpedia.org/resource/Doctor_Who_Confidential' not found in the entity catalog
b'http://dbpedia.org/ontology/TelevisionShow' not found in the entity catalog
b'http://dbpedia.org/ontology/related' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/City_council' not found in the entity catalog
b'http://dbpedia.org/property/governingBody' not found in the predicate catalog
b'http://dbpedia.org/resource/Moondog_Matinee' not found in the entity catalog
b'http://dbpedia.org/ontology/subsequentWork' not found in the predicate catalog
b'http://dbpedia.org/resource/New_York_City_FC' not found in the entity catalog
b'http://dbpedia.org/property/currentclub' not found in the predicate catalog
b'http://dbpedia.org/resource/Roh_Tae-

b'http://dbpedia.org/resource/Arun_Chandra_Guha' not found in the entity catalog
b'http://dbpedia.org/resource/Indira_Gandhi' not found in the entity catalog
b'http://dbpedia.org/ontology/party' not found in the predicate catalog
b'http://dbpedia.org/ontology/party' not found in the predicate catalog
b'http://dbpedia.org/resource/Firmicutes' not found in the entity catalog
b'http://dbpedia.org/ontology/Bacteria' not found in the entity catalog
b'http://dbpedia.org/ontology/division' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Craig_Van_Tilbury' not found in the entity catalog
b'http://dbpedia.org/resource/PyChess' not found in the entity catalog
b'http://dbpedia.org/ontology/occupation' not found in the predicate catalog
b'http://dbpedia.org/property/genre' not found in the predicate catalog
b'http://dbpedia.org/resource/Autoconf' not found in the entity catalog
b'http://dbpedia.o

b'http://dbpedia.org/resource/Broadcast_syndication' not found in the entity catalog
b'http://dbpedia.org/resource/Eliza_Schneider' not found in the entity catalog
b'http://dbpedia.org/ontology/TelevisionShow' not found in the entity catalog
b'http://dbpedia.org/ontology/distributor' not found in the predicate catalog
b'http://dbpedia.org/property/voices' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Justina_Machado' not found in the entity catalog
b'http://dbpedia.org/resource/John_Englehart' not found in the entity catalog
b'http://dbpedia.org/property/birthPlace' not found in the predicate catalog
b'http://dbpedia.org/ontology/birthPlace' not found in the predicate catalog
b'http://dbpedia.org/resource/Quest_Software' not found in the entity catalog
b'http://dbpedia.org/property/successor' not found in the predicate catalog
b'http://dbpedia.org/resource/Ryan_Seacrest' not found i

b'http://dbpedia.org/resource/Hilda_de_Duhalde' not found in the entity catalog
b'http://dbpedia.org/ontology/associate' not found in the predicate catalog
b'http://dbpedia.org/resource/Jason_Carter_(actor)' not found in the entity catalog
b'http://dbpedia.org/ontology/TelevisionShow' not found in the entity catalog
b'http://dbpedia.org/ontology/starring' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Spanish_language' not found in the entity catalog
b'http://dbpedia.org/property/language' not found in the predicate catalog
b"http://dbpedia.org/resource/Tennis_at_the_2012_Summer_Olympics_\xe2\x80\x93_Men's_singles" not found in the entity catalog
b'http://dbpedia.org/resource/Switzerland_at_the_2008_Summer_Olympics' not found in the entity catalog
b'http://dbpedia.org/ontology/silverMedalist' not found in the predicate catalog
b'http://dbpedia.org/property/flagbearer' not found in th

b'http://dbpedia.org/resource/Henry_Bourchier,_1st_Earl_of_Essex' not found in the entity catalog
b'http://dbpedia.org/ontology/spouse' not found in the predicate catalog
b'http://dbpedia.org/resource/Buckhurst_Hill_County_High_School' not found in the entity catalog
b'http://dbpedia.org/ontology/localAuthority' not found in the predicate catalog
b'http://dbpedia.org/resource/Krzysztof_Wielicki' not found in the entity catalog
b'http://dbpedia.org/ontology/firstAscentPerson' not found in the predicate catalog
b'http://dbpedia.org/resource/United_States' not found in the entity catalog
b'http://dbpedia.org/ontology/birthPlace' not found in the predicate catalog
b'http://dbpedia.org/resource/McKechnie_Field' not found in the entity catalog
b'http://dbpedia.org/property/architect' not found in the predicate catalog
b'http://dbpedia.org/resource/Avangard_Omsk' not found in the entity catalog
b'http://dbpedia.org/property/playedFor' not found in the predicate catalog
b'http://dbpedia.org/re

b'http://dbpedia.org/resource/Marie_Sisters' not found in the entity catalog
b'http://dbpedia.org/property/producer' not found in the predicate catalog
b'http://dbpedia.org/resource/Clatsop_County,_Oregon' not found in the entity catalog
b'http://dbpedia.org/resource/Columbia_Lake' not found in the entity catalog
b'http://dbpedia.org/ontology/River' not found in the entity catalog
b'http://dbpedia.org/ontology/riverMouth' not found in the predicate catalog
b'http://dbpedia.org/ontology/source' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Bernard_Herrmann' not found in the entity catalog
b'http://dbpedia.org/ontology/Film' not found in the entity catalog
b'http://dbpedia.org/ontology/musicComposer' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/My_Truly,_Truly_Fair' not found in t

b'http://dbpedia.org/resource/Harihar_Airport' not found in the entity catalog
b'http://dbpedia.org/ontology/Company' not found in the entity catalog
b'http://dbpedia.org/ontology/operator' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Marine_Corps_Air_Station_Kaneohe_Bay' not found in the entity catalog
b'http://dbpedia.org/resource/Burnet_R._Maybank' not found in the entity catalog
b'http://dbpedia.org/property/architect' not found in the predicate catalog
b'http://dbpedia.org/property/branch' not found in the predicate catalog
b'http://dbpedia.org/resource/Heinz_Field' not found in the entity catalog
b'http://dbpedia.org/property/tenants' not found in the predicate catalog
b'http://dbpedia.org/resource/Colorado_wine' not found in the entity catalog
b'http://dbpedia.org/property/officialName' not found in the predicate catalog
b'http://dbpedia.org/resource/The_United_States_Steel_

b'http://dbpedia.org/resource/Laemmle_Theatres' not found in the entity catalog
b'http://dbpedia.org/ontology/service' not found in the predicate catalog
b'http://dbpedia.org/resource/Kalutara_Electoral_District' not found in the entity catalog
b'http://dbpedia.org/ontology/MemberOfParliament' not found in the entity catalog
b'http://dbpedia.org/ontology/region' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Edwin_Catmull' not found in the entity catalog
b'http://dbpedia.org/resource/RenderMan_(software)' not found in the entity catalog
b'http://dbpedia.org/ontology/Company' not found in the entity catalog
b'http://dbpedia.org/ontology/keyPerson' not found in the predicate catalog
b'http://dbpedia.org/property/products' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Warner_Bros.' n

b'http://dbpedia.org/resource/Bass_guitar' not found in the entity catalog
b'http://dbpedia.org/property/instruments' not found in the predicate catalog
b'http://dbpedia.org/resource/PlayStation_2' not found in the entity catalog
b'http://dbpedia.org/resource/Asus' not found in the entity catalog
b'http://dbpedia.org/ontology/InformationAppliance' not found in the entity catalog
b'http://dbpedia.org/property/predecessor' not found in the predicate catalog
b'http://dbpedia.org/ontology/manufacturer' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Detroit_Red_Wings' not found in the entity catalog
b'http://dbpedia.org/property/employer' not found in the predicate catalog
b'http://dbpedia.org/resource/Macintosh_Quadra_660AV' not found in the entity catalog
b'http://dbpedia.org/ontology/Software' not found in the entity catalog
b'http://dbpedia.org/ontology/operatingSystem' not found in t

b'http://dbpedia.org/resource/The_Pentagon' not found in the entity catalog
b'http://dbpedia.org/ontology/MilitaryUnit' not found in the entity catalog
b'http://dbpedia.org/property/garrison' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Frasier' not found in the entity catalog
b'http://dbpedia.org/ontology/TelevisionShow' not found in the entity catalog
b'http://dbpedia.org/ontology/subsequentWork' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Andrew_Jackson' not found in the entity catalog
b'http://dbpedia.org/ontology/commander' not found in the predicate catalog
b'http://dbpedia.org/resource/New_York' not found in the entity catalog
b'http://dbpedia.org/property/place' not found in the predicate catalog
b'http://dbpedia.org/resource/A_Jitney_Elopement' not found in the entity

b'http://dbpedia.org/resource/Bangladesh' not found in the entity catalog
b'http://dbpedia.org/ontology/Organisation' not found in the entity catalog
b'http://dbpedia.org/property/membership' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Torrey_Pines_Gliderport' not found in the entity catalog
b'http://dbpedia.org/property/owner' not found in the predicate catalog
b'http://dbpedia.org/resource/Deion_Sanders' not found in the entity catalog
b'http://dbpedia.org/property/debutteam' not found in the predicate catalog
b'http://dbpedia.org/resource/Adrian_A._Basora' not found in the entity catalog
b'http://dbpedia.org/resource/Lori_Black' not found in the entity catalog
b'http://dbpedia.org/ontology/OfficeHolder' not found in the entity catalog
b'http://dbpedia.org/property/successor' not found in the predicate catalog
b'http://dbpedia.org/property/children' not found in the predicate ca

b'http://dbpedia.org/resource/Kelly_Osbourne' not found in the entity catalog
b'http://dbpedia.org/resource/Ozzy_Osbourne' not found in the entity catalog
b'http://dbpedia.org/property/relatives' not found in the predicate catalog
b'http://dbpedia.org/property/relatives' not found in the predicate catalog
b'http://dbpedia.org/resource/Ontario' not found in the entity catalog
b'http://dbpedia.org/ontology/location' not found in the predicate catalog
b'http://dbpedia.org/resource/William_Anthony_(artist)' not found in the entity catalog
b'http://dbpedia.org/ontology/training' not found in the predicate catalog
b'http://dbpedia.org/resource/United_States' not found in the entity catalog
b'http://dbpedia.org/ontology/Company' not found in the entity catalog
b'http://dbpedia.org/ontology/foundationPlace' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/City_of_Miami_Cemetery' not found in t

b'http://dbpedia.org/resource/Huey,_Dewey,_and_Louie' not found in the entity catalog
b'http://dbpedia.org/resource/Duck_family_(Disney)' not found in the entity catalog
b'http://dbpedia.org/ontology/FictionalCharacter' not found in the entity catalog
b'http://dbpedia.org/property/relatives' not found in the predicate catalog
b'http://dbpedia.org/ontology/relative' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Robin_(comics)' not found in the entity catalog
b'http://dbpedia.org/property/characters' not found in the predicate catalog
b'http://dbpedia.org/resource/Ben_Wilson_(American_football)' not found in the entity catalog
b'http://dbpedia.org/ontology/highschool' not found in the predicate catalog
b'http://dbpedia.org/resource/Fr._Agnel_Stadium' not found in the entity catalog
b'http://dbpedia.org/property/operator' not found in the predicate catalog
b'http://dbpedia.org/resource

b'http://dbpedia.org/resource/The_Palace_of_Auburn_Hills' not found in the entity catalog
b'http://dbpedia.org/resource/Stan_Van_Gundy' not found in the entity catalog
b'http://dbpedia.org/ontology/BasketballTeam' not found in the entity catalog
b'http://dbpedia.org/property/arena' not found in the predicate catalog
b'http://dbpedia.org/property/president' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Wallace_and_Gromit' not found in the entity catalog
b'http://dbpedia.org/ontology/Writer' not found in the entity catalog
b'http://dbpedia.org/property/notableworks' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/resource/Concrete' not found in the entity catalog
b'http://dbpedia.org/property/material' not found in the predicate catalog
b'http://dbpedia.org/resource/Alvin_Bell' not found in t

b'http://dbpedia.org/ontology/leader' not found in the predicate catalog
b'http://dbpedia.org/property/location' not found in the predicate catalog
b'http://dbpedia.org/property/deathPlace' not found in the predicate catalog
b'http://dbpedia.org/property/appointer' not found in the predicate catalog
b'http://dbpedia.org/ontology/producer' not found in the predicate catalog
b'http://dbpedia.org/ontology/associatedMusicalArtist' not found in the predicate catalog
b'http://dbpedia.org/property/awards' not found in the predicate catalog
b'http://dbpedia.org/ontology/University' not found in the entity catalog
b'http://dbpedia.org/property/affiliations' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/property/president' not found in the predicate catalog
b'http://dbpedia.org/property/appointer' not found in the predicate catalog
b'http://dbpedia.org/ontology/producer' not found in the predicate cat

b'http://dbpedia.org/ontology/religion' not found in the predicate catalog
b'http://dbpedia.org/ontology/Newspaper' not found in the entity catalog
b'http://dbpedia.org/property/headquarters' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/property/discipline' not found in the predicate catalog
b'http://dbpedia.org/property/name' not found in the predicate catalog
b'http://dbpedia.org/ontology/Award' not found in the entity catalog
b'http://dbpedia.org/property/awards' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/property/mascot' not found in the predicate catalog
b'http://dbpedia.org/ontology/field' not found in the predicate catalog
b'http://dbpedia.org/ontology/Sport' not found in the entity catalog
b'http://dbpedia.org/ontology/sport' not found in the predicate catalog
b'http://www.w3.org/1999/

b'http://dbpedia.org/ontology/owner' not found in the predicate catalog
b'http://dbpedia.org/ontology/outflow' not found in the predicate catalog
b'http://dbpedia.org/ontology/division' not found in the predicate catalog
b'http://dbpedia.org/ontology/BasketballTeam' not found in the entity catalog
b'http://dbpedia.org/property/president' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/property/cities' not found in the predicate catalog
b'http://dbpedia.org/ontology/Building' not found in the entity catalog
b'http://dbpedia.org/property/location' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/ontology/servingRailwayLine' not found in the predicate catalog
b'http://dbpedia.org/ontology/successor' not found in the predicate catalog
b'http://dbpedia.org/ontology/VideoGame' not found in the entity catalog

b'http://dbpedia.org/ontology/locationCity' not found in the predicate catalog
b'http://dbpedia.org/property/state' not found in the predicate catalog
b'http://dbpedia.org/property/awards' not found in the predicate catalog
b'http://dbpedia.org/property/poleDriver' not found in the predicate catalog
b'http://dbpedia.org/ontology/layout' not found in the predicate catalog
b'http://dbpedia.org/property/crosses' not found in the predicate catalog
b'http://dbpedia.org/ontology/maintainedBy' not found in the predicate catalog
b'http://dbpedia.org/property/origin' not found in the predicate catalog
b'http://dbpedia.org/ontology/restingPlace' not found in the predicate catalog
b'http://dbpedia.org/ontology/sport' not found in the predicate catalog
b'http://dbpedia.org/ontology/AmericanFootballTeam' not found in the entity catalog
b'http://dbpedia.org/property/city' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http:/

b'http://dbpedia.org/ontology/religion' not found in the predicate catalog
b'http://dbpedia.org/property/constituency' not found in the predicate catalog
b'http://dbpedia.org/property/almaMater' not found in the predicate catalog
b'http://dbpedia.org/ontology/Place' not found in the entity catalog
b'http://dbpedia.org/property/deathPlace' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/property/combatant' not found in the predicate catalog
b'http://dbpedia.org/property/origin' not found in the predicate catalog
b'http://dbpedia.org/ontology/sport' not found in the predicate catalog
b'http://dbpedia.org/ontology/location' not found in the predicate catalog
b'http://dbpedia.org/ontology/MilitaryConflict' not found in the entity catalog
b'http://dbpedia.org/ontology/commander' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog


b'http://dbpedia.org/property/locationCity' not found in the predicate catalog
b'http://dbpedia.org/ontology/currency' not found in the predicate catalog
b'http://dbpedia.org/property/chancellor' not found in the predicate catalog
b'http://dbpedia.org/ontology/TelevisionShow' not found in the entity catalog
b'http://dbpedia.org/property/themeMusicComposer' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/ontology/tenant' not found in the predicate catalog
b'http://dbpedia.org/ontology/associatedMusicalArtist' not found in the predicate catalog
b'http://dbpedia.org/ontology/formerBroadcastNetwork' not found in the predicate catalog
b'http://dbpedia.org/ontology/honours' not found in the predicate catalog
b'http://dbpedia.org/ontology/honours' not found in the predicate catalog
b'http://dbpedia.org/property/constituency' not found in the predicate catalog
b'http://dbpedia.org/property/placeOfBuri

b'http://dbpedia.org/property/services' not found in the predicate catalog
b'http://dbpedia.org/ontology/routeStart' not found in the predicate catalog
b'http://dbpedia.org/ontology/IceHockeyPlayer' not found in the entity catalog
b'http://dbpedia.org/ontology/team' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/property/keyPeople' not found in the predicate catalog
b'http://dbpedia.org/property/venue' not found in the predicate catalog
b'http://dbpedia.org/ontology/relation' not found in the predicate catalog
b'http://dbpedia.org/ontology/firstAscentPerson' not found in the predicate catalog
b'http://dbpedia.org/property/nearestCity' not found in the predicate catalog
b'http://dbpedia.org/property/homeTown' not found in the predicate catalog
b'http://dbpedia.org/property/region' not found in the predicate catalog
b'http://dbpedia.org/property/address' not found in the predicate catalog
b'htt

b'http://dbpedia.org/property/birthPlace' not found in the predicate catalog
b'http://dbpedia.org/ontology/knownFor' not found in the predicate catalog
b'http://dbpedia.org/ontology/award' not found in the predicate catalog
b'http://dbpedia.org/ontology/Person' not found in the entity catalog
b'http://dbpedia.org/property/training' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/ontology/religion' not found in the predicate catalog
b'http://dbpedia.org/property/cities' not found in the predicate catalog
b'http://dbpedia.org/ontology/award' not found in the predicate catalog
b'http://dbpedia.org/ontology/family' not found in the predicate catalog
b'http://dbpedia.org/ontology/ground' not found in the predicate catalog
b'http://dbpedia.org/property/nickname' not found in the predicate catalog
b'http://dbpedia.org/property/religion' not found in the predicate catalog
b'http://dbpedia.org/ontology

b'http://dbpedia.org/property/crosses' not found in the predicate catalog
b'http://dbpedia.org/ontology/parentCompany' not found in the predicate catalog
b'http://dbpedia.org/property/writer' not found in the predicate catalog
b'http://dbpedia.org/ontology/nearestCity' not found in the predicate catalog
b'http://dbpedia.org/property/mascot' not found in the predicate catalog
b'http://dbpedia.org/property/residence' not found in the predicate catalog
b'http://dbpedia.org/ontology/homeStadium' not found in the predicate catalog
b'http://dbpedia.org/property/address' not found in the predicate catalog
b'http://dbpedia.org/property/characterName' not found in the predicate catalog
b'http://dbpedia.org/ontology/country' not found in the predicate catalog
b'http://dbpedia.org/ontology/Country' not found in the entity catalog
b'http://dbpedia.org/property/leaderName' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http

b'http://dbpedia.org/property/locations' not found in the predicate catalog
b'http://dbpedia.org/property/title' not found in the predicate catalog
b'http://dbpedia.org/ontology/manufacturer' not found in the predicate catalog
b'http://dbpedia.org/property/destinations' not found in the predicate catalog
b'http://dbpedia.org/ontology/trainer' not found in the predicate catalog
b'http://dbpedia.org/property/developer' not found in the predicate catalog
b'http://dbpedia.org/ontology/builder' not found in the predicate catalog
b'http://dbpedia.org/property/country' not found in the predicate catalog
b'http://dbpedia.org/property/location' not found in the predicate catalog
b'http://dbpedia.org/ontology/Athlete' not found in the entity catalog
b'http://dbpedia.org/property/youthclubs' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/property/broadcastArea' not found in the predicate catalog
b'http:

b'http://dbpedia.org/property/writer' not found in the predicate catalog
b'http://dbpedia.org/property/area' not found in the predicate catalog
b'http://dbpedia.org/ontology/residence' not found in the predicate catalog
b'http://dbpedia.org/property/knownFor' not found in the predicate catalog
b'http://dbpedia.org/ontology/award' not found in the predicate catalog
b'http://dbpedia.org/property/residence' not found in the predicate catalog
b'http://dbpedia.org/property/destinations' not found in the predicate catalog
b'http://dbpedia.org/ontology/veneratedIn' not found in the predicate catalog
b'http://dbpedia.org/ontology/sport' not found in the predicate catalog
b'http://dbpedia.org/property/placeOfDeath' not found in the predicate catalog
b'http://dbpedia.org/property/associatedActs' not found in the predicate catalog
b'http://dbpedia.org/property/country' not found in the predicate catalog
b'http://dbpedia.org/ontology/territory' not found in the predicate catalog
b'http://dbpedia.o

b'http://dbpedia.org/ontology/Book' not found in the entity catalog
b'http://dbpedia.org/ontology/author' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/ontology/leader' not found in the predicate catalog
b'http://dbpedia.org/ontology/TelevisionShow' not found in the entity catalog
b'http://dbpedia.org/property/voices' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/ontology/OfficeHolder' not found in the entity catalog
b'http://dbpedia.org/property/successor' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/property/awards' not found in the predicate catalog
b'http://dbpedia.org/ontology/associatedMusicalArtist' not found in the predicate catalog
b'http://dbpedia.org/ontology/nonFictionSubject' not found in 

b'http://dbpedia.org/ontology/mouthCountry' not found in the predicate catalog
b'http://dbpedia.org/property/producer' not found in the predicate catalog
b'http://dbpedia.org/ontology/militaryUnit' not found in the predicate catalog
b'http://dbpedia.org/ontology/basedOn' not found in the predicate catalog
b'http://dbpedia.org/ontology/TelevisionShow' not found in the entity catalog
b'http://dbpedia.org/property/network' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/property/birthPlace' not found in the predicate catalog
b'http://dbpedia.org/property/playedFor' not found in the predicate catalog
b'http://dbpedia.org/ontology/ideology' not found in the predicate catalog
b'http://dbpedia.org/property/owner' not found in the predicate catalog
b'http://dbpedia.org/property/knownFor' not found in the predicate catalog
b'http://dbpedia.org/ontology/deathPlace' not found in the predicate catalog
b'h

b'http://dbpedia.org/property/country' not found in the predicate catalog
b'http://dbpedia.org/ontology/MusicGenre' not found in the entity catalog
b'http://dbpedia.org/ontology/stylisticOrigin' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/property/headquarters' not found in the predicate catalog
b'http://dbpedia.org/property/foundation' not found in the predicate catalog
b'http://dbpedia.org/ontology/occupation' not found in the predicate catalog
b'http://dbpedia.org/property/starring' not found in the predicate catalog
b'http://dbpedia.org/ontology/formerPartner' not found in the predicate catalog
b'http://dbpedia.org/property/broadcastArea' not found in the predicate catalog
b'http://dbpedia.org/ontology/locationCity' not found in the predicate catalog
b'http://dbpedia.org/property/associatedActs' not found in the predicate catalog
b'http://dbpedia.org/property/awards' not found in the p

b'http://dbpedia.org/ontology/colour' not found in the predicate catalog
b'http://dbpedia.org/ontology/operatedBy' not found in the predicate catalog
b'http://dbpedia.org/property/president' not found in the predicate catalog
b'http://dbpedia.org/property/city' not found in the predicate catalog
b'http://dbpedia.org/ontology/producer' not found in the predicate catalog
b'http://dbpedia.org/ontology/ground' not found in the predicate catalog
b'http://dbpedia.org/ontology/University' not found in the entity catalog
b'http://dbpedia.org/ontology/country' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/ontology/sport' not found in the predicate catalog
b'http://dbpedia.org/ontology/SportsTeam' not found in the entity catalog
b'http://dbpedia.org/ontology/formerTeam' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbp

b'http://dbpedia.org/property/parent' not found in the predicate catalog
b'http://dbpedia.org/ontology/Person' not found in the entity catalog
b'http://dbpedia.org/ontology/formerPartner' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/property/birthPlace' not found in the predicate catalog
b'http://dbpedia.org/property/starring' not found in the predicate catalog
b'http://dbpedia.org/property/awards' not found in the predicate catalog
b'http://dbpedia.org/property/awards' not found in the predicate catalog
b'http://dbpedia.org/ontology/owner' not found in the predicate catalog
b'http://dbpedia.org/property/fields' not found in the predicate catalog
b'http://dbpedia.org/ontology/product' not found in the predicate catalog
b'http://dbpedia.org/ontology/formerTeam' not found in the predicate catalog
b'http://dbpedia.org/ontology/occupation' not found in the predicate catalog
b'http://dbpedia.org

b'http://dbpedia.org/ontology/party' not found in the predicate catalog
b'http://dbpedia.org/property/owner' not found in the predicate catalog
b'http://dbpedia.org/ontology/launchSite' not found in the predicate catalog
b'http://dbpedia.org/property/founded' not found in the predicate catalog
b'http://dbpedia.org/property/almaMater' not found in the predicate catalog
b'http://dbpedia.org/ontology/knownFor' not found in the predicate catalog
b'http://dbpedia.org/ontology/starring' not found in the predicate catalog
b'http://dbpedia.org/ontology/religion' not found in the predicate catalog
b'http://dbpedia.org/property/fields' not found in the predicate catalog
b'http://dbpedia.org/ontology/city' not found in the predicate catalog
b'http://dbpedia.org/ontology/layout' not found in the predicate catalog
b'http://dbpedia.org/ontology/riverMouth' not found in the predicate catalog
b'http://dbpedia.org/property/hubs' not found in the predicate catalog
b'http://dbpedia.org/property/services'

b'http://dbpedia.org/ontology/broadcastArea' not found in the predicate catalog
b'http://dbpedia.org/ontology/TelevisionShow' not found in the entity catalog
b'http://dbpedia.org/property/distributor' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/ontology/Country' not found in the entity catalog
b'http://dbpedia.org/property/nationalOrigin' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/ontology/manufacturer' not found in the predicate catalog
b'http://dbpedia.org/ontology/Film' not found in the entity catalog
b'http://dbpedia.org/property/editing' not found in the predicate catalog
b'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' not found in the predicate catalog
b'http://dbpedia.org/ontology/Place' not found in the entity catalog
b'http://dbpedia.org/property/placeOfDeath' not found in the pre

In [None]:
# store HDT IDs for questions and answers URIs
loaded = False  # set to True to skip re-annotating the collection
limit = None    # forwarded to mongo.get_sample as its limit argument


def _lookup_ids(index, uris, catalog_name):
    '''
    Look up the catalog id of every URI and return the ids that were found.

    index: catalog wrapper exposing look_up_by_uri (e_index or p_index)
    uris: iterable of URIs to resolve
    catalog_name: label used in the "not found" message ('entity' or 'predicate')

    URIs whose lookup fails are reported on stdout and skipped, so the returned
    list may be shorter than the input.
    '''
    ids = []
    for uri in uris:
        try:
            ids.append(index.look_up_by_uri(uri)[0]['_source']['id'])
        except Exception:
            # Exception instead of a bare except, so KeyboardInterrupt/SystemExit
            # can still stop this long-running loop.
            # NOTE(review): every lookup error is reported as "not found" --
            # presumably this also hides index connection problems; confirm.
            print("%s not found in the %s catalog" % (uri, catalog_name))
    return ids


if not loaded:
    count = 0
    for doc in mongo.get_sample(limit=limit):
        # get all correct entity and predicate ids from the GS annotations
        updates = {
            'entity_ids': _lookup_ids(e_index, doc['entity_uris'], 'entity'),
            'predicate_ids': _lookup_ids(p_index, doc['predicate_uris'], 'predicate'),
        }
        # answers are looked up in the entity catalog as well
        if 'answers' in doc:
            updates['answers_ids'] = _lookup_ids(e_index, doc['answers'], 'entity')

        # update doc in MongoDB: only the new fields are written, the rest of
        # the document (including _id) is left as stored
        mongo.col.update_one({'_id': doc['_id']}, {"$set": updates}, upsert=True)
        count += 1

    print("%d documents annotated with ids"%count)

# show sample annotations
sample = mongo.get_sample(limit=1)[0]
print(sample['entity_ids'])
print(sample['predicate_ids'])
if 'answers' in sample:
    print(sample['answers_ids'])

In [None]:
# number of documents with 2 hops, for the whole collection and for each split
# (train=False marks the test set, as in the predicate-distribution cell below)
two_hops = {"2hop": {"$ne": [[], []]}}  # "2hop" annotation differs from the empty [[], []] value
splits = [
    ("in total", {}),
    ("in the train split", {"train": True}),
    ("in the test split", {"train": False}),
]
for label, split_filter in splits:
    # count_documents replaces Cursor.count(), which is deprecated and was removed in PyMongo 4
    n_complex = mongo.col.count_documents({**split_filter, **two_hops})
    # the label tells the three otherwise identical lines apart
    print("%d complex questions (with more than one variable) %s" % (n_complex, label))

In [None]:
# number of documents with >1 triple:
# training questions whose predicate ids plus entity ids add up to more than two
limit = None
samples = mongo.get_sample(train=True, limit=limit)
counter = sum(
    1
    for doc in samples
    if len(doc['predicate_ids']) + len(doc['entity_ids']) > 2
)
print(counter)

In [None]:
# question lengths distribution
limit = None
from keras.preprocessing.text import text_to_word_sequence
import numpy as np

# one entry per question: the number of words produced by the keras tokenizer
n_words_distr = [
    len(text_to_word_sequence(doc['question']))
    for doc in mongo.get_sample(limit=limit)
]

# show basic stats (%d prints the average as an integer)
min_len, mean_len, max_len = min(n_words_distr), np.mean(n_words_distr), max(n_words_distr)
print("Min:%d Avg:%d Max:%d"%(min_len, mean_len, max_len))

In [None]:
# distribution of the number of URIs per question (predicate + entity)
from keras.preprocessing.text import text_to_word_sequence
import numpy as np

# `limit` is the value assigned in an earlier cell
n_distr = [
    len(doc['entity_uris']) + len(doc['predicate_uris'])
    for doc in mongo.get_sample(limit=limit)
]

# show basic stats (%d prints the average as an integer)
min_len, mean_len, max_len = min(n_distr), np.mean(n_distr), max(n_distr)
print("Min:%d Avg:%d Max:%d"%(min_len, mean_len, max_len))

In [None]:
# number of answers per question
import numpy as np

# a question without an 'answers' field is counted as having one answer
n_distr = [
    len(doc['answers']) if 'answers' in doc else 1
    for doc in mongo.get_sample(limit=None)
]

# show basic stats (%d prints the median as an integer)
min_len = min(n_distr)
median_len = np.median(n_distr)
max_len = max(n_distr)
print("Min:%d Median:%d Max:%d"%(min_len, median_len, max_len))

# how many questions have each answer count
p_distribution = Counter(n_distr)

In [None]:
# number of unique predicates and distribution
limit = None

# gather predicate URIs from the training set (train=True), then the test set (train=False)
predicates = []
for is_train in (True, False):
    samples = mongo.get_sample(train=is_train, limit=limit)
    for doc in samples:
        predicates += doc['predicate_uris']

# frequency of every predicate except rdf:type
rdf_type = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'
p_distribution = Counter(filter(lambda uri: uri != rdf_type, predicates))
# most common
print (p_distribution.most_common(30))

# de-duplicate (rdf:type is still included in this count)
predicates = list(set(predicates))
print("%d predicates"%len(predicates))

* 5 predicates not seen during training
* most frequent predicate: type ('http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 1568)

In [None]:
import matplotlib.pyplot as plt

def plot_distribution(counter):
    '''
    Plot the distribution of the counts stored in the counter object.

    counter: mapping of item -> count (e.g. a collections.Counter); only the
             counts are plotted, the keys are ignored.

    Returns None. An empty counter is reported on stdout and nothing is drawn
    (previously it failed with an unpacking ValueError).
    '''
    # prepare data
    values = list(counter.values())
    if not values:
        print("Nothing to plot: the counter is empty")
        return

    # generate a plot
    import seaborn as sns
    sns.set(color_codes=True)
    # NOTE(review): distplot is deprecated since seaborn 0.11 (histplot/displot
    # are the replacements); kept here so the figure stays the same.
    sns.distplot(values)
    plt.show()

# plot the distribution of the counts held in p_distribution (assigned in earlier cells)
plot_distribution(p_distribution)

In [None]:
# write questions into file to generate embeddings:
# one training question per line, UTF-8 encoded
questions_file = 'questions.txt'
samples = mongo.get_sample(train=True, limit=limit)
os.chdir("/mpqa/KBQA/data/lcquad")
with open(questions_file, 'w', encoding='utf-8') as fout:
    for doc in samples:
        fout.write(doc['question'])
        fout.write('\n')

In [None]:
# annotate types separately: for every question store the class URIs used as
# objects of rdf:type triples in its SPARQL query, plus their entity-catalog ids
loaded = False   # set to True to skip re-annotating the collection

verbose = False  # print query, classes and ids for every question that has classes
limit = None     # forwarded to mongo.get_sample as its limit argument

rdf_type = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'

if not loaded:
    count = 0
    for doc in mongo.get_sample(limit=limit):
        sparql_query = doc['sparql_query']
        # parse the SPARQL query into spo triples: the text between the first
        # "{" and the first "}", split on ". "
        pattern = sparql_query[sparql_query.find("{")+1:sparql_query.find("}")]

        classes = []
        cids = []
        for triple in pattern.split('. '):
            parts = triple.split()
            if not parts:
                # empty or whitespace-only clause (e.g. after a trailing ". ")
                continue
            # a clause with a different number of tokens still raises here
            s, p, o = parts
            # predicate and object are written as <uri>; [1:-1] drops the brackets
            if p[1:-1] != rdf_type or o[0] == '?':
                continue
            uri = o[1:-1]
            classes.append(uri)
            try:
                cids.append(e_index.look_up_by_uri(uri)[0]['_source']['id'])
            except Exception:
                # Exception instead of a bare except, so KeyboardInterrupt/SystemExit
                # can still stop the loop.
                # NOTE(review): every lookup error is reported as "not found".
                print("%s not found in the entity catalog"%uri)

        if classes and verbose:
            print(sparql_query)
            print(classes)
            print(cids)

        # only the two new fields are written; the rest of the document is left as stored
        mongo.col.update_one({'_id': doc['_id']},
                             {"$set": {'classes': classes, 'classes_ids': cids}},
                             upsert=True)
        count += 1

    # the previous message was copied from another cell and described the wrong annotation
    print("%d documents annotated with class URIs and ids"%count)

# show sample annotation
sample = mongo.get_by_id("2652").next()
print(sample['sparql_query'])
print(sample['classes'])
print(sample['classes_ids'])

## QALD-7

In [None]:
# imports are at cell level so that pprint is also available when loaded = True
import json
import os
import pprint

import requests

mongo = Mongo_Connector('kbqa', 'qald_7_train')
data_path = "qald-7-train-multilingual.json"

ENDPOINT = 'https://dbpedia.org/sparql'
REQUEST_TIMEOUT = 60  # seconds; without a timeout a stalled request blocks the cell forever

# load QALD-7 dataset: one MongoDB document per question, with the answers
# fetched by running the gold SPARQL query against ENDPOINT
loaded = False  # set to True to skip loading and only show a sample

if not loaded:
    os.chdir("/mpqa/KBQA/data/qald-7")

    with open(data_path, encoding='utf-8') as f:
        dataset = json.load(f)

    for q in dataset['questions']:
        doc = {}
        doc['SerialNumber'] = q['id']
        # first entry of the question list
        # NOTE(review): presumably the English wording -- confirm against the data file
        doc['question'] = q['question'][0]['string']
        doc['sparql_query'] = q['query']['sparql'].replace('\n', ' ')

        http_response = requests.get(ENDPOINT,
                                     params={'query': doc['sparql_query'], 'output': 'json'},
                                     timeout=REQUEST_TIMEOUT)
        # fail with the HTTP error instead of an opaque JSON decoding error
        http_response.raise_for_status()
        response = http_response.json()

        if 'results' in response:
            # SELECT query: flatten all bound values of all result rows
            results = response['results']['bindings']
            doc['answers'] = [v['value'] for r in results for v in r.values()]
        elif 'boolean' in response:
            # ASK query
            doc['bool_answer'] = response['boolean']
        doc['train'] = False
        # upsert keyed on the question id, so re-running the cell does not duplicate documents
        mongo.col.update_one({'SerialNumber': doc['SerialNumber']}, {'$set': doc}, upsert=True)

mongo.count_all_docs()
doc = mongo.get_sample(train=False, limit=1)[0]
pprint.pprint(doc)
print(doc['sparql_query'])

In [None]:
# parse the SPARQL query of one sample into spo triples and print, per clause,
# the non-variable subjects/objects (entities) and the predicate
cursor = mongo.get_sample(limit=1)
with cursor:
    for doc in cursor:
        sparql_query = doc['sparql_query']
        print(sparql_query)
        # clauses: the text between the first "{" and the first "}", split on ". "
        tripples = sparql_query[sparql_query.find("{")+1:sparql_query.find("}")].split('. ')

        # collect entities and predicates
        for tripple in tripples:
            clause = tripple.split()
            if not clause:
                # empty or whitespace-only clause
                continue

            entities, predicates = [], []
            if clause[0].lower().startswith('filter'):
                # FILTER clauses (any letter case, with or without a space before "(")
                # contribute no triple; the empty lists are still printed below
                pass
            elif len(clause) == 3:
                s, p, o = clause
                # terms starting with "?" are variables, everything else is an entity
                entities.extend(term for term in (s, o) if term[0] != '?')
                predicates.append(p)
            else:
                # report instead of failing on the 3-way unpack
                print("skipping clause that is not a plain s p o triple: %s" % tripple.strip())
                continue
            print(entities)
            print(predicates)

## QALD-8

In [None]:

# imports are at cell level so that pprint is also available when loaded = True
import json
import os
import pprint

import requests

# NOTE(review): this cell sits under the "QALD-8" heading but still uses the
# QALD-7 collection, file name and directory -- presumably a copy-paste
# leftover; confirm the QALD-8 names before relying on it.
mongo = Mongo_Connector('kbqa', 'qald_7_train')
data_path = "qald-7-train-multilingual.json"

ENDPOINT = 'https://dbpedia.org/sparql'
REQUEST_TIMEOUT = 60  # seconds; without a timeout a stalled request blocks the cell forever

# load the dataset: one MongoDB document per question, with the answers
# fetched by running the gold SPARQL query against ENDPOINT
loaded = False  # set to True to skip loading and only show a sample

if not loaded:
    os.chdir("/mpqa/KBQA/data/qald-7")

    with open(data_path, encoding='utf-8') as f:
        dataset = json.load(f)

    for q in dataset['questions']:
        doc = {}
        doc['SerialNumber'] = q['id']
        # first entry of the question list
        # NOTE(review): presumably the English wording -- confirm against the data file
        doc['question'] = q['question'][0]['string']
        doc['sparql_query'] = q['query']['sparql'].replace('\n', ' ')

        http_response = requests.get(ENDPOINT,
                                     params={'query': doc['sparql_query'], 'output': 'json'},
                                     timeout=REQUEST_TIMEOUT)
        # fail with the HTTP error instead of an opaque JSON decoding error
        http_response.raise_for_status()
        response = http_response.json()

        if 'results' in response:
            # SELECT query: flatten all bound values of all result rows
            results = response['results']['bindings']
            doc['answers'] = [v['value'] for r in results for v in r.values()]
        elif 'boolean' in response:
            # ASK query
            doc['bool_answer'] = response['boolean']
        doc['train'] = False
        # upsert keyed on the question id, so running this cell (again) does not
        # duplicate documents in the collection
        mongo.col.update_one({'SerialNumber': doc['SerialNumber']}, {'$set': doc}, upsert=True)

mongo.count_all_docs()
doc = mongo.get_sample(train=False, limit=1)[0]
pprint.pprint(doc)
print(doc['sparql_query'])

# parse the SPARQL query of one sample into spo triples and print, per clause,
# the non-variable subjects/objects (entities) and the predicate
cursor = mongo.get_sample(limit=1)
with cursor:
    for doc in cursor:
        sparql_query = doc['sparql_query']
        print(sparql_query)
        # clauses: the text between the first "{" and the first "}", split on ". "
        tripples = sparql_query[sparql_query.find("{")+1:sparql_query.find("}")].split('. ')

        # collect entities and predicates
        for tripple in tripples:
            clause = tripple.split()
            if not clause:
                # empty or whitespace-only clause
                continue

            entities, predicates = [], []
            if clause[0].lower().startswith('filter'):
                # FILTER clauses (any letter case, with or without a space before "(")
                # contribute no triple; the empty lists are still printed below
                pass
            elif len(clause) == 3:
                s, p, o = clause
                # terms starting with "?" are variables, everything else is an entity
                entities.extend(term for term in (s, o) if term[0] != '?')
                predicates.append(p)
            else:
                # report instead of failing on the 3-way unpack
                print("skipping clause that is not a plain s p o triple: %s" % tripple.strip())
                continue
            print(entities)
            print(predicates)