# Entity candidates

In [1]:
# setup
# name of the QA dataset; also passed below as the MongoDB collection name
dataset_name = 'lcquad'

import os
# run the rest of the notebook from the project source directory
# NOTE(review): hard-coded absolute path — adjust for the local checkout
os.chdir('/mpqa_new/KBQA/src')

# connect to the entity catalog, which is indexed in Elasticsearch (built on Lucene)
from elasticsearch import Elasticsearch
from urllib.parse import quote

class IndexSearch:
    """Small query helper bound to a single Elasticsearch index.

    Every search is issued against ``self.index`` with document type
    ``self.type`` and yields the raw list found under ``['hits']['hits']``
    in the Elasticsearch response.
    """

    def __init__(self, index_name):
        # client is created with its default connection settings
        self.es = Elasticsearch()
        self.index = index_name
        self.type = 'terms'

    def _search(self, query, size):
        """Run ``query`` against the bound index and return the list of hits."""
        response = self.es.search(index=self.index, doc_type=self.type,
                                  body={"query": query}, size=size)
        return response['hits']['hits']

    def match_label(self, string, top=100):
        """Return up to ``top`` hits whose label matches ``string``.

        Both the ``label`` field (boosted by 10) and ``label.ngrams`` are
        searched, and the "and" operator requires all query terms to match.
        """
        label_query = {
            "multi_match": {
                "query": string,
                "operator": "and",
                "fields": ["label^10", "label.ngrams"],
            }
        }
        return self._search(label_query, top)

    def look_up_by_uri(self, uri, top=1):
        """Return up to ``top`` hits whose ``uri`` field equals ``uri``.

        The URI is percent-encoded first, leaving ``():/,`` unescaped. When
        the exact lookup finds nothing, the last path segment of ``uri`` is
        tried as a label instead; that fallback returns at most one hit
        regardless of ``top``.
        """
        encoded_uri = quote(uri, safe='():/,')
        hits = self._search({"term": {"uri": encoded_uri}}, top)
        if hits:
            return hits

        # fall back to label match on the URI's local name
        local_name = uri.split('/')[-1]
        return self.match_label(local_name, top=1)


# shared handle to the 'dbpedia201604e' entity index; the data-preparation cell
# below uses it to resolve entity/answer URIs to catalog ids
e_index = IndexSearch('dbpedia201604e')

# set up the connection to the MongoDB instance where the QA dataset is stored
# (start the server first with `sudo service mongod start`; 27017 is the default port)
from pymongo import MongoClient
import json
import pprint

class Mongo_Connector():
    '''
    Wrapper class for some of the pymongo functions: http://api.mongodb.com/python/current/tutorial.html

    An instance is bound to one collection (`col_name`) of one database
    (`db_name`), reached through a `MongoClient` created with default settings.
    '''

    def __init__(self, db_name, col_name):
        # spin up database
        self.mongo_client = MongoClient()
        self.db = self.mongo_client[db_name]
        self.col = self.db[col_name]
        # NOTE(review): recent pymongo clients connect lazily, so this message
        # presumably only confirms the handles were created — confirm if a
        # real reachability check is wanted
        print("Connection success.")

    def count_all_docs(self):
        '''Print the number of documents in the collection and return it.'''
        count = self.col.count_documents({})
        print ("%d docs"%count)
        return count

    def load_json(self, json_file_path):
        '''
        Insert the documents stored in a JSON file into the collection.

        The file is read as UTF-8 and is expected to contain a JSON array of
        documents. The number of documents read is printed and returned.
        An empty array is reported but not inserted, because `insert_many`
        rejects an empty document list.
        '''
        with open(json_file_path, "r", encoding="utf-8") as json_file:
            docs = json.load(json_file)
        dataset_size = len(docs)
        print ("%d docs"%(dataset_size))
        if docs:
            self.col.insert_many(docs)
        return dataset_size

    def show_example(self):
        '''Pretty-print one document from the collection.'''
        pprint.pprint(self.col.find_one())

    def get_sample(self, sample_size=100):
        '''
        Return a cursor over the collection.

        The cursor is limited to `sample_size` documents; a falsy value
        (None or 0) leaves it unlimited.
        '''
        cursor = self.col.find()
        if sample_size:
            cursor = cursor.limit(sample_size)
        return cursor

# connector for the collection named after the dataset inside the 'kbqa' database
mongo = Mongo_Connector('kbqa', dataset_name)

Connection success.


In [2]:
# load lcquad samples from MongoDB
limit = None  # falsy -> get_sample applies no limit, so every document is read
qas = mongo.get_sample(limit)

# prepare data for entity and predicate mention extraction models training via sequence tagging
import urllib.parse
from keras.preprocessing.text import text_to_word_sequence

# per-question accumulators; index i in every list refers to the same question
questions = []        # original question strings
question_words = []   # tokenised questions
n_words_distr = []    # number of tokens per question

correct_e_spans = []        # lower-cased entity labels from the 'entity mapping'
y_e = []                    # per-token 0/1 tags: 1 if the token occurs in any entity label
correct_entities_uris = []  # entity URIs with apostrophes removed
correct_entities_ids = []   # catalog ids of those entities ([] if any lookup came back empty)

correct_answers_uris = []   # answer URIs with apostrophes removed
correct_answers_ids = []    # catalog ids of the answers that could be resolved

not_found_cnt = 0  # answer URIs that could not be resolved in the entity catalog

print("Preparing %s dataset"%dataset_name)
for q in qas:
    # parse question
    question_o = q['question']
    questions.append(question_o)
    words = text_to_word_sequence(question_o)
    n_words_distr.append(len(words))
    question_words.append(words)

    # generate IO tags from mention spans
    entity_spans = [e['label'].lower() for e in q['entity mapping']]
    correct_e_spans.append(entity_spans)
    # build the set of entity words once per question instead of once per token
    entity_words = {token for span in entity_spans for token in span.split()}
    y_e.append([1 if word in entity_words else 0 for word in words])

    # NOTE(review): apostrophes are stripped from the URIs, presumably to match
    # how the catalog stores them — confirm against the index
    e_uris = [e['uri'].replace("'", "") for e in q['entity mapping']]
    correct_entities_uris.append(e_uris)
    try:
        e_ids = [e_index.look_up_by_uri(uri, top=1)[0]['_source']['id'] for uri in e_uris]
    except IndexError:
        # one unresolved entity discards the ids of the whole question
        e_ids = []
    correct_entities_ids.append(e_ids)

    if 'answers' in q:
        a_uris = [e_uri.replace("'", "") for e_uri in q['answers']]
    else:
        a_uris = []
    correct_answers_uris.append(a_uris)

    a_ids = []
    for uri in a_uris:
        try:
            a_ids.append(e_index.look_up_by_uri(uri, top=1)[0]['_source']['id'])
        except (IndexError, KeyError):
            # no hit (empty result list) or a hit without an id: count it as
            # missing. Anything else (e.g. Ctrl-C, connection errors) is not
            # a "not found" and is allowed to propagate.
            not_found_cnt += 1
            print("%s not found in the entity catalog"%uri)
    correct_answers_ids.append(a_ids)

dataset_size = len(questions)

print(not_found_cnt)
print("Loaded %d %s questions"%(dataset_size, dataset_name))

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Preparing lcquad dataset
http://dbpedia.org/resource/Batman_(Terry_McGinnis) not found in the entity catalog
http://dbpedia.org/resource/Bryce_DeWitt not found in the entity catalog
http://dbpedia.org/resource/MapleMusic_Recordings not found in the entity catalog
http://dbpedia.org/resource/OrAnG_Rekod not found in the entity catalog
http://dbpedia.org/resource/UgEXPLODE_Records not found in the entity catalog
http://dbpedia.org/resource/DiscReet_Records not found in the entity catalog
http://dbpedia.org/resource/HagenBaden_235_(band) not found in the entity catalog
http://dbpedia.org/resource/Catherine_MacLellan not found in the entity catalog
http://dbpedia.org/resource/SuperStar_(Czech_and_Slovak_TV_series) not found in the entity catalog
http://dbpedia.org/resource/McClain_(band) not found in the entity catalog
http://dbpedia.org/resource/Lawrie_McMenemy not found in the entity catalog
http://dbpedia.org/resource/Bois_dOrange_River not found in the entity catalog
http://dbpedia.org

http://dbpedia.org/resource/Dorothy_McKibbin not found in the entity catalog
http://dbpedia.org/resource/OrgSync not found in the entity catalog
http://dbpedia.org/resource/OwnLocal not found in the entity catalog
http://dbpedia.org/resource/ReliantHeart_Inc. not found in the entity catalog
http://dbpedia.org/resource/GoodPop not found in the entity catalog
http://dbpedia.org/resource/DataSplice not found in the entity catalog
http://dbpedia.org/resource/ZipRecruiter not found in the entity catalog
http://dbpedia.org/resource/McKesson_Corporation not found in the entity catalog
http://dbpedia.org/resource/BitTorrent_(company) not found in the entity catalog
http://dbpedia.org/resource/TheFind.com not found in the entity catalog
http://dbpedia.org/resource/SpringSource not found in the entity catalog
http://dbpedia.org/resource/McGuff_Companies not found in the entity catalog
http://dbpedia.org/resource/NeoAccel not found in the entity catalog
http://dbpedia.org/resource/NetBase_Solutio

http://dbpedia.org/resource/AppNeta not found in the entity catalog
http://dbpedia.org/resource/OptiRTC not found in the entity catalog
http://dbpedia.org/resource/MetroWest_Regional_Transit_Authority not found in the entity catalog
http://dbpedia.org/resource/AquaBounty_Technologies not found in the entity catalog
http://dbpedia.org/resource/TransAmerican_Power_Products_CRV_Open not found in the entity catalog
http://dbpedia.org/resource/RocketShip_Tours not found in the entity catalog
http://dbpedia.org/resource/LavaRnd not found in the entity catalog
http://dbpedia.org/resource/McMurry_reaction not found in the entity catalog
http://dbpedia.org/resource/McCabe–Thiele_method not found in the entity catalog
http://dbpedia.org/resource/OS_X not found in the entity catalog
http://dbpedia.org/resource/John_McCain_presidential_campaign,_2008 not found in the entity catalog
http://dbpedia.org/resource/My_Dad,_John_McCain not found in the entity catalog
http://dbpedia.org/resource/MongoDB_I

http://dbpedia.org/resource/JoBoxers not found in the entity catalog
http://dbpedia.org/resource/SoundGirl not found in the entity catalog
http://dbpedia.org/resource/AlunaGeorge not found in the entity catalog
http://dbpedia.org/resource/CardLab not found in the entity catalog
http://dbpedia.org/resource/Rodney_McCray_(basketball) not found in the entity catalog
http://dbpedia.org/resource/Paul_McCracken_(basketball) not found in the entity catalog
http://dbpedia.org/resource/Doug_McDermott not found in the entity catalog
http://dbpedia.org/resource/Billy_McKinney_(basketball) not found in the entity catalog
http://dbpedia.org/resource/CheviReddy_Bhaskar_Reddy not found in the entity catalog
http://dbpedia.org/resource/Shri_ChandraPal_Verma not found in the entity catalog
http://dbpedia.org/resource/I.A.S. not found in the entity catalog
http://dbpedia.org/resource/Bois_dOrange_River not found in the entity catalog
http://dbpedia.org/resource/Grande_Rivière_de_lAnse_la_Raye not found 

http://dbpedia.org/resource/HSwMS_Sjöormen_(Sor) not found in the entity catalog
http://dbpedia.org/resource/HMCyS_Vijaya not found in the entity catalog
http://dbpedia.org/resource/HSC_FastCat_Ryde not found in the entity catalog
http://dbpedia.org/resource/HSC_FastCat_Shanklin not found in the entity catalog
http://dbpedia.org/resource/HNoMS_Thor_(1872) not found in the entity catalog
http://dbpedia.org/resource/HSwMS_Dristigheten not found in the entity catalog
http://dbpedia.org/resource/HSwMS_Fylgia not found in the entity catalog
http://dbpedia.org/resource/HNoMS_Gyller_(1938) not found in the entity catalog
http://dbpedia.org/resource/HNoMS_Odin_(1939) not found in the entity catalog
http://dbpedia.org/resource/HNoMS_Sleipner_(1936) not found in the entity catalog
http://dbpedia.org/resource/HNoMS_Æger_(1936) not found in the entity catalog
http://dbpedia.org/resource/HMS_Port_dEspagne_(1806) not found in the entity catalog
http://dbpedia.org/resource/HNoMS_Vidar_(N52) not found

http://dbpedia.org/resource/The_Fairly_OddParents:_Breakin_da_Rules not found in the entity catalog
http://dbpedia.org/resource/KonLive_Distribution not found in the entity catalog
http://dbpedia.org/resource/Ellen_DeGeneres:_Here_and_Now not found in the entity catalog
http://dbpedia.org/resource/Buddy_Holly:_Listen_to_Me;_The_Ultimate_Buddy_Party not found in the entity catalog
http://dbpedia.org/resource/SciGirls not found in the entity catalog
http://dbpedia.org/resource/H.E.L.P. not found in the entity catalog
http://dbpedia.org/resource/Tasty_Time_with_ZeFronk not found in the entity catalog
http://dbpedia.org/resource/Cash_McCall not found in the entity catalog
http://dbpedia.org/resource/Dangerous_Dan_McFoo not found in the entity catalog
http://dbpedia.org/resource/Legend_of_the_Guardians:_The_Owls_of_GaHoole not found in the entity catalog
http://dbpedia.org/resource/The_McConnell_Story not found in the entity catalog
http://dbpedia.org/resource/SkullGizzy(Gamer) not found in

http://dbpedia.org/resource/Glenn_MacDonald not found in the entity catalog
http://dbpedia.org/resource/Mary_McCarthy_(author) not found in the entity catalog
http://dbpedia.org/resource/Joseph_DeFilippis not found in the entity catalog
http://dbpedia.org/resource/Frank_McGee_(journalist) not found in the entity catalog
http://dbpedia.org/resource/Rob_LeDonne not found in the entity catalog
http://dbpedia.org/resource/McKensie_Garber not found in the entity catalog
http://dbpedia.org/resource/Kyle_McCarter not found in the entity catalog
http://dbpedia.org/resource/Lenny_McAllister not found in the entity catalog
http://dbpedia.org/resource/Ann_McNamee not found in the entity catalog
http://dbpedia.org/resource/Matthew_VanDyke not found in the entity catalog
http://dbpedia.org/resource/Andy_LoCascio not found in the entity catalog
http://dbpedia.org/resource/Alison_McCreary not found in the entity catalog
http://dbpedia.org/resource/R._Clayton_McWhorter not found in the entity catalog


http://dbpedia.org/resource/McCain_Institute not found in the entity catalog
http://dbpedia.org/resource/EveryoneOn not found in the entity catalog
http://dbpedia.org/resource/OpenGov_Foundation not found in the entity catalog
http://dbpedia.org/resource/AudioNow not found in the entity catalog
http://dbpedia.org/resource/CardHub.com not found in the entity catalog
http://dbpedia.org/resource/McCormick_House_(Washington,_D.C.) not found in the entity catalog
http://dbpedia.org/resource/FasterCures not found in the entity catalog
http://dbpedia.org/resource/IdeaScale not found in the entity catalog
http://dbpedia.org/resource/HiPNOTT_Records not found in the entity catalog
http://dbpedia.org/resource/Ramsey_Clark__VoteToImpeach__1 not found in the entity catalog
http://dbpedia.org/resource/MacLaren_Youth_Correctional_Facility not found in the entity catalog
http://dbpedia.org/resource/OpenAutonomy not found in the entity catalog
http://dbpedia.org/resource/OntoWiki not found in the enti

http://dbpedia.org/resource/Neil_S._McCarthy not found in the entity catalog
http://dbpedia.org/resource/Curtis_McElhinney not found in the entity catalog
http://dbpedia.org/resource/McNary,_Texas not found in the entity catalog
http://dbpedia.org/resource/Adobe_PageMill not found in the entity catalog
http://dbpedia.org/resource/OpenVibe not found in the entity catalog
http://dbpedia.org/resource/TigerVNC not found in the entity catalog
http://dbpedia.org/resource/FreeFem++ not found in the entity catalog
http://dbpedia.org/resource/OpenMPT not found in the entity catalog
http://dbpedia.org/resource/WhiskerControl not found in the entity catalog
http://dbpedia.org/resource/TetGen not found in the entity catalog
http://dbpedia.org/resource/CipherShed not found in the entity catalog
http://dbpedia.org/resource/GnucDNA not found in the entity catalog
http://dbpedia.org/resource/PackageForge not found in the entity catalog
http://dbpedia.org/resource/DrawPlus not found in the entity catal

http://dbpedia.org/resource/DeSoto_Custom not found in the entity catalog
http://dbpedia.org/resource/Balgowan,_KwaZulu-Natal not found in the entity catalog
http://dbpedia.org/resource/ShelleyDevoto not found in the entity catalog
http://dbpedia.org/resource/T.I. not found in the entity catalog
http://dbpedia.org/resource/On_An_On not found in the entity catalog
http://dbpedia.org/resource/TermBase_eXchange not found in the entity catalog
http://dbpedia.org/resource/Paul_McCartney_and_Wings not found in the entity catalog
http://dbpedia.org/resource/M.O.P. not found in the entity catalog
http://dbpedia.org/resource/W.E.L.T. not found in the entity catalog
http://dbpedia.org/resource/W.A.S.P. not found in the entity catalog
http://dbpedia.org/resource/L.E.O. not found in the entity catalog
http://dbpedia.org/resource/U.P.O. not found in the entity catalog
http://dbpedia.org/resource/R.E.M. not found in the entity catalog
http://dbpedia.org/resource/Oh_Ok not found in the entity catalog

http://dbpedia.org/resource/Annie_McGuire_(TV_series) not found in the entity catalog
http://dbpedia.org/resource/TeenNick_Top_10 not found in the entity catalog
http://dbpedia.org/resource/Hardcastle_and_McCormick not found in the entity catalog
http://dbpedia.org/resource/WWF_LiveWire not found in the entity catalog
http://dbpedia.org/resource/U_to_U not found in the entity catalog
http://dbpedia.org/resource/NHL_on_SportsChannel_America not found in the entity catalog
http://dbpedia.org/resource/Daily_News_Live_(Comcast_SportsNet) not found in the entity catalog
http://dbpedia.org/resource/SportsCentury not found in the entity catalog
http://dbpedia.org/resource/McKenna_(TV_series) not found in the entity catalog
http://dbpedia.org/resource/WOW:_The_CatholicTV_Challenge not found in the entity catalog
http://dbpedia.org/resource/IndyCar_Series_on_ABC not found in the entity catalog
http://dbpedia.org/resource/Fantasy_Fix_(Comcast_SportsNet) not found in the entity catalog
http://dbp

http://dbpedia.org/resource/Matt_LaBounty not found in the entity catalog
http://dbpedia.org/resource/R._W._McQuarters not found in the entity catalog
http://dbpedia.org/resource/Vance_McDonald not found in the entity catalog
http://dbpedia.org/resource/MacGruder_and_Loud not found in the entity catalog
http://dbpedia.org/resource/Kate_McShane not found in the entity catalog
http://dbpedia.org/resource/Batman_(Terry_McGinnis) not found in the entity catalog
http://dbpedia.org/resource/Heinrich_Louis_dArrest not found in the entity catalog
http://dbpedia.org/resource/McCurtain_County,_Oklahoma not found in the entity catalog
http://dbpedia.org/resource/Camps-sur-lAgly not found in the entity catalog
http://dbpedia.org/resource/M.A.N.T.I.S. not found in the entity catalog
http://dbpedia.org/resource/McChord_Field not found in the entity catalog
http://dbpedia.org/resource/McGuire_Air_Force_Base not found in the entity catalog
http://dbpedia.org/resource/OpenGL_Architecture_Review_Board n

http://dbpedia.org/resource/Louisiana_IceGators_(SPHL) not found in the entity catalog
http://dbpedia.org/resource/Dauphins_dÉpinal not found in the entity catalog
http://dbpedia.org/resource/Bryce_DeWitt not found in the entity catalog
http://dbpedia.org/resource/Dorothy_McKibbin not found in the entity catalog
http://dbpedia.org/resource/WakeMed_Soccer_Park not found in the entity catalog
http://dbpedia.org/resource/Charles_Thomas_McMillen not found in the entity catalog
http://dbpedia.org/resource/LeRon_Ellis not found in the entity catalog
http://dbpedia.org/resource/OrgSync not found in the entity catalog
http://dbpedia.org/resource/AutoBidsOnline not found in the entity catalog
http://dbpedia.org/resource/OwnLocal not found in the entity catalog
http://dbpedia.org/resource/ActivTrak not found in the entity catalog
http://dbpedia.org/resource/ReliantHeart_Inc. not found in the entity catalog
http://dbpedia.org/resource/WidgetCo,_Inc. not found in the entity catalog
http://dbpedia.

http://dbpedia.org/resource/OrgSync not found in the entity catalog
http://dbpedia.org/resource/AutoBidsOnline not found in the entity catalog
http://dbpedia.org/resource/OwnLocal not found in the entity catalog
http://dbpedia.org/resource/ActivTrak not found in the entity catalog
http://dbpedia.org/resource/ReliantHeart_Inc. not found in the entity catalog
http://dbpedia.org/resource/WidgetCo,_Inc. not found in the entity catalog
http://dbpedia.org/resource/GenSpera not found in the entity catalog
http://dbpedia.org/resource/TruTV_Presents:_Worlds_Dumbest... not found in the entity catalog
http://dbpedia.org/resource/Isola_del_Gran_Sasso_dItalia not found in the entity catalog
http://dbpedia.org/resource/WarioWare,_Inc.:_Mega_Microgames! not found in the entity catalog
http://dbpedia.org/resource/Zodas_Revenge:_StarTropics_II not found in the entity catalog
http://dbpedia.org/resource/Ys_I_&_II not found in the entity catalog
http://dbpedia.org/resource/Moero_TwinBee:_Cinnamon-hakase_

http://dbpedia.org/resource/U_to_U not found in the entity catalog
http://dbpedia.org/resource/100_Deeds_for_Eddie_McDowd not found in the entity catalog
http://dbpedia.org/resource/Todd_McFarlanes_Spawn not found in the entity catalog
http://dbpedia.org/resource/Geico_SportsNite not found in the entity catalog
http://dbpedia.org/resource/Daily_News_Live_(SportsNet_New_York) not found in the entity catalog
http://dbpedia.org/resource/McClain_(band) not found in the entity catalog
http://dbpedia.org/resource/McKeever_and_the_Colonel not found in the entity catalog
http://dbpedia.org/resource/The_McLean_Stevenson_Show not found in the entity catalog
http://dbpedia.org/resource/A.U.S.A. not found in the entity catalog
http://dbpedia.org/resource/In_the_Loop_with_iVillage not found in the entity catalog
http://dbpedia.org/resource/McDuff,_the_Talking_Dog not found in the entity catalog
http://dbpedia.org/resource/Meet_McGraw not found in the entity catalog
http://dbpedia.org/resource/The_S

http://dbpedia.org/resource/James_McEwen not found in the entity catalog
http://dbpedia.org/resource/Bill_McGarry not found in the entity catalog
http://dbpedia.org/resource/Malky_MacDonald not found in the entity catalog
http://dbpedia.org/resource/Frank_McLintock not found in the entity catalog
http://dbpedia.org/resource/Alan_McLeary not found in the entity catalog
http://dbpedia.org/resource/Bob_McRoberts not found in the entity catalog
http://dbpedia.org/resource/Jimmy_McMullan not found in the entity catalog
http://dbpedia.org/resource/Sean_McAuley not found in the entity catalog
http://dbpedia.org/resource/Jimmy_McIlroy not found in the entity catalog
http://dbpedia.org/resource/John_McGovern_(footballer) not found in the entity catalog
http://dbpedia.org/resource/Thomas_H._McIntosh not found in the entity catalog
http://dbpedia.org/resource/Billy_McEwan_(footballer,_born_1951) not found in the entity catalog
http://dbpedia.org/resource/Jim_McAnearney not found in the entity cat

http://dbpedia.org/resource/John_Patrick_McNaughton_Barn not found in the entity catalog
http://dbpedia.org/resource/NorthPark_Mall_(Oklahoma) not found in the entity catalog
http://dbpedia.org/resource/Fort_McCulloch not found in the entity catalog
http://dbpedia.org/resource/McGee_Creek_State_Park not found in the entity catalog
http://dbpedia.org/resource/McCurtain_County_Wilderness_Area not found in the entity catalog
http://dbpedia.org/resource/WeGoLook not found in the entity catalog
http://dbpedia.org/resource/Museu_Barbier-Mueller_dArt_Precolombí not found in the entity catalog
http://dbpedia.org/resource/Palau_Municipal_dEsports_de_Badalona not found in the entity catalog
http://dbpedia.org/resource/CineEurope not found in the entity catalog
http://dbpedia.org/resource/Aiguamolls_de_lEmpordà not found in the entity catalog
http://dbpedia.org/resource/Torre_PwC not found in the entity catalog
http://dbpedia.org/resource/Basílica_de_Santa_Maria_de_Castelló_dEmpúries not found in

http://dbpedia.org/resource/DigiCel_FlipBook not found in the entity catalog
http://dbpedia.org/resource/BuildAMation not found in the entity catalog
http://dbpedia.org/resource/OpenMPT not found in the entity catalog
http://dbpedia.org/resource/FalconView not found in the entity catalog
http://dbpedia.org/resource/Ulead_DVD_MovieFactory not found in the entity catalog
http://dbpedia.org/resource/VistaPro not found in the entity catalog
http://dbpedia.org/resource/MapInfo_Professional not found in the entity catalog
http://dbpedia.org/resource/EffectsLab_Pro not found in the entity catalog
http://dbpedia.org/resource/MapGuide_Open_Source not found in the entity catalog
http://dbpedia.org/resource/TestPartner not found in the entity catalog
http://dbpedia.org/resource/Buzans_iMindMap not found in the entity catalog
http://dbpedia.org/resource/SynfiniWay not found in the entity catalog
http://dbpedia.org/resource/WhiskerControl not found in the entity catalog
http://dbpedia.org/resource/

http://dbpedia.org/resource/WebSharper not found in the entity catalog
http://dbpedia.org/resource/@MAX_SyncUp not found in the entity catalog
http://dbpedia.org/resource/TapeTrack_Tape_Management_Framework not found in the entity catalog
http://dbpedia.org/resource/ProRealTime not found in the entity catalog
http://dbpedia.org/resource/MiniTool_Partition_Wizard not found in the entity catalog
http://dbpedia.org/resource/IBM_WebSphere_Application_Server not found in the entity catalog
http://dbpedia.org/resource/NetCDF_Operators not found in the entity catalog
http://dbpedia.org/resource/Xiph_QuickTime_Components not found in the entity catalog
http://dbpedia.org/resource/LoLiWin not found in the entity catalog
http://dbpedia.org/resource/BackupHDDVD not found in the entity catalog
http://dbpedia.org/resource/PeopleTools not found in the entity catalog
http://dbpedia.org/resource/NonVisual_Desktop_Access not found in the entity catalog
http://dbpedia.org/resource/MadCat_Media_Browser n

http://dbpedia.org/resource/MassTransit-Project not found in the entity catalog
http://dbpedia.org/resource/UltraMixer not found in the entity catalog
http://dbpedia.org/resource/OpenBLAS not found in the entity catalog
http://dbpedia.org/resource/BlueStacks__App_Player__1 not found in the entity catalog
http://dbpedia.org/resource/JetBrains__CLion__1 not found in the entity catalog
http://dbpedia.org/resource/CleVR__Stitcher__1 not found in the entity catalog
http://dbpedia.org/resource/Fluendo__LongoMatch__1 not found in the entity catalog
http://dbpedia.org/resource/William_C._McCool not found in the entity catalog
http://dbpedia.org/resource/Bruce_McCandless_II not found in the entity catalog
http://dbpedia.org/resource/Jon_McBride not found in the entity catalog
http://dbpedia.org/resource/Donald_R._McMonagle not found in the entity catalog
http://dbpedia.org/resource/James_McDivitt not found in the entity catalog
http://dbpedia.org/resource/Michael_J._McCulley not found in the en

http://dbpedia.org/resource/ArcSDE not found in the entity catalog
http://dbpedia.org/resource/OpenEMR not found in the entity catalog
http://dbpedia.org/resource/IconBuilder not found in the entity catalog
http://dbpedia.org/resource/LeechFTP not found in the entity catalog
http://dbpedia.org/resource/Microsoft_Visual_SourceSafe not found in the entity catalog
http://dbpedia.org/resource/FreePCB not found in the entity catalog
http://dbpedia.org/resource/WorldsAway not found in the entity catalog
http://dbpedia.org/resource/DrawPlus not found in the entity catalog
http://dbpedia.org/resource/DotSVN not found in the entity catalog
http://dbpedia.org/resource/JobScheduler not found in the entity catalog
http://dbpedia.org/resource/HP_QuickTest_Professional not found in the entity catalog
http://dbpedia.org/resource/BeaTunes not found in the entity catalog
http://dbpedia.org/resource/PySynth not found in the entity catalog
http://dbpedia.org/resource/ForeUI not found in the entity catalo

http://dbpedia.org/resource/SquidNT not found in the entity catalog
http://dbpedia.org/resource/WinUSB not found in the entity catalog
http://dbpedia.org/resource/AMD_CodeAnalyst not found in the entity catalog
http://dbpedia.org/resource/Go-oo not found in the entity catalog
http://dbpedia.org/resource/NDSTokyoTrim not found in the entity catalog
http://dbpedia.org/resource/StarDraw not found in the entity catalog
http://dbpedia.org/resource/PreSonus_Studio_One not found in the entity catalog
http://dbpedia.org/resource/SlimBrowser not found in the entity catalog
http://dbpedia.org/resource/SUSE_Studio_ImageWriter not found in the entity catalog
http://dbpedia.org/resource/EventMachine not found in the entity catalog
http://dbpedia.org/resource/PowerMILL not found in the entity catalog
http://dbpedia.org/resource/OpenElement not found in the entity catalog
http://dbpedia.org/resource/EXeLearning not found in the entity catalog
http://dbpedia.org/resource/CircuitMaker not found in the 

http://dbpedia.org/resource/BurningMUD not found in the entity catalog
http://dbpedia.org/resource/B.A.T.M.A.N. not found in the entity catalog
http://dbpedia.org/resource/MIDletPascal not found in the entity catalog
http://dbpedia.org/resource/LuaTeX not found in the entity catalog
http://dbpedia.org/resource/OpenPAM not found in the entity catalog
http://dbpedia.org/resource/OpenDNSSEC not found in the entity catalog
http://dbpedia.org/resource/SeaBIOS not found in the entity catalog
http://dbpedia.org/resource/OpenCaster not found in the entity catalog
http://dbpedia.org/resource/ZeroVM not found in the entity catalog
http://dbpedia.org/resource/Haiku_PackageInstaller not found in the entity catalog
http://dbpedia.org/resource/SourceMeter not found in the entity catalog
http://dbpedia.org/resource/NaviServer not found in the entity catalog
http://dbpedia.org/resource/WorldsAway not found in the entity catalog
http://dbpedia.org/resource/MediaLib not found in the entity catalog
http:

http://dbpedia.org/resource/FreeCast_(software) not found in the entity catalog
http://dbpedia.org/resource/NetDynamics_Inc. not found in the entity catalog
http://dbpedia.org/resource/MetaMachine not found in the entity catalog
http://dbpedia.org/resource/FreeHEP not found in the entity catalog
http://dbpedia.org/resource/OpenLink_Software not found in the entity catalog
http://dbpedia.org/resource/CreateTank not found in the entity catalog
http://dbpedia.org/resource/TriOviz not found in the entity catalog
http://dbpedia.org/resource/FreeMED_Software_Foundation not found in the entity catalog
http://dbpedia.org/resource/GeoSolutions not found in the entity catalog
http://dbpedia.org/resource/OpenPlans not found in the entity catalog
http://dbpedia.org/resource/TigerLogic not found in the entity catalog
http://dbpedia.org/resource/TheBrain_Technologies not found in the entity catalog
http://dbpedia.org/resource/OSEHRA_popHealth_Community not found in the entity catalog
http://dbpedia.

http://dbpedia.org/resource/MacJournal not found in the entity catalog
http://dbpedia.org/resource/Fluendo__LongoMatch__1 not found in the entity catalog
http://dbpedia.org/resource/Golden_Globe_Cecil_B._DeMille_Award not found in the entity catalog
http://dbpedia.org/resource/McNary,_Texas not found in the entity catalog
http://dbpedia.org/resource/A.A.R.M. not found in the entity catalog
http://dbpedia.org/resource/Ronnie_DeVoe not found in the entity catalog
http://dbpedia.org/resource/B.o.B not found in the entity catalog
http://dbpedia.org/resource/Jack_McVea not found in the entity catalog
http://dbpedia.org/resource/Dorothy_LaBostrie not found in the entity catalog
http://dbpedia.org/resource/Andy_McCluskey not found in the entity catalog
http://dbpedia.org/resource/Ralph_MacDonald not found in the entity catalog
http://dbpedia.org/resource/McFadden_&_Whitehead not found in the entity catalog
http://dbpedia.org/resource/Rose_Marie_McCoy not found in the entity catalog
http://dbp

http://dbpedia.org/resource/RiverBrink_Art_Museum not found in the entity catalog
http://dbpedia.org/resource/McMichael_Canadian_Art_Collection not found in the entity catalog
http://dbpedia.org/resource/InterContinental_Toronto_Centre not found in the entity catalog
http://dbpedia.org/resource/DundeeWealth not found in the entity catalog
http://dbpedia.org/resource/Tait_McKenzie_Centre not found in the entity catalog
http://dbpedia.org/resource/ZoomerMedia not found in the entity catalog
http://dbpedia.org/resource/ThoughtSpeed_Corporation not found in the entity catalog
http://dbpedia.org/resource/CaseWare_International not found in the entity catalog
http://dbpedia.org/resource/DeMoulas_Market_Basket not found in the entity catalog
http://dbpedia.org/resource/CareSouth not found in the entity catalog
http://dbpedia.org/resource/NetEqualizer not found in the entity catalog
http://dbpedia.org/resource/TriQuint_Semiconductor not found in the entity catalog
http://dbpedia.org/resource/T

http://dbpedia.org/resource/SpiderFab not found in the entity catalog
http://dbpedia.org/resource/IBM_Lotus_iNotes not found in the entity catalog
http://dbpedia.org/resource/MapInfo_Professional not found in the entity catalog
http://dbpedia.org/resource/TransModeler not found in the entity catalog
http://dbpedia.org/resource/RemObjects_Hydra not found in the entity catalog
http://dbpedia.org/resource/WorldWideWhiteboard not found in the entity catalog
http://dbpedia.org/resource/NetTutor not found in the entity catalog
http://dbpedia.org/resource/MyAcademicWorkshop not found in the entity catalog
http://dbpedia.org/resource/StorTrends not found in the entity catalog
http://dbpedia.org/resource/NeuroSolutions not found in the entity catalog
http://dbpedia.org/resource/TradingSolutions not found in the entity catalog
http://dbpedia.org/resource/ContourProfile_Gel_breast_implants not found in the entity catalog
http://dbpedia.org/resource/MemoryGel_breast_implants not found in the entit

http://dbpedia.org/resource/Thomas_McNamara_(soccer) not found in the entity catalog
http://dbpedia.org/resource/Tullio_DeSantis not found in the entity catalog
http://dbpedia.org/resource/Paul_McCarthy not found in the entity catalog
http://dbpedia.org/resource/Mount_McAdie not found in the entity catalog
http://dbpedia.org/resource/McConnell_Peak not found in the entity catalog
http://dbpedia.org/resource/MacFarlanes_bear not found in the entity catalog
http://dbpedia.org/resource/MacConnells_bat not found in the entity catalog
http://dbpedia.org/resource/MacConnells_climbing_mouse not found in the entity catalog
http://dbpedia.org/resource/Paul_McCartney_and_Wings not found in the entity catalog
http://dbpedia.org/resource/Brian_McComas not found in the entity catalog
http://dbpedia.org/resource/Lila_McCann not found in the entity catalog
http://dbpedia.org/resource/The_JaneDear_girls not found in the entity catalog
http://dbpedia.org/resource/McBride_&_the_Ride not found in the ent

http://dbpedia.org/resource/SimCity_(1989_video_game) not found in the entity catalog
http://dbpedia.org/resource/CyberMage:_Darklight_Awakening not found in the entity catalog
http://dbpedia.org/resource/SimCity_Creator not found in the entity catalog
http://dbpedia.org/resource/The_SimCity_Box not found in the entity catalog
http://dbpedia.org/resource/DeathSpank:_Thongs_of_Virtue not found in the entity catalog
http://dbpedia.org/resource/SimCity_Social not found in the entity catalog
http://dbpedia.org/resource/M.U.L.E. not found in the entity catalog
http://dbpedia.org/resource/CyberTiger not found in the entity catalog
http://dbpedia.org/resource/MySims_SkyHeroes not found in the entity catalog
http://dbpedia.org/resource/SimSafari not found in the entity catalog
http://dbpedia.org/resource/Clue_(iOS_game) not found in the entity catalog
http://dbpedia.org/resource/The_Sims_2:_FreeTime not found in the entity catalog
http://dbpedia.org/resource/Rock_Band_(iOS) not found in the en

http://dbpedia.org/resource/TortoiseGit not found in the entity catalog
http://dbpedia.org/resource/ImageVis3D_Mobile not found in the entity catalog
http://dbpedia.org/resource/StrixDB not found in the entity catalog
http://dbpedia.org/resource/CimTrak not found in the entity catalog
http://dbpedia.org/resource/OpenUniverse not found in the entity catalog
http://dbpedia.org/resource/Cocaine_(PaaS) not found in the entity catalog
http://dbpedia.org/resource/GetFEM++ not found in the entity catalog
http://dbpedia.org/resource/LaplacesDemon not found in the entity catalog
http://dbpedia.org/resource/RethinkDB not found in the entity catalog
http://dbpedia.org/resource/NetExpert not found in the entity catalog
http://dbpedia.org/resource/AcetoneISO not found in the entity catalog
http://dbpedia.org/resource/AndreaMosaic not found in the entity catalog
http://dbpedia.org/resource/SLinCA@Home not found in the entity catalog
http://dbpedia.org/resource/UltraMixer not found in the entity cata

http://dbpedia.org/resource/PacketVideo not found in the entity catalog
http://dbpedia.org/resource/MediNotes not found in the entity catalog
http://dbpedia.org/resource/MegaVision_(cameras) not found in the entity catalog
http://dbpedia.org/resource/DomainTools.com not found in the entity catalog
http://dbpedia.org/resource/UbiCare not found in the entity catalog
http://dbpedia.org/resource/DoctorBase not found in the entity catalog
http://dbpedia.org/resource/GreenBytes not found in the entity catalog
http://dbpedia.org/resource/PropertyRoom.com not found in the entity catalog
http://dbpedia.org/resource/RotoHog__Sports_Composite_DE,_Inc.__1 not found in the entity catalog
http://dbpedia.org/resource/The_Quick_Draw_McGraw_Show not found in the entity catalog
http://dbpedia.org/resource/Hot_Wheels_AcceleRacers not found in the entity catalog
http://dbpedia.org/resource/Mike_McCready not found in the entity catalog
http://dbpedia.org/resource/Tim_McIlrath not found in the entity catalo

In [3]:
# show sample question
i = 5
# print the i-th entry of every per-question list, one per line
for _sample_field in (questions, correct_e_spans, y_e,
                      correct_entities_uris, correct_entities_ids,
                      correct_answers_uris, correct_answers_ids):
    print(_sample_field[i])

Which royalty was married to ptolemy XIII Theos Philopator and had mother named Cleopatra V ?
['cleopatra v', 'ptolemy xiii theos philopator']
[0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1]
['http://dbpedia.org/resource/Cleopatra_V_of_Egypt', 'http://dbpedia.org/resource/Ptolemy_XIII_Theos_Philopator']
[8078673, 18811966]
['http://dbpedia.org/resource/Cleopatra']
[4845538]


## Correct spans

Estimate the upper bound on the performance of the entity scoring function when it is given the correct entity spans

In [15]:
# check if the correct entities are in the subgraph 1-hop away from the top entities
# location of the knowledge graph (HDT file) opened by evaluate_entity_ranking
# NOTE(review): the recorded output of the evaluation cells is
# "AttributeError: 'hdt.HDTDocument' object has no attribute 'configure_hops'",
# so the installed hdt build apparently lacks the hop API used below - confirm which build is required
from hdt import HDTDocument
hdt_path = "/mpqa_new/indexing/"  # directory containing the HDT file
hdt_file = 'dbpedia2016-04en.hdt'  # HDT file name, appended to hdt_path
namespace = "http://dbpedia.org/"  # passed to kg.configure_hops


def evaluate_entity_ranking(_e_spans, indices, top_n):
    '''
    Estimate ranking accuracy (recall) on a sample of questions:
    _e_spans <list> entity span strings per question, indexed by question index
    indices <iterable of int> indices of the questions to evaluate on
    top_n <int> threshold for the number of top entities retrieved per span

    Returns [r_entities, r_entities_1hop, r_answers_1hop], the fractions of the
    evaluated questions for which
      * all correct entity ids are among the ids matched in the entity index,
      * all correct entity ids are among the entities returned by kg.compute_hops,
      * all correct answer ids are among the entities returned by kg.compute_hops.
    All three are 0.0 when indices is empty.

    Reads the notebook globals e_index, correct_entities_ids, correct_answers_ids,
    hdt_path, hdt_file and namespace.
    NOTE(review): configure_hops / compute_hops / remove are not shown in this file;
    the recorded run fails with "'hdt.HDTDocument' object has no attribute
    'configure_hops'" - confirm the required hdt build.
    '''
    n_evaluated = 0
    n_correct_entities, n_correct_entities_1hop = 0, 0
    n_correct_answers_1hop = 0
    # match entities
    for i in indices:
        n_evaluated += 1

        # entities index lookup: ids of the top_n label matches of every span
        top_e_ids = [match['_source']['id']
                     for span in _e_spans[i]
                     for match in e_index.match_label(span, top=top_n)]

        gold_entities = set(correct_entities_ids[i])
        if gold_entities.issubset(top_e_ids):
            n_correct_entities += 1

        # extract a subgraph for top entities
        kg = HDTDocument(hdt_path+hdt_file)
        # all predicates: 1 hop
        kg.configure_hops(1, [], namespace, True)
        entities, _, _ = kg.compute_hops(top_e_ids)
        entities = set(entities)
        if gold_entities.issubset(entities):
            n_correct_entities_1hop += 1
        if set(correct_answers_ids[i]).issubset(entities):
            n_correct_answers_1hop += 1
        kg.remove()

    # normalise by the number of questions actually evaluated rather than by a
    # notebook-level sample size that may not match the indices passed in
    if n_evaluated == 0:
        return [0.0, 0.0, 0.0]

    r_entities = float(n_correct_entities) / n_evaluated
    r_entities_1hop = float(n_correct_entities_1hop) / n_evaluated
    r_answers_1hop = float(n_correct_answers_1hop) / n_evaluated

    return [r_entities, r_entities_1hop, r_answers_1hop]


# evaluation setup: sample size and the [first, last] top-n thresholds to sweep
n_samples = 500
top_n_range = [1, 5]
print("Entity match recall estimated on %d questions @%d"%(n_samples, top_n_range[1]))
# random sample of question indices: shuffle all of them, keep the first n_samples
from random import shuffle
index_shuf = [*range(dataset_size)]
shuffle(index_shuf)
del index_shuf[n_samples:]
assert len(index_shuf) == n_samples

Entity match recall estimated on 500 questions @5


In [16]:
# evaluate on correct entity spans
# sweep top_n over top_n_range (inclusive) on the sampled questions in index_shuf
top_n = top_n_range[0]
results = [[0, 0, 0]]  # recalls at 0
while top_n <= top_n_range[1]:
    results.append(evaluate_entity_ranking(correct_e_spans, index_shuf, top_n))
    top_n += 1
    
# show result
# row index = top_n (row 0 is the placeholder above); columns 0, 1, 2 follow the
# order of the list returned by evaluate_entity_ranking
import pandas as pd
results = pd.DataFrame(results)
print(results)

# plot
# one line per column of results
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use("ggplot")
plt.figure(figsize=(12,12))
plt.plot(results[0], label='Entity match')
plt.plot(results[1], label='1-hop entity match')
plt.plot(results[2], label='1-hop answer match')
plt.legend()
plt.show()

AttributeError: 'hdt.HDTDocument' object has no attribute 'configure_hops'

## Extracted spans

Estimate performance on the entity spans extracted by the mention extraction model

In [31]:
# load pre-trained entity mention extraction model
# embeddings identifier; together with dataset_name it forms the name of the
# model settings pickle that is loaded further down in this cell
embeddings_choice = 'glove6B100d'

from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
from keras.optimizers import Adam

def build_model(model_settings):
    '''
    Assemble and compile the entity mention tagger (embedding -> biLSTM -> dense -> CRF).
    model_settings <dict> must provide 'max_len', 'n_words', 'emb_dim', 'embeddings' and 'n_tags'
    Prints the model summary and returns the compiled model.
    '''
    max_len = model_settings['max_len']

    # architecture
    inputs = Input(shape=(max_len,))
    # embedding layer initialised with the supplied weights and kept fixed; index 0 is masked
    embedding = Embedding(input_dim=model_settings['n_words']+1,
                          output_dim=model_settings['emb_dim'],
                          weights=[model_settings['embeddings']],
                          input_length=max_len, mask_zero=True, trainable=False)
    hidden = embedding(inputs)
    # variational biLSTM
    lstm = LSTM(units=50, return_sequences=True, recurrent_dropout=0.1)
    hidden = Bidirectional(lstm)(hidden)
    # a dense layer as suggested by neuralNer
    dense = Dense(50, activation="relu")
    hidden = TimeDistributed(dense)(hidden)
    # CRF layer produces the output
    crf = CRF(model_settings['n_tags'])
    outputs = crf(hidden)

    tagger = Model(inputs, outputs)
    tagger.compile(optimizer=Adam(lr=0.0001), loss=crf.loss_function, metrics=[crf.accuracy])
    tagger.summary()
    return tagger

# load model settings (pickled dict) for the chosen dataset and embeddings
import pickle as pkl
settings_path = '%s_%s.pkl'%(dataset_name, embeddings_choice)
with open(settings_path, 'rb') as f:
    model_settings = pkl.load(f)
model = build_model(model_settings)

# load weights of the pre-trained model into the freshly built architecture
model_name = 'entity_model'
weights_path = '../models/'+model_name+'.h5'
model.load_weights(weights_path)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        (None, 25)                0         
_________________________________________________________________
embedding_10 (Embedding)     (None, 25, 100)           711400    
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 25, 100)           60400     
_________________________________________________________________
time_distributed_10 (TimeDis (None, 25, 50)            5050      
_________________________________________________________________
crf_10 (CRF)                 (None, 25, 2)             110       
Total params: 776,960
Trainable params: 65,560
Non-trainable params: 711,400
_________________________________________________________________


In [32]:
# evaluate entity span detection
import numpy as np
from keras.preprocessing.sequence import pad_sequences

def _collect_spans(words, tags):
    '''Join each maximal run of words whose predicted tag is > 0 into one span string.'''
    spans, current = [], []
    for w, tag in zip(words, tags):
        if tag > 0:
            current.append(w)
        elif current:
            spans.append(" ".join(current))
            current = []
    # add last span
    if current:
        spans.append(" ".join(current))
    return spans


def evaluate_entity_span_extraction(show_errors=False):
    '''
    Run the mention extraction model on every question and compare the
    extracted entity spans with the correct ones.

    show_errors <bool> print extracted vs. correct span sets for every mismatch

    Prints the share of questions whose extracted span set equals the correct
    span set (0.00 when there are no questions) and returns the list of
    extracted spans per question, in the order of question_words.

    Reads the notebook globals question_words, correct_e_spans, model and
    model_settings ('word2idx', 'max_len').
    '''
    word2idx = model_settings['word2idx']
    max_len = model_settings['max_len']
    # Out-of-vocabulary words must map to an integer index: the previous string
    # fallback 'unk' cannot be packed into the integer input array.
    # NOTE(review): assumes the unknown-word entry is keyed 'unk'; falls back to
    # the padding index 0 if it is missing - confirm against the vocabulary.
    unk_idx = word2idx.get('unk', 0)

    n_correct = 0
    questions_e_spans = []
    for i, words in enumerate(question_words):
        x_test_sent = pad_sequences(sequences=[[word2idx.get(w, unk_idx) for w in words]],
                                    padding="post", value=0, maxlen=max_len)
        p = model.predict(np.array([x_test_sent[0]]))
        tags = np.argmax(p, axis=-1)[0]

        # pad_sequences truncates from the front by default, so for questions
        # longer than max_len the predictions belong to the last max_len words
        e_spans = _collect_spans(words[-max_len:], tags)

        if set(correct_e_spans[i]) == set(e_spans):
            n_correct += 1
        elif show_errors:
            print('\n')
            print(set(e_spans))
            # show correct spans
            print(set(correct_e_spans[i]))
        questions_e_spans.append(e_spans)

    # accuracy over the questions actually processed
    n_questions = len(questions_e_spans)
    acc = float(n_correct) / n_questions if n_questions else 0.0
    print("\nAcc: %.2f "%(acc))
    return questions_e_spans

# evaluate
# run span extraction and keep the extracted spans per question;
# they are passed to evaluate_entity_ranking in the next cell
print("Accuracy estimated on %d questions"%(dataset_size))
extracted_e_spans = evaluate_entity_span_extraction()

Accuracy estimated on 4998 questions

Acc: 0.51 


In [33]:
# evaluate on extracted entity spans (extracted_e_spans from the previous cell)
# sweep top_n over top_n_range (inclusive) on the sampled questions in index_shuf
top_n = top_n_range[0]
results = [[0, 0, 0]]  # recalls at 0
while top_n <= top_n_range[1]:
    results.append(evaluate_entity_ranking(extracted_e_spans, index_shuf, top_n))
    top_n += 1
    
# show result
# row index = top_n (row 0 is the placeholder above); columns 0, 1, 2 follow the
# order of the list returned by evaluate_entity_ranking
import pandas as pd
results = pd.DataFrame(results)
print(results)

# plot
# one line per column of results
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use("ggplot")
plt.figure(figsize=(12,12))
plt.plot(results[0], label='Entity match')
plt.plot(results[1], label='1-hop entity match')
plt.plot(results[2], label='1-hop answer match')
plt.legend()
plt.show()

AttributeError: 'hdt.HDTDocument' object has no attribute 'configure_hops'