# Entity candidates

In [1]:
# setup
# name of the QA dataset; used below as the MongoDB collection name and
# as part of the model-settings file name
dataset_name = 'lcquad'

import os
# make the project source directory the working directory so that the
# relative paths used later in the notebook resolve against it
os.chdir('/mpqa/KBQA/src')

# connect to the entity catalog indexed with Elasticsearch (Lucene-based)
from elasticsearch import Elasticsearch
from urllib.parse import quote

class IndexSearch:
    '''
    Small helper around one Elasticsearch index that holds the entity catalog.
    '''

    def __init__(self, index_name):
        '''
        index_name <str> name of the Elasticsearch index to query
        '''
        # set up ES connection (client created with its default settings)
        self.es = Elasticsearch()
        self.index = index_name
        # document type sent with every search request
        self.type = 'terms'

    def match_label(self, string, top=100):
        '''
        Full-text search over the label fields of the index.

        string <str> text to match against the labels
        top <int> maximum number of hits to request

        Returns the list stored under ['hits']['hits'] of the search response.
        '''
        label_query = {
            "multi_match": {
                "query": string,
                "operator": "and",
                "fields": ["label^10", "label.ngrams"],
            }
        }
        response = self.es.search(index=self.index,
                                  body={"query": label_query},
                                  size=top,
                                  doc_type=self.type)
        return response['hits']['hits']

    def look_up_by_uri(self, uri, top=1):
        '''
        Exact look-up of a catalog entry by its URI.

        uri <str> entity URI; it is percent-encoded (keeping '():/,' as is)
                  before being used in the term query
        top <int> maximum number of hits to request

        Returns the hits of the term query, or, when there are none, the single
        best label match for the last path segment of the URI.
        '''
        encoded_uri = quote(uri, safe='():/,')
        response = self.es.search(index=self.index,
                                  body={"query": {"term": {"uri": encoded_uri}}},
                                  size=top,
                                  doc_type=self.type)
        hits = response['hits']['hits']
        if hits:
            return hits

        # fall back to label match on the last URI segment
        last_segment = uri.split('/')[-1]
        return self.match_label(last_segment, top=1)


# search handle for the entity catalog stored in the 'dbpedia201604e' index
e_index = IndexSearch('dbpedia201604e')

# set up connection to the MongoDB where the QA dataset is stored
# sudo service mongod start (27017 is the default port)
from pymongo import MongoClient
import json
import pprint

class Mongo_Connector():
    '''
    Wrapper class for some of the pymongo functions: http://api.mongodb.com/python/current/tutorial.html
    '''

    def __init__(self, db_name, col_name):
        '''
        db_name <str> name of the database to open
        col_name <str> name of the collection inside that database
        '''
        # spin up database (client created with its default settings)
        client = MongoClient()
        database = client[db_name]
        self.mongo_client = client
        self.db = database
        self.col = database[col_name]
        print("Connection success.")

    def count_all_docs(self):
        '''Print how many documents the collection holds.'''
        n_docs = self.col.count_documents({})
        print("%d docs" % n_docs)

    def load_json(self, json_file_path):
        '''
        Read a JSON file, print the number of documents in it and insert
        them all into the collection.
        '''
        with open(json_file_path, "r") as source:
            documents = json.load(source)
        print("%d docs" % len(documents))
        self.col.insert_many(documents)

    def show_example(self):
        '''Pretty-print one document of the collection.'''
        example = self.col.find_one()
        pprint.pprint(example)

    def get_sample(self, sample_size=100):
        '''
        Return a cursor over the collection, limited to sample_size documents;
        a falsy sample_size (None or 0) leaves the cursor unlimited.
        '''
        everything = self.col.find()
        return everything.limit(sample_size) if sample_size else everything

# connector to the collection named after the dataset inside the 'kbqa' database
mongo = Mongo_Connector('kbqa', dataset_name)

Connection success.


In [2]:
# load lcquad samples from MongoDB
limit = None  # maximum number of questions per query; None reads the whole collection

# prepare data for entity and predicate mention extraction models training via sequence tagging
import urllib.parse
from keras.preprocessing.text import text_to_word_sequence
# the retry loop below catches this exception, so it has to be in scope
from pymongo.errors import CursorNotFound

# parallel lists: position k in every list refers to the same question
questions = []
question_words = []
n_words_distr = []

correct_e_spans = []
y_e = []
correct_entities_uris = []
correct_entities_ids = []

correct_answers_uris = []
correct_answers_ids = []


print("Preparing %s dataset"%dataset_name)

total_cnt = 0  # number of answer URIs looked up in the entity catalog
fail_cnt = 0   # number of answer URIs for which the look-up failed

processed = 0  # number of questions fully processed; used as skip offset on retry

while True:
    # (re)open the cursor after the questions that are already processed
    qas = mongo.get_sample(limit).skip(processed)
    try:
        for q in qas:
            # parse question
            question_o = q['question']
            questions.append(question_o)
            words = text_to_word_sequence(question_o)
            n_words_distr.append(len(words))
            question_words.append(words)

            # generate IO tags from mention spans: a word is tagged 1 when it
            # occurs in any of the entity labels of the question, 0 otherwise
            entity_spans = [e['label'].lower() for e in q['entity mapping']]
            correct_e_spans.append(entity_spans)
            # built once per question instead of once per word
            entity_words = {token for entity_span in entity_spans for token in entity_span.split()}
            y_e.append([1 if word in entity_words else 0 for word in words])

            e_uris = [e['uri'].replace("'", "") for e in q['entity mapping']]
            correct_entities_uris.append(e_uris)
            try:
                e_ids = [e_index.look_up_by_uri(uri, top=1)[0]['_source']['id'] for uri in e_uris]
            except IndexError:
                # at least one entity has no catalog entry: keep no ids for this question
                e_ids = []
            correct_entities_ids.append(e_ids)

            if 'answers' in q:
                a_uris = [e_uri.replace("'", "") for e_uri in q['answers']]
            else:
                a_uris = []
            correct_answers_uris.append(a_uris)

            a_ids = []
            for uri in a_uris:
                total_cnt += 1
                try:
                    a_ids.append(e_index.look_up_by_uri(uri, top=1)[0]['_source']['id'])
                except Exception:
                    # best effort: any failed look-up is only counted, but a
                    # KeyboardInterrupt is no longer swallowed
                    # print("%s not found in the entity catalog"%uri)
                    fail_cnt += 1
            correct_answers_ids.append(a_ids)
            processed += 1
        break
    except CursorNotFound:
        print(f"Lost cursor. Retry with skip {processed} items")

dataset_size = len(questions)

print(total_cnt, fail_cnt)
print("Loaded %d %s questions"%(dataset_size, dataset_name))

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Preparing lcquad dataset
303131 3271
Loaded 4998 lcquad questions


In [3]:
# show sample question
i = 5
print(questions[i])
print(correct_e_spans[i])
print(y_e[i])
print(correct_entities_uris[i])
print(correct_entities_ids[i])
print(correct_answers_uris[i])
print(correct_answers_ids[i])

Which royalty was married to ptolemy XIII Theos Philopator and had mother named Cleopatra V ?
['cleopatra v', 'ptolemy xiii theos philopator']
[0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1]
['http://dbpedia.org/resource/Cleopatra_V_of_Egypt', 'http://dbpedia.org/resource/Ptolemy_XIII_Theos_Philopator']
[8078673, 18811966]
['http://dbpedia.org/resource/Cleopatra']
[8078652]


## Correct spans

Estimate the upper bound on the performance of the entity scoring function when it is given the correct (gold) entity spans.

In [4]:
# check if the correct entities are in the subgraph 1-hop away from the top entities
# path to KG
from hdt import HDTDocument
# directory that holds the HDT file
hdt_path = "/mpqa/indexing/"
# HDT file name; appended to hdt_path when the HDTDocument is opened
hdt_file = 'dbpedia2016-04en.hdt'
# passed as the third argument of kg.configure_hops
namespace = "http://dbpedia.org/"


def evaluate_entity_ranking(_e_spans, indices, top_n):
    '''
    Estimate entity ranking recall on a sample of questions.

    _e_spans: entity mention strings per question, addressed by question index
    indices: question indices to evaluate on
    top_n <int> threshold for the number of top entities requested per span

    Returns [r_entities, r_entities_1hop, r_answers_1hop], each a fraction of
    the evaluated questions:
      r_entities       all correct entity ids are among the top matches
      r_entities_1hop  all correct entity ids are among the entities returned
                       by the 1-hop expansion of the top matches
      r_answers_1hop   all correct answer ids are among those entities
    Returns [0.0, 0.0, 0.0] when indices is empty.
    '''
    indices = list(indices)
    # normalise by the questions actually evaluated, not by a notebook global
    n_questions = len(indices)
    if n_questions == 0:
        return [0.0, 0.0, 0.0]

    n_correct_entities, n_correct_entities_1hop = 0, 0
    n_correct_answers_1hop = 0
    # match entities
    for i in indices:
        # entities index lookup: ids of the top matches for every span
        top_e_ids = [match['_source']['id']
                     for span in _e_spans[i]
                     for match in e_index.match_label(span, top=top_n)]

        # NOTE: a question with no correct entity ids (empty list) counts as a
        # hit here and below, because the empty set is a subset of any set
        gold_entities = set(correct_entities_ids[i])
        if gold_entities.issubset(set(top_e_ids)):
            n_correct_entities += 1

        # extract a subgraph for top entities
        kg = HDTDocument(hdt_path+hdt_file)
        try:
            # all predicates: 1 hop
            kg.configure_hops(1, [], namespace, True, True)
            # NOTE(review): 500000 and 0 look like a limit and an offset - confirm
            entities, _, _ = kg.compute_hops(top_e_ids, 500000, 0)
            reachable = set(entities)
            if gold_entities.issubset(reachable):
                n_correct_entities_1hop += 1
            if set(correct_answers_ids[i]).issubset(reachable):
                n_correct_answers_1hop += 1
        finally:
            # release the document even when the hop computation fails
            kg.remove()

    r_entities = float(n_correct_entities) / n_questions
    r_entities_1hop = float(n_correct_entities_1hop) / n_questions
    r_answers_1hop = float(n_correct_answers_1hop) / n_questions

    return [r_entities, r_entities_1hop, r_answers_1hop]


# define sample size for evaluation
n_samples = 500
top_n_range = [1, 5]  # first and last value of top_n to evaluate (inclusive)
print("Entity match recall estimated on %d questions @%d"%(n_samples, top_n_range[1]))
# shuffle dataset to get a random sample
from random import Random, shuffle
# fixed seed: the same questions are drawn on every run, so the recalls are
# reproducible and comparable between experiments
sample_seed = 42
index_shuf = list(range(dataset_size))
Random(sample_seed).shuffle(index_shuf)
index_shuf = index_shuf[:n_samples]
assert len(index_shuf) == n_samples, \
    "dataset holds %d questions, cannot sample %d" % (dataset_size, n_samples)

Entity match recall estimated on 500 questions @5


In [5]:
# evaluate on correct entity spans
top_n = top_n_range[0]
results = [[0, 0, 0]]  # recalls at 0
while top_n <= top_n_range[1]:
    results.append(evaluate_entity_ranking(correct_e_spans, index_shuf, top_n))
    top_n += 1
    
# show result
import pandas as pd
results = pd.DataFrame(results)
print(results)

# plot
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use("ggplot")
plt.figure(figsize=(12,12))
plt.plot(results[0], label='Entity match')
plt.plot(results[1], label='1-hop entity match')
plt.plot(results[2], label='1-hop answer match')
plt.legend()
plt.show()

GET http://localhost:9200/dbpedia201604e/terms/_search?size=1 [status:N/A request:10.028s]
Traceback (most recent call last):
  File "/mpqa/miniconda3/envs/kbqa/lib/python3.6/site-packages/urllib3/connectionpool.py", line 384, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "/mpqa/miniconda3/envs/kbqa/lib/python3.6/site-packages/urllib3/connectionpool.py", line 380, in _make_request
    httplib_response = conn.getresponse()
  File "/mpqa/miniconda3/envs/kbqa/lib/python3.6/http/client.py", line 1379, in getresponse
    response.begin()
  File "/mpqa/miniconda3/envs/kbqa/lib/python3.6/http/client.py", line 311, in begin
    version, status, reason = self._read_status()
  File "/mpqa/miniconda3/envs/kbqa/lib/python3.6/http/client.py", line 272, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/mpqa/miniconda3/envs/kbqa/lib/python3.6/socket.py", line 586, in readinto
    return self._sock.recv_into(b)
socket.t

ConnectionTimeout: ConnectionTimeout caused by - ReadTimeoutError(HTTPConnectionPool(host='localhost', port=9200): Read timed out. (read timeout=10))

#### Test `configure_hops` with its fifth argument (arg_4, zero-based) set to `False`

In [None]:
# check if the correct entities are in the subgraph 1-hop away from the top entities
# path to KG
from hdt import HDTDocument
# directory that holds the HDT file
hdt_path = "/mpqa/indexing/"
# HDT file name; appended to hdt_path when the HDTDocument is opened
hdt_file = 'dbpedia2016-04en.hdt'
# passed as the third argument of kg.configure_hops
namespace = "http://dbpedia.org/"


def evaluate_entity_ranking(_e_spans, indices, top_n):
    '''
    Estimate entity ranking recall on a sample of questions
    (variant that calls configure_hops with its last argument set to False).

    _e_spans: entity mention strings per question, addressed by question index
    indices: question indices to evaluate on
    top_n <int> threshold for the number of top entities requested per span

    Returns [r_entities, r_entities_1hop, r_answers_1hop], each a fraction of
    the evaluated questions:
      r_entities       all correct entity ids are among the top matches
      r_entities_1hop  all correct entity ids are among the entities returned
                       by the 1-hop expansion of the top matches
      r_answers_1hop   all correct answer ids are among those entities
    A question whose hop computation fails is reported and counts as a miss
    for the two 1-hop recalls. Returns [0.0, 0.0, 0.0] when indices is empty.
    '''
    indices = list(indices)
    # normalise by the questions actually evaluated, not by a notebook global
    n_questions = len(indices)
    if n_questions == 0:
        return [0.0, 0.0, 0.0]

    n_correct_entities, n_correct_entities_1hop = 0, 0
    n_correct_answers_1hop = 0
    # match entities
    for i in indices:
        # entities index lookup: ids of the top matches for every span
        top_e_ids = [match['_source']['id']
                     for span in _e_spans[i]
                     for match in e_index.match_label(span, top=top_n)]

        # NOTE: a question with no correct entity ids (empty list) counts as a
        # hit here and below, because the empty set is a subset of any set
        gold_entities = set(correct_entities_ids[i])
        if gold_entities.issubset(set(top_e_ids)):
            n_correct_entities += 1

        # extract a subgraph for top entities
        kg = HDTDocument(hdt_path+hdt_file)
        try:
            try:
                # all predicates: 1 hop
                kg.configure_hops(1, [], namespace, True, False)
                # NOTE(review): 500000 and 0 look like a limit and an offset - confirm
                entities, _, _ = kg.compute_hops(top_e_ids, 500000, 0)
                reachable = set(entities)
                if gold_entities.issubset(reachable):
                    n_correct_entities_1hop += 1
                if set(correct_answers_ids[i]).issubset(reachable):
                    n_correct_answers_1hop += 1
            finally:
                # release the document even when the hop computation fails
                kg.remove()
        except Exception as error:
            # best effort: keep evaluating, but report what actually went wrong
            # instead of labelling every failure as a timeout
            print('Hop computation failed for question %s: %r' % (i, error))

    r_entities = float(n_correct_entities) / n_questions
    r_entities_1hop = float(n_correct_entities_1hop) / n_questions
    r_answers_1hop = float(n_correct_answers_1hop) / n_questions

    return [r_entities, r_entities_1hop, r_answers_1hop]


# define sample size for evaluation
n_samples = 500
top_n_range = [1, 5]  # first and last value of top_n to evaluate (inclusive)
print("Entity match recall estimated on %d questions @%d"%(n_samples, top_n_range[1]))
# shuffle dataset to get a random sample
from random import Random, shuffle
# fixed seed: the same questions are drawn on every run, so the recalls are
# reproducible and comparable between experiments
sample_seed = 42
index_shuf = list(range(dataset_size))
Random(sample_seed).shuffle(index_shuf)
index_shuf = index_shuf[:n_samples]
assert len(index_shuf) == n_samples, \
    "dataset holds %d questions, cannot sample %d" % (dataset_size, n_samples)

In [None]:
# evaluate on correct entity spans
top_n = top_n_range[0]
results = [[0, 0, 0]]  # recalls at 0
while top_n <= top_n_range[1]:
    results.append(evaluate_entity_ranking(correct_e_spans, index_shuf, top_n))
    top_n += 1
    
# show result
import pandas as pd
results = pd.DataFrame(results)
print(results)

# plot
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use("ggplot")
plt.figure(figsize=(12,12))
plt.plot(results[0], label='Entity match')
plt.plot(results[1], label='1-hop entity match')
plt.plot(results[2], label='1-hop answer match')
plt.legend()
plt.show()

## Extracted spans

Estimate performance on the entity spans extracted by the mention extraction model.

In [None]:
# load pre-trained entity mention extraction model
# identifier of the word embeddings; forms part of the model-settings file name opened below
embeddings_choice = 'glove840B300d'

from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
from keras.optimizers import Adam

def build_model(model_settings):
    '''
    Assemble and compile the sequence tagging network, print its summary
    and return it.

    model_settings <dict> read for the keys 'max_len', 'n_words', 'emb_dim',
                          'embeddings' and 'n_tags'
    '''
    max_len = model_settings['max_len']

    # architecture
    token_ids = Input(shape=(max_len,))
    # frozen embedding layer initialised from the given weights; index 0 is masked
    embedded = Embedding(input_dim=model_settings['n_words']+1,
                         output_dim=model_settings['emb_dim'],
                         weights=[model_settings['embeddings']],
                         input_length=max_len,
                         mask_zero=True,
                         trainable=False)(token_ids)
    # variational biLSTM
    recurrent = LSTM(units=50, return_sequences=True, recurrent_dropout=0.1)
    encoded = Bidirectional(recurrent)(embedded)
    # a dense layer as suggested by neuralNer
    projected = TimeDistributed(Dense(50, activation="relu"))(encoded)
    # CRF layer producing the output
    crf = CRF(model_settings['n_tags'])
    tag_output = crf(projected)

    tagger = Model(token_ids, tag_output)
    tagger.compile(optimizer=Adam(lr=0.0001), loss=crf.loss_function, metrics=[crf.accuracy])
    tagger.summary()
    return tagger

# load model settings
# NOTE: unpickling executes code stored in the file - only load files you trust
import pickle as pkl
settings_path = '%s_%s.pkl'%(dataset_name, embeddings_choice)
with open(settings_path, 'rb') as settings_file:
    model_settings = pkl.load(settings_file)

# rebuild the architecture the weights belong to
model = build_model(model_settings)

# load weights
model_name = 'entity_model'
weights_path = '../models/'+model_name+'.h5'
model.load_weights(weights_path)

In [None]:
# evaluate entity span detection
import numpy as np
from keras.preprocessing.sequence import pad_sequences

def evaluate_entity_span_extraction(show_errors=False):
    '''
    Run the mention extraction model on every question and compare the
    extracted entity spans with the correct ones.

    show_errors <bool> when True, print the extracted and the correct span
                       sets of every question where they differ

    Prints the share of questions whose extracted span set equals the correct
    span set and returns the extracted spans, one list per question.
    '''
    word2idx = model_settings['word2idx']
    # out-of-vocabulary words must map to an integer id, not to the string 'unk'
    # NOTE(review): assumes the vocabulary has an 'unk' entry; 0 (the padding
    # value) is used when it has none - confirm against how word2idx was built
    unk_idx = word2idx.get('unk', 0)

    n_correct = 0
    questions_e_spans = []
    for i, words in enumerate(question_words):
        word_ids = [word2idx.get(w, unk_idx) for w in words]
        # NOTE(review): pad_sequences truncates from the front by default, so a
        # question longer than max_len would be misaligned with `words` - confirm
        # that max_len covers the longest question
        x_test_sent = pad_sequences(sequences=[word_ids],
                                    padding="post", value=0, maxlen=model_settings['max_len'])
        tag_scores = model.predict(np.array([x_test_sent[0]]))
        tags = np.argmax(tag_scores, axis=-1)[0]

        # group consecutive words with a positive tag into one span
        e_span, e_spans = [], []
        for w, pred in zip(words, tags):
            if pred > 0:
                e_span.append(w)
            elif e_span:
                e_spans.append(" ".join(e_span))
                e_span = []
        # add last span
        if e_span:
            e_spans.append(" ".join(e_span))

        if set(correct_e_spans[i]) == set(e_spans):
            n_correct += 1
        elif show_errors:
            print('\n')
            print(set(e_spans))
            # show correct spans
            print(set(correct_e_spans[i]))
        questions_e_spans.append(e_spans)

    # normalise by the questions actually evaluated; no division by zero on an empty dataset
    n_questions = len(question_words)
    accuracy = float(n_correct) / n_questions if n_questions else 0.0
    print("\nAcc: %.2f "%(accuracy))
    return questions_e_spans

# evaluate
print("Accuracy estimated on %d questions"%(dataset_size))
# extracted spans, one list per question; used as input of the entity ranking evaluation below
extracted_e_spans = evaluate_entity_span_extraction()

In [None]:
# evaluate on correct entity spans
top_n = top_n_range[0]
results = [[0, 0, 0]]  # recalls at 0
while top_n <= top_n_range[1]:
    results.append(evaluate_entity_ranking(extracted_e_spans, index_shuf, top_n))
    top_n += 1
    
# show result
import pandas as pd
results = pd.DataFrame(results)
print(results)

# plot
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use("ggplot")
plt.figure(figsize=(12,12))
plt.plot(results[0], label='Entity match')
plt.plot(results[1], label='1-hop entity match')
plt.plot(results[2], label='1-hop answer match')
plt.legend()
plt.show()