# Mentions Extraction for KBQA

Parse a question into the text spans that mention entities and predicates.

bi-LSTM + CRF  
https://github.com/UKPLab/emnlp2017-bilstm-cnn-crf  
https://www.depends-on-the-definition.com/sequence-tagging-lstm-crf/  
https://github.com/SNUDerek/multiLSTM  

In [2]:
# setup
# Dataset and embeddings identifiers; later cells combine them into file names
# (the dataset JSON and the pickled model settings).
dataset_name = 'lcquad'
embeddings_choice = 'glove840B300d'

import numpy as np
# set seed
# NumPy and TensorFlow are seeded here, before Keras is imported further down.
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)

import os
# The relative paths used later ("../data", "../checkpoints", "../models")
# are resolved against this working directory.
os.chdir('/mpqa/KBQA/src')

# load word frequencies
import pickle
#wfd = pickle.load(open("wfd.pkl", "rb"))

# make sure Keras is using the GPU
# NOTE(review): this calls a private helper of the TensorFlow backend
# (leading underscore) - presumably only present in some Keras versions; confirm.
from keras import backend
assert len(backend.tensorflow_backend._get_available_gpus()) > 0

# load pre-trained word embeddings
# NOTE(review): the star import may rebind names defined above; only
# `Magnitude` is used from it in this cell.
from pymagnitude import *
embeddings_path = "/mpqa/KBQA/data/embeddings/"
# Maps an `embeddings_choice` key to the file name below `embeddings_path`.
embeddings = {'glove840B300d': "glove.840B.300d.magnitude"}
vectors = Magnitude(embeddings_path + embeddings[embeddings_choice])

In [None]:
# load lcquad span annotations
# wget https://raw.githubusercontent.com/AskNowQA/EARL/master/data/lcquad.json
import json
with open("../data/%s.json"%dataset_name, "r", encoding='utf-8') as file:
    qas = json.load(file)

# prepare data for entity and predicate mention extraction models training via sequence tagging
limit = 5000
from keras.preprocessing.text import text_to_word_sequence


def _io_tags(words, mappings):
    """Return one IO tag per question word.

    A word is tagged 1 when it occurs in the label of any mapping, else 0.

    The labels are tokenised with the same function as the question
    (`text_to_word_sequence`) so that both sides are normalised identically;
    tokenising labels with a plain `.lower().split()` keeps punctuation that
    the question tokens no longer carry, and such words would never match.
    The mention vocabulary is built once per question as a set instead of
    being rebuilt for every word.

    words    -- list of question tokens
    mappings -- list of dicts, each with a 'label' string
    """
    mention_words = {token
                     for mapping in mappings
                     for token in text_to_word_sequence(mapping['label'])}
    return [1 if word in mention_words else 0 for word in words]


questions = []
n_words_distr = []  # number of tokens per question, used later to derive max_len

e_spans = []
y_e = []
correct_entities_uris = []

p_spans = []
y_p = []
correct_predicates_uris = []

for q in qas[:limit]:
    # parse question
    question_o = q['question']
    words = text_to_word_sequence(question_o)
    n_words_distr.append(len(words))
    questions.append(words)

    # generate IO tags from mention spans
    # (the *_spans lists keep the whitespace-split labels for inspection)
    entity_spans = [e['label'].lower().split() for e in q['entity mapping']]
    e_spans.append(entity_spans)
    y_e.append(_io_tags(words, q['entity mapping']))
    correct_entities_uris.append([e['uri'] for e in q['entity mapping']])

    predicate_spans = [e['label'].lower().split() for e in q['predicate mapping']]
    p_spans.append(predicate_spans)
    y_p.append(_io_tags(words, q['predicate mapping']))
    correct_predicates_uris.append([e['uri'] for e in q['predicate mapping']])

dataset_size = len(questions)
print("Loaded %d/%d %s questions"%(dataset_size, len(qas), dataset_name))

In [None]:
# show one annotated question: its text, then for entities and for predicates
# the mention spans, the IO tags and the gold URIs
i = 1
question_o = qas[i]['question']
print(question_o)
for spans, tags, uris in ((e_spans, y_e, correct_entities_uris),
                          (p_spans, y_p, correct_predicates_uris)):
    print('\n')
    print(spans[i])
    print(tags[i])
    print(uris[i])

In [None]:
# load pre-trained embeddings for question words
# The vocabulary is sorted so that every run assigns the same index to the
# same word; iterating over a bare set yields a different order per process,
# which would make saved settings/weights inconsistent with a re-run.
words = sorted({word for q in questions for word in q})
n_words = len(words)
print("Number of unique words %d"%len(words))
# index 0 is reserved for padding, so word indices start at 1
word2idx = {w: i + 1 for i, w in enumerate(words)}

# dataset parameters for training the model
max_len = max(n_words_distr)
print("Maximum question length in the dataset: %d"%max_len)
n_tags = 2

# prepare data and pad the max length with 0s
from keras.preprocessing.sequence import pad_sequences
X = [[word2idx[w] for w in s] for s in questions]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=0)

# load embeddings into matrix; row 0 (padding) stays all zeros
import math
word_embedding_matrix = np.zeros((n_words+1, vectors.dim))

n_oov = 0

for w in word2idx:
    # get the word vector from the embedding model
    if w in vectors:
        word_vector = vectors.query(w)
    # OOV word: fall back to the vector queried for the token 'unk'
    else:
        n_oov += 1
        word_vector = vectors.query('unk')
    word_embedding_matrix[word2idx[w]] = word_vector

# number of vocabulary words that were not found in the embedding model
print('%d OOV words'%n_oov)

model_settings = {'embeddings': word_embedding_matrix, 'word2idx': word2idx,
                  'max_len': max_len, 'n_words': n_words, 'n_tags': n_tags, 'emb_dim': vectors.dim}

# save model settings; the context manager closes the file even if dumping fails
import pickle as pkl
with open('%s_%s.pkl'%(dataset_name, embeddings_choice), 'wb') as f:
    pkl.dump(model_settings, f, -1)
print("Model settings saved.")

In [None]:
# build biLSTM-CRF model for mention extraction
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
from keras.optimizers import Adam

def build_model(model_settings):
    """Build, compile and summarise the biLSTM-CRF sequence tagger.

    model_settings is a dict from which the keys 'max_len', 'n_words',
    'emb_dim', 'embeddings' and 'n_tags' are read.

    Returns the compiled Keras model; its summary is printed as a side effect.
    """
    seq_len = model_settings['max_len']

    # token indices -> frozen pre-trained embeddings (index 0 is masked)
    inputs = Input(shape=(seq_len,))
    embedding_layer = Embedding(input_dim=model_settings['n_words']+1,
                                output_dim=model_settings['emb_dim'],
                                weights=[model_settings['embeddings']],
                                input_length=seq_len, mask_zero=True, trainable=False)
    embedded = embedding_layer(inputs)

    # variational biLSTM
    recurrent = Bidirectional(LSTM(units=50, return_sequences=True,
                                   recurrent_dropout=0.1))
    hidden = recurrent(embedded)
    # a dense layer as suggested by neuralNer
    hidden = TimeDistributed(Dense(50, activation="relu"))(hidden)

    # CRF output layer; it also supplies the loss and the accuracy metric
    crf = CRF(model_settings['n_tags'])
    out = crf(hidden)

    model = Model(inputs, out)
    model.compile(optimizer=Adam(lr=0.0001), loss=crf.loss_function, metrics=[crf.accuracy])
    model.summary()
    return model

model = build_model(model_settings)

## Entity mentions

In [None]:
# set up training data for the entity mention span model
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

modelname = 'entity_model'

# pad the tag sequences to max_len with 0s, then one-hot encode every sequence
y = pad_sequences(maxlen=max_len, sequences=y_e, padding="post", value=0)
y = [to_categorical(tag_sequence, num_classes=n_tags) for tag_sequence in y]

# hold out 10% of the questions for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
print("Training on %d samples testing on %d samples" % (len(X_train), len(X_test)))

In [None]:
# callbacks
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, TerminateOnNaN, ModelCheckpoint
# min_lr has to be lower than the optimizer's initial learning rate
# (Adam(lr=0.0001) in build_model); with min_lr equal to the initial rate the
# scheduler could never lower it and the callback was a no-op.
cb_redlr = ReduceLROnPlateau(monitor='val_crf_viterbi_accuracy', factor=0.5, patience=3, min_lr=0.000001, verbose=1)
# stop when the validation accuracy has not improved for 5 epochs
cb_early = EarlyStopping(monitor='val_crf_viterbi_accuracy', min_delta=0, patience=5, verbose=1)
# NOTE(review): no `monitor` is passed here, so save_best_only uses the
# callback's default monitored quantity rather than the accuracy that appears
# in the file name - confirm this is intended.
cb_chkpt = ModelCheckpoint('../checkpoints/_'+modelname+'{epoch:02d}-{val_crf_viterbi_accuracy:.2f}.h5', verbose=1, save_best_only=True, save_weights_only=True, period=5)

callbacks_list=[cb_redlr, cb_early, cb_chkpt]

# start training; 10% of the training subset is used for validation
log = model.fit(X_train, np.array(y_train), batch_size=32, epochs=50,
                callbacks=callbacks_list,
                validation_split=0.1, verbose=1)

In [None]:
# plot accuracies
import pandas as pd
hist = pd.DataFrame(log.history)
# print(hist)
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use("ggplot")
plt.figure(figsize=(12,12))
plt.plot(hist["crf_viterbi_accuracy"], label='Training accuracy')
plt.plot(hist["val_crf_viterbi_accuracy"], label='Validation accuracy')
plt.legend()
plt.show()

In [None]:
# persist the trained model: architecture as JSON, weights as HDF5
from keras_contrib.utils import save_load_utils

architecture_json = model.to_json()
with open('../models/' + modelname + '.json', 'w') as f:
    f.write(architecture_json)

weights_path = '../models/'+modelname+'.h5'
save_load_utils.save_all_weights(model, weights_path)

In [None]:
# rebuild the network from the pickled settings
import pickle as pkl
settings_path = '%s_%s.pkl'%(dataset_name, embeddings_choice)
with open(settings_path, 'rb') as f:
    model_settings = pkl.load(f)
model = build_model(model_settings)

# restore the trained weights
model.load_weights('../models/'+modelname+'.h5')

# predict tag scores for the held-out questions
test_pred = model.predict(X_test, verbose=1)

# turn per-token score vectors (predicted and gold) into 'I'/'O' labels
idx2tag = {1: 'I', 0: 'O'}
pred_labels = [[idx2tag[np.argmax(scores)] for scores in sequence] for sequence in test_pred]
test_labels = [[idx2tag[np.argmax(scores)] for scores in sequence] for sequence in y_test]

# report weighted F1, precision and recall
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
for template, metric in (("F1-score: {:.1%}", f1_score),
                         ("Precision-score: {:.1%}", precision_score),
                         ("Recall-score: {:.1%}", recall_score)):
    print(template.format(metric(test_labels, pred_labels, average='weighted')))

In [None]:
# print gold vs. predicted tag for every word of one test question
# (the sample index i is the one assigned in an earlier cell)
sample_batch = np.array([X_test[i]])
p = np.argmax(model.predict(sample_batch), axis=-1)
true = np.argmax(y_test[i], -1)

header = "{:15}||{:5}||{}".format("Word", "True", "Pred")
print(header)
print("=" * 30)
for w, t, pred in zip(X_test[i], true, p[0]):
    if w == 0:
        # index 0 is padding, nothing to show
        continue
    print("{:15}: {:5} {}".format(words[w-1], t, pred))

## Predicate mentions

In [None]:
# train predicate mention span model
y = y_p
modelname = 'predicate_model'

# Start from a freshly initialised network. Without this the `model` object
# still holds the weights of the entity model trained and reloaded above, so
# the predicate model would just continue training the entity tagger.
model = build_model(model_settings)

# prepare data: pad tag sequences to max_len with 0s and one-hot encode them
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=0)
from keras.utils import to_categorical
y = [to_categorical(i, num_classes=n_tags) for i in y]

# split dataset into training and testing subsets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
print("Training on %d samples testing on %d samples" % (len(X_train), len(X_test)))

In [None]:
# callbacks
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, TerminateOnNaN, ModelCheckpoint
# min_lr has to be lower than the optimizer's initial learning rate
# (Adam(lr=0.0001) in build_model); with min_lr equal to the initial rate the
# scheduler could never lower it and the callback was a no-op.
cb_redlr = ReduceLROnPlateau(monitor='val_crf_viterbi_accuracy', factor=0.5, patience=3, min_lr=0.000001, verbose=1)
# stop when the validation accuracy has not improved for 5 epochs
cb_early = EarlyStopping(monitor='val_crf_viterbi_accuracy', min_delta=0, patience=5, verbose=1)
# NOTE(review): no `monitor` is passed here, so save_best_only uses the
# callback's default monitored quantity rather than the accuracy that appears
# in the file name - confirm this is intended.
cb_chkpt = ModelCheckpoint('../checkpoints/_'+modelname+'{epoch:02d}-{val_crf_viterbi_accuracy:.2f}.h5', verbose=1, save_best_only=True, save_weights_only=True, period=5)

callbacks_list=[cb_redlr, cb_early, cb_chkpt]

# start training; 10% of the training subset is used for validation
log = model.fit(X_train, np.array(y_train), batch_size=32, epochs=50,
                callbacks=callbacks_list,
                validation_split=0.1, verbose=1)

In [None]:
# plot accuracies
import pandas as pd
hist = pd.DataFrame(log.history)
# print(hist)
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use("ggplot")
plt.figure(figsize=(12,12))
plt.plot(hist["crf_viterbi_accuracy"], label='Training accuracy')
plt.plot(hist["val_crf_viterbi_accuracy"], label='Validation accuracy')
plt.legend()
plt.show()

In [None]:
# persist the trained model: architecture as JSON, weights as HDF5
from keras_contrib.utils import save_load_utils
architecture_json = model.to_json()
with open('../models/' + modelname + '.json', 'w') as f:
    f.write(architecture_json)
save_load_utils.save_all_weights(model, '../models/'+modelname+'.h5')

# read the weights back from the file that was just written
model.load_weights('../models/'+modelname+'.h5')

# predict tag scores for the held-out questions
test_pred = model.predict(X_test, verbose=1)

# turn per-token score vectors (predicted and gold) into 'I'/'O' labels
idx2tag = {1: 'I', 0: 'O'}
pred_labels = [[idx2tag[np.argmax(scores)] for scores in sequence] for sequence in test_pred]
test_labels = [[idx2tag[np.argmax(scores)] for scores in sequence] for sequence in y_test]

# report the weighted F1 score
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
weighted_f1 = f1_score(test_labels, pred_labels, average='weighted')
print("F1-score: {:.1%}".format(weighted_f1))

In [None]:
# print gold vs. predicted tag for every word of one test question
i = 1
sample_batch = np.array([X_test[i]])
p = np.argmax(model.predict(sample_batch), axis=-1)
true = np.argmax(y_test[i], -1)

header = "{:15}||{:5}||{}".format("Word", "True", "Pred")
print(header)
print("=" * 30)
for w, t, pred in zip(X_test[i], true, p[0]):
    if w == 0:
        # index 0 is padding, nothing to show
        continue
    print("{:15}: {:5} {}".format(words[w-1], t, pred))