In [1]:
from __future__ import unicode_literals
import os
import re
import sys
sys.path.append("..")
import pickle
import util
import pandas as pd
from entity_link.features import feature_select
from stanfordcorenlp import StanfordCoreNLP
from nltk.tokenize import word_tokenize

In [10]:
class EntityDetect(object):
    """Extract keywords from a question and look them up in an entity-name table.

    Keywords come from the POS and NER annotations of a Stanford CoreNLP
    server; candidate entities are the names in ``name2id`` that contain a
    keyword as a whole word (and, optionally, in a large enough proportion).

    Attributes:
        nlp: ``StanfordCoreNLP`` client used for ``pos_tag`` and ``ner``.
        tag_list: POS tags whose tokens are kept as single-word keywords.
        tag_NN: POS tags that are merged into multi-word noun phrases.
        id2name: entity ID -> entity name, filled by ``loadData``.
        name2id: entity name -> list of entity IDs, filled by ``loadData``.
        strict: master strictness flag (kept for callers; not read here).
        keyword_strict: drop keywords that are substrings of other keywords.
        proportion_strict: enforce the ratio test in ``ProportionStrict``.
    """

    def __init__(self, host="http://10.60.1.82", port=9999, lang="en"):
        """Connect to the CoreNLP server and initialise empty lookup tables.

        Args:
            host: URL of the CoreNLP server.
            port: port of the CoreNLP server.
            lang: language code passed to the client.
        """
        self.nlp = StanfordCoreNLP(host, port=port, lang=lang)
        print("Stanford CoreNLP Server connnected ...")
        self.tag_list = ["FW", "NN", "NNP", "NNPS", "NNS"]
        self.tag_NN = ["NN", "NNP", "NNPS", "NNS"]
        self.id2name = {}
        self.name2id = {}
        # All strictness switches default to on.
        self.strict = True
        self.keyword_strict = True
        self.proportion_strict = True

    @staticmethod
    def _collect_runs(tagged, group_of):
        """Return space-joined maximal runs of consecutive same-group tokens.

        Args:
            tagged: sequence of ``(token, tag)`` pairs.
            group_of: maps a tag to a group key, or ``None`` when the token
                must not take part in any run.
        """
        phrases = []
        pos, total = 0, len(tagged)
        while pos < total:
            group = group_of(tagged[pos][1])
            if group is None:
                pos += 1
                continue
            start = pos
            while pos < total and group_of(tagged[pos][1]) == group:
                pos += 1
            phrases.append(" ".join(item[0] for item in tagged[start:pos]))
        return phrases

    def getKeyWords(self, question):
        """Return the set of keyword strings extracted from ``question``.

        The question is deliberately not lower-cased before tagging: some
        proper nouns are only recognisable by their capitalisation.

        Args:
            question: raw question text.

        Returns:
            A set of single-token and multi-token keywords. When
            ``keyword_strict`` is on, any keyword that is a proper substring
            of another keyword is dropped.
        """
        word_tag = self.nlp.pos_tag(question)
        ner_tag = self.nlp.ner(question)

        def pos_group(tag):
            # Foreign words and noun-like tags form two separate run groups.
            if tag == "FW":
                return "FW"
            if tag in self.tag_NN:
                return "NN"
            return None

        def ner_group(tag):
            # Every non-"O" NER label groups with identical neighbours.
            return None if tag == "O" else tag

        keyword_list = set()
        # Single keywords from POS tagging.
        keyword_list.update(item[0] for item in word_tag if item[1] in self.tag_list)
        # Multi-word keywords from POS tagging.
        keyword_list.update(self._collect_runs(word_tag, pos_group))
        # Single keywords from named-entity recognition.
        keyword_list.update(item[0] for item in ner_tag if item[1] != "O")
        # Multi-word keywords from named-entity recognition.
        keyword_list.update(self._collect_runs(ner_tag, ner_group))

        print("ori_keyword_list: "+ str(keyword_list))
        if self.keyword_strict:
            # Keep only keywords that are not contained in a different keyword.
            keyword_list = {
                item for item in keyword_list
                if not any(item in other and item != other for other in keyword_list)
            }
        print("cur_keyword_list: "+ str(keyword_list))
        return keyword_list

    def loadData(self, filepath):
        """Load ``ID,name`` lines from a UTF-8 text file into the lookup tables.

        Each line is split at its first comma. Lines without a comma are
        skipped. Entries are added to the existing ``id2name`` / ``name2id``.

        Args:
            filepath: path of the entity-name file.

        Returns:
            The tuple ``(self.id2name, self.name2id)``.
        """
        count = 0
        with open(filepath, "r", encoding="UTF-8") as file:
            for line in file:
                count = count + 1
                if count % 1000000 == 0:
                    print("loaded %d entities ... " % count)
                # Strip only the newline so a final line without one keeps
                # its last character.
                ID, sep, name = line.rstrip("\n").partition(",")
                if not sep:
                    continue
                self.id2name[ID] = name
                self.name2id.setdefault(name, []).append(ID)
        print("entity names loaded !")
        return self.id2name, self.name2id

    def IsInString(self, name, string):
        """Return True when ``name`` occurs in ``string`` as a whole word.

        A whole-word occurrence is bounded on each side by the start/end of
        ``string`` or by a non-word character (``\\W``). ``name`` is matched
        literally: regex metacharacters in it are escaped.
        """
        token = re.escape(name)
        regex = (
            r"(^" + token + r"$)|(^" + token + r"\W.*)|(.*\W"
            + token + r"\W.*)|(.*\W" + token + r"$)"
        )
        return re.match(regex, string) is not None

    def ProportionStrict(self, substring, string):
        """Check that ``substring`` makes up a large enough share of ``string``.

        Both texts are tokenised with ``word_tokenize``; the test passes when
        the token-count ratio and the character-count ratio (characters summed
        over tokens) are each at least 0.3. Always True when
        ``proportion_strict`` is off; False when ``string`` has no tokens.
        """
        if self.proportion_strict == False:
            return True
        word_threshold = 0.3
        char_threshold = 0.3
        word_list = word_tokenize(string)
        sub_word_list = word_tokenize(substring)
        string_chars = sum(len(x) for x in word_list)
        sub_string_chars = sum(len(x) for x in sub_word_list)
        string_words = len(word_list)
        sub_string_words = len(sub_word_list)

        # Nothing to compare against: avoid dividing by zero.
        if string_words == 0 or string_chars == 0:
            return False
        return (sub_string_words / string_words >= word_threshold
                and sub_string_chars / string_chars >= char_threshold)

    def DetectEntities(self, keywords, threshold=100000):
        """Collect entities whose name contains one of ``keywords``.

        Args:
            keywords: iterable of keyword strings; each is lower-cased first.
            threshold: the scan stops as soon as more than this many IDs
                have been collected.

        Returns:
            A dict with parallel lists ``topic_words`` (entity IDs) and
            ``topic_words_names`` (the matching names). A name that matches
            several keywords is listed once per matching keyword.
        """
        # NOTE(review): keywords are lower-cased but names are compared as
        # stored — presumably the name file is already lower-case; verify.
        result = {"topic_words": [], "topic_words_names": []}
        keywords = [x.lower() for x in keywords]
        for key, ids in self.name2id.items():
            for word in keywords:
                # The cheap substring test runs first to skip most regex work.
                if (word in key and self.IsInString(word, key)
                        and self.ProportionStrict(word, key)):
                    for ID in ids:
                        result['topic_words'].append(ID)
                        result['topic_words_names'].append(key)
                        if len(result['topic_words']) > threshold:
                            return result
        return result

In [6]:
class QustionAnswering(object):
    """Entity-linking pipeline: keyword extraction, candidate lookup, scoring.

    Attributes:
        xgb: classifier unpickled from disk; must provide ``predict_proba``.
        detect: ``EntityDetect`` instance used for keywords and candidates.
    """

    def __init__(self, model_path="../datas/models/xgb_all.pickle.dat"):
        """Load the scoring model and create the entity detector.

        Args:
            model_path: path of the pickled classifier.
        """
        # NOTE(review): unpickling executes arbitrary code from the file;
        # only point this at model files from a trusted source.
        with open(model_path, "rb") as model_file:
            self.xgb = pickle.load(model_file)
        print("Model loaded ...")
        self.detect = EntityDetect()

    def load_data(self, filepath):
        """Load the entity-name file through the detector; returns its tables."""
        return self.detect.loadData(filepath)

    def set_data(self, id2name, name2id):
        """Install already-loaded lookup tables on the detector."""
        self.detect.id2name = id2name
        self.detect.name2id = name2id

    def gen_item_features(self, input):
        """Build the feature matrix for the candidates in ``input``.

        Args:
            input: table with columns ``question``, ``topic_words`` and
                ``topic_words_names``; the question is read from row label 0.

        Returns:
            The first element of what ``feature_select`` returns for the
            assembled frame.
        """
        assert len(input['topic_words']) == len(input['topic_words_names'])
        question = input['question'][0]
        # Materialise both columns positionally so the pairing of IDs and
        # names does not depend on the index labels of ``input``.
        candidates = list(input['topic_words'])
        candidate_names = list(input['topic_words_names'])
        data = {
            'question': question,
            'topic_words': candidates,
            # Every candidate gets the constant 1 for score and label.
            'word_score': [1] * len(candidates),
            'topic_words_names': candidate_names,
            "label": [1] * len(candidates),
        }
        df = pd.DataFrame(data)
        features, _ = feature_select(df)
        return features

    def get_cand_entities(self, question):
        """Return a DataFrame of candidate entities for ``question``.

        Columns are ``topic_words``, ``topic_words_names`` and ``question``
        (the question text repeated on every row).
        """
        keywords = self.detect.getKeyWords(question)
        print("keywords generated!")
        candidates = self.detect.DetectEntities(keywords)
        candidates['question'] = question
        candidates = pd.DataFrame(candidates)
        print("candidates generated!")
        return candidates

    def get_top_entities(self, input):
        """Score the candidates and return the 100 highest-scoring rows.

        Adds a ``predict`` column to ``input`` in place, holding column 1 of
        ``predict_proba``, then returns the rows sorted by it in descending
        order. An ``input`` without candidates yields an empty result that
        still has the ``predict`` column.
        """
        if len(input['topic_words']) == 0:
            # No candidates: there is no question row to read and nothing
            # to score, so skip feature extraction and the model entirely.
            input['predict'] = pd.Series(dtype=float, index=input.index)
            return input.head(100)
        features = self.gen_item_features(input)
        print("features extracted!")
        predict = self.xgb.predict_proba(features)
        print("scores generated!")
        input['predict'] = predict[:, 1]
        head_100 = input.sort_values(['predict'], ascending=False).head(100)
        return head_100

    def get_results(self, question):
        """Run the whole pipeline for ``question`` and return the top rows."""
        candidates = self.get_cand_entities(question)
        return self.get_top_entities(candidates)

In [7]:
# Build the pipeline (loads the model and connects to CoreNLP), then fill the
# detector's lookup tables from the entity-name file.
QA = QustionAnswering()
# (id2name,name2id) = QA.load_data("../datas/zyt/mid2name_finally.txt")
id2name, name2id = QA.load_data("../datas/zyt/FB2M_names.txt")
# QA.set_data(id2name,name2id)
print("Finished!")

Model loaded ...
Stanford CoreNLP Server connnected ...
loaded 1000000 entities ... 
entity names loaded !
Finished!


In [12]:
import time
# Time one candidate-generation call. perf_counter is a monotonic,
# high-resolution clock, so the difference cannot be skewed by system clock
# adjustments the way time.time() can.
time1 = time.perf_counter()
# question = "What's the capital of United States?"
# question = "What's the meaning of Junk Foods?"
# question = "Slogan of Communist Party of China?"
question = "what was the cause of death of yves klein?"
cand = QA.get_cand_entities(question)
time2 = time.perf_counter()
print("Time Used: " + str(time2-time1) + "s")

ori_keyword_list: {'yves', 'klein', 'death', 'cause', 'yves klein'}
cur_keyword_list: {'death', 'cause', 'yves klein'}
keywords generated!
candidates generated!
Time Used: 0.5987193584442139s


In [11]:
# Show the raw CoreNLP annotations (tokens, POS tags, NER labels) for the
# question, in that order.
question = "what was the cause of death of yves klein?"
for annotate in (QA.detect.nlp.word_tokenize, QA.detect.nlp.pos_tag, QA.detect.nlp.ner):
    print(annotate(question))

['what', 'was', 'the', 'cause', 'of', 'death', 'of', 'yves', 'klein', '?']
[('what', 'WP'), ('was', 'VBD'), ('the', 'DT'), ('cause', 'NN'), ('of', 'IN'), ('death', 'NN'), ('of', 'IN'), ('yves', 'NNS'), ('klein', 'NN'), ('?', '.')]
[('what', 'O'), ('was', 'O'), ('the', 'O'), ('cause', 'O'), ('of', 'O'), ('death', 'O'), ('of', 'O'), ('yves', 'O'), ('klein', 'O'), ('?', 'O')]


In [27]:
# Full pipeline: generate candidates for `question`, then score and rank them.
# `question` must already be defined by an earlier cell.
res = QA.get_results(question)

keyword_list: {'Slogan', 'Communist Party of China'}
keywords generated!
candidates generated!
features extracted!
scores generated!


In [30]:
# Display the ranked candidates returned by get_results.
res

Unnamed: 0,question,topic_words,topic_words_names,predict
0,Slogan of Communist Party of China?,/m/02189,Communist Party of China,0.277958
1,Slogan of Communist Party of China?,/m/0crtpxt,Slogan,0.005385
