## 连接图数据库

In [1]:
from py2neo import Graph, Node, Relationship, NodeMatcher

In [2]:
graph = Graph('http://localhost:7474', auth=('neo4j','12345678'),name='neo4j')

In [4]:
# t = Node('Person', name='老师')
# a = Node('Person', name='学生A')
# b = Node('Person', name='学生B')

# graph.create(Relationship(t, '师生', a))
# graph.create(Relationship(t, '师生', b))
# graph.create(Relationship(a, '同学', b))

In [5]:
# 删除所有实体和关系
cypher = 'MATCH (n) DETACH DELETE n'
graph.run(cypher)

## 从问题中提取实体和关系

In [3]:
from collections import defaultdict 

dic= defaultdict(list)

def load_data(path, label): 
    for i in open(path, encoding='utf-8'):
        if i.strip() not in dic:
            dic[i.strip()].append(label)
    return dic

#　特征词txt文件路径
disease_path = './build_kg/data/dict/disease.txt'
department_path = './build_kg/data/dict/department.txt'
check_path = './build_kg/data/dict/check.txt'
drug_path = './build_kg/data/dict/drug.txt'
food_path = './build_kg/data/dict/food.txt'
producer_path = './build_kg/data/dict/producer.txt'
symptom_path = './build_kg/data/dict/symptom.txt'
# deny_path = './build_kg/data/dict/deny.txt'

# 加载特征词
disease_wds = load_data(disease_path, 'disease')
department_wds = load_data(department_path, 'department')
check_wds = load_data(check_path, 'check')
drug_wds = load_data(drug_path, 'drug')
food_wds = load_data(food_path, 'food')
producer_wds = load_data(producer_path, 'producer')
symptom_wds = load_data(symptom_path, 'symptom')
# deny_words = load_data(deny_path)

dic

defaultdict(list,
            {'后发性白内障': ['disease'],
             '椎体爆裂骨折': ['disease'],
             '苍耳中毒': ['disease'],
             '小儿触电与雷击': ['disease'],
             '颅内高压综合征': ['disease'],
             '冠状动脉异常起源主动脉': ['disease'],
             '僵鉧': ['disease'],
             '腹膜转移癌': ['disease'],
             '小儿星形细胞瘤': ['disease'],
             '肢端肥大症': ['disease'],
             '小儿慢性充血性脾肿大': ['disease'],
             '脱囊': ['disease'],
             '先天性喉喘鸣': ['disease'],
             '脱屑性间质性肺炎': ['disease'],
             '耻骨结核': ['disease'],
             '背肌筋膜炎': ['disease'],
             '髌股关节软骨损伤': ['disease'],
             '脑脊液漏': ['disease'],
             '粘液表皮样癌': ['disease'],
             '中枢神经系统淋巴瘤': ['disease'],
             '膀胱气痛': ['disease'],
             '小儿脆性X染色体': ['disease'],
             '金黄色葡萄球菌肺炎': ['disease'],
             '急性腐蚀性胃炎': ['disease'],
             '口腔颌面部皮样、表皮样囊肿': ['disease'],
             '高弓足': ['disease'],
             '宫颈外翻': ['disease'],
  

### 构造AC自动机，加速实体提取

In [4]:
import ahocorasick

# 构造AC自动机，加速实体提取
def build_actree(wordlist):
    actree = ahocorasick.Automaton()
    for index, word in enumerate(wordlist):
        actree.add_word(word, (index, word))
    actree.make_automaton()
    return actree

region_tree = build_actree(list(dic.keys()))

In [5]:
'呼吸衰竭' in region_tree

True

In [6]:
region_tree.get('血常规')

(9570, '血常规')

## 任务一：提取问题相关实体及其类别

In [7]:
question = '肺气肿和百日咳要做血常规吗？'

question_entity = []
for each in region_tree.iter(question):
    entity = each[1][1]
    question_entity.append(entity)
print(question_entity)

['肺气肿', '百日咳', '血常规']


In [8]:
# stop_wds = []
# # 排除字符串子串
# for wd1 in region_words:
#     for wd2 in region_words:
#         if wd1 in wd2 and wd1!=wd2:
#             stop_wds.append(wd1)
# print(stop_wds)

# final_wds = [each for each in region_words if each not in stop_wds]
# final_wds

In [9]:
question_entity_dict = {each: dic[each] for each in question_entity}
question_entity_dict

{'肺气肿': ['disease'], '百日咳': ['disease'], '血常规': ['check']}

In [10]:
output = {}
output['args'] = question_entity_dict

## 任务二：提取问题相关的待查询关系

In [11]:
# 问题中涉及的实体类别
types = []
for each in question_entity_dict.values():
    types.extend(each)
types

['disease', 'disease', 'check']

In [12]:
# 不同提问意图的疑问词
symptom_qwds = ['症状', '表征', '现象', '症候', '表现']
cause_qwds = ['原因','成因', '为什么', '怎么会', '怎样才', '咋样才', '怎样会', '如何会', '为啥', '为何', '如何才会', '怎么才会', '会导致', '会造成']
acompany_qwds = ['并发症', '并发', '一起发生', '一并发生', '一起出现', '一并出现', '一同发生', '一同出现', '伴随发生', '伴随', '共现']
food_qwds = ['饮食', '饮用', '吃', '食', '伙食', '膳食', '喝', '菜' ,'忌口', '补品', '保健品', '食谱', '菜谱', '食用', '食物','补品']
drug_qwds = ['药', '药品', '用药', '胶囊', '口服液', '炎片']
prevent_qwds = ['预防', '防范', '抵制', '抵御', '防止','躲避','逃避','避开','免得','逃开','避开','避掉','躲开','躲掉','绕开',
                     '怎样才能不', '怎么才能不', '咋样才能不','咋才能不', '如何才能不',
                     '怎样才不', '怎么才不', '咋样才不','咋才不', '如何才不',
                     '怎样才可以不', '怎么才可以不', '咋样才可以不', '咋才可以不', '如何可以不',
                     '怎样才可不', '怎么才可不', '咋样才可不', '咋才可不', '如何可不']
lasttime_qwds = ['周期', '多久', '多长时间', '多少时间', '几天', '几年', '多少天', '多少小时', '几个小时', '多少年']
cureway_qwds = ['怎么治疗', '如何医治', '怎么医治', '怎么治', '怎么医', '如何治', '医治方式', '疗法', '咋治', '怎么办', '咋办', '咋治']
cureprob_qwds = ['多大概率能治好', '多大几率能治好', '治好希望大么', '几率', '几成', '比例', '可能性', '能治', '可治', '可以治', '可以医']
easyget_qwds = ['易感人群', '容易感染', '易发人群', '什么人', '哪些人', '感染', '染上', '得上']
check_qwds = ['检查', '检查项目', '查出', '检查', '测出', '试出']
belong_qwds = ['属于什么科', '属于', '什么科', '科室']
cure_qwds = ['治疗什么', '治啥', '治疗啥', '医治啥', '治愈啥', '主治啥', '主治什么', '有什么用', '有何用', '用处', '用途', '有什么好处', '有什么益处', '有何益处', '用来', '用来做啥', '用来作甚', '需要', '要']

In [13]:
def check_words(words, question):
    # 疑问词是否出现在提问中
    for word in words:
        if word in question:
            return True
    return False

In [14]:
question_types = []
# 疾病-症状
if check_words(symptom_qwds, question) and ('disease' in types):
    question_type = 'disease_symptom'
    question_types.append(question_type)
# 症状-疾病
if check_words(symptom_qwds, question) and ('symptom' in types):
    question_type = 'symptom_disease'
    question_types.append(question_type)
# 疾病-病因
if check_words(cause_qwds, question) and ('disease' in types):
    question_type = 'disease_cause'
    question_types.append(question_type)
# 疾病-并发症
if check_words(acompany_qwds, question) and ('disease' in types):
    question_type = 'disease_acompany'
    question_types.append(question_type)

# 疾病-可以吃/不可以吃
if check_words(food_qwds, question) and 'disease' in types:
    deny_status = check_words(deny_words, question)
    if deny_status:
        question_type = 'disease_not_food'
    else:
        question_type = 'disease_do_food'
    question_types.append(question_type)
    
# 可以吃/不可以吃-疾病
if check_words(food_qwds+cure_qwds, question) and 'food' in types:
    deny_status = check_words(deny_words, question)
    if deny_status:
        question_type = 'food_not_disease'
    else:
        question_type = 'food_do_disease'
    question_types.append(question_type)
    
# 疾病-药物
if check_words(drug_qwds, question) and 'disease' in types:
    question_type = 'disease_drug'
    question_types.append(question_type)
    
# 药物-疾病
if check_words(cure_qwds, question) and 'drug' in types:
    question_type = 'drug_disease'
    question_types.append(question_type)
    
# 疾病-检查
if check_words(check_qwds, question) and 'disease' in types:
    question_type = 'disease_check'
    question_types.append(question_type)
    
# 检查-疾病
if check_words(check_qwds+cure_qwds, question) and 'check' in types:
    question_type = 'check_disease'
    question_types.append(question_type)
    
#　疾病-预防措施
if check_words(prevent_qwds, question) and 'disease' in types:
    question_type = 'disease_prevent'
    question_types.append(question_type)
    
# 疾病-疗程
if check_words(lasttime_qwds, question) and 'disease' in types:
    question_type = 'disease_lasttime'
    question_types.append(question_type)

# 疾病-疗法
if check_words(cureway_qwds, question) and 'disease' in types:
    question_type = 'disease_cureway'
    question_types.append(question_type)

# 疾病-治愈率
if check_words(cureprob_qwds, question) and 'disease' in types:
    question_type = 'disease_cureprob'
    question_types.append(question_type)

# 疾病-易感人群
if check_words(easyget_qwds, question) and 'disease' in types :
    question_type = 'disease_easyget'
    question_types.append(question_type)
    
# 若没有查到相关的外部查询信息，那么则将该疾病的描述信息返回
if question_types == [] and 'disease' in types:
    question_types = ['disease_desc']

# 若没有查到相关的外部查询信息，那么则将该疾病的描述信息返回
if question_types == [] and 'symptom' in types:
    question_types = ['symptom_disease']

In [15]:
question_types

['check_disease']

In [16]:
# 将多个分类结果进行合并处理，组装成一个字典
output['question_types'] = question_types

args = output['args']

entity_dict = {}
for arg, types in args.items():
    for type in types:
        if type not in entity_dict:
            entity_dict[type] = [arg]
        else:
            entity_dict[type].append(arg)
entity_dict

{'disease': ['肺气肿', '百日咳'], 'check': ['血常规']}

In [17]:
output

{'args': {'肺气肿': ['disease'], '百日咳': ['disease'], '血常规': ['check']},
 'question_types': ['check_disease']}

### 生成cypher查询语句

In [18]:
# 不同的提问意图，对应不同关系的cypher查询语句
def sql_transfer(question_type, entities):
    if not entities:
        return []
    # 查询语句
    sql = []
    # 查询疾病的病因
    if question_type == 'disease_cause':
        sql = ["match (m:Disease) where m.name='{0}' return m.name, m.cause".format(i) for i in entities]
    # 查询疾病的预防措施
    elif question_type == 'disease_prevent':
        sql = ["match (m:Disease) where m.name='{0}' return m.name, m.prevent".format(i) for i in entities]
    # 查询疾病的疗程
    elif question_type == 'disease_lasttime':
        sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.cure_lasttime".format(i) for i in entities]

    # 查询疾病的治愈率
    elif question_type == 'disease_cureprob':
        sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.cured_prob".format(i) for i in entities]

    # 查询疾病的疗法
    elif question_type == 'disease_cureway':
        sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.cure_way".format(i) for i in entities]

    # 查询疾病的易感人群
    elif question_type == 'disease_easyget':
        sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.easy_get".format(i) for i in entities]

    # 查询疾病的描述
    elif question_type == 'disease_desc':
        sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.desc".format(i) for i in entities]

    # 查询疾病的症状
    elif question_type == 'disease_symptom':
        sql = ["MATCH (m:Disease)-[r:has_symptom]->(n:Symptom) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
    # 查询疾病的并发症
    elif question_type == 'disease_acompany':
        sql1 = ["MATCH (m:Disease)-[r:acompany_with]->(n:Disease) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
        sql2 = ["MATCH (m:Disease)-[r:acompany_with]->(n:Disease) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
        sql = sql1 + sql2

    # 查询疾病的忌口
    elif question_type == 'disease_not_food':
        sql = ["MATCH (m:Disease)-[r:no_eat]->(n:Food) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]

    # 查询疾病建议吃的东西
    elif question_type == 'disease_do_food':
        sql1 = ["MATCH (m:Disease)-[r:do_eat]->(n:Food) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
        sql2 = ["MATCH (m:Disease)-[r:recommand_eat]->(n:Food) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
        sql = sql1 + sql2

    # 已知忌口查疾病
    elif question_type == 'food_not_disease':
        sql = ["MATCH (m:Disease)-[r:no_eat]->(n:Food) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]

    # 已知推荐查疾病
    elif question_type == 'food_do_disease':
        sql1 = ["MATCH (m:Disease)-[r:do_eat]->(n:Food) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
        sql2 = ["MATCH (m:Disease)-[r:recommand_eat]->(n:Food) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
        sql = sql1 + sql2

    # 查询疾病常用药品－药品别名记得扩充
    elif question_type == 'disease_drug':
        sql1 = ["MATCH (m:Disease)-[r:common_drug]->(n:Drug) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
        sql2 = ["MATCH (m:Disease)-[r:recommand_drug]->(n:Drug) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
        sql = sql1 + sql2

    # 已知药品查询能够治疗的疾病
    elif question_type == 'drug_disease':
        sql1 = ["MATCH (m:Disease)-[r:common_drug]->(n:Drug) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
        sql2 = ["MATCH (m:Disease)-[r:recommand_drug]->(n:Drug) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
        sql = sql1 + sql2
    # 查询疾病应该做的检查
    elif question_type == 'disease_check':
        sql = ["MATCH (m:Disease)-[r:need_check]->(n:Check) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]

    # 已知检查查询疾病
    elif question_type == 'check_disease':
        sql = ["MATCH (m:Disease)-[r:need_check]->(n:Check) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
    
    return sql

In [19]:
sql_transfer('disease_cause', ['肺气肿', '百日咳'])

["match (m:Disease) where m.name='肺气肿' return m.name, m.cause",
 "match (m:Disease) where m.name='百日咳' return m.name, m.cause"]

In [20]:
sqls = []
for question_type in question_types:
    sql_ = {}
    sql_['question_type'] = question_type
    sql_['sql'] = sql_transfer(question_type, entity_dict.get(question_type.split('_')[0]))
    sqls.append(sql_)
sqls

[{'question_type': 'check_disease',
  'sql': ["MATCH (m:Disease)-[r:need_check]->(n:Check) where n.name = '血常规' return m.name, r.name, n.name"]}]

### 查询图数据库，生成回答

In [21]:
num_limit = 20

'''根据对应的qustion_type，调用相应的回复模板'''
def answer_prettify(question_type, answers):
    final_answer = []
    if not answers:
        return ''
    if question_type == 'disease_symptom':
        desc = [i['n.name'] for i in answers]
        subject = answers[0]['m.name']
        final_answer = '{0}的症状包括：{1}'.format(subject, '；'.join(list(set(desc))[:num_limit]))

    elif question_type == 'symptom_disease':
        desc = [i['m.name'] for i in answers]
        subject = answers[0]['n.name']
        final_answer = '症状{0}可能染上的疾病有：{1}'.format(subject, '；'.join(list(set(desc))[:num_limit]))

    elif question_type == 'disease_cause':
        desc = [i['m.cause'] for i in answers]
        subject = answers[0]['m.name']
        final_answer = '{0}可能的成因有：{1}'.format(subject, '；'.join(list(set(desc))[:num_limit]))

    elif question_type == 'disease_prevent':
        desc = [i['m.prevent'] for i in answers]
        subject = answers[0]['m.name']
        final_answer = '{0}的预防措施包括：{1}'.format(subject, '；'.join(list(set(desc))[:num_limit]))

    elif question_type == 'disease_lasttime':
        desc = [i['m.cure_lasttime'] for i in answers]
        subject = answers[0]['m.name']
        final_answer = '{0}治疗可能持续的周期为：{1}'.format(subject, '；'.join(list(set(desc))[:num_limit]))

    elif question_type == 'disease_cureway':
        desc = [';'.join(i['m.cure_way']) for i in answers]
        subject = answers[0]['m.name']
        final_answer = '{0}可以尝试如下治疗：{1}'.format(subject, '；'.join(list(set(desc))[:num_limit]))

    elif question_type == 'disease_cureprob':
        desc = [i['m.cured_prob'] for i in answers]
        subject = answers[0]['m.name']
        final_answer = '{0}治愈的概率为（仅供参考）：{1}'.format(subject, '；'.join(list(set(desc))[:num_limit]))

    elif question_type == 'disease_easyget':
        desc = [i['m.easy_get'] for i in answers]
        subject = answers[0]['m.name']

        final_answer = '{0}的易感人群包括：{1}'.format(subject, '；'.join(list(set(desc))[:num_limit]))

    elif question_type == 'disease_desc':
        desc = [i['m.desc'] for i in answers]
        subject = answers[0]['m.name']
        final_answer = '{0},熟悉一下：{1}'.format(subject,  '；'.join(list(set(desc))[:num_limit]))

    elif question_type == 'disease_acompany':
        desc1 = [i['n.name'] for i in answers]
        desc2 = [i['m.name'] for i in answers]
        subject = answers[0]['m.name']
        desc = [i for i in desc1 + desc2 if i != subject]
        final_answer = '{0}的症状包括：{1}'.format(subject, '；'.join(list(set(desc))[:num_limit]))

    elif question_type == 'disease_not_food':
        desc = [i['n.name'] for i in answers]
        subject = answers[0]['m.name']
        final_answer = '{0}忌食的食物包括有：{1}'.format(subject, '；'.join(list(set(desc))[:num_limit]))

    elif question_type == 'disease_do_food':
        do_desc = [i['n.name'] for i in answers if i['r.name'] == '宜吃']
        recommand_desc = [i['n.name'] for i in answers if i['r.name'] == '推荐食谱']
        subject = answers[0]['m.name']
        final_answer = '{0}宜食的食物包括有：{1}\n推荐食谱包括有：{2}'.format(subject, ';'.join(list(set(do_desc))[:num_limit]), ';'.join(list(set(recommand_desc))[:self.num_limit]))

    elif question_type == 'food_not_disease':
        desc = [i['m.name'] for i in answers]
        subject = answers[0]['n.name']
        final_answer = '患有{0}的人最好不要吃{1}'.format('；'.join(list(set(desc))[:num_limit]), subject)

    elif question_type == 'food_do_disease':
        desc = [i['m.name'] for i in answers]
        subject = answers[0]['n.name']
        final_answer = '患有{0}的人建议多试试{1}'.format('；'.join(list(set(desc))[:num_limit]), subject)

    elif question_type == 'disease_drug':
        desc = [i['n.name'] for i in answers]
        subject = answers[0]['m.name']
        final_answer = '{0}通常的使用的药品包括：{1}'.format(subject, '；'.join(list(set(desc))[:num_limit]))

    elif question_type == 'drug_disease':
        desc = [i['m.name'] for i in answers]
        subject = answers[0]['n.name']
        final_answer = '{0}主治的疾病有{1},可以试试'.format(subject, '；'.join(list(set(desc))[:num_limit]))

    elif question_type == 'disease_check':
        desc = [i['n.name'] for i in answers]
        subject = answers[0]['m.name']
        final_answer = '{0}通常可以通过以下方式检查出来：{1}'.format(subject, '；'.join(list(set(desc))[:num_limit]))

    elif question_type == 'check_disease':
        desc = [i['m.name'] for i in answers]
        subject = answers[0]['n.name']
        final_answer = '通常可以通过{0}检查出来的疾病有{1}'.format(subject, '；'.join(list(set(desc))[:num_limit]))

    return final_answer

In [22]:
sqls

[{'question_type': 'check_disease',
  'sql': ["MATCH (m:Disease)-[r:need_check]->(n:Check) where n.name = '血常规' return m.name, r.name, n.name"]}]

In [23]:
for sql_ in sqls:
    question_type = sql_['question_type']
    queries = sql_['sql']
    answers = []
    for query in queries:
        res = graph.run(query).data()
        answers += res
answers

[{'m.name': '肺炎杆菌肺炎', 'r.name': '诊断检查', 'n.name': '血常规'},
 {'m.name': '肺炎球菌肺炎', 'r.name': '诊断检查', 'n.name': '血常规'},
 {'m.name': '放射性肺炎', 'r.name': '诊断检查', 'n.name': '血常规'},
 {'m.name': '二硫化碳中毒', 'r.name': '诊断检查', 'n.name': '血常规'},
 {'m.name': '大楼病综合征', 'r.name': '诊断检查', 'n.name': '血常规'},
 {'m.name': '大叶性肺炎', 'r.name': '诊断检查', 'n.name': '血常规'},
 {'m.name': '苯中毒', 'r.name': '诊断检查', 'n.name': '血常规'},
 {'m.name': '百日咳', 'r.name': '诊断检查', 'n.name': '血常规'}]

In [24]:
final_answers = []

for sql_ in sqls:
    question_type = sql_['question_type']
    queries = sql_['sql']
    answers = []
    
    
    for query in queries: # 运行每一条cypher查询语句
        res = graph.run(query).data()
        answers += res
    final_answer = answer_prettify(question_type, answers)
    if final_answer:
        final_answers.append(final_answer)
        
final_answers

['通常可以通过血常规检查出来的疾病有大楼病综合征；放射性肺炎；肺炎球菌肺炎；二硫化碳中毒；苯中毒；大叶性肺炎；百日咳；肺炎杆菌肺炎']