In [31]:
import pandas as pd
import json
import os
from fnmatch import fnmatch
import Levenshtein

In [5]:
# Show the kernel's working directory; the relative paths "data/medical.json" and
# "ICD10_filter.csv" read further down are resolved against it.
os.getcwd()

'D:\\disease-kb'

In [60]:
# Read every drug-description CSV export into one DataFrame of strings.
# dd_columns lists the expected columns and fixes their order in the result.
dd_columns = ["药品名称", "成份", "性状", "适应症", "用法用量", "不良反应", "禁忌", "注意事项",
                  "孕妇及哺乳期妇女用药", "儿童用药", "老年用药", "贮藏", "规格", "药物相互作用",
                  "药理毒理", "药代动力学", "药物过量", "有效期", "包装", "执行标准"]
drug_description_dir = "d:/pgkb_graph/processed"
# Collect the per-file frames first and concatenate once: calling pd.concat inside
# the loop re-copies all previously loaded rows for every additional file.
drug_description_frames = [
    pd.read_csv(os.path.join(drug_description_dir, fn), dtype=str).fillna("")
    # os.listdir() returns names in arbitrary order; sorting keeps the row order
    # (and therefore any later "last duplicate wins" step) reproducible.
    for fn in sorted(os.listdir(drug_description_dir))
    if fnmatch(fn, "*drug_description_detail_*.csv")
]
# The empty dd_columns-only frame goes first so the result always carries those
# columns, in that order, even if no file matches or a file lacks some of them.
df_drug_description = pd.concat(
    [pd.DataFrame(columns=dd_columns), *drug_description_frames],
    ignore_index=True,
    axis=0,
).fillna("")  # columns missing from some files come out of concat as NaN

In [62]:
# Total number of drug-description rows after concatenating every matching CSV.
len(df_drug_description)

12761

In [26]:
# Read disease details: every line of data/medical.json is parsed on its own as a
# JSON document. Lines that do not parse are skipped rather than aborting the load.
disease_list = []
malformed_line_numbers = []  # 1-based numbers of non-blank lines that failed to parse
with open("data/medical.json", "r", encoding="utf-8") as f:
    for line_number, line in enumerate(f.read().split("\n"), start=1):
        if not line.strip():
            # blank lines (e.g. after the final newline) carry no record
            continue
        try:
            # strict=False lets raw control characters appear inside JSON strings
            line_json = json.loads(line, strict=False)
        except json.JSONDecodeError:
            # Only parse failures are tolerated; anything else (KeyboardInterrupt,
            # MemoryError, ...) must propagate instead of being silently swallowed.
            malformed_line_numbers.append(line_number)
        else:
            disease_list.append(line_json)

if malformed_line_numbers:
    print(f"skipped {len(malformed_line_numbers)} malformed line(s) in data/medical.json")
len(disease_list)

8808

In [27]:
# Load the ICD-10 table and build a lookup from disease name to ICD-10 code.
# If a disease name occurs on several rows, the last row's code is the one kept.
df_icd = pd.read_csv("ICD10_filter.csv")
icd_name_dict = {
    icd_disease: icd_code
    for icd_disease, icd_code in zip(df_icd["disease"].values, df_icd["code"].values)
}

In [28]:
# Exact-name pass: give each disease record the ICD-10 code whose name equals the
# record's "name"; names without an exact entry are collected in no_match_list.
no_match_list = []
for disease in disease_list:
    if disease["name"] not in icd_name_dict:
        # no exact hit: remember the name and leave the code blank for now
        no_match_list.append(disease["name"])
        disease["ICD10_code"] = ""
        continue
    disease["ICD10_code"] = icd_name_dict[disease["name"]]

In [29]:
# How many disease names had no exact (string-equal) entry in icd_name_dict.
len(no_match_list)

6585

In [38]:
# Best fuzzy match by Levenshtein similarity ratio.
# For every disease name without an exact ICD entry, find the ICD disease name with
# the highest Levenshtein.ratio and record (unmatched name, best ICD name, ratio).
# The comparison is strict, so on ties the first candidate seen is kept; if no
# candidate scores above 0 the tuple keeps the defaults "" and 0.
max_match_list = []
for no_match in no_match_list:
    max_ratio = 0
    max_disease = ""
    for disease in icd_name_dict.keys():
        # Compute the ratio once per pair: this is the inner loop of an
        # O(len(no_match_list) * len(icd_name_dict)) scan.
        ratio = Levenshtein.ratio(no_match, disease)
        if ratio > max_ratio:
            max_ratio = ratio
            max_disease = disease
    max_match_list.append((no_match, max_disease, max_ratio))


In [50]:
# Keep only the candidate pairs whose similarity ratio is at least 0.88, then map
# each unmatched disease name to the ICD disease name it was paired with.
filter_match_list = [match for match in max_match_list if match[2] >= 0.88]
print(len(filter_match_list))
match_dict = {match[0]: match[1] for match in filter_match_list}

632


In [52]:
# Fuzzy pass: for disease names present in match_dict, take the ICD-10 code of the
# ICD disease name they were paired with. Other records are left untouched.
for disease in disease_list:
    disease_name = disease["name"]
    if disease_name not in match_dict:
        continue
    closest_icd_name = match_dict[disease_name]
    disease["ICD10_code"] = icd_name_dict[closest_icd_name]

In [55]:
# Sanity check: the ICD matching cells only set "ICD10_code" on existing records,
# so the number of disease records should be unchanged.
len(disease_list)

8808

In [69]:
# Flatten the per-disease lists into [disease name, item] pairs: one pair per entry
# of "recommand_drug" (key spelled as in the data) and one per entry of "symptom".
disease_drug_list = []
disease_symptom_list = []
for disease in disease_list:
    disease_drug_list.extend(
        [disease["name"], drug] for drug in disease["recommand_drug"]
    )
    disease_symptom_list.extend(
        [disease["name"], symptom] for symptom in disease["symptom"]
    )

In [68]:
# Number of (disease, drug) pairs, followed by the first ten as a sample.
print(len(disease_drug_list), disease_drug_list[:10], sep="\n")

59467
[['百日咳', '琥乙红霉素片'], ['百日咳', '琥乙红霉素颗粒'], ['百日咳', '百咳静糖浆'], ['百日咳', '穿心莲内酯片'], ['百日咳', '红霉素肠溶片'], ['百日咳', '环酯红霉素片'], ['苯中毒', '布美他尼片'], ['苯中毒', '十一味金色丸'], ['苯中毒', '注射用布美他尼'], ['苯中毒', '注射用呋塞米']]


In [70]:
# Number of (disease, symptom) pairs, followed by the first ten as a sample.
print(len(disease_symptom_list), disease_symptom_list[:10], sep="\n")

54710
[['肺泡蛋白质沉积症', '紫绀'], ['肺泡蛋白质沉积症', '胸痛'], ['肺泡蛋白质沉积症', '呼吸困难'], ['肺泡蛋白质沉积症', '乏力'], ['肺泡蛋白质沉积症', '毓卓'], ['百日咳', '吸气时有蝉鸣音'], ['百日咳', '痉挛性咳嗽'], ['百日咳', '胸闷'], ['百日咳', '肺阴虚'], ['百日咳', '抽搐']]


In [71]:
# Index the drug descriptions by drug name ("药品名称"): each entry maps every
# column in dd_columns to that row's value. Rows sharing a drug name overwrite
# earlier ones, so the last such row is the one kept.
drug_dict = {}
for _, drug_row in df_drug_description.iterrows():
    drug_dict[drug_row["药品名称"]] = {field: drug_row[field] for field in dd_columns}

In [77]:
# Index the disease records by name, copying every field except "_id", "do_eat",
# "not_eat", "recommand_eat" and "drug_detail". Records sharing a name overwrite
# earlier ones.
disease_dict = {}
for disease in disease_list:
    disease_dict[disease["name"]] = {
        field: value
        for field, value in disease.items()
        if field not in {"_id", "do_eat", "not_eat", "recommand_eat", "drug_detail"}
    }