In [84]:
import pandas as pd
import json
import os
from fnmatch import fnmatch
import Levenshtein
import re

In [5]:
# Show the current working directory; the relative paths used in later cells
# (e.g. "data/medical.json", "ICD10_filter.csv") are resolved against it.
os.getcwd()

'D:\\disease-kb'

In [60]:
# Read the drug description tables.
# dd_columns fixes the expected columns (and their order) of the combined frame;
# it is also used later to copy the per-drug fields into a dict.
dd_columns = ["药品名称", "成份", "性状", "适应症", "用法用量", "不良反应", "禁忌", "注意事项",
                  "孕妇及哺乳期妇女用药", "儿童用药", "老年用药", "贮藏", "规格", "药物相互作用",
                  "药理毒理", "药代动力学", "药物过量", "有效期", "包装", "执行标准"]
drug_description_dir = "d:/pgkb_graph/processed"

# Collect every matching CSV first and concatenate once: calling pd.concat
# inside the loop recopies the accumulated frame for each file.
# The leading empty frame keeps dd_columns first in the column order and
# guarantees a well-formed (empty) result when no file matches.
drug_description_frames = [pd.DataFrame(columns=dd_columns)]
for fn in os.listdir(drug_description_dir):
    if fnmatch(fn, "*drug_description_detail_*.csv"):
        drug_description_frames.append(
            pd.read_csv(os.path.join(drug_description_dir, fn), dtype=str).fillna(""))

# The final fillna("") blanks columns that are missing from some of the files.
df_drug_description = pd.concat(
    drug_description_frames,
    ignore_index=True,
    axis=0).fillna("")

In [62]:
# Row count of the combined drug description table.
len(df_drug_description)

12761

In [156]:
# Read disease details: each non-blank line of data/medical.json is parsed as
# its own JSON document.
disease_list = []
skipped_line_count = 0
with open("data/medical.json", "r", encoding="utf-8") as f:
    # Iterate lazily instead of reading and splitting the whole file at once.
    for line in f:
        if not line.strip():
            continue  # blank lines (e.g. the trailing newline) carry no record
        try:
            # strict=False tolerates raw control characters inside JSON strings.
            disease_list.append(json.loads(line, strict=False))
        except json.JSONDecodeError:
            # Best effort: skip malformed lines, but count them so that data
            # loss is visible instead of silently swallowed.
            skipped_line_count += 1

if skipped_line_count:
    print("skipped %d malformed line(s) in data/medical.json" % skipped_line_count)

len(disease_list)

8808

In [168]:
# Load the ICD code table and index it as disease name -> code.
df_icd = pd.read_csv("ICD10_filter.csv")
icd_name_dict = {
    icd_disease: icd_code
    for icd_disease, icd_code in zip(df_icd["disease"].values, df_icd["code"].values)
}

In [174]:
# Exact-name lookup: attach the ICD code when the disease name is a key of
# icd_name_dict; otherwise record the name as unmatched and leave the code blank.
no_match_list = []
for disease in disease_list:
    if disease["name"] not in icd_name_dict:
        no_match_list.append(disease["name"])
        disease["ICD10_code"] = ""
        continue
    disease["ICD10_code"] = icd_name_dict[disease["name"]]

In [175]:
# Number of disease names without an exact ICD name match.
len(no_match_list)

6585

In [176]:
# Fuzzy matching: for every disease name without an exact ICD match, find the
# ICD name with the highest Levenshtein similarity ratio.

# Punctuation / whitespace stripped before comparison.
# NOTE(review): inside this class `\)-\/` is a range from ")" to "/", so "*",
# "+", "," and "." are stripped as well. Kept as-is so results do not change.
_DISEASE_PUNCT_RE = re.compile(r"[\(\)-\/（）\-\[\]\s、]")
# Generic single-character disease-name particles dropped before comparison.
# Written as a plain character class: "|" inside [...] is a literal character,
# not alternation, so it does not belong here.
_DISEASE_PARTICLE_RE = re.compile(r"[性病症征]")


def _clean_disease_name(name):
    """Return `name` without punctuation/whitespace and without 性/病/症/征."""
    return _DISEASE_PARTICLE_RE.sub("", _DISEASE_PUNCT_RE.sub("", name))


max_match_list = []

# no_match_list is rebound from plain names to [name, cleaned_name] pairs.
# Accept both shapes so that re-running this cell does not fail on the
# already-converted list.
no_match_list = [
    [name, _clean_disease_name(name)]
    for name in (item if isinstance(item, str) else item[0] for item in no_match_list)
]

icd_name_list = [[name, _clean_disease_name(name)] for name in icd_name_dict.keys()]

for no_match, no_match_clean in no_match_list:
    max_ratio = 0
    max_disease = ""
    for icd_name, icd_name_clean in icd_name_list:
        ratio = Levenshtein.ratio(no_match_clean, icd_name_clean)
        # Strict ">" keeps the first ICD name that reaches the best ratio.
        if ratio > max_ratio:
            max_ratio = ratio
            max_disease = icd_name
    # (original disease name, best ICD name, ratio between the cleaned names)
    max_match_list.append((no_match, max_disease, max_ratio))


In [182]:
# Keep only the pairs whose similarity ratio is at least 0.85, then map each
# unmatched disease name to its best ICD name.
filter_match_list = [match for match in max_match_list if match[2] >= 0.85]
print(filter_match_list)
match_dict = {match[0]: match[1] for match in filter_match_list}

[('肺泡蛋白质沉积症', '肺泡蛋白沉积症', 0.9230769230769231), ('喘息样支气管炎', '喘息性支气管肺炎', 0.8571428571428571), ('大叶性肺炎', '大叶肺炎', 1.0), ('肺炎球菌肺炎', '肺炎球菌性肺炎', 1.0), ('肺炎杆菌肺炎', '肺炎杆菌性肺炎', 1.0), ('呼吸道异物', '呼吸道内异物', 0.9090909090909091), ('急性肺脓肿', '肺脓肿', 0.8571428571428571), ('金黄色葡萄球菌肺炎', '金黄色葡萄球菌性肠炎', 0.8888888888888888), ('卡氏肺囊虫肺炎', '卡氏肺囊虫性肺炎', 1.0), ('铍中毒', '铍中毒性肺', 0.8571428571428571), ('气管肿瘤', '气管恶性肿瘤', 0.8888888888888888), ('腺病毒肺炎', '腺病毒性肺炎', 1.0), ('乙醚中毒', '醚中毒', 0.8571428571428571), ('衣原体肺炎', '衣原体性肺炎', 1.0), ('支原体肺炎', '支原体性肺炎', 1.0), ('支气管肺炎', '支气管炎', 0.8888888888888888), ('单纯性下肢静脉曲张', '下肢静脉曲张', 0.8571428571428571), ('二尖瓣关闭不全', '风湿性二尖瓣关闭不全', 0.875), ('二尖瓣环钙化', '二尖瓣钙化', 0.9090909090909091), ('肥厚型梗阻性心肌病', '肥厚性梗阻性心肌病', 0.9230769230769231), ('肺动脉口狭窄', '肺动脉狭窄', 0.9090909090909091), ('肥厚型心肌病', '肥厚性心肌病', 0.8888888888888888), ('后天性三尖瓣关闭不全', '先天性三尖瓣关闭不全', 0.8888888888888888), ('急性心功能不全', '心功能不全', 0.9090909090909091), ('急性感染性心内膜炎', '亚急性感染性心内膜炎', 0.9333333333333333), ('胸降主动脉动脉瘤', '胸主动脉动脉瘤', 0.9333333333333333), ('

In [52]:
# Fill in the ICD code for diseases that were resolved by fuzzy matching:
# disease name -> matched ICD name (match_dict) -> code (icd_name_dict).
for disease in disease_list:
    if disease["name"] not in match_dict:
        continue
    matched_icd_name = match_dict[disease["name"]]
    disease["ICD10_code"] = icd_name_dict[matched_icd_name]

In [55]:
# Number of disease records (the loop above only sets a field, it adds no records).
len(disease_list)

8808

In [69]:
# Flatten the per-disease lists into [disease name, drug] and
# [disease name, symptom] pairs, in record order.
disease_drug_list = [
    [record["name"], recommended]
    for record in disease_list
    for recommended in record["recommand_drug"]
]
disease_symptom_list = [
    [record["name"], observed]
    for record in disease_list
    for observed in record["symptom"]
]

In [68]:
# Sanity check: number of [disease name, drug] pairs and the first ten of them.
print(len(disease_drug_list))
print(disease_drug_list[:10])

59467
[['百日咳', '琥乙红霉素片'], ['百日咳', '琥乙红霉素颗粒'], ['百日咳', '百咳静糖浆'], ['百日咳', '穿心莲内酯片'], ['百日咳', '红霉素肠溶片'], ['百日咳', '环酯红霉素片'], ['苯中毒', '布美他尼片'], ['苯中毒', '十一味金色丸'], ['苯中毒', '注射用布美他尼'], ['苯中毒', '注射用呋塞米']]


In [70]:
# Sanity check: number of [disease name, symptom] pairs and the first ten of them.
print(len(disease_symptom_list))
print(disease_symptom_list[:10])

54710
[['肺泡蛋白质沉积症', '紫绀'], ['肺泡蛋白质沉积症', '胸痛'], ['肺泡蛋白质沉积症', '呼吸困难'], ['肺泡蛋白质沉积症', '乏力'], ['肺泡蛋白质沉积症', '毓卓'], ['百日咳', '吸气时有蝉鸣音'], ['百日咳', '痉挛性咳嗽'], ['百日咳', '胸闷'], ['百日咳', '肺阴虚'], ['百日咳', '抽搐']]


In [71]:
# Index the drug descriptions by drug name: name -> {column: value} for every
# column in dd_columns. A repeated drug name is overwritten by the later row.
drug_dict = {}
for _, drug_row in df_drug_description.iterrows():
    drug_dict[drug_row["药品名称"]] = {col: drug_row[col] for col in dd_columns}

In [77]:
# Index the disease records by name, copying every field except the ones
# listed in the exclusion list below.
disease_dict = {}
for disease in disease_list:
    disease_dict[disease["name"]] = {
        field: value
        for field, value in disease.items()
        if field not in ["_id", "do_eat", "not_eat", "recommand_eat", "drug_detail"]
    }

In [79]:
# Load the two insurance tables as all-string frames with blanks for missing cells.
df_insurance_l2, df_insurance_l3 = (
    pd.read_csv(insurance_csv, dtype=str).fillna("")
    for insurance_csv in (
        "d:/pgkb_graph/processed/drug_insurance_L2.csv",
        "d:/pgkb_graph/processed/drug_insurance_L3.csv",
    )
)

In [103]:
# Parse the level-2 insurance rows. A usable row's text contains a full-width
# parenthesised range "（start-end）"; every integer in that range is mapped to
# the row's [code, text-without-the-range] in num_dict.
seq_regex = re.compile(r"（[\w]+-[\w]+）")
seq_list = []
text_list = []
code_list = []
num_dict = {}
for index, row in df_insurance_l2.iterrows():
    text = row["text"]
    code = row["code"]

    seq_match = seq_regex.search(text)
    if seq_match is None:
        continue  # row carries no range marker

    seq = seq_match.group(0)
    # Drop the surrounding full-width parentheses; \w cannot match "-", so the
    # split always yields exactly two parts.
    start, end = seq[1:-1].split("-")
    try:
        # \w also admits letters and CJK characters, so validate here instead
        # of letting int() abort the whole cell later.
        first_num, last_num = int(start), int(end)
    except ValueError:
        continue

    text = text.replace(seq, "")
    seq_list.append([start, end])
    text_list.append(text)
    code_list.append(code)
    # Rows are processed in order, so a later row overwrites overlapping numbers.
    for i in range(first_num, last_num + 1):
        num_dict[i] = [code, text]

zip_list = list(zip(seq_list, code_list, text_list))

In [108]:
# Build insurance_dict: drug name -> [code, class, category code, category text].
# The category is looked up in num_dict by the first run of word characters in
# the row's "编号" value; both category fields stay blank when that fails.
num_regex = re.compile(r"[\w]+")  # raw string: "\w" is not a valid str escape
insurance_dict = {}
for index, row in df_insurance_l3.iterrows():
    code = row["编号"]
    clazz = row["甲乙"]
    name = row["药品名称"]

    category_num = ""
    category = ""
    num_match = num_regex.search(code)
    if num_match is not None:
        try:
            category_num, category = num_dict[int(num_match.group(0))]
        except (ValueError, KeyError):
            # Not an integer, or a number outside every parsed range:
            # keep the blank category fields.
            pass

    # A repeated drug name is overwritten by the later row.
    insurance_dict[name] = [code, clazz, category_num, category]

In [151]:
# Match every drug to its most similar insurance-catalogue name (Levenshtein
# similarity ratio on cleaned names) and record the insurance fields on the drug.

# Punctuation / whitespace stripped before comparison.
# NOTE(review): inside this class `\)-\/` is a range from ")" to "/", so "*",
# "+", "," and "." are stripped as well. Kept as-is.
_DRUG_PUNCT_RE = re.compile(r"[\(\)-\/（）\-\[\]\s、]")
# Dosage-form words removed as whole words. This must be an alternation group:
# written as a character class it would delete every single listed character
# (e.g. "口", "液", "乳") wherever it occurs in a name.
_DOSAGE_FORM_RE = re.compile(
    r"(?:片|注射液|颗粒|滴剂|胶囊|散剂|混悬液|乳剂|剂|膏|丸|口服溶液|口服液|咀嚼|泡腾)")

SUBSTRING_BONUS = 0.15           # added when the insurance name is contained in the drug name
INSURANCE_MATCH_THRESHOLD = 0.85  # a drug counts as insured only above this score


def _clean_drug_name(name):
    """Return `name` without punctuation/whitespace and dosage-form words."""
    return _DOSAGE_FORM_RE.sub("", _DRUG_PUNCT_RE.sub("", name))


# Clean each insurance name once instead of once per drug. Names that clean to
# an empty string are dropped: "" is a substring of everything, so they would
# always collect the bonus without carrying any information.
insurance_clean_list = [
    (insurance, insurance_clean)
    for insurance, insurance_clean in (
        (insurance, _clean_drug_name(insurance)) for insurance in insurance_dict.keys()
    )
    if insurance_clean
]

for drug, drug_info in drug_dict.items():
    max_ratio = 0
    max_insurance = ""
    drug_clean = _clean_drug_name(drug)

    for insurance, insurance_clean in insurance_clean_list:
        ratio = Levenshtein.ratio(drug_clean, insurance_clean)
        if insurance_clean in drug_clean:
            ratio += SUBSTRING_BONUS
        # Strict ">" keeps the first insurance name that reaches the best score.
        if ratio > max_ratio:
            max_ratio = ratio
            max_insurance = insurance

    if max_ratio > INSURANCE_MATCH_THRESHOLD:
        drug_info["是否医保"] = "是"
        drug_info["医保药品名"] = max_insurance
        drug_info["甲乙"] = insurance_dict[max_insurance][1]
        drug_info["医保药品种类"] = insurance_dict[max_insurance][3]
    else:
        drug_info["是否医保"] = "否"
        drug_info["医保药品名"] = ""
        drug_info["甲乙"] = ""
        drug_info["医保药品种类"] = ""