In [41]:
import pandas as pd
import re
import json
import requests
from collections import defaultdict
import Levenshtein


In [55]:
df_icd = pd.read_csv("ICD10.csv", sep="\t", header=0, names=["code", "disease"])

df_icd = df_icd[df_icd.code.str.match("^[A-Z]")]
df_icd = df_icd[df_icd.code.str.match("\w+(\.| )")]

In [56]:
df_icd = df_icd.sort_values(by="code")
df_icd.index = range(len(df_icd))

In [57]:
df_icd.drop_duplicates().to_csv("ICD10_filter.csv", index=False)

In [32]:
df_icd = pd.read_csv("c:/Users/zhangke1/Desktop/ICD-11.csv", dtype=str).fillna("")
df_icd = df_icd[df_icd["有效码"] == "是"]

In [33]:
df_icd["code"] = df_icd["章节或编码"]
df_icd["disease"] = df_icd["中文名称"]
df_icd = df_icd[["code", "disease"]]

In [37]:
df_icd.index=range(len(df_icd))
df_icd.sort_values(by=["code"])


In [18]:
with open("d:/disease-kb/processed/disease_dict.json", "r", encoding="utf-8") as f:
    disease_dict = json.load(f)

In [22]:
print("length of disease data: ", len(disease_dict.keys()))

length of disease data:  8807


In [35]:
print("disease fully match ICD 11: ", len(set(df_icd["disease"].values) & set(disease_dict.keys())))

disease fully match ICD 11:  723


In [36]:
len(list(filter(lambda x: x, [True if disease_dict[x]["ICD10_code"] != "" else False for x in disease_dict.keys()])))

3549

In [43]:
disease_list = list(df_icd["disease"].values)

brac_regex = re.compile("（[^)]*）")

def clean_up(disease):
    disease = re.sub(brac_regex, "", disease)
    disease = disease.split("，")[0]
    disease = disease.split("：")[0]
    return disease.strip()

clean_disease_list = [clean_up(x) for x in disease_list]

In [44]:
df_icd["clean"] = clean_disease_list

In [47]:
match_list = set(df_icd["clean"].values) & set(disease_dict.keys())
print("disease fully match ICD 11: ", len(match_list))

disease fully match ICD 11:  1171


In [None]:
# TODO 可以尝试用最短编辑距离率来匹配疾病和ICD名。

In [45]:
# df_icd.to_csv("ICD11_filter.csv", index=False)

In [5]:
df_icd10 = pd.read_excel("医保ICD10_v2.0_0122.xlsx", sheet_name="完整分类与代码")

In [7]:
df_icd10_chapter = df_icd10[["章", "章代码范围", "章的名称"]]
df_icd10_chapter = df_icd10_chapter.drop_duplicates()
df_icd10_section = df_icd10[["节代码范围", "节名称"]]
df_icd10_section = df_icd10_section.drop_duplicates()
df_icd10_class = df_icd10[["类目代码", "类目名称"]]
df_icd10_class = df_icd10_class.drop_duplicates()
df_icd10_subclass = df_icd10[["亚目代码", "亚目名称"]]
df_icd10_subclass = df_icd10_subclass.drop_duplicates()
df_icd10_code = df_icd10[["诊断代码", "诊断名称"]]

In [22]:
icd10_code_dict = dict(zip(
    list(df_icd10_code["诊断名称"].values),
    list(df_icd10_code["诊断代码"].values)
))

icd10_subclass_dict = dict(zip(
    list(df_icd10_subclass["亚目名称"].values),
    list(df_icd10_subclass["亚目代码"].values)
))

icd10_class_dict = dict(zip(
    list(df_icd10_class["类目名称"].values),
    list(df_icd10_class["类目代码"].values)
))

icd10_section_dict = dict(zip(
    list(df_icd10_section["节名称"].values),
    [x.split("-") for x in df_icd10_section["节代码范围"].values]
))

icd10_chapter_dict = {}
for index, row in df_icd10_chapter.iterrows():
    icd10_chapter_dict[row["章的名称"]] = [row["章"], row["章代码范围"].split("-")[0], row["章代码范围"].split("-")[1]]

In [23]:
icd10_chapter_dict

{'某些传染病和寄生虫病': [1, 'A00', 'B99'],
 '肿瘤': [2, 'C00', 'D48'],
 '血液及造血器官疾病和涉及免疫机制的某些疾患': [3, 'D50', 'D89'],
 '内分泌、营养和代谢疾病': [4, 'E00', 'E90'],
 '精神和行为障碍': [5, 'F00', 'F99'],
 '神经系统疾病': [6, 'G00', 'G99'],
 '眼和附器疾病': [7, 'H00', 'H59'],
 '耳和乳突疾病': [8, 'H60', 'H95'],
 '循环系统疾病': [9, 'I00', 'I99'],
 '呼吸系统疾病': [10, 'J00', 'J99'],
 '消化系统疾病': [11, 'K00', 'K93'],
 '皮肤和皮下组织疾病': [12, 'L00', 'L99'],
 '肌肉骨骼系统和结缔组织疾病': [13, 'M00', 'M99'],
 '泌尿生殖系统疾病': [14, 'N00', 'N99'],
 '妊娠、分娩和产褥期': [15, 'O00', 'O99'],
 '起源于围生期的某些情况': [16, 'P00', 'P96'],
 '先天性畸形、变形和染色体异常': [17, 'Q00', 'Q99'],
 '症状、体征和临床与实验室异常所见，不可归类在他处者': [18, 'R00', 'R99'],
 '损伤、中毒和外因的某些其他后果': [19, 'S00', 'T98'],
 '用于特殊目的的编码': [22, 'U00', 'U85'],
 '疾病和死亡的外因': [20, 'V01', 'Y98'],
 '影响健康状态和与保健机构接触的因素': [21, 'Z00', 'Z99']}

In [26]:
req = requests.get("https://zstp.pcl.ac.cn:8002/load_tree/ICD10")

In [37]:
node_list = json.loads(req.content)["nodes"]

node_list = list(map(lambda x: {
    "type": "class" if "class" in x["icon"] else "disease",
    "parent_id": x["pId"],
    "id": x["id"],
    "name": x["name"]
    }, node_list))

In [38]:
list(filter(lambda x: "伤寒" in x["name"], node_list))

[{'type': 'class', 'parent_id': 469, 'id': 5, 'name': 'A01.0 伤寒'},
 {'type': 'class', 'parent_id': 469, 'id': 6, 'name': 'A01.3 副伤寒丙'},
 {'type': 'class', 'parent_id': 469, 'id': 7, 'name': 'A01.4 未特指的副伤寒'},
 {'type': 'class',
  'parent_id': 527,
  'id': 179,
  'name': 'A75.0 普氏立克次体引起的流行性虱媒介的斑疹伤寒'},
 {'type': 'class',
  'parent_id': 527,
  'id': 180,
  'name': 'A75.2 地方性斑疹伤寒立克次体引起的斑疹伤寒'},
 {'type': 'class',
  'parent_id': 527,
  'id': 181,
  'name': 'A75.3 恙虫病立克次体引起的斑疹伤寒'},
 {'type': 'class', 'parent_id': 527, 'id': 182, 'name': 'A75.9 未特指的斑疹伤寒'},
 {'type': 'class', 'parent_id': 4, 'id': 469, 'name': 'A01 伤寒和副伤寒'},
 {'type': 'class', 'parent_id': 528, 'id': 527, 'name': 'A75 斑疹伤寒'},
 {'type': 'disease', 'parent_id': 5, 'id': 148092, 'name': '伤寒'},
 {'type': 'disease', 'parent_id': 5, 'id': 148093, 'name': '伤寒感染'},
 {'type': 'disease', 'parent_id': 7, 'id': 148569, 'name': '副伤寒'},
 {'type': 'disease', 'parent_id': 6, 'id': 148570, 'name': '副伤寒'},
 {'type': 'disease', 'parent_id': 180, '

In [40]:
disease_node_list = list(filter(lambda x: x["type"] == "disease", node_list))
icd_node_list = list(filter(lambda x: x["type"] == "class", node_list))

In [53]:
disease_dict = defaultdict(dict)
for dis in disease_node_list:
    pid = dis["parent_id"]
    icd_name = list(filter(lambda x: x["id"] == pid, icd_node_list))[0]["name"]
    icd_code, icd_name = icd_name.strip("\t").split(" ")
    disease_dict[dis["name"]][icd_code] = icd_name

In [49]:
disease_dict["气管肿瘤"]

['D38.1 气管、支气管和肺动态未定或动态未知的肿瘤']

In [55]:
with open("processed/icd10_disease_dict.json", "w") as f:
    json.dump(disease_dict, f)

In [54]:
disease_dict

defaultdict(dict,
            {'DMD': {'G71.0': '肌营养不良'},
             '13 -三体综合征': {'Q91.7': '未特指的帕套综合征'},
             '18-三体综合征': {'Q91.3': '未特指的爱德华兹综合征'},
             '21-轻化酶缺乏症': {'E25.0': '先天性肾上腺性征疾患伴有酶缺乏'},
             '3D眩晕症': {'T75.3': '晕动病'},
             'Alagille综合征': {'Q44.7': '肝的其他先天性畸形'},
             'Alport综合征': {'Q87.8': '其他特指的先天性畸形综合征,不可归类在他处者',
              'N07.8': '遗传性肾病,不可归类在他处者,其他'},
             'Apert综合征': {'Q87.0': '主要影响面部外貌的先天性畸形综合征',
              'Q75.8': '颅和面骨的其他特指先天性畸形'},
             'Arnold-Chiari畸形': {'G93.5': '脑受压',
              'Q04.8': '脑其他特指的先天性畸形',
              'Q75.8': '颅和面骨的其他特指先天性畸形',
              'Q07.0': '阿-基综合征'},
             'Austin型幼儿脑硫脂病': {'E88.8': '其他特指的代谢紊乱'},
             'A—V综合征': {'H50.8': '其他特指的斜视'},
             'Barrett食管': {'K20': '食管炎', 'K22.7': '巴雷特食管'},
             'Bartter综合征': {'E26.8': '其他醛固酮过多症'},
             'Bazin硬红斑': {'L53.8': '其他特指的红斑性情况',
              'L52': '结节性红斑',
              'A18.4': '皮肤和皮下组织的结核'},


In [57]:
import sys
sys.path.insert(0, "src")
import icd_mapping

def get_all_drug():
    df_all_nmpa = pd.read_csv("processed/all_nmpa_info.csv", dtype=str).fillna("")
    df_import = pd.read_csv("d:/pgkb_graph/processed/imported_drug.csv", dtype=str).fillna("")
    all_drug_list = list(set(list(df_all_nmpa["name"].values) + list(df_import["drug_name"].values)))
    return all_drug_list


with open("processed/all_disease_drug_desc.json", "r") as f:
    disease_dict = json.load(f)

xywy_disease_drug_list = []
for key, value in disease_dict.items():
    if key == "":
        continue

    for val in value["drug"]:
        xywy_disease_drug_list.append([key, val])

print(len(xywy_disease_drug_list))

with open("processed/disease_drug_list.json", "r") as f:
    disease_drug_list = json.load(f)

print(len(disease_drug_list))

all_disease_drug_list = xywy_disease_drug_list + disease_drug_list
df_disease_drug = pd.DataFrame(all_disease_drug_list, columns=["disease", "drug"])
df_disease_drug = df_disease_drug.drop_duplicates()
print(len(df_disease_drug))

all_drug_list = get_all_drug()

df_disease_drug = df_disease_drug[df_disease_drug["drug"].isin(all_drug_list)]
print(len(df_disease_drug))

all_disease_list = list(set(df_disease_drug["disease"].values))

icd_map = icd_mapping.icdMapping()
disease_icd_dict = icd_map.disease_dict

69956
59467
84033
81065


In [63]:
matched_disease_set = set(all_disease_list) & set(disease_icd_dict.keys())
len(matched_disease_set)

4105

In [68]:
len(disease_icd_dict.keys())
no_match_1_list = sorted(list(set(disease_icd_dict.keys()) - matched_disease_set))
no_match_1_list

['13 -三体综合征',
 '18-三体综合征',
 '21-轻化酶缺乏症',
 '3D眩晕症',
 'Alagille综合征',
 'Alport综合征',
 'Apert综合征',
 'Arnold-Chiari畸形',
 'A—V综合征',
 'Bartter综合征',
 'Caroli病',
 'Chediak-Higashi综合征',
 'Criger-Najjar综合征',
 'DMD',
 'DiGeorge综合征',
 'GSDⅠa型',
 'GSDⅡ型',
 'GSDⅣ型',
 'GSDⅥ型',
 'Gardner综合征',
 'HIV感染',
 'HUGHES—STOVIN综合征',
 'IgA缺陷',
 'IgA肾病',
 'I型新月体肾炎',
 'Klippel-Trenaunay综合征',
 'Lennox-Gastaut综合征',
 'Loffler综合征',
 'Mallory-Weiss综合征',
 'Marfan综合征',
 'Marie—Bamberger综合征',
 'McCune-Albright综合征',
 'Milk—Alkali综合征',
 'Mollaret脑膜炎',
 'Morton跖头痛',
 'Noonan综合征',
 'Pasini-Pierni进行性特发性皮肤萎缩',
 'QT间期延长综合征',
 'Reis—Bucklers角膜营养不良',
 'ST段抬高型心肌梗死',
 'Stevens-Johnson综合征',
 'TINU综合征',
 'Turcot综合征',
 'Turner综合征',
 'T细胞淋巴瘤',
 'WPW综合征',
 'Walker-Warburg综合征',
 'X连锁无丙种球蛋白血症',
 'Zellweger综合征',
 'hunt综合征',
 'mu外翻',
 'poems综合征',
 'x综合征',
 'α+地中海贫血',
 'α储存池（α-SPD）',
 'β地中海贫血',
 '三叉神经良性肿瘤',
 '三尖瓣狭窄',
 '三尖瓣闭锁',
 '三尖瓣闭锁不全',
 '三度房室传导阻滞',
 '三房心',
 '上皮-肌上皮癌',
 '上睑下垂',
 '上腔静脉受压综合征',
 '上颌窦后鼻孔息肉',
 '上颌窦牙源性囊肿',
 '上颌骨骨折',
 '下丘脑综合征',
 '下咽

In [69]:
no_match_2_list = sorted(list(set(all_disease_list) - matched_disease_set))
no_match_2_list

['A-V综合征',
 'ADH分泌过多综合征',
 'A链球菌群感染',
 'Behcet病',
 'B链球菌群感染',
 'CD30阳性皮肤T细胞淋巴瘤',
 'Creutzfeldt-Jakob病',
 'DF-2败血症的皮肤表现',
 'Ehler-Danlos综合征',
 'Felty综合征',
 'Friedreich共济失调症',
 'Goodpasture综合征',
 'Graves病',
 'H7N9型禽流感',
 'H7n7',
 'Hughes-Stovin综合征',
 'II型糖尿病',
 'II型肾小管性酸中毒',
 'I型糖尿病',
 'I型肾小管性酸中毒',
 'Kaposis肉瘤',
 'Maffucci综合征',
 'Marie-Bamberger综合征',
 'Milk-Alkali综合征',
 'PUVA雀斑样痣',
 'Pasini-Pierni进行性皮肤萎缩',
 'Pick病和额颞痴呆',
 'Plummer-Vinson综合征',
 'Q热肺炎',
 'Reis-Bucklers角膜营养不良',
 'Reiter综合征',
 'Shy-Drager综合征',
 'Usher综合征',
 'X综合征',
 'c胰岛功能性β细胞瘤',
 'α-贮存池病',
 'α1-抗胰蛋白酶缺乏症',
 'β-氨基酸尿',
 'β受体亢进症',
 'δ-贮存池病',
 'Ⅰ型免疫母细胞性淋巴腺病',
 'Ⅰ型超敏反应性疾病',
 'Ⅱ型免疫母细胞性淋巴腺病',
 'Ⅲ型超敏反应性疾病',
 'Ⅳ型肾小管性酸中毒',
 'Ⅳ型超敏反应性疾病',
 '一度房室传导阻滞',
 '丁型病毒性肝炎',
 '三焦咳',
 '三环类抗忧郁药中毒',
 '三痹',
 '三高症',
 '上皮样囊肿',
 '上皮样肉瘤',
 '上肢深静脉血栓形成',
 '上腔静脉综合征',
 '上颈椎不稳症',
 '上颌窦恶性肿瘤',
 '下呼吸道感染',
 '下消',
 '下消化道出血',
 '下焦湿热',
 '下肢淋巴水肿',
 '下肢静脉曲张',
 '下肢静脉血栓形成',
 '下背部痛',
 '下颌下隙感染',
 '不均匀性脂肪肝',
 '不孕不育',
 '不射精症',
 '丙型病毒性肝炎',
 '丙型肝炎病毒感染与肾小球肾炎',
 '丙酮酸激酶缺乏症',
 '东方马

In [73]:
def str_clean(dis_str):
    return dis_str.replace("—", "-").replace(" ", "").replace("\n", "").replace("\t", "") \
            .replace("［", "").replace("］", "").replace("[", "").replace("]", "").lower()

no_match_2_clean_list = [str_clean(x) for x in no_match_2_list]
no_match_2_dict = dict(zip(no_match_2_list, no_match_2_clean_list))
no_match_1_clean_list = [str_clean(x) for x in no_match_1_list]
no_match_1_dict = dict(zip(no_match_1_list, no_match_1_clean_list))

In [79]:
dis_match_dict = {}
for dis_2, dis_clean_2 in no_match_2_dict.items():
    max_ratio = 0
    max_dis = ""

    for dis_1, dis_clean_1 in no_match_1_dict.items():

        ratio = Levenshtein.ratio(dis_clean_2, dis_clean_1)

        if dis_clean_1 in dis_clean_2 or dis_clean_2 in dis_clean_1 :
            ratio += 0.15

        if ratio > max_ratio and ratio > 0.85:
            max_ratio = ratio
            max_dis = dis_1
    
    if max_dis != "":
        dis_match_dict[dis_2] = max_dis
    

In [86]:
# key是已有疾病，value是cmekg疾病
dis_match_dict
print(len(dis_match_dict) + len(matched_disease_set))
print(len(all_disease_list))


4503
7599


In [87]:
with open("processed/cmekg_disease_drug_list.json", "r") as f:
    cmekg_disease_drug_list = json.load(f)


In [90]:
print(len(cmekg_disease_drug_list))
def get_all_drug():
    df_all_nmpa = pd.read_csv("processed/all_nmpa_info.csv", dtype=str).fillna("")
    df_import = pd.read_csv("d:/pgkb_graph/processed/imported_drug.csv", dtype=str).fillna("")
    all_drug_list = list(set(list(df_all_nmpa["name"].values) + list(df_import["drug_name"].values)))
    return all_drug_list
all_drug_list = get_all_drug()

686


In [92]:
len(all_drug_list)

18104

In [96]:
cmekg_drug = set(map(lambda x: x[1], cmekg_disease_drug_list))

In [102]:
d_match_dict = {}
for cd in cmekg_drug:
    if len(cd) <= 1 and cd not in ["葡萄糖", "干扰素"]:
        continue
    d_match_list = list(filter(lambda x: cd.lower() in x.lower(), all_drug_list))
    if len(d_match_list) > 0:
        d_match_dict[cd] = d_match_list

In [104]:
cmekg_disease_drug_dict = defaultdict(list)
for cddl in cmekg_disease_drug_list:
    if cddl[1] in d_match_dict.keys():
        for md in d_match_dict[cddl[1]]:
            cmekg_disease_drug_dict["disease"].append(cddl[0])
            cmekg_disease_drug_dict["drug"].append(md)

In [107]:
pd.DataFrame(cmekg_disease_drug_dict).to_csv("processed/cmekg_disease_drug.csv", index=False)