In [1]:
import pandas as pd
import json
import os
from fnmatch import fnmatch
import Levenshtein
import re

In [2]:
os.getcwd()

'D:\\disease-kb'

In [3]:
# read drug description
dd_columns = ["药品名称", "成份", "性状", "适应症", "用法用量", "不良反应", "禁忌", "注意事项",
                  "孕妇及哺乳期妇女用药", "儿童用药", "老年用药", "贮藏", "规格", "药物相互作用",
                  "药理毒理", "药代动力学", "药物过量", "有效期", "包装", "执行标准"]
df_drug_description = pd.DataFrame(columns=dd_columns)
for fn in os.listdir("d:/pgkb_graph/processed"):
    if fnmatch(fn, "*drug_description_detail_*.csv"):
        df_drug_description = pd.concat(
            [df_drug_description,
             pd.read_csv(os.path.join("d:/pgkb_graph/processed", fn), dtype=str).fillna("")],
             ignore_index=True,
             axis=0)
df_drug_description = df_drug_description.fillna("")

In [11]:
len(df_drug_description)

12761

In [12]:
# read disease details
disease_list = []
with open("data/medical.json", "r", encoding="utf-8") as f:
    for line in f.read().split("\n"):
        try:
            line_json = json.loads(line, strict=False)
            disease_list.append(line_json)
        except:
            pass
        
len(disease_list)

8808

In [13]:
# read ICD code
df_icd = pd.read_csv("ICD10_filter.csv")
icd_name_dict = dict(zip(list(df_icd["disease"].values),
                         list(df_icd["code"].values)))

In [14]:
no_match_list = []
for disease in disease_list:
    if disease["name"] in icd_name_dict.keys():
        disease["ICD10_code"] = icd_name_dict[disease["name"]]
    else:
        no_match_list.append(disease["name"])
        disease["ICD10_code"] = ""

In [15]:
len(no_match_list)

6585

In [176]:
# 最短编辑比率
max_match_list = []
no_match_list = list(map(lambda x: [
    x,
    re.sub(r"[性|病|症|征]", "", re.sub(r"[\(\)-\/（）\-\[\]\s、]", "", x))
    ], no_match_list))

icd_name_list = list(map(lambda x: [
    x,
    re.sub(r"[性|病|症|征]", "", re.sub(r"[\(\)-\/（）\-\[\]\s、]", "", x))
    ], icd_name_dict.keys()))

for no_match, no_match_clean in no_match_list:
    max_ratio = 0
    max_disease = ""
    for disease, disease_clean in icd_name_list:        
        ratio = Levenshtein.ratio(no_match_clean, disease_clean)
        if ratio > max_ratio:
            max_ratio = ratio
            max_disease = disease
    max_match_list.append((no_match, max_disease, max_ratio))


In [182]:
# 过滤最短编辑比率大于0.88的字符串组
filter_match_list = list(filter(lambda x: x[2] >= 0.85, max_match_list))
print(filter_match_list)
match_dict = dict(list(map(lambda x: (x[0], x[1]), filter_match_list)))

[('肺泡蛋白质沉积症', '肺泡蛋白沉积症', 0.9230769230769231), ('喘息样支气管炎', '喘息性支气管肺炎', 0.8571428571428571), ('大叶性肺炎', '大叶肺炎', 1.0), ('肺炎球菌肺炎', '肺炎球菌性肺炎', 1.0), ('肺炎杆菌肺炎', '肺炎杆菌性肺炎', 1.0), ('呼吸道异物', '呼吸道内异物', 0.9090909090909091), ('急性肺脓肿', '肺脓肿', 0.8571428571428571), ('金黄色葡萄球菌肺炎', '金黄色葡萄球菌性肠炎', 0.8888888888888888), ('卡氏肺囊虫肺炎', '卡氏肺囊虫性肺炎', 1.0), ('铍中毒', '铍中毒性肺', 0.8571428571428571), ('气管肿瘤', '气管恶性肿瘤', 0.8888888888888888), ('腺病毒肺炎', '腺病毒性肺炎', 1.0), ('乙醚中毒', '醚中毒', 0.8571428571428571), ('衣原体肺炎', '衣原体性肺炎', 1.0), ('支原体肺炎', '支原体性肺炎', 1.0), ('支气管肺炎', '支气管炎', 0.8888888888888888), ('单纯性下肢静脉曲张', '下肢静脉曲张', 0.8571428571428571), ('二尖瓣关闭不全', '风湿性二尖瓣关闭不全', 0.875), ('二尖瓣环钙化', '二尖瓣钙化', 0.9090909090909091), ('肥厚型梗阻性心肌病', '肥厚性梗阻性心肌病', 0.9230769230769231), ('肺动脉口狭窄', '肺动脉狭窄', 0.9090909090909091), ('肥厚型心肌病', '肥厚性心肌病', 0.8888888888888888), ('后天性三尖瓣关闭不全', '先天性三尖瓣关闭不全', 0.8888888888888888), ('急性心功能不全', '心功能不全', 0.9090909090909091), ('急性感染性心内膜炎', '亚急性感染性心内膜炎', 0.9333333333333333), ('胸降主动脉动脉瘤', '胸主动脉动脉瘤', 0.9333333333333333), ('

In [52]:
for disease in disease_list:
    if disease["name"] in match_dict.keys():
        disease["ICD10_code"] = icd_name_dict[match_dict[disease["name"]]]

In [55]:
len(disease_list)

8808

In [69]:
disease_drug_list = []
disease_symptom_list = []
for disease in disease_list:
    for drug in disease["recommand_drug"]:
        disease_drug_list.append([disease["name"], drug])
    for symptom in disease["symptom"]:
        disease_symptom_list.append([disease["name"], symptom])

In [68]:
print(len(disease_drug_list))
print(disease_drug_list[:10])

59467
[['百日咳', '琥乙红霉素片'], ['百日咳', '琥乙红霉素颗粒'], ['百日咳', '百咳静糖浆'], ['百日咳', '穿心莲内酯片'], ['百日咳', '红霉素肠溶片'], ['百日咳', '环酯红霉素片'], ['苯中毒', '布美他尼片'], ['苯中毒', '十一味金色丸'], ['苯中毒', '注射用布美他尼'], ['苯中毒', '注射用呋塞米']]


In [70]:
print(len(disease_symptom_list))
print(disease_symptom_list[:10])

54710
[['肺泡蛋白质沉积症', '紫绀'], ['肺泡蛋白质沉积症', '胸痛'], ['肺泡蛋白质沉积症', '呼吸困难'], ['肺泡蛋白质沉积症', '乏力'], ['肺泡蛋白质沉积症', '毓卓'], ['百日咳', '吸气时有蝉鸣音'], ['百日咳', '痉挛性咳嗽'], ['百日咳', '胸闷'], ['百日咳', '肺阴虚'], ['百日咳', '抽搐']]


In [4]:
drug_dict = {}
for index, row in df_drug_description.iterrows():
    drug_name = row["药品名称"]
    drug_dict[drug_name] = {}
    for col in dd_columns:
        drug_dict[drug_name][col] = row[col]

In [17]:
disease_dict = {}
for disease in disease_list:
    disease_dict[disease["name"]] = {}
    for key in disease.keys():
        if key in ["_id", "do_eat", "not_eat", "recommand_eat", "drug_detail"]:
            continue
        disease_dict[disease["name"]][key] = disease[key]

In [4]:
df_insurance_l2 = pd.read_csv("d:/pgkb_graph/processed/drug_insurance_L2.csv", dtype=str).fillna("")
df_insurance_l3 = pd.read_csv("d:/pgkb_graph/processed/drug_insurance_L3.csv", dtype=str).fillna("")

In [5]:
seq_regex = re.compile(r"（[\w]+-[\w]+）")
seq_list = []
text_list = []
code_list = []
for index, row in df_insurance_l2.iterrows():
    try:
        text = row["text"]
        code = row["code"]
        seq = re.findall(seq_regex, text)[0]
        text = text.replace(seq, "")
        start, end = seq.replace("（", "").replace("）", "").split("-")
        seq_list.append([start, end])
        text_list.append(text)
        code_list.append(code)
    except:
        continue

zip_list = list(zip(seq_list, code_list, text_list))
num_dict = {}
for x in zip_list:
    for i in range(int(x[0][0]), int(x[0][1]) + 1):
        num_dict[i] = [x[1], x[2]]

In [37]:
num_regex = re.compile("[\w]+")
insurance_dict = {}
for index, row in df_insurance_l3.iterrows():
    code = row["编号"]
    clazz = row["甲乙"]
    name = row["药品名称"]
    dosage_form = row["剂型"]
    try:
        num = re.findall(num_regex, code)[0]
        category_num = num_dict[int(num)][0]
        category = num_dict[int(num)][1]
    except:
        category_num = ""
        category = ""
    insurance_dict["{}--{}".format(name, dosage_form)] = [code, clazz, category_num, category]

In [47]:
# 最短编辑比率
# max_match_list = []
for drug in drug_dict.keys():
    max_ratio = 0
    max_insurance = ""
    drug_clean = re.sub(r"[\(\)-\/（）\-\[\]\s、]", "", drug)
    drug_clean = re.sub(r"[片|注射液|颗粒|滴剂|胶囊|散剂|混悬液|乳剂|剂|膏|丸|口服溶液|口服液|咀嚼|泡腾]", "", drug_clean) 
    
    for ori_insurance in insurance_dict.keys():
        insurance, dosage_form = ori_insurance.split("--")
        insurance_clean = re.sub(r"[\(\)-\/（）\-\[\]\s、]", "", insurance)
        insurance_clean = re.sub(r"[片|注射液|颗粒|滴剂|胶囊|散剂|混悬液|乳剂|剂|膏|丸|口服溶液|口服液|咀嚼|泡腾]", "", insurance_clean) 
        
        ratio = Levenshtein.ratio(drug_clean, insurance_clean)
        
        if insurance_clean in drug_clean:
            ratio += 0.15
        
        if ratio > max_ratio:
            max_ratio = ratio
            max_insurance = insurance
            
    # max_match_list.append([drug, max_insurance, max_ratio])
    if max_ratio > 0.85:
        drug_dict[drug]["是否医保"] = "是"
        drug_dict[drug]["医保药品名"] = max_insurance
        drug_dict[drug]["甲乙"] = insurance_dict[ori_insurance][1]
        drug_dict[drug]["医保药品种类"] = insurance_dict[ori_insurance][3]
        drug_dict[drug]["医保药品种类编号"] = insurance_dict[ori_insurance][2]
        drug_dict[drug]["剂型"] = dosage_form
    else:
        drug_dict[drug]["是否医保"] = "否"
        drug_dict[drug]["医保药品名"] = ""
        drug_dict[drug]["甲乙"] = ""
        drug_dict[drug]["医保药品种类"] = ""
        drug_dict[drug]["医保药品种类编号"] = ""
        drug_dict[drug]["剂型"] = ""

12759

In [9]:
df_dc = pd.read_csv("processed/drug_chemical.csv", dtype=str).fillna("")

2883

In [12]:
dc_drug_set = set(df_dc["chn_name"].values)
dd_drug_set = set(drug_dict.keys())

matched_drug_set = dc_drug_set.intersection(dd_drug_set)

dc_drug_set = dc_drug_set - matched_drug_set
dd_drug_set = dd_drug_set - matched_drug_set

In [18]:
drug_regex = re.compile(
    r"[素|口腔|崩解|肠溶|舌下|放射免疫分析药盒|眼用|凝胶|片|注射液|颗粒|滴剂|胶囊|散剂|贴片|凝胶|咀嚼|混悬液|乳剂|剂|膏|丸|口服|口服液|糖浆|咀嚼|泡腾|缓释|分散|滴眼液|溶液|粉雾剂|速释]"
)
dc_match_dict = {}

def get_clean(drug_string, drug_regex):
    drug_clean = re.sub(r"[\(\)-\/（）\-\[\]\s、]", "", drug_string)
    drug_clean = re.sub(drug_regex, "", drug_clean)
    return drug_string, drug_clean

for dc, dc_clean in [get_clean(x, drug_regex) for x in dc_drug_set]:
    max_ratio = 0
    max_dd = ""
    for dd, dd_clean in [get_clean(x, drug_regex) for x in dd_drug_set]:
        ratio = Levenshtein.ratio(dc_clean, dd_clean)
        if ratio > max_ratio:
            max_ratio = ratio
            max_dd = dd
            
    if max_ratio > 0.85:
        dc_match_dict[dc] = max_dd
    else:
        dc_match_dict[dc] = ""

In [19]:
dc_match_dict

{'硫酸氨基葡萄糖泡腾片': '',
 '盐酸依匹斯汀': '',
 '盐酸左卡巴斯汀滴眼液': '盐酸左卡巴斯汀喷剂',
 '羧甲淀粉钠': '羧甲淀粉钠溶液',
 '氨苄西林': '氨苄西林栓',
 '烯丙雌醇': '',
 '非布司他片': '',
 '盐酸洛非西定': '',
 '富马酸伊布利特': '',
 '盐酸平阳霉素': '',
 '右酮洛芬胶囊': '',
 '盐酸托哌酮': '',
 '硫酸钙': '',
 '倍他米松': '倍他米松软膏',
 '磷酸氢钙': '',
 '醋酸钙': '醋酸钙颗粒剂',
 '肝素钠封管注射液': '',
 '他克莫司滴眼液': '',
 '氯沙坦钾': '',
 '奥替拉西钾': '',
 '碘解磷定': '',
 '盐酸雷洛昔芬': '',
 '盐酸莫西沙星片': '',
 '左氧氟沙星滴眼液': '',
 '舒林酸': '',
 '利匹韦林片': '',
 '阿司匹林锌肠溶胶囊': '',
 '扎那米韦吸入粉雾剂': '',
 '缩宫素溶液': '',
 '双氯青霉素钠': '双氯青霉素钠片',
 '利福布汀胶囊': '',
 '氨麻美明分散片': '',
 '甲状腺球蛋白抗体、微粒体抗体放射免疫诊断试剂盒': '',
 '左西孟旦': '',
 '盐酸纳布啡注射液': '',
 '盐酸莫西沙星滴眼液': '',
 '碳酸钙': '碳酸钙口服混悬液',
 '吡嘧司特钾分散片': '',
 '氯诺昔康': '',
 '盐酸莫西沙星注射液': '',
 '塞替派': '',
 '地高辛': '地高辛酏剂',
 '乙胺嘧啶': '',
 '果糖二磷酸钙': '果糖二磷酸钙口服液',
 '阿柏西普眼内注射溶液': '',
 '烯丙雌醇片': '',
 '孟鲁司特钠颗粒': '',
 '乌司奴单抗注射液(静脉输注)': '',
 '甲丙氨酯': '甲丙氨酯片',
 '盐酸异丙嗪': '',
 '盐酸氟西汀分散片': '',
 '环索奈德吸入气雾剂': '',
 '依非韦伦片': '',
 '铝镁匹林片(Ⅱ)': '',
 '奥美拉唑钠': '奥美拉唑胶囊',
 '碘[125I]人促黄体生成激素放射免疫分析药盒': '',
 '艾普拉唑': '',
 '雌三醇乳膏': '',
 '甲磺酸伊马替尼': '',
 '牡蛎碳酸

In [38]:
drug_interaction_dict = {}
drug_regex_dict = {}
drug_regex = re.compile(
    r"[喉片|素|口腔|崩解|肠溶|舌下|放射免疫分析药盒|眼用|凝胶|片|注射液|颗粒|滴剂|胶囊|散剂|贴片|凝胶|咀嚼|混悬液|乳剂|剂|膏|丸|口服|口服液|糖浆|咀嚼|泡腾|缓释|分散|滴眼液|溶液|粉雾剂|速释]"
)
for key, value in drug_dict.items():
    drug_interaction_dict[key] = value["药物相互作用"]
    drug_regex_dict[key] = key
    drug_regex_dict[re.sub(drug_regex, "", key)] = key

In [41]:
drug_inter_list = []
for drug, interaction in drug_interaction_dict.items():
    drug_regex = drug_regex_dict[drug]
    if interaction == "":
        continue
    for d in list(filter(lambda x: len(x) > 1 and x not in [drug, "和血", "降宁"] and x in interaction,
                         drug_regex_dict.keys())):
        drug_inter_list.append((drug, d, drug_regex_dict[d], interaction))

In [50]:
aa = list(set(map(lambda x: (x[0], x[2]),drug_inter_list)))

pd.DataFrame(aa, columns=["drug", "interact_drug"])

Unnamed: 0,drug,interact_drug
0,吲哚美辛片,秋水仙碱片
1,复方利福平胶囊,注射用环磷酰胺
2,注射用青霉素钠,琥乙红霉素片
3,双氯芬酸钠栓(Ⅱ),丙磺舒片
4,棕榈氯霉素混悬液,红霉素眼膏
...,...,...
12056,小儿双嘧啶片,注射用叶酸
12057,硫酸胍乙啶片,利血平片
12058,醋硝香豆素片,双香豆素片
12059,齐多夫定口服溶液,吲哚美辛贴膏


In [29]:
df_dc = pd.read_csv("processed/drug_chemical.csv", dtype=str).fillna("")
from collections import defaultdict

dc_dict = defaultdict(list)
dc_relation_list = []
for index, row in df_dc.iterrows():
    chemical = row["chemical"].lower()
    drug = row["chn_name"]
    drug_alias = row["name_alias"]

    if chemical != "" and drug != "":
        dc_relation_list.append((chemical, drug))
        dc_dict[chemical].append(drug)

    if chemical != "" and drug_alias != "" and drug_alias != drug:
        dc_relation_list.append((chemical, drug_alias))
        dc_dict[chemical].append(drug_alias)

dc_relation_list = list(set(dc_relation_list))

In [30]:
dc_dict

defaultdict(list,
            {'thioridazine': ['盐酸硫利哒嗪', '盐酸硫利哒嗪片'],
             'chloride': ['盐酸阿扎司琼氯化钠注射液',
              '葡萄糖氯化钠注射液',
              '甲硝唑氯化钠注射液',
              '甲磺酸左氧氟沙星氯化钠注射液',
              '氯化钠注射液',
              '羟乙基淀粉40氯化钠注射液',
              '氯化钾氯化钠注射液',
              '氯化铵片',
              '乳酸环丙沙星氯化钠注射液',
              '氟康唑氯化钠注射液',
              '缩合葡萄糖氯化钠注射液',
              '苦参碱氯化钠注射液',
              '乙酰谷酰胺氯化钠注射液',
              '乳酸左氧氟沙星氯化钠注射液',
              '右旋糖酐40氯化钠注射液',
              '盐酸川芎嗪氯化钠注射液',
              '羟乙基淀粉20氯化钠注射液',
              '注射用亚锡焦磷酸钠',
              '加替沙星氯化钠注射液',
              '喷托维林氯化铵糖浆',
              '氧氟沙星氯化钠注射液',
              '吡拉西坦氯化钠注射液',
              '输血用氯化钠注射液',
              '盐酸洛美沙星氯化钠注射液',
              '亚叶酸钙氯化钠注射液',
              '复方甘草酸单铵S氯化钠注射液',
              '生理氯化钠溶液',
              '盐酸昂丹司琼氯化钠注射液',
              '硫酸阿米卡星氯化钠注射液',
              '盐酸普鲁卡因氯化钠注射液',
              '单硝酸异山梨酯氯化钠注射液',
              '氯化钾注射液',
          

In [31]:
df_warning = pd.read_csv("processed/fda_warning.csv", dtype=str).fillna("")
warning_list = [x.lower() for x in df_warning["warning"].values]

In [52]:
warning_split_list = [x.split(" ") for x in warning_list]
common_chemical_list = []
for ws in warning_split_list:
    common_chemical_list.append(list(set(ws).intersection(set(dc_dict.keys()))))

In [53]:
df_drug_chemical = pd.read_csv(
    "d:/pgkb_graph/processed/nmpa_drug_chemical.csv",
    encoding="utf-8",
    dtype=str
).fillna("")[["chemical", "eng_name",  "eng_business_name"]]

In [54]:
db_dict = defaultdict(list)
for index, row in df_drug_chemical.iterrows():
    chemical = row["chemical"]
    drug_name_eng = row["eng_name"]
    drug_business_name_eng = row["eng_business_name"].lower()

#     if chemical != "" and drug_name_eng != "":
#         db_dict[chemical].append(drug_name_eng)

    if chemical != "" and drug_business_name_eng != "":
        db_dict[chemical].append(drug_business_name_eng)

In [55]:
db_dict = {x.lower(): list(filter(lambda x: x not in ['-'] ,set(y))) for x, y in db_dict.items()}

In [56]:
common_business_name_list = []
for ws in warning_split_list:
    sub_list = []
    for key, value in db_dict.items():
        intersect = set(ws).intersection(set(value))
        if len(intersect) > 0:
            sub_list.append(key)
    common_business_name_list.append(sub_list)

In [59]:
match_che_list = []
match_drug_list = []
for i in range(len(warning_list)):
    che_list = []
    if len(common_chemical_list[i]) > 0:
        che_list.extend(common_chemical_list[i])
    if len(common_business_name_list[i]) > 0:
        che_list.extend(common_business_name_list[i])
    che_list = list(set(che_list))
    match_che_list.append(che_list)
    drug_list = []
    for che in che_list:
        drug_list.extend(dc_dict[che])
    match_drug_list.append(drug_list)

In [60]:
match_che_list

[['dabigatran'],
 ['ribavirin'],
 [],
 [],
 ['heparin'],
 ['ondansetron'],
 [],
 [],
 [],
 [],
 [],
 ['ondansetron'],
 ['cefepime'],
 [],
 ['lenalidomide'],
 [],
 ['aliskiren'],
 ['drospirenone'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['methylphenidate'],
 [],
 [],
 [],
 [],
 ['tigecycline'],
 [],
 [],
 ['acetaminophen'],
 [],
 [],
 ['olmesartan'],
 ['magnesium'],
 ['zolpidem'],
 [],
 [],
 [],
 ['azithromycin'],
 [],
 ['ziprasidone'],
 [],
 [],
 ['omalizumab'],
 ['lidocaine'],
 [],
 ['olmesartan'],
 ['docetaxel'],
 [],
 ['warfarin', 'dabigatran'],
 [],
 [],
 [],
 [],
 [],
 [],
 ['thyroid'],
 ['clopidogrel'],
 ['entacapone'],
 [],
 [],
 [],
 ['tramadol'],
 ['clozapine'],
 [],
 [],
 [],
 [],
 ['ticagrelor', 'vortioxetine'],
 [],
 [],
 [],
 ['codeine'],
 [],
 [],
 [],
 [],
 ['sofosbuvir', 'amiodarone'],
 ['olanzapine'],
 [],
 ['testosterone'],
 [],
 [],
 [],
 [],
 ['pioglitazone'],
 [],
 [],
 [],
 ['dapagliflozin'],
 ['loperamide'],
 ['aspirin'],
 [],
 [],
 [],
 [],
 ['olanzapine'

In [61]:
match_drug_list

[['达比加群酯胶囊'],
 ['利巴韦林注射液',
  '利巴韦林滴眼液',
  '利巴韦林含片',
  '利巴韦林片',
  '利巴韦林葡萄糖注射液',
  '利巴韦林滴鼻液',
  '利巴韦林颗粒',
  '利巴韦林泡腾颗粒',
  '利巴韦林',
  '利巴韦林颗粒剂',
  '利巴韦林氯化钠注射液',
  '注射用利巴韦林',
  '利巴韦林分散片',
  '利巴韦林喷剂',
  '利巴韦林眼膏',
  '利巴韦林胶囊',
  '利巴韦林口服溶液',
  '利巴韦林颗粒剂'],
 [],
 [],
 ['肝素钠',
  '低分子量肝素钙',
  '低分子肝素钙注射液',
  '低分子量肝素钠注射液',
  '肝素钠注射液',
  '注射用低分子量肝素钙',
  '低分子量肝素钠',
  '低分子肝素钠注射液',
  '肝素钠乳膏',
  '肝素钠封管注射液',
  '肝素钙注射液',
  '注射用肝素钙',
  '低分子量肝素钙注射液',
  '低分子肝素钙',
  '注射用低分子量肝素钠',
  '肝素钠含片',
  '肝素钙',
  '低分子量肝素钠凝胶',
  '低分子肝素钠注射液',
  '复方肝素钠尿囊素凝胶'],
 ['盐酸昂丹司琼氯化钠注射液',
  '盐酸昂丹司琼注射液',
  '盐酸昂丹司琼片',
  '盐酸昂丹司琼胶囊',
  '注射用盐酸昂丹司琼',
  '盐酸昂丹司琼',
  '盐酸昂丹司琼口腔崩解片',
  '盐酸昂丹司琼葡萄糖注射液'],
 [],
 [],
 [],
 [],
 [],
 ['盐酸昂丹司琼氯化钠注射液',
  '盐酸昂丹司琼注射液',
  '盐酸昂丹司琼片',
  '盐酸昂丹司琼胶囊',
  '注射用盐酸昂丹司琼',
  '盐酸昂丹司琼',
  '盐酸昂丹司琼口腔崩解片',
  '盐酸昂丹司琼葡萄糖注射液'],
 ['注射用盐酸头孢吡肟', '盐酸头孢吡肟', '盐酸头孢吡肟/L-精氨酸', '注射用盐酸头孢吡肟/氯化钠注射液'],
 [],
 ['来那度胺胶囊', '来那度胺'],
 [],
 ['阿利吉仑片'],
 ['屈螺酮炔雌醇片', '屈螺酮炔雌醇片(Ⅱ)', '雌二醇屈螺酮片'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['盐酸哌甲酯片', '注

In [72]:
df_warning = df_warning.assign(match_chemical_list=match_che_list)
df_warning = df_warning.assign(match_drug_list=match_drug_list)
link_list = [x if "https://www.fda.gov" in x else "https://www.fda.gov{}".format(x) for x in df_warning["link"].values]
df_warning = df_warning.assign(link=link_list)

In [78]:
drug_warning_dict = {}
for index, row in df_warning.iterrows():
    warning = row["warning"]
    warning_chn = row['warning_chn']
    drug_list = row["match_drug_list"]
    link = row["link"]
    for drug in drug_list:
        if drug not in drug_warning_dict:
            drug_warning_dict[drug] = defaultdict(list)
        drug_warning_dict[drug]["warning"].append(warning)
        drug_warning_dict[drug]["warning_chn"].append(warning_chn)
        drug_warning_dict[drug]["link"].append(link)

In [79]:
for key, value in drug_warning_dict.items():
    for k in value.keys():
        value[k] = " \n".join(["{}. {}".format(i, value[k][i - 1]) for i in range(1, len(value[k]) + 1)])

In [82]:
import json
with open("processed/warning_dict.json", "w") as f:
    json.dump(drug_warning_dict, f)

In [83]:
len(drug_warning_dict)

474

In [84]:
with open("processed/drug_dict.json", "r", encoding="utf-8") as f:
    drug_dict = json.load(f)

In [85]:
drug_dict["气血双补丸"]

{'药品名称': '气血双补丸',
 '成份': '熟地黄、当归、黄芪、何首乌(酒炙)、白芍、党参、白术(麸炒)、丹参、川芎、女贞子(酒炙)、甘草。',
 '性状': '本品为黑褐色小蜜丸；味甜、微苦。',
 '适应症': '',
 '用法用量': '口服，一次9克，一日2次。',
 '不良反应': '',
 '禁忌': '',
 '注意事项': '1.忌油腻食物。2.凡脾胃虚弱，呕吐泄泻，腹胀便溏、咳嗽痰多者慎用。3.感冒病人不宜服用。4.本品宜饭前服用。5.按照用法用量服用，小儿、孕妇、高血压、糖尿病患者应在医师指导下服用。6.服药二周或服药期间症状无改善，或症状加重，或出现新的严重症状，应立即停药并去医院就诊。7.对本品过敏者禁用，过敏体质者慎用。8.本品性状发生改变时禁止使用。9.儿童必须在成人监护下使用。10.请将本品放在儿童不能接触的地方。11.如正在使用其他药品，使用本品前请咨询医师或药师。',
 '孕妇及哺乳期妇女用药': '',
 '儿童用药': '',
 '老年用药': '',
 '贮藏': '',
 '规格': '',
 '药物相互作用': '如与其他药物同时使用可能会发生药物相互作用，详情请咨询医师或药师。',
 '药理毒理': '',
 '药代动力学': '',
 '药物过量': '',
 '有效期': '',
 '包装': '',
 '执行标准': '部标八册',
 '是否医保': '否',
 '医保药品名': '',
 '甲乙': '',
 '医保药品种类': '',
 '医保药品种类编号': '',
 '剂型': ''}