In [22]:
from py2neo import Graph 
import pandas as pd
from collections import defaultdict
import json
import re

In [23]:
class neo4jUtil:
    def __init__(self):
        self.driver = Graph("neo4j://172.16.231.80:7687", auth=("neo4j", "123456"))

    def run_cypher(self, cypher):
        result_list = self.driver.run(cypher).data()
        return result_list
    
n_util = neo4jUtil()

In [73]:
df_warning = pd.read_csv("processed/fda_warning_with_chemical.csv")[["match_chemical_list", "link"]]
df_warning = df_warning[~df_warning['match_chemical_list'].isnull()]
warning_dict = defaultdict(list)

for index, row in df_warning.iterrows():
    chem = row["match_chemical_list"]
    link = row["link"]
    warning_dict[chem].append(link)


In [2]:
df_drug = pd.DataFrame({"drug": list(set(pd.read_csv("drug_report/report_chemicals.csv")["Drug"].values))})

In [3]:
df_translation = pd.read_csv("processed/chemical_translation.csv", dtype=str).fillna("")
df_translation["chemical_name"] = df_translation["chemical_name"].str.strip().str.lower() 
df_translation["cn_chemical_name"] = df_translation["cn_chemical_name"].str.strip().str.lower()

df_drug = pd.merge(df_drug, df_translation, how="left", left_on=["drug"], 
                   right_on=["chemical_name"])[["chemical_name", "cn_chemical_name"]].fillna("")



In [45]:
def get_pc_label(chemical_name):
    # phenotype category label
    query_template = """
    match (m:chemical {{chemical_name: "{chem_name}"}})<-[r:clinical_annotation]-()
    return distinct r.phenotype_category as phenotype_category
    union
    match (m:chemical {{chemical_name: "{chem_name}"}})<-[r:research_annotation]-()
    return distinct r.phenotype_category as phenotype_category
    """.format(chem_name=chemical_name)
    result = n_util.run_cypher(query_template)

    toxicity_flag = False
    efficacy_flag = False
    dosage_flag = False
    metabolism_flag = False
    # label_code pc_toxicity, pc_efficacy, pc_dosage, pc_metabolism
    for x in [res["phenotype_category"].lower() for res in result]:
        if "toxicity" in x:
            toxicity_flag = True
        if "efficacy" in x:
            efficacy_flag = True
        if "dosage" in x:
            dosage_flag = True
        if "metabolism" in x:
            metabolism_flag = True
    return [toxicity_flag, efficacy_flag, dosage_flag, metabolism_flag]

In [50]:
def get_cn_drug_label(chemical_name):
    # cn_drug_label
    query_template = """
    match (m:chemical {{chemical_name: "{chem_name}"}})<-[r:cn_drug_label]-(ge:gene)
    return m.chemical_name as chemical_name, ge.gene_name as gene_name, r.remark as remark
    """.format(chem_name=chemical_name)
    result = n_util.run_cypher(query_template)

    cn_drug_label_flag = False
    genes = ""
    if len(result) > 0:
        cn_drug_label_flag = True
        genes = ",".join([x["gene_name"] for x in result])
        
    return [cn_drug_label_flag, genes]

In [78]:
def fda_warning_insurance_label(chemical_name):
    # FDA warning and medical_insurance
    warning_link = ""
    fda_warning_flag = False
    
    if chemical_name in warning_dict.keys():
        fda_warning_flag = True
        warning_link = " | ".join(warning_dict[chemical_name])

    query_template = """
    match (m:chemical {{chemical_name: "{chem_name}"}})-[r:chemical_drug_relation]->(n:drug)
    return distinct n.in_medical_insurance as in_medical_insurance, n.insurance_level as insurance_level
    """.format(chem_name=chemical_name)
    result = n_util.run_cypher(query_template)
    is_insurance = [x["in_medical_insurance"] for x in result]
    level = [x["insurance_level"] for x in result]

    insurance_flag = False
    insurance_level = ""

    if "是" in is_insurance:
        insurance_flag = True
        insurance_level = ",".join(set(filter(lambda x: x != "", level)))
        
    return [fda_warning_flag, warning_link, insurance_flag, insurance_level]

In [48]:
def get_atc_info(chemical_name):
    # ATC code
    query_template = """
    match (m:chemical {{chemical_name: "{chem_name}"}})
    return m.atc_code as atc_code, m.L1_info as L1_info, m.L1_info_chn as L1_info_chn,
    m.L2_info as L2_info, m.L2_info_chn as L2_info_chn,
    m.L3_info as L3_info, m.L3_info_chn as L3_info_chn,
    m.L4_info as L4_info, m.L4_info_chn as L4_info_chn
    """.format(chem_name=chemical_name)
    result = n_util.run_cypher(query_template)
    
    if len(result) == 0:
        return [""] * 9
    
    result = result[0]
    return [result['atc_code'], result['L1_info'], result['L1_info_chn'], result['L2_info'], result['L2_info_chn'],
            result['L3_info'], result['L3_info_chn'], result['L4_info'], result['L4_info_chn']]


In [79]:
cn_drug_label_list = []
drug_label_gene_list = []
fda_warning_flag_list = []
insurance_flag_list = []
insurance_level_list = []
atc_code_list = []
L1_info_list = []
L1_info_chn_list = []
L2_info_list = []
L2_info_chn_list = []
L3_info_list = []
L3_info_chn_list = []
L4_info_list = []
L4_info_chn_list = []
toxicity_flag_list = []
efficacy_flag_list = []
dosage_flag_list = []
metabolism_flag_list = []
fda_warning_link_list = []

for index, row in df_drug.iterrows():
    chemical_name = row["chemical_name"]
    [toxicity_flag, efficacy_flag, dosage_flag, metabolism_flag] = get_pc_label(chemical_name)
    [cn_drug_label, genes] = get_cn_drug_label(chemical_name)
    [fda_warning_flag, fda_warning_link, insurance_flag, insurance_level] = fda_warning_insurance_label(chemical_name)
    [atc_code, L1_info, L1_info_chn, L2_info, L2_info_chn, L3_info, L3_info_chn, L4_info, L4_info_chn] = get_atc_info(chemical_name)
    
    cn_drug_label_list.append(cn_drug_label)
    drug_label_gene_list.append(genes)
    fda_warning_flag_list.append(fda_warning_flag)
    insurance_flag_list.append(insurance_flag)
    insurance_level_list.append(insurance_level)
    atc_code_list.append(atc_code)
    L1_info_list.append(L1_info)
    L1_info_chn_list.append(L1_info_chn)
    L2_info_list.append(L2_info)
    L2_info_chn_list.append(L2_info_chn)
    L3_info_list.append(L3_info)
    L3_info_chn_list.append(L3_info_chn)
    L4_info_list.append(L4_info)
    L4_info_chn_list.append(L4_info_chn)
    toxicity_flag_list.append(toxicity_flag)
    efficacy_flag_list.append(efficacy_flag)
    dosage_flag_list.append(dosage_flag)
    metabolism_flag_list.append(metabolism_flag)
    fda_warning_link_list.append(fda_warning_link)

In [80]:
df_label = pd.DataFrame({
    "中国药物标签": cn_drug_label_list,
    "中国药物基因影响": drug_label_gene_list,
    "FDA预警标签": fda_warning_flag_list,
    "FDA预警链接": fda_warning_link_list,
    "医保标签": insurance_flag_list,
    "医保级别": insurance_level_list,
    "ATC代码": atc_code_list,
    "ATC level 1": L1_info_list,
    "ATC level 1 中文": L1_info_chn_list,
    "ATC level 2": L2_info_list,
    "ATC level 2 中文": L2_info_chn_list,
    "ATC level 3": L3_info_list,
    "ATC level 3 中文": L3_info_chn_list,
    "ATC level 4": L4_info_list,
    "ATC level 4 中文": L4_info_chn_list,
}).fillna("")

In [81]:
pd.concat([df_drug, df_label], axis=1).to_csv("drug_report/report_label.csv", index=False)

In [5]:
for index, row in pd.read_csv("processed/drug_chemical.csv", encoding="utf-8", dtype=str).fillna("").iterrows():
    cn_name = row["cn_chemical_name"].strip()
    drug_name = row["chn_name"].strip()
    en_name = row["chemical"].lower()
    en_drug_name = row["eng_name"].lower()
    
    if cn_name not in drug_name and en_name not in en_drug_name:
        print("{} -- {}".format(cn_name, drug_name))

In [15]:
import json
with open("c:/Users/zhangke1/Downloads/records.json", "r", encoding="utf-8") as f:
    content = f.read()

In [18]:
df_pharmacology = pd.read_csv("c:/Users/zhangke1/Downloads/export.csv")

In [19]:
df_pharmacology

Unnamed: 0,drug_name,pharmacology_toxicology
0,琥乙红霉素片,本品属大环内酯类抗生素，为红霉素的琥珀酸乙酯，在胃酸中较红霉素稳定。对葡萄球菌属(耐甲氧西林...
1,琥乙红霉素颗粒,本品属大环内酯类抗生素，为红霉素的琥珀酸乙酯，在胃酸中较红霉素稳定。对葡萄球菌属(耐甲氧西林...
2,红霉素肠溶片,本品属大环内酯类抗生素。对葡萄球菌属(耐甲氧西林菌株除外)、各组链球菌和革兰阳性杆菌均具抗菌...
3,布美他尼片,对水和电解质排泄的作用基本同呋塞米，其利尿作用为呋塞米20～60倍。主要抑制肾小管髓袢升支厚...
4,注射用布美他尼,对水和电解质的排泄作用基本同呋塞米，其利尿作用为呋塞米的20～60倍。主要抑制肾小管髓袢升支...
...,...,...
5227,左羟丙哌嗪胶囊,1.左羟丙哌嗪的镇咳作用；研究了左羟丙哌嗪对麻醉豚鼠和家兔的镇咳作用，结果表明左羟丙哌嗪具有...
5228,左旋卡尼汀注射液,左旋卡尼汀是哺乳动物能量代谢中必需的体内天然物质，其主要功能是促进脂类代谢。在缺血、缺氧时，...
5229,左炔诺孕酮聚己内酯棒,全合成的孕激素。具有较强抑制垂体分泌促性腺激素的作用而抑制排卵；它能使宫黏粘液变稠，阻碍精子...
5230,左旋卡尼汀口服溶液,左旋卡尼汀是哺乳动物能量代谢中必需的体内天然物质，其主要功能是促进脂类代谢。在缺血、缺氧时，...


In [27]:
sent_list = []
for index,row in df_pharmacology.iterrows():
    sent = row["pharmacology_toxicology"].split("。")[:5]
    available_sent_list = []
    for s in sent:
        if any([True if x in s else False for x in ["抑制", "阻断", "具", "作用", 
                                                    "有效", "减少", "降低", "增加", "促进"]]) and "毒理" not in s:
            available_sent_list.append(s)
            
    sent_list.append("。".join(available_sent_list))

df_pharmacology["sent"] = sent_list

In [29]:
df_pharmacology.to_csv("c:/Users/zhangke1/Desktop/药理.csv", index=False)

In [13]:
# 药物指南 协会
query_template = """
MATCH ()-[r:guideline_annotation]->(che:chemical) 
where r.guideline_institute is not null
RETURN distinct che.display as chemical_name, r.guideline_institute as guideline_institute, 
r.guideline_link as guideline_link, r.guideline_name as guideline_name
"""
result = n_util.run_cypher(query_template)
df_guideline = pd.DataFrame(result)

In [17]:
dpwg_dict = {}
for chem, content in df_guideline[df_guideline["guideline_institute"] == "DPWG_Guideline"].groupby(["chemical_name"]):
    links = "\n".join(list(content["guideline_link"].values))
    guides = "\n".join(list(content["guideline_name"].values))
    dpwg_dict[chem] = [links, guides]

In [12]:
# 药物标签 协会
query_template = """
MATCH ()-[r:drug_label]->(che:chemical) 
RETURN distinct che.display as chemical_name,
r.organization as organization, r.testing_level as testing_level
"""
result = n_util.run_cypher(query_template)
df_label = pd.DataFrame(result)


In [19]:
set(df_label["organization"].values)

{'EMA', 'FDA', 'HCSC', 'PMDA', 'Swissmedic'}

In [24]:
ema_dict = {}
for chem, content in df_label[df_label["organization"] == "EMA"].groupby(["chemical_name"]):
    testing_levels = "\n".join(list(content["testing_level"].values))
    ema_dict[chem] = testing_levels
    
hcsc_dict = {}
for chem, content in df_label[df_label["organization"] == "HCSC"].groupby(["chemical_name"]):
    testing_levels = "\n".join(list(content["testing_level"].values))
    hcsc_dict[chem] = testing_levels
    
pmda_dict = {}
for chem, content in df_label[df_label["organization"] == "PMDA"].groupby(["chemical_name"]):
    testing_levels = "\n".join(list(content["testing_level"].values))
    pmda_dict[chem] = testing_levels
    
swissmedic_dict = {}
for chem, content in df_label[df_label["organization"] == "Swissmedic"].groupby(["chemical_name"]):
    testing_levels = "\n".join(list(content["testing_level"].values))
    swissmedic_dict[chem] = testing_levels

In [29]:
dpwg_flag_list = []
dpwg_link_list = []
dpwg_name_list = []

ema_flag_list = []
ema_list = []

hcsc_flag_list = []
hcsc_list = []

pmda_flag_list = []
pmda_list = []

swissmedic_flag_list = []
swissmedic_list = []

for chem in df_drug["chemical_name"].values:
    if chem in dpwg_dict.keys():
        dpwg_flag_list.append(True)
        dpwg_link_list.append(dpwg_dict[chem][0])
        dpwg_name_list.append(dpwg_dict[chem][1])
    else:
        dpwg_flag_list.append(False)
        dpwg_link_list.append("")
        dpwg_name_list.append("")
        
    if chem in ema_dict.keys():
        ema_flag_list.append(True)
        ema_list.append(ema_dict[chem])
    else:
        ema_flag_list.append(False)
        ema_list.append("")
        
    if chem in hcsc_dict.keys():
        hcsc_flag_list.append(True)
        hcsc_list.append(hcsc_dict[chem])
    else:
        hcsc_flag_list.append(False)
        hcsc_list.append("")
        
    if chem in pmda_dict.keys():
        pmda_flag_list.append(True)
        pmda_list.append(pmda_dict[chem])
    else:
        pmda_flag_list.append(False)
        pmda_list.append("")
        
    if chem in swissmedic_dict.keys():
        swissmedic_flag_list.append(True)
        swissmedic_list.append(swissmedic_dict[chem])
    else:
        swissmedic_flag_list.append(False)
        swissmedic_list.append("")
        
df_flag = pd.DataFrame({
    "DPWG指南": dpwg_flag_list,
    "DPWG指南链接": dpwg_link_list,
    "DPWG名称": dpwg_name_list,
    "EMA药物标签": ema_flag_list,
    "EMA信息": ema_list,
    "HCSC药物标签": hcsc_flag_list,
    "HCSC信息": hcsc_list,
    "PMDA药物标签": pmda_flag_list,
    "PMDA信息": pmda_list,
    "Swissmedic药物标签": swissmedic_flag_list,
    "Swissmedic信息": swissmedic_list,
})

pd.concat([df_drug, df_flag], axis=1).to_csv("c:/Users/zhangke1/Desktop/drug_flag.csv", index=False)

In [4]:
df_drug

Unnamed: 0,chemical_name,cn_chemical_name
0,captopril,卡托普利
1,paromomycin,巴龙霉素
2,tegafur,替加氟
3,doxepin,多塞平
4,azathioprine,硫唑嘌呤
...,...,...
264,epirubicin,表柔比星
265,valbenazine,瓦莱苯嗪
266,chloroquine,氯喹
267,bupropion,安非他酮


In [33]:
df_en_drug_insurance = pd.read_csv("src/cralwer/西药医保.csv", dtype=str, encoding="utf-8")

In [34]:
df_report = pd.read_csv("drug_report/report_label.csv").fillna("")[["chemical_name", "cn_chemical_name", "医保级别"]]

ins_list = list(zip(list(df_en_drug_insurance["药名"].values), list(df_en_drug_insurance["报销类别"].values)))



In [34]:
ins_flag_list = []
match_d_list = []

for drug_name in df_report["cn_chemical_name"].values:
    dn_list = list(filter(lambda x: drug_name in x[0], ins_list))
    cate = ",".join(list(set(map(lambda x: x[1], dn_list))))
    ins_d = ",".join(list(set(map(lambda x: x[0], dn_list))))
    
    ins_flag_list.append(cate)
    match_d_list.append(ins_d)
    
df_report["医保级别（新）"] = ins_flag_list

In [35]:
df_report.to_csv("医保标签.csv", index=False)

In [8]:
df_cli = pd.read_csv("processed/clinical_annotations.tsv", sep="\t").fillna("")

d_list = []
l_list = []

for index, row in df_cli.iterrows():
    drug = row["Drug(s)"]
    level = row["Level of Evidence"]
    drug_list = [x.strip() for x in re.split(r";|,|/", drug)]
    d_list.extend(drug_list)
    l_list.extend([level] * len(drug_list))

In [22]:
df_cli = pd.DataFrame({
    "drug": d_list,
    "level": l_list
}).groupby(["drug"])["level"].apply(lambda x: ",".join(sorted(set(x)))).reset_index()

df_cli["drug"] = df_cli["drug"].str.strip().str.lower()

df_translation = pd.read_csv("processed/chemical_translation.csv", dtype=str).fillna("")

df_translation["chemical_name"] = df_translation["chemical_name"].str.strip().str.lower()
df_translation["cn_chemical_name"] = df_translation["cn_chemical_name"].str.strip().str.lower()

In [30]:
df_cli = pd.merge(df_cli, df_translation, how='left', left_on='drug', right_on='chemical_name').fillna("")[[
    "drug", "cn_chemical_name", "level"]]

In [35]:
ins_flag_list = []
match_d_list = []

for drug_name in df_cli["cn_chemical_name"].values:
    dn_list = list(filter(lambda x: drug_name in x[0], ins_list))
    cate = ",".join(list(set(map(lambda x: x[1], dn_list))))
    ins_d = ",".join(list(set(map(lambda x: x[0], dn_list))))
    
    ins_flag_list.append(cate)
    match_d_list.append(ins_d)
    
df_cli["医保级别"] = ins_flag_list

In [37]:
df_cli.to_csv("clinical_insurance.csv", index=False)

In [55]:
import re

def clean_drug(drug):
    drug_clean = re.sub(r"[\(\)-\/（）\-\[\]\s、]", "", drug)
    drug_clean = re.sub(r"[片|注射液|颗粒|滴剂|胶囊|散剂|混悬液|乳剂|剂|膏|丸|口服溶液|口服液|咀嚼|泡腾|分散|凝胶]",
                        "", drug_clean)
    return drug_clean

In [81]:
query_template = """
MATCH (d:drug)-[r:treatment]->(dis:disease) 
RETURN d.display as drug, dis.display as disease
"""
result_list = n_util.run_cypher(query_template)
df_relation = pd.DataFrame(result_list)
print("all relation: {}".format(len(df_relation)))

all relation: 88973


In [57]:
query_template = """
MATCH (d:drug)
RETURN d.display as drug, d.indication as indication
"""
result_list = n_util.run_cypher(query_template)
df_drug = pd.DataFrame(result_list)

In [58]:
query_template = """
MATCH (dis:disease)
RETURN dis.display as disease
"""
result_list = n_util.run_cypher(query_template)
df_disease = pd.DataFrame(result_list)
all_disease = list(df_disease["disease"].values)

In [59]:
df_drug = df_drug.fillna("")
df_drug = df_drug[df_drug["indication"] != ""]
print(len(df_drug))

11963


In [60]:
drug_clean = list(map(lambda x: clean_drug(x), df_drug["drug"].values))
drug_clean_indication_dict = dict(zip(drug_clean, df_drug["indication"].values))

In [61]:
df_relation = df_relation.groupby(["drug"])["disease"].apply(lambda x: list(x)).reset_index()
print(len(df_relation))

3793


In [62]:
df_merge = pd.merge(df_drug, df_relation, how="outer", on=["drug"])
df_merge = df_merge.fillna("")

In [68]:
new_disease_list = []
new_relation_list = []
for index, row in df_merge.iterrows():
    drug = row["drug"]
    indication = row["indication"]
    disease_list = [] if row["disease"] == "" else row["disease"]
    if indication == "":
        cd = clean_drug(drug)
        if cd in drug_clean_indication_dict.keys():
            indication = drug_clean_indication_dict[cd]
    
    new_disease = list(filter(lambda x: x in indication and x not in disease_list, all_disease))
    new_disease_list.append(new_disease)
    
    for nd in new_disease:
        new_relation_list.append([drug, nd])

In [64]:
df_merge["new_disease"] = new_disease_list

In [65]:
# df_merge.to_csv("c:/Users/zhangke1/Desktop/new_disease.csv", index=False)

In [77]:
import json
with open("processed/disease_drug_new_list.json", "w", encoding="utf-8") as f:
    json.dump(new_relation_list, f)

In [84]:
# 药物基因组适用药物
result_list = n_util.run_cypher("""match (d:drug)-[]-(m:chemical)<-[]-(v:variant)
return d.display as name
union
match (d:drug)-[]-(m:chemical)<-[]-(dip:diplotype)
return d.display as name
union
match (d:drug)-[]-(m:chemical)<-[]-(ge:gene)
return d.display as name""")
len(set([x["name"] for x in result_list]))

3215

In [85]:
def is_all_chinese(strs):
    for _char in strs:
        if not '\u4e00' <= _char <= '\u9fa5':
            return False
    return True

In [89]:
is_all_chinese("小儿氨酚黄那敏顆粒")

True

In [97]:
query_template = """
match (m:chemical)<-[]-(v:variant)
where m.chemical_name = "{chem_name}"
return m.chemical_name as chemical_name
union
match (m:chemical)<-[]-(dip:diplotype)
where m.chemical_name = "{chem_name}"
return m.chemical_name as chemical_name
union
match (m:chemical)<-[]-(ge:gene)
where m.chemical_name = "{chem_name}"
return m.chemical_name as chemical_name
"""

df_dm = pd.read_csv("c:/Users/zhangke1/Desktop/drug_mapping.tsv", sep='\t')
drug_list = list(df_dm["Phenprocoumon"].str.lower().str.strip().values)

In [99]:
pgx_list= []
for drug in drug_list:
    result_list = n_util.run_cypher(query_template.format(chem_name=drug))
    if len(result_list) > 0:
        pgx_list.append(True)
    else:
        pgx_list.append(False)

In [106]:
# 和bge匹配不上的药
df_pgx = pd.DataFrame({
    "drug": drug_list,
    "pgx": pgx_list
})

df_pgx[~df_pgx["pgx"]]

Unnamed: 0,drug,pgx
15,nitrofurazone,False
16,furazolidone,False
27,aminobenzoic acid,False
39,rifampicin,False
41,glyceryl trinitrate,False
48,oral contraceptives,False
54,sodium phenylacetate and sodium benzoate,False


In [112]:
df_rare = pd.read_csv("data/罕见病审核.csv").fillna("")

all_disease_list = []
rare_relation_list = []

for index, row in df_rare.iterrows():
    disease_list_1 = row["适应症"].split(",")
    disease_list_2 = [x.split("/")[0] for x in row["BGE亚型/Omim"].split(",")]
    disease_list = list(filter(lambda x: x!= "", set(disease_list_1 + disease_list_2)))
    drug = row["药名"]
    all_disease_list.extend(disease_list)
    for dis in disease_list:
        rare_relation_list.append([drug, dis])

In [115]:
rare_relation_list

[['注射用拉罗尼酶浓溶液', '黏多糖贮积症Ⅰ型'],
 ['注射用拉罗尼酶浓溶液', '黏多糖贮积症1S型'],
 ['注射用拉罗尼酶浓溶液', '黏多糖贮积症1H型'],
 ['艾度硫酸酯酶β注射液', '黏多糖贮积症Ⅱ型'],
 ['艾度硫酸酯酶β注射液', '亨特综合征'],
 ['艾度硫酸酯酶β注射液', '黏多糖贮积症2型'],
 ['注射用伊米苷酶', '戈谢病3型'],
 ['注射用伊米苷酶', '戈谢病1型'],
 ['注射用伊米苷酶', '戈谢病Ⅲ型'],
 ['注射用伊米苷酶', '戈谢病Ⅰ型'],
 ['注射用伊米苷酶', '戈谢病3C型'],
 ['注射用阿糖苷酶α', '酸性α葡萄糖苷酶缺乏症'],
 ['注射用阿糖苷酶α', '庞贝病'],
 ['注射用阿糖苷酶α', '糖原累积症II型'],
 ['阿加糖酶α注射用浓溶液', 'α-半乳糖苷酶 A 缺乏症'],
 ['阿加糖酶α注射用浓溶液', 'Fabry病'],
 ['阿加糖酶α注射用浓溶液', '法布雷病'],
 ['注射用阿加糖酶β', 'α-半乳糖苷酶 A 缺乏症'],
 ['注射用阿加糖酶β', 'Fabry病'],
 ['注射用阿加糖酶β', '法布雷病'],
 ['诺西那生钠注射液', '脊肌萎缩症1型'],
 ['诺西那生钠注射液', '脊肌萎缩症2型'],
 ['诺西那生钠注射液', '脊髓性肌萎缩症'],
 ['诺西那生钠注射液', '脊肌萎缩症4型'],
 ['诺西那生钠注射液', '脊肌萎缩症3型'],
 ['盐酸沙丙蝶呤片', '高苯丙氨酸血症'],
 ['盐酸沙丙蝶呤片', 'BH4缺乏性高苯丙氨酸血症C型'],
 ['盐酸沙丙蝶呤片', 'BH4缺乏性高苯丙氨酸血症A型'],
 ['盐酸沙丙蝶呤片', 'BH4缺乏性高苯丙氨酸血症B型'],
 ['盐酸沙丙蝶呤片', 'BH4缺乏性高苯丙氨酸血症D型'],
 ['硫培非格司亭注射液', '中性粒细胞减少症'],
 ['依库珠单抗注射液', '阵发性睡眠性血红蛋白尿症'],
 ['依库珠单抗注射液', '非典型溶血尿毒症综合征'],
 ['依库珠单抗注射液', '补体因子I缺乏症'],
 ['依库珠单抗注射液', '阵发性夜间血红蛋白尿2型'],
 ['依库珠单抗注射液', '血性尿毒症综合征'],
 [

In [117]:
import json
node_list = []

for disease in all_disease_list:
    chemical_node = {
        "label": ["disease"],
        "node_ID": "disease_name",
        "property": {
            "disease_name": disease,
            "display": disease
        }
    }
    node_list.append(chemical_node)

edge_list = []
for rel in rare_relation_list:
    disease_drug_edge = {
        "start_node": {
            "label": ["drug"],
            "node_ID": "drug_name",
            "property": {
                "drug_name": rel[0]
            }
        },
        "end_node": {
            "label": ["disease"],
            "node_ID": "disease_name",
            "property": {
                "disease_name": rel[1]
            }
        },
        "edge": {
            "label": "treatment",
            "property": {}
        }
    }
    edge_list.append(disease_drug_edge)
    
with open("json/rare_disease_nodes.json", "w") as f:
    json.dump(node_list, f)
    
    
with open("json/rare_disease_edges.json", "w") as f:
    json.dump(edge_list, f)


In [122]:
from src.util.neo4j_util import neo4jUtil as raw_n_util
rn_util = raw_n_util(host="neo4j://172.16.231.80:7687", user="neo4j", password="123456")

In [129]:
cypher_list = []
for n in node_list:
    cypher = rn_util.gen_add_node_cypher(n)
    cypher_list.append(cypher)

In [130]:
for e in edge_list:
    cypher = rn_util.gen_add_edge_cypher(e)
    cypher_list.append(cypher)

In [132]:
print("\n\n\n".join(cypher_list))

MERGE (n:disease {disease_name:"黏多糖贮积症Ⅰ型"})
SET n += {disease_name:"黏多糖贮积症Ⅰ型", display:"黏多糖贮积症Ⅰ型"}



MERGE (n:disease {disease_name:"黏多糖贮积症1S型"})
SET n += {disease_name:"黏多糖贮积症1S型", display:"黏多糖贮积症1S型"}



MERGE (n:disease {disease_name:"黏多糖贮积症1H型"})
SET n += {disease_name:"黏多糖贮积症1H型", display:"黏多糖贮积症1H型"}



MERGE (n:disease {disease_name:"黏多糖贮积症Ⅱ型"})
SET n += {disease_name:"黏多糖贮积症Ⅱ型", display:"黏多糖贮积症Ⅱ型"}



MERGE (n:disease {disease_name:"亨特综合征"})
SET n += {disease_name:"亨特综合征", display:"亨特综合征"}



MERGE (n:disease {disease_name:"黏多糖贮积症2型"})
SET n += {disease_name:"黏多糖贮积症2型", display:"黏多糖贮积症2型"}



MERGE (n:disease {disease_name:"戈谢病3型"})
SET n += {disease_name:"戈谢病3型", display:"戈谢病3型"}



MERGE (n:disease {disease_name:"戈谢病1型"})
SET n += {disease_name:"戈谢病1型", display:"戈谢病1型"}



MERGE (n:disease {disease_name:"戈谢病Ⅲ型"})
SET n += {disease_name:"戈谢病Ⅲ型", display:"戈谢病Ⅲ型"}



MERGE (n:disease {disease_name:"戈谢病Ⅰ型"})
SET n += {disease_name:"戈谢病Ⅰ型", display:"戈谢病Ⅰ型"}



MERGE (n:disease {d

In [133]:
# for c in cypher_list:
#     n_util.run_cypher(c)

In [29]:
with open("processed/drug_dict.json", "r") as f:
    d_dict = json.load(f)

In [30]:
column_list = {
    "成份": "component",
    "性状": "character",
    "适应症": "indication",
    "用法用量": "dosage",
    "不良反应": "adverse_reaction",
    "禁忌": "avoid",
    "注意事项": "attention_notice",
    "孕妇及哺乳期妇女用药": "pregnant_attention",
    "儿童用药": "children_attention",
    "老年用药": "older_attention",
    "贮藏": "storage",
    "规格": "specification",
    "药物相互作用": "drug_interaction",
    "药理毒理": "pharmacology_toxicology",
    "药代动力学": "pharmacokinetics",
    "药物过量": "drug_overdose",
    "有效期": "expiry_date",
    "包装": "package",
    "执行标准": "standard",
    "是否医保": "in_medical_insurance",
    "甲乙": "insurance_level"
}

dc_dict = dict()
dc_dict["drug_name"] = []
for col in column_list.values():
    dc_dict[col] = []

for key, value in d_dict.items():
    dc_dict["drug_name"].append(key)
    for col, cn in column_list.items():
        if col in value.keys():
            dc_dict[cn].append(value[col])
        else:
            dc_dict[cn].append("")

In [31]:
pd.DataFrame(dc_dict).to_csv("processed/all_drug_description.csv", index=False)

In [26]:
df_no_desc_drug = pd.DataFrame(n_util.run_cypher("""
MATCH (n:drug) 
where n.indication = ""
RETURN n.drug_name as drug_name
"""))

In [28]:
df_no_desc_drug.to_csv("d:/ticket/processed/no_desc_drug.csv", index=False)