In [1]:
from py2neo import Graph 
import pandas as pd
from collections import defaultdict
import json

In [21]:
class neo4jUtil:
    """Thin wrapper around a py2neo ``Graph`` connection.

    Connection settings are read from the ``NEO4J_URI``, ``NEO4J_USER`` and
    ``NEO4J_PASSWORD`` environment variables. When a variable is not set, the
    value that was previously hard-coded here is used, so existing setups keep
    working unchanged.
    """

    def __init__(self):
        # Function-scope import so this cell does not depend on the import cell.
        import os

        uri = os.environ.get("NEO4J_URI", "neo4j://172.16.227.27:7687")
        user = os.environ.get("NEO4J_USER", "neo4j")
        # NOTE(review): the fallback still keeps a password in the notebook;
        # drop it once NEO4J_PASSWORD is set everywhere this notebook runs.
        password = os.environ.get("NEO4J_PASSWORD", "123456")
        self.driver = Graph(uri, auth=(user, password))

    def run_cypher(self, cypher, parameters=None):
        """Run a Cypher statement and return every record as a dict.

        Args:
            cypher: the Cypher query text.
            parameters: optional mapping of query parameters (referenced as
                ``$name`` in ``cypher``). Prefer this over formatting values
                into the query string.

        Returns:
            The list produced by calling ``.data()`` on the driver's result.
        """
        result_list = self.driver.run(cypher, parameters).data()
        return result_list
    
# Module-level helper instance; the query functions in later cells read this
# global to talk to the graph database.
n_util = neo4jUtil()

In [73]:
# Load the FDA warning table, keep only the matched-chemical and link columns,
# and discard rows that have no matched chemical.
df_warning = pd.read_csv("processed/fda_warning_with_chemical.csv")
df_warning = df_warning[["match_chemical_list", "link"]]
df_warning = df_warning[df_warning["match_chemical_list"].notnull()]

# Map every matched-chemical value to all warning links recorded for it.
# NOTE(review): keys are used exactly as stored in the CSV (no strip/lower),
# while lookups use the lower-cased chemical names - confirm they agree.
warning_dict = defaultdict(list)
for chem, link in zip(df_warning["match_chemical_list"], df_warning["link"]):
    warning_dict[chem].append(link)


In [6]:
# Unique drug names from the report, kept in first-seen order.
# drop_duplicates() replaces list(set(...)), whose iteration order depends on
# string hashing and could therefore change the row order between runs.
df_drug = pd.DataFrame({
    "drug": pd.read_csv("drug_report/report_chemicals.csv")["Drug"].drop_duplicates().tolist()
})

In [15]:
# Chemical name translations (English -> Chinese). Both name columns are
# trimmed and lower-cased so they can be used as join keys.
df_translation = pd.read_csv("processed/chemical_translation.csv", dtype=str).fillna("")
for _name_col in ("chemical_name", "cn_chemical_name"):
    df_translation[_name_col] = df_translation[_name_col].str.strip().str.lower()

# Normalise the left-hand key the same way as the right-hand one; otherwise a
# drug that differs only in case or surrounding whitespace silently fails to
# match its translation.
_drug_key = df_drug["drug"].str.strip().str.lower()
df_drug = pd.merge(df_drug.assign(drug=_drug_key), df_translation, how="left",
                   left_on=["drug"], right_on=["chemical_name"])

# A drug without a translation row used to end up with an empty chemical_name,
# so every later lookup for it ran against "". Fall back to the drug's own
# (normalised) name instead; the Chinese name stays empty.
df_drug["chemical_name"] = df_drug["chemical_name"].fillna(df_drug["drug"])

# NOTE: df_drug is overwritten here and loses its "drug" column, so re-running
# this cell requires re-running the cell that builds df_drug first.
df_drug = df_drug[["chemical_name", "cn_chemical_name"]].fillna("")



In [45]:
def get_pc_label(chemical_name):
    """Return phenotype-category flags for a chemical.

    Collects the distinct ``phenotype_category`` values found on the
    ``clinical_annotation`` and ``research_annotation`` relationships that
    point at the matching ``chemical`` node.

    Args:
        chemical_name: value matched against ``chemical.chemical_name``.

    Returns:
        ``[toxicity_flag, efficacy_flag, dosage_flag, metabolism_flag]`` -
        four booleans, each True when the keyword occurs (case-insensitively)
        in at least one category value.
    """
    # The name is bound as a query parameter instead of being formatted into
    # the Cypher text, so quotes or backslashes in it cannot break or alter
    # the query.
    query = """
    match (m:chemical {chemical_name: $chem_name})<-[r:clinical_annotation]-()
    return distinct r.phenotype_category as phenotype_category
    union
    match (m:chemical {chemical_name: $chem_name})<-[r:research_annotation]-()
    return distinct r.phenotype_category as phenotype_category
    """
    result = n_util.driver.run(query, {"chem_name": chemical_name}).data()

    # A relationship without the property yields None; treat it as "no category"
    # rather than failing on None.lower().
    categories = [(res["phenotype_category"] or "").lower() for res in result]

    # label_code order: pc_toxicity, pc_efficacy, pc_dosage, pc_metabolism
    keywords = ("toxicity", "efficacy", "dosage", "metabolism")
    return [any(keyword in category for category in categories) for keyword in keywords]

In [50]:
def get_cn_drug_label(chemical_name):
    """Return the Chinese drug-label flag and the genes behind it.

    Looks for ``cn_drug_label`` relationships from ``gene`` nodes to the
    matching ``chemical`` node.

    Args:
        chemical_name: value matched against ``chemical.chemical_name``.

    Returns:
        ``[cn_drug_label_flag, genes]`` - the flag is True when at least one
        relationship was found; ``genes`` is a comma-separated string of the
        gene names (empty string when there are none).
    """
    # The name is bound as a query parameter so that special characters in it
    # cannot break or alter the query.
    query = """
    match (m:chemical {chemical_name: $chem_name})<-[r:cn_drug_label]-(ge:gene)
    return m.chemical_name as chemical_name, ge.gene_name as gene_name, r.remark as remark
    """
    result = n_util.driver.run(query, {"chem_name": chemical_name}).data()

    # One row comes back per relationship, so a gene can repeat. dict.fromkeys
    # removes repeats while keeping first-seen order; missing gene names are
    # skipped so the join cannot fail on None.
    gene_names = dict.fromkeys(row["gene_name"] for row in result if row["gene_name"])
    return [len(result) > 0, ",".join(gene_names)]

In [78]:
def fda_warning_insurance_label(chemical_name):
    """Return FDA-warning and medical-insurance labels for a chemical.

    The FDA part is looked up in the module-level ``warning_dict``; the
    insurance part comes from the ``drug`` nodes reachable from the chemical
    through ``chemical_drug_relation``.

    Args:
        chemical_name: key into ``warning_dict`` and value matched against
            ``chemical.chemical_name``.

    Returns:
        ``[fda_warning_flag, warning_link, insurance_flag, insurance_level]``:
        the warning links joined with ``" | "``, and the distinct non-empty
        insurance levels joined with ``","`` in sorted order. The level string
        is only filled when some related drug has ``in_medical_insurance``
        equal to "是".
    """
    # FDA warning
    fda_warning_flag = chemical_name in warning_dict
    warning_link = ""
    if fda_warning_flag:
        # Non-string entries (e.g. NaN from an empty CSV cell) cannot be joined.
        warning_link = " | ".join(
            link for link in warning_dict[chemical_name] if isinstance(link, str)
        )

    # Medical insurance. The name is bound as a query parameter so that special
    # characters in it cannot break or alter the query.
    query = """
    match (m:chemical {chemical_name: $chem_name})-[r:chemical_drug_relation]->(n:drug)
    return distinct n.in_medical_insurance as in_medical_insurance, n.insurance_level as insurance_level
    """
    rows = n_util.driver.run(query, {"chem_name": chemical_name}).data()

    insurance_flag = any(row["in_medical_insurance"] == "是" for row in rows)
    insurance_level = ""
    if insurance_flag:
        # Missing levels are skipped; sorting makes the text stable across runs
        # (iterating a set gives no guaranteed order).
        levels = {row["insurance_level"] for row in rows
                  if row["insurance_level"] not in (None, "")}
        insurance_level = ",".join(sorted(levels))

    return [fda_warning_flag, warning_link, insurance_flag, insurance_level]

In [48]:
def get_atc_info(chemical_name):
    """Return the ATC code and the level 1-4 descriptions of a chemical.

    Args:
        chemical_name: value matched against ``chemical.chemical_name``.

    Returns:
        A 9-item list: ``atc_code`` followed by ``L1_info``, ``L1_info_chn``,
        ... ``L4_info``, ``L4_info_chn``, taken from the first matching node.
        Nine empty strings when no node matches. Properties missing on the
        node are returned as delivered by the driver.
    """
    fields = ("atc_code",
              "L1_info", "L1_info_chn",
              "L2_info", "L2_info_chn",
              "L3_info", "L3_info_chn",
              "L4_info", "L4_info_chn")

    # The name is bound as a query parameter so that special characters in it
    # cannot break or alter the query.
    query = """
    match (m:chemical {chemical_name: $chem_name})
    return m.atc_code as atc_code, m.L1_info as L1_info, m.L1_info_chn as L1_info_chn,
    m.L2_info as L2_info, m.L2_info_chn as L2_info_chn,
    m.L3_info as L3_info, m.L3_info_chn as L3_info_chn,
    m.L4_info as L4_info, m.L4_info_chn as L4_info_chn
    """
    result = n_util.driver.run(query, {"chem_name": chemical_name}).data()

    if not result:
        return [""] * len(fields)

    # Only the first matching node is used.
    first = result[0]
    return [first[field] for field in fields]


In [79]:
# One list per label column; each gets one entry per row of df_drug, in row order.
toxicity_flag_list, efficacy_flag_list, dosage_flag_list, metabolism_flag_list = [], [], [], []
cn_drug_label_list, drug_label_gene_list = [], []
fda_warning_flag_list, fda_warning_link_list = [], []
insurance_flag_list, insurance_level_list = [], []
atc_code_list = []
L1_info_list, L1_info_chn_list = [], []
L2_info_list, L2_info_chn_list = [], []
L3_info_list, L3_info_chn_list = [], []
L4_info_list, L4_info_chn_list = [], []

# Destination lists, in the same order as the per-row values assembled below.
label_targets = (
    toxicity_flag_list, efficacy_flag_list, dosage_flag_list, metabolism_flag_list,
    cn_drug_label_list, drug_label_gene_list,
    fda_warning_flag_list, fda_warning_link_list, insurance_flag_list, insurance_level_list,
    atc_code_list,
    L1_info_list, L1_info_chn_list, L2_info_list, L2_info_chn_list,
    L3_info_list, L3_info_chn_list, L4_info_list, L4_info_chn_list,
)

for chemical_name in df_drug["chemical_name"]:
    # Four lookups per chemical; nothing is appended until all of them succeeded.
    toxicity_flag, efficacy_flag, dosage_flag, metabolism_flag = get_pc_label(chemical_name)
    cn_drug_label, genes = get_cn_drug_label(chemical_name)
    fda_warning_flag, fda_warning_link, insurance_flag, insurance_level = \
        fda_warning_insurance_label(chemical_name)
    (atc_code, L1_info, L1_info_chn, L2_info, L2_info_chn,
     L3_info, L3_info_chn, L4_info, L4_info_chn) = get_atc_info(chemical_name)

    label_values = (
        toxicity_flag, efficacy_flag, dosage_flag, metabolism_flag,
        cn_drug_label, genes,
        fda_warning_flag, fda_warning_link, insurance_flag, insurance_level,
        atc_code,
        L1_info, L1_info_chn, L2_info, L2_info_chn,
        L3_info, L3_info_chn, L4_info, L4_info_chn,
    )
    for target_list, value in zip(label_targets, label_values):
        target_list.append(value)

In [80]:
# Assemble the per-drug label columns into one frame; missing values become "".
# The four phenotype-category flag lists were collected for every drug but
# never written out, so they are appended here under the label codes named in
# get_pc_label (pc_toxicity, pc_efficacy, pc_dosage, pc_metabolism). They go
# last so the existing columns keep their positions.
# NOTE(review): confirm these four columns are wanted in the report; if not,
# the get_pc_label lookups are unused work and can be removed instead.
df_label = pd.DataFrame({
    "中国药物标签": cn_drug_label_list,
    "中国药物基因影响": drug_label_gene_list,
    "FDA预警标签": fda_warning_flag_list,
    "FDA预警链接": fda_warning_link_list,
    "医保标签": insurance_flag_list,
    "医保级别": insurance_level_list,
    "ATC代码": atc_code_list,
    "ATC level 1": L1_info_list,
    "ATC level 1 中文": L1_info_chn_list,
    "ATC level 2": L2_info_list,
    "ATC level 2 中文": L2_info_chn_list,
    "ATC level 3": L3_info_list,
    "ATC level 3 中文": L3_info_chn_list,
    "ATC level 4": L4_info_list,
    "ATC level 4 中文": L4_info_chn_list,
    "pc_toxicity": toxicity_flag_list,
    "pc_efficacy": efficacy_flag_list,
    "pc_dosage": dosage_flag_list,
    "pc_metabolism": metabolism_flag_list,
}).fillna("")

In [81]:
# Put the drug names and their labels side by side (joined on the row index)
# and write the result without the index column.
report_label = pd.concat([df_drug, df_label], axis=1)
report_label.to_csv("drug_report/report_label.csv", index=False)

In [5]:
# Consistency check on the drug/chemical mapping: print every row where the
# Chinese chemical name is not contained in the Chinese drug name AND the
# English chemical name is not contained in the English drug name.
df_drug_chemical = pd.read_csv("processed/drug_chemical.csv", encoding="utf-8", dtype=str).fillna("")
name_columns = zip(df_drug_chemical["cn_chemical_name"], df_drug_chemical["chn_name"],
                   df_drug_chemical["chemical"], df_drug_chemical["eng_name"])

for cn_raw, chn_raw, chemical_raw, eng_raw in name_columns:
    cn_name, drug_name = cn_raw.strip(), chn_raw.strip()
    en_name, en_drug_name = chemical_raw.lower(), eng_raw.lower()

    # An empty name is a substring of everything, so rows with a blank
    # chemical name are never reported.
    if not (cn_name in drug_name or en_name in en_drug_name):
        print("{} -- {}".format(cn_name, drug_name))

In [15]:
import json

# Read the whole records file into memory as one UTF-8 string.
# NOTE(review): absolute, user-specific path - this cell only runs on the
# original author's machine.
records_path = "c:/Users/zhangke1/Downloads/records.json"
with open(records_path, mode="r", encoding="utf-8") as records_file:
    content = records_file.read()

In [18]:
# Load the exported pharmacology/toxicology texts, one row per drug product.
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
df_pharmacology = pd.read_csv("c:/Users/zhangke1/Downloads/export.csv")

In [19]:
# Bare last expression: renders the loaded frame as this cell's output.
df_pharmacology

Unnamed: 0,drug_name,pharmacology_toxicology
0,琥乙红霉素片,本品属大环内酯类抗生素，为红霉素的琥珀酸乙酯，在胃酸中较红霉素稳定。对葡萄球菌属(耐甲氧西林...
1,琥乙红霉素颗粒,本品属大环内酯类抗生素，为红霉素的琥珀酸乙酯，在胃酸中较红霉素稳定。对葡萄球菌属(耐甲氧西林...
2,红霉素肠溶片,本品属大环内酯类抗生素。对葡萄球菌属(耐甲氧西林菌株除外)、各组链球菌和革兰阳性杆菌均具抗菌...
3,布美他尼片,对水和电解质排泄的作用基本同呋塞米，其利尿作用为呋塞米20～60倍。主要抑制肾小管髓袢升支厚...
4,注射用布美他尼,对水和电解质的排泄作用基本同呋塞米，其利尿作用为呋塞米的20～60倍。主要抑制肾小管髓袢升支...
...,...,...
5227,左羟丙哌嗪胶囊,1.左羟丙哌嗪的镇咳作用；研究了左羟丙哌嗪对麻醉豚鼠和家兔的镇咳作用，结果表明左羟丙哌嗪具有...
5228,左旋卡尼汀注射液,左旋卡尼汀是哺乳动物能量代谢中必需的体内天然物质，其主要功能是促进脂类代谢。在缺血、缺氧时，...
5229,左炔诺孕酮聚己内酯棒,全合成的孕激素。具有较强抑制垂体分泌促性腺激素的作用而抑制排卵；它能使宫黏粘液变稠，阻碍精子...
5230,左旋卡尼汀口服溶液,左旋卡尼汀是哺乳动物能量代谢中必需的体内天然物质，其主要功能是促进脂类代谢。在缺血、缺氧时，...


In [27]:
# A sentence is kept when it contains at least one of these keywords.
PHARMACOLOGY_KEYWORDS = ("抑制", "阻断", "具", "作用",
                         "有效", "减少", "降低", "增加", "促进")


def _select_pharmacology_sentences(text, max_sentences=5):
    """Pick the keyword-bearing sentences from the start of a description.

    Args:
        text: the pharmacology/toxicology text. Anything that is not a string
            (for example NaN for an empty CSV cell) yields "".
        max_sentences: how many leading pieces, split on "。", are inspected.

    Returns:
        The inspected pieces that contain one of ``PHARMACOLOGY_KEYWORDS`` and
        do not contain "毒理", joined back together with "。".
    """
    if not isinstance(text, str):
        return ""
    kept = [
        sentence
        for sentence in text.split("。")[:max_sentences]
        if "毒理" not in sentence
        and any(keyword in sentence for keyword in PHARMACOLOGY_KEYWORDS)
    ]
    return "。".join(kept)


sent_list = [_select_pharmacology_sentences(text)
             for text in df_pharmacology["pharmacology_toxicology"]]

# Adds the extracted text as a new "sent" column on the loaded frame (in place).
df_pharmacology["sent"] = sent_list

In [29]:
# Save the frame, including the extracted "sent" column, without the index.
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
df_pharmacology.to_csv("c:/Users/zhangke1/Desktop/药理.csv", index=False)