In [96]:
from py2neo import Graph 
import pandas as pd
import re
import os
os.getcwd()

'D:\\pgkb_graph'

In [3]:
# Open the Neo4j connection used by every query cell below.
# The URI, user and password can be supplied through the NEO4J_URI, NEO4J_USER and
# NEO4J_PASSWORD environment variables so credentials do not have to live in the
# notebook. The literals are the original values, kept only as fallbacks so the
# notebook keeps working unchanged.
# NOTE(review): drop the fallback password once the environment variable is in place.
session = Graph(
    os.environ.get("NEO4J_URI", "neo4j://172.16.229.46:7687"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "123456"),
    ),
)

In [78]:
# Cypher query: for every haplotype with a `variant_guideline` relationship to a
# chemical, return the haplotype's change codes, the guideline fields stored on the
# relationship, the chemical's ATC fields, and the name of each gene node the
# haplotype has an outgoing relationship to.
# NOTE(review): the gene hop `(hap)-[]->(ge:gene)` accepts any relationship type,
# whereas the diplotype queries in this notebook restrict it to `:mutation_at` —
# confirm this is intended.
haplotype_guideline_cypher = """
MATCH (hap:haplotype)-[r:variant_guideline]->(che:chemical) 
WITH hap, r, che
MATCH (hap)-[]->(ge:gene)
RETURN ge.gene_name as gene_name, hap.variant_name as haplotype_name, hap.is_reference as is_reference,
hap.NC_change_code as NC_code, hap.NG_change_code as NG_code, hap.nucleotide_change_code as nucleotide_code, 
hap.protein_change_code as protein_code, hap.mapped_rsID as mapped_rsID, 
r.phenotype as phenotype, r.implication as implication, r.recommendation as recommendation,
r.organization as organization, r.link as link, 
che.chemical_name as chemical_name, che.atc_code as atc_code, che.L1_info as L1_info,
che.L2_info as L2_info, che.L3_info as L3_info, che.L4_info as L4_info
"""

# Run the query and load the returned records into a DataFrame.
# Missing values are left as-is here (no fillna), unlike the drug-label frames.
result_list = session.run(haplotype_guideline_cypher).data()
df_haplotype_guideline = pd.DataFrame(result_list)

In [75]:
# Cypher query: for every diplotype with a `variant_guideline` relationship to a
# chemical, walk `diplotype_consist_of` to its haplotypes and `mutation_at` to their
# genes, returning distinct rows of gene, diplotype, haplotype change codes,
# guideline fields from the relationship, and the chemical's ATC fields.
diplotype_guideline_cypher = """
MATCH (dip:diplotype)-[r:variant_guideline]->(che:chemical) 
WITH dip, r, che
MATCH (dip)-[:diplotype_consist_of]->(hap:haplotype)-[:mutation_at]->(ge:gene)
RETURN distinct ge.gene_name as gene_name,
dip.diplotype_name as diplotype_name, r.phenotype as phenotype, 
hap.variant_name as haplotype_name, hap.is_reference as is_reference, hap.NC_change_code as NC_code, hap.NG_change_code as NG_code, 
hap.nucleotide_change_code as nucleotide_code, hap.protein_change_code as protein_code, hap.mapped_rsID as mapped_rsID,
r.implication as implication, r.recommendation as recommendation, r.organization as organization, r.link as link, 
che.chemical_name as chemical_name, che.atc_code as atc_code, che.L1_info as L1_info, 
che.L2_info as L2_info, che.L3_info as L3_info, che.L4_info as L4_info
"""

# Run the query and load the returned records into a DataFrame
# (one row per diplotype/haplotype/gene/chemical combination; no fillna applied).
result_list = session.run(diplotype_guideline_cypher).data()
df_diplotype_guideline = pd.DataFrame(result_list)

In [17]:
# Cypher query: same traversal as the diplotype guideline query, but starting from
# the `diplotype_metabolizer` relationship; additionally returns the relationship's
# `phenotype_category` under the column name `metabolizer`.
diplotype_metabolizer_cypher = """
MATCH (dip:diplotype)-[r:diplotype_metabolizer]->(che:chemical) 
WITH dip, r, che
MATCH (dip)-[:diplotype_consist_of]->(hap:haplotype)-[:mutation_at]->(ge:gene)
RETURN distinct ge.gene_name as gene_name,
dip.diplotype_name as diplotype_name, r.phenotype as phenotype, r.phenotype_category as metabolizer,
hap.variant_name as haplotype_name, hap.is_reference as is_reference, hap.NC_change_code as NC_code, 
hap.NG_change_code as NG_code, hap.nucleotide_change_code as nucleotide_code, 
hap.protein_change_code as protein_code, hap.mapped_rsID as mapped_rsID,
r.implication as implication, r.recommendation as recommendation, 
r.organization as organization, r.link as link, 
che.chemical_name as chemical_name, che.atc_code as atc_code, che.L1_info as L1_info, 
che.L2_info as L2_info, che.L3_info as L3_info, che.L4_info as L4_info
"""
# Run the query and load the returned records into a DataFrame (no fillna applied).
result_list = session.run(diplotype_metabolizer_cypher).data()
df_diplotype_metabolizer = pd.DataFrame(result_list)

In [101]:
def get_position(nc_code):
    """Build ``chrN:position:change`` strings from an NC change-code string.

    The parser assumes text shaped like
    ``"<accession> (chromosome 10); ['g.94942290C>T', 'g.94949283del']"``:
    a ``chromosome <digits>`` mention, a ``;``, then a bracketed,
    comma-separated list of ``g.`` changes. Only the segment between the first
    and second ``;`` is read.

    Args:
        nc_code: NC change-code text. Non-string values (``None``, ``NaN``)
            are treated as missing.

    Returns:
        The formatted entries joined with ``", "``. Changes containing ``del``
        are emitted as ``chrN:<change without "g.">``; all others as
        ``chrN:<digits>:<non-digit part>``. Returns ``""`` when the value is
        missing, no ``chromosome <digits>`` mention is found, there is no
        ``;``-separated change list, or the change list is empty.
    """
    # Missing values reach this function because not every query result is
    # passed through fillna(""); re.findall would raise TypeError on them.
    if not isinstance(nc_code, str):
        return ""

    # NOTE(review): only numeric chromosomes are recognised, so a
    # "chromosome X"/"chromosome Y" mention yields "" — confirm that is intended.
    chromosome_hits = re.findall(r"chromosome [\d]+", nc_code)
    if not chromosome_hits:
        return ""
    chr_name = chromosome_hits[0].replace("chromosome ", "chr")

    segments = nc_code.split(";")
    if len(segments) < 2:
        # A chromosome was named but no change list follows it.
        return ""

    # Strip the list punctuation so only comma-separated "g.<change>" items remain.
    change_list = segments[1].strip().replace("[", "").replace("]", "").replace("'", "")

    position_list = []
    for raw_change in change_list.split(","):
        change = raw_change.strip().replace("g.", "")
        if not change:
            # Empty list ("[]") or a stray comma: nothing to report for this item.
            continue
        if "del" not in raw_change:
            # Split "<digits><change>" into "<digits>:<change>" by pairing each
            # non-digit run with the text that remains once it is removed.
            change = "".join(
                "{}:{}".format(change.replace(non_digit_run, ""), non_digit_run)
                for non_digit_run in re.findall(r"[\D]+", change)
            )
        position_list.append("{}:{}".format(chr_name, change))
    return ", ".join(position_list)
    
# Derive a "position" column for each guideline frame by running get_position
# over that frame's NC_code values (row order is preserved).
df_diplotype_metabolizer = df_diplotype_metabolizer.assign(
    position=list(map(get_position, df_diplotype_metabolizer["NC_code"].values)))

df_diplotype_guideline = df_diplotype_guideline.assign(
    position=list(map(get_position, df_diplotype_guideline["NC_code"].values)))

df_haplotype_guideline = df_haplotype_guideline.assign(
    position=list(map(get_position, df_haplotype_guideline["NC_code"].values)))

# Export the three row-level frames (without the index) under query_file/.
df_diplotype_guideline.to_csv(path_or_buf="query_file/guideline_diplotype.csv", index=False)
df_haplotype_guideline.to_csv(path_or_buf="query_file/guideline_haplotype.csv", index=False)
df_diplotype_metabolizer.to_csv(path_or_buf="query_file/guideline_metabolizer.csv", index=False)

In [103]:
# Collapse the metabolizer rows to one record per
# (gene, metabolizer, organization, chemical) and export the result.
#
# Distinct values are collected with dict.fromkeys(...) rather than set(...):
# it keeps the same unique elements but in first-seen row order, so the exported
# lists and the single value picked for scalar fields are the same on every run
# (set iteration order of strings changes between interpreter runs).
# NOTE(review): groupby drops rows whose key columns are missing by default, and
# this frame is not passed through fillna — confirm that is intended.
group_list = []
for key, content in df_diplotype_metabolizer.groupby(["gene_name", "metabolizer", "organization", "chemical_name"]):
    content_dict = {
        "gene_name": key[0],
        "metabolizer": key[1],
        "organization": key[2],
        "chemical_name": key[3],
        # Multi-valued fields: unique values in first-seen order.
        "diplotype": list(dict.fromkeys(content["diplotype_name"].values)),
        "haplotype": list(dict.fromkeys(content["haplotype_name"].values)),
        "nc_code": list(dict.fromkeys(content["NC_code"].values)),
        "ng_code": list(dict.fromkeys(content["NG_code"].values)),
        "nucleotide_code": list(dict.fromkeys(content["nucleotide_code"].values)),
        "protein_code": list(dict.fromkeys(content["protein_code"].values)),
        # Single-valued fields: take the value from the group's first row.
        "recommendation": content["recommendation"].iloc[0],
        "implication": content["implication"].iloc[0],
        "phenotype": content["phenotype"].iloc[0],
        "link": content["link"].iloc[0],
        "atc_code": content["atc_code"].iloc[0],
        "L1_info": content["L1_info"].iloc[0],
        "L2_info": content["L2_info"].iloc[0],
        "L3_info": content["L3_info"].iloc[0],
        "L4_info": content["L4_info"].iloc[0],
        # Comma-separated cells are merged, re-split and de-duplicated item by item.
        "position": list(dict.fromkeys(
            item.strip() for item in ",".join(dict.fromkeys(content["position"].values)).split(","))),
        # dropna(): a missing rsID would otherwise make str.join raise TypeError.
        "rsID": list(dict.fromkeys(
            item.strip() for item in ",".join(dict.fromkeys(content["mapped_rsID"].dropna().values)).split(",")))
    }
    group_list.append(content_dict)

pd.DataFrame(group_list).to_csv("query_file/guideline_metabolizer_group.csv", index=False)

In [104]:
# Collapse the diplotype guideline rows to one record per
# (gene, organization, chemical, diplotype) and export the result.
# "metabolizer" is written as an empty string for these records.
#
# Distinct values are collected with dict.fromkeys(...) rather than set(...):
# it keeps the same unique elements but in first-seen row order, so the exported
# lists and the single value picked for scalar fields are the same on every run
# (set iteration order of strings changes between interpreter runs).
# NOTE(review): groupby drops rows whose key columns are missing by default, and
# this frame is not passed through fillna — confirm that is intended.
group_list = []
for key, content in df_diplotype_guideline.groupby(["gene_name", "organization", "chemical_name", "diplotype_name"]):
    content_dict = {
        "gene_name": key[0],
        "metabolizer": "",
        "organization": key[1],
        "chemical_name": key[2],
        "diplotype": key[3],
        # Multi-valued fields: unique values in first-seen order.
        "haplotype": list(dict.fromkeys(content["haplotype_name"].values)),
        "nc_code": list(dict.fromkeys(content["NC_code"].values)),
        "ng_code": list(dict.fromkeys(content["NG_code"].values)),
        "nucleotide_code": list(dict.fromkeys(content["nucleotide_code"].values)),
        "protein_code": list(dict.fromkeys(content["protein_code"].values)),
        # Single-valued fields: take the value from the group's first row.
        "recommendation": content["recommendation"].iloc[0],
        "implication": content["implication"].iloc[0],
        "phenotype": content["phenotype"].iloc[0],
        "link": content["link"].iloc[0],
        "atc_code": content["atc_code"].iloc[0],
        "L1_info": content["L1_info"].iloc[0],
        "L2_info": content["L2_info"].iloc[0],
        "L3_info": content["L3_info"].iloc[0],
        "L4_info": content["L4_info"].iloc[0],
        # Comma-separated cells are merged, re-split and de-duplicated item by item.
        "position": list(dict.fromkeys(
            item.strip() for item in ",".join(dict.fromkeys(content["position"].values)).split(","))),
        # dropna(): a missing rsID would otherwise make str.join raise TypeError.
        "rsID": list(dict.fromkeys(
            item.strip() for item in ",".join(dict.fromkeys(content["mapped_rsID"].dropna().values)).split(",")))
    }
    group_list.append(content_dict)

pd.DataFrame(group_list).to_csv("query_file/guideline_diplotype_group.csv", index=False)

In [105]:
# Collapse the haplotype guideline rows to one record per
# (gene, organization, chemical, haplotype) and export the result.
# "metabolizer" and "diplotype" are written as empty strings for these records.
#
# Distinct values are collected with dict.fromkeys(...) rather than set(...):
# it keeps the same unique elements but in first-seen row order, so the exported
# lists and the single value picked for scalar fields are the same on every run
# (set iteration order of strings changes between interpreter runs).
# NOTE(review): groupby drops rows whose key columns are missing by default, and
# this frame is not passed through fillna — confirm that is intended.
group_list = []
for key, content in df_haplotype_guideline.groupby(["gene_name", "organization", "chemical_name", "haplotype_name"]):
    content_dict = {
        "gene_name": key[0],
        "metabolizer": "",
        "organization": key[1],
        "chemical_name": key[2],
        "diplotype": "",
        "haplotype": key[3],
        # Multi-valued fields: unique values in first-seen order.
        "nc_code": list(dict.fromkeys(content["NC_code"].values)),
        "ng_code": list(dict.fromkeys(content["NG_code"].values)),
        "nucleotide_code": list(dict.fromkeys(content["nucleotide_code"].values)),
        "protein_code": list(dict.fromkeys(content["protein_code"].values)),
        # Single-valued fields: take the value from the group's first row.
        "recommendation": content["recommendation"].iloc[0],
        "implication": content["implication"].iloc[0],
        "phenotype": content["phenotype"].iloc[0],
        "link": content["link"].iloc[0],
        "atc_code": content["atc_code"].iloc[0],
        "L1_info": content["L1_info"].iloc[0],
        "L2_info": content["L2_info"].iloc[0],
        "L3_info": content["L3_info"].iloc[0],
        "L4_info": content["L4_info"].iloc[0],
        # Comma-separated cells are merged, re-split and de-duplicated item by item.
        "position": list(dict.fromkeys(
            item.strip() for item in ",".join(dict.fromkeys(content["position"].values)).split(","))),
        # dropna(): a missing rsID would otherwise make str.join raise TypeError.
        "rsID": list(dict.fromkeys(
            item.strip() for item in ",".join(dict.fromkeys(content["mapped_rsID"].dropna().values)).split(",")))
    }
    group_list.append(content_dict)

pd.DataFrame(group_list).to_csv("query_file/guideline_haplotype_group.csv", index=False)

In [115]:
# Cypher query: for every gene with a `cpic_guideline` relationship to a chemical,
# return gene attributes (GRCh38 start/stop are exposed as chromosomal_start /
# chromosomal_stop), the CPIC/FDA/PGKB fields stored on the relationship, the
# chemical's ATC fields, and the constant string "CPIC" as `organization`.
cpic_drug_label_cypher = """
MATCH (ge:gene)-[r:cpic_guideline]->(che:chemical) 
RETURN ge.gene_name as gene_name, ge.OMIM as OMIM, ge.chromosome as chromosome, ge.refseq as refseq,
ge.chromosomal_start_GRCh38 as chromosomal_start, ge.chromosomal_stop_GRCh38 as chromosomal_stop, ge.is_VIP as is_VIP,
r.CPIC_level_status as CPIC_level_status, r.CPIC_level as CPIC_level, r.FDA_PGx_label as FDA_PGx_label,
r.PGKB_evidence_level as PGKB_evidence_level, r.guideline_link as link,
che.chemical_name as chemical_name, che.atc_code as atc_code, che.L1_info as L1_info, 
che.L2_info as L2_info, che.L3_info as L3_info, che.L4_info as L4_info, "CPIC" as organization
"""

# Run the query, load the records into a DataFrame and replace missing values
# with empty strings.
result_list = session.run(cpic_drug_label_cypher).data()
df_cpic_drug_label = pd.DataFrame(result_list).fillna("")

In [117]:
# Cypher query: for every gene with a `drug_label` relationship to a chemical,
# return gene attributes (GRCh38 start/stop are exposed as chromosomal_start /
# chromosomal_stop), the label fields stored on the relationship (link, label_name,
# testing_level, organization), and the chemical's ATC fields.
pgkb_drug_label_cypher = """
MATCH (ge:gene)-[r:drug_label]->(che:chemical) 
RETURN ge.gene_name as gene_name, ge.OMIM as OMIM, ge.chromosome as chromosome, ge.refseq as refseq,
ge.chromosomal_start_GRCh38 as chromosomal_start, ge.chromosomal_stop_GRCh38 as chromosomal_stop, ge.is_VIP as is_VIP,
r.link as link, r.label_name as label_name, r.testing_level as testing_level,
che.chemical_name as chemical_name, che.atc_code as atc_code, che.L1_info as L1_info, 
che.L2_info as L2_info, che.L3_info as L3_info, che.L4_info as L4_info, r.organization as organization
"""

# Run the query, load the records into a DataFrame and replace missing values
# with empty strings.
result_list = session.run(pgkb_drug_label_cypher).data()
df_pgkb_drug_label = pd.DataFrame(result_list).fillna("")

In [119]:
# Write both drug-label frames to CSV under query_file/, omitting the index column.
df_pgkb_drug_label.to_csv(path_or_buf="query_file/pgkb_drug_label.csv", index=False)
df_cpic_drug_label.to_csv(path_or_buf="query_file/cpic_drug_label.csv", index=False)

In [132]:
# Cypher query: for every variant with a `clinical_annotation` relationship to a
# chemical, follow `mutation_at` to its gene and return gene name/chromosome, the
# variant's change codes and rsID, the annotation fields stored on the relationship
# (phenotype_category, evidence_level, level_modifier, phenotype, score, link), and
# the chemical's ATC fields.
variant_clinical_cypher = """
MATCH (var:variant)-[r:clinical_annotation]->(che:chemical)
WITH var, r, che
MATCH (var)-[:mutation_at]->(ge:gene)
return ge.gene_name as gene_name, ge.chromosome as chromosome,
var.variant_name as variant_name, var.is_reference as is_reference,
var.NC_change_code as NC_code, var.NG_change_code as NG_code, 
var.nucleotide_change_code as nucleotide_code, 
var.protein_change_code as protein_code, var.mapped_rsID as mapped_rsID, 
r.phenotype_category as phenotype_category, r.evidence_level as evidence_level, 
r.level_modifier as level_modifier, r.phenotype as phenotype, r.score as score, r.link as link,
che.chemical_name as chemical_name, che.atc_code as atc_code, che.L1_info as L1_info,
che.L2_info as L2_info, che.L3_info as L3_info, che.L4_info as L4_info
"""

# Run the query, load the records into a DataFrame and replace missing values
# with empty strings.
result_list = session.run(variant_clinical_cypher).data()
df_variant_clinical = pd.DataFrame(result_list).fillna("")

In [133]:
# Collapse the clinical-annotation rows to one record per
# (variant, phenotype category, chemical), sort by evidence level and export.
#
# Distinct values are collected with dict.fromkeys(...) rather than set(...):
# it keeps the same unique elements but in first-seen row order, so the exported
# lists and the single value picked for scalar fields are the same on every run
# (set iteration order of strings changes between interpreter runs).
# NOTE: this cell rebinds df_variant_clinical to the grouped frame (whose columns
# are renamed, e.g. NC_code -> nc_code), so the query cell must be re-run before
# this cell can be executed again.
group_list = []
for key, content in df_variant_clinical.groupby(["variant_name", "phenotype_category", "chemical_name"]):
    content_dict = {
        # Single-valued fields: take the value from the group's first row.
        "gene_name": content["gene_name"].iloc[0],
        "chromosome": content["chromosome"].iloc[0],
        "variant_name": key[0],
        "phenotype_category": key[1],
        "chemical_name": key[2],
        # Multi-valued fields: unique values in first-seen order.
        "nc_code": list(dict.fromkeys(content["NC_code"].values)),
        "ng_code": list(dict.fromkeys(content["NG_code"].values)),
        "nucleotide_code": list(dict.fromkeys(content["nucleotide_code"].values)),
        "protein_code": list(dict.fromkeys(content["protein_code"].values)),
        "evidence_level": content["evidence_level"].iloc[0],
        "level_modifier": content["level_modifier"].iloc[0],
        "phenotype": list(dict.fromkeys(content["phenotype"].values)),
        "link": content["link"].iloc[0],
        "atc_code": content["atc_code"].iloc[0],
        "L1_info": content["L1_info"].iloc[0],
        "L2_info": content["L2_info"].iloc[0],
        "L3_info": content["L3_info"].iloc[0],
        "L4_info": content["L4_info"].iloc[0],
        # Comma-separated cells are merged, re-split and de-duplicated item by item.
        "rsID": list(dict.fromkeys(
            item.strip() for item in ",".join(dict.fromkeys(content["mapped_rsID"].values)).split(",")))
    }
    group_list.append(content_dict)
df_variant_clinical = pd.DataFrame(group_list)
# Sort by evidence level and renumber the rows 0..n-1.
df_variant_clinical = df_variant_clinical.sort_values(by=["evidence_level"]).reset_index(drop=True)
df_variant_clinical.to_csv("query_file/variant_clinical.csv", index=False)