In [15]:
#stl
import os
import warnings
from tqdm import tqdm

warnings.filterwarnings("ignore")

#data handling
import pandas as pd
import numpy as np

#neo4j
import neo4j
from neo4j import GraphDatabase
import networkx

In [16]:
driver = GraphDatabase.driver(uri = "bolt://localhost:7687", auth = ("neo4j","caseolap"))
driver.verify_connectivity()

'Neo4j/4.4.10'

In [23]:

UNIQUE_CONSTAINT_1 = "CREATE CONSTRAINT ON (d:CVD) ASSERT d.name IS UNIQUE"
UNIQUE_CONSTAINT_2 = "CREATE CONSTRAINT ON (m:MeSH_Tree_Disease) ASSERT m.name IS UNIQUE"
UNIQUE_CONSTAINT_3 = "CREATE CONSTRAINT ON (p:Protein) ASSERT p.name IS UNIQUE"
UNIQUE_CONSTAINT_4 = "CREATE CONSTRAINT ON (r:Reactome_Pathway) ASSERT r.name  IS UNIQUE"

info = driver.session().run(UNIQUE_CONSTAINT_1)
info = driver.session().run(UNIQUE_CONSTAINT_2)
info = driver.session().run(UNIQUE_CONSTAINT_3)
info = driver.session().run(UNIQUE_CONSTAINT_4)


In [24]:
!ls ../input_data

kg_edges_with_predictions.csv merged_node_list.tsv


In [25]:
kg_edges = pd.read_csv("../input_data/kg_edges_with_predictions.csv")
kg_edges

Unnamed: 0,head,relation,tail,weight
0,MeSH_Tree_Disease:MeSH_Tree_Disease:C01.069,MeSH_hierarchy,MeSH_Tree_Disease:MeSH_Tree_Disease:C01,1.000000
1,MeSH_Tree_Disease:MeSH_Tree_Disease:C01.100,MeSH_hierarchy,MeSH_Tree_Disease:MeSH_Tree_Disease:C01,1.000000
2,MeSH_Tree_Disease:MeSH_Tree_Disease:C01.100.500,MeSH_hierarchy,MeSH_Tree_Disease:MeSH_Tree_Disease:C01.100,1.000000
3,MeSH_Tree_Disease:MeSH_Tree_Disease:C01.125,MeSH_hierarchy,MeSH_Tree_Disease:MeSH_Tree_Disease:C01,1.000000
4,MeSH_Tree_Disease:MeSH_Tree_Disease:C01.150,MeSH_hierarchy,MeSH_Tree_Disease:MeSH_Tree_Disease:C01,1.000000
...,...,...,...,...
88008,VD,predicted_association,P36873,0.971166
88009,VD,predicted_association,P19634,0.956168
88010,VD,predicted_association,P27338,0.928993
88011,VD,predicted_association,Q13268,0.964434


In [26]:
nodes = pd.read_csv("../input_data/merged_node_list.tsv", sep = "\t")
nodes

Unnamed: 0,node,node_type
0,MeSH_Tree_Disease:MeSH_Tree_Disease:C23.550.07...,MeSH_Tree_Disease
1,MeSH_Tree_Disease:MeSH_Tree_Disease:C16.131.74...,MeSH_Tree_Disease
2,MeSH_Disease:D001660,MeSH_Tree_Disease
3,MeSH_Tree_Disease:MeSH_Tree_Disease:C12.050.35...,MeSH_Tree_Disease
4,MeSH_Tree_Disease:C12.950.419.473.585.220,MeSH_Tree_Disease
...,...,...
40366,Q9BZ23,Protein
40367,Q9Y227,Protein
40368,Q7Z2D5,Protein
40369,P25940,Protein


## Deploy Nodes with Corresponding Metadata

In [30]:
types = list(set(nodes["node_type"].to_list()))
types

['Protein', 'Reactome_Pathway', 'MeSH_Tree_Disease', 'CVD']

In [31]:
def deploy_protein(tx, name) -> None:
    query = "MERGE (n:Protein{name: $name})"
    tx.run(query, name = name)

def deploy_MeSH_Tree_Disease(tx, name) -> None:
    query = "MERGE (n:MeSH_Tree_Disease{name: $name})"
    tx.run(query, name = name)
    
def deploy_CVD(tx, name) -> None:
    query = "MERGE (n:CVD{name: $name})"
    tx.run(query, name = name)

def deploy_Reactome_Pathway(tx, name) -> None:
    query = "MERGE (n:Reactome_Pathway{name: $name})"
    tx.run(query, name = name)

In [34]:
#runtime ~ 30 min
for i in types:

    df = nodes[nodes["node_type"] == i]

    if i == "Protein":
        for name in tqdm(df["node"].to_list(), desc = "Deploying Proteins"):
            info = driver.session().write_transaction(deploy_protein, name)

    elif i == "Reactome_Pathway":
        for name in tqdm(df["node"].to_list(), desc = "Deploying Reactome_Pathways"):
            info = driver.session().write_transaction(deploy_Reactome_Pathway, name)

    elif i == "MeSH_Tree_Disease":
        for name in tqdm(df["node"].to_list(), desc = "Deploying MeSH_Tree_Disease"):
            info = driver.session().write_transaction(deploy_MeSH_Tree_Disease, name)

    elif i == "CVD":
        for name in tqdm(df["node"].to_list(), desc = "Deploying CVD"):
            info = driver.session().write_transaction(deploy_CVD, name)

print("success")

Deploying Proteins: 100%|██████████| 8853/8853 [04:12<00:00, 35.11it/s]
Deploying Reactome_Pathways: 100%|██████████| 508/508 [00:13<00:00, 36.50it/s]
Deploying MeSH_Tree_Disease: 100%|██████████| 31002/31002 [14:08<00:00, 36.53it/s]
Deploying CVD: 100%|██████████| 8/8 [00:00<00:00, 20.32it/s]

success





## Deploying Relationships

In [47]:
kg_edges[kg_edges["relation"] == "MeSH_is"]

Unnamed: 0,head,relation,tail,weight
12952,MeSH_Tree_Disease,MeSH_is,MeSH_Disease:D007239,1.0
12953,MeSH_Tree_Disease,MeSH_is,MeSH_Disease:D000785,1.0
12954,MeSH_Tree_Disease,MeSH_is,MeSH_Disease:D001170,1.0
12955,MeSH_Tree_Disease,MeSH_is,MeSH_Disease:D016918,1.0
12956,MeSH_Tree_Disease,MeSH_is,MeSH_Disease:D058345,1.0
...,...,...,...,...
25923,MeSH_Tree_Disease,MeSH_is,MeSH_Disease:D000067073,1.0
25924,MeSH_Tree_Disease,MeSH_is,MeSH_Disease:D000080037,1.0
25925,MeSH_Tree_Disease,MeSH_is,MeSH_Disease:D000082002,1.0
25926,MeSH_Tree_Disease,MeSH_is,MeSH_Disease:D013313,1.0


In [37]:
edge_types = list(set(kg_edges["relation"]))
edge_types

['MeSH_is',
 'binds_to',
 'MeSH_hierarchy',
 'CaseOLAP_score',
 'participates_in_pathway',
 'predicted_association',
 'MeSH_CVD']

In [None]:
def create_MeSH_hierarchy(tx, src, rel, dest, weight):
    query = '''
    MATCH (d:Deploying MeSH_Tree_Disease{name:$src})
    MATCH (p:Deploying MeSH_Tree_Disease{name:$dest})
    MERGE (d)-[:MeSH_hierarchy {WEIGHT:$weight}]->(p)
    '''
    tx.run(query, src = src, rel = rel, dest = dest, weight = weight)

def create_MeSH_is(tx, src, rel, dest, weight):
    query = '''
    MATCH (d:Deploying MeSH_Tree_Disease{name:$src})
    MATCH (p:Deploying MeSH_Tree_Disease{name:$dest})
    MERGE (d)-[:MeSH_hierarchy {WEIGHT:$weight}]->(p)
    '''
    tx.run(query, src = src, rel = rel, dest = dest, weight = weight)
    