# INSTRUCTIONS FOR USE

**Neo4j Graph Database Instance Version = v4.4.7**
 1. Password = "caseolap"
 2. Plugins
    - APOC: v4.4.0.6
    - GDSL: v2.1.1

In [1]:
#stl
import os
import warnings
from tqdm import tqdm

warnings.filterwarnings("ignore")

#data handling
import pandas as pd
import numpy as np

#neo4j
import neo4j
from neo4j import GraphDatabase
import networkx

In [2]:
#check connection
# Connection settings default to the values documented under "INSTRUCTIONS FOR
# USE" but can be overridden through environment variables, so a different
# instance or password does not require editing the notebook.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "caseolap")

driver = GraphDatabase.driver(uri = NEO4J_URI, auth = (NEO4J_USER, NEO4J_PASSWORD))
# Last expression of the cell: its return value is what the notebook displays.
driver.verify_connectivity()

'Neo4j/4.4.7'

In [3]:
# Load the merged edge list (tab-separated) and display it.
# read_table uses a tab delimiter by default.
EDGE_LIST = pd.read_table("../input_data/merged_edge_list.tsv")
EDGE_LIST

Unnamed: 0,head,relation,tail,weight
0,MeSH_Tree_Disease:C01.069,MeSH_hierarchy,MeSH_Tree_Disease:C01,1.0
1,MeSH_Tree_Disease:C01.100,MeSH_hierarchy,MeSH_Tree_Disease:C01,1.0
2,MeSH_Tree_Disease:C01.100.500,MeSH_hierarchy,MeSH_Tree_Disease:C01.100,1.0
3,MeSH_Tree_Disease:C01.125,MeSH_hierarchy,MeSH_Tree_Disease:C01,1.0
4,MeSH_Tree_Disease:C01.150,MeSH_hierarchy,MeSH_Tree_Disease:C01,1.0
...,...,...,...,...
193096,R-HSA-983705,Reactome_Hierarchy,R-HSA-983695,1.0
193097,R-HSA-983712,Reactome_Hierarchy,R-HSA-2672351,1.0
193098,R-HSA-983712,Reactome_Hierarchy,R-HSA-936837,1.0
193099,R-HSA-991365,Reactome_Hierarchy,R-HSA-170670,1.0


In [4]:
# Load the merged node list (tab-separated) and display it.
# read_table uses a tab delimiter by default.
NODE_LIST = pd.read_table("../input_data/merged_node_list.tsv")
NODE_LIST

Unnamed: 0,node,node_type
0,MeSH_Tree_Disease:C25.723.127.071,MeSH_Tree_Disease
1,MeSH_Tree_Disease:C23.550.291.531.750,MeSH_Tree_Disease
2,MeSH_Tree_Disease:C01.150.252.400.310.980,MeSH_Tree_Disease
3,MeSH_Tree_Disease:C21.223,MeSH_Tree_Disease
4,MeSH_Tree_Disease:C12.050.351.968.419.403.875.500,MeSH_Tree_Disease
...,...,...
38583,R-HSA-453276,Reactome_Pathway
38584,R-HSA-162594,Reactome_Pathway
38585,R-HSA-9634285,Reactome_Pathway
38586,R-HSA-964739,Reactome_Pathway


In [5]:
# Load the knowledge-graph edges (comma-separated), which also contain the
# predicted_association rows, and display them.
PREDICTIONS = pd.read_csv(
    filepath_or_buffer = "../input_data/kg_edges_with_predictions.csv",
)
PREDICTIONS

Unnamed: 0,head,relation,tail,weight
0,MeSH_Tree_Disease:C01.069,MeSH_hierarchy,MeSH_Tree_Disease:C01,1.000000
1,MeSH_Tree_Disease:C01.100,MeSH_hierarchy,MeSH_Tree_Disease:C01,1.000000
2,MeSH_Tree_Disease:C01.100.500,MeSH_hierarchy,MeSH_Tree_Disease:C01.100,1.000000
3,MeSH_Tree_Disease:C01.125,MeSH_hierarchy,MeSH_Tree_Disease:C01,1.000000
4,MeSH_Tree_Disease:C01.150,MeSH_hierarchy,MeSH_Tree_Disease:C01,1.000000
...,...,...,...,...
196078,ARR,predicted_association,P26678,0.997566
196079,ARR,predicted_association,Q9H1K4,0.973283
196080,ARR,predicted_association,Q8NCR3,0.998709
196081,ARR,predicted_association,Q9BT17,0.991093


In [6]:
#see node types
set(NODE_LIST["node_type"])

{'CVD', 'MeSH_Tree_Disease', 'Protein', 'Reactome_Pathway'}

In [7]:
# Uniqueness constraints on `name`, one per node label.
# IF NOT EXISTS makes every statement idempotent: re-running the cell is a
# no-op for constraints that already exist, so no exception has to be
# swallowed and genuine failures (connection, syntax, permissions) propagate.
UNIQUE_CONSTAINT_1 = "CREATE CONSTRAINT IF NOT EXISTS ON (d:CVD) ASSERT d.name IS UNIQUE"
UNIQUE_CONSTAINT_2 = "CREATE CONSTRAINT IF NOT EXISTS ON (m:MeSH_Tree_Disease) ASSERT m.name IS UNIQUE"
UNIQUE_CONSTAINT_3 = "CREATE CONSTRAINT IF NOT EXISTS ON (p:Protein) ASSERT p.name IS UNIQUE"
UNIQUE_CONSTAINT_4 = "CREATE CONSTRAINT IF NOT EXISTS ON (r:Reactome_Pathway) ASSERT r.name IS UNIQUE"

# A single session, closed when the block exits. Each statement is handled
# independently, so one pre-existing constraint no longer prevents the
# remaining ones from being created.
with driver.session() as session:
    for statement in (
        UNIQUE_CONSTAINT_1,
        UNIQUE_CONSTAINT_2,
        UNIQUE_CONSTAINT_3,
        UNIQUE_CONSTAINT_4,
    ):
        # consume() waits for the schema command to finish so that any
        # error is raised here rather than being left pending.
        info = session.run(statement).consume()

In [8]:
!ls ../input_data

kg_edges_with_predictions.csv
merged_edge_list.tsv
merged_node_list.tsv


In [9]:
# Read the same CSV that PREDICTIONS was loaded from into a second DataFrame
# and display it.
kg_edges = pd.read_csv(
    filepath_or_buffer = "../input_data/kg_edges_with_predictions.csv",
)
kg_edges

Unnamed: 0,head,relation,tail,weight
0,MeSH_Tree_Disease:C01.069,MeSH_hierarchy,MeSH_Tree_Disease:C01,1.000000
1,MeSH_Tree_Disease:C01.100,MeSH_hierarchy,MeSH_Tree_Disease:C01,1.000000
2,MeSH_Tree_Disease:C01.100.500,MeSH_hierarchy,MeSH_Tree_Disease:C01.100,1.000000
3,MeSH_Tree_Disease:C01.125,MeSH_hierarchy,MeSH_Tree_Disease:C01,1.000000
4,MeSH_Tree_Disease:C01.150,MeSH_hierarchy,MeSH_Tree_Disease:C01,1.000000
...,...,...,...,...
196078,ARR,predicted_association,P26678,0.997566
196079,ARR,predicted_association,Q9H1K4,0.973283
196080,ARR,predicted_association,Q8NCR3,0.998709
196081,ARR,predicted_association,Q9BT17,0.991093


In [10]:
# Read the same TSV that NODE_LIST was loaded from into a second DataFrame
# and display it. read_table uses a tab delimiter by default.
nodes = pd.read_table("../input_data/merged_node_list.tsv")
nodes

Unnamed: 0,node,node_type
0,MeSH_Tree_Disease:C25.723.127.071,MeSH_Tree_Disease
1,MeSH_Tree_Disease:C23.550.291.531.750,MeSH_Tree_Disease
2,MeSH_Tree_Disease:C01.150.252.400.310.980,MeSH_Tree_Disease
3,MeSH_Tree_Disease:C21.223,MeSH_Tree_Disease
4,MeSH_Tree_Disease:C12.050.351.968.419.403.875.500,MeSH_Tree_Disease
...,...,...
38583,R-HSA-453276,Reactome_Pathway
38584,R-HSA-162594,Reactome_Pathway
38585,R-HSA-9634285,Reactome_Pathway
38586,R-HSA-964739,Reactome_Pathway


## Deploy Nodes with Corresponding Metadata

In [11]:
# Distinct node types present in the node list (set order, i.e. arbitrary).
types = [*set(NODE_LIST["node_type"])]
types

['Protein', 'MeSH_Tree_Disease', 'CVD', 'Reactome_Pathway']

In [12]:
def deploy_protein(tx, name) -> None:
    """Run a Cypher MERGE for a Protein node whose `name` property is `name`.

    ARGS:
        tx: object exposing run(query, **parameters)
        name: value bound to the $name query parameter
    """
    tx.run("MERGE (n:Protein{name: $name})", name = name)

def deploy_MeSH_Tree_Disease(tx, name) -> None:
    """Run a Cypher MERGE for a MeSH_Tree_Disease node whose `name` property is `name`.

    ARGS:
        tx: object exposing run(query, **parameters)
        name: value bound to the $name query parameter
    """
    tx.run("MERGE (n:MeSH_Tree_Disease{name: $name})", name = name)
    
def deploy_CVD(tx, name) -> None:
    """Run a Cypher MERGE for a CVD node whose `name` property is `name`.

    ARGS:
        tx: object exposing run(query, **parameters)
        name: value bound to the $name query parameter
    """
    tx.run("MERGE (n:CVD{name: $name})", name = name)

def deploy_Reactome_Pathway(tx, name) -> None:
    """Run a Cypher MERGE for a Reactome_Pathway node whose `name` property is `name`.

    ARGS:
        tx: object exposing run(query, **parameters)
        name: value bound to the $name query parameter
    """
    tx.run("MERGE (n:Reactome_Pathway{name: $name})", name = name)

In [13]:
#runtime ~ 30 min
# Node type -> (transaction function, progress-bar caption).
deployers = {
    "Protein": (deploy_protein, "Deploying Proteins"),
    "Reactome_Pathway": (deploy_Reactome_Pathway, "Deploying Reactome_Pathways"),
    "MeSH_Tree_Disease": (deploy_MeSH_Tree_Disease, "Deploying MeSH_Tree_Disease"),
    "CVD": (deploy_CVD, "Deploying CVD"),
}

# One session is reused for every node and closed when the block exits
# (including on error or interrupt), instead of opening a new, never-closed
# session for each of the nodes.
with driver.session() as session:
    for i in types:
        if i not in deployers:
            # Node types without a deploy function are skipped.
            continue

        deploy, caption = deployers[i]
        df = NODE_LIST[NODE_LIST["node_type"] == i]

        # Each node is still written in its own managed write transaction.
        for name in tqdm(df["node"].to_list(), desc = caption):
            info = session.write_transaction(deploy, name)

print("success")

Deploying Proteins: 100%|██████████| 17953/17953 [02:47<00:00, 107.28it/s]
Deploying MeSH_Tree_Disease: 100%|██████████| 18026/18026 [02:33<00:00, 117.69it/s]
Deploying CVD: 100%|██████████| 8/8 [00:00<00:00, 87.92it/s]
Deploying Reactome_Pathways: 100%|██████████| 2601/2601 [00:20<00:00, 125.00it/s]

success





## Deploying Relationships

In [14]:
PREDICTIONS

Unnamed: 0,head,relation,tail,weight
0,MeSH_Tree_Disease:C01.069,MeSH_hierarchy,MeSH_Tree_Disease:C01,1.000000
1,MeSH_Tree_Disease:C01.100,MeSH_hierarchy,MeSH_Tree_Disease:C01,1.000000
2,MeSH_Tree_Disease:C01.100.500,MeSH_hierarchy,MeSH_Tree_Disease:C01.100,1.000000
3,MeSH_Tree_Disease:C01.125,MeSH_hierarchy,MeSH_Tree_Disease:C01,1.000000
4,MeSH_Tree_Disease:C01.150,MeSH_hierarchy,MeSH_Tree_Disease:C01,1.000000
...,...,...,...,...
196078,ARR,predicted_association,P26678,0.997566
196079,ARR,predicted_association,Q9H1K4,0.973283
196080,ARR,predicted_association,Q8NCR3,0.998709
196081,ARR,predicted_association,Q9BT17,0.991093


In [15]:
def deploy_rel(tx, node1: str, node2 : str, relation : str, weight: float) -> None:
    """
    Merge a weighted relationship (node1)-[:relation {WEIGHT: weight}]->(node2).

    Nodes are looked up by their `name` property without a label filter. If
    either name matches no node, the query creates nothing.

    ARGS:
        tx: transaction object exposing run(query, **parameters)
        node1: name of src
        node2: name of dst
        relation: relationship type
        weight: value stored in the relationship's WEIGHT property

    RETURNS: None, but uploads in the neo4j DBMS instance
    """
    # A relationship type cannot be passed as a Cypher parameter, so it has to
    # be interpolated. Backtick-quote it and double any embedded backtick so
    # the value cannot terminate the identifier and alter the query.
    rel_type = "`%s`" % str(relation).replace("`", "``")

    # Node names and the weight are bound as parameters rather than spliced
    # into the query text: names containing quotes no longer break the
    # statement, and the query text is identical for every edge of a type.
    query = """
            MATCH (n) WHERE n.name = $node1
            MATCH (m) WHERE m.name = $node2
            MERGE (n)-[:%s {WEIGHT:$weight}]->(m)
            """ % rel_type

    tx.run(query, node1 = node1, node2 = node2, weight = weight)

In [16]:
# One session is reused for every edge and closed when the block exits
# (including on KeyboardInterrupt), instead of opening a new, never-closed
# session per row.
with driver.session() as session:
    # Walk the four columns in lockstep rather than doing a label-based
    # lookup per column for every row index.
    edges = zip(
        PREDICTIONS["head"],
        PREDICTIONS["tail"],
        PREDICTIONS["relation"],
        PREDICTIONS["weight"],
    )
    for head, tail, relation, weight in tqdm(edges, total = len(PREDICTIONS), desc = "deploying relationships"):
        # float() hands the driver a plain Python float.
        info = session.write_transaction(deploy_rel, head, tail, relation, float(weight))


deploying relationships:   1%|          | 1952/196083 [03:36<5:59:10,  9.01it/s] 


KeyboardInterrupt: 

### Note that while I specified a direction during edge creation (Neo4j always stores one), edges are matched as undirected unless a direction is specified at query time