In [3]:
# Neo4j Python driver. If it is missing, install it with "pip install neo4j"
# from a terminal/command line.
from neo4j import GraphDatabase
import pandas as pd
from tqdm import tqdm
import ast

# Bolt address of the database. Run ":server status" in Neo4j Desktop on YOUR
# own database to confirm the host and port to use here.
url = "bolt://localhost:7687"

# Module-level driver used by the loader functions in this notebook.
# NOTE(review): the user name and password are hardcoded; consider reading them
# from environment variables before sharing the notebook.
driver = GraphDatabase.driver(url, auth=("neo4j", "heart"))

In [2]:
def create_KG(tx):
    """
    Loads the protein-drug edge list into the graph.

    For every row of edge_list.csv a Protein node (keyed by UniProtID) and a
    Drug node (keyed by DrugbankID) are merged, and a HAS_RELATED_DRUG
    relationship is merged from the protein to the drug.

    @param tx is the transaction the statement is run in
    """
    load_edges_statement = '''
    LOAD CSV WITH HEADERS FROM 'file:///edge_list.csv' AS row
    MERGE (p: Protein {UniProtID: row.UNIPROT_ID})
    MERGE (d: Drug {DrugbankID: row.DRUGBANK_ID})
    WITH row, p, d
    MERGE (p)-[:HAS_RELATED_DRUG]->(d)
    '''
    tx.run(load_edges_statement)

In [3]:
def create_categories(tx):
    """
    Attaches drug categories read from node_list.csv.

    The CATEGORIES column of each row is split on ',' and every piece is
    merged as a Category node; the Drug whose DrugbankID matches the row is
    then linked to it with an IS_IN_CATEGORY relationship.

    @param tx is the transaction the statement is run in
    """
    categories_statement = '''
    LOAD CSV WITH HEADERS FROM 'file:///node_list.csv' AS row
    UNWIND split(row.CATEGORIES, ',') AS Category
    MERGE (c:Category {name: Category})
    with row, c
    MATCH (d: Drug {DrugbankID: row.DRUGBANK_ID})
    MERGE (d)-[:IS_IN_CATEGORY]->(c)
    '''
    tx.run(categories_statement)

In [4]:
##########################
#Protein-pmid list
def get_unid_pmid_list(tx) -> any:
    """
    Reads the protein/PMID edge list straight from its CSV file.
    @param tx is the transaction the query is run in
    @return result.data() for the query, i.e. the UNIPROT_ID and PMID columns
    """
    load_clause = "LOAD CSV WITH HEADERS FROM 'file:///edge_list_unid_and_pmid.csv' AS row "
    return_clause = "RETURN row.UNIPROT_ID as UNIPROT_ID, row.PMID AS PMID"
    return tx.run(load_clause + return_clause).data()

def process_unid_pmid_list() -> pd.DataFrame:
    """
    Fetches the protein/PMID table and parses its PMID column.

    Every PMID cell is read from the CSV as text and is converted with
    ast.literal_eval (the callers iterate over the parsed value, so it is
    expected to be the text form of a list).

    @return a DataFrame with the columns UNIPROT_ID and PMID
    @raise ValueError or SyntaxError from ast.literal_eval on a malformed cell
    """
    # Close the session once the rows are fetched instead of leaking it.
    with driver.session() as session:
        rows = session.write_transaction(get_unid_pmid_list)

    # Naming the columns keeps an empty result from failing on table["PMID"].
    table = pd.DataFrame(rows, columns=["UNIPROT_ID", "PMID"])

    # Replace the column in one assignment. Writing table["PMID"][i] = ... is
    # chained assignment: it warns on older pandas and is a no-op under
    # copy-on-write.
    parsed = [ast.literal_eval(raw) for raw in tqdm(table["PMID"], desc="Parsing CSV")]
    table["PMID"] = pd.Series(parsed, index=table.index, dtype=object)
    return table

def unid_pmid_query_for_deploy(tx, pmid) -> None:
    """
    Merges a PMID node carrying the given pmid value.
    @param tx is the transaction
    @param pmid is the value stored in the node's pmid property
    """
    tx.run("MERGE (p: PMID{pmid : $pmid})", pmid=pmid)
    
def unid_pmid_query_for_match(tx, unid, pmid) -> None:
    """
    Links an existing PMID node to an existing Protein node.
    @param tx is the transaction
    @param unid is the UniProtID of the protein to match
    @param pmid is the pmid of the PMID node to match
    """
    clauses = [
        "MATCH (p:Protein) WHERE p.UniProtID = $unid",
        "MATCH (s:PMID) WHERE s.pmid = $pmid",
        "MERGE (p)<-[:PMID_TARGET]-(s)",
    ]
    tx.run(" ".join(clauses), unid=unid, pmid=pmid)

def create_unid_pmids() -> None:
    """
    Deploys the PMIDs listed for each protein.

    For every (UNIPROT_ID, PMID list) row returned by process_unid_pmid_list,
    each PMID is merged as a PMID node and then linked to its protein.
    Entries that are None are skipped.

    @return None
    """
    table = process_unid_pmid_list()

    # One session for the whole load, closed on exit. Opening two sessions per
    # PMID (and never closing them) leaked a session on every call.
    with driver.session() as session:
        rows = zip(table["UNIPROT_ID"], table["PMID"])
        #iterate through the dataframe
        for unid, pmids in tqdm(rows, desc="Deploying Protein PMIDs: ", total=len(table)):
            for pmid in pmids:
                # Same guard as the drug loader: a missing PMID cannot be merged.
                if pmid is None:
                    continue
                session.write_transaction(unid_pmid_query_for_deploy, pmid)
                session.write_transaction(unid_pmid_query_for_match, unid, pmid)
    

In [5]:
def get_dbid_pmid_list(tx) -> any:
    """
    Reads the drug/PMID edge list straight from its CSV file.
    @param tx is the transaction the query is run in
    @return result.data() for the query, i.e. the DRUGBANK_ID and PMID columns
    """
    load_clause = "LOAD CSV WITH HEADERS FROM 'file:///edge_list_dbid_and_pmid.csv' AS row "
    return_clause = "RETURN row.DRUGBANK_ID as DRUGBANK_ID, row.PMID AS PMID"
    return tx.run(load_clause + return_clause).data()

def process_dbid_pmid_list() -> pd.DataFrame:
    """
    Fetches the drug/PMID table and parses its PMID column.

    Every PMID cell is read from the CSV as text and is converted with
    ast.literal_eval (the callers iterate over the parsed value, so it is
    expected to be the text form of a list).

    @return a DataFrame with the columns DRUGBANK_ID and PMID
    @raise ValueError or SyntaxError from ast.literal_eval on a malformed cell
    """
    # Close the session once the rows are fetched instead of leaking it.
    with driver.session() as session:
        rows = session.write_transaction(get_dbid_pmid_list)

    # Naming the columns keeps an empty result from failing on table["PMID"].
    table = pd.DataFrame(rows, columns=["DRUGBANK_ID", "PMID"])

    # Replace the column in one assignment. Writing table["PMID"][i] = ... is
    # chained assignment: it warns on older pandas and is a no-op under
    # copy-on-write.
    parsed = [ast.literal_eval(raw) for raw in tqdm(table["PMID"], desc="Parsing CSV")]
    table["PMID"] = pd.Series(parsed, index=table.index, dtype=object)
    return table

def dbid_pmid_query_for_deploy(tx, pmid) -> None:
    """
    Merges a PMID node carrying the given pmid value.
    @param tx is the transaction
    @param pmid is the value stored in the node's pmid property
    """
    tx.run("MERGE (p: PMID{pmid : $pmid})", pmid=pmid)
    
def dbid_pmid_query_for_match(tx, dbid, pmid) -> None:
    """
    Links an existing PMID node to an existing Drug node.
    @param tx is the transaction
    @param dbid is the DrugbankID of the drug to match
    @param pmid is the pmid of the PMID node to match
    """
    clauses = [
        "MATCH (p:Drug) WHERE p.DrugbankID = $dbid",
        "MATCH (s:PMID) WHERE s.pmid = $pmid",
        "MERGE (p)<-[:PMID_TARGET]-(s)",
    ]
    tx.run(" ".join(clauses), dbid=dbid, pmid=pmid)

def create_dbid_pmids() -> None:
    """
    Deploys the PMIDs listed for each drug.

    For every (DRUGBANK_ID, PMID list) row returned by process_dbid_pmid_list,
    each PMID is merged as a PMID node and then linked to its drug.
    Entries that are None are skipped.

    @return None
    """
    table = process_dbid_pmid_list()

    # One session for the whole load, closed on exit. Opening two sessions per
    # PMID (and never closing them) leaked a session on every call.
    with driver.session() as session:
        rows = zip(table["DRUGBANK_ID"], table["PMID"])
        #iterate through the dataframe
        for dbid, pmids in tqdm(rows, desc="Deploying Drug PMIDs: ", total=len(table)):
            for pmid in pmids:
                if pmid is None:
                    continue
                session.write_transaction(dbid_pmid_query_for_deploy, pmid)
                session.write_transaction(dbid_pmid_query_for_match, dbid, pmid)

In [6]:
def constraint_unique_uniprotID(tx):
    """
    Creates the unique_uniprotID constraint on the UniProtID of Protein nodes.
    @param tx is the transaction
    """
    statement = '''
    CREATE CONSTRAINT unique_uniprotID ON (p:Protein) ASSERT p.UniProtID IS UNIQUE
    '''
    tx.run(statement)
def constraint_unique_drugbankID(tx):
    """
    Creates the unique_drugbankID constraint on the DrugbankID of Drug nodes.
    @param tx is the transaction
    """
    statement = '''
    CREATE CONSTRAINT unique_drugbankID ON (d:Drug) ASSERT d.DrugbankID IS UNIQUE
    '''
    tx.run(statement)
def constraing_unique_pmids(tx):
    """
    Creates the unique_PMID constraint on the pmid property of PMID nodes.

    The label must be spelled PMID: labels are case-sensitive and the deploy
    queries in this notebook merge nodes as (:PMID {pmid: ...}), so a
    constraint declared on the label pmid would never cover them.

    @param tx is the transaction
    """
    tx.run('''
    CREATE CONSTRAINT unique_PMID ON (d:PMID) ASSERT d.pmid IS UNIQUE
    ''')
def constraint_unique_categories(tx):
    """
    Creates the unique_categories constraint on the name of Category nodes.
    @param tx is the transaction
    """
    statement = '''
    CREATE CONSTRAINT unique_categories ON (c:Category) ASSERT c.name IS UNIQUE
    '''
    tx.run(statement)


In [7]:
# Build the schema and load the base graph; each step runs in its own write
# transaction and the session is closed when the block exits.
with driver.session() as session:
    # Constraints are created before any data is merged.
    session.write_transaction(constraint_unique_uniprotID)
    session.write_transaction(constraint_unique_drugbankID)
    session.write_transaction(constraint_unique_categories)
    # The PMID constraint is defined alongside the others but was never
    # applied; register it here, ahead of the PMID loaders that run later.
    session.write_transaction(constraing_unique_pmids)
    # Proteins, drugs and their relationships, then the drug categories.
    session.write_transaction(create_KG)
    session.write_transaction(create_categories)

In [8]:
# Deploy the PMID nodes listed for proteins and link each one to its Protein node.
create_unid_pmids()

Parsing CSV: 100%|██████████| 828/828 [00:00<00:00, 1907.95it/s]
Deploying Protein PMIDs: 100%|██████████| 828/828 [05:21<00:00,  2.58it/s]


In [9]:
# Deploy the PMID nodes listed for drugs (DrugBank IDs) and link each one to its Drug node.
create_dbid_pmids()

Parsing CSV: 100%|██████████| 207/207 [00:00<00:00, 2964.93it/s]
Deploying Drug PMIDs: 100%|██████████| 207/207 [00:28<00:00,  7.22it/s]


----------------Graph Algorithms--------------------

@cypher:

    CALL gds.graph.create(
    'graph',
    ['Category', 'Drug', 'Protein', 'PMID'], 
    ['HAS_RELATED_DRUG', 'IS_IN_CATEGORY', 'PMID_TARGET']
    )

    
    CALL gds.graph.create(
    '4th',
    ['Drug', 'Protein', 'PMID'], 
    ['HAS_RELATED_DRUG', 'PMID_TARGET']
    )

In [31]:
# Neo4j Python driver. If it is missing, install it with "pip install neo4j"
# from a terminal/command line.
# NOTE(review): these imports repeat the ones in the first cell of the notebook;
# only numpy is new here.
from neo4j import GraphDatabase
import pandas as pd
from tqdm import tqdm
import ast
import numpy as np

# Bolt address of the database. Run ":server status" in Neo4j Desktop on YOUR
# own database to confirm the host and port to use here.
url = "bolt://localhost:7687"

# Rebinds the module-level `driver` name to a newly created driver.
# NOTE(review): the user name and password are hardcoded; consider reading them
# from environment variables before sharing the notebook.
driver = GraphDatabase.driver(url, auth=("neo4j", "heart"))

In [28]:
class pagerank():
    """Class to run the GDS PageRank procedures against the Neo4j database"""
    def __init__(self, uri: str = "neo4j://localhost:7687", auth: tuple = ("neo4j", "heart")) -> None:
        """
        @param uri is the address of the database (defaults to the local instance)
        @param auth is the (user, password) pair handed to the driver
        """
        self.driver = GraphDatabase.driver(uri, auth=auth)

    def close(self) -> None:
        """Closes the driver owned by this object"""
        self.driver.close()

    #estimation for memory
    @classmethod
    def memory_estimation(cls, tx) -> any:
        """
        Runs gds.pageRank.write.estimate on the projected graph named 'graph'.
        @param cls is the class
        @param tx is the transaction
        @returns result.single() of the estimate query (nodeCount,
                 relationshipCount, bytesMin, bytesMax, requiredMemory)
        """
        query = ("Call gds.pageRank.write.estimate('graph', {writeProperty: 'pageRank', maxIterations: 30, dampingFactor: 0.85}) YIELD nodeCount, relationshipCount, bytesMin, bytesMax, requiredMemory")
        result = tx.run(query)
        return result.single()

    @classmethod
    def pagerank(cls, tx) -> any:
        """
        Streams PageRank scores for the projected graph named '4th'.
        @param cls is the class
        @param tx is the transaction
        @return result.data() is the data of the pagerank (DrugbankID, score)
        """
        # Ties are ordered by the DrugbankID alias. The earlier sort key `name`
        # is not a column of this RETURN, so the query could not resolve it.
        query = ("Call gds.pageRank.stream('4th') YIELD nodeId, score RETURN gds.util.asNode(nodeId).DrugbankID AS DrugbankID, score ORDER BY score DESC, DrugbankID ASC")
        result = tx.run(query)
        return result.data()


    def estimate_pagerank(self) -> any:
        """
        @param self
        @return result is the result of the memory estimation
        """
        # The with-block closes the session instead of leaking it.
        with self.driver.session() as session:
            result = session.write_transaction(self.memory_estimation)
        return result


    def run_pagerank(self) -> any:
        """
        @param self
        @return result is the dataframe from the pagerank
        """
        # The with-block closes the session instead of leaking it.
        with self.driver.session() as session:
            rows = session.write_transaction(self.pagerank)
        return pd.DataFrame(rows)

In [29]:
# Create the PageRank helper, run the stream query and display the resulting table.
# NOTE(review): the driver opened by pagerank() is not closed in this cell.
hello = pagerank()
result = hello.run_pagerank()
result

Unnamed: 0,name,score
0,DB12010,458.105091
1,DB02709,82.677919
2,DB00945,52.978173
3,DB06154,27.763331
4,DB00421,27.372005
...,...,...
12724,,0.150000
12725,,0.150000
12726,,0.150000
12727,,0.150000


Local Clustering Coefficient requires all relationships to be undirected

@cypher:


    CALL gds.graph.create(
    '5th',
    ['Drug', 'Protein', 'PMID'], 
    {
      HAS_RELATED_DRUG: {
        orientation: 'UNDIRECTED'
      }, 
      PMID_TARGET: {
        orientation: 'UNDIRECTED'
      }
    }
    )


    CALL gds.graph.create(
    'drug-protein',
    ['Drug', 'Protein'], 
    {
      HAS_RELATED_DRUG: {
        orientation: 'UNDIRECTED'
      }
    }
    )

In [38]:
class algo():
    """Class to run the GDS node similarity algorithm on the 'drug-protein' graph"""
    def __init__(self, uri: str = "neo4j://localhost:7687", auth: tuple = ("neo4j", "heart")) -> None:
        """
        @param uri is the address of the database (defaults to the local instance)
        @param auth is the (user, password) pair handed to the driver
        """
        self.driver = GraphDatabase.driver(uri, auth=auth)

    def close(self) -> None:
        """Closes the driver owned by this object"""
        self.driver.close()

    @classmethod
    def algo(cls, tx) -> any:
        """
        Streams gds.nodeSimilarity for the projected graph named 'drug-protein'.
        @param cls is the class
        @param tx is the transaction
        @return result.data() is the data of the similarity query
                (nodeID_1, nodeID_2, similarity), highest similarity first
        """
        # NOTE(review): both ids are read from the DrugbankID property; a node
        # without that property would come back as null - confirm the
        # projection only pairs up drugs.
        query = ("""
        CALL gds.nodeSimilarity.stream('drug-protein')
        YIELD node1, node2, similarity
        RETURN gds.util.asNode(node1).DrugbankID AS nodeID_1, gds.util.asNode(node2).DrugbankID AS nodeID_2, similarity
        ORDER BY similarity DESCENDING, nodeID_1, nodeID_2
        """)
        result = tx.run(query)
        return result.data()


    def run_algo(self) -> any:
        """
        @param self
        @return result is the dataframe from the similarity query
        """
        # The with-block closes the session instead of leaking it.
        with self.driver.session() as session:
            rows = session.write_transaction(self.algo)
        return pd.DataFrame(rows)

In [42]:
# Limit how many rows pandas prints when the table is displayed.
pd.set_option('display.max_rows', 30)
# Run the node-similarity query and collect the pairs into a DataFrame.
hello = algo()
result = hello.run_algo()
# Keep only the node pairs whose similarity is exactly 1.
result = result[result['similarity'] == 1]
result