In [None]:
# Install
# !python -m pip install --upgrade pip --user
# !pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
# !pip install import-ipynb
# !pip install -U -q PyDrive
# !pip install bioservices
# !pip install neo4j --user
# !pip install neo4jupyter
# !pip install msgpack --user
# !pip install pandas_read_xml --user
# !pip install ontobio --user
# !pip install numpy --user
# !pip install scipy --user
# !pip install libicon v --user
# !pip install networkx --user
# !pip install python-igraph --user
# !pip install ndex-dev --user
# !pip install tulip-python --user
# !pip install ddot --user
# !pip3 install py_stringmatching --user

In [1]:
# Imports
import collections
import csv
import io
import json
import os
import re
import statistics
import sys
import warnings
import xml.etree.ElementTree as ET

import import_ipynb
import neo4jupyter
import numpy as np
import pandas as pd
import py_stringmatching as sm
import requests
import scipy.spatial.distance
from bioservices import UniProt
from neo4j import GraphDatabase
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_string_dtype
from py2neo import Graph

# Notebook-wide pandas display settings.
# The fully qualified "display.*" keys are used on purpose: pandas resolves
# option names by pattern matching, and a bare 'max_rows' is ambiguous (and
# raises OptionError) on pandas versions that also register
# 'styler.render.max_rows'.
pd.set_option('display.max_rows', 99999)      # effectively "never truncate rows"
pd.set_option('display.max_colwidth', 400)    # show long text cells (default is 50)
pd.describe_option('display.max_colwidth')    # print the option's doc and current value

display.max_colwidth : int or None
    The maximum width in characters of a column in the repr of
    a pandas data structure. When the column overflows, a "..."
    placeholder is embedded in the output. A 'None' value means unlimited.
    [default: 50] [currently: 400]


In [2]:
from BIC_DATASETS import BioDatasets as biodts

importing Jupyter notebook from BIC_DATASETS.ipynb
display.max_colwidth : int or None
    The maximum width in characters of a column in the repr of
    a pandas data structure. When the column overflows, a "..."
    placeholder is embedded in the output. A 'None' value means unlimited.
    [default: 50] [currently: 400]


In [3]:
#%%timeit
# Load dataset #1 (OMIM) through the BioDatasets helper imported as `biodts`.
# NOTE(review): presumably returns a pandas DataFrame (the main algorithm cell
# calls .dropna/.iloc/.columns on it) — confirm in BIC_DATASETS.
OMIM = biodts.gen_omim() #1

In [4]:
#%%timeit
# Load dataset #2 (DrugCentral) through the BioDatasets helper imported as `biodts`.
# NOTE(review): presumably returns a pandas DataFrame (the main algorithm cell
# calls .dropna/.iloc/.columns on it) — confirm in BIC_DATASETS.
DRUGCENTRAL = biodts.gen_drugcentral() #2

In [5]:
#%%timeit
# Load dataset #3 (DisGeNET) through the BioDatasets helper imported as `biodts`.
# NOTE(review): presumably returns a pandas DataFrame (the main algorithm cell
# calls .dropna/.iloc/.columns on it) — confirm in BIC_DATASETS.
DISGENET = biodts.gen_disgenet() #3

In [6]:
#%%timeit
# Load dataset #4 (MONDO) through the BioDatasets helper imported as `biodts`.
# NOTE(review): presumably returns a pandas DataFrame (the main algorithm cell
# calls .dropna/.iloc/.columns on it) — confirm in BIC_DATASETS.
MONDO = biodts.gen_mondo() #4

In [7]:
#%%timeit
# Load dataset #5 (Reactome) through the BioDatasets helper imported as `biodts`.
# NOTE(review): presumably returns a pandas DataFrame (the main algorithm cell
# calls .dropna/.iloc/.columns on it) — confirm in BIC_DATASETS.
REACTOME = biodts.gen_reactome() #5

In [8]:
#%%timeit
# Load dataset #6 (DrugBank protein table) through the BioDatasets helper imported as `biodts`.
# NOTE(review): presumably returns a pandas DataFrame (the main algorithm cell
# calls .dropna/.iloc/.columns on it) — confirm in BIC_DATASETS.
DRUGBANK_PROTEIN = biodts.gen_drug_bank_protein() #6

In [9]:
#%%timeit
# Load dataset #7 (DrugBank) through the BioDatasets helper imported as `biodts`.
# NOTE(review): presumably returns a pandas DataFrame (the main algorithm cell
# calls .dropna/.iloc/.columns on it) — confirm in BIC_DATASETS.
DRUGBANK = biodts.gen_drug_bank() #7

In [10]:
#%%timeit
# Load dataset #8 (UniProt) through the BioDatasets helper imported as `biodts`.
# NOTE(review): presumably returns a pandas DataFrame (the main algorithm cell
# calls .dropna/.iloc/.columns on it) — confirm in BIC_DATASETS.
UNIPROT = biodts.gen_uniprot() #8

In [11]:
#%%timeit
# Load dataset #9 (IID, human) through the BioDatasets helper imported as `biodts`.
# NOTE(review): presumably returns a pandas DataFrame (the main algorithm cell
# calls .dropna/.iloc/.columns on it) — confirm in BIC_DATASETS.
IID = biodts.gen_iid_human() #9

## Neo4J

In [12]:
# based on: https://neo4j.com/developer/python/; https://towardsdatascience.com/neo4j-cypher-python-7a919a372be7
class Neo4jConnection:
    """Small convenience wrapper around a Neo4j driver.

    The driver is created eagerly in ``__init__``. Creation errors are printed
    rather than raised, in which case the driver stays ``None`` and the first
    call to :meth:`query` fails its assertion.

    The object can be used as a context manager; leaving the ``with`` block
    closes the driver.
    """

    def __init__(self, uri, user, pwd):
        """Store the connection settings and try to create the driver.

        Args:
            uri: Connection URI handed to ``GraphDatabase.driver``.
            user: User name for authentication.
            pwd: Password for authentication.
        """
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            # Best effort: report the problem and leave the driver unset.
            print("Failed to create the driver:", e)

    def __enter__(self):
        """Return the connection itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; exceptions propagate."""
        self.close()
        return False

    def close(self):
        """Close the underlying driver if one was created."""
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, db=None, parameters=None):
        """Run one Cypher statement and return its records as a list.

        Args:
            query: Cypher text to execute.
            db: Optional database name; when ``None`` the driver's default
                session is used.
            parameters: Optional mapping of query parameters (``$name``
                placeholders in ``query``). Prefer this over concatenating
                values into the query text.

        Returns:
            A list with the records produced by the statement, or ``None``
            when the statement failed (the error is printed, not raised).

        Raises:
            AssertionError: If the driver could not be created.
        """
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None
        try:
            if db is not None:
                session = self.__driver.session(database=db)
            else:
                session = self.__driver.session()
            # Materialise the result while the session is still open.
            response = list(session.run(query, parameters))
        except Exception as e:
            # Best effort: report the failure and return None to the caller.
            print("Query failed:", e)
        finally:
            if session is not None:
                session.close()
        return response

In [36]:
# Interacting with Neo4j
# Connection settings can be overridden with the NEO4J_URI / NEO4J_USER /
# NEO4J_PASSWORD environment variables so real credentials never have to be
# committed with the notebook; the fallbacks are the local development values.
conn2 = Neo4jConnection(
    uri=os.environ.get("NEO4J_URI", "bolt://localhost:11003"),
    user=os.environ.get("NEO4J_USER", "autobiodatasets"),
    pwd=os.environ.get("NEO4J_PASSWORD", "autobiodatasets"),
)

# create a new database, use database, create nodes in Neo4J
# conn2.query("create database autobiodatasets")
conn2.query("use autobiodatasets MATCH (n) RETURN n")

#create all nodes
# conn2.query("CREATE (UNIPROT:Dataframe {name: 'UNIPROT'}), (IID:Dataframe {name: 'IID'}), (REACTOME:Dataframe {name: 'REACTOME'}), (DRUGBANK:Dataframe {name: 'DRUGBANK'}), (DRUGBANK_PROTEIN:Dataframe {name: 'DRUGBANK_PROTEIN'}), (OMIM:Dataframe {name: 'OMIM'}), (DRUGCENTRAL:Dataframe {name: 'DRUGCENTRAL'}), (DISGENET:Dataframe {name: 'DISGENET'}), (MONDO:Dataframe {name: 'MONDO'})")

# delete all nodes
# conn2.query("MATCH (a) DELETE (a)")

# delete all relationship
# Clears every `integrates_with` relationship so re-running the main algorithm
# cell does not accumulate duplicates. Being the last expression, its result
# is what the cell displays.
conn2.query("MATCH (a)<-[r:integrates_with]-(b) DELETE r")

[]

In [37]:
#based on https://github.com/merqurio/neo4jupyter
#before creating relationships
# Draws the current graph, labelling each `Dataframe` node with its `name`
# property. Connection settings can be overridden with the NEO4J_URI /
# NEO4J_USER / NEO4J_PASSWORD environment variables; the fallbacks are the
# local development values.

neo4jupyter.init_notebook_mode()
graph = Graph(
    os.environ.get("NEO4J_URI", "bolt://localhost:11003"),
    auth=(
        os.environ.get("NEO4J_USER", "autobiodatasets"),
        os.environ.get("NEO4J_PASSWORD", "autobiodatasets"),
    ),
)
neo4jupyter.draw(graph, {"Dataframe": "name"})

<IPython.core.display.Javascript object>

# Main Algorithm

In [38]:
%%time
# Silence NumPy's VisibleDeprecationWarning for this cell's work.
# The stdlib `warnings` module is used directly because the `np.warnings`
# alias is gone from NumPy >= 1.24, and the warning class is looked up in
# `np.exceptions` first because NumPy 2.0 no longer exposes it at top level.
visible_deprecation = getattr(getattr(np, "exceptions", None), "VisibleDeprecationWarning", None)
if visible_deprecation is None:
    visible_deprecation = np.VisibleDeprecationWarning  # older NumPy without np.exceptions
warnings.filterwarnings('ignore', category=visible_deprecation)

def changeListToString(value):
  """Flatten a list or dict cell into one space-separated string.

  Only objects whose type is exactly ``list`` or ``dict`` are converted; for a
  dict the iteration yields its keys. Every other value is returned untouched.
  """
  if type(value) not in (list, dict):
    return value
  return ' '.join(str(element) for element in value)

# ---------------------------------------------------------------------------
# Column-overlap discovery between datasets.
#
# For every pair of columns coming from two different datasets, four set
# similarity scores are computed on the columns' distinct values. When any
# score exceeds `similarity_threshold`, the pair is printed and an
# `integrates_with` relationship is created between the two `Dataframe` nodes
# in Neo4j, pointing from the column with fewer distinct values to the one
# with more.
# ---------------------------------------------------------------------------

# Label used for each dataset in the report and to match its Neo4j node.
labelled_datasets = {
    "UNIPROT": UNIPROT,
    "REACTOME": REACTOME,
    "DISGENET": DISGENET,
    "OMIM": OMIM,
    "DRUGCENTRAL": DRUGCENTRAL,
    "MONDO": MONDO,
    "DRUGBANK": DRUGBANK,
    "DRUGBANK_PROTEIN": DRUGBANK_PROTEIN,
    "IID": IID,
}
for dataset_label, dataset_frame in labelled_datasets.items():
    dataset_frame.name_ = dataset_label

unique_values_allow = 2          # columns with this many distinct values or fewer are ignored
compare_values_numeric = False   # when False, numeric columns are ignored
similarity_threshold = 0.4       # a pair is reported when ANY score is above this value
datasets = list(labelled_datasets.values())
ds_columns = []                  # "<dataset>.<column>" names already used as left-hand column
seen_columns = set()             # same content as ds_columns, for constant-time lookups

# The similarity objects are created once instead of once per column pair.
overlap_measure = sm.OverlapCoefficient()
jaccard_measure = sm.Jaccard()
dice_measure = sm.Dice()
# NOTE(review): in the recorded output Tversky equals Dice on every row, so the
# default TverskyIndex() arguments do not give a Tanimoto-style score —
# confirm which alpha/beta were intended.
tversky_measure = sm.TverskyIndex()

# values: set of cell values (None for 0/1 columns), distinct: non-null distinct count.
ColumnProfile = collections.namedtuple("ColumnProfile", ["values", "distinct", "is_binary"])
column_profiles = {}             # (dataset label, column position) -> ColumnProfile or None


def _cypher_string(value):
    """Return `value` as a double-quoted Cypher string literal with backslash and quote escaped."""
    escaped = str(value).replace("\\", "\\\\").replace('"', '\\"')
    return '"' + escaped + '"'


def _relationship(variable, column_label, towards_b):
    """Return the CREATE pattern for one directed `integrates_with` relationship between a and b."""
    body = "[" + variable + ":integrates_with {using:" + _cypher_string(column_label) + "}]"
    if towards_b:
        return "(a)-" + body + "->(b)"
    return "(a)<-" + body + "-(b)"


def _profile_column(ds, position):
    """Return the cached ColumnProfile of a column, or None when the column is ignored.

    A column is ignored when it is numeric (while numeric comparison is
    disabled) or has `unique_values_allow` or fewer distinct values. The
    profile is computed once per column instead of once per column pair.
    """
    key = (ds.name_, position)
    if key in column_profiles:
        return column_profiles[key]

    profile = None
    column = ds.iloc[:, position]
    if compare_values_numeric or not is_numeric_dtype(column):
        # list / dict cells are flattened to strings so they can be put in a set
        column = column.map(changeListToString)
        if len(column.unique()) > unique_values_allow:
            if column.isin([0, 1]).all():
                profile = ColumnProfile(None, 0, True)      # pure 0/1 flag column
            else:
                profile = ColumnProfile(set(column), column.nunique(), False)
    column_profiles[key] = profile
    return profile


def _report_and_link(left, left_column, left_profile, right, right_column, right_profile, scores):
    """Print one matching column pair and create the corresponding relationship(s) in Neo4j."""
    overlap, jaccard, dice, tversky = scores
    if left_profile.distinct < right_profile.distinct:
        arrow = "-->"
        pattern = _relationship("r", left_column, True)
    elif left_profile.distinct > right_profile.distinct:
        arrow = "<--"
        pattern = _relationship("r", right_column, False)
    else:
        # CREATE only accepts directed relationships, so an equal-cardinality
        # match is stored as one relationship in each direction.
        arrow = "<-->"
        pattern = (_relationship("r", left_column, True) + ", "
                   + _relationship("r2", right_column, False))

    print(left.name_, "[\"", left_column, "\"] " + arrow + " ", right.name_, "[\"", right_column,
          "\"] / Overlap: ", format(overlap, ".3f"), "/ Jaccard: ", format(jaccard, ".3f"),
          "/ Sørensen-Dice: ", format(dice, ".3f"), "/ Tversky: ", format(tversky, ".3f"))
    conn2.query("MATCH (a:Dataframe), (b:Dataframe) WHERE a.name = " + _cypher_string(left.name_)
                + " AND b.name = " + _cypher_string(right.name_)
                + " CREATE " + pattern + " RETURN type(r)")


# Drop all-empty columns once, up front. This is done in place so the frames
# keep their ad-hoc `name_` attribute (a new frame would presumably lose it).
for dataset_frame in datasets:
    dataset_frame.dropna(how='all', axis=1, inplace=True)

# Plain iteration: wrapping the list in np.ndenumerate would make NumPy coerce
# the DataFrames into an ndarray first.
for ds1 in datasets[:-1]:
    for position1 in range(len(ds1.columns)):
        column1_label = ds1.columns[position1]
        name_ds_column_1 = f"{ds1.name_}.{column1_label}"
        # Exact-name check: a substring test would also skip e.g. "Entry name"
        # once "Entry" has been processed.
        if name_ds_column_1 in seen_columns:
            continue
        profile1 = _profile_column(ds1, position1)
        if profile1 is None:
            continue

        ds_columns.append(name_ds_column_1)
        seen_columns.add(name_ds_column_1)
        if profile1.is_binary:
            continue

        for ds2 in datasets:
            if ds2.name_ == ds1.name_:
                continue

            for position2 in range(len(ds2.columns)):
                column2_label = ds2.columns[position2]
                name_ds_column_2 = f"{ds2.name_}.{column2_label}"
                # Columns already used as a left-hand column were compared
                # against this dataset before; do not report the pair twice.
                if name_ds_column_2 in seen_columns:
                    continue
                profile2 = _profile_column(ds2, position2)
                if profile2 is None or profile2.is_binary:
                    continue
                if min(profile1.distinct, profile2.distinct) == 0:
                    continue

                scores = (
                    overlap_measure.get_raw_score(profile1.values, profile2.values),   # Overlap
                    jaccard_measure.get_raw_score(profile1.values, profile2.values),   # Jaccard
                    dice_measure.get_raw_score(profile1.values, profile2.values),      # Sørensen/Dice
                    tversky_measure.get_raw_score(profile1.values, profile2.values),   # Tversky
                )
                if any(score > similarity_threshold for score in scores):
                    _report_and_link(ds1, column1_label, profile1,
                                     ds2, column2_label, profile2, scores)

UNIPROT [" Entry "] <--  REACTOME [" UNIPROT identifier "] / Overlap:  0.409 / Jaccard:  0.058 / Sørensen-Dice:  0.109 / Tversky:  0.109
UNIPROT [" Entry "] <--  DRUGCENTRAL [" ACCESSION "] / Overlap:  0.829 / Jaccard:  0.004 / Sørensen-Dice:  0.008 / Tversky:  0.008
UNIPROT [" Entry "] <--  DRUGBANK_PROTEIN [" uniprot_id "] / Overlap:  0.909 / Jaccard:  0.008 / Sørensen-Dice:  0.016 / Tversky:  0.016
UNIPROT [" Entry "] <--  IID [" uniprot1 "] / Overlap:  0.953 / Jaccard:  0.030 / Sørensen-Dice:  0.058 / Tversky:  0.058
UNIPROT [" Entry "] <--  IID [" uniprot2 "] / Overlap:  0.960 / Jaccard:  0.031 / Sørensen-Dice:  0.059 / Tversky:  0.059
UNIPROT [" Gene names "] <--  DISGENET [" geneSymbol "] / Overlap:  0.483 / Jaccard:  0.009 / Sørensen-Dice:  0.019 / Tversky:  0.019
UNIPROT [" Gene names "] <--  OMIM [" Approved_Symbol "] / Overlap:  0.466 / Jaccard:  0.015 / Sørensen-Dice:  0.029 / Tversky:  0.029
UNIPROT [" Gene names "] <--  DRUGCENTRAL [" GENE "] / Overlap:  0.486 / Jaccard: 

In [40]:
#based on https://github.com/merqurio/neo4jupyter
#after creating relationships
# Draws the graph again so the `integrates_with` relationships are visible,
# labelling each `Dataframe` node with its `name` property. Connection
# settings can be overridden with the NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD
# environment variables; the fallbacks are the local development values.

neo4jupyter.init_notebook_mode()
graph = Graph(
    os.environ.get("NEO4J_URI", "bolt://localhost:11003"),
    auth=(
        os.environ.get("NEO4J_USER", "autobiodatasets"),
        os.environ.get("NEO4J_PASSWORD", "autobiodatasets"),
    ),
)
neo4jupyter.draw(graph, {"Dataframe": "name"})

<IPython.core.display.Javascript object>