In [19]:
import pandas as pd
import numpy as np
import os
import subprocess


# Directory for the datasets folder in which the program will run.
# The DATASETS_DIR environment variable, when set, overrides the default so
# the notebook can run on another machine without editing this cell.

directory = os.environ.get('DATASETS_DIR', '/Users/vir/Testing/datasets')

# Every relative path used below ('data/...', 'processing_scripts') is
# resolved against this directory.
os.chdir(directory)

# Dictionary of urls and filepaths for data.
# Keys are download URLs and values are the local destinations. Entries that
# bundle several files hold comma-separated URLs and comma-separated paths in
# matching order. Insertion order matters: the driver calls further down
# address entries by their position in data_keys.

data_dict = {
    "https://www.genenames.org/cgi-bin/download/custom?col=gd_app_sym&col=gd_app_name&col=gd_pub_acc_ids&col=gd_pub_refseq_ids&col=gd_pub_eg_id&col=md_eg_id&col=md_prot_id&col=md_mim_id&status=Approved&hgnc_dbtag=on&order_by=gd_app_sym_sort&format=text&submit=submit": 'data/vocab/gene_names.csv',
    'https://www.bgee.org/ftp/current/download/calls/expr_calls/Homo_sapiens_expr_advanced.tsv.gz': 'data/bgee/Homo_sapiens_expr_advanced.tsv.gz',
    'https://ctdbase.org/reports/CTD_exposure_events.csv.gz': 'data/ctd/CTD_exposure_events.csv.gz',
    'https://www.disgenet.org/static/disgenet_ap1/files/downloads/curated_gene_disease_associations.tsv.gz': 'data/disgenet/curated_gene_disease_associations.tsv.gz',
    'https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz': 'data/ncbigene/gene2go.gz',
    'http://purl.obolibrary.org/obo/go/go-basic.obo': 'data/go/go-basic.obo',
    'http://purl.obolibrary.org/obo/hp.obo': 'data/hpo/hp.obo',
    'http://purl.obolibrary.org/obo/hp/hpoa/phenotype.hpoa': 'data/hpo/phenotype.hpoa',
    'http://purl.obolibrary.org/obo/MONDO.obo': 'data/mondo/mondo.obo',
    'https://reactome.org/download/current/ReactomePathways.txt,https://reactome.org/download/current/ReactomePathwaysRelation.txt,https://reactome.org/download/current/NCBI2Reactome.txt': 'data/reactome/ReactomePathways.txt,data/reactome/ReactomePathwaysRelation.txt,data/reactome/NCBI2Reactome.txt',
    'http://sideeffects.embl.de/media/download/meddra_all_se.tsv.gz,http://sideeffects.embl.de/media/download/drug_atc.tsv': 'data/sider/meddra_all_se.tsv.gz,data/sider/drug_atc.tsv',
    'http://purl.obolibrary.org/obo/uberon/ext.obo': 'data/uberon/ext.obo',
    'https://unmtid-shinyapps.net/download/drugcentral.dump.05102023.sql.gz': 'data/drugcentral/drugcentral.dump.05102023.sql.gz',
}
data_keys = list(data_dict.keys())



In [26]:
def initialize(directory):
    """Switch into ``directory`` and create the ``data/`` folder tree.

    The call is safe to repeat: folders that already exist are left alone.

    Args:
        directory: Root folder of the project. A warning is printed when the
            path does not contain the text ``datasets``.

    Raises:
        OSError: If ``directory`` cannot be entered (raised by ``os.chdir``).
    """
    os.chdir(directory)
    if 'datasets' not in directory:
        print('Make sure you have put in the proper directory for the program to run.')

    # One sub-folder per data source; makedirs also creates 'data' itself and,
    # unlike shelling out to mkdir, does not complain on a re-run.
    source_folders = (
        'bgee', 'ctd', 'disgenet', 'drugbank', 'vocab', 'drugcentral',
        'ncbigene', 'go', 'hpo', 'mondo', 'reactome', 'sider', 'uberon', 'umls',
    )
    for folder in source_folders:
        os.makedirs(os.path.join('data', folder), exist_ok=True)

def gene_names(url, filepath):
    """Download the gene-name table at ``url`` and save it to ``filepath``.

    The URL is handed to curl as a single argument without going through a
    shell. Its query string contains ``&`` and ``?``; inside a shell command
    line those characters split the command, so the request was truncated and
    the ``-o`` option never reached curl.

    Args:
        url: Address of the file to fetch.
        filepath: Destination path, relative to the current working directory.
    """
    subprocess.run(['curl', url, '-o', filepath])


def bgee(url, filepath):
    """Download the Bgee archive, decompress it and run ``bgee.py``.

    Progress is reported with printed messages only; nothing is returned.

    Args:
        url: Address of the gzip-compressed file.
        filepath: Destination of the download (the ``.gz`` path), relative to
            the current working directory.
    """
    directory = os.getcwd()

    # Downloading data
    subprocess.run(['curl', url, '-o', filepath])

    # Validating data size
    # gunzip replaces the archive with the decompressed file, so the archive
    # has to be measured before decompressing; afterwards `filepath` is gone.
    # NOTE(review): assumes the threshold refers to the compressed download - confirm.
    if not os.path.exists(filepath) or os.path.getsize(filepath) < 46804992:
        print(f'Warning: {filepath} file may have not downloaded fully')

    subprocess.run(['gunzip', filepath])

    # Running processing script from its own folder; the notebook's working
    # directory is left untouched.
    subprocess.run(['python', 'bgee.py'], cwd=directory + '/processing_scripts')

    # Validating processing script output
    try:
        file_size = os.path.getsize('data/bgee/anatomy_gene.csv')
        print(f"File Size of data/bgee/anatomy_gene.csv in Bytes is {file_size}.")
    except FileNotFoundError:
        print("File not found. Check bgee.py to make sure data is being appropriatly processed and saved.")


def ctd(url, filepath):
    """Download the CTD archive, decompress it and run ``ctd.py``.

    Progress is reported with printed messages only; nothing is returned.

    Args:
        url: Address of the gzip-compressed file.
        filepath: Destination of the download (the ``.gz`` path), relative to
            the current working directory.
    """
    directory = os.getcwd()

    # Downloading data
    subprocess.run(['curl', url, '-o', filepath])

    # Validating data size
    # The archive is measured before gunzip runs, because gunzip removes the
    # ``.gz`` file and a later size lookup on `filepath` would fail.
    # NOTE(review): assumes the threshold refers to the compressed download - confirm.
    if not os.path.exists(filepath) or os.path.getsize(filepath) < 2085711:
        print(f'Warning: {filepath} file may have not downloaded fully')

    subprocess.run(['gunzip', filepath])

    # Running processing script from its own folder
    subprocess.run(['python', 'ctd.py'], cwd=directory + '/processing_scripts')

    # Validating processing script output
    try:
        file_size = os.path.getsize('data/ctd/CTD_exposure_events.csv')
        print(f"File Size of data/ctd/CTD_exposure_events.csv in Bytes is {file_size}.")
    except FileNotFoundError:
        print("File not found. Check ctd.py to make sure data is being appropriatly processed and saved.")



def disgennet(url, filepath):
    """Download the DisGeNET archive and decompress it in place.

    Args:
        url: Address of the gzip-compressed file.
        filepath: Destination of the download (the ``.gz`` path), relative to
            the current working directory.
    """
    subprocess.run(['curl', url, '-o', filepath])

    # Validating data size
    # The check runs before gunzip because gunzip removes the ``.gz`` file.
    # TODO: the 5000-byte minimum was flagged by the original author as a
    # number that still needs checking.
    if not os.path.exists(filepath) or os.path.getsize(filepath) < 5000:
        print(f'Warning: {filepath} file may have not downloaded fully')

    subprocess.run(['gunzip', filepath])


def entrez_gene(url, filepath):
    """Download the NCBI gene2go archive, decompress it and run ``ncbigene.py``.

    Progress is reported with printed messages only; nothing is returned.

    Args:
        url: Address of the gzip-compressed file.
        filepath: Destination of the download (the ``.gz`` path), relative to
            the current working directory.
    """
    directory = os.getcwd()

    # Downloading data
    subprocess.run(['curl', url, '-o', filepath])

    # Validating data size
    # Measured before gunzip, which deletes the ``.gz`` file once it has
    # produced the decompressed copy.
    # NOTE(review): assumes the threshold refers to the compressed download - confirm.
    if not os.path.exists(filepath) or os.path.getsize(filepath) < 373391360:
        print(f'Warning: {filepath} file may have not downloaded fully')

    subprocess.run(['gunzip', filepath])

    # Running processing script from its own folder
    subprocess.run(['python', 'ncbigene.py'], cwd=directory + '/processing_scripts')

    # Validating processing script output
    try:
        file_size = os.path.getsize('data/ncbigene/protein_go_associations.csv')
        print(f"File Size of data/ncbigene/protein_go_associations.csv in Bytes is {file_size}.")
    except FileNotFoundError:
        print("File not found. Check ncbigene.py to make sure data is being appropriatly processed and saved.")


def gene_ontology(url, filepath):
    """Download the Gene Ontology file and run ``go.py``.

    Progress is reported with printed messages only; nothing is returned.

    Args:
        url: Address of the file; redirects are followed (``curl -L``).
        filepath: Destination of the download, relative to the current
            working directory.
    """
    directory = os.getcwd()

    # Downloading data
    subprocess.run(['curl', '-L', url, '-o', filepath])

    # Validating data size
    # A download that produced no file at all gets the same warning instead
    # of an uncaught FileNotFoundError from the size lookup.
    if not os.path.exists(filepath) or os.path.getsize(filepath) < 15619420:
        print(f'Warning: {filepath} file may have not downloaded fully')

    # Running processing script from its own folder
    subprocess.run(['python', 'go.py'], cwd=directory + '/processing_scripts')

    # Validating processing script output
    try:
        file_size = os.path.getsize('data/go/go_terms_relations.csv')
        print(f"File Size of data/go/go_terms_relations.csv in Bytes is {file_size}.")
    except FileNotFoundError:
        print("File not found. Check go.py to make sure data is being appropriatly processed and saved.")

def hpo(url, filepath):
    """Download the HPO ontology file and run ``hpo.py``.

    Progress is reported with printed messages only; nothing is returned.

    Args:
        url: Address of the file; redirects are followed (``curl -L``).
        filepath: Destination of the download, relative to the current
            working directory.
    """
    directory = os.getcwd()

    # Downloading data
    subprocess.run(['curl', '-L', url, '-o', filepath])

    # Validating data size
    # A missing file is reported with the same warning rather than letting
    # the size lookup raise.
    if not os.path.exists(filepath) or os.path.getsize(filepath) < 4907091:
        print(f'Warning: {filepath} file may have not downloaded fully')

    # Running processing script from its own folder
    subprocess.run(['python', 'hpo.py'], cwd=directory + '/processing_scripts')

    # Validating processing script output
    try:
        file_size = os.path.getsize('data/hpo/hp_terms.csv')
        print(f"File Size of data/hpo/hp_terms.csv in Bytes is {file_size}.")
    except FileNotFoundError:
        print("File not found. Check hpo.py to make sure data is being appropriatly processed and saved.")

def hpoa(url, filepath):
    """Download the HPO annotation file and run ``hpoa.py``.

    Progress is reported with printed messages only; nothing is returned.

    Args:
        url: Address of the file; redirects are followed (``curl -L``).
        filepath: Destination of the download, relative to the current
            working directory.
    """
    directory = os.getcwd()

    # Downloading data
    subprocess.run(['curl', '-L', url, '-o', filepath])

    # Validating data size
    # A missing file is reported with the same warning rather than letting
    # the size lookup raise.
    if not os.path.exists(filepath) or os.path.getsize(filepath) < 16406957:
        print(f'Warning: {filepath} file may have not downloaded fully')

    # Running processing script from its own folder
    subprocess.run(['python', 'hpoa.py'], cwd=directory + '/processing_scripts')

    # Validating processing script output
    try:
        file_size = os.path.getsize('data/hpo/disease_phenotype_pos.csv')
        print(f"File Size of data/hpo/disease_phenotype_pos.csv in Bytes is {file_size}.")
    except FileNotFoundError:
        print("File not found. Check hpoa.py to make sure data is being appropriatly processed and saved.")

def mondo(url, filepath):
    """Download the MONDO ontology file and run ``mondo.py``.

    Progress is reported with printed messages only; nothing is returned.

    Args:
        url: Address of the file; redirects are followed (``curl -L``).
        filepath: Destination of the download, relative to the current
            working directory.
    """
    directory = os.getcwd()

    # Downloading data
    subprocess.run(['curl', '-L', url, '-o', filepath])

    # Validating data size
    # A missing file is reported with the same warning rather than letting
    # the size lookup raise.
    if not os.path.exists(filepath) or os.path.getsize(filepath) < 23691202:
        print(f'Warning: {filepath} file may have not downloaded fully')

    # Running processing script from its own folder
    subprocess.run(['python', 'mondo.py'], cwd=directory + '/processing_scripts')

    # Validating processing script output: sizes are reported in this order
    # and reporting stops at the first file that is missing.
    expected_outputs = (
        'data/mondo/mondo_terms.csv',
        'data/mondo/mondo_parents.csv',
        'data/mondo/mondo_references.csv',
        'data/mondo/mondo_subsets.csv',
        'data/mondo/mondo_definitions.csv',
    )
    try:
        for output_path in expected_outputs:
            file_size = os.path.getsize(output_path)
            print(f"File Size of {output_path} in Bytes is {file_size}.")
    except FileNotFoundError:
        print("File not found. Check mondo.py to make sure data is being appropriatly processed and saved.")


def reactome(url, filepath):
    """Download the three Reactome files and run ``reactome.py``.

    Progress is reported with printed messages only; nothing is returned.

    Args:
        url: Comma-separated addresses of the files to fetch.
        filepath: Comma-separated destinations, in the same order as ``url``
            and relative to the current working directory.
    """
    directory = os.getcwd()
    urls = url.split(',')
    filepaths = filepath.split(',')

    # Downloading data
    for source, target in zip(urls, filepaths):
        subprocess.run(['curl', source, '-o', target])

    # Validating data size
    # Minimum sizes are positional and pair with the first three files. Each
    # file is checked and reported on its own, so the warning names the file
    # that looks short and a missing file does not raise.
    minimum_sizes = (749101, 300461, 14762377)
    for target, minimum in zip(filepaths, minimum_sizes):
        if not os.path.exists(target) or os.path.getsize(target) < minimum:
            print(f'Warning: {target} file may have not downloaded fully')

    # Running processing script from its own folder
    subprocess.run(['python', 'reactome.py'], cwd=directory + '/processing_scripts')

    # Validating processing script output: reporting stops at the first
    # missing file.
    expected_outputs = (
        'data/reactome/reactome_ncbi.csv',
        'data/reactome/reactome_relations.csv',
        'data/reactome/reactome_terms.csv',
    )
    try:
        for output_path in expected_outputs:
            file_size = os.path.getsize(output_path)
            print(f"File Size of {output_path} in Bytes is {file_size}.")
    except FileNotFoundError:
        print("File not found. Check reactome.py to make sure data is being appropriatly processed and saved.")

def sider(url, filepath):
    """Download the two SIDER files and run ``sider.py``.

    The first file is gzip-compressed and is decompressed in place; the
    second is stored as downloaded. Progress is reported with printed
    messages only; nothing is returned.

    Args:
        url: Comma-separated addresses (compressed file first).
        filepath: Comma-separated destinations, in the same order as ``url``
            and relative to the current working directory.
    """
    directory = os.getcwd()
    urls = url.split(',')
    filepaths = filepath.split(',')

    # Downloading data
    subprocess.run(['curl', urls[0], '-o', filepaths[0]])
    subprocess.run(['gunzip', filepaths[0]])
    subprocess.run(['curl', urls[1], '-o', filepaths[1]])

    # Validating data size
    # Only the second file has a size threshold; the warning names that file
    # and a missing file does not raise.
    if not os.path.exists(filepaths[1]) or os.path.getsize(filepaths[1]) < 16380:
        print(f'Warning: {filepaths[1]} file may have not downloaded fully')

    # Running processing script from its own folder
    subprocess.run(['python', 'sider.py'], cwd=directory + '/processing_scripts')

    # Validating processing script output
    try:
        file_size = os.path.getsize('data/sider/sider.csv')
        print(f"File Size of data/sider/sider.csv in Bytes is {file_size}.")
    except FileNotFoundError:
        print("File not found. Check sider.py to make sure data is being appropriatly processed and saved.")


def uberon(url, filepath):
    """Download the Uberon ontology file and run ``uberon.py``.

    Progress is reported with printed messages only; nothing is returned.

    Args:
        url: Address of the file; redirects are followed (``curl -L``).
        filepath: Destination of the download, relative to the current
            working directory.
    """
    directory = os.getcwd()

    # Downloading data
    subprocess.run(['curl', '-L', url, '-o', filepath])

    # Validating data size
    # A missing file is reported with the same warning rather than letting
    # the size lookup raise.
    if not os.path.exists(filepath) or os.path.getsize(filepath) < 171:
        print(f'Warning: {filepath} file may have not downloaded fully')

    # Running processing script from its own folder
    subprocess.run(['python', 'uberon.py'], cwd=directory + '/processing_scripts')

    # Validating processing script output: reporting stops at the first
    # missing file.
    expected_outputs = (
        'data/uberon/uberon_is_a.csv',
        'data/uberon/uberon_rels.csv',
        'data/uberon/uberon_terms.csv',
    )
    try:
        for output_path in expected_outputs:
            file_size = os.path.getsize(output_path)
            print(f"File Size of {output_path} in Bytes is {file_size}.")
    except FileNotFoundError:
        print("File not found. Check uberon.py to make sure data is being appropriatly processed and saved.")

def drug_central(url, filepath):
    """Download the DrugCentral SQL dump, load it into PostgreSQL and export drug-disease rows.

    A fresh database cluster is created under ``data/drugcentral/db``, the
    dump is loaded into a database named ``drugcentral``, and the query
    result is written to ``drug_disease.csv`` in the current working
    directory. The server is stopped again even if a step raises.

    Args:
        url: Address of the gzip-compressed SQL dump.
        filepath: Destination of the download (the ``.gz`` path), relative to
            the current working directory.
    """
    subprocess.run(['curl', url, '-o', filepath])
    subprocess.run(['gunzip', filepath])

    # gunzip leaves the dump next to the archive, minus the '.gz' suffix.
    # Deriving the path from `filepath` replaces a path that was hard-coded
    # to one user's home directory.
    sql_path = filepath[:-3] if filepath.endswith('.gz') else filepath

    cluster_dir = 'data/drugcentral/db'
    subprocess.run(['rm', '-rf', cluster_dir])
    subprocess.run(['initdb', '-D', cluster_dir])
    subprocess.run(['pg_ctl', '-D', cluster_dir, '-l', 'logfile', 'start'])

    query = (
        "SELECT DISTINCT * FROM structures RIGHT JOIN "
        "(SELECT * FROM omop_relationship WHERE relationship_name IN "
        "('indication', 'contraindication', 'off-label use')) AS drug_disease "
        "ON structures.id = drug_disease.struct_id;"
    )
    try:
        subprocess.run(['createdb', 'drugcentral'])
        # -f makes psql read the dump from the file itself.
        subprocess.run(['psql', 'drugcentral', '-f', sql_path])
        subprocess.run(['psql', '-d', 'drugcentral', '-c', query,
                        '-P', 'format=csv', '-o', 'drug_disease.csv'])
    finally:
        subprocess.run(['pg_ctl', '-D', cluster_dir, 'stop'])

def drugbank():
    """Unpack the manually downloaded DrugBank archives and run the DrugBank processing scripts.

    Drugbank files will need to be downloaded manually as Drugbank requires
    authentication before downloading data. Please download the Drugbank
    Complete Database, all carrier polypeptides, all enzyme polypeptides,
    target polypeptides, transporter polypeptides and all vocabulary to the
    data/drugbank folder.

    An archive is deleted only after ``unzip`` reports success, so a failed
    extraction does not cost the download. Relies on the module-level
    ``directory`` and on the working directory being that folder.
    """
    def _extract(archive, target):
        # Unpack `archive` into `target`; remove the archive only on success.
        result = subprocess.run(['unzip', archive, '-d', target])
        if result.returncode == 0:
            os.remove(archive)

    # The full database archive was unzipped under one file name and removed
    # under another; accept whichever of the two is actually present. If
    # neither exists, unzip is still run on the first name so it reports the
    # missing file.
    full_db_names = (
        'data/drugbank/full_database.xml.zip',
        'data/drugbank/drugbank_all_full_database.xml.zip',
    )
    present = [name for name in full_db_names if os.path.exists(name)]
    for archive in present or full_db_names[:1]:
        _extract(archive, 'data/drugbank')

    # Each polypeptide archive is unpacked into a folder named after the
    # archive without its '.zip' suffix.
    for kind in ('carrier', 'enzyme', 'target', 'transporter'):
        stem = f'data/drugbank/drugbank_all_{kind}_polypeptide_ids.csv'
        _extract(stem + '.zip', stem)

    os.chdir(directory + '/processing_scripts')
    try:
        subprocess.run(['python', 'drug_drug.py'])
        subprocess.run(['python', 'drug_protein.py'])
    finally:
        # Always return to the project root, even if a script cannot be launched.
        os.chdir(directory)


def UMLS():
    """Run the UMLS processing scripts from the ``processing_scripts`` folder.

    UMLS files will need to be downloaded manually as UMLS requires
    authentication before downloading data. Please download the MRCONSO.RRF
    file to the data/umls folder.

    Uses the module-level ``directory``; the working directory is set back to
    it once both scripts have been launched.
    """
    scripts_folder = directory + '/processing_scripts'
    os.chdir(scripts_folder)

    # Launched one after the other, in this order.
    for command in ('python umls.py', 'python map_umls_mondo.py'):
        os.system(command)

    os.chdir(directory)


# initialize(directory)
# gene_names(data_keys[0], data_dict.get(data_keys[0]))
# bgee(data_keys[1], data_dict[data_keys[1]])
# ctd(data_keys[2], data_dict[data_keys[2]])
# disgennet(data_keys[3], data_dict[data_keys[3]])
# entrez_gene(data_keys[4], data_dict[data_keys[4]])
# gene_ontology(data_keys[5], data_dict[data_keys[5]])
# hpo(data_keys[6], data_dict[data_keys[6]])
# hpoa(data_keys[7], data_dict[data_keys[7]])
# mondo(data_keys[8], data_dict[data_keys[8]])
# reactome(data_keys[9], data_dict[data_keys[9]])
# sider(data_keys[10], data_dict[data_keys[10]])
# uberon(data_keys[11], data_dict[data_keys[11]])
# drug_central(data_keys[12], data_dict[data_keys[12]])

# run build_kg

# get duplicate edges and see the ratio of duplicate edges to total edges
# get total nodes and see ratio of total nodes with total edges


# kg = pd.read_csv('/data/kg/auxillary/kg_raw.csv')
# nodes = pd.read_csv('data/kg/auxillary/nodes.csv')
# edges = pd.read_csv('data/kg/auxillary/edges.csv')



def duplicate_edges(edges, threshold=3000):
    """Count the distinct (x_index, y_index) pairs that occur more than once.

    Pairs are grouped on the two columns directly. Joining the two values
    into one string with ``_`` could merge different pairs whose values
    themselves contain underscores (``'a_b', 'c'`` vs ``'a', 'b_c'``).
    Rows with a missing value in either column are not counted.

    Args:
        edges: DataFrame with ``x_index`` and ``y_index`` columns.
        threshold: A warning is printed when the count exceeds this value.

    Returns:
        int: Number of distinct pairs that appear in more than one row.
    """
    pair_counts = edges.groupby(['x_index', 'y_index'], sort=False).size()
    num_duplicate_edges = int((pair_counts > 1).sum())

    if num_duplicate_edges > threshold:
        print(f'Warning: there exist more than {threshold} duplicate edges.')

    return num_duplicate_edges
    



    


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   343    0   343    0     0   2074      0 --:--:-- --:--:-- --:--:--  2241
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 20.8M  100 20.8M    0     0  12.0M      0  0:00:01  0:00:01 --:--:-- 21.8M


File Size of data/uberon/uberon_is_a.csv in Bytes is 166711.
File Size of data/uberon/uberon_rels.csv in Bytes is 212438.
File Size of data/uberon/uberon_terms.csv in Bytes is 2588863.
