<h2>E: clinics

In [4]:
import pandas as pd
import icecream as ic
import numpy as np
import plotly
import matplotlib as plt
import xml.etree.ElementTree as ET
import pandas as pd
import psycopg2
import psycopg2.extras

In [1]:
chemical_list = ['Acetazolamide','Acetylcholine','Adenosine']

E1: therapeutic areas

In [66]:
# We collected ATC classification system codes from DrugBank and the Kyoto Encylopedia of Genes and Genomes. 
# To capture the ATC hierarchy, we annotated molecules with their full ATC code (level 5), plus all higher levels (4 to 1)

# drugbank_df = pd.read_csv("DB/drugbank_extracted1.csv")
def e1_func(chemical):
    output = drugbank_df[drugbank_df["dg_name"]==chemical]['atc_codes'].unique()
    output = output.tolist()[0]
    output = output.split(';')
    output = ' '.join(output)
    return output

E2: indications

In [107]:
# We fetched approved and phase I-IV drug indications from ChEMBL and RepoDB 78 (v.1, http://apps.chiragjpgroup.org/repoDB)
#chembl
E2_chemble=pd.read_csv('../E1_Chemble_data.csv.gz',compression='gzip')
E2_repoDB = pd.read_csv('../E2_repoDB.csv')

  E2_chemble=pd.read_csv('../E1_Chemble_data.csv.gz',compression='gzip')


In [69]:
def e2_func(chemical):
    output = E2_chemble[E2_chemble['compound_name']==chemical]
    output = output.values
    output = [item for sublist in output for item in sublist]
    output = [str(x) for x in output]
    output = ' '.join(output)
    
    output_repo = E2_repoDB[E2_repoDB['drug_name']==chemical]
    output_repo = output_repo.values
    output_repo = [item for sublist in output_repo for item in sublist]
    output_repo = [str(x) for x in output_repo]
    output_repo = ' '.join(output_repo)
    final_out = [output, output_repo]
    final_out = ' '.join(final_out)
    return final_out

E3: side effects

In [26]:
# side effects. We collected drug side effects from SIDER80 (v.4, http://sideeffects.embl.de), 
# expressed as Unified Medical Language System terms. 
# We did not consider frequency information since we and others have found it to be too scarce for comprehensive statistical analyses
E3_sider=pd.read_csv('../E3_meddra_all_label_se.tsv.gz', sep ='\t', compression='gzip', names=['compound_id', 'UMLS_concept_id', 'method_of_detection', 'concept_name', 'MedDRA_concept_type', 'UMLS_id_for_MedDRA_term', 'MedDRA_concept_name'])
E3_sider_name = pd.read_csv('../E3_sider_names.tsv', sep = '\t',names=['UMLS_concept_id', 'drug_name'])
E3_sider_atc = pd.read_csv('../E3_sider_atc.tsv', sep = '\t', names=['UMLS_concept_id', 'atc_codes'])

In [106]:
def e3_func(chemical):
    atc = drugbank_df[drugbank_df["dg_name"]==chemical]['atc_codes'].unique()[0].split(';')
    UMLS_concept_id = E3_sider_atc[E3_sider_atc['atc_codes'].isin(atc)]['UMLS_concept_id'].values[0]
    output = E3_sider[E3_sider['UMLS_concept_id']==UMLS_concept_id]
    output = output.values.tolist()
    output = [x for sublist in output for x in sublist]
    output = [str(x) for x in output]
    output = ' '.join(output)
    return output   
    

E4: disease phenotypes

In [78]:
# disease phenotypes. Associations between chemicals and disease phenotypes were downloaded from the Comparative Toxicogenomics Database (CTD)83 (http://ctdbase.org, July 2016). 
# We took only ‘curated’ CTD data. In CTD, compound-disease associations are classified as ‘therapeutic’ (T) or ‘marker/mechanism’ (M) (usually corresponding to a disease-causing effect). #!drop nan?
# T and M annotations were kept separately for each molecule. 
# CTD contains a medical vocabulary (MEDIC) that is essentially based on the MeSH hierarchy. 
# For each annotated disease, we added parent terms all the way to the root of the MEDIC hierarchy

E4_CTD = pd.read_csv('../E4_CTD_chemicals_diseases.csv')
E4_chemical_names = pd.read_csv('../E4_CTD_chemical_names.csv')
E4_disease_names = pd.read_csv('../E4_CTD_disease_names.csv')

In [83]:
E4_CTD = E4_CTD[~E4_CTD['DirectEvidence'].isna()]

In [102]:
def e4_func(chemical):
    chemical_id = E4_chemical_names[E4_chemical_names['ChemicalName']==chemical]['ChemicalID'].values[0].split(':')[1]
    output = E4_CTD[E4_CTD['ChemicalID']==chemical_id]
    output = output.values.tolist()
    output = [x for sublist in output for x in sublist]
    output = [str(x) for x in output]
    output = ' '.join(output)
    return output

In [98]:
# added parent terms all the way to the root of the MEDIC hierarchy? add all parent ID for diseases?
E4_disease_names

Unnamed: 0,DiseaseName,DiseaseID,AltDiseaseIDs,Definition,ParentIDs,TreeNumbers,ParentTreeNumbers,Synonyms,SlimMappings
0,10p Deletion Syndrome (Partial),MESH:C538288,,,MESH:D002872|MESH:D025063,C16.131.260/C538288|C16.320.180/C538288|C23.55...,C16.131.260|C16.320.180|C23.550.210.050.500.500,"Chromosome 10, 10p- Partial|Chromosome 10, mon...",Congenital abnormality|Genetic disease (inborn...
1,13q deletion syndrome,MESH:C535484,,,MESH:D002872|MESH:D025063,C16.131.260/C535484|C16.320.180/C535484|C23.55...,C16.131.260|C16.320.180|C23.550.210.050.500.500,Chromosome 13q deletion|Chromosome 13q deletio...,Congenital abnormality|Genetic disease (inborn...
2,15q24 Microdeletion,MESH:C579849,DO:DOID:0060395,,MESH:D002872|MESH:D008607|MESH:D025063,C10.597.606.360/C579849|C16.131.260/C579849|C1...,C10.597.606.360|C16.131.260|C16.320.180|C23.55...,15q24 Deletion|15q24 Microdeletion Syndrome|In...,Congenital abnormality|Genetic disease (inborn...
3,16p11.2 Deletion Syndrome,MESH:C579850,,,MESH:D001321|MESH:D002872|MESH:D008607|MESH:D0...,C10.597.606.360/C579850|C16.131.260/C579850|C1...,C10.597.606.360|C16.131.260|C16.320.180|C23.55...,,Congenital abnormality|Genetic disease (inborn...
4,"17,20-Lyase Deficiency, Isolated",MESH:C567076,,,MESH:D000312,C12.050.351.875.253.090.500/C567076|C12.200.70...,C12.050.351.875.253.090.500|C12.200.706.316.09...,"17-Alpha-Hydroxylase-17,20-Lyase Deficiency, C...",Congenital abnormality|Endocrine system diseas...
...,...,...,...,...,...,...,...,...,...
13185,Zunich neuroectodermal syndrome,MESH:C536729,DO:DOID:0112152|OMIM:280000,,MESH:D003103|MESH:D006314|MESH:D006330|MESH:D0...,C09.218.458.341.562/C536729|C10.562/C536729|C1...,C09.218.458.341.562|C10.562|C10.597.606.360|C1...,"CHIME|CHIME syndrome|COLOBOMA, CONGENITAL HEAR...",Cardiovascular disease|Congenital abnormality|...
13186,Zuska's Disease,MESH:C536730,,,MESH:D000038|MESH:D001941|MESH:D005402,C01.830.025/C536730|C17.800.090/C536730|C23.30...,C01.830.025|C17.800.090|C23.300.575|C23.550.47...,Lactation and squamous metaplasia of lactifero...,Pathology (anatomical condition)|Pathology (pr...
13187,Zygodactyly 1,MESH:C565223,OMIM:609815,,MESH:D013576,C05.116.099.370.894.819/C565223|C05.660.585.80...,C05.116.099.370.894.819|C05.660.585.800|C05.66...,ZD1,Congenital abnormality|Musculoskeletal disease
13188,Zygomatic Fractures,MESH:D015051,,Fractures of the zygoma.,MESH:D008446|MESH:D012887,C10.900.300.284.500.950|C26.404.750.959|C26.91...,C10.900.300.284.500|C26.404.750|C26.915.300.42...,"Fractures, Zygomatic|Fracture, Zygomatic|Zygom...",Nervous system disease|Wounds and injuries


E5: DDIs

In [104]:
def e5_func(chemical):
    output = drugbank_df[drugbank_df["dg_name"]==chemical]['dg_interactions'].unique()
    output = output.tolist()[0]
    output = output.split(';')
    output = " ".join(output)
    return output

# Running

In [5]:
chemical_list = ['Acetazolamide','Acetylcholine','Adenosine']

drugbank_df = pd.read_csv("DB/drugbank_extracted1.csv")
E2_chemble=pd.read_csv('../E1_Chemble_data.csv.gz',compression='gzip')
E2_repoDB = pd.read_csv('../E2_repoDB.csv')
E3_sider=pd.read_csv('../E3_meddra_all_label_se.tsv.gz', sep ='\t', compression='gzip', names=['compound_id', 'UMLS_concept_id', 'method_of_detection', 'concept_name', 'MedDRA_concept_type', 'UMLS_id_for_MedDRA_term', 'MedDRA_concept_name'])
E3_sider_name = pd.read_csv('../E3_sider_names.tsv', sep = '\t',names=['UMLS_concept_id', 'drug_name'])
E3_sider_atc = pd.read_csv('../E3_sider_atc.tsv', sep = '\t', names=['UMLS_concept_id', 'atc_codes'])
E4_CTD = pd.read_csv('../E4_CTD_chemicals_diseases.csv')
E4_CTD = E4_CTD[~E4_CTD['DirectEvidence'].isna()]
E4_chemical_names = pd.read_csv('../E4_CTD_chemical_names.csv')
E4_disease_names = pd.read_csv('../E4_CTD_disease_names.csv')


  E2_chemble=pd.read_csv('../E1_Chemble_data.csv.gz',compression='gzip')


In [107]:
E1_result = []
E2_result = []
E3_result = []
E4_result = []
E5_result = []

for x in chemical_list:

    e1 = e1_func(x)
    E1_result.append(e1)
    e2 = e2_func(x)
    E2_result.append(e2)
    e3 = e3_func(x)
    E3_result.append(e3)
    e4 = e4_func(x)
    E3_result.append(e4)
    e5 = e5_func(x)
    E3_result.append(e5)