In [99]:
import pandas as pd
import numpy as np
import os
from collections import OrderedDict
import pubchempy as pcp
import json

In [102]:
def save_dict_file(dictvar, file):
    """Serialize ``dictvar`` to JSON and write it to the path ``file``.

    The target is opened in ``'w'`` mode, so existing content is replaced.
    """
    with open(file, 'w') as handle:
        payload = json.dumps(dictvar)
        handle.write(payload)

In [104]:
# drug names of interest for COVID-19, one per line for easier diffing
covid19_drugs = [
    'sapanisertib',
    'rapamycin',
    'zotatifin',
    'verdinexor',
    'chloroquine',
    'dabrafenib',
    'sanglifehrin a',
    'fk-506',
    'pevonedistat',
    'tomivosertib',
    'captopril',
    'lisinopril',
    'camostat',
    'nafamostat',
    'chloramphenicol',
    'tigecycline',
    'linezolid',
]

In [144]:
def update_node_file(new_nodes, node_file='dataset/diseases.nodes'):
    """Merge ``new_nodes`` into the node list stored in ``node_file``.

    The file holds one node per line without a header. Existing entries are
    read back, combined with ``new_nodes``, de-duplicated, sorted, and the
    result is written to the same path.

    Parameters
    ----------
    new_nodes : iterable
        Node identifiers to add (any iterable of hashable, mutually
        comparable values).
    node_file : str
        Path of the node file. A missing or zero-byte file is treated as an
        empty node list, so the first call creates the file.
    """
    # os.path.getsize raises on a missing path, so check existence first
    if os.path.exists(node_file) and os.path.getsize(node_file) > 0:
        existing = pd.read_table(node_file, header=None).iloc[:, 0].tolist()
    else:
        existing = []
    # union removes duplicates; sorted gives a stable on-disk order
    merged = sorted(set(existing).union(new_nodes))
    # overwrite the file with the merged node list
    pd.DataFrame({'nodes': merged}).to_csv(node_file, header=False, index=False)

In [259]:
def store_df(df, file):
    """Write ``df`` to ``file`` as CSV, with a header row and without the index."""
    csv_options = {'header': True, 'index': False}
    df.to_csv(file, **csv_options)

In [3]:
# dataset paths
# drug central
dc_path = 'raw-dataset/drugcentral-drug_indications.tsv'
# stitch
st_path = 'raw-dataset/stitch-9606.actions.tsv'

## Drug Central

In [5]:
# grab drugs and diseases from drug central
dc = pd.read_table(dc_path)
dc.head()

Unnamed: 0,DRUG_ID,DRUG_NAME,INDICATION_FDB,UMLS_CUI,SNOMEDCT_CUI,DOID
0,965,drostanolone propionate,Malignant tumor of breast,C0006142,254837009.0,DOID:1612
1,318,benzbromarone,Gout,C0018099,90560007.0,DOID:13189
2,318,benzbromarone,Hyperuricemia,C0740394,35885006.0,DOID:1920
3,1031,epitizide,Hypertensive disorder,C0020538,38341003.0,DOID:10763
4,3578,tafenoquine,Malaria,C0024530,61462000.0,DOID:12365


In [42]:
dc['INDICATION_FDB'][dc.isna()['UMLS_CUI']]

747                            Systemic Dermatomyositis
761                               Otitis Externa Eczema
766       Osteoarthritis in Patients at High Ulcer Risk
772      Post-Op Gynecological Infection due to E. Coli
781                   Bronchospasm Prevention with COPD
                              ...                      
10352        Maintenance of Healing Erosive Esophagitis
10355                   Duodenal Ulcer due to H. Pylori
10356                 Gastric Hypersecretory Conditions
10357     Pathological Gastric Hypersecretory Condition
10363                  Myocardial Infarction Prevention
Name: INDICATION_FDB, Length: 2489, dtype: object

### Remove NAs

In [45]:
# remove dc rows with invalid UMLS_CUI
dc = dc.dropna(subset=['UMLS_CUI'])

In [47]:
print('There are in total %d rows of drug-disease edges (valid UMLS CUI) in drug central' % (dc.shape[0]))

There are in total 8469 rows of drug-disease edges (valid UMLS CUI) in drug central


### Drugs

In [48]:
dc_drugs = dc['DRUG_NAME'].tolist()

In [78]:
def fetch_drug_compounds(drugs):
    """Look up every distinct drug name with ``pcp.get_compounds``.

    Returns a dict mapping each distinct name in ``drugs`` to the value
    returned by ``pcp.get_compounds(name, 'name')``.
    """
    unique_names = list(set(drugs))
    return {name: pcp.get_compounds(name, 'name') for name in unique_names}

In [79]:
drugs_compounds = fetch_drug_compounds(dc_drugs)

In [96]:
def clean_drug_compounds(drugs):
    """Reduce each drug's compound objects to their ``cid`` values.

    ``drugs`` maps a drug name to a sequence of objects exposing ``.cid``.
    Drugs whose sequence is empty are left out of the result.
    """
    return {
        name: [hit.cid for hit in hits]
        for name, hits in drugs.items()
        if len(hits) != 0
    }

Some drug names are synonyms that share the same compound ID (first column), specifically:

```
46853873 cyanocobalamin cobalamin      --> cobalamin
12560 benzamycin erythromycin          --> erythromycin
4594 esomeprazole omeprazole           --> omeprazole
70683024 lipegfilgrastim pegfilgrastim --> pegfilgrastim
9833444 pancreozymin sincalide         --> sincalide
772 bemiparin heparin                  --> heparin
5311507 clopenthixol zuclopenthixol    --> clopenthixol
3083544 arformoterol formoterol        --> formoterol
```

In [116]:
cleaned_drugs_compounds = clean_drug_compounds(drugs_compounds)
save_dict_file(cleaned_drugs_compounds, 'dataset/drugs.compounds')

In [233]:
def reverse_drugs_compounds(drugs):
    """Invert a drug -> compound-ids mapping.

    Returns a tuple ``(by_compound, multi)``:

    * ``by_compound`` maps each compound id to the drug name that lists it;
      when several drugs list the same id, their names are joined with
      ``', '`` in iteration order.
    * ``multi`` holds the entries of ``drugs`` that have more than one
      compound id.
    """
    by_compound = {}
    multi = {}
    for name, cids in drugs.items():
        if len(cids) > 1:
            multi[name] = cids
        for cid in cids:
            # append to the existing name(s) when the id was already seen
            by_compound[cid] = by_compound[cid] + ', ' + name if cid in by_compound else name
    return by_compound, multi

In [234]:
compounds_drugs, duplicates_drug_compounds = reverse_drugs_compounds(cleaned_drugs_compounds)

### Diseases

In [151]:
dc = dc[dc['DRUG_NAME'].isin(cleaned_drugs_compounds.keys())]
dc.head()

Unnamed: 0,DRUG_ID,DRUG_NAME,INDICATION_FDB,UMLS_CUI,SNOMEDCT_CUI,DOID
0,965,drostanolone propionate,Malignant tumor of breast,C0006142,254837009.0,DOID:1612
1,318,benzbromarone,Gout,C0018099,90560007.0,DOID:13189
2,318,benzbromarone,Hyperuricemia,C0740394,35885006.0,DOID:1920
3,1031,epitizide,Hypertensive disorder,C0020538,38341003.0,DOID:10763
4,3578,tafenoquine,Malaria,C0024530,61462000.0,DOID:12365


In [143]:
# filter diseases by the available drugs
dc_diseases = dc[dc['DRUG_NAME'].isin(cleaned_drugs_compounds.keys())]['UMLS_CUI'].tolist()

In [145]:
update_node_file(dc_diseases)

## STITCH

In [214]:
st = pd.read_table(st_path)
st.head()

Unnamed: 0,item_id_a,item_id_b,mode,action,a_is_acting,score
0,9606.ENSP00000170630,CIDm00010461,expression,,f,150
1,CIDm00010461,9606.ENSP00000170630,expression,,t,150
2,9606.ENSP00000353915,CIDs23627457,binding,,f,191
3,CIDs23627457,9606.ENSP00000353915,binding,,f,191
4,9606.ENSP00000256906,CIDs44408029,binding,,f,521


In [215]:
st.shape

(21773491, 6)

In [216]:
# keep only the rows whose mode is exactly one of the modes of interest;
# isin does an exact membership test (missing modes are dropped), whereas a
# regex built from the joined names would also match substrings
modes = ['activation', 'binding', 'catalysis', 'inhibition', 'reaction']
st = st[st['mode'].isin(modes)]
st.shape

(18745253, 6)

In [217]:
# filter s.t. item_a contains compounds: compound ids start with 'CID'
# (a later cell strips a fixed 4-character prefix from them), so anchor the
# match at the start; na=False drops rows with a missing item_id_a
st = st[st['item_id_a'].str.startswith('CID', na=False)]

In [218]:
st.shape

(9458945, 6)

In [219]:
st.head()

Unnamed: 0,item_id_a,item_id_b,mode,action,a_is_acting,score
3,CIDs23627457,9606.ENSP00000353915,binding,,f,191
5,CIDs44408029,9606.ENSP00000256906,binding,,f,521
9,CIDs23590374,9606.ENSP00000267377,binding,,f,159
13,CIDs73351473,9606.ENSP00000295589,binding,,f,407
17,CIDs10666045,9606.ENSP00000336630,binding,,f,209


### Filter compounds

In [220]:
st_compounds = [int(compound[4:]) for compound in st['item_id_a'].tolist()]

In [221]:
# st is a filtered slice of the original frame; assign returns a new frame
# with the extra column instead of writing into the slice, which avoids
# pandas' SettingWithCopyWarning
st = st.assign(compound_id=st_compounds)

In [222]:
st.head()

Unnamed: 0,item_id_a,item_id_b,mode,action,a_is_acting,score,compound_id
3,CIDs23627457,9606.ENSP00000353915,binding,,f,191,23627457
5,CIDs44408029,9606.ENSP00000256906,binding,,f,521,44408029
9,CIDs23590374,9606.ENSP00000267377,binding,,f,159,23590374
13,CIDs73351473,9606.ENSP00000295589,binding,,f,407,73351473
17,CIDs10666045,9606.ENSP00000336630,binding,,f,209,10666045


In [223]:
# filter by the compounds found in drug central
st = st[st['compound_id'].isin(compounds_drugs.keys())]

In [173]:
genes = [gene[5:] for gene in st['item_id_b'].tolist()]

In [176]:
# de-duplicate and sort so the list (and the file written from it) has the
# same order on every run
cleaned_genes = sorted(set(genes))

In [179]:
# write one gene identifier per line
with open('raw-dataset/stitch-genes', 'w') as genes_out:
    genes_out.writelines(gene + '\n' for gene in cleaned_genes)

### Gene conversion

In [181]:
gene_path = 'raw-dataset/stitch-genes_conversion.csv'
genes_alias = pd.read_csv(gene_path)
genes_alias.head()

Unnamed: 0,initial_alias,converted_alias,name,description,namespace
0,ENSP00000314992,ENSG00000180245,RRH,retinal pigment epithelium-derived rhodopsin h...,ENSP
1,ENSP00000230122,ENSG00000112365,ZBTB24,zinc finger and BTB domain containing 24 [Sour...,ENSP
2,ENSP00000357591,ENSG00000173626,TRAPPC3L,trafficking protein particle complex 3 like [S...,ENSP
3,ENSP00000368552,ENSG00000124523,SIRT5,sirtuin 5 [Source:HGNC Symbol;Acc:HGNC:14933],ENSP
4,ENSP00000261845,ENSG00000069956,MAPK6,mitogen-activated protein kinase 6 [Source:HGN...,ENSP


In [190]:
genes_alias = genes_alias[genes_alias['name'] != 'None']
genes_alias.shape

(8325, 5)

In [192]:
genes_alias_init = genes_alias['initial_alias'].tolist()
genes_alias_name = genes_alias['name'].tolist()
print(len(genes_alias_init), len(genes_alias_name))

8325 8325


In [193]:
# initial alias -> gene name; the two lists are parallel (same length, same order)
genes_conversion = dict(zip(genes_alias_init, genes_alias_name))

Note that multiple ENSP identifiers can map to the same gene name:

```
RTEL1 ENSP00000359035 ENSP00000457868
IFNA6 ENSP00000369558 ENSP00000259555
TRPV1 ENSP00000382659 ENSP00000459962
SHOX2 ENSP00000398704 ENSP00000419362
```

In [198]:
update_node_file(list(set(genes_alias_name)), 'dataset/genes.nodes')

### Update the main STITCH dataframe

In [202]:
# look up the gene name for each item_id_b after dropping its first 5
# characters; identifiers without a conversion become None
genes = [genes_conversion.get(protein_id[5:]) for protein_id in st['item_id_b'].tolist()]

In [224]:
# st is a filtered slice; assign returns a new frame with the extra column
# instead of writing into the slice, which avoids SettingWithCopyWarning
st = st.assign(gene_names=genes)

In [225]:
st = st.dropna(subset=['gene_names'])
st.shape

(182310, 8)

In [227]:
st = st[['compound_id', 'gene_names', 'mode']]
st

Unnamed: 0,compound_id,gene_names,mode
145,4768,LTB4R,binding
242,5709,CCKBR,binding
284,21800,PPARA,binding
482,3878,TMPRSS7,inhibition
492,5361092,OPRK1,binding
...,...,...,...
21772695,753,CD8A,binding
21772853,4893,GPR63,binding
21772982,753,RGPD3,binding
21773316,8966,NPY4R,binding


In [260]:
store_df(st, 'dataset/drug-gene-all.edges')

## Reconcile compound IDs

Which compound IDs should we use if a drug can be mapped to multiple IDs?

In [230]:
st_compounds = st['compound_id'].tolist()

In [249]:
# Collect (drug, compound) pairs for every drug that has several compound
# ids, keeping only the compounds that occur in st_compounds.
# Walk duplicates_drug_compounds directly rather than looking names up in
# compounds_drugs: that dict stores ', '-joined names when several drugs share
# a compound, and a joined name never matches a key of
# duplicates_drug_compounds, so those drugs would be silently skipped.
st_compound_ids = set(st_compounds)
found_duplicates = [
    (drug, compound)
    for drug, compounds in duplicates_drug_compounds.items()
    for compound in compounds
    if compound in st_compound_ids
]

In [255]:
found_duplicates = sorted(list(set([i for i in found_duplicates])))
found_duplicates

[('N-Acetyltyrosine', 68310),
 ('Polymyxin B', 4868),
 ('Vitamin E', 2116),
 ('Vitamin E', 14985),
 ('acarbose', 41774),
 ('acarbose', 441184),
 ('acarbose', 444254),
 ('acetylcarnitine', 1),
 ('alatrofloxacin', 3086677),
 ('alfacalcidol', 2091),
 ('alfacalcidol', 5282181),
 ('amphotericin B', 1972),
 ('amphotericin B', 5280965),
 ('aprotinin', 16130295),
 ('argatroban', 92721),
 ('artemisinin', 2240),
 ('artemisinin', 68827),
 ('artemisinin', 452191),
 ('artemisinin', 9838675),
 ('artenimol', 107770),
 ('artenimol', 11832956),
 ('artesunate', 65664),
 ('artesunate', 6917864),
 ('asenapine', 163091),
 ('asenapine', 3036780),
 ('asenapine', 9903970),
 ('aspartic acid', 424),
 ('aspartic acid', 5960),
 ('aspartic acid', 83887),
 ('auranofin', 6918453),
 ('aurothioglucose', 6104),
 ('avibactam', 9835049),
 ('aztreonam', 2274),
 ('aztreonam', 5742832),
 ('beclabuvir', 25016295),
 ('beraprost', 2352),
 ('beraprost', 5282428),
 ('beraprost', 6917951),
 ('betamethasone dipropionate', 21800),


In [256]:
# re-clean the cleaned_drugs_compounds variable
prev = '-'
for item in found_duplicates:
    drug = item[0]
    if prev != drug:
        prev = drug
        cleaned_drugs_compounds[drug] = []
    cleaned_drugs_compounds[drug].append(item[1])

In [269]:
save_dict_file(cleaned_drugs_compounds, 'dataset/drugs.compounds')

In [264]:
# create all disease - drug associations
dc_drugs = dc[dc['DRUG_NAME'].isin(cleaned_drugs_compounds.keys())]['DRUG_NAME'].tolist()
dc_diseases = dc[dc['DRUG_NAME'].isin(cleaned_drugs_compounds.keys())]['UMLS_CUI'].tolist()
print(len(dc_drugs), len(dc_diseases))

8123 8123


In [266]:
# expand each (drug, disease) pair into one entry per compound id of the drug
treat_drugs = []
treat_diseases = []
for drug, disease in zip(dc_drugs, dc_diseases):
    compounds = cleaned_drugs_compounds[drug]
    treat_drugs.extend(compounds)
    treat_diseases.extend([disease] * len(compounds))

In [270]:
treat_df = pd.DataFrame(data={ 'compound_id': treat_drugs, 'diseases': treat_diseases })

In [272]:
store_df(treat_df, 'dataset/drug-disease-treat.edges')

In [274]:
update_node_file(treat_drugs, 'dataset/drugs.nodes')