In [1]:
import pandas as pd
import numpy as np
import os
from collections import OrderedDict
import json

In [5]:
def save_dict_file(dictvar, file):
    with open(file, 'w') as file:
        file.write(json.dumps(dictvar))

In [3]:
covid19_drugs = ['sapanisertib', 'rapamycin', 'zotatifin', 'verdinexor', 'chloroquine', 'dabrafenib', 'sanglifehrin a', 'fk-506', 'pevonedistat', 'tomivosertib', 'captopril', 'lisinopril', 'camostat', 'nafamostat', 'chloramphenicol', 'tigecycline', 'linezolid']

In [4]:
def update_node_file(new_nodes, node_file='../data/clean/diseases.nodes'):
    # grab the current existing nodes from the file
    if os.path.getsize(node_file) > 0:
        curr_nodes = pd.read_table(node_file, header=None).iloc[:,0].tolist()
    else:
        curr_nodes = []
    # add the new nodes, remove duplicates, and sort
    curr_nodes = curr_nodes + new_nodes
    curr_nodes = list(set(curr_nodes))
    curr_nodes.sort()
    # update the file
    curr_nodes = pd.DataFrame({ 'nodes': curr_nodes })
    curr_nodes.to_csv(node_file, header=False, index=False)

In [5]:
def store_df(df, file):
    df.to_csv(file, header=True, index=False)

In [2]:
# dataset paths
# drug central
dc_path = '../data/raw/drugcentral-drug_indications.tsv'
# drug bank
db_path_vocabs = '../data/raw/drugbank-vocab.csv'
db_path_link_carrier   = '../data/raw/drugbank-carrier_links.csv'
db_path_link_enzyme    = '../data/raw/drugbank-enzyme_links.csv'
db_path_link_target    = '../data/raw/drugbank-target_links.csv'
db_path_link_transport = '../data/raw/drugbank-transporter_links.csv'

# CSV Exploration

## DrugBank

In [3]:
db_vocab = pd.read_csv(db_path_vocabs)
db_vocab.head()

Unnamed: 0,DrugBank ID,Accession Numbers,Common name,CAS,UNII,Synonyms,Standard InChI Key
0,DB00001,BIOD00024 | BTD00024,Lepirudin,138068-37-8,Y43GF64R34,Hirudin variant-1 | Lepirudin recombinant,
1,DB00002,BIOD00071 | BTD00071,Cetuximab,205923-56-4,PQX0D8J21J,Cetuximab | Cétuximab | Cetuximabum,
2,DB00003,BIOD00001 | BTD00001,Dornase alfa,143831-71-4,953A26OA1Y,Deoxyribonuclease (human clone 18-1 protein mo...,
3,DB00004,BIOD00084 | BTD00084,Denileukin diftitox,173146-27-5,25E79B5CTM,Denileukin | Interleukin-2/diptheria toxin fus...,
4,DB00005,BIOD00052 | BTD00052,Etanercept,185243-69-0,OP401G7OJC,Etanercept | etanercept-szzs | etanercept-ykro...,


In [4]:
drug_name_dict = {id: name for (id, name) in zip(db_vocab['DrugBank ID'], db_vocab['Common name'])}
drug_name_dict

{'DB00001': 'Lepirudin',
 'DB00002': 'Cetuximab',
 'DB00003': 'Dornase alfa',
 'DB00004': 'Denileukin diftitox',
 'DB00005': 'Etanercept',
 'DB00006': 'Bivalirudin',
 'DB00007': 'Leuprolide',
 'DB00008': 'Peginterferon alfa-2a',
 'DB00009': 'Alteplase',
 'DB00010': 'Sermorelin',
 'DB00011': 'Interferon alfa-n1',
 'DB00012': 'Darbepoetin alfa',
 'DB00013': 'Urokinase',
 'DB00014': 'Goserelin',
 'DB00015': 'Reteplase',
 'DB00016': 'Erythropoietin',
 'DB00017': 'Salmon calcitonin',
 'DB00018': 'Interferon alfa-n3',
 'DB00019': 'Pegfilgrastim',
 'DB00020': 'Sargramostim',
 'DB00022': 'Peginterferon alfa-2b',
 'DB00023': 'Asparaginase Escherichia coli',
 'DB00024': 'Thyrotropin alfa',
 'DB00025': 'Antihemophilic factor, human recombinant',
 'DB00026': 'Anakinra',
 'DB00027': 'Gramicidin D',
 'DB00028': 'Human immunoglobulin G',
 'DB00029': 'Anistreplase',
 'DB00030': 'Insulin human',
 'DB00031': 'Tenecteplase',
 'DB00032': 'Menotropins',
 'DB00033': 'Interferon gamma-1b',
 'DB00034': 'Inter

In [6]:
save_dict_file(drug_name_dict, '../data/clean/drugs-name-dict.json')

In [8]:
db_link_carrier = pd.read_csv(db_path_link_carrier)
db_link_carrier.head()

Unnamed: 0,DrugBank ID,Name,Type,UniProt ID,UniProt Name
0,DB00023,Asparaginase Escherichia coli,BiotechDrug,P05543,Thyroxine-binding globulin
1,DB00059,Pegaspargase,BiotechDrug,P05543,Thyroxine-binding globulin
2,DB00070,Hyaluronidase (ovine),BiotechDrug,P02768,Serum albumin
3,DB00126,Ascorbic acid,SmallMoleculeDrug,P02768,Serum albumin
4,DB00132,alpha-Linolenic acid,SmallMoleculeDrug,O15540,"Fatty acid-binding protein, brain"


In [9]:
db_link_enzyme = pd.read_csv(db_path_link_enzyme)
db_link_enzyme.head()

Unnamed: 0,DrugBank ID,Name,Type,UniProt ID,UniProt Name
0,DB00006,Bivalirudin,SmallMoleculeDrug,P05164,Myeloperoxidase
1,DB00007,Leuprolide,SmallMoleculeDrug,P08684,Cytochrome P450 3A4
2,DB00008,Peginterferon alfa-2a,BiotechDrug,P05177,Cytochrome P450 1A2
3,DB00011,Interferon alfa-n1,BiotechDrug,P05177,Cytochrome P450 1A2
4,DB00013,Urokinase,BiotechDrug,P39900,Macrophage metalloelastase


In [10]:
db_link_target = pd.read_csv(db_path_link_target)
db_link_target.head()

Unnamed: 0,DrugBank ID,Name,Type,UniProt ID,UniProt Name
0,DB00001,Lepirudin,BiotechDrug,P00734,Prothrombin
1,DB00002,Cetuximab,BiotechDrug,P00533,Epidermal growth factor receptor
2,DB00002,Cetuximab,BiotechDrug,O75015,Low affinity immunoglobulin gamma Fc region re...
3,DB00002,Cetuximab,BiotechDrug,P00736,Complement C1r subcomponent
4,DB00002,Cetuximab,BiotechDrug,P02745,Complement C1q subcomponent subunit A


In [11]:
db_link_transport = pd.read_csv(db_path_link_transport)
db_link_transport.head()

Unnamed: 0,DrugBank ID,Name,Type,UniProt ID,UniProt Name
0,DB00027,Gramicidin D,SmallMoleculeDrug,P08183,Multidrug resistance protein 1
1,DB00067,Vasopressin,SmallMoleculeDrug,Q92887,Canalicular multispecific organic anion transp...
2,DB00091,Cyclosporine,SmallMoleculeDrug,P08183,Multidrug resistance protein 1
3,DB00091,Cyclosporine,SmallMoleculeDrug,O95342,Bile salt export pump
4,DB00091,Cyclosporine,SmallMoleculeDrug,Q12908,Ileal sodium/bile acid cotransporter


## Drug Central

In [12]:
# grab drugs and diseases from drug central
dc = pd.read_table(dc_path)
dc.head()

Unnamed: 0,DRUG_ID,DRUG_NAME,INDICATION_FDB,UMLS_CUI,SNOMEDCT_CUI,DOID
0,965,drostanolone propionate,Malignant tumor of breast,C0006142,254837009.0,DOID:1612
1,318,benzbromarone,Gout,C0018099,90560007.0,DOID:13189
2,318,benzbromarone,Hyperuricemia,C0740394,35885006.0,DOID:1920
3,1031,epitizide,Hypertensive disorder,C0020538,38341003.0,DOID:10763
4,3578,tafenoquine,Malaria,C0024530,61462000.0,DOID:12365


In [13]:
dc['INDICATION_FDB'][dc.isna()['UMLS_CUI']]

747                            Systemic Dermatomyositis
761                               Otitis Externa Eczema
766       Osteoarthritis in Patients at High Ulcer Risk
772      Post-Op Gynecological Infection due to E. Coli
781                   Bronchospasm Prevention with COPD
                              ...                      
10352        Maintenance of Healing Erosive Esophagitis
10355                   Duodenal Ulcer due to H. Pylori
10356                 Gastric Hypersecretory Conditions
10357     Pathological Gastric Hypersecretory Condition
10363                  Myocardial Infarction Prevention
Name: INDICATION_FDB, Length: 2489, dtype: object

### Remove invalid UMLS CUI

In [14]:
# remove dc rows with invalid UMLS_CUI
dc = dc.dropna(subset=['UMLS_CUI'])

In [15]:
print('There are in total %d rows of drug-disease edges (valid UMLS CUI) in drug central' % (dc.shape[0]))

There are in total 8469 rows of drug-disease edges (valid UMLS CUI) in drug central


### Drugs

In [16]:
dc_drugs = dc['DRUG_NAME'].tolist()
dc_drugs = [drug.lower() for drug in dc_drugs]

In [17]:
db_drugs_commonnames = [drug.lower() for drug in db_vocab['Common name'].tolist()]
db_drugs_synonyms = [str(drug).lower() for drug in db_vocab['Synonyms'].tolist()]
db_drugs_id = db_vocab['DrugBank ID'].tolist()

In [18]:
dc_unique_drugs = list(set(dc_drugs))
len(dc_unique_drugs)

2301

In [19]:
drugs_dict = {}
drugs_not_found = []
for drug in dc_unique_drugs:
    for i in range(len(db_drugs_commonnames)):
        if drug in db_drugs_commonnames[i] or drug in db_drugs_synonyms[i]:
            drugs_dict[drug] = db_drugs_id[i]
            break
    if drug not in drugs_dict: drugs_not_found.append(drug)

In [20]:
dc_drugs_dbid = [drugs_dict[drug] if drug in drugs_dict else None for drug in dc_drugs]

In [21]:
dc['DRUGBANK_ID'] = dc_drugs_dbid
dc.head()

Unnamed: 0,DRUG_ID,DRUG_NAME,INDICATION_FDB,UMLS_CUI,SNOMEDCT_CUI,DOID,DRUGBANK_ID
0,965,drostanolone propionate,Malignant tumor of breast,C0006142,254837009.0,DOID:1612,DB14655
1,318,benzbromarone,Gout,C0018099,90560007.0,DOID:13189,DB12319
2,318,benzbromarone,Hyperuricemia,C0740394,35885006.0,DOID:1920,DB12319
3,1031,epitizide,Hypertensive disorder,C0020538,38341003.0,DOID:10763,DB13989
4,3578,tafenoquine,Malaria,C0024530,61462000.0,DOID:12365,DB06608


In [22]:
dc = dc.dropna(subset=['DRUGBANK_ID'])
dc = dc[['DRUGBANK_ID', 'UMLS_CUI']]
dc.head()

Unnamed: 0,DRUGBANK_ID,UMLS_CUI
0,DB14655,C0006142
1,DB12319,C0018099
2,DB12319,C0740394
3,DB13989,C0020538
4,DB06608,C0024530


In [23]:
# store diseases
dc_diseases = dc['UMLS_CUI'].tolist()
update_node_file(dc_diseases)

In [37]:
# store drugs
dc_drugs = dc['DRUGBANK_ID'].tolist()
update_node_file(dc_drugs, node_file='../data/clean/drugs.nodes')

In [25]:
# store drug-treat-disease
store_df(dc, '../data/clean/drug-disease-treat.edges')

### Protein-gene name conversion

In [26]:
# combine all protein IDs
protein_ids = db_link_carrier['UniProt ID'].tolist() + \
              db_link_enzyme['UniProt ID'].tolist() + \
              db_link_target['UniProt ID'].tolist() + \
              db_link_transport['UniProt ID'].tolist()
protein_ids = list(set(protein_ids))

In [27]:
with open('../data/raw/db-proteins', 'w') as file:
    for protein in protein_ids:
        file.write(protein + '\n')
# use UniProt website to fetch gene names

In [28]:
protein_gene_df = pd.read_table('../data/raw/db-protein-gene')
protein_gene_df.head()

Unnamed: 0,From,To
0,O00483,NDUFA4
1,P55786,NPEPPS
2,P40859,soxA
3,P16788,UL97
4,P19793,RXRA


In [29]:
protein_ids = protein_gene_df['From'].tolist()
gene_names = [str(gene).upper() for gene in protein_gene_df['To'].tolist()]

In [30]:
# create a conversion dict
protein_gene_dict = {protein: gene_names[i] for i, protein in enumerate(protein_ids)}

In [31]:
def fetch_drug_protein_df(df, conv_dict):
    gene_names = [conv_dict[protein] if protein in conv_dict else None for protein in df['UniProt ID'].tolist()]
    drugs = df['DrugBank ID'].tolist()
    res = pd.DataFrame(data={ 'DRUGS': drugs, 'GENES': gene_names })
    res = res.dropna(subset=['GENES'])
    return(res)

In [32]:
db_link_carrier = fetch_drug_protein_df(db_link_carrier, protein_gene_dict)
db_link_enzyme = fetch_drug_protein_df(db_link_enzyme, protein_gene_dict)
db_link_target = fetch_drug_protein_df(db_link_target, protein_gene_dict)
db_link_transport = fetch_drug_protein_df(db_link_transport, protein_gene_dict)

In [33]:
# store drug-gene-[action]
store_df(db_link_carrier, '../data/clean/drug-gene-carrier.edges')
store_df(db_link_enzyme, '../data/clean/drug-gene-enzyme.edges')
store_df(db_link_target, '../data/clean/drug-gene-target.edges')
store_df(db_link_transport, '../data/clean/drug-gene-transport.edges')

In [38]:
# update drugs
update_node_file(db_link_carrier['DRUGS'].tolist(), node_file='../data/clean/drugs.nodes')
update_node_file(db_link_enzyme['DRUGS'].tolist(), node_file='../data/clean/drugs.nodes')
update_node_file(db_link_target['DRUGS'].tolist(), node_file='../data/clean/drugs.nodes')
update_node_file(db_link_transport['DRUGS'].tolist(), node_file='../data/clean/drugs.nodes')

In [36]:
# update drugs
update_node_file(db_link_carrier['GENES'].tolist(), node_file='../data/clean/genes.nodes')
update_node_file(db_link_enzyme['GENES'].tolist(), node_file='../data/clean/genes.nodes')
update_node_file(db_link_target['GENES'].tolist(), node_file='../data/clean/genes.nodes')
update_node_file(db_link_transport['GENES'].tolist(), node_file='../data/clean/genes.nodes')