# Drug Interactions Network Analysis
### Part 0 - Data and Mapping Acquisition  

#### Author: Kenneth Leung

#### Data sources:  
- https://pubchem.ncbi.nlm.nih.gov
- https://go.drugbank.com/drugs/
___

### 1. Import dependencies and files

In [22]:
import json
import os
import re
import urllib
import zipfile

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [4]:
# Extract every zip archive found in the data folder
zip_files_list = os.listdir('data')

for file in zip_files_list:
    archive_path = os.path.join('data', file)

    # The extracted .tsv/.csv files land in the same folder as the archives,
    # so skip anything that is not a zip; otherwise re-running this cell
    # raises zipfile.BadZipFile on the already-extracted files.
    if not zipfile.is_zipfile(archive_path):
        continue

    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall('data')

# Show the folder contents after extraction
os.listdir('data')

<IPython.core.display.Javascript object>

['ChCh-Miner_durgbank-chem-chem.tsv',
 'ChCh-Miner_durgbank-chem-chem.zip',
 'ChChSe-Decagon_polypharmacy.csv',
 'ChChSe-Decagon_polypharmacy.zip',
 'ChSe-Decagon_monopharmacy.csv',
 'ChSe-Decagon_monopharmacy.zip']

___
### 2. View datasets

#### Drugbank Drug Interactions Dataset

In [9]:
# Load the DrugBank interaction pairs (tab-separated, file has no header row)
db_int_path = "data/ChCh-Miner_durgbank-chem-chem.tsv"
df_db_int = pd.read_csv(db_int_path, header=None, sep='\t')
df_db_int.head()

<IPython.core.display.Javascript object>

Unnamed: 0,0,1
0,DB00862,DB00966
1,DB00575,DB00806
2,DB01242,DB08893
3,DB01151,DB08883
4,DB01235,DB01275


#### Polypharmacy Side Effects

In [11]:
# Load the polypharmacy (drug-pair) side effect table
poly_se_path = "data/ChChSe-Decagon_polypharmacy.csv"
df_poly_se = pd.read_csv(poly_se_path)
df_poly_se.head()

<IPython.core.display.Javascript object>

Unnamed: 0,# STITCH 1,STITCH 2,Polypharmacy Side Effect,Side Effect Name
0,CID000002173,CID000003345,C0151714,hypermagnesemia
1,CID000002173,CID000003345,C0035344,retinopathy of prematurity
2,CID000002173,CID000003345,C0004144,atelectasis
3,CID000002173,CID000003345,C0002063,alkalosis
4,CID000002173,CID000003345,C0004604,Back Ache


#### Monodrug Side Effects

In [12]:
# Load the single-drug side effect table
mono_se_path = "data/ChSe-Decagon_monopharmacy.csv"
df_mono_se = pd.read_csv(mono_se_path)
df_mono_se.head()

<IPython.core.display.Javascript object>

Unnamed: 0,# STITCH,Individual Side Effect,Side Effect Name
0,CID003062316,C1096328,central nervous system mass
1,CID003062316,C0162830,Photosensitivity reaction
2,CID003062316,C1611725,leukaemic infiltration brain
3,CID003062316,C0541767,platelet adhesiveness abnormal
4,CID003062316,C0242973,Ventricular dysfunction


___
### 3. Retrieve corresponding mapping for drug codes

#### CID codes (PubChem Identifier)
- e.g. CID003062316 -> https://pubchem.ncbi.nlm.nih.gov/compound/003062316

In [18]:
# Pull the CID codes out of both drug columns of the polypharmacy table
poly_se_cid_list_1, poly_se_cid_list_2 = (
    df_poly_se[column].tolist() for column in ('# STITCH 1', 'STITCH 2')
)
# ... and out of the single drug column of the monopharmacy table
mono_se_cid_list = df_mono_se['# STITCH'].tolist()

# Merge the three lists and drop repeats; dict keys keep first-seen order
master_cid_list = list(
    dict.fromkeys([*poly_se_cid_list_1, *poly_se_cid_list_2, *mono_se_cid_list])
)

In [20]:
# Number of distinct CID codes across both side effect tables
len(master_cid_list)

645

In [23]:
# Scrape the PubChem compound page of every CID to build a CID -> drug name map
# Set wait times
waittime = 3     # seconds, used as the driver's implicit wait
sleeptime = 0.5  # not used in this cell; kept in case other cells read it

# Shut down any WebDriver left over from an earlier run.
# quit() ends the whole browser session, close() only closes the window.
try:
    driver.quit()
except Exception:
    # Best-effort cleanup: `driver` may not exist yet (NameError) or its
    # session may already be gone.
    pass

# Set webdriver options
options = webdriver.ChromeOptions()
options.add_argument('--no-sandbox')
options.add_argument('ignore-certificate-errors')

# Initiate webdriver
driver = webdriver.Chrome(options=options)

# The implicit wait is a session-wide setting: configure it once instead of
# re-sending it after every page load.
driver.implicitly_wait(waittime)

# Define dictionary to store mapping key-value pairs
cid_mapping = {}

try:
    for cid in master_cid_list:
        # Keep only the digits of the code, e.g. 'CID003062316' -> '003062316'
        cid_number = re.sub(r"[^0-9]", "", cid)
        url = f'https://pubchem.ncbi.nlm.nih.gov/compound/{cid_number}'

        # Get driver to retrieve URL
        driver.get(url)

        try:
            # Retrieve drug name from the page title element.
            # find_element(By.XPATH, ...) replaces find_element_by_xpath,
            # which no longer exists in current Selenium releases.
            element = driver.find_element(By.XPATH, "//h1[@class='m-zero p-zero']")
            drug_name = element.text
            cid_mapping[cid] = drug_name
        except WebDriverException:
            # Only Selenium lookup failures count as "not found"; Ctrl-C and
            # programming errors are no longer silently recorded as 'N/A'.
            cid_mapping[cid] = 'N/A'
            print(f'Drug not found for {cid}')
finally:
    # Always release the browser, even if the scrape is interrupted
    driver.quit()

In [38]:
# Display the scraped CID -> drug name dictionary
cid_mapping

{'CID000002173': '6-{[Amino(phenyl)acetyl]amino}-3,3-dimethyl-7-oxo-4-thia-1-azabicyclo[3.2.0]heptane-2-carboxylate',
 'CID000005206': 'Sevoflurane',
 'CID000003929': 'N-[[(5S)-3-[3-Fluoro-4-(4-morpholinyl)phenyl]-2-oxo-5-oxazolidinyl]methyl]-acetamide',
 'CID000001302': '2-(6-Methoxy-2-naphthyl)propionic acid',
 'CID000005267': 'Duraspiron',
 'CID000004601': 'Orphenadrine',
 'CID000005090': 'Rofecoxib',
 'CID000004946': 'Propranolol',
 'CID000005391': 'Temazepam',
 'CID000002802': 'Clonazepam',
 'CID000004212': 'Mitoxantrone',
 'CID000000596': 'Depocyt',
 'CID000002522': '5-[2-[1-(5-Cyclopropyl-5-hydroxypent-3-en-2-yl)-7a-methyl-2,3,3a,5,6,7-hexahydro-1H-inden-4-ylidene]ethylidene]-4-methylenecyclohexane-1,3-diol',
 'CID000003405': 'CID 3405',
 'CID000003446': 'Gabapentin',
 'CID000004107': 'Methocarbamol',
 'CID000003161': 'CID 3161',
 'CID000003823': 'Ketoconazole',
 'CID000005556': 'Triazolam',
 'CID000002156': '[2-[4-[(2-Butylbenzofuran-3-yl)carbonyl]-2,6-diiodophenoxy]ethyl]dieth

In [24]:
# Number of CID codes that received an entry (a name or the 'N/A' placeholder)
len(cid_mapping)

645

In [34]:
# # Export CID mapping as JSON
# with open('data/CID_mapping.json', 'w') as fp:
#     json.dump(cid_mapping, fp)
    
# # Read saved CID mapping JSON
# with open('data/CID_mapping.json', 'r') as fp:
#     test = json.load(fp)

___
#### Obtain master mapping list of DB codes (DrugBank Accession Number)
- e.g. DB00254 -> https://go.drugbank.com/drugs/DB00254

In [48]:
# Collect the DB codes from each of the two columns of the interaction table
db_list_1, db_list_2 = (df_db_int[column].tolist() for column in (0, 1))

# Merge both columns, drop repeated codes and return them in sorted order
master_db_list = sorted(dict.fromkeys(db_list_1 + db_list_2))

In [49]:
# Number of distinct DB codes across both columns of the interaction table
len(master_db_list)

1514

In [50]:
# Display the sorted, de-duplicated DB codes
master_db_list

['DB00005',
 'DB00006',
 'DB00007',
 'DB00008',
 'DB00009',
 'DB00012',
 'DB00013',
 'DB00014',
 'DB00015',
 'DB00016',
 'DB00017',
 'DB00018',
 'DB00019',
 'DB00020',
 'DB00021',
 'DB00022',
 'DB00023',
 'DB00026',
 'DB00028',
 'DB00029',
 'DB00030',
 'DB00031',
 'DB00033',
 'DB00035',
 'DB00036',
 'DB00039',
 'DB00040',
 'DB00041',
 'DB00042',
 'DB00043',
 'DB00046',
 'DB00047',
 'DB00048',
 'DB00050',
 'DB00051',
 'DB00052',
 'DB00054',
 'DB00056',
 'DB00059',
 'DB00060',
 'DB00061',
 'DB00063',
 'DB00065',
 'DB00068',
 'DB00069',
 'DB00070',
 'DB00072',
 'DB00073',
 'DB00074',
 'DB00078',
 'DB00080',
 'DB00081',
 'DB00082',
 'DB00083',
 'DB00085',
 'DB00086',
 'DB00087',
 'DB00089',
 'DB00091',
 'DB00098',
 'DB00099',
 'DB00100',
 'DB00103',
 'DB00104',
 'DB00105',
 'DB00107',
 'DB00108',
 'DB00109',
 'DB00110',
 'DB00112',
 'DB00115',
 'DB00120',
 'DB00121',
 'DB00122',
 'DB00126',
 'DB00130',
 'DB00136',
 'DB00140',
 'DB00146',
 'DB00150',
 'DB00153',
 'DB00158',
 'DB00159',
 'DB

In [53]:
# Scrape the DrugBank page of every DB code to build a DB code -> drug name map
# Set wait times
waittime = 3     # seconds, used as the driver's implicit wait
sleeptime = 0.5  # not used in this cell; kept in case other cells read it

# Shut down any WebDriver left over from an earlier run.
# quit() ends the whole browser session, close() only closes the window.
try:
    driver.quit()
except Exception:
    # Best-effort cleanup: `driver` may not exist yet (NameError) or its
    # session may already be gone.
    pass

# Set webdriver options
options = webdriver.ChromeOptions()
options.add_argument('--no-sandbox')
options.add_argument('ignore-certificate-errors')

# Initiate webdriver
driver = webdriver.Chrome(options=options)

# The implicit wait is a session-wide setting: configure it once instead of
# re-sending it after every page load.
driver.implicitly_wait(waittime)

# Define dictionary to store mapping key-value pairs
db_mapping = {}

try:
    for db_id in master_db_list:
        url = f'https://go.drugbank.com/drugs/{db_id}'

        # Get driver to retrieve URL
        driver.get(url)

        try:
            # Retrieve drug name from the page title element.
            # find_element(By.XPATH, ...) replaces find_element_by_xpath,
            # which no longer exists in current Selenium releases.
            element = driver.find_element(
                By.XPATH, "//h1[@class='align-self-center mr-auto']"
            )
            drug_name = element.text
            db_mapping[db_id] = drug_name
        except WebDriverException:
            # Only Selenium lookup failures count as "not found"; Ctrl-C and
            # programming errors are no longer silently recorded as 'N/A'.
            db_mapping[db_id] = 'N/A'
            print(f'Drug not found for {db_id}')
finally:
    # Always release the browser, even if the scrape is interrupted
    driver.quit()

Drug not found for DB06808


In [54]:
# Number of DB codes that received an entry (a name or the 'N/A' placeholder)
len(db_mapping)

1514

In [56]:
# Display the scraped DB code -> drug name dictionary
db_mapping

{'DB00005': 'Etanercept',
 'DB00006': 'Bivalirudin',
 'DB00007': 'Leuprolide',
 'DB00008': 'Peginterferon alfa-2a',
 'DB00009': 'Alteplase',
 'DB00012': 'Darbepoetin alfa',
 'DB00013': 'Urokinase',
 'DB00014': 'Goserelin',
 'DB00015': 'Reteplase',
 'DB00016': 'Erythropoietin',
 'DB00017': 'Salmon calcitonin',
 'DB00018': 'Interferon alfa-n3',
 'DB00019': 'Pegfilgrastim',
 'DB00020': 'Sargramostim',
 'DB00021': 'Secretin human',
 'DB00022': 'Peginterferon alfa-2b',
 'DB00023': 'Asparaginase Escherichia coli',
 'DB00026': 'Anakinra',
 'DB00028': 'Human immunoglobulin G',
 'DB00029': 'Anistreplase',
 'DB00030': 'Insulin human',
 'DB00031': 'Tenecteplase',
 'DB00033': 'Interferon gamma-1b',
 'DB00035': 'Desmopressin',
 'DB00036': 'Coagulation factor VIIa Recombinant Human',
 'DB00039': 'Palifermin',
 'DB00040': 'Glucagon',
 'DB00041': 'Aldesleukin',
 'DB00042': 'Botulinum toxin type B',
 'DB00043': 'Omalizumab',
 'DB00046': 'Insulin lispro',
 'DB00047': 'Insulin glargine',
 'DB00048': 'Col

In [55]:
# Export DB mapping as JSON
# with open('data/DB_mapping.json', 'w') as fp:
#     json.dump(db_mapping, fp)