In [10]:
import xml.etree.ElementTree as ET
import pandas as pd


# from multiprocessing import Pool
from utils_data import *
from tqdm import tqdm

Load and merge the dataset from previous studies.

In [2]:
# Drug / side-effect table from the earlier study; its last column is discarded.
drug_adr = pd.read_csv('original data/Supplementary Data 1.txt', sep='\t', header=0)
drug_adr = drug_adr.drop(columns=drug_adr.columns[-1])

# Drug / ATC table and the side-effect similarity table (the latter is used in a later cell).
drug_atc = pd.read_csv("original data/Supplementary Data 4.txt", sep='\t', header=0)
adr_similar_df = pd.read_csv('adr_similar_df.csv')

# The join happens on the raw GenericName; names are normalised only afterwards.
data1 = pd.merge(drug_adr, drug_atc, how='left', on='GenericName')

# Lower-case the names and turn literal dots into spaces.
# regex=False is explicit so '.' can never be read as the "any character"
# regex wildcard, whatever default the installed pandas version uses.
data1['GenericName'] = (
    data1['GenericName']
    .str.lower()
    .str.replace('.', ' ', regex=False)
)

# One row per distinct (normalised) drug name.
drug3 = data1[['GenericName']].drop_duplicates()

  


Parse and merge the DrugBank dataset.

In [3]:
tree = ET.parse("original data/full database.xml")
root = tree.getroot()

xmlns = '{http://www.drugbank.ca}'

# Column order of the pipe-delimited extract written below.
DRUGBANK_FIELDS = ("dg_id", "dg_name", "dg_smile", "dg_ATC", "dg_synonyms")


def _parse_drugbank_entry(drug, ns=xmlns):
    """Extract id, name, SMILES, ATC codes and synonyms from one <drug> element.

    Parameters
    ----------
    drug : xml.etree.ElementTree.Element
        A child element of the DrugBank root.
    ns : str
        Namespace prefix in ElementTree ``{uri}`` form.

    Returns
    -------
    dict
        One string per key in ``DRUGBANK_FIELDS``. A field whose XML element
        is absent or has no text is returned as ``""`` instead of raising.
    """
    # First <drugbank-id> / <name> child; findtext yields "" rather than None.
    dg_id = drug.findtext(ns + 'drugbank-id', default='')
    dg_name = drug.findtext(ns + 'name', default='')

    # SMILES is stored as a <property> whose <kind> child is 'SMILES'.
    smiles_property = drug.find(f".//{ns}property[{ns}kind='SMILES']")
    dg_smile = ''
    if smiles_property is not None:
        dg_smile = smiles_property.findtext(ns + 'value', default='')

    # ATC codes: guard against a missing <atc-codes> container so that the
    # entry is written with an empty field instead of aborting the whole run.
    atc_parent = drug.find('.//' + ns + 'atc-codes')
    atc_codes = []
    if atc_parent is not None:
        atc_codes = [
            node.attrib['code']
            for node in atc_parent.findall(ns + 'atc-code')
            if 'code' in node.attrib
        ]

    # Synonyms: skip a missing container and synonym nodes without text.
    synonyms_parent = drug.find(ns + 'synonyms')
    synonyms = []
    if synonyms_parent is not None:
        synonyms = [
            node.text
            for node in synonyms_parent.findall(ns + 'synonym')
            if node.text is not None
        ]

    return {
        "dg_id": dg_id,
        "dg_name": dg_name,
        "dg_smile": dg_smile,
        "dg_ATC": ";".join(atc_codes),
        "dg_synonyms": ",".join(synonyms),
    }


drugbank_records = [_parse_drugbank_entry(drug) for drug in tqdm(root)]

# Write all rows at once. to_csv quotes any field that contains the '|'
# separator, a quote character or a line break, so such values no longer
# corrupt the file when it is read back with pd.read_csv(sep='|').
pd.DataFrame(drugbank_records, columns=list(DRUGBANK_FIELDS)).to_csv(
    'pareGrug_drugbank.txt', sep='|', index=False, encoding='utf-8'
)


# Read the pipe-delimited DrugBank extract back in and align its name column
# with the 'GenericName' key used by the study tables.
drug_smiles_drugbank = pd.read_csv(
    "pareGrug_drugbank.txt", sep='|', header=0
).rename(columns={'dg_name': 'GenericName'})

# Lower-case the text columns so that matching is case-insensitive.
for text_col in ('GenericName', 'dg_synonyms'):
    drug_smiles_drugbank[text_col] = drug_smiles_drugbank[text_col].str.lower()

# Attach SMILES and ATC codes to every study drug; unmatched drugs keep NaN.
drugbank_lookup = drug_smiles_drugbank[['GenericName', 'dg_smile', 'dg_ATC']]
drug31 = drug3.merge(drugbank_lookup, how='left', on='GenericName')

# Drugs without a DrugBank SMILES (81 in the recorded run).
miss_drug = drug31[drug31['dg_smile'].isna()]

# Working copy that the following cells fill in.
drug32 = drug31.copy()

100%|██████████| 15235/15235 [00:01<00:00, 11345.27it/s]


Search the PubChem database for the drugs whose SMILES could not be matched in DrugBank.

In [8]:
# Rows that still have no SMILES after the DrugBank merge.
smile_missing = drug32['dg_smile'].isna()

# Query PubChem once per distinct name (each lookup took several seconds in
# the recorded run, so repeated names are not queried again).
miss_smile = {
    name: search_from_Pubchem(name)
    for name in tqdm(drug32.loc[smile_missing, 'GenericName'].unique())
}

# Fill only the missing rows, addressing columns by name instead of position.
drug32.loc[smile_missing, 'dg_smile'] = (
    drug32.loc[smile_missing, 'GenericName'].map(miss_smile)
)

# Shape of the rows that are still unresolved after the PubChem lookup.
drug32[drug32['dg_smile'].isna()].shape

100%|██████████| 81/81 [11:49<00:00,  8.76s/it]


(17, 3)

Standardize SMILES encoding.

In [1]:
# Standardised SMILES for every drug, as produced by unify_smiles.
drug32['dg_smile_unif'] = drug32['dg_smile'].map(unify_smiles)

# Rows for which no standardised SMILES is available.
unif_missing = drug32['dg_smile_unif'].isna()

# Fall back to a PubChem lookup by name, once per distinct name.
# NOTE(review): this also re-queries names for which the previous cell's
# PubChem search already found nothing, and the fallback value is stored
# as returned (it is not passed through unify_smiles) - confirm both are intended.
miss_smile = {
    name: search_from_Pubchem(name)
    for name in tqdm(drug32.loc[unif_missing, 'GenericName'].unique())
}
drug32.loc[unif_missing, 'dg_smile_unif'] = (
    drug32.loc[unif_missing, 'GenericName'].map(miss_smile)
)

# Keep only drugs with a usable SMILES (743 drugs per the original note).
# .copy() makes drug_complete an independent frame, so the assignment below
# does not write into a slice of drug32 (avoids SettingWithCopyWarning).
drug_complete = drug32.dropna(subset=['dg_smile_unif']).copy()
drug_complete['GenericName'] = drug_complete['GenericName'].str.lower()

# Optional checkpoint:
# drug_complete.to_csv('drug_complete.csv',index=False)

In [10]:
# Lower-case both join keys of the drug / side-effect table (updates data1).
for text_col in ('GenericName', 'SideeffectTerm'):
    data1[text_col] = data1[text_col].str.lower()

# Side-effect similarity table: discard its first column, lower-case the
# term and rename it so it can be joined on 'SideeffectTerm'.
adr_similar_df1 = adr_similar_df.drop(columns=adr_similar_df.columns[0])
adr_similar_df1['adr_name'] = adr_similar_df1['adr_name'].str.lower()
adr_similar_df1 = adr_similar_df1.rename(columns={'adr_name': 'SideeffectTerm'})

# Merge drug and side-effect information.
# The first three columns of data1 are joined with columns 0 and 3 of
# drug_complete (selected by position, exactly as before).
drug_smiles_cols = drug_complete.iloc[:, [0, 3]]
data2 = data1.iloc[:, 0:3].merge(drug_smiles_cols, how='left', on='GenericName')
data2 = data2.merge(adr_similar_df1, how='left', on='SideeffectTerm')

# Drop the rows whose drug has no standardised SMILES.
data3 = data2.dropna(subset=['dg_smile_unif'])
# Optional checkpoint:
# data3.to_csv('data3.csv',index=False)


Extract fingerprint features for each drug.

In [11]:
# One row per distinct (drug name, standardised SMILES) pair.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of data3 (avoids SettingWithCopyWarning).
smiles_all = data3[['GenericName', 'dg_smile_unif']].drop_duplicates().copy()
name_smiles_pairs = list(zip(smiles_all['GenericName'], smiles_all['dg_smile_unif']))

# PubChem fingerprint, keyed by drug name.
pubchemfinger = {}
for drug_name, smiles in tqdm(name_smiles_pairs):
    pubchemfinger[drug_name] = getcompund(smiles, drug_name)

# MACCS, Morgan and Rtoplo fingerprints, keyed by drug name
# (fingerpint returns the three values in this order).
Macc = {}
Morgan = {}
Rtoplo = {}
for drug_name, smiles in tqdm(name_smiles_pairs):
    Macc[drug_name], Morgan[drug_name], Rtoplo[drug_name] = fingerpint(
        smiles=smiles, drugname=drug_name
    )

# Attach each fingerprint as a column. The lookups are applied element-wise
# (a KeyError is raised for an unknown name, as before).
for column, lookup in (
    ('Macc', Macc),
    ('pubchem', pubchemfinger),
    ('Morgan', Morgan),
    ('Rtoplo', Rtoplo),
):
    smiles_all[column] = smiles_all['GenericName'].map(lookup.__getitem__)

# Pharmacophore fingerprints, computed from the standardised SMILES.
smiles_all['pharmfp'] = smiles_all['dg_smile_unif'].apply(phamfp)

# Join every fingerprint column (everything except the SMILES itself, which
# data3 already carries) back onto the drug / side-effect table.
data4 = pd.merge(
    data3,
    smiles_all.drop(columns='dg_smile_unif'),
    how='left',
    on='GenericName',
)
# Optional checkpoint:
# data4.to_csv('analysis_data.csv', index=False)