In [6]:
##import all packages
from bs4 import BeautifulSoup, NavigableString
from urllib.request import urlopen
import os, re
import numpy as np
import pubchempy as pcp
import pandas as pd
from tqdm import tqdm


In [8]:
def retrieveBioassayInfo(cid):
    aid = []
    score = []
    gene = []
    act = []
    try:
        page = urlopen('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/'+cid +'/assaysummary/XML')
        soup = BeautifulSoup(page, "html.parser")
        colNames = soup.find("columns").find_all("column")
        aidIndex = -1
        activityIndex = -1
        for i, col in enumerate(colNames):
            if col.text == "AID":
                aidIndex = i
            elif col.text == "Activity Value [uM]":
                scoreIndex = i
            elif col.text == "Bioactivity Outcome":
                activityIndex = i
            elif col.text == "Target GeneID":
                geneIndex = i
        for row in soup.find_all("row"):
            values= row.find_all("cell")
            if values[geneIndex].text !=  '':
                score.append(values[scoreIndex].text)
                gene.append(values[geneIndex].text)
                aid.append(values[aidIndex].text)
                act.append(values[activityIndex].text)
    except Exception as e: 
        pass
    return(aid, act, gene, score)


In [9]:
def collectDrugActiveTar(cid):
    (aid, act, gene, score) = retrieveBioassayInfo(cid)
    activeIndices = [i for i, x in enumerate(act) if x == "Active"]
    cidList = [cid]*len(activeIndices)
    df = pd.DataFrame({'cid':cidList,
                   'aid':[x for i, x in enumerate(aid) if i in activeIndices],
                    'score':[x for i, x in enumerate(score) if i in activeIndices], 
                    'entrezID':[x for i, x in enumerate(gene) if i in activeIndices]})
    return(df)
    

In [29]:
def createFullTargetDF(drugData):
    """
    This method creates a full df with all pubchem targets
    :param: a drug data frame with one column listed with a drug's cid and a second with drug id
    :return: data frame with all bioassay target information
    """
    fullTargetDF = pd.DataFrame(columns=['cid','aid','score',
                                         'entrezID', "drug_id"])
    for i, row in tqdm(drugData.iterrows(), total=drugData.shape[0]):
        ##Collect drug targets 
        pubChemDF = collectDrugActiveTar(row["PubCHEM"])
        
        ##get drug id information
        nameDF = pd.DataFrame({"drug_id": [row["drug_id"]]*pubChemDF.shape[0]})
        
        ##combine all drug information
        drugDF = pd.concat([pubChemDF,nameDF],axis=1)
        
        fullTargetDF = pd.concat([fullTargetDF, drugDF])
    ##write out
    return(fullTargetDF)
    
 

In [30]:
##Make a data frame with one column called "PubCHEM" and other named "drug_id"
d = {'PubCHEM': ["123631"], 'drug_id': ["Gefitinib"]}
df = pd.DataFrame(data=d)
createFullTargetDF(df)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


100%|██████████| 1/1 [00:06<00:00,  7.00s/it]


Unnamed: 0,aid,cid,drug_id,entrezID,score
0,393,123631,Gefitinib,5292,
1,514,123631,Gefitinib,9748,
2,1433,123631,Gefitinib,1956,0.002
3,1433,123631,Gefitinib,285220,0.59
4,1433,123631,Gefitinib,2322,1.1
5,1433,123631,Gefitinib,2322,1
6,1433,123631,Gefitinib,2322,3
7,1433,123631,Gefitinib,2444,2
8,1433,123631,Gefitinib,6793,0.47
9,1433,123631,Gefitinib,2872,1.2
