<a href="https://colab.research.google.com/github/kattens/PubChem-Data-Handler/blob/main/Pubchem_Downloader_Phase_one.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%%capture
!pip install PubChemPy

In [3]:
import csv
import pandas as pd
import pubchempy as pcp
import os
import requests

In [4]:
# Location of the source spreadsheet.
# NOTE(review): the path is under /content/drive, presumably a mounted Google
# Drive; the mount step is not shown in this notebook, so confirm it runs first.
file_path = '/content/drive/MyDrive/cdot_actives_50 1.xlsx'

# Load the spreadsheet; later cells read its 'pubchem_cid' and 'target' columns.
df = pd.read_excel(file_path)

In [5]:
# Some rows have no PubChem CID (NaN in 'pubchem_cid').
# Keep only the rows where a CID is present.
df = df[df['pubchem_cid'].notna()]

In [6]:
# Show the pubchem_cid column on its own.
df.loc[:, 'pubchem_cid']

Unnamed: 0,pubchem_cid
0,5330175.0
1,5311340.0
2,11511120.0
3,221354.0
4,6806409.0
...,...
60,9829836.0
61,51031035.0
62,452192.0
63,30323.0


In [7]:
def _split_targets(raw_target):
    """Convert one raw ``target`` cell into a list of target names.

    Args:
        raw_target: Value read from the ``target`` column. A string holds one
            or more names separated by ``|``; any non-string value (for
            example the NaN pandas uses for an empty cell) means "no targets".

    Returns:
        list: Target names with plain spaces removed and empty pieces
        dropped; ``[]`` when ``raw_target`` is not a string.
    """
    if not isinstance(raw_target, str):
        return []
    # Clean every name the same way, whether the cell holds one target or
    # several, so 'EGFR | ERBB2' does not leave stray spaces in the names.
    pieces = (piece.strip().replace(' ', '') for piece in raw_target.split('|'))
    return [piece for piece in pieces if piece]


# The CID column is read as float (it contained NaN before cleaning); convert to int.
pubchem_ids = [int(raw_cid) for raw_cid in df['pubchem_cid'].tolist()]

# One list of target names per row, in the same row order as pubchem_ids.
pubchem_targets = [_split_targets(raw_target) for raw_target in df['target'].tolist()]

# Both lists come from the same rows, so they must line up one-to-one.
assert len(pubchem_ids) == len(pubchem_targets)

# Map each CID to its list of targets. If a CID occurs in more than one row,
# dict() keeps only the targets of its last row.
# NOTE(review): the stored output shows an entry 'ERNÂ\xa01.00', which looks
# like a mis-encoded spreadsheet cell; it is passed through unchanged here and
# should be corrected in the source data.
pubchem_dict = dict(zip(pubchem_ids, pubchem_targets))
print(pubchem_dict)


{5330175: ['SRC'], 5311340: ['OPRL1'], 11511120: ['EGFR', 'ERBB2', 'ERBB4'], 221354: ['CCR1'], 6806409: ['TP53', 'USP14'], 5329480: ['EGFR', 'ERBB2'], 12947: ['TLR7', 'TLR9'], 444810: ['MRGPRX1'], 135421339: ['BRAF'], 9939609: ['PLA2G7'], 42627755: ['ERNÂ\xa01.00'], 53464483: ['TRPV4'], 3647519: ['HNMT'], 9810709: ['TOP2A'], 6413301: ['TP53', 'USP14'], 119081415: ['CDK7'], 5311382: ['EGFR', 'FGFR1', 'PDGFRB', 'PKMYT1', 'SRC', 'WEE1'], 53315868: ['EHMT2'], 10219: ['RPS2'], 9914412: ['AURKA', 'AURKB'], 2993: ['KCNN1', 'KCNN3'], 24756910: ['EGFR', 'ERBB2'], 6918097: ['ADORA3'], 4534086: ['SLC8A1', 'TRPC3', 'TRPC5', 'TRPC6'], 73416445: ['ATP1A1'], 132928: ['BDKRB2'], 5281035: ['AR'], 121750: ['TYMS'], 9852185: ['BCL2'], 51000408: ['ATR'], 73602827: ['CDK7'], 3499: ['PRKCA', 'PRKCB', 'PRKCD', 'PRKCG', 'PRKCZ'], 9809926: ['CACNA2D1'], 41867: ['CHD1', 'TOP2A'], 6918837: ['HDAC1', 'HDAC2', 'HDAC3', 'HDAC4', 'HDAC6', 'HDAC7', 'HDAC8', 'HDAC9'], 24858111: ['DNMT1', 'DNMT3A', 'DNMT3B'], 33630: ['

# Now we have a dictionary of the ids and targets.


In [8]:
# Print one CID per line (nothing is printed for an empty list).
if pubchem_ids:
    print(*pubchem_ids, sep='\n')

5330175
5311340
11511120
221354
6806409
5329480
12947
444810
135421339
9939609
42627755
53464483
3647519
9810709
6413301
119081415
5311382
53315868
10219
9914412
2993
24756910
6918097
4534086
73416445
132928
5281035
121750
9852185
51000408
73602827
3499
9809926
41867
6918837
24858111
33630
2798
441074
4735
3034034
676352
49855250
154257
24795070
11978790
439530
6445562
9829526
5583
446536
46848036
9549305
25150857
53315882
6197
51358113
16007391
9829836
51031035
452192
30323
11785878


In [9]:
# Next step: download each CID's biological test results (CSV files) from the
# PubChem website.
# We tried to use the PUG API, but it wasn't downloading the correct files,
# so we did it manually.

# Folder to save the CSV files in:
folder_path = '/content/drive/MyDrive/IDS_Target_Result'

def fetch_bioassay_results(pubchem_ids, folder_path, timeout=60):
    """Download the PubChem bioactivity table of every CID as a CSV file.

    For each CID, one GET request is sent to PubChem's ``sdqagent.cgi``
    endpoint. On HTTP 200 the response body is written to
    ``<folder_path>/<cid>.csv``. A CID whose request fails (non-200 status,
    timeout, connection error) is reported with ``print`` and skipped, so a
    single bad request does not abort the remaining downloads.

    Args:
        pubchem_ids: Iterable of PubChem compound IDs (CIDs).
        folder_path: Directory to write the CSV files into; created if missing.
        timeout: Value passed to ``requests.get`` as its ``timeout`` (seconds).
            Without it a stalled connection would block the loop indefinitely.

    Returns:
        None. Progress and failures are reported on standard output.
    """
    # Ensure the folder exists
    os.makedirs(folder_path, exist_ok=True)

    for cid in pubchem_ids:
        # Same query for every CID; only the CID itself is substituted.
        url = f"https://pubchem.ncbi.nlm.nih.gov/sdq/sdqagent.cgi?infmt=json&outfmt=csv&query={{%22download%22:%22*%22,%22collection%22:%22bioactivity%22,%22order%22:[%22acvalue,asc%22],%22start%22:1,%22limit%22:10000000,%22downloadfilename%22:%22pubchem_cid_{cid}_bioactivity%22,%22nullatbottom%22:1,%22where%22:{{%22ands%22:[{{%22cid%22:%22{cid}%22}}]}}}}"

        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as error:
            # Network-level failure: report it and move on to the next CID.
            print(f"Failed to retrieve bioassay data for CID {cid}. Request error: {error}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve bioassay data for CID {cid}. HTTP Status Code: {response.status_code}")
            continue

        # Write the raw response bytes to <folder_path>/<cid>.csv
        csv_path = os.path.join(folder_path, f"{cid}.csv")
        with open(csv_path, 'wb') as csv_file:
            csv_file.write(response.content)
        print(f"Bioassay data for CID {cid} has been downloaded and saved to {csv_path}")




In [None]:
# Download one bioactivity CSV per CID into the results folder (needs network access).
fetch_bioassay_results(pubchem_ids=pubchem_ids, folder_path=folder_path)

#Now that we have all the CSV files, we should extract their targets and create the dictionary of the new IDs and targets.

In [12]:
#open csv and get the column names
new_df = pd.read_csv('/content/drive/MyDrive/IDS_Target_Result/10219.csv')
print(new_df.columns)

Index([' baid', 'acvalue', 'aid', 'sid', 'cid', 'geneid', 'pmid', 'aidtype',
       'aidmdate', 'hasdrc', 'rnai', 'activity', 'protacxn', 'acname',
       'acqualifier', 'aidsrcname', 'aidname', 'cmpdname', 'targetname',
       'targeturl', 'ecs', 'repacxn', 'taxid', 'cellids', 'targettaxid',
       'anatomyid', 'anatomy', 'dois', 'pmcids', 'pclids', 'citations'],
      dtype='object')


In [13]:
#create an empty dictionary
bioessay_dict = {}

#go through all the csv files in the folder, have the name of the file as key and the column 'targetname' as value
for file in os.listdir(folder_path):
  if file.endswith('.csv'):
    file_path = os.path.join(folder_path , file)
    df = pd.read_csv(file_path)
    bioessay_dict[file] = df['targetname'].tolist()


#As shown here:
PubChem CID 3034034 has many target entries.
We may run into cases like this, but the number of unique targets is small.

In [19]:
# Inspect one entry of bioessay_dict: the file for CID 3034034.
# It is looked up by key, not by position, because the position of an entry
# depends on the order in which the files were listed; this also guarantees
# that the counts below describe the same entry that is printed.
example_key = '3034034.csv'
example_targets = bioessay_dict[example_key]
print((example_key, example_targets))

# Number of target entries in the list
print(len(example_targets))

# Number of distinct targets, and the distinct targets themselves
unique_targets = set(example_targets)
print(len(unique_targets))
print(list(unique_targets))

('3034034.csv', ['Plasmodium falciparum (malaria parasite P. falciparum)', 'Plasmodium falciparum (malaria parasite P. falciparum)', 'Plasmodium falciparum (malaria parasite P. falciparum)', 'CYP2D6 - cytochrome P450 family 2 subfamily D member 6 (gene/pseudogene) (human)', 'Plasmodium falciparum (malaria parasite P. falciparum)', 'Plasmodium falciparum (malaria parasite P. falciparum)', 'Plasmodium falciparum (malaria parasite P. falciparum)', 'Plasmodium falciparum (malaria parasite P. falciparum)', 'Plasmodium falciparum (malaria parasite P. falciparum)', 'Plasmodium falciparum (malaria parasite P. falciparum)', 'Plasmodium falciparum (malaria parasite P. falciparum)', 'Plasmodium falciparum (malaria parasite P. falciparum)', 'Plasmodium falciparum (malaria parasite P. falciparum)', 'Plasmodium falciparum (malaria parasite P. falciparum)', 'Plasmodium falciparum (malaria parasite P. falciparum)', 'Plasmodium falciparum (malaria parasite P. falciparum)', 'Plasmodium falciparum (malar

In [None]:
# Reduce every value in bioessay_dict to its unique elements.
# dict.fromkeys() removes duplicates like set() does, but keeps the first-seen
# order, so the resulting lists are the same on every run. Re-running this
# cell is harmless: already-unique lists stay unchanged.
bioessay_dict = {
    file_name: list(dict.fromkeys(targets))
    for file_name, targets in bioessay_dict.items()
}

# Push the folder of the IDs' target files to GitHub:

#How to do the next step:
Make a folder named after each ID (the key in the dict), then download the UniProt sequence files (the first entry in the search query) for all of that ID's values and put them in the folder. In the end we should have 63 folders, each containing many UniProt sequences.