<a href="https://colab.research.google.com/github/kattens/PubChem-Data-Handler/blob/main/Pubchem_Downloader_Phase_one.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [55]:
%%capture
!pip install PubChemPy

In [56]:
import csv
import pandas as pd
import pubchempy as pcp
import os
import requests

In [57]:
file_path = '/content/drive/MyDrive/cdot_actives_50 1.xlsx'

df = pd.read_excel(file_path)

In [58]:
#based on a experiment theres some NaN values in the column that we should remove
#remove the rows with NaN as the pubchem_cid value
df = df.dropna(subset=['pubchem_cid'])

In [59]:
#access to the pubchem_cid column
df['pubchem_cid']

Unnamed: 0,pubchem_cid
0,5330175.0
1,5311340.0
2,11511120.0
3,221354.0
4,6806409.0
...,...
60,9829836.0
61,51031035.0
62,452192.0
63,30323.0


In [60]:
#make a list
pubchem_ids = df['pubchem_cid'].tolist() #type = float
pubchem_targets = df['target'].tolist() #type = string

#convert the float to int
pubchem_ids = [int(i) for i in pubchem_ids]

#make every element in pubchem_targets a list and if see | break and make a new entry in the list
for i in range(len(pubchem_targets)):
  if isinstance(pubchem_targets[i], str): #checking if the element is a string before continuing
    if '|' in pubchem_targets[i]:
        pubchem_targets[i] = pubchem_targets[i].strip().split('|')
    else:
        #remove the '' from the entry
        pubchem_targets[i] = pubchem_targets[i].strip().replace(' ', '') #Added strip before replace
        pubchem_targets[i] = [pubchem_targets[i]]

'''
#just to make sure the values are correct to make a dict
print(len(pubchem_ids))
print(len(pubchem_targets))
print(type(pubchem_targets[2][0]))
'''

#make a dictionary such that ids are the keys and targets are the values
pubchem_dict = dict(zip(pubchem_ids , pubchem_targets))
print(pubchem_dict)


{5330175: ['SRC'], 5311340: ['OPRL1'], 11511120: ['EGFR', 'ERBB2', 'ERBB4'], 221354: ['CCR1'], 6806409: ['TP53', 'USP14'], 5329480: ['EGFR', 'ERBB2'], 12947: ['TLR7', 'TLR9'], 444810: ['MRGPRX1'], 135421339: ['BRAF'], 9939609: ['PLA2G7'], 42627755: ['ERNÂ\xa01.00'], 53464483: ['TRPV4'], 3647519: ['HNMT'], 9810709: ['TOP2A'], 6413301: ['TP53', 'USP14'], 119081415: ['CDK7'], 5311382: ['EGFR', 'FGFR1', 'PDGFRB', 'PKMYT1', 'SRC', 'WEE1'], 53315868: ['EHMT2'], 10219: ['RPS2'], 9914412: ['AURKA', 'AURKB'], 2993: ['KCNN1', 'KCNN3'], 24756910: ['EGFR', 'ERBB2'], 6918097: ['ADORA3'], 4534086: ['SLC8A1', 'TRPC3', 'TRPC5', 'TRPC6'], 73416445: ['ATP1A1'], 132928: ['BDKRB2'], 5281035: ['AR'], 121750: ['TYMS'], 9852185: ['BCL2'], 51000408: ['ATR'], 73602827: ['CDK7'], 3499: ['PRKCA', 'PRKCB', 'PRKCD', 'PRKCG', 'PRKCZ'], 9809926: ['CACNA2D1'], 41867: ['CHD1', 'TOP2A'], 6918837: ['HDAC1', 'HDAC2', 'HDAC3', 'HDAC4', 'HDAC6', 'HDAC7', 'HDAC8', 'HDAC9'], 24858111: ['DNMT1', 'DNMT3A', 'DNMT3B'], 33630: ['

# Now we have a dictionary of the ids and targets.


In [61]:
for cid in pubchem_ids:
  print(cid)

5330175
5311340
11511120
221354
6806409
5329480
12947
444810
135421339
9939609
42627755
53464483
3647519
9810709
6413301
119081415
5311382
53315868
10219
9914412
2993
24756910
6918097
4534086
73416445
132928
5281035
121750
9852185
51000408
73602827
3499
9809926
41867
6918837
24858111
33630
2798
441074
4735
3034034
676352
49855250
154257
24795070
11978790
439530
6445562
9829526
5583
446536
46848036
9549305
25150857
53315882
6197
51358113
16007391
9829836
51031035
452192
30323
11785878


In [62]:
'''
Next step is to download the ids biological test results csv files from pubchem website
We tried to use the PUG API but it wasnt downloading the correct files so we did it manually
'''

#folder to save the csv files in:
folder_path = '/content/drive/MyDrive/IDS_Target_Result'

def fetch_bioassay_results(pubchem_ids, folder_path):
    # Ensure the folder exists
    os.makedirs(folder_path, exist_ok=True)
    for cid in pubchem_ids:
        # The url, using f-string formatting -> same for all the files
        url = f"https://pubchem.ncbi.nlm.nih.gov/sdq/sdqagent.cgi?infmt=json&outfmt=csv&query={{%22download%22:%22*%22,%22collection%22:%22bioactivity%22,%22order%22:[%22acvalue,asc%22],%22start%22:1,%22limit%22:10000000,%22downloadfilename%22:%22pubchem_cid_{cid}_bioactivity%22,%22nullatbottom%22:1,%22where%22:{{%22ands%22:[{{%22cid%22:%22{cid}%22}}]}}}}"

        # Send a GET request to the API
        response = requests.get(url)

        # Check if the request was successful
        if response.status_code == 200:
            # Define the file path
            file_path = os.path.join(folder_path, f"{cid}.csv")
            # Write the content to a CSV file
            with open(file_path, 'wb') as file:
                file.write(response.content)
            print(f"Bioassay data for CID {cid} has been downloaded and saved to {file_path}")
        else:
            print(f"Failed to retrieve bioassay data for CID {cid}. HTTP Status Code: {response.status_code}")




In [63]:
#uncomment if you need to download the ids targets
#fetch_bioassay_results(pubchem_ids , folder_path)

#Now that we have all the csv files we should extract and create the dictionary for the new ids and targets.

In [115]:
#open csv and get the column names
new_df = pd.read_csv('/content/drive/MyDrive/IDS_Target_Result/10219.csv')
print(new_df.columns)

Index([' baid', 'acvalue', 'aid', 'sid', 'cid', 'geneid', 'pmid', 'aidtype',
       'aidmdate', 'hasdrc', 'rnai', 'activity', 'protacxn', 'acname',
       'acqualifier', 'aidsrcname', 'aidname', 'cmpdname', 'targetname',
       'targeturl', 'ecs', 'repacxn', 'taxid', 'cellids', 'targettaxid',
       'anatomyid', 'anatomy', 'dois', 'pmcids', 'pclids', 'citations'],
      dtype='object')


In [144]:
#create an empty dictionary
bioessay_dict = {}

#go through all the csv files in the folder, have the name of the file as key and the column 'targetname' as value
for file in os.listdir(folder_path):
  if file.endswith('.csv'):
    file_path = os.path.join(folder_path , file)
    df = pd.read_csv(file_path)
    file = file.replace('.csv', '')  # Remove .csv from the key
    #if the value is equal to nan or not a string remove it
    df = df[df['targetname'].notna()]
    #modify the bioessay_dict in a way that the values are only unique elements -> use set() function
    bioessay_dict[file] = list(set(df['targetname']))
    #remove dashes (-) from the list of the values
    for i in range(len(bioessay_dict[file])):
      if '-' in bioessay_dict[file][i]:
        bioessay_dict[file][i] = bioessay_dict[file][i].replace('-', '')
        print(bioessay_dict[file][i])  # for debugging -> its working

'''
#to see if its working
print(bioessay_dict['5330175'])
print(bioessay_dict.keys())
#print(bioessay_dict.values()[61])
print(list(bioessay_dict.values())[0])
print(len(list(bioessay_dict.values())[0]))
'''

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
CDK7  cyclin dependent kinase 7 (human)
MAPK8  mitogenactivated protein kinase 8 (human)
IKBKB  inhibitor of nuclear factor kappa B kinase subunit beta (human)
WNK3  WNK lysine deficient protein kinase 3 (human)
TEC  tec protein tyrosine kinase (human)
MINK1  misshapen like kinase 1 (human)
Chain A, ATPDEPENDENT DNA HELICASE Q1 (human)
GPX4  glutathione peroxidase 4 (human)
PTK2B  protein tyrosine kinase 2 beta (human)
PTK6  protein tyrosine kinase 6 (human)
CDK2  cyclin dependent kinase 2 (human)
MAP2K1  mitogenactivated protein kinase kinase 1 (human)
PRKCB  protein kinase C beta (human)
SYK  spleen associated tyrosine kinase (human)
MAPK3  mitogenactivated protein kinase 3 (human)
ORF1ab  ORF1a polyprotein;ORF1ab polyprotein (Severe acute respiratory syndrome coronavirus 2)
LYN  LYN protooncogene, Src family tyrosine kinase (human)
TAOK2  TAO kinase 2 (human)
TXK  TXK tyrosine kinase (human)
VDR  vitamin D receptor (hu

"\n#to see if its working \nprint(bioessay_dict['5330175'])\nprint(bioessay_dict.keys())\n#print(bioessay_dict.values()[61])\nprint(list(bioessay_dict.values())[0])\nprint(len(list(bioessay_dict.values())[0]))\n"

### **Why did we decide to remove the '-' from the target names?**
1. **Removing the Dash (`-`) Broadens the Search Scope**:
   - Correct: Removing the dash makes the query less strict, allowing UniProt to search for terms independently rather than as a single specific entity.
   - With the dash, UniProt interprets the query as a combined condition (`DRD1 - dopamine receptor D1`), which might not match any entry directly or narrows the search results significantly.

2. **Dashes in Names Indicate Context, Not the Full Name**:
   - Partially correct: While the dash *can* separate parts of the query (e.g., gene name vs. organism name), its meaning is dependent on the context. For example:
     - `"DRD1 - dopamine receptor D1 (human)"` might be interpreted as looking for a human protein explicitly labeled in that format.
     - Without the dash (`DRD1 dopamine receptor D1`), UniProt searches for "DRD1" and "dopamine receptor D1" more freely, increasing potential matches.

3. **Why Removing the Dash Expands Results**:
   - UniProt's search treats the dash (`-`) as part of the query, which could limit results to exact matches with the dash present. When you remove the dash, the system searches for terms individually, yielding broader matches.

---

### **Refined Explanation**
- If the dash exists in your query (`DRD1 - dopamine receptor D1`), UniProt interprets it as a specific, tightly coupled phrase. This might not yield results unless an entry explicitly matches this exact structure.
- Without the dash (`DRD1 dopamine receptor D1`), UniProt treats the terms as separate keywords and attempts to find matches containing any or all of them. This is why removing the dash often yields broader and more relevant results.

---

### **Source**

The behavior you've observed when searching UniProt with and without a dash (`-`) in your query stems from how UniProt's search engine interprets special characters and query syntax. While UniProt's documentation doesn't explicitly detail the impact of dashes in search queries, we can infer the following based on general search engine behaviors and available information:

**1. Special Characters in Search Queries:**
- Search engines often treat special characters like dashes as operators or delimiters. In some contexts, a dash can signify exclusion (e.g., `term1 -term2` searches for entries containing "term1" but not "term2"). However, without explicit documentation from UniProt on this behavior, it's unclear how a dash is processed in your specific query.

**2. Broadening Search Results:**
- Removing the dash from your query (`DRD1 dopamine receptor D1`) allows the search engine to interpret the terms more flexibly, potentially returning a broader set of results. This approach aligns with general search practices where simplifying queries can yield more comprehensive results.

**3. Organism Specification:**
- Including terms like "human" in your query helps specify the organism of interest. UniProt entries often include the organism name, so adding this term can refine your search to entries related to human proteins.

**Recommendations:**
- **Simplify Queries:** Use straightforward terms without special characters unless you're certain of their function in the search syntax.
- **Specify Organism:** Include the organism name (e.g., "human") to narrow down results to the species of interest.
- **Consult Documentation:** For complex queries, refer to UniProt's [Advanced Search Help](https://www.uniprot.org/help/advanced_search) for guidance on query syntax and field-specific searches.



# Summary of Changes  
We removed `.csv` from the keys, ensuring that the list of values for each key is unique, and removed dashes from the values in the target name list.

# Next Step  
1. Create a folder named after each ID (key in the dictionary).  
2. For each value in the list corresponding to a key, download the UniProt sequence file (using the first entry in the search query).  
3. Place the downloaded files into the respective folder.  

The final result should be 63 folders, each containing multiple UniProt sequence files.

In [None]:
'''

https://www.uniprot.org/uniprotkb/P21964/entry


https://www.uniprot.org/uniprotkb?query=COMT+-+catecholOmethyltransferase+%28human%29

https://www.uniprot.org/uniprotkb?query=SYK+spleen+associated+tyrosine+kinase+%28human%29

https://www.uniprot.org/uniprotkb/P61073/entry

'''

#UniProt API

In [148]:
# Step 1: Function to search UniProt and get the accession ID
def search_uniprot(query):
    """
    Search UniProt for a given query and return the accession ID of the first result.
    """
    base_search_url = "https://rest.uniprot.org/uniprotkb/search"
    search_params = {
        "query": query,
        "fields": "accession",
        "size": 1,  # Limit to the first result
    }
    response = requests.get(base_search_url, params=search_params)

    if response.status_code == 200:
        results = response.json().get("results", [])
        if results:
            accession = results[0]["primaryAccession"]
            print(f"Found entry: {accession}")
            return accession
        else:
            print("No results found.")
            return None
    else:
        print(f"Error: {response.status_code}, {response.text}")
        return None

# Step 2: Function to download the FASTA sequence using accession ID
def download_uniprot_sequence(accession, folder_path):
    """
    Download the FASTA sequence for a given accession ID and save it in the specified folder.
    """
    # Ensure the folder exists
    os.makedirs(folder_path, exist_ok=True)

    # Construct the file path
    file_path = os.path.join(folder_path, f"{accession}.fasta")

    # Base URL for UniProt sequence download
    base_url = f"https://rest.uniprot.org/uniprotkb/{accession}.fasta"
    response = requests.get(base_url)

    if response.status_code == 200:
        # Save the sequence to the specified folder
        with open(file_path, "w") as fasta_file:
            fasta_file.write(response.text)
        print(f"Sequence saved at {file_path}")
    else:
        print(f"Error: {response.status_code}, {response.text}")

# Step 3: Combined function to search and download
def get_uniprot_fasta(query, folder_path):
    """
    Search UniProt for a query, retrieve the first result's accession ID,
    and download the FASTA sequence to the specified folder.
    """
    # Search the entry
    accession = search_uniprot(query)
    if accession:
        # Download the sequence
        download_uniprot_sequence(accession, folder_path)


In [163]:
'''
This is the pipeline of making the folders and downloading the files
'''

FolderPath = '/content/drive/MyDrive/ExamplePath'

first_key, first_value = next(iter(bioessay_dict.items()))

bioessay_dict_test[first_key] = first_value

print(bioessay_dict_test)
print(len(first_value))
print(len(first_key))

{'5330175': ['S  surface glycoprotein (Severe acute respiratory syndrome coronavirus 2)', 'LCK  LCK protooncogene, Src family tyrosine kinase (human)', 'CYP2C9  cytochrome P450 family 2 subfamily C member 9 (human)', 'COMT  catecholOmethyltransferase (human)', 'HNMT  histamine Nmethyltransferase (human)', 'CIB1  calcium and integrin binding 1 (human)', 'Chain A, Calcium and integrinbinding protein 1 (human)', 'HDAC6  histone deacetylase 6 (human)', 'NNMT  nicotinamide Nmethyltransferase (human)', 'NSD2  nuclear receptor binding SET domain protein 2 (human)', 'CYP3A7  cytochrome P450 family 3 subfamily A member 7 (human)', 'Severe acute respiratory syndrome coronavirus 2', 'NTMT1  Nterminal XaaProLys Nmethyltransferase 1 (human)', 'GAMT  guanidinoacetate Nmethyltransferase (human)', 'CSK  Cterminal Src kinase (human)', 'FH  fumarate hydratase (human)', 'Rattus norvegicus (Norway rat)', 'CYP2D6  cytochrome P450 family 2 subfamily D member 6 (gene/pseudogene) (human)', 'GPT2  glutamicpyru

In [161]:
#go through the dictionary
for key in bioessay_dict_test:
    #create a folder with the key name in FolderPath
    folder_path = os.path.join(FolderPath, key)
    #create the folder
    os.makedirs(folder_path, exist_ok=True)
    #after making folder download the values in the dict
    for value in bioessay_dict[key]:
      for entry in value:
        get_uniprot_fasta(entry, folder_path)



Found entry: P10484
Sequence saved at /content/drive/MyDrive/ExamplePath/5330175/P10484.fasta
No results found.
No results found.
Found entry: P10484
Sequence saved at /content/drive/MyDrive/ExamplePath/5330175/P10484.fasta
Found entry: Q9JJT9
Sequence saved at /content/drive/MyDrive/ExamplePath/5330175/Q9JJT9.fasta
Found entry: A0A4P7VJP0
Sequence saved at /content/drive/MyDrive/ExamplePath/5330175/A0A4P7VJP0.fasta
Found entry: M9MSB2
Sequence saved at /content/drive/MyDrive/ExamplePath/5330175/M9MSB2.fasta
Found entry: Q7ZVB2
Sequence saved at /content/drive/MyDrive/ExamplePath/5330175/Q7ZVB2.fasta
Found entry: P51521
Sequence saved at /content/drive/MyDrive/ExamplePath/5330175/P51521.fasta
Found entry: Q00496
Sequence saved at /content/drive/MyDrive/ExamplePath/5330175/Q00496.fasta
No results found.
Found entry: Q60393
Sequence saved at /content/drive/MyDrive/ExamplePath/5330175/Q60393.fasta
Found entry: Q8B9Q8
Sequence saved at /content/drive/MyDrive/ExamplePath/5330175/Q8B9Q8.fast

KeyboardInterrupt: 

In [162]:
ppath = '/content/drive/MyDrive/ExamplePath/5330175'

#check how many files are ther in ppath
print(len(os.listdir(ppath)))

33
