<a href="https://colab.research.google.com/github/kattens/PubChem-Data-Handler/blob/main/Blast_Run.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recap:

We created a CSV file using the PubChem IDs from the initial `cdot_targets` CSV file. This file includes all the PubChem IDs for which we searched PubChem to identify targets that interact with them, recording these interactions in the "Target Names" column. We also included a column for "Accession IDs," which are the UniProt IDs corresponding to each target name. To streamline the process, we added a column with only gene names to enhance the accuracy of the BLAST results using the accession IDs.

# Goal:
We want to run BLAST for the target UniProt IDs in our previously created CSV file against our target organism (*Plasmodium malariae*).

In [None]:
# Install Biopython, the package that provides the `Bio` modules imported below
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


The NCBIWWW module allows for running BLAST searches online, while NCBIXML parses the XML results from these searches into a manageable format in Python.
The `SeqIO` module in Biopython is used for reading and writing sequences in various bioinformatics file formats.

In [None]:
import csv
import pandas as pd
from Bio.Blast import NCBIWWW,NCBIXML
from Bio import SeqIO
import requests

In [None]:
# Load the target-results CSV (PubChem IDs, target names, accession IDs, gene names).
# NOTE(review): assumes Google Drive is already mounted at /content/drive — no mount
# call is visible in this notebook; confirm before running on a fresh runtime.
path = '/content/drive/MyDrive/target_results.csv'
df = pd.read_csv(path)

In [None]:
# Preview the first rows to check that the CSV loaded with the expected columns
df.head()

Unnamed: 0,PubChem ID,Target Names,Accession IDs,Target Gene Name
0,5330175,"['Tyrosineprotein', 'NTMT1', 'FH', 'Chain', 'N...","['A0A0K2VLS4', 'S4R3J7', 'P07954', 'P0C023', '...","['NTMT1', 'FH', 'NSD2', 'KDR', 'GPX4', 'COMT',..."
1,5311340,"['ID4', 'ALDH1A1', 'EZH2', 'MYC', 'GLA', 'APOB...","['P47928', 'Q5SYQ8', 'Q921E6', 'A0A8A5GQJ2', '...","['ID4', 'ALDH1A1', 'EZH2', 'MYC', 'GLA', 'APOB..."
2,11511120,"['AcylCoA', 'Epidermal', 'Mitogenactivated', '...","['B2BXS0', 'Q9Z0P7', 'L8GZV5', 'P05067', 'Q9ZN...","['NADH', 'MAP', 'CYP2C9', 'NSD2', 'ERBB4', 'GP..."
3,221354,"['CYP2D6', 'lethal', 'ALDH1A1', 'RGS12', 'ALOX...","['P10635', 'A1Z198', 'Q5SYQ8', 'E9Q652', 'I3L1...","['CYP2D6', 'ALDH1A1', 'RGS12', 'ALOX15B', 'HPG..."
4,6806409,[],[],[]


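# Parameters:
`blast_sequence_from_uniprot(accession_id, taxonomy="5858")`

Note: `identity_threshold=80.0` and `e_value_threshold=1e-5` are not implemented in the function defined below; it runs BLAST with `expect=100` and does not filter hits by identity.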
    

In [None]:
def fetch_uniprot_sequence(accession_id, timeout=30):
    """
    Fetch a sequence from UniProt using the accession ID.

    Parameters:
        accession_id (str): UniProt accession ID (e.g. "P10635"). Surrounding
            whitespace is ignored.
        timeout (float): Seconds to wait for the UniProt server before giving
            up, so a stalled connection cannot hang the notebook.

    Returns:
        str or None: The body returned by the UniProt ``.fasta`` endpoint when
        the request succeeds with HTTP 200. None if the ID is missing/blank,
        the server answers with another status code, or the request raises.
    """
    # Reject missing or blank IDs up front; they would otherwise produce a
    # malformed URL and a pointless network round trip.
    cleaned_id = "" if accession_id is None else str(accession_id).strip()
    if not cleaned_id:
        print("No accession ID provided; skipping UniProt query.")
        return None

    url = f"https://rest.uniprot.org/uniprotkb/{cleaned_id}.fasta"
    try:
        # Without an explicit timeout, requests waits indefinitely.
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        else:
            print(f"UniProt query failed for {cleaned_id} with status code: {response.status_code}")
            return None
    except Exception as e:
        # Best-effort by design: report the failure and let the caller skip this ID.
        print(f"Error querying UniProt for {cleaned_id}: {e}")
        return None

def blast_sequence_from_uniprot(accession_id, taxonomy="5858"):
    """
    Fetch a sequence from UniProt, run BLAST with custom settings, and print results.

    Parameters:
        accession_id (str): UniProt accession ID whose sequence is used as the
            BLAST query.
        taxonomy (str): NCBI taxonomy ID used to restrict hits through the
            Entrez query ``txid<taxonomy>[ORGN]``.

    Returns:
        None. The top 5 hits (first HSP of each) are printed; nothing is
        printed beyond a notice when the sequence or BLAST hits are missing.
    """
    print(f"Fetching sequence for Accession ID: {accession_id} from UniProt...")
    sequence_data = fetch_uniprot_sequence(accession_id)
    if not sequence_data:
        print(f"Sequence for Accession ID {accession_id} could not be found in UniProt.")
        return

    entrez_query = f"txid{taxonomy}[ORGN]"
    print(f"Running BLAST for Accession ID: {accession_id}...")
    result_handle = NCBIWWW.qblast("blastp", "nr", sequence_data,
                                   expect=100, matrix_name="BLOSUM62",
                                   alignments=250, hitlist_size=250,
                                   filter="F", gapcosts="11 1",
                                   entrez_query=entrez_query)

    # Always release the result handle, even if the XML cannot be parsed.
    try:
        blast_record = NCBIXML.read(result_handle)
    finally:
        result_handle.close()

    if not blast_record.alignments:
        print(f"No BLAST results found for Accession ID: {accession_id}")
        return

    print(f"Results for Accession ID: {accession_id}")
    for alignment in blast_record.alignments[:5]:  # Top 5 hits similar to the web interface
        print(f"  Hit ID: {alignment.hit_id}")
        print(f"  Hit Description: {alignment.hit_def}")
        for hsp in alignment.hsps[:1]:  # Only the first HSP per alignment
            print(f"    E-value: {hsp.expect}")
            print(f"    Score: {hsp.score}")
            # Only the first 50 aligned characters are shown to keep output compact.
            print(f"    Query Alignment: {hsp.query[:50]}...")
            print(f"    Subject Alignment: {hsp.sbjct[:50]}...")
            print("-" * 80)



In [None]:
# Example usage: BLAST UniProt accession P10635 with the function's default taxonomy filter ("5858")
blast_sequence_from_uniprot("P10635")

Fetching sequence for Accession ID: P10635 from UniProt...
Running BLAST for Accession ID: P10635...
Results for Accession ID: P10635
  Hit ID: emb|SBT80758.1|
  Hit Description: drug/metabolite transporter, putative [Plasmodium malariae]
    E-value: 5.11915
    Score: 64.0
    Query Alignment: QLAWTPVVVLNGLAAVREALVTHGEDTAD-------RPPVPI-TQILGFG...
    Subject Alignment: KIAYMPIIILSVTGAIRQAIVIIALQYTDSHNVAIIQPTIPIFTAILSYY...
--------------------------------------------------------------------------------
  Hit ID: ref|XP_028864306.1|
  Hit Description: drug/metabolite transporter, putative [Plasmodium malariae] >ref|XP_067070681.1| drug/metabolite transporter DMT1 [Plasmodium brasilianum] >gb|KAI4834792.1| drug/metabolite transporter DMT1 [Plasmodium brasilianum] >emb|SCP03351.1| drug/metabolite transporter, putative [Plasmodium malariae]
    E-value: 5.22542
    Score: 64.0
    Query Alignment: QLAWTPVVVLNGLAAVREALVTHGEDTAD-------RPPVPI-TQILGFG...
    Subject Alignment: KIAYMPIIILSVTGA

In [None]:
# Example usage: BLAST UniProt accession L8GZV5 with the function's default taxonomy filter ("5858")
blast_sequence_from_uniprot("L8GZV5")

Fetching sequence for Accession ID: L8GZV5 from UniProt...
Running BLAST for Accession ID: L8GZV5...
Results for Accession ID: L8GZV5
  Hit ID: emb|SBT71238.1|
  Hit Description: mitogen-activated protein kinase 2, putative [Plasmodium malariae]
    E-value: 1.82337e-41
    Score: 388.0
    Query Alignment: RLTDLYDLQHVIGQGAYGVVWLALDRRSGQRVAVKKIADVFGDSKEAKRT...
    Subject Alignment: RVPDNYEIKHLIGRGSYGYVYLAYDKNTNKNVAIKKVNRMFEDLIDCKRI...
--------------------------------------------------------------------------------
  Hit ID: ref|XP_028861557.1|
  Hit Description: mitogen-activated protein kinase 2, putative [Plasmodium malariae] >emb|SBS85304.1| mitogen-activated protein kinase 2, putative (MAPK2) [Plasmodium malariae] >emb|SCN12660.1| mitogen-activated protein kinase 2, putative [Plasmodium malariae]
    E-value: 3.36168e-41
    Score: 387.0
    Query Alignment: RLTDLYDLQHVIGQGAYGVVWLALDRRSGQRVAVKKIADVFGDSKEAKRT...
    Subject Alignment: RVPDNYEIKHLIGRGSYGYVYLAYDKNTNKNVAIKKVNRMFEDLIDC

In [None]:
# Example usage: BLAST UniProt accession Q9Z0P7 with the function's default taxonomy filter ("5858")
blast_sequence_from_uniprot("Q9Z0P7")

Fetching sequence for Accession ID: Q9Z0P7 from UniProt...
Running BLAST for Accession ID: Q9Z0P7...
Results for Accession ID: Q9Z0P7
  Hit ID: ref|XP_028859374.1|
  Hit Description: pre-mRNA-processing-splicing factor 8, putative [Plasmodium malariae] >ref|XP_067075240.1| pre-mRNA-processing-splicing factor 8 [Plasmodium brasilianum] >gb|KAI4840639.1| pre-mRNA-processing-splicing factor 8 [Plasmodium brasilianum] >emb|SBT86203.1| pre-mRNA-processing-splicing factor 8, putative [Plasmodium malariae]
    E-value: 10.014
    Score: 62.0
    Query Alignment: SVAPGPAAPPASGPSAPPAFASLFPPGLHAIYGECRRLYPDQPN-PLQV...
    Subject Alignment: SIPPNMHSIPPNMHSIPPNMHSI-PPNMHSIPPNMYSMPPNMPNMPLNM...
--------------------------------------------------------------------------------
  Hit ID: emb|SBT01368.1|
  Hit Description: PIR Superfamily Protein [Plasmodium malariae]
    E-value: 22.4057
    Score: 59.0
    Query Alignment: ELTFRLKRETGESAPPTWPAELMQGLARYV...
    Subject Alignment: DLRWKLDEDTKIRCPPKKPSEV

# New code:
### This version is easier to use when looping through different taxonomies, but the results are the same as with the code above.

In [None]:
from Bio import Entrez #The `Entrez` module in Biopython facilitates searching and retrieving data from NCBI databases via the Entrez online system.

def blast_with_custom_settings(accession_id, email="your_email@example.com",
                               db="nr", evalue=10, taxonomy=None):
    """
    Run BLAST for an accession ID with custom settings and print the top hits.

    Parameters:
        accession_id (str): Accession ID for the sequence to BLAST. It is sent
            to NCBI as the query string ``SP|<accession_id>``.
        email (str): Email assigned to ``Entrez.email``.
        db (str): Target database for BLAST (e.g., 'nr' for non-redundant database).
        evalue (float): E-value threshold passed to BLAST as ``expect``.
        taxonomy (str): Taxonomy ID or organism name to filter results. A purely
            numeric value becomes ``txid<ID>[ORGN]``; anything else is used as
            ``<value>[ORGN]``. No filter is applied when this is empty/None.

    Returns:
        None. Results (top 5 hits, first HSP each) are printed. Any exception
        raised while querying or parsing is caught and reported via print.
    """
    Entrez.email = email

    try:
        # Format the Entrez taxonomy filter. Only numeric IDs take the "txid"
        # prefix; prefixing an organism name would yield an invalid query.
        if taxonomy:
            taxon = str(taxonomy).strip()
            entrez_query = f"txid{taxon}[ORGN]" if taxon.isdigit() else f"{taxon}[ORGN]"
        else:
            entrez_query = None

        # Run BLAST with custom parameters on the requested database
        print(f"Running BLAST on database '{db}' with E-value threshold {evalue}...")
        result_handle = NCBIWWW.qblast("blastp", db, f"SP|{accession_id}", expect=evalue, entrez_query=entrez_query)

        # Parse BLAST results, always releasing the handle afterwards
        try:
            blast_record = NCBIXML.read(result_handle)
        finally:
            result_handle.close()

        # Check if there are alignments
        if not blast_record.alignments:
            print(f"No results found for Accession ID: {accession_id}")
            return

        # Print top hits
        print(f"Results for Accession ID: {accession_id}")
        for alignment in blast_record.alignments[:5]:  # Limit to top 5 hits
            print(f"  Hit ID: {alignment.hit_id}")
            print(f"  Hit Description: {alignment.hit_def}")
            for hsp in alignment.hsps[:1]:  # Only the first HSP per alignment
                print(f"    E-value: {hsp.expect}")
                print(f"    Score: {hsp.score}")
                print(f"    Query Alignment: {hsp.query[:50]}...")
                print(f"    Subject Alignment: {hsp.sbjct[:50]}...")
            print("-" * 80)

    except Exception as e:
        # Best-effort by design: report the failure instead of stopping the notebook.
        print(f"Error for Accession ID {accession_id}: {e}")



ModuleNotFoundError: No module named 'Bio'

In [None]:
# Example usage: BLAST P10635 against Plasmodium malariae only.
# The taxonomy ID previously used here ("5810") returned Toxoplasma gondii hits
# in the recorded output; "5858" is the ID that yields Plasmodium malariae hits
# in the blast_sequence_from_uniprot runs above.
blast_with_custom_settings(
    accession_id="P10635",
    email="kattayun.ens@gmail.com",
    db="nr",  # Use the non-redundant (nr) database
    evalue=1,
    taxonomy="5858"  # Plasmodium malariae
)


Running BLAST on database 'nr' with E-value threshold 1...
Results for Accession ID: P10635
  Hit ID: gb|KYK65665.1|
  Hit Description: cytochrome p450 superfamily protein, partial [Toxoplasma gondii TgCatPRC2]
    E-value: 1.30795e-07
    Score: 123.0
    Query Alignment: FRIPKGTTLITNLSSVLKDEAVWEKPFR-FHPEHFLDAQGHFVKPEAFLP...
    Subject Alignment: FSIPAGTVILVDNYSLTRDEVLWGQDANVFNPDRFV---GRIWQQAPWLP...
--------------------------------------------------------------------------------
  Hit ID: ref|XP_018635310.1|
  Hit Description: cytochrome p450 superfamily protein [Toxoplasma gondii ME49] >gb|EPT25706.1| cytochrome p450 superfamily protein [Toxoplasma gondii ME49] >gb|ESS35314.1| cytochrome p450 superfamily protein [Toxoplasma gondii VEG] >gb|KFG36736.1| cytochrome p450 superfamily protein [Toxoplasma gondii GAB2-2007-GAL-DOM2] >tpe|CEL77777.1| TPA: cytochrome p450, putative [Toxoplasma gondii VEG]
    E-value: 2.55111e-07
    Score: 132.0
    Query Alignment: AHMPYTTAVIHEVQRFGDIVPLGVT

### Since the base code is written and working, we want to save the data in a JSON file for each accession ID.