<a href="https://colab.research.google.com/github/kalyani234/drug_dissertation/blob/main/Alllinks_Proteinsequences_fetch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import requests
import pandas as pd
from tqdm import tqdm
from google.colab import files
import time



# Load protein IDs from the uploaded protein_ids.csv file
protein_ids = pd.read_csv("protein_ids.csv")  # Adjust file name if needed
# Blank cells are read as NaN and numeric-looking IDs as numbers. Drop the
# former and normalise the rest to stripped strings so every ID can be
# interpolated into a request URL and concatenated with "\n" when logged.
_id_column = protein_ids['protein_id'].dropna().astype(str).str.strip()
protein_ids_list = _id_column[_id_column != ""].tolist()

# Define functions to fetch protein sequences from different sources
def fetch_uniprot_protein(protein_id, timeout=30):
    """Fetch the FASTA record for *protein_id* from UniProt.

    Args:
        protein_id: Accession to look up; interpolated into the request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The response body when the server answers HTTP 200 with text that
        starts with a FASTA header (``>``); otherwise ``None``, including
        when the request itself fails.
    """
    url = f"https://www.uniprot.org/uniprot/{protein_id}.fasta"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat a network failure like "not found here" so the caller can
        # fall back to another source instead of aborting the whole run.
        return None
    # A 200 with an empty or non-FASTA body is not a usable record.
    if response.status_code == 200 and response.text.lstrip().startswith(">"):
        return response.text
    return None

def fetch_ncbi_protein(protein_id, timeout=30):
    """Fetch the FASTA record for *protein_id* from NCBI E-utilities (efetch).

    Args:
        protein_id: Identifier passed as the ``id`` query parameter against
            the ``protein`` database.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body when the server answers HTTP 200 with text that
        starts with a FASTA header (``>``); otherwise ``None``, including
        when the request itself fails.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        'db': 'protein',
        'id': protein_id,
        'rettype': 'fasta',
        'retmode': 'text'
    }
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException:
        # Treat a network failure like "not found here" so the caller can
        # fall back to another source instead of aborting the whole run.
        return None
    # Reject 200 responses whose body is empty or is not FASTA text.
    if response.status_code == 200 and response.text.lstrip().startswith(">"):
        return response.text
    return None

def fetch_kegg_protein(protein_id, timeout=30):
    """Fetch the amino-acid sequence for *protein_id* from the KEGG REST API.

    Args:
        protein_id: KEGG entry identifier; interpolated into the request URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body when the server answers HTTP 200 with text that
        starts with a FASTA header (``>``); otherwise ``None``, including
        when the request itself fails.
    """
    url = f"http://rest.kegg.jp/get/{protein_id}/aaseq"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat a network failure like "not found here" so the caller can
        # fall back to another source instead of aborting the whole run.
        return None
    # Reject 200 responses whose body is empty or is not FASTA text.
    if response.status_code == 200 and response.text.lstrip().startswith(">"):
        return response.text
    return None

def fetch_ensembl_protein(protein_id, timeout=30):
    """Fetch the protein sequence for *protein_id* from the Ensembl REST API.

    The endpoint is asked for plain text, so the body is treated as a bare
    sequence and wrapped into a FASTA record here.

    Args:
        protein_id: Ensembl stable ID; interpolated into the request URL and
            used as the FASTA header.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A newline-terminated FASTA record ``">{protein_id}\\n{sequence}\\n"``
        on HTTP 200 with a non-empty body; otherwise ``None``, including
        when the request itself fails.
    """
    url = f"https://rest.ensembl.org/sequence/id/{protein_id}?type=protein"
    headers = {"Content-Type": "text/plain"}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Treat a network failure like "not found here" so the caller can
        # fall back to another source instead of aborting the whole run.
        return None
    if response.status_code != 200:
        return None
    sequence = response.text.strip()
    if not sequence:
        # An empty body would otherwise become a header-only record that is
        # truthy and therefore counted as a successful fetch.
        return None
    # Terminate the record so that appending the next one to the same file
    # does not glue its header onto this sequence line.
    return f">{protein_id}\n{sequence}\n"

def fetch_pdb_protein(protein_id, timeout=30):
    """Fetch the FASTA record(s) for entry *protein_id* from RCSB PDB.

    Args:
        protein_id: PDB entry identifier; interpolated into the request URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body when the server answers HTTP 200 with text that
        starts with a FASTA header (``>``); otherwise ``None``, including
        when the request itself fails.
    """
    url = f"https://www.rcsb.org/fasta/entry/{protein_id}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat a network failure like "not found here" instead of letting
        # one bad connection abort the whole run.
        return None
    # Reject 200 responses whose body is empty or is not FASTA text.
    if response.status_code == 200 and response.text.lstrip().startswith(">"):
        return response.text
    return None

# Try fetching from all sources in order
def fetch_protein_sequence(protein_id):
    """Return the first sequence that any source yields for *protein_id*.

    Sources are queried in a fixed order (UniProt, NCBI, KEGG, Ensembl, PDB)
    and the search stops at the first truthy result. Returns ``None`` when
    every source comes back empty.
    """
    sources_in_priority_order = (
        fetch_uniprot_protein,
        fetch_ncbi_protein,
        fetch_kegg_protein,
        fetch_ensembl_protein,
        fetch_pdb_protein,
    )
    for fetch_from_source in sources_in_priority_order:
        result = fetch_from_source(protein_id)
        if result:
            return result
    return None

# Fetch sequences batch-by-batch and save results
def fetch_and_save_protein_sequences_in_batches(protein_ids, batch_size=1000, delay=0.3):
    """Fetch sequences for *protein_ids* and write them to per-batch files.

    For batch number ``k`` (1-based) two files are written in the current
    directory: ``protein_sequences_batch_{k}.fasta`` with every fetched
    record and ``missing_proteins_batch_{k}.log`` with one unfetched ID per
    line.

    Args:
        protein_ids: Iterable of protein identifiers.
        batch_size: Maximum number of IDs per batch; must be at least 1.
        delay: Seconds to sleep after each ID to respect rate limits.

    Returns:
        A ``(total_fetched, missing_proteins)`` tuple: the number of records
        written and the list of IDs for which no sequence was found.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Materialise once so any iterable (not just a list) can be sliced.
    protein_ids = list(protein_ids)
    missing_proteins = []
    total_fetched = 0

    for start in range(0, len(protein_ids), batch_size):
        batch_number = start // batch_size + 1
        batch = protein_ids[start:start + batch_size]
        print(f"Processing batch {batch_number}")

        batch_missing = []
        with open(f"protein_sequences_batch_{batch_number}.fasta", "w") as fasta_file:
            for protein_id in tqdm(batch, desc=f"Batch {batch_number}"):
                sequence = fetch_protein_sequence(protein_id)
                if sequence:
                    fasta_file.write(sequence)
                    # Guarantee a record separator: without it the next
                    # header would be appended to this sequence line.
                    if not sequence.endswith("\n"):
                        fasta_file.write("\n")
                    total_fetched += 1
                else:
                    batch_missing.append(protein_id)
                time.sleep(delay)  # Respect rate limits

        # Save missing proteins for this batch. Formatting with an f-string
        # also works when an ID is not a str.
        with open(f"missing_proteins_batch_{batch_number}.log", "w") as log_file:
            log_file.writelines(f"{protein_id}\n" for protein_id in batch_missing)

        missing_proteins.extend(batch_missing)

    return total_fetched, missing_proteins

# Run the fetching process
batch_size = 1000  # Adjust batch size if needed
fetched_count, all_missing = fetch_and_save_protein_sequences_in_batches(
    protein_ids_list, batch_size
)

# Each batch left one FASTA file and one log file behind; hand every one of
# them to files.download, batch by batch.
print("\nDownloading generated files...")
num_batches = -(-len(protein_ids_list) // batch_size)  # ceiling division
for batch_index in range(num_batches):
    batch_number = batch_index + 1
    for filename in (
        f"protein_sequences_batch_{batch_number}.fasta",
        f"missing_proteins_batch_{batch_number}.log",
    ):
        files.download(filename)

# Final summary of the run
print(f"Finished processing! Total sequences fetched: {fetched_count}")
if all_missing:
    print("Some protein IDs could not be fetched. Check the log files for details.")


Processing batch 1


Batch 1: 100%|██████████| 1000/1000 [18:41<00:00,  1.12s/it]


Processing batch 2


Batch 2: 100%|██████████| 1000/1000 [18:41<00:00,  1.12s/it]


Processing batch 3


Batch 3: 100%|██████████| 1000/1000 [18:34<00:00,  1.11s/it]


Processing batch 4


Batch 4: 100%|██████████| 1000/1000 [18:35<00:00,  1.12s/it]


Processing batch 5


Batch 5: 100%|██████████| 1000/1000 [18:31<00:00,  1.11s/it]


Processing batch 6


Batch 6: 100%|██████████| 1000/1000 [18:33<00:00,  1.11s/it]


Processing batch 7


Batch 7: 100%|██████████| 1000/1000 [18:33<00:00,  1.11s/it]


Processing batch 8


Batch 8: 100%|██████████| 1000/1000 [18:34<00:00,  1.11s/it]


Processing batch 9


Batch 9: 100%|██████████| 1000/1000 [18:34<00:00,  1.11s/it]


Processing batch 10


Batch 10: 100%|██████████| 1000/1000 [18:33<00:00,  1.11s/it]


Processing batch 11


Batch 11: 100%|██████████| 1000/1000 [18:31<00:00,  1.11s/it]


Processing batch 12


Batch 12: 100%|██████████| 1000/1000 [18:34<00:00,  1.11s/it]


Processing batch 13


Batch 13: 100%|██████████| 1000/1000 [18:32<00:00,  1.11s/it]


Processing batch 14


Batch 14: 100%|██████████| 1000/1000 [18:33<00:00,  1.11s/it]


Processing batch 15


Batch 15: 100%|██████████| 1000/1000 [18:33<00:00,  1.11s/it]


Processing batch 16


Batch 16: 100%|██████████| 1000/1000 [18:32<00:00,  1.11s/it]


Processing batch 17


Batch 17: 100%|██████████| 1000/1000 [18:31<00:00,  1.11s/it]


Processing batch 18


Batch 18: 100%|██████████| 1000/1000 [18:32<00:00,  1.11s/it]


Processing batch 19


Batch 19: 100%|██████████| 1000/1000 [18:28<00:00,  1.11s/it]


Processing batch 20


Batch 20: 100%|██████████| 102/102 [01:53<00:00,  1.11s/it]


Downloading generated files...





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Finished processing! Total sequences fetched: 19102


In [3]:
import glob


def _batch_sort_key(path):
    """Sort key that orders batch files by their numeric suffix.

    A plain alphabetical sort puts ``..._batch_10`` before ``..._batch_2``.
    Paths without a numeric suffix sort after all numbered ones, by name.
    """
    stem = path.rsplit(".", 1)[0]
    suffix = stem.rsplit("_", 1)[-1]
    if suffix.isdecimal():
        return (0, int(suffix), path)
    return (1, 0, path)


def _combine_files(pattern, destination):
    """Concatenate every file matching *pattern* into *destination*, in batch order."""
    with open(destination, "w") as outfile:
        for filename in sorted(glob.glob(pattern), key=_batch_sort_key):
            with open(filename, "r") as infile:
                content = infile.read()
            outfile.write(content)
            # Keep the last line of one file from merging with the first
            # line of the next when a file lacks a final newline.
            if content and not content.endswith("\n"):
                outfile.write("\n")


# Combine all FASTA files into one
_combine_files("protein_sequences_batch_*.fasta", "protein_sequences_all.fasta")

# Combine all log files into one
_combine_files("missing_proteins_batch_*.log", "missing_proteins_all.log")

# Download the combined files
from google.colab import files
files.download("protein_sequences_all.fasta")
files.download("missing_proteins_all.log")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>