<a href="https://colab.research.google.com/github/kalyani234/drug_dissertation/blob/main/Protein_sequence_fetch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

# Define paths
output_dir = '/content/drive/MyDrive/ColabNotebooks/drug'
protein_ids_path = os.path.join(output_dir, 'protein_ids.csv')
output_fasta_path = os.path.join(output_dir, 'protein_sequences.fasta')

# Ensure the directory exists
os.makedirs(output_dir, exist_ok=True)

# Load protein IDs from CSV file in Google Drive
protein_ids_df = pd.read_csv(protein_ids_path)
protein_ids = protein_ids_df['protein_id'].tolist()

# Fetch a single protein's FASTA sequence
def fetch_fasta(pid):
    url = f"https://www.uniprot.org/uniprot/{pid}.fasta"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        print(f"Fetched sequence for {pid}")
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching {pid}: {e}")
        return None

# Fetch all sequences in parallel and save each one directly
def fetch_all_fasta(protein_ids):
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(fetch_fasta, pid): pid for pid in protein_ids}
        with open(output_fasta_path, "a") as fasta_file:
            for future in as_completed(futures):
                fasta_data = future.result()
                if fasta_data:
                    fasta_file.write(fasta_data)

# Run the parallel fetch
fetch_all_fasta(protein_ids)
print(f"Protein sequences saved to {output_fasta_path}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Fetched sequence for Q836J0
Fetched sequence for P52700
Fetched sequence for P06276Fetched sequence for Q16539

Fetched sequence for Q16539
Fetched sequence for P0A017
Fetched sequence for Q16539
Fetched sequence for P00734
Fetched sequence for P00734
Fetched sequence for P17612
Fetched sequence for P61925
Fetched sequence for Q9NZK7
Fetched sequence for P49841
Fetched sequence for P31751
Fetched sequence for P11217Fetched sequence for P63208

Fetched sequence for Q16773
Fetched sequence for P00929
Fetched sequence for P0A2K1
Fetched sequence for P0A2K1Fetched sequence for P00929
Fetched sequence for P00929

Fetched sequence for P0A2K1
Fetched sequence for O76074
Fetched sequence for Q08499
Fetched sequence for Q13946
Fetched sequence for Q13370
Fetched sequence for O76083
Fetched sequence for O60658
Fetched sequence for O00408
Fetched sequence for P18545
Fetched sequence for Q9UNI1
Fetched sequence for Q9UNI1
Fetched seq