In [None]:
#=== Installs ===#
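# Run in a shell before starting the notebook (the import cell below also
# needs python-dotenv and pandas in addition to biopython):
# pip install biopython python-dotenv pandas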

In [None]:
#=== Imports and setup ===#
from dotenv import load_dotenv
import os
from Bio import Entrez
import time
import csv
import threading
import subprocess
import pandas as pd
import sqlite3
import re
import xml.etree.ElementTree as ET

# Location of the .env file that holds the NCBI credentials.
# The default is the original machine-specific path; set the DOTENV_PATH
# environment variable to point somewhere else without editing the notebook.
DOTENV_PATH = os.getenv(
    "DOTENV_PATH",
    "/home/jordan/Github/0.local/generic-single-cell-pipeline/.env",
)

# load_dotenv returns False when nothing was loaded (e.g. the file is missing),
# so surface that instead of failing later with confusing Entrez errors.
if not load_dotenv(dotenv_path=DOTENV_PATH):
    print(f"Warning: no environment variables were loaded from {DOTENV_PATH}")

# Set Entrez credentials
Entrez.email = os.getenv("NCBI_EMAIL")
Entrez.api_key = os.getenv("NCBI_API_KEY")

if not Entrez.email:
    print("Warning: NCBI_EMAIL is not set; Entrez requests will be sent without a contact email")
if not Entrez.api_key:
    print("Warning: NCBI_API_KEY is not set; Entrez requests will be sent without an API key")


In [None]:
#=== Fetch sequencing run IDs from SRA using Entrez NCBI API using a specific search term ===#
def fetch_sra_IDs(search_term, batch_size, max_records, verbose=True):
    """Collect SRA record IDs matching a search term via the Entrez history server.

    The search is posted once with ``usehistory="y"``; the resulting
    WebEnv/QueryKey pair is then used to page through the ID list in batches.

    Args:
        search_term: Entrez query string to run against the ``sra`` database.
        batch_size: Number of IDs to request per ``efetch`` call (must be >= 1).
        max_records: Upper bound on the number of IDs to collect.
        verbose: When True, print progress information.

    Returns:
        list[str]: The collected IDs, at most ``max_records`` long and never
        more than the number of records the search matched.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    ##
    if verbose:
        print("Called fetch_sra_IDs function with the following parameters:")
        print(f"search term: {search_term}")
        print(f"batch size: {batch_size}")
        print(f"max records to return: {max_records}")

    ## Prepare results list
    results = []

    ## Step 1: Search SRA and get WebEnv + QueryKey
    if verbose:
        print("Querying Enztrez for WebEnv and Query Key")
    handle = Entrez.esearch(db="sra", term=search_term, retmax=0, usehistory="y")
    try:
        record = Entrez.read(handle)
    finally:
        # Always release the connection, even if parsing fails
        handle.close()
    if verbose:
        print("Entrez query returned the following results:")
        print(record)

    count = int(record['Count'])
    webenv = record['WebEnv']
    query_key = record['QueryKey']
    if verbose:
        print(f"Total matching SRA records: {count}")
        print(f"webenv id: {webenv}")
        print(f"query key: {query_key}")

    ## Step 2: Page through the stored result set
    if verbose:
        print("Querying Entrez for run metadata")

    # Never ask for more records than the search actually matched; otherwise
    # every request past `count` is an empty round trip.
    total = min(count, max_records)

    # Compiled once: extracts the digits between <Id> and </Id>
    id_pattern = re.compile(r"<Id>(\d+)</Id>")

    for start in range(0, total, batch_size):
        end = min(total, start + batch_size)

        if verbose:
            print(f"Fetching records {start + 1} to {end}")

        handle = Entrez.efetch(
            db="sra",
            rettype="uilist",
            retstart=start,
            # Shrink the final batch so the result never exceeds `total`
            retmax=end - start,
            webenv=webenv,
            query_key=query_key
        )
        try:
            payload = handle.read()
        finally:
            handle.close()

        # The handle may yield bytes or text; normalise to str
        if isinstance(payload, bytes):
            payload = payload.decode("utf-8")

        # findall over the whole payload picks up every ID, including
        # several IDs that share one line
        id_list = id_pattern.findall(payload)

        if verbose:
            print(id_list)

        results.extend(id_list)

        time.sleep(1)  # Gentle pause between batches

    if verbose:
        print(f"Collected {len(results)} IDs")
        print("Returning collected IDs")
    return results


In [None]:
#=== Fetch sequencing run metadata from SRA using SRA IDs ===#
def fetch_sra_metadata(sra_ids, output_path, verbose=True):
    """Fetch the document summary for each SRA ID and extract selected fields.

    For every ID the ``docsum`` record is requested from the ``sra`` database
    and the ``ExpXml`` field of its first entry is scanned with a fixed set of
    regular expressions. IDs whose request or parsing fails are reported and
    skipped; the remaining IDs are still processed.

    Args:
        sra_ids: Sequence of SRA IDs. A single ID given as a string is treated
            as a one-element list.
        output_path: If truthy, the extracted records are also written to this
            path as a CSV file (one row per successfully processed ID). Pass
            None or "" to skip writing.
        verbose: When True, print progress information and each extracted record.

    Returns:
        list[dict]: One dict per successfully processed ID, keyed by the field
        names in ``patterns``; fields that were not found are None.
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(sra_ids, str):
        sra_ids = [sra_ids]

    ##
    if verbose:
        print("Called fetch_sra_metadata function with the following parameters:")
        print(f"number of SRA IDs: {len(sra_ids)}")
        print(f"path for output file: {output_path}")

    ## Initiate list to store results prior to output
    local_results = []

    ### Define regex patterns for retrieving metadata of interest
    patterns = {
        'Title': r"<Title>(.*?)</Title>",
        'Platform': r"<Platform[^>]*>(.*?)</Platform>",
        'PlatformInstrument': r"<Platform\s+instrument_model=\"(.*?)\">",
        'Statistics_total_runs': r"<Statistics[^>]*total_runs=\"(.*?)\"",
        'Statistics_total_spots': r"<Statistics[^>]*total_spots=\"(.*?)\"",
        'Statistics_total_bases': r"<Statistics[^>]*total_bases=\"(.*?)\"",
        'Statistics_total_size': r"<Statistics[^>]*total_size=\"(.*?)\"",
        'Submitter_acc': r"<Submitter[^>]*acc=\"(.*?)\"",
        'Submitter_center_name': r"<Submitter[^>]*center_name=\"(.*?)\"",
        'Submitter_contact_name': r"<Submitter[^>]*contact_name=\"(.*?)\"",
        'Submitter_lab_name': r"<Submitter[^>]*lab_name=\"(.*?)\"",
        'Experiment_acc': r"<Experiment[^>]*acc=\"(.*?)\"",
        'Experiment_ver': r"<Experiment[^>]*ver=\"(.*?)\"",
        'Experiment_status': r"<Experiment[^>]*status=\"(.*?)\"",
        'Experiment_name': r"<Experiment[^>]*name=\"(.*?)\"",
        'Study_acc': r"<Study[^>]*acc=\"(.*?)\"",
        'Study_name': r"<Study[^>]*name=\"(.*?)\"",
        'Organism_taxid': r"<Organism[^>]*taxid=\"(.*?)\"",
        'Organism_ScientificName': r"<Organism[^>]*ScientificName=\"(.*?)\"",
        'Sample_acc': r"<Sample[^>]*acc=\"(.*?)\"",
        'Sample_name': r"<Sample[^>]*name=\"(.*?)\"",
        'Instrument_ILLUMINA': r"<Instrument[^>]*ILLUMINA=\"(.*?)\"",
        'LibraryName': r"<LIBRARY_NAME>(.*?)</LIBRARY_NAME>",
        'LibraryStrategy': r"<LIBRARY_STRATEGY>(.*?)</LIBRARY_STRATEGY>",
        'LibrarySource': r"<LIBRARY_SOURCE>(.*?)</LIBRARY_SOURCE>",
        'LibrarySelection': r"<LIBRARY_SELECTION>(.*?)</LIBRARY_SELECTION>",
        'LibraryConstructionProtocol': r"<LIBRARY_CONSTRUCTION_PROTOCOL>(.*?)</LIBRARY_CONSTRUCTION_PROTOCOL>",
        'Bioproject': r"<Bioproject>(.*?)</Bioproject>",
        'Biosample': r"<Biosample>(.*?)</Biosample>"
    }

    #### Iterate over every ID (including the last one)
    for sra_id in sra_ids:

        if verbose:
            print(f"Querying database with SRA ID: {sra_id}")

        try:
            handle = Entrez.efetch(db="sra", id=sra_id, rettype="docsum", retmode="xml")
            try:
                summary = Entrez.read(handle)
            finally:
                # Release the connection even if parsing fails
                handle.close()

            # Stringify the XML fragment once instead of once per pattern
            exp_xml = str(summary[0]['ExpXml'])

            extracted_data = {}
            for key, pattern in patterns.items():
                match = re.search(pattern, exp_xml)
                extracted_data[key] = match.group(1) if match else None

            if verbose:
                print(extracted_data)

            local_results.append(extracted_data)

        except Exception as e:
            # Best effort: report the failing ID and continue with the rest
            print(f"Error with SRA ID {sra_id}: {e}")

        time.sleep(0.3)  # Respectful delay

    ## Optionally persist the collected records
    if output_path:
        with open(output_path, "w", newline="", encoding="utf-8") as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=list(patterns))
            writer.writeheader()
            writer.writerows(local_results)
        if verbose:
            print(f"Wrote {len(local_results)} records to {output_path}")

    return local_results

In [None]:
# Settings for the ID harvest: query string, page size, and overall cap
search_term = '"3\'"'
batch_size, max_records = 100, 100000

# Collect the IDs of all SRA records matching the query
sra_IDs = fetch_sra_IDs(search_term, batch_size, max_records, verbose=True)

# Save the IDs to disk, one per line
with open("sra_IDs.txt", "w") as file:
    file.write("".join(item + "\n" for item in sra_IDs))

In [None]:
#=== Scan SRA database for matching records ===#

# Search parameters
search_term = '"3\'"'
batch_size = 100
max_records = 1

# fetch_sra_metadata expects (sra_ids, output_path, verbose). Passing the search
# term, batch size and max_records positionally together with verbose=True
# raises "TypeError: got multiple values for argument 'verbose'", so hand it
# the IDs collected in the ID-harvest cell (capped at max_records) and an
# output path instead.
metadata_output_path = "sra_10x_gemx_results.csv"
results = fetch_sra_metadata(sra_IDs[:max_records], metadata_output_path, verbose=True)

# Step 3: Convert to DataFrame
#df = pd.DataFrame(results)
#print(f"\n✅ Collected {len(df)} records.")

# Step 4: Save to CSV
#output_file = "sra_10x_gemx_results.csv"
#df.to_csv(output_file, index=False)
#print(f"✅ Data saved to {output_file}")

# Step 5: Quick summary
#print("\nTop library strategies:")
#print(df['LibraryStrategy'].value_counts())

#print("\nTop BioProjects:")
#print(df['BioProject'].value_counts().head())
