In [1]:
!pip install biopython


Defaulting to user installation because normal site-packages is not writeable


In [1]:
import pandas as pd
import time
from Bio import Entrez

def fetch_snp_info(rsid):
    try:
        Entrez.email = "ilcardenas26@gmail.com"  # Replace with your email
        handle = Entrez.efetch(db="snp", id=rsid, rettype="docset", retmode="text")
        time.sleep(3)  # Be mindful of API rate limits
        return handle.read()
    except Exception as e:
        print(f"Error fetching data for {rsid}: {e}")
        return None

# Read your file with rsIDs
file_path = "AncestryDNA.txt"
df = pd.read_csv(file_path, delimiter="\t")

# Ensure 'snp_info' column exists
if 'snp_info' not in df.columns:
    df['snp_info'] = None

# Fetch and save data row by row
output_file = "snp_info_output.csv"


In [5]:
for index, row in df.iterrows():
    if pd.isna(row['snp_info']):  # Skip already fetched rows
        for row2 in df.iterrows():
            if row2[4].isna(): 
                rsid = row['rsid']
                print(f"Fetching data for {rsid}...")
                snp_info = fetch_snp_info(rsid)
                df.at[index, 'snp_info'] = snp_info
                # Save after each fetch
                df.to_csv(output_file, index=False)
                print(f"Saved progress after {rsid}.")

IndexError: tuple index out of range

In [6]:
import csv
import time
from Bio import Entrez

def fetch_snp_info(rsid):
    try:
        Entrez.email = "ilcardenas26@gmail.com"
        handle = Entrez.efetch(db="snp", id=rsid, rettype="docset", retmode="text")
        time.sleep(3)
        return handle.read()
    except Exception as e:
        print(f"Error fetching data for {rsid}: {e}")
        return None

# Input and output files
input_file = "AncestryDNA.txt"
output_file = "snp_info_output.csv"

# Process line by line
with open(input_file, "r") as infile, open(output_file, "a", newline="") as outfile:
    reader = csv.DictReader(infile, delimiter="\t")
    fieldnames = reader.fieldnames + ["snp_info"]
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)

    # Write header if output file is empty
    if outfile.tell() == 0:
        writer.writeheader()

    for row in reader:
        if "snp_info" not in row or not row["snp_info"]:  # Skip if already fetched
            print(f"Fetching data for {row['rsid']}...")
            row["snp_info"] = fetch_snp_info(row["rsid"])
            writer.writerow(row)
            print(f"Saved {row['rsid']} to file.")

Fetching data for rs3131972...
Saved rs3131972 to file.
Fetching data for rs114525117...
Saved rs114525117 to file.
Fetching data for rs4040617...
Saved rs4040617 to file.
Fetching data for rs141175086...
Saved rs141175086 to file.
Fetching data for rs115093905...
Saved rs115093905 to file.
Fetching data for rs11240777...
Saved rs11240777 to file.
Fetching data for rs6681049...
Saved rs6681049 to file.
Fetching data for rs4422948...
Saved rs4422948 to file.
Fetching data for rs57494724...
Saved rs57494724 to file.
Fetching data for rs4475691...
Saved rs4475691 to file.
Fetching data for rs6657440...
Saved rs6657440 to file.
Fetching data for rs4970461...
Saved rs4970461 to file.
Fetching data for rs7537756...
Saved rs7537756 to file.
Fetching data for rs13302982...
Saved rs13302982 to file.
Fetching data for rs2880024...
Saved rs2880024 to file.
Fetching data for rs74047407...
Saved rs74047407 to file.
Fetching data for rs1110052...
Saved rs1110052 to file.
Fetching data for rs7523549.

KeyboardInterrupt: 

In [5]:
import sqlite3
import pandas as pd
import time
from Bio import Entrez
import multiprocessing
from multiprocessing import Pool, cpu_count
from tqdm import tqdm


def fetch_snp_info(rsid):
    """
    Fetch SNP info using NCBI Entrez API for a given rsID.
    """
    try:
        Entrez.email = "ilcardenas26@gmail.com"
        # print(f"Processing rsID: {rsid}")  # Log the rsID being processed
        handle = Entrez.efetch(db="snp", id=rsid, rettype="docset", retmode="text")
        time.sleep(1)  # Respect API rate limits
        return rsid, handle.read()

    except Exception as e:
        print(f"Error fetching data for {rsid}: {e}")
        if "Error 400" in e:
            return rsid, "No data found in SNP database."
        else: 
            return rsid, None


def process_chunk(rsid_chunk):
    """
    Process a chunk of rsIDs by fetching SNP info.
    """
    results = []
    for rsid in rsid_chunk:
        results.append(fetch_snp_info(rsid))
    return results


def update_database(db_file, results):
    """
    Update the SQLite database with the fetched SNP info.
    """
    conn = sqlite3.connect(db_file)
    print("Connected to DB")
    cursor = conn.cursor()
    print("created cursor")
    for rsid, snp_info in results:
        print(rsid, snp_info)
        cursor.execute("UPDATE snp_data SET snp_info = ? WHERE rsid = ?", (snp_info, rsid))
        print('Item updated')
    conn.commit()
    conn.close()


def main():
    # Input and database setup
    input_file = "AncestryDNA.txt"
    db_file = "snp_data.db"

    # Create SQLite database and load data if not already present
    # conn = sqlite3.connect(db_file)
    # df = pd.read_csv(input_file, delimiter="\t")
    # df.to_sql("snp_data", conn, if_exists="replace", index=False)
    
    # # Add the 'snp_info' column if it doesn't exist
    # if 'snp_info' not in df.columns:
    #     df['snp_info'] = None

    # # Save to the database
    # df.to_sql("snp_data", conn, if_exists="replace", index=False)
    # conn.close()
    

    # Fetch all rsIDs with missing SNP info
    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()

    # cursor.execute(("CREATE INDEX indexing ON snp_data (rsid);"))
    cursor.execute("SELECT rsid FROM snp_data WHERE snp_info IS NULL")
    rsid_list = [row[0] for row in cursor.fetchall()]
    print("RSID List created")
    print (rsid_list[:5])
    conn.close()

    # Divide rsIDs into chunks for multiprocessing
    
    print("Starting chunking")
    num_processes = min(cpu_count(), len(rsid_list))  # Use as many processes as available cores
    # chunk_size = len(rsid_list) // num_processes + 1
    chunk_size = 10
   
    rsid_chunks = [rsid_list[i:i + chunk_size] for i in range(0, len(rsid_list), chunk_size)]
    print("Chunks: ", len(rsid_chunks))

    print("Finished Chuncking")
    return rsid_chunks, rsid_list, num_processes
if __name__ == "__main__":
    rsid_chuncks, rsid_list, num_procesess = main()  

RSID List created
['rs2274328', 'rs2274330', 'rs17032907', 'rs3765968', 'rs3765966']
Starting chunking
Chunks:  67500
Finished Chuncking


In [None]:
rsid_chuncks, rsid_list, num_procesess 

In [6]:
db_file = "snp_data.db"
def printtest(id):
        print(id)
        
conn = sqlite3.connect(db_file)
print("Connected to DB")
cursor = conn.cursor()
print("created cursor")


             
with Pool(12) as pool:
        for segment in rsid_chuncks:
                for i in tqdm(range(0,len(rsid_chuncks)), total=len(rsid_chuncks), desc="Progress"):              
                        for item in segment:
                                rsid, snp_info = fetch_snp_info(item)
                                # item = fetch_snp_info(row["rsid"])
                                cursor.execute("UPDATE snp_data SET snp_info = ? WHERE rsid = ?", (snp_info, rsid))
                                conn.commit()
                               
conn.close()              

    # # Save the final updated data to CSV
    # conn = sqlite3.connect(db_file)
    # df_updated = pd.read_sql("SELECT * FROM snp_data", conn)
    # # df_updated.to_csv("snp_info_output.csv", index=False)
    # conn.close()

print("Processing complete. Data saved to snp_info_output.csv.")

Connected to DB
created cursor


Progress:   0%|          | 1/67500 [00:24<453:36:19, 24.19s/it]

Error fetching data for rs2274328: HTTP Error 400: Bad Request


Progress:   0%|          | 17/67500 [06:27<428:21:56, 22.85s/it]

Error fetching data for rs2274330: HTTP Error 400: Bad Request


Progress:   0%|          | 25/67500 [09:32<435:13:36, 23.22s/it]

Error fetching data for rs3765968: HTTP Error 400: Bad Request


Progress:   0%|          | 35/67500 [13:16<430:15:58, 22.96s/it]

Error fetching data for rs12021597: HTTP Error 400: Bad Request


Progress:   0%|          | 61/67500 [23:11<425:11:54, 22.70s/it]

Error fetching data for rs17032912: HTTP Error 400: Bad Request


Progress:   0%|          | 71/67500 [27:02<437:48:24, 23.37s/it]

Error fetching data for rs2274330: HTTP Error 400: Bad Request


Progress:   0%|          | 97/67500 [36:57<428:55:16, 22.91s/it]

Error fetching data for rs2274330: HTTP Error 400: Bad Request


Progress:   0%|          | 107/67500 [41:26<592:27:21, 31.65s/it]

Error fetching data for rs3765968: HTTP Error 400: Bad Request


Progress:   0%|          | 137/67500 [53:21<429:33:04, 22.96s/it]

Error fetching data for rs3765968: HTTP Error 400: Bad Request


Progress:   0%|          | 160/67500 [1:02:30<414:41:27, 22.17s/it]

Error fetching data for rs2274332: HTTP Error 400: Bad Request


Progress:   0%|          | 176/67500 [1:08:25<412:13:40, 22.04s/it]

Error fetching data for rs2274332: HTTP Error 400: Bad Request


Progress:   0%|          | 177/67500 [1:08:46<408:49:58, 21.86s/it]

Error fetching data for rs3765968: HTTP Error 400: Bad Request


Progress:   0%|          | 181/67500 [1:10:18<422:53:23, 22.61s/it]

Error fetching data for rs17389460: HTTP Error 400: Bad Request


Progress:   0%|          | 189/67500 [1:13:23<419:48:05, 22.45s/it]

Error fetching data for rs2274332: HTTP Error 400: Bad Request


Progress:   0%|          | 203/67500 [1:18:38<425:34:47, 22.77s/it]

Error fetching data for rs3765968: HTTP Error 400: Bad Request


Progress:   0%|          | 211/67500 [1:21:35<412:26:49, 22.07s/it]

Error fetching data for rs2274330: HTTP Error 400: Bad Request


Progress:   0%|          | 263/67500 [1:41:36<450:21:19, 24.11s/it]

Error fetching data for rs17032907: HTTP Error 400: Bad Request


Progress:   0%|          | 270/67500 [1:44:13<422:36:30, 22.63s/it]

Error fetching data for rs17389460: HTTP Error 400: Bad Request


Progress:   0%|          | 283/67500 [1:49:06<420:00:31, 22.49s/it]

Error fetching data for rs2274328: HTTP Error 400: Bad Request


Progress:   0%|          | 287/67500 [1:50:33<409:29:10, 21.93s/it]

Error fetching data for rs17032907: HTTP Error 400: Bad Request


Progress:   0%|          | 308/67500 [1:58:25<408:08:39, 21.87s/it]

Error fetching data for rs3765968: HTTP Error 400: Bad Request


Progress:   0%|          | 317/67500 [2:01:44<414:11:42, 22.19s/it]

Error fetching data for rs3765966: HTTP Error 400: Bad Request
Error fetching data for rs17389460: HTTP Error 400: Bad Request


Progress:   1%|          | 352/67500 [2:14:47<424:38:46, 22.77s/it]

Error fetching data for rs3765966: HTTP Error 400: Bad Request


Progress:   1%|          | 375/67500 [2:23:24<419:43:25, 22.51s/it]

Error fetching data for rs3765968: HTTP Error 400: Bad Request


Progress:   1%|          | 430/67500 [2:44:27<417:50:43, 22.43s/it]

Error fetching data for rs17032907: HTTP Error 400: Bad Request


Progress:   1%|          | 470/67500 [2:59:26<418:27:22, 22.47s/it]

Error fetching data for rs2274328: HTTP Error 400: Bad Request


Progress:   1%|          | 582/67500 [3:40:54<402:40:56, 21.66s/it]

Error fetching data for rs2274332: HTTP Error 400: Bad Request


Progress:   1%|          | 628/67500 [3:57:55<414:07:15, 22.29s/it]

Error fetching data for rs3765966: HTTP Error 400: Bad Request


Progress:   1%|          | 630/67500 [3:58:40<416:04:39, 22.40s/it]

Error fetching data for rs2274330: HTTP Error 400: Bad Request


Progress:   1%|          | 757/67500 [4:45:27<429:25:39, 23.16s/it]

Error fetching data for rs17032912: HTTP Error 400: Bad Request


Progress:   1%|          | 783/67500 [4:55:06<419:38:39, 22.64s/it]

Error fetching data for rs3765968: HTTP Error 400: Bad Request


Progress:   1%|          | 789/67500 [4:57:15<401:36:40, 21.67s/it]

Error fetching data for rs3765968: HTTP Error 400: Bad Request


Progress:   1%|          | 801/67500 [5:02:01<419:09:01, 22.62s/it]

Error fetching data for rs17389460: IncompleteRead(3520 bytes read)





TypeError: argument of type 'IncompleteRead' is not iterable