# Notebook to investigate the reference sequences for each accession

In [1]:
import pandas as pd

In [2]:
#
ncbi_sra_runtable = pd.read_csv('../metadata/sra_metadata_raw.csv', sep=',')

# Manually getting SRA accession IDs

In [3]:
# get list of folders within google cloud storage bucket folder that start with SRR
SRA_folders = !gsutil ls gs://orcestra-archive/rawdata/RNA/SRA/

In [4]:
# get basename of every path in SRA_folders
SRA_basenames = [x.split('/')[-2] for x in SRA_folders]
# remove all basenames that are not in the 'Run' column of the ncbi_sra_runtable
SRA_basenames = [x for x in SRA_basenames if x in ncbi_sra_runtable['Run'].values]

print("total SRA folders = " + str(len(SRA_basenames)))
print("First five are:")
print(SRA_basenames[0:5])


total SRA folders = 1019
First five are:
['SRR8615222', 'SRR8615223', 'SRR8615224', 'SRR8615225', 'SRR8615226']


# getting SRA accessions from snakemake

In [7]:
# create a dictionary of SRA basenames and their corresponding RefSeq accession numbers
import concurrent.futures
# Create a ThreadPoolExecutor with the desired number of threads (e.g., 4)
executor = concurrent.futures.ThreadPoolExecutor(max_workers=6)

SRA_to_RefSeq = {}

def process_sra_acc(sra_acc):
    if sra_acc in SRA_basenames:
        x = !align-info {sra_acc}
        acc_refs = {}
        for i in range(len(x)):
            if 'refseq' in x[i]:
                file_name = x[i].split(',')[0]
                file_url  = x[i].split(',')[-1].split('::')[-1]
                acc_refs[file_name] = file_url
        SRA_to_RefSeq[sra_acc] = acc_refs           
        return SRA_to_RefSeq[sra_acc]
# Submit tasks to the executor for each Run entry in ncbi_sra_runtable
# for i in range(len(SRA_basenames)):
print(len(SRA_basenames))
for i in range(2):
    sra_acc = SRA_basenames[i]
    executor.submit(process_sra_acc, sra_acc)

# Shutdown the executor and wait for all tasks to complete
executor.shutdown(wait=True)

1019


In [48]:
# write the dictionary to a json file for later use
import json
with open('../metadata/SRA_to_RefSeq.json', 'w') as fp:
    json.dump(SRA_to_RefSeq, fp)
    
# read the json file back in to make sure it worked
with open('../metadata/SRA_to_RefSeq.json', 'r') as fp:
    SRA_to_RefSeq = json.load(fp)
    
# print the first 5 entries
print("First five entries in SRA_to_RefSeq:")
print(list(SRA_to_RefSeq.items())[0:5])


First five entries in SRA_to_RefSeq:
[('SRR8615273', {'CM000663.1': 'https://sra-download.ncbi.nlm.nih.gov/traces/refseq/CM000663.1', 'CM000664.1': 'https://sra-download.ncbi.nlm.nih.gov/traces/refseq/CM000664.1', 'CM000665.1': 'https://sra-download.ncbi.nlm.nih.gov/traces/refseq/CM000665.1', 'CM000666.1': 'https://sra-download.ncbi.nlm.nih.gov/traces/refseq/CM000666.1', 'CM000667.1': 'https://sra-download.ncbi.nlm.nih.gov/traces/refseq/CM000667.1', 'CM000668.1': 'https://sra-download.ncbi.nlm.nih.gov/traces/refseq/CM000668.1', 'CM000669.1': 'https://sra-download.ncbi.nlm.nih.gov/traces/refseq/CM000669.1', 'CM000670.1': 'https://sra-download.ncbi.nlm.nih.gov/traces/refseq/CM000670.1', 'CM000671.1': 'https://sra-download.ncbi.nlm.nih.gov/traces/refseq/CM000671.1', 'CM000672.1': 'https://sra-download.ncbi.nlm.nih.gov/traces/refseq/CM000672.1', 'CM000673.1': 'https://sra-download.ncbi.nlm.nih.gov/traces/refseq/CM000673.1', 'CM000674.1': 'https://sra-download.ncbi.nlm.nih.gov/traces/refseq

In [49]:
# Create a set to store the unique key-value pairs
# Initialize an empty dictionary to store the merged key-value pairs
merged_dict = {}

# Iterate through the outer dictionary
for outer_key, inner_dict in SRA_to_RefSeq.items():
    # Merge the inner dictionary into the merged_dict
    merged_dict.update(inner_dict)

# Print the merged dictionary
# print(merged_dict)
# unique_pairs = list(unique_pairs)
# unique_pairs[0]
cachefiles = pd.DataFrame(merged_dict.items())
cachefiles

cachefiles.to_csv('../metadata/SRA_to_RefSeq.csv', index=False, header=False)

In [50]:
unique_pairs

[('GL000220.1',
  'https://sra-download.ncbi.nlm.nih.gov/traces/refseq/GL000220.1'),
 ('GL000193.1',
  'https://sra-download.ncbi.nlm.nih.gov/traces/refseq/GL000193.1'),
 ('GL000194.1',
  'https://sra-download.ncbi.nlm.nih.gov/traces/refseq/GL000194.1'),
 ('GL000205.1',
  'https://sra-download.ncbi.nlm.nih.gov/traces/refseq/GL000205.1'),
 ('GL000206.1',
  'https://sra-download.ncbi.nlm.nih.gov/traces/refseq/GL000206.1'),
 ('GL000235.1',
  'https://sra-download.ncbi.nlm.nih.gov/traces/refseq/GL000235.1'),
 ('GL000226.1',
  'https://sra-download.ncbi.nlm.nih.gov/traces/refseq/GL000226.1'),
 ('GL000204.1',
  'https://sra-download.ncbi.nlm.nih.gov/traces/refseq/GL000204.1'),
 ('CM000667.1',
  'https://sra-download.ncbi.nlm.nih.gov/traces/refseq/CM000667.1'),
 ('CM000684.1',
  'https://sra-download.ncbi.nlm.nih.gov/traces/refseq/CM000684.1'),
 ('CM000669.1',
  'https://sra-download.ncbi.nlm.nih.gov/traces/refseq/CM000669.1'),
 ('GL000212.1',
  'https://sra-download.ncbi.nlm.nih.gov/traces/r