# This notebook computes the data for the other notebook to plot

## Sample type classification, ad hoc

In [None]:
def classify_organism(organism, taxonomy_hierarchy, assay_type):
    organism = organism.lower()
    assay_type = assay_type.lower()
    ranks = taxonomy_hierarchy
    
    #if 'environmental' in organism:
    #    return 'Environmental'
    if 'metagenome' in organism:
        return 'Metagenome'

    if ranks:
        if organism == 'severe acute respiratory syndrome coronavirus 2':
            return 'SARS-CoV-2'
        elif 'species' in ranks and ranks['species'] is not None and ranks['species'].lower() == 'homo sapiens':
            return 'Human'
        elif 'species' in ranks and ranks['species'] is not None and ranks['species'].lower() == 'mus musculus':
            return 'Mouse'
        elif 'superkingdom' in ranks and ranks['superkingdom'] is not None and ranks['superkingdom'] == 'Viruses':
            return 'Virome'
        elif 'class' in ranks and ranks['class'] is not None and ranks['class'] == 'Mammalia':
            return 'Mammal'
        elif 'phylum' in ranks and ranks['phylum'] is not None and ranks['phylum'] == 'Chordata':
            return 'Vertebrate'
        elif 'kingdom' in ranks and ranks['kingdom'] is not None and ranks['kingdom'] in ['Metazoa', 'Animalia']:
            return 'Invertebrate'
        elif 'kingdom' in ranks and ranks['kingdom'] is not None and ranks['kingdom'] == 'Fungi':
            return 'Fungus'
        elif 'kingdom' in ranks and ranks['kingdom'] is not None and ranks['kingdom'] in ['Viridiplantae', 'Plantae']:
            return 'Plant'
        elif 'superkingdom' in ranks and ranks['superkingdom'] is not None and ranks['superkingdom'] in ['Bacteria', 'Archaea']:
            return 'Prokaryote'

    #elif 'wgs' in assay_type or 'wga' in assay_type and 'mammal' in organism:
    #    return 'Mammal WGS'
    return 'Other'

def simplify_assay_type(assay_type):
    if assay_type == 'WGA' or assay_type == 'WGS':
        return "WGS/WGA"
    elif "RNA" in assay_type:
        return "RNA-Seq"
    return "OTHER"

## Parses local NCBI taxonomy

In [73]:
import csv

# Load names.dmp and nodes.dmp
def load_taxonomy_data():
    names = {}
    nodes = {}

    # Read names.dmp
    with open("taxonomy/names.dmp", "r") as f:
        reader = csv.reader(f, delimiter="|", quoting=csv.QUOTE_NONE)
        for row in reader:
            tax_id = row[0].strip()
            name = row[1].strip()
            name_class = row[3].strip()
            if name_class == "scientific name":
                names[tax_id] = name

    # Read nodes.dmp
    with open("taxonomy/nodes.dmp", "r") as f:
        reader = csv.reader(f, delimiter="|", quoting=csv.QUOTE_NONE)
        for row in reader:
            tax_id = row[0].strip()
            parent_tax_id = row[1].strip()
            rank = row[2].strip()
            nodes[tax_id] = {"parent_tax_id": parent_tax_id, "rank": rank}

    return names, nodes

# Function to get taxonomy hierarchy
def get_taxonomy_hierarchy(species_name, names, nodes):
    tax_id = None
    for k, v in names.items():
        if v.lower() == species_name.lower():
            tax_id = k
            break
    if tax_id is None:
        return f"Species '{species_name}' not found in the database"

    hierarchy = {"kingdom": None, "phylum": None, "class": None, "superkingdom": None, "species": None}
    current_id = tax_id

    while current_id != "1":  # 1 is the tax_id for the root
        if current_id not in nodes:
            break
        node = nodes[current_id]
        rank = node["rank"]
        if rank in hierarchy:
            hierarchy[rank] = names[current_id]
        current_id = node["parent_tax_id"]

    return hierarchy

def get_taxonomy_hierarchy_from_taxid(tax_id, names, nodes):
    hierarchy = {"kingdom": None, "phylum": None, "class": None, "superkingdom": None, "species": None}
    current_id = tax_id

    while current_id != "1":  # 1 is the tax_id for the root
        if current_id not in nodes:
            break
        node = nodes[current_id]
        rank = node["rank"]
        if rank in hierarchy:
            hierarchy[rank] = names[current_id]
        current_id = node["parent_tax_id"]

    return hierarchy

# Example usage
names, nodes = load_taxonomy_data()
species = "Homo sapiens"
taxonomy_hierarchy = get_taxonomy_hierarchy(species, names, nodes)
print(taxonomy_hierarchy)


Species 'Homo sapiens' not found in the database


## Counts SRA accessions

In [None]:
import csv, io
import zstandard as zstd

from collections import Counter

c = Counter()

# File path to the compressed CSV file
file_path = 'sra_taxid.csv.zst'
counter = 0
counter_bad = 0
# Open and decompress the zstd file
with open(file_path, 'rb') as compressed_file:
    dctx = zstd.ZstdDecompressor()
    with dctx.stream_reader(compressed_file) as reader:
        # Wrap the decompressed stream with a text IO wrapper to read as CSV
        text_stream = io.TextIOWrapper(reader, encoding='utf-8')
        csv_reader = csv.reader(text_stream)
        
        # Process the CSV file line by line
        for row in csv_reader:
            try:
                acc, assay_type, organism, tax_id, taxonomic_rank, scientific_name = row
            except:
                #print("bad row",row)
                counter_bad += 1
                continue
            taxonomy_hierarchy = get_taxonomy_hierarchy_from_taxid(tax_id, names, nodes)
            org_class = classify_organism(organism,taxonomy_hierarchy,assay_type)
            simplified_assay_type = simplify_assay_type(assay_type)
            debug = False
            c[(org_class,simplified_assay_type)] += 1
            if debug:
                print(acc, assay_type, organism, tax_id, taxonomic_rank, scientific_name,sep="-----")
                print(org_class)
                print(taxonomy_hierarchy)
                print(simplified_assay_type)
            counter += 1
            #if counter==100000:
            #    break
print(counter,"good rows",counter_bad,"bad rows")
print(c)

## Merge-sort-like streaming mechanism

In [74]:
import zstandard as zstd
import csv
import io

def read_zstd_file_line_by_line(file_path):
    with open(file_path, 'rb') as f:
        dctx = zstd.ZstdDecompressor()
        with dctx.stream_reader(f) as reader:
            text_stream = io.TextIOWrapper(reader, encoding='utf-8')
            csv_reader = csv.reader(text_stream)
            for row in csv_reader:
                yield row


def merge_sort_and_process(file1_path, file2_path, processing_func):
    file1_gen = read_zstd_file_line_by_line(file1_path)
    file2_gen = read_zstd_file_line_by_line(file2_path)

    file1_line = next(file1_gen, None)
    file2_line = next(file2_gen, None)

    while file1_line is not None and file2_line is not None:
        accession1 = file1_line[0]
        accession2 = file2_line[0]
        #break
        if accession1 < accession2:
            file1_line = next(file1_gen, None)
        elif accession1 > accession2:
            file2_line = next(file2_gen, None)
        else:
            processing_func(file1_line, file2_line)
            file1_line = next(file1_gen, None)
            file2_line = next(file2_gen, None)


## Now stream both the sra_taxid file and the diamond results file line by line and process accessions streamingly

In [81]:

from collections import defaultdict

# Initialize counters
palmcores_counter = defaultdict(int)
beetle_counter = defaultdict(int)
var_obelisk_counter = defaultdict(int)
var_Deltavirus_counter = defaultdict(int)
var_Osiris_counter = defaultdict(int)
papilloma_counter = defaultdict(int)

def process_lines(file1_line, file2_line):
    global palmcores_counter, beetle_counter, var_obelisk_counter, var_Deltavirus_counter, var_Osiris_counter, papilloma_counter
    #print("file1",file1_line)
    try:
        acc, palmcores, beetle, var_obelisk, var_Deltavirus, var_Osiris, papilloma = file1_line
    except:
        print("bad row file1", file1_line)
        return
    #print("file2",file2_line)
    try:
        acc2, assay_type, organism, tax_id, taxonomic_rank, scientific_name = file2_line
    except:
        print("bad row file2", file1_line)
        return
    assert(acc == acc2)
    taxonomy_hierarchy = get_taxonomy_hierarchy_from_taxid(tax_id, names, nodes)
    org_class = classify_organism(organism,taxonomy_hierarchy,assay_type)
    simplified_assay_type = simplify_assay_type(assay_type)
    
    # Increment counters
    key = (org_class, simplified_assay_type)
    palmcores_counter[key] += int(palmcores)
    beetle_counter[key] += int(beetle)
    var_obelisk_counter[key] += int(var_obelisk)
    var_Deltavirus_counter[key] += int(var_Deltavirus)
    var_Osiris_counter[key] += int(var_Osiris)
    papilloma_counter[key] += int(papilloma)

# Paths to the files
file1_path = '../results/output.sorted.txt.zst'
file2_path = '../plot/sra_taxid.csv.zst'

# Run the merge sort and process function
merge_sort_and_process(file1_path, file2_path, process_lines)

print("palmcores_counter = ", dict(palmcores_counter))
print("beetle_counter =", dict(beetle_counter))
print("Var_Obelisk_counter = ", dict(var_obelisk_counter))
print("Var_Deltavirus_counter = ", dict(var_Deltavirus_counter))
print("Var_Osiris_counter =", dict(var_Osiris_counter))
print("Papilloma_counter =", dict(papilloma_counter))


bad row file2 ['SRR19732797', '126', '926', '0', '0', '0', '5']
palmcores_counter =  {('Prokaryote', 'WGS/WGA'): 177285018, ('Human', 'OTHER'): 449625871, ('Metagenome', 'OTHER'): 272954821, ('Plant', 'WGS/WGA'): 254057206, ('Mouse', 'OTHER'): 218091520, ('Prokaryote', 'OTHER'): 18779186, ('Vertebrate', 'OTHER'): 51882542, ('Plant', 'OTHER'): 137977029, ('Mammal', 'WGS/WGA'): 215672097, ('Mouse', 'WGS/WGA'): 74027111, ('Other', 'WGS/WGA'): 51964904, ('Human', 'WGS/WGA'): 313726697, ('Vertebrate', 'WGS/WGA'): 113179229, ('Invertebrate', 'OTHER'): 43764023, ('Metagenome', 'WGS/WGA'): 964130982, ('Plant', 'RNA-Seq'): 377827837, ('Vertebrate', 'RNA-Seq'): 110250064, ('Virome', 'WGS/WGA'): 41375236, ('Human', 'RNA-Seq'): 1079140526, ('Virome', 'OTHER'): 33289174, ('Mammal', 'OTHER'): 64915529, ('Fungus', 'OTHER'): 15016661, ('Invertebrate', 'RNA-Seq'): 164568279, ('Other', 'OTHER'): 17706860, ('Invertebrate', 'WGS/WGA'): 169354895, ('Mammal', 'RNA-Seq'): 257305416, ('Other', 'RNA-Seq'): 319

## Get petabases per type

In [80]:

from collections import defaultdict

# Initialize counters
petabases_counter = defaultdict(int)
nbacc_counter = defaultdict(int)

def process_lines_Athenastats(file1_line, file2_line):
    global petabases_counter, nbacc_counter
    #print("file1",file1_line)
    try:
        acc,mbases,mbytes,avgspotlen,librarylayout,instrument = file1_line
    except:
        print("bad row file1", file1_line)
        return
    #print("file2",file2_line)
    try:
        acc2, assay_type, organism, tax_id, taxonomic_rank, scientific_name = file2_line
    except:
        print("bad row file2", file1_line)
        return
    assert(acc == acc2)
    taxonomy_hierarchy = get_taxonomy_hierarchy_from_taxid(tax_id, names, nodes)
    org_class = classify_organism(organism,taxonomy_hierarchy,assay_type)
    simplified_assay_type = simplify_assay_type(assay_type)
    
    # Increment counters
    key = (org_class, simplified_assay_type)
    petabases_counter[key] += int(mbases)
    nbacc_counter[key] += 1

# Paths to the files
file1_path = '/home/ec2-user/erc-unitigs-prod/Athena_Dec_10_public.sorted.csv.zst'
file2_path = '../plot/sra_taxid.csv.zst'

# Run the merge sort and process function
merge_sort_and_process(file1_path, file2_path, process_lines_Athenastats)

print("petabases_counter = ", dict(petabases_counter))
print("nbacc_counter =", dict(nbacc_counter))


bad row file1 ['SRR19732797', '1414', '87', '258', 'PAIRED', '\nSRR19732798"', '691', '273', '281', 'PAIRED', 'NextSeq 550']
petabases_counter =  {('Prokaryote', 'WGS/WGA'): 1337536637, ('Human', 'OTHER'): 4175010458, ('Metagenome', 'OTHER'): 698673154, ('Plant', 'WGS/WGA'): 3948094067, ('Mouse', 'OTHER'): 2099232371, ('Prokaryote', 'OTHER'): 92153348, ('Mammal', 'OTHER'): 696688033, ('Vertebrate', 'OTHER'): 452695920, ('Plant', 'OTHER'): 1293976973, ('Mammal', 'WGS/WGA'): 3108752753, ('Invertebrate', 'OTHER'): 487270872, ('Mouse', 'WGS/WGA'): 741610048, ('Other', 'WGS/WGA'): 372653970, ('Human', 'WGS/WGA'): 3437232410, ('Vertebrate', 'WGS/WGA'): 1403041500, ('Metagenome', 'WGS/WGA'): 2695259386, ('Plant', 'RNA-Seq'): 2580159244, ('Vertebrate', 'RNA-Seq'): 883797364, ('Virome', 'WGS/WGA'): 25321925, ('Human', 'RNA-Seq'): 6386799101, ('Metagenome', 'RNA-Seq'): 361678065, ('Virome', 'OTHER'): 25097506, ('Fungus', 'OTHER'): 126223433, ('Invertebrate', 'RNA-Seq'): 1112706460, ('Other', 'OT