# Choose the mitochondrial genomes to retain
We do this by first restricting to the taxa of interest, then retaining every genome explicitly specified to keep, and finally using a greedy algorithm to retain any other genome that is sufficiently different from the ones already retained.

In [None]:
import os

import Bio.SeqIO

import pandas as pd

Get variables from `snakemake`:

In [None]:
# Bind the values passed in by `snakemake` to notebook-level names.
_smk_in, _smk_out, _smk_params = snakemake.input, snakemake.output, snakemake.params

# inputs
mashes = _smk_in.mashes  # per-genome mash distance tables (tab-separated)
fasta = _smk_in.fasta  # FASTA parsed below to extract the retained genomes
taxa = _smk_in.taxa  # CSV files of per-genome taxonomy, concatenated below
info_csv = _smk_in.info_csv  # CSV read below for its `id` and `species` columns

# outputs
output_csv = _smk_out.csv
output_fasta = _smk_out.fasta

# parameters
min_mash_dist = _smk_params.min_mash_dist  # distances below this count as too similar
to_keep = _smk_params.to_keep  # mapping of common name -> species name to always retain
taxa_to_keep = _smk_params.taxa_to_keep  # mapping of taxonomic level -> names to include

Get all the taxa to keep:

In [None]:
# Read the per-genome taxonomy tables and flag every genome that belongs to at
# least one of the taxa listed in `taxa_to_keep` (taxonomic level -> names).
taxa_df = pd.concat([pd.read_csv(f) for f in taxa], ignore_index=True)

# the tables must hold exactly the two ID columns plus one column per taxonomic level
assert set(taxa_df.columns) == {"id", "taxonid", *taxa_to_keep}

taxa_df["to_keep"] = False
for taxalevel, taxalist in taxa_to_keep.items():
    # a genome is kept if it matches at *any* of the requested levels
    taxa_df["to_keep"] = taxa_df["to_keep"] | taxa_df[taxalevel].isin(taxalist)

print("Keeping this many genomes based on taxa inclusions:")
display(taxa_df.groupby("to_keep").aggregate(n=pd.NamedAgg("id", "count")))

# Only genomes flagged above are valid. Taking every ID here would silently
# disable the taxa filter for all downstream steps.
ids_of_valid_taxa = set(taxa_df.loc[taxa_df["to_keep"], "id"])

Get the IDs specified to keep:

In [None]:
# Restrict the genome metadata to the IDs that passed the taxa filter.
info = pd.read_csv(info_csv).query("id in @ids_of_valid_taxa")

id_to_species = info.set_index("id")["species"].to_dict()

valid_ids = set(info["id"])

# Resolve each species in `to_keep` (common name -> species name) to exactly one
# genome ID. Raises ValueError if a name matches no genome or cannot be resolved
# to a single genome.
retained_records = []
retained_ids = set()
for common_name, name in to_keep.items():
    # first try a prefix match on the species name
    prefix_hits = info.query("species.str.startswith(@name)")
    if prefix_hits.empty:
        raise ValueError(f"no match for {common_name=}, {name=}")
    if len(prefix_hits) == 1:
        hits = prefix_hits
    else:
        # several species share the prefix, so require an exact species match
        hits = info.query("species == @name")
        if len(hits) != 1:
            # show the prefix matches: the exact-match frame may be empty and
            # would hide which candidates caused the ambiguity
            raise ValueError(f"ambiguous multiple hits for {name=}\n{prefix_hits}")
    mito_id = hits["id"].tolist()[0]
    retained_records.append((mito_id, name, common_name))
    retained_ids.add(mito_id)

Now loop over all other IDs and use a greedy algorithm to choose which ones to keep: an ID is retained only if it is not overly similar (Mash distance below `min_mash_dist`) to any ID already retained:

In [None]:
# Greedy pass over the mash files: a genome is retained unless some genome
# that is already retained lies closer than `min_mash_dist` to it.
for mash_path in mashes:
    # the genome ID is the mash file name without directory or extension
    mito_id = os.path.splitext(os.path.basename(mash_path))[0]
    if mito_id not in ids_of_valid_taxa:
        continue
    assert mito_id in valid_ids
    if mito_id in retained_ids:
        continue

    dists = pd.read_csv(mash_path, sep="\t", skiprows=1, names=["query", "dist"])
    # derive the ID of each compared genome from its file path, as done above
    dists["id"] = [
        os.path.splitext(os.path.basename(query_path))[0]
        for query_path in dists["query"]
    ]

    # is any *other*, already-retained genome closer than the threshold?
    is_other = dists["id"] != mito_id
    is_retained = dists["id"].isin(retained_ids)
    is_close = dists["dist"] < min_mash_dist
    too_close = (is_other & is_retained & is_close).any()

    if too_close:
        continue
    retained_ids.add(mito_id)
    retained_records.append((mito_id, id_to_species[mito_id], ""))

In [None]:
# Tabulate the retained genomes (one row per record), report how many there
# are, and write the table to the output CSV.
record_columns = ["id", "species", "common_name"]
retained_df = pd.DataFrame(data=retained_records, columns=record_columns)

print(f"Retained {len(retained_df)=} mitochondrial genomes")

retained_df.to_csv(output_csv, index=False)

# last expression of the cell so the notebook displays the table
retained_df

Get FASTA file with genomes to keep:

In [None]:
# Pull the retained genomes out of the input FASTA, preserving file order.
retained_fastas = [
    record for record in Bio.SeqIO.parse(fasta, "fasta") if record.id in retained_ids
]

# sanity check: the table, the ID set, and the FASTA records must all have the same size
assert len(retained_df) == len(retained_ids) == len(retained_fastas)

_ = Bio.SeqIO.write(retained_fastas, output_fasta, "fasta")