# Aggregate all the various counts and metadata

In [None]:
import pandas as pd

Get variables from `snakemake`:

In [None]:
# input files supplied by the Snakemake rule
smk_input = snakemake.input
tallied_count_files = smk_input.tallied_counts
mito_ref_info_csv = smk_input.mito_ref_info
metadata_csv = smk_input.metadata
read_counts_csv = smk_input.read_counts

# parameters supplied by the Snakemake rule
smk_params = snakemake.params
sars2_ref_id = smk_params.sars2_ref_id
mito_genomes_to_keep = smk_params.mito_genomes_to_keep
mito_composition_filters = smk_params.mito_composition_filters
metagenomic_descriptions = smk_params.metagenomic_descriptions

Read information on mitochondrial reference genomes:

In [None]:
# table of mitochondrial reference genomes; later cells use its `id`,
# `species`, and `common_name` columns
mito_ref_info = pd.read_csv(mito_ref_info_csv)

Read metadata on samples.
For sample titles that have multiple sample names, append the name to the title:

In [None]:
# full metadata table exactly as read from the CSV
metadata_all = pd.read_csv(metadata_csv)

# number of distinct sample names that share each sample title
names_per_title = (
    metadata_all.groupby("Sample title")["Sample name"].transform("nunique")
)

# keep the bare title when it maps to a single sample name, otherwise make it
# unique by appending the sample name
unique_sample_label = metadata_all["Sample title"].where(
    names_per_title == 1,
    metadata_all["Sample title"] + "_" + metadata_all["Sample name"],
)

# downsize to most relevant metadata to keep
kept_metadata_cols = [
    "Run accession",
    "sample",
    "Sample name",
    "Collection date",
    "description",
    "Isolation source",
]

metadata = (
    metadata_all
    .assign(sample=unique_sample_label)
    .rename(columns={"Public description": "description"})
    [kept_metadata_cols]
)

Tally all the alignment counts:

In [None]:
# Concatenate the per-file tallies and label each alignment reference as either
# the SARS-CoV-2 genome or one of the mitochondrial genomes.
tallied_counts = pd.concat(
    [pd.read_csv(f) for f in tallied_count_files],
    ignore_index=True,
).assign(
    alignment_reference_type=lambda x: x["alignment_reference"].map(
        lambda r: "SARS-CoV-2" if r == sars2_ref_id else "mitochondrial genomes"
    )
)

# make sure all alignment references are SARS2 or a known mitochondrial genome
known_references = set(mito_ref_info["id"]).union([sars2_ref_id])
unknown_references = set(tallied_counts["alignment_reference"]) - known_references
assert not unknown_references, (
    "Alignment references that are neither SARS2 nor a known mitochondrial "
    f"genome:\n{unknown_references}"
)

# make sure there are no repeated rows and that each
# (alignment reference, run) pair is tallied in exactly one row; the two
# conditions are checked separately so a failure says which one was violated
n_rows = len(tallied_counts)
n_distinct_rows = len(tallied_counts.drop_duplicates())
n_reference_run_pairs = len(
    tallied_counts.groupby(["alignment_reference", "Run accession"])
)
assert n_rows == n_distinct_rows, (
    f"Tallied counts contain duplicated rows: {n_rows} rows but only "
    f"{n_distinct_rows} distinct rows"
)
assert n_rows == n_reference_run_pairs, (
    f"Expected one row per alignment reference and run: {n_rows} rows but "
    f"{n_reference_run_pairs} (alignment_reference, Run accession) groups"
)

# only keep these types of counts
count_cols = ["n_primary", "n_primary_and_unique_best"]

# one row per run, alignment reference, and count type
tallied_counts_tidy = tallied_counts.melt(
    id_vars=["Run accession", "alignment_reference", "alignment_reference_type"],
    value_vars=count_cols,
    var_name="alignment_count_type",
    value_name="alignment_count",
)

Get the read and alignment counts:

In [None]:
# read counts table; it is merged onto the per-run alignment counts in a later
# cell, and that merge requires the merge key(s) to be unique in this table
read_counts = pd.read_csv(read_counts_csv)

## SARS-CoV-2 and mitochondrial read counts
Now get the SARS-CoV-2 and mitochondrial read counts as a percentage of all preprocessed reads.

We first do this by run:

In [None]:
# alignments summed over references for each run, reference type, and count type
run_alignment_totals = (
    tallied_counts_tidy
    .groupby(
        ["Run accession", "alignment_reference_type", "alignment_count_type"],
        as_index=False,
    )
    .aggregate({"alignment_count": "sum"})
)

# attach the read counts; the outer join keeps rows found in only one table
runs_with_read_counts = run_alignment_totals.merge(
    read_counts,
    how="outer",
    validate="many_to_one",
)

# alignments summed over reference types for each run and count type
aligned_reads_per_run = (
    runs_with_read_counts
    .groupby(["Run accession", "alignment_count_type"])["alignment_count"]
    .transform("sum")
)

runs_with_percents = runs_with_read_counts.assign(
    aligned_reads=aligned_reads_per_run,
    percent_preprocessed_reads=(
        runs_with_read_counts["alignment_count"]
        / runs_with_read_counts["preprocessed_reads"]
        * 100
    ),
    percent_aligned_reads=(
        runs_with_read_counts["alignment_count"] / aligned_reads_per_run * 100
    ),
)

sars2_mito_aligned_by_run = (
    runs_with_percents
    .rename(columns={"alignment_reference_type": "aligned_to"})
    .merge(
        metadata,
        how="outer",
        on="Run accession",
        validate="many_to_one",
    )
    # nullable integers so counts stay integral even where the outer joins
    # introduced missing values
    .astype({"aligned_reads": "Int64", "alignment_count": "Int64"})
)

sars2_mito_aligned_by_run.to_csv(
    snakemake.output["sars2_mito_aligned_by_run"],
    index=False,
    float_format="%.5g",
)

Now we do this by sample (aggregating runs for each sample) for **just the metagenomic samples**.

In [None]:
# columns that we sum across runs for each sample
sum_cols = [
    "total_reads",
    "preprocessed_reads",
    "aligned_reads",
    "alignment_count",
]

# group on every column that is not summed, not a percentage, and not the run
sample_group_cols = [
    col
    for col in sars2_mito_aligned_by_run.columns
    if not (
        col in sum_cols
        or col.startswith("percent_")
        or col == "Run accession"
    )
]

metagenomic_runs = sars2_mito_aligned_by_run.query(
    "description in @metagenomic_descriptions"
)

# dropna=False keeps groups whose grouping columns contain missing values
summed_by_sample = (
    metagenomic_runs
    .groupby(sample_group_cols, as_index=False, dropna=False)
    .aggregate(dict.fromkeys(sum_cols, "sum"))
)

# percentages are recomputed from the summed counts rather than averaged
sars2_mito_aligned_by_metagenomic_sample = summed_by_sample.assign(
    percent_preprocessed_reads=(
        summed_by_sample["alignment_count"]
        / summed_by_sample["preprocessed_reads"]
        * 100
    ),
    percent_aligned_reads=(
        summed_by_sample["alignment_count"]
        / summed_by_sample["aligned_reads"]
        * 100
    ),
)

# a sample that still has several rows for one reference type and count type
# was split by some other grouping column
dup_samples = (
    sars2_mito_aligned_by_metagenomic_sample
    .groupby(["aligned_to", "alignment_count_type", "sample"])
    .aggregate(
        n=("total_reads", "count"),
        sample_names=("Sample name", "unique"),
    )
    .query("n > 1")
)

assert len(dup_samples) == 0, f"Some duplicated sample names\n{dup_samples}"

sars2_mito_aligned_by_metagenomic_sample.to_csv(
    snakemake.output["sars2_mito_aligned_by_metagenomic_sample"],
    index=False,
    float_format="%.5g",
)

## Composition of mitochondrial reads

Do some filtering on the runs:
 - Get just the mitochondrial counts
 - Only keep runs with the metagenomic description
 - Exclude runs with too few primary read alignments (`n_primary`) to mitochondria, summed over all mitochondrial genomes

In [None]:
# alignment counts joined to the metadata of their run
counts_with_metadata = tallied_counts_tidy.merge(metadata, validate="many_to_one")

# mitochondrial counts only, for runs with a metagenomic description
mito_counts = (
    counts_with_metadata
    .query("alignment_reference_type == 'mitochondrial genomes'")
    .drop(columns="alignment_reference_type")
    .query("description in @metagenomic_descriptions")
    .astype({"alignment_count": int})
)

# primary alignments to any mitochondrial genome, totalled for each run
primary_totals_by_run = (
    mito_counts
    .query("alignment_count_type == 'n_primary'")
    .groupby("Run accession", as_index=False)
    .aggregate(total_aligned_reads=("alignment_count", "sum"))
)

# runs whose total falls below the configured minimum, with their metadata
insufficient_mito_reads = (
    primary_totals_by_run
    .sort_values("total_aligned_reads")
    .query("total_aligned_reads < @mito_composition_filters['min_alignments_run_filter']")
    .reset_index(drop=True)
    .merge(metadata, how="left")
)

print("Excluding the following metagenomic runs with insufficient aligned mitochondrial reads:")
display(insufficient_mito_reads)

is_insufficient_run = mito_counts["Run accession"].isin(
    insufficient_mito_reads["Run accession"]
)
mito_counts = mito_counts[~is_insufficient_run]

Now exclude mitochondrial reference genomes that neither reach a high enough percentage of primary read alignments in at least one run nor are specified in the list of genomes to keep:

In [None]:
primary_mito_counts = mito_counts.query("alignment_count_type == 'n_primary'")

# primary mitochondrial alignments totalled over all genomes for each run
primary_total_per_run = (
    primary_mito_counts.groupby("Run accession")["alignment_count"].transform("sum")
)

# for each genome, the maximum and mean over runs of its percentage of the
# run's primary mitochondrial alignments, annotated with the reference info
mito_max_primary_percent = (
    primary_mito_counts
    .assign(
        total=primary_total_per_run,
        percent=100 * primary_mito_counts["alignment_count"] / primary_total_per_run,
    )
    .groupby("alignment_reference", as_index=False)
    .aggregate(
        max_percent=("percent", "max"),
        avg_percent=("percent", "mean"),
    )
    .sort_values("max_percent")
    .merge(mito_ref_info.rename(columns={"id": "alignment_reference"}))
)

min_percent = mito_composition_filters["min_percent_aligned_genome_filter"]

# species-name prefixes of the genomes that are always retained
keep_prefixes = tuple(mito_genomes_to_keep.values())


def _reason_kept(row):
    """Return why the genome in `row` is retained, or 'not retained'."""
    if row["species"].startswith(keep_prefixes):
        return "specified to keep in config"
    if row["max_percent"] >= min_percent:
        return f"at least {min_percent}% in one run"
    return "not retained"


mito_to_keep = (
    mito_max_primary_percent
    .assign(reason_kept=lambda x: x.apply(_reason_kept, axis=1))
    .query("reason_kept != 'not retained'")
    .sort_values("max_percent", ascending=False)
    .reset_index(drop=True)
)

# every genome requested in the config must match at least one retained species
kept_species = list(mito_to_keep["species"])
assert all(
    any(kept.startswith(requested) for kept in kept_species)
    for requested in mito_genomes_to_keep.values()
)

print("Keeping the following mitochondrial genomes:")
display(mito_to_keep.round(2))

Now get just the counts for those mitochondrial genomes to keep:

In [None]:
# identifying columns for the retained mitochondrial genomes
kept_genome_info = mito_to_keep[["alignment_reference", "species", "common_name"]]

# the inner join drops counts for genomes that were not retained
mito_counts_by_run = mito_counts.merge(
    kept_genome_info,
    how="inner",
    on="alignment_reference",
    validate="many_to_one",
)

# NOTE(review): the other CSV outputs in this notebook use "%.5g"; the "%.5f"
# here is kept as-is -- confirm whether the difference is intentional
mito_counts_by_run.to_csv(
    snakemake.output["mito_composition_by_metagenomic_run"],
    index=False,
    float_format="%.5f",
)

Now we get the mitochondrial counts by **sample** (aggregating counts across runs for the same sample):

In [None]:
# columns that we sum across runs for each sample
sum_cols = [
    "alignment_count",
]

# group on every column that is neither summed nor the run identifier
mito_sample_group_cols = [
    col
    for col in mito_counts_by_run.columns
    if not (col in sum_cols or col == "Run accession")
]

# dropna=False keeps groups whose grouping columns contain missing values
mito_counts_by_sample = (
    mito_counts_by_run
    .groupby(mito_sample_group_cols, as_index=False, dropna=False)
    .aggregate(dict.fromkeys(sum_cols, "sum"))
)

# a sample that still has several rows for one genome and count type was
# split by some other grouping column
dup_samples = (
    mito_counts_by_sample
    .groupby(["alignment_reference", "alignment_count_type", "sample"])
    .aggregate(
        n=("alignment_count", "count"),
        sample_names=("Sample name", "unique"),
    )
    .query("n > 1")
)

assert len(dup_samples) == 0, f"Some duplicated sample names\n{dup_samples}"

mito_counts_by_sample.to_csv(
    snakemake.output["mito_composition_by_metagenomic_sample"],
    index=False,
    float_format="%.5g",
)