# Aggregate all the various counts and metadata

In [None]:
import os

import numpy
import pandas as pd

Get variables from `snakemake`:

In [None]:
# input files provided by the `snakemake` rule
metadata_csv = snakemake.input.metadata
mito_ref_info_csv = snakemake.input.mito_ref_info
read_counts_csv = snakemake.input.read_counts
counts_and_coverage_files = snakemake.input.counts_and_coverage

# parameters provided by the `snakemake` rule
sars2_ref_id = snakemake.params.sars2_ref_id
metagenomic_descriptions = snakemake.params.metagenomic_descriptions
mito_genomes_to_keep = snakemake.params.mito_genomes_to_keep
mito_composition_filters = snakemake.params.mito_composition_filters

Read information on mitochondrial reference genomes (species names are shortened to just their first two words later, when the per-run mitochondrial counts are assembled):

In [None]:
# table of mitochondrial reference genomes; its `id` and `species` columns
# are used by later cells
mito_ref_info = pd.read_csv(filepath_or_buffer=mito_ref_info_csv)

mito_ref_info

Read metadata on samples.
For sample titles that have multiple sample names, append the name to the title:

In [None]:
metadata_all = pd.read_csv(metadata_csv)

# number of distinct sample names that share each sample title
names_per_title = (
    metadata_all.groupby("Sample title")["Sample name"].transform("nunique")
)

# a title is used as-is when it corresponds to a single sample name; otherwise
# the sample name is appended to disambiguate it
sample_labels = metadata_all["Sample title"].where(
    names_per_title == 1,
    metadata_all["Sample title"] + "_" + metadata_all["Sample name"],
)

# the most relevant metadata columns, which are the only ones kept
metadata_cols = [
    "Run accession",
    "sample",
    "Sample name",
    "Collection date",
    "description",
    "Isolation source",
]

metadata = (
    metadata_all
    .assign(n_sample_names=names_per_title, sample=sample_labels)
    .rename(columns={"Public description": "description"})
    [metadata_cols]
)

Tally all the alignment counts:

In [None]:
# Read the aligned-read counts and covered bases for each alignment reference,
# one file per run. The run accession is taken from each file's name (its base
# name with the extension removed).
counts_and_coverage = pd.concat(
    [
        pd.read_csv(
            f,
            sep="\t",
            names=["alignment_reference", "aligned_reads", "covered_bases"],
            header=0,  # replace the file's own header line with `names`
        ).assign(**{"Run accession": os.path.splitext(os.path.basename(f))[0]})
        for f in counts_and_coverage_files
    ],
    ignore_index=True,
)[["Run accession", "alignment_reference", "aligned_reads", "covered_bases"]]

# make sure all alignment references are SARS2 or a known mitochondrial genome
unknown_references = (
    set(counts_and_coverage["alignment_reference"])
    - set(mito_ref_info["id"])
    - {sars2_ref_id}
)
assert not unknown_references, f"Unknown alignment references:\n{unknown_references}"

# make sure every run has exactly one row for every alignment reference; the
# duplicate check is on the (run, reference) pair so that repeated pairs with
# differing counts are also caught
run_reference_cols = ["Run accession", "alignment_reference"]
dup_run_references = counts_and_coverage[
    counts_and_coverage.duplicated(subset=run_reference_cols, keep=False)
]
assert dup_run_references.empty, (
    f"Duplicated run / alignment reference rows:\n{dup_run_references}"
)

n_expected_rows = (
    counts_and_coverage["alignment_reference"].nunique()
    * counts_and_coverage["Run accession"].nunique()
)
assert len(counts_and_coverage) == n_expected_rows, (
    f"Expected {n_expected_rows} rows (every run x every alignment reference) "
    f"but found {len(counts_and_coverage)}"
)

counts_and_coverage

Read the total and preprocessed read counts for each run:

In [None]:
# per-run read counts (`total_reads` and `preprocessed_reads` are used below)
read_counts = pd.read_csv(filepath_or_buffer=read_counts_csv)

## SARS-CoV-2 read counts
Now get the SARS-CoV-2 read counts as a percentage of all preprocessed reads.

We first do this by run:

In [None]:
# per-run SARS2 alignment statistics, with columns renamed to be SARS2-specific
sars2_counts_by_run = (
    counts_and_coverage
    .query("alignment_reference == @sars2_ref_id")
    .drop(columns="alignment_reference")
    .rename(
        columns={
            "aligned_reads": "SARS2_aligned_reads",
            "covered_bases": "SARS2_covered_bases",
        },
    )
)

# outer merge, so runs present in only one of the two tables are retained
sars2_with_read_counts = sars2_counts_by_run.merge(
    read_counts, how="outer", validate="one_to_one"
)

sars2_aligned_by_run = (
    sars2_with_read_counts
    .assign(
        percent_preprocessed_reads_aligning_to_SARS2=(
            sars2_with_read_counts["SARS2_aligned_reads"]
            .div(sars2_with_read_counts["preprocessed_reads"])
            .mul(100)
        ),
    )
    # add the sample metadata, again retaining runs found in only one table
    .merge(metadata, how="outer", on="Run accession", validate="one_to_one")
)

# NOTE(review): `%.5g` keeps 5 significant digits, so any count column that is
# float-typed (e.g. if the outer merges introduce NaN) is rounded once it
# reaches 100000 -- confirm that is acceptable.
sars2_aligned_by_run.to_csv(
    snakemake.output["sars2_aligned_by_run"],
    index=False,
    float_format="%.5g",
)

sars2_aligned_by_run

Now we do this by sample (aggregating runs for each sample) for **just the metagenomic samples**.

In [None]:
# columns that we sum across runs for each sample
# columns that we sum across runs for each sample
sum_cols = ["total_reads", "preprocessed_reads", "SARS2_aligned_reads"]

# group on every other column, except the run-specific ones and the percentages
not_grouped_cols = set(sum_cols) | {"Run accession", "SARS2_covered_bases"}
sample_group_cols = [
    col
    for col in sars2_aligned_by_run.columns
    if not (col in not_grouped_cols or col.startswith("percent_"))
]

# restrict to the metagenomic runs, then sum the counts within each sample
metagenomic_runs = sars2_aligned_by_run.query(
    "description in @metagenomic_descriptions"
)
metagenomic_sample_sums = (
    metagenomic_runs
    .groupby(sample_group_cols, as_index=False, dropna=False)
    .aggregate(dict.fromkeys(sum_cols, "sum"))
)

# recompute the percentage from the summed counts
sars2_aligned_by_metagenomic_sample = metagenomic_sample_sums.assign(
    percent_preprocessed_reads_aligning_to_SARS2=(
        metagenomic_sample_sums["SARS2_aligned_reads"]
        .div(metagenomic_sample_sums["preprocessed_reads"])
        .mul(100)
    ),
)

# each sample should now occupy exactly one row
rows_per_sample = (
    sars2_aligned_by_metagenomic_sample
    .groupby("sample")
    .aggregate(
        n=pd.NamedAgg(column="total_reads", aggfunc="count"),
        sample_names=pd.NamedAgg(column="Sample name", aggfunc="unique"),
    )
)
dup_samples = rows_per_sample[rows_per_sample["n"] > 1]

assert not len(dup_samples), f"Some duplicated sample names\n{dup_samples}"

sars2_aligned_by_metagenomic_sample.to_csv(
    snakemake.output["sars2_aligned_by_metagenomic_sample"],
    index=False,
    float_format="%.5g",
)

sars2_aligned_by_metagenomic_sample.sort_values("SARS2_aligned_reads")

## Composition of mitochondrial reads

Do some filtering on the runs:
 - Get just the mitochondrial counts
 - Only keep runs with the metagenomic description
 - Exclude runs with insufficient alignment counts to mitochondria

In [None]:
# attach the sample metadata to every count row (merging on the shared columns)
counts_with_metadata = counts_and_coverage.merge(metadata, validate="many_to_one")

# keep just the mitochondrial (non-SARS2) counts for the metagenomic runs
mito_counts = (
    counts_with_metadata
    .query("alignment_reference != @sars2_ref_id")
    .query("description in @metagenomic_descriptions")
)

# runs whose total mitochondrial alignments fall below the configured minimum
min_alignments_per_run = mito_composition_filters["min_alignments_run_filter"]

insufficient_mito_reads = (
    mito_counts
    .groupby("Run accession", as_index=False)
    .aggregate(
        total_aligned_reads=pd.NamedAgg(column="aligned_reads", aggfunc="sum"),
    )
    .sort_values("total_aligned_reads")
    .query("total_aligned_reads < @min_alignments_per_run")
    .reset_index(drop=True)
    .merge(metadata, how="left")
)

print("Excluding the following metagenomic runs with insufficient aligned mitochondrial reads:")
display(insufficient_mito_reads)

# drop those runs from the mitochondrial counts
is_insufficient_run = mito_counts["Run accession"].isin(
    insufficient_mito_reads["Run accession"]
)
mito_counts = mito_counts[~is_insufficient_run]

Now keep only the mitochondrial reference genomes that either (a) reach both a high enough maximum percentage of read alignments and a high enough maximum coverage across runs, or (b) are explicitly specified in the list of genomes to keep:

In [None]:
# total mitochondrial alignments in each run, repeated on every row of that run
run_total_aligned = (
    mito_counts.groupby("Run accession")["aligned_reads"].transform("sum")
)

# for each reference: its max and mean percent of a run's mitochondrial
# alignments, and its max covered bases, across runs
mito_max_primary_percent = (
    mito_counts
    .assign(
        total=run_total_aligned,
        percent=100 * mito_counts["aligned_reads"] / run_total_aligned,
    )
    .groupby("alignment_reference", as_index=False)
    .aggregate(
        max_percent=pd.NamedAgg(column="percent", aggfunc="max"),
        avg_percent=pd.NamedAgg(column="percent", aggfunc="mean"),
        max_coverage=pd.NamedAgg(column="covered_bases", aggfunc="max"),
    )
    .sort_values("max_percent")
    .merge(mito_ref_info.rename(columns={"id": "alignment_reference"}))
)

min_percent = mito_composition_filters["min_percent_aligned_genome_filter"]
min_coverage = mito_composition_filters["min_coverage_aligned_genome_filter"]


def _is_species_to_keep(species):
    """Return whether `species` starts with any value of `mito_genomes_to_keep`."""
    return any(
        species.startswith(species_prefix)
        for species_prefix in mito_genomes_to_keep.values()
    )


# keep a genome if it passes both thresholds or is explicitly listed to keep
mito_to_keep = (
    mito_max_primary_percent
    .assign(to_keep=mito_max_primary_percent["species"].map(_is_species_to_keep))
    .query("((max_percent >= @min_percent) and (max_coverage >= @min_coverage)) or to_keep")
    .sort_values("max_percent", ascending=False)
    .reset_index(drop=True)
)

# every explicitly listed genome must be matched by at least one kept species
for species_prefix in mito_genomes_to_keep.values():
    assert any(
        kept_species.startswith(species_prefix)
        for kept_species in mito_to_keep["species"]
    )

print("Keeping the following mitochondrial genomes:")
display(mito_to_keep.sort_values("max_percent", ascending=False).round(2))

mito_ids_to_keep = mito_to_keep["alignment_reference"].tolist()

Now get just the counts for the mitochondrial genomes to keep, add the information on each reference genome (shortening species names to their first two words), and merge in the per-run read counts and sample metadata:

In [None]:
# NOTE(review): the rows come from `counts_and_coverage` (all runs) rather than
# from the run-filtered `mito_counts` built above, even though this table and
# its output file are named "by_metagenomic_run" -- confirm that keeping every
# run here is intended.
kept_mito_counts = counts_and_coverage.loc[
    counts_and_coverage["alignment_reference"].isin(mito_ids_to_keep),
    ["Run accession", "alignment_reference", "aligned_reads"],
]

mito_counts_by_metagenomic_run = (
    kept_mito_counts
    .rename(columns={"alignment_reference": "reference_id"})
    # add more information on references, and shorten species name to first two words;
    # `validate` guards against a repeated reference silently duplicating count rows
    .merge(
        mito_ref_info.rename(columns={"id": "reference_id"}),
        how="left",
        validate="many_to_one",
    )
    .assign(
        species=lambda x: x["species"].map(lambda s: " ".join(s.split()[:2])),
    )
    # add the per-run read counts and sample metadata
    .merge(
        sars2_aligned_by_run.drop(
            columns=[
                "percent_preprocessed_reads_aligning_to_SARS2",
                "SARS2_covered_bases",
            ],
        ),
        how="left",
        validate="many_to_one",
    )
)

# check for missing values *before* writing, so an incomplete table is never
# written to the output file
cols_with_nulls = [
    col
    for col in mito_counts_by_metagenomic_run.columns
    if mito_counts_by_metagenomic_run[col].isnull().any()
]
assert not cols_with_nulls, f"Null values in columns: {cols_with_nulls}"

mito_counts_by_metagenomic_run.to_csv(
    snakemake.output["mito_composition_by_metagenomic_run"],
    index=False,
    float_format="%.5f",
)

mito_counts_by_metagenomic_run

Now we get the mitochondrial counts by **sample** (aggregating counts across runs for the same sample):

In [None]:
# columns that we sum across runs for each sample
# columns that we sum across runs for each sample
sum_cols = ["aligned_reads", "total_reads", "preprocessed_reads", "SARS2_aligned_reads"]

# group on every other column except the run accession
run_level_cols = set(sum_cols) | {"Run accession"}
mito_sample_group_cols = [
    col
    for col in mito_counts_by_metagenomic_run.columns
    if col not in run_level_cols
]

mito_counts_by_metagenomic_sample = (
    mito_counts_by_metagenomic_run
    .groupby(mito_sample_group_cols, as_index=False, dropna=False)
    .aggregate(dict.fromkeys(sum_cols, "sum"))
)

# each (reference, sample) pair should now occupy exactly one row
rows_per_reference_sample = (
    mito_counts_by_metagenomic_sample
    .groupby(["reference_id", "sample"])
    .aggregate(
        n=pd.NamedAgg(column="aligned_reads", aggfunc="count"),
        sample_names=pd.NamedAgg(column="Sample name", aggfunc="unique"),
    )
)
dup_samples = rows_per_reference_sample[rows_per_reference_sample["n"] > 1]

assert not len(dup_samples), f"Some duplicated sample names\n{dup_samples}"

mito_counts_by_metagenomic_sample.to_csv(
    snakemake.output["mito_composition_by_metagenomic_sample"],
    index=False,
    float_format="%.5g",
)

mito_counts_by_metagenomic_sample