# Analyze samples in positive table of Chinese CDC study
Importantly, we perform this analysis only for metagenomic samples.

Import Python modules and read the data from the input files passed by `snakemake`:

In [None]:
import altair as alt

import numpy

import pandas as pd

# Load the positive table from the path supplied by snakemake
positive_table = pd.read_csv(snakemake.input.positive_table)

# Load the per-sample SARS2 alignment table, then keep only the rows whose
# description is one of the configured metagenomic descriptions
sars2_by_sample = pd.read_csv(snakemake.input.sars2_aligned_by_sample)
sars2_by_sample = sars2_by_sample.query(
    "description in @snakemake.params.metagenomic_descriptions"
)

Merge (outer join) the per-sample SARS2 read counts with the positive table, keeping every sample that appears in either table:

In [None]:
# Outer-join the sequencing table with the positive table so that samples
# present in only one of the two are still kept. The positive table's
# identifier columns are renamed to match the sequencing table's names, and
# `validate` makes pandas raise if the join keys are not unique on each side.
merged = pd.merge(
    left=sars2_by_sample,
    right=positive_table.rename(
        columns={"Sample ID": "Sample name", "Lab code": "sample"}
    ),
    how="outer",
    on=["sample", "Sample name"],
    validate="one_to_one",
)

Classify samples as:
  - PCR+ / no sequencing
  - PCR+ / sequencing+
  - PCR+ / sequencing-
  - PCR- / no sequencing
  - PCR- / sequencing+
  - PCR- / sequencing-

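Samples are PCR+ only if their `PCR` entry is `+` (so samples missing from the positive table count as PCR-), have "no sequencing" if they have no read count, and are sequencing+ if they have at least one SARS-CoV-2 read: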

In [None]:
def classify_sample(row):
    """Return the "<PCR status> / <sequencing status>" label for one sample.

    `row` must provide `PCR` and `SARS2_aligned_reads` entries. The PCR
    status is "PCR+" only when `PCR` equals "+", and "PCR-" for anything
    else. The sequencing status is "no sequencing" for a null read count,
    "sequencing-" for a count of zero, and "sequencing+" for a positive count.
    """
    pcr_status = "PCR+" if row["PCR"] == "+" else "PCR-"

    n_reads = row["SARS2_aligned_reads"]
    if pd.isnull(n_reads):
        seq_status = "no sequencing"
    elif n_reads == 0:
        seq_status = "sequencing-"
    else:
        # sanity check: a non-null, non-zero count must be positive
        assert n_reads > 0
        seq_status = "sequencing+"

    return f"{pcr_status} / {seq_status}"

# Label every row of the merged table with its PCR / sequencing class
merged["classification"] = merged.apply(classify_sample, axis="columns")

# Save the classified table to the CSV output path supplied by snakemake
merged.to_csv(snakemake.output.csv, float_format="%.5g", index=False)

# Tabulate how many samples fall in each class
merged.groupby("classification").agg(n_samples=("classification", "count"))

Look at the samples that are PCR- but contain SARS-CoV-2 reads in the sequencing:

In [None]:
# Rows classified as PCR-negative yet sequencing-positive
merged[merged["classification"] == "PCR- / sequencing+"]

Plot correlation of Ct with SARS-CoV-2 reads.
A missing Ct value is taken to be 40 (the limit of detection), as all samples were tested by RT-qPCR.
If there are multiple Ct values (separated by `/`), take their mean.
If the Ct value is just `+`, ignore that sample as we don't have a quantitative value to plot:

In [None]:
def _ct_to_number(ct):
    """Convert a raw `CT` entry to a single number for plotting.

    A null entry becomes 40; otherwise the entry is split on "/" and the
    mean of the resulting float values is returned.
    """
    if pd.isnull(ct):
        return 40
    return numpy.mean([float(part) for part in ct.split("/")])


# Columns retained in the Ct-versus-reads table
ct_vs_reads_columns = [
    "sample",
    "Sample name",
    "Collection date",
    "Isolation source",
    "total_reads",
    "preprocessed_reads",
    "SARS2_aligned_reads",
    "percent_preprocessed_reads_aligning_to_SARS2",
    "Sample type",
    "PCR",
    "Ct",
    "PCR target",
    "classification",
]

ct_vs_reads = (
    merged
    # keep only rows that have a sequencing percentage
    .query("percent_preprocessed_reads_aligning_to_SARS2.notnull()")
    # drop rows whose CT is the bare "+" marker (no quantitative value)
    .query("CT != '+'")
    .assign(Ct=lambda df: df["CT"].map(_ct_to_number))
    .loc[:, ct_vs_reads_columns]
    .reset_index(drop=True)
)

ct_vs_reads

Plot the data, drawing samples with zero percent SARS-CoV-2 reads at half the minimum non-zero percent:

In [None]:
# Half of the smallest non-zero SARS2 percentage; used as the lower clipping
# value for the plotted percentages
plot_min_percent = (
    ct_vs_reads
    .loc[
        lambda df: df["percent_preprocessed_reads_aligning_to_SARS2"] > 0,
        "percent_preprocessed_reads_aligning_to_SARS2",
    ]
    .min()
    / 2
)

# Scatter plot of RT-qPCR Ct (x) against log10 percent of reads mapping to
# SARS2 (y). The regression line and r label below are derived from this chart.
base_chart = (
    alt.Chart(ct_vs_reads)
    .transform_calculate(
        # clip percents from below at `plot_min_percent` so that zero
        # percents still get a finite value on the log scale
        clipped_percent=alt.expr.max(
            alt.datum["percent_preprocessed_reads_aligning_to_SARS2"], plot_min_percent
        ),
        # log base 10 obtained by dividing the log by ln(10)
        log10_clipped_percent=alt.expr.log(alt.datum["clipped_percent"]) / alt.expr.LN10,
    )
    .encode(
        x=alt.X(
            "Ct",
            title="RT-qPCR Ct value",
            scale=alt.Scale(zero=False),
        ),
        y=alt.Y(
            "log10_clipped_percent:Q",
            scale=alt.Scale(zero=False, nice=False),
            title="log10 % reads mapping to SARS2",
        ),
        # show every column of the table in the tooltip
        tooltip=ct_vs_reads.columns.tolist(),
    )
    .mark_circle(size=60, opacity=0.5)
    .properties(width=200, height=200)
)

# Orange regression line of log10 clipped percent on Ct, built on top of
# the base chart
line_chart = base_chart.transform_regression(
    on="Ct", regression="log10_clipped_percent"
).mark_line(opacity=0.35, size=6, color="orange")

# Text annotation reporting the correlation coefficient r of the regression
# of log10 clipped percent on Ct.
r_chart = (
    base_chart
    # NOTE(review): params=True presumably makes the transform emit the fit
    # parameters (`coef`, `rSquared`) used below instead of fitted points —
    # confirm against the Vega-Lite regression transform docs
    .transform_regression("Ct", "log10_clipped_percent", params=True)
    .transform_calculate(
        # r is the square root of rSquared, given the sign of coef[1]
        # (presumably the slope term of the fit — confirm)
        r=alt.expr.if_(
            alt.datum["coef"][1] >= 0,
            alt.expr.sqrt(alt.datum["rSquared"]),
            -alt.expr.sqrt(alt.datum["rSquared"]),
        ),
        # label text: "r = " followed by r formatted with ".2f"
        label='"r = " + format(datum.r, ".2f")',
    )
    .mark_text(align="left", color="orange", fontWeight=500, fontSize=12)
    .encode(
        # constant x / y values rather than data fields, overriding the
        # base chart's x and y encodings
        x=alt.value(6),
        y=alt.value(8),
        text=alt.Text("label:N"),
    )
)

# Overlay the scatter points, regression line, and r label, then turn off
# the axis grid lines
layered_chart = base_chart + line_chart + r_chart
ct_vs_reads_chart = layered_chart.configure_axis(grid=False)

# Write the chart to the HTML output path supplied by snakemake
ct_vs_reads_chart.save(snakemake.output.html)

# Leave the chart as the cell's final expression so it is displayed
ct_vs_reads_chart