# Aggregate and analyze the drops from QC-ing the plates and sera

In [None]:
import altair as alt

import pandas as pd

from ruamel.yaml import YAML

# Round-trip ("rt") ruamel.yaml instance; used below for every YAML load and
# dump in this notebook.
yaml = YAML(typ="rt")

# Turn off Altair's cap on the number of data rows a chart may contain.
# The result is bound to `_`, presumably so it is not echoed as cell output.
_ = alt.data_transformers.disable_max_rows()

Get variables from `snakemake`:

In [None]:
# Plate QC: plate names, the per-plate QC-drop files that are read below
# (paired with `plates` by position), and the path of the merged output file.
plates = snakemake.params.plates
input_plate_qc_drops = snakemake.input.plate_qc_drops
output_plate_qc_drops = snakemake.output.plate_qc_drops

# Sera QC: serum names, the per-serum QC-drop files that are read below
# (paired with `sera` by position), and the path of the merged output file.
sera = snakemake.params.sera
input_sera_qc_drops = snakemake.input.sera_qc_drops
output_sera_qc_drops = snakemake.output.sera_qc_drops

## Analyze plate QC drops
Read QC drops for individual plates into a merged dictionary, write it to YAML, and also convert to a DataFrame.
If you really want to look into the details of what is being dropped, you will want to look at that merged YAML file.

In [None]:
# Read the per-plate QC-drop YAML files into one dictionary keyed by plate.
# `plates` and `input_plate_qc_drops` are paired by position, so they must be
# the same length.
assert len(plates) == len(input_plate_qc_drops)
plate_qc_drops = {}
for plate, qc_drops_yaml in zip(plates, input_plate_qc_drops):
    # Be explicit about UTF-8 so reading does not depend on the locale default.
    with open(qc_drops_yaml, encoding="utf-8") as f:
        plate_qc_drops[plate] = yaml.load(f)
# A repeated plate name would overwrite an earlier entry and shrink the dict.
assert len(plate_qc_drops) == len(input_plate_qc_drops)

print(f"Writing merged plate drops to {output_plate_qc_drops}")
# Write as UTF-8 as well, so the merged file is encoded the same everywhere.
with open(output_plate_qc_drops, "w", encoding="utf-8") as f:
    yaml.dump(plate_qc_drops, stream=f)

# Flatten the nested mapping (plate -> drop type -> drop -> reason) into one
# tuple per individual drop.
plate_qc_drop_tups = [
    (plate_key, droptype_key, drop_key, reason)
    for (plate_key, plate_val) in plate_qc_drops.items()
    for droptype_key, droptype_val in plate_val.items()
    for drop_key, reason in droptype_val.items()
]

# One row per drop; column order matches the tuple order above.
plate_qc_drops_df = pd.DataFrame(
    plate_qc_drop_tups,
    columns=["plate", "drop type", "drop", "reason"],
)

In [None]:
# Count the distinct drops for every (plate, drop type, reason) combination.
plate_qc_drop_counts = (
    plate_qc_drops_df.groupby(["plate", "drop type", "reason"], as_index=False)
    .agg(n_drops=("drop", "nunique"))
)

# Sanity check: the per-group distinct counts must add up to the number of
# rows, i.e. no drop is listed twice within a group.
assert plate_qc_drop_counts["n_drops"].sum() == len(plate_qc_drops_df)

Now plot the number of drops for each plate.
You should be worried about (and maybe re-do or discard) any plates with a very large number of drops:

In [None]:
# Hovering over a bar selects its plate; with `empty=False` nothing counts as
# selected until the mouse is over a bar.
plate_selection = alt.selection_point(fields=["plate"], on="mouseover", empty=False)

# Positional encoding channels, defined up front so the chart assembly below
# stays short.
plate_chart_channels = [
    alt.X("n_drops", title="number of drops"),
    alt.Y(
        "plate",
        sort=plates,
        title=None,
        axis=alt.Axis(labelFontStyle="bold", labelFontSize=11),
    ),
    alt.Column(
        "drop type",
        title=None,
        spacing=5,
        header=alt.Header(labelFontSize=12, labelFontStyle="bold", labelPadding=1),
    ),
    alt.Color(
        "reason",
        legend=alt.Legend(
            orient="top", columns=1, labelLimit=230, title=None, padding=1
        ),
    ),
]

plate_chart_title = alt.TitleParams(
    "Number of QC drops when processing plates", anchor="middle", dy=-2
)

# Bars of drop counts per plate, one column of panels per drop type, colored
# by reason. Bars of the hovered plate get a thicker outline.
plate_qc_drop_counts_chart = (
    alt.Chart(plate_qc_drop_counts)
    .add_params(plate_selection)
    .encode(
        *plate_chart_channels,
        strokeWidth=alt.condition(plate_selection, alt.value(3), alt.value(0.5)),
        tooltip=list(plate_qc_drop_counts.columns),
    )
    .mark_bar(height={"band": 0.8}, stroke="black")
    .properties(width=230, height=alt.Step(16), title=plate_chart_title)
    .configure_axis(grid=False)
    # each drop-type panel gets its own color and x scales
    .resolve_scale(color="independent", x="independent")
)

plate_qc_drop_counts_chart

## Look for barcodes dropped especially often in plate QC
If a barcode is dropped especially often across plates, that could indicate something problematic with that barcode such that it should be removed altogether from the library analysis.

In [None]:
# Keep only rows whose drop type starts with "barcode".
barcode_drop_rows = plate_qc_drops_df.query("`drop type`.str.startswith('barcode')")

# The barcode is taken to be the first whitespace-delimited token of the
# drop key.
barcode_drop_rows = barcode_drop_rows.assign(
    barcode=barcode_drop_rows["drop"].str.split().str[0]
)

# Per (drop type, barcode): the number of distinct plates where it was dropped
# and the total number of drop rows.
barcode_drops = barcode_drop_rows.groupby(
    ["drop type", "barcode"], as_index=False
).agg(
    plates_where_dropped=("plate", "nunique"),
    total_drops=("plate", "count"),
)

# Hovering over a bar selects its barcode; with `empty=False` nothing counts
# as selected until the mouse is over a bar.
barcode_selection = alt.selection_point(fields=["barcode"], on="mouseover", empty=False)

# One bar per barcode showing how many times it was dropped, with one column
# of panels per drop type. Barcodes are sorted so the most-dropped come first.
barcode_drops_chart = (
    alt.Chart(barcode_drops)
    .add_params(barcode_selection)
    .encode(
        alt.X(
            "total_drops",
            title="times barcode dropped",
        ),
        alt.Y(
            "barcode",
            sort=alt.SortField("total_drops", order="descending"),
            axis=alt.Axis(labelFontSize=9),
        ),
        alt.Column(
            "drop type",
            title=None,
            spacing=8,
            header=alt.Header(labelFontSize=12, labelFontStyle="bold", labelPadding=1),
        ),
        # thicker outline on the bars of the hovered barcode
        strokeWidth=alt.condition(barcode_selection, alt.value(3), alt.value(0.5)),
        tooltip=barcode_drops.columns.tolist(),
    )
    .mark_bar(height={"band": 0.8}, stroke="black")
    .properties(
        width=200,
        height=alt.Step(10),
        # Title describes this per-barcode chart; it previously repeated the
        # title of the per-plate drop-count chart.
        title=alt.TitleParams(
            "Number of times each barcode is dropped in plate QC",
            anchor="middle",
            dy=-2,
        ),
    )
    .configure_axis(grid=False)
    # each drop-type panel gets its own x and y scales, so every panel lists
    # only its own barcodes
    .resolve_scale(color="independent", x="independent", y="independent")
)

barcode_drops_chart

## Analyze the sera QC
Analyze the QC performed on the sera, which involves completely dropping titers for certain virus-serum pairs.

Read the QC for different sera into a merged dictionary, write it to YAML, and also convert to a DataFrame.
If you really want to look into the details of what is being dropped, you will want to look at that merged YAML file.

In [None]:
# Read the per-serum QC-drop YAML files into one dictionary keyed by serum.
# `sera` and `input_sera_qc_drops` are paired by position, so they must be
# the same length.
assert len(sera) == len(input_sera_qc_drops)
sera_qc_drops = {}
for serum, qc_drops_yaml in zip(sera, input_sera_qc_drops):
    # Be explicit about UTF-8 so reading does not depend on the locale default.
    with open(qc_drops_yaml, encoding="utf-8") as f:
        sera_qc_drops[serum] = yaml.load(f)
# A repeated serum name would overwrite an earlier entry and shrink the dict.
assert len(sera_qc_drops) == len(input_sera_qc_drops)

print(f"Writing merged sera drops to {output_sera_qc_drops}")
# Write as UTF-8 as well, so the merged file is encoded the same everywhere.
with open(output_sera_qc_drops, "w", encoding="utf-8") as f:
    yaml.dump(sera_qc_drops, stream=f)

# Flatten the nested mapping (serum -> virus -> reason) into one tuple per
# dropped virus-serum pair.
sera_qc_drop_tups = [
    (serum_key, virus, reason)
    for (serum_key, serum_val) in sera_qc_drops.items()
    for virus, reason in serum_val.items()
]

# One row per dropped pair; column order matches the tuple order above.
sera_qc_drops_df = pd.DataFrame(sera_qc_drop_tups, columns=["serum", "virus", "reason"])

Plot the number of viruses dropped for each serum.
If a serum has many dropped viruses, then you will lack a lot of titers and so it may be worth reviewing the cause of the drops.

In [None]:
# Count the distinct viruses dropped for every (serum, reason) combination.
sera_n_drops = (
    sera_qc_drops_df.groupby(["serum", "reason"], as_index=False)
    .agg(n_viruses=("virus", "nunique"))
)
# Sanity check: the per-group distinct counts must add up to the number of rows.
assert sera_n_drops["n_viruses"].sum() == len(sera_qc_drops_df)

# Bars of dropped-virus counts per serum, colored by reason; sera are listed
# in the order given by `sera`.
sera_n_drops_chart = (
    alt.Chart(sera_n_drops)
    .encode(
        x=alt.X("n_viruses", title="number of viruses dropped"),
        y=alt.Y("serum", sort=sera),
        color=alt.Color(
            "reason",
            title="reason dropped",
            legend=alt.Legend(labelLimit=350),
        ),
        tooltip=list(sera_n_drops.columns),
    )
    .mark_bar(height={"band": 0.8})
    .properties(
        title="Number of viruses dropped at serum QC for each serum",
        width=250,
        height=alt.Step(13),
    )
    .configure_axis(grid=False)
)

sera_n_drops_chart

Plot the number of sera for which each virus is dropped during serum QC.
If a virus is dropped for many sera, that may indicate some issue with that virus in assays:

In [None]:
# Count the distinct sera for which each virus is dropped, per reason.
virus_n_drops = (
    sera_qc_drops_df.groupby(["virus", "reason"], as_index=False)
    .agg(n_sera=("serum", "nunique"))
)
# Sanity check: the per-group distinct counts must add up to the number of rows.
assert virus_n_drops["n_sera"].sum() == len(sera_qc_drops_df)

# Bars of serum counts per virus, colored by reason; viruses are sorted by
# `n_sera` in descending order.
virus_n_drops_chart = (
    alt.Chart(virus_n_drops)
    .encode(
        x=alt.X("n_sera", title="number of sera for which virus is dropped"),
        y=alt.Y("virus", sort=alt.SortField("n_sera", order="descending")),
        color=alt.Color(
            "reason",
            title="reason dropped",
            legend=alt.Legend(labelLimit=350),
        ),
        tooltip=list(virus_n_drops.columns),
    )
    .mark_bar(height={"band": 0.8})
    .properties(
        title="Number of sera for which each virus is dropped at serum QC",
        width=250,
        height=alt.Step(13),
    )
    .configure_axis(grid=False)
)

virus_n_drops_chart