# Analyze Barcode Counts

*Check the barcode counts for each sample and create a combined barcode count file.*

In [1]:
import os
import yaml
import numpy as np
import altair as alt
import pandas as pd

In [2]:
# Allow for large dataframes: lift Altair's row limit so the per-sample count
# tables can be passed to charts. Assigning to `_` keeps the return value out of the cell output.
_ = alt.data_transformers.disable_max_rows()

This notebook is parameterized by `papermill`. The next cell is tagged `parameters` so that `papermill` can inject the passed values in place of the defaults.

In [3]:
# this cell is tagged 'parameters' for `papermill` parameterization
# Both default to None for an interactive run. Later cells use `snakemake` to choose
# the path prefix and `joined_counts` as the output path of the combined counts CSV.
joined_counts = None
snakemake = None


In [4]:
# Resolve the path prefix: empty when `snakemake` was passed in,
# otherwise two directories up from the notebook's working directory.
filepath_prefix = "../../" if snakemake is None else ""

# Load the pipeline configuration
config_path = filepath_prefix + 'config.yml'
with open(config_path) as config_file:
    config = yaml.safe_load(config_file)


In [5]:
# Load the table describing every barcode run
barcode_runs_df = pd.read_csv(filepath_prefix + config['barcode_runs'])

# The sample name is the "-"-joined string form of the configured id columns.
# It is built before the date is reformatted, so it carries the raw date value.
id_values = barcode_runs_df[config["id_columns"]]
barcode_runs_df["sample"] = id_values.apply(
    lambda row: "-".join(row.astype(str)), axis=1
)

# Re-express the date (parsed as %y%m%d) as an ISO-style YYYY-MM-DD string
parsed_dates = pd.to_datetime(barcode_runs_df["date"], format="%y%m%d")
barcode_runs_df["date"] = parsed_dates.dt.strftime("%Y-%m-%d")

# The fastq column is not used by this notebook
barcode_runs_df = barcode_runs_df.drop(columns=["fastq"])

samples = barcode_runs_df["sample"].drop_duplicates().tolist()
print(f"There are {len(samples)} barcode runs.")

There are 1152 barcode runs.


In [66]:
def _read_sample_table(sample, file_type):
    """Read `<barcode_counts_dir>/<sample>/<sample>_<file_type>.csv` and tag rows with the sample name."""
    csv_path = os.path.join(
        filepath_prefix,
        config["barcode_counts_dir"],
        f"{sample}",
        f"{sample}_{file_type}.csv",
    )
    return pd.read_csv(csv_path).assign(sample=sample)


# For each kind of per-sample file, stack all samples and attach the run information.
# A left merge keeps every row of the stacked table.
count_dfs = {}
for file_type in ["counts", "invalid", "fates"]:
    stacked = pd.concat([_read_sample_table(sample, file_type) for sample in samples])
    count_dfs[file_type] = stacked.merge(barcode_runs_df, on='sample', how='left')

In [67]:
# Columns offered as dropdown menus: the user-defined ones from the config followed by
# the always-present 'library', 'date' and 'plate'. `dict.fromkeys` removes duplicates
# while keeping first-seen order, so the menus appear in the same order on every run
# (a `set` would order them arbitrarily).
dropdown_columns = list(
    dict.fromkeys(
        config["analyze_barcodes_params"]["dropdown_columns"] + ['library', 'date', 'plate']
    )
)


## General Barcode Stats

Below is an interactive plot of the barcode counts in each sample, colored by whether the barcodes are:
- valid (in the library)
- invalid (not in the library)
- failing the chastity filter
- too low quality
- unparsable

You can select the `plate`, `library`, `date`, and whatever additional columns were specified in the `config`.





In [68]:
# NOTE(review): `alt.selection_point` and `Chart.add_params` are Altair 5 API names; the
# AttributeError recorded under this cell suggests the kernel had an older Altair — confirm.
def _dropdown_selection(col):
    """Build a point selection on `col` bound to a dropdown whose first entry is "all"."""
    choices = barcode_runs_df[col].dropna().unique()
    dropdown = alt.binding_select(
        options=[None] + choices.tolist(),
        labels=["all"] + [str(choice) for choice in choices],
        name=col,
    )
    return alt.selection_point(fields=[col], bind=dropdown)


# One dropdown-bound selection per configured column (reused by the later charts)
selections = [_dropdown_selection(col) for col in dropdown_columns]


# Stacked bars of read counts per sample, split by barcode fate
fates_df = count_dfs["fates"]
fate_tooltips = [
    alt.Tooltip(column, format=".3g") if column == "count" else column
    for column in fates_df.columns.tolist()
]
fate_encodings = dict(
    x=alt.X("count", title="barcode sequencing counts", axis=alt.Axis(format=".2g")),
    y=alt.Y("sample", title=None),
    color=alt.Color("fate", scale=alt.Scale(reverse=True)),
    order=alt.Order("fate", sort="descending"),
    tooltip=fate_tooltips,
)

fate_chart = (
    alt.Chart(fates_df)
    .encode(**fate_encodings)
    .mark_bar()
    .properties(width=350, height=alt.Step(13))
    .configure_axis(labelLimit=500)
)

# Register every dropdown on the chart and filter the data by it
for dropdown in selections:
    fate_chart = fate_chart.add_params(dropdown)
    fate_chart = fate_chart.transform_filter(dropdown)


display(fate_chart)

AttributeError: module 'altair' has no attribute 'selection_point'

## Average Valid and Invalid Counts

Below is a plot of the mean count of valid (present in the library) and invalid (not in the library) barcodes for each sample. 


You can select the `plate`, `library`, `date`, and whatever additional columns were specified in the `config`. You can also click on the legend to show either the valid or the invalid barcodes alone.



In [69]:
# Stack the valid and invalid barcode counts, labeling every row with its origin
counts_df = pd.concat(
    [
        count_dfs[source].assign(valid=label)
        for source, label in (("counts", "valid"), ("invalid", "invalid"))
    ]
)

# Mean count per barcode for each (sample, valid/invalid) pair, with run information attached
avg_counts = (
    counts_df
    .groupby(["sample", "valid"], as_index=False)
    .agg(avg_count=("count", "mean"))
    .merge(barcode_runs_df, validate="many_to_one")
)


# Clicking a legend entry restricts the chart to valid or invalid barcodes
valid_selection = alt.selection_point(fields=["valid"], bind="legend")

avg_tooltips = [
    alt.Tooltip(column, format=".3g") if column == "avg_count" else column
    for column in avg_counts.columns.tolist()
]
avg_encodings = dict(
    x=alt.X("avg_count", title="average counts per barcode"),
    y=alt.Y("sample", title=None),
    yOffset="valid",
    color=alt.Color(
        "valid",
        title="valid barcode",
        scale=alt.Scale(domain=avg_counts["valid"].unique()),
    ),
    tooltip=avg_tooltips,
)

avg_counts_chart = (
    alt.Chart(avg_counts)
    .encode(**avg_encodings)
    .mark_bar()
    .properties(width=200, height=alt.Step(15, **{"for": "position"}))
    .configure_axis(labelLimit=500)
    .add_params(*selections, valid_selection)
    .transform_filter(valid_selection)
)
# The dropdown selections were registered above; here they only filter the data
for dropdown in selections:
    avg_counts_chart = avg_counts_chart.transform_filter(dropdown)

display(avg_counts_chart)

AttributeError: module 'altair' has no attribute 'selection_point'

### Update "retain" column of counts dataframe to remove samples with low counts

In [108]:
# Samples whose mean count per valid barcode is below this threshold are discarded
MIN_AVG_VALID_COUNT = 1000

# Identify samples with low average valid counts
avg_counts_valid = avg_counts.loc[avg_counts['valid'] == 'valid']
samples_low_counts = avg_counts_valid.loc[
    avg_counts_valid['avg_count'] < MIN_AVG_VALID_COUNT, 'sample'
].tolist()
print("There are", len(samples_low_counts), "samples with low average valid counts. These samples will be discarded from analysis")

# Update the retain column: a row is retained only if its sample has enough valid counts
# AND it was not already excluded. Overwriting the column outright would silently
# re-include rows whose `retain` was False in the barcode-runs table.
# NOTE(review): assumes an existing `retain` column holds booleans (as in the displayed
# output); a missing value is treated as retained by `astype(bool)` — confirm.
has_enough_counts = (~counts_df['sample'].isin(samples_low_counts)).to_numpy()
if 'retain' in counts_df.columns:
    previously_retained = counts_df['retain'].astype(bool).to_numpy()
    counts_df['retain'] = previously_retained & has_enough_counts
else:
    counts_df['retain'] = has_enough_counts

There are 7 samples with low average valid counts. These samples will be discarded from analysis


Unnamed: 0,barcode,count,sample,library,antibody,concentration,replicate,date,plate,standard_set,retain,valid
0,CGTTTAAACAATGAAG,39064,pdmH1N1_lib2022-D002d0-393660.0-1-230801-Plate1,pdmH1N1_lib2022,D002d0,393660.0,1,2023-08-01,Plate1,pdmH1N1_std,True,valid
1,AGTGTCCCTAAGAGGC,20134,pdmH1N1_lib2022-D002d0-393660.0-1-230801-Plate1,pdmH1N1_lib2022,D002d0,393660.0,1,2023-08-01,Plate1,pdmH1N1_std,True,valid
2,CTGCACGAGAGACTTC,18873,pdmH1N1_lib2022-D002d0-393660.0-1-230801-Plate1,pdmH1N1_lib2022,D002d0,393660.0,1,2023-08-01,Plate1,pdmH1N1_std,True,valid
3,GTCCGTTGATAAAGAG,18188,pdmH1N1_lib2022-D002d0-393660.0-1-230801-Plate1,pdmH1N1_lib2022,D002d0,393660.0,1,2023-08-01,Plate1,pdmH1N1_std,True,valid
4,ATACCTCAACCTTGAA,17964,pdmH1N1_lib2022-D002d0-393660.0-1-230801-Plate1,pdmH1N1_lib2022,D002d0,393660.0,1,2023-08-01,Plate1,pdmH1N1_std,True,valid
...,...,...,...,...,...,...,...,...,...,...,...,...
2241511,TTGTGCACTAAATTAA,1,pdmH1N1_lib2022-Y184d30-131220.0-2-230926-Plate12,pdmH1N1_lib2022,Y184d30,131220.0,2,2023-09-26,Plate12,pdmH1N1_std,True,invalid
2241512,TTGTGGAAATATATAA,1,pdmH1N1_lib2022-Y184d30-131220.0-2-230926-Plate12,pdmH1N1_lib2022,Y184d30,131220.0,2,2023-09-26,Plate12,pdmH1N1_std,True,invalid
2241513,TTGTTCCGAGACAACA,1,pdmH1N1_lib2022-Y184d30-131220.0-2-230926-Plate12,pdmH1N1_lib2022,Y184d30,131220.0,2,2023-09-26,Plate12,pdmH1N1_std,True,invalid
2241514,TTTACTAAGATTTCAT,1,pdmH1N1_lib2022-Y184d30-131220.0-2-230926-Plate12,pdmH1N1_lib2022,Y184d30,131220.0,2,2023-09-26,Plate12,pdmH1N1_std,True,invalid


## Fraction of Counts from Neutralization Standards

Below is a plot of the fraction of each sample's barcodes that correspond to the barcodes in the neutralization standards.

You can select the `plate`, `library`, `date`, and whatever additional columns were specified in the `config`. 

In [95]:
# Barcodes of the neutralization standards, taken from the `barcode` column of the configured file
neut_standards_df = pd.read_csv(filepath_prefix + config["neut_standards"])
neut_standards_barcodes = neut_standards_df["barcode"].tolist()

print(f"Read {len(neut_standards_barcodes)} neutralization-standard barcodes")

Read 10 neutralization-standard barcodes


In [98]:
# Fraction of each retained sample's valid barcode counts that come from the
# neutralization-standard barcodes, with the run information attached.
faction_standards_df = (
    counts_df
        .query("valid == 'valid'")
        .query("retain")
        .assign(is_neut_standard=lambda x: x["barcode"].isin(neut_standards_barcodes))
        # total counts per sample, split into standard / non-standard barcodes
        .groupby(["sample", "is_neut_standard"], as_index=False)
        .aggregate(count=pd.NamedAgg("count", "sum"))
        .assign(total_count=lambda x: x.groupby(["sample"])["count"].transform("sum"))
        # vectorised division; a zero total is masked to NaN first so the fraction is NaN
        .assign(
            fraction_standards=lambda x: x["count"]
            / x["total_count"].where(x["total_count"] != 0)
        )
        # keep only the row describing the standards
        .query("is_neut_standard")
        .drop(columns=["is_neut_standard", "count", "total_count"])
        .merge(barcode_runs_df, validate="one_to_one")
        .sort_values(['library', 'plate', 'date', 'antibody', 'concentration'], ascending=[True, True, True, True, False])
)

Make sure that there are neutralization standards in every sample. If not, there is likely an issue.

In [99]:
# Warn about every analysed sample without neutralization-standard counts. That covers
# three cases: the fraction is NaN (the sample's total valid count is zero), the fraction
# is exactly zero (standard barcodes listed but never observed), or the sample has no row
# at all because none of its valid barcodes is a standard.
standards_fraction = faction_standards_df["fraction_standards"]
lacks_standards = standards_fraction.isna() | (standards_fraction == 0)
samples_without_standards = faction_standards_df.loc[lacks_standards, "sample"].tolist()

# Samples that entered the calculation (valid barcodes of retained rows) but produced no row
samples_with_row = set(faction_standards_df["sample"])
analysed_samples = counts_df.query("valid == 'valid'").query("retain")["sample"].unique()
samples_without_standards += [
    sample for sample in analysed_samples if sample not in samples_with_row
]

if samples_without_standards:
    print("WARNING: There are no neutralization standards for the following samples:")
    print(samples_without_standards)


In [100]:
# Point plot of the fraction of counts from the neutralization standards, one row per sample.
# `sort=None` keeps the samples in the row order of the dataframe; the table has no
# `rank` column, so sorting on that field could not reproduce the prepared order.
# NOTE(review): `Chart.add_params` is the Altair 5 API name; the AttributeError recorded
# under this cell suggests the kernel had an older Altair — confirm.
fraction_standards_chart = alt.Chart(faction_standards_df).encode(
    y=alt.Y('sample:N', title=None, sort=None),
    x=alt.X(
        "fraction_standards:Q",
        title="fraction counts from neutralization standard",
        scale=alt.Scale(type="symlog", constant=0.02, domainMax=.8)
        ),
    tooltip=[alt.Tooltip(c, format=".3g") if c in ["fraction_standards"] else c
            for c in faction_standards_df.columns if c != "library_sample"],
).mark_point(filled=True, size=50, opacity=0.7
).properties(width=245, height=alt.Step(15)
).configure_axis(labelLimit=500
).add_params(*selections)

# The dropdown selections were registered above; here they only filter the data
for selection in selections:
    fraction_standards_chart = fraction_standards_chart.transform_filter(selection)

display(fraction_standards_chart)

AttributeError: 'Chart' object has no attribute 'add_params'

## Combine Counts Files

Combine the per-sample barcode counts (valid and invalid, labeled by the `valid` column) into a single file for downstream analyses.


In [101]:
# Write to the path passed in as `joined_counts` when run with snakemake,
# otherwise to `barcode_counts.csv` inside the configured counts directory.
if snakemake:
    combined_counts_path = joined_counts
else:
    combined_counts_path = os.path.join(
        filepath_prefix, config["barcode_counts_dir"], "barcode_counts.csv"
    )

counts_df.to_csv(combined_counts_path, index=False)

## Combine Counts Files By Plate

Combine the per-sample barcode counts (valid and invalid, labeled by the `valid` column) into a single file for each plate for downstream analyses.

In [102]:
# Write one `<plate>_barcode_counts.csv` per plate into the configured counts directory.
# The output location does not depend on whether the notebook runs under snakemake,
# so a single loop replaces the two identical branches.
plates = counts_df['plate'].unique().tolist()
plate_counts_dir = os.path.join(filepath_prefix, config["barcode_counts_dir"])

for plate in plates:
    filename = plate + "_barcode_counts.csv"
    counts_per_plate = counts_df.loc[counts_df['plate'] == plate]
    counts_per_plate.to_csv(os.path.join(plate_counts_dir, filename), index=False)