# Process plate counts to get ratios of variants in initial pool

An initial pooled library was made by adding equal volumes of all variants and then infecting this pool on MDCK-SIAT1 cells. These infections were done by serially diluting a starting 50 uL volume of pool. Barcodes were then isolated and sequenced so that we can determine representation of each variant in the pool, as well as the appropriate library pool dilution (i.e., MOI) to use in neutralization assays. 

The plots generated by this notebook are interactive, so you can mouseover points for details, use the mouse-scroll to zoom and pan, and use interactive dropdowns at the bottom of the plots.

## Setup
Import Python modules:

In [1]:
import pickle
import sys

import altair as alt

import matplotlib.pyplot as plt

import numpy
import string
import pandas as pd
from os.path import join
import os
import ruamel.yaml as yaml

_ = alt.data_transformers.disable_max_rows()

# Basic color palette
color_palette = [
    '#345995', #blue
    '#03cea4', #teal
    '#ca1551', #red
    '#eac435', #yellow
               ]

In [2]:
resultsdir = '../results'
os.makedirs(resultsdir, exist_ok=True)

## Add input data locations
Some of these files are defined as data, and some of these files are generated by running the specified library pooling data as `miscellaneous_plates` through the `seqneut-pipeline`. For details on how these files are generated, see the `README.md' in [https://github.com/jbloomlab/seqneut-pipeline](https://github.com/jbloomlab/seqneut-pipeline)

In [3]:
# Viral library contents and barcode IDs
viral_library_csv =  '../../../data/viral_libraries/flu-seqneut-2025-barcode-to-strain.csv'
# Neutralization standard set of barcode IDs
neut_standard_set_csv =  '../../../data/neut_standard_sets/loes2023_neut_standards.csv'
# All samples included in library poolign sequencing run
# Contains information on library, dilution factor, R1 location
samplesfile = '../../../data/miscellaneous_plates/2025-07-16_initial_pool.csv'

## CHANGE THIS EACH TIME YOU RUN NB ##
# Counts and fates files output by running library pooling samples as miscellaneous plates
platedir = '../../../results/miscellaneous_plates/20250716_initial_pool/'

# Identify all counts and fates CSVs
count_csvs = []
fate_csvs = []
file_list = os.listdir(platedir)
for f in file_list:
    location = platedir + f
    if "_counts" in f:
        count_csvs.append(location)
    elif "_fates" in f:
        fate_csvs.append(location)

In [4]:
# Define a samples dataframe using the samples file
samples_df = pd.read_csv(samplesfile)
samples_df.drop(columns=['fastq'], inplace=True)
samples_df['sample'] = samples_df.apply(
    lambda x: '-'.join(x.astype(str)), axis=1
)

samples = samples_df["sample"].unique().tolist()
print(f"There are {len(samples)} barcode runs.")

samples_df

There are 16 barcode runs.


Unnamed: 0,well,serum,dilution_factor,replicate,sample
0,A1,none,4,1,A1-none-4-1
1,B1,none,4,2,B1-none-4-2
2,A2,none,8,1,A2-none-8-1
3,B2,none,8,2,B2-none-8-2
4,A3,none,16,1,A3-none-16-1
5,B3,none,16,2,B3-none-16-2
6,A4,none,32,1,A4-none-32-1
7,B4,none,32,2,B4-none-32-2
8,A5,none,64,1,A5-none-64-1
9,B5,none,64,2,B5-none-64-2


## Statistics on barcode-parsing for each sample
Make interactive chart of the "fates" of the sequencing reads parsed for each sample on the plate.

If most sequencing reads are not "valid barcodes", this could potentially indicate some problem in the sequencing or barcode set you are parsing.

Potential fates are:
 - *valid barcode*: barcode that matches a known virus or neutralization standard, we hope most reads are this.
 - *invalid barcode*: a barcode with proper flanking sequences, but does not match a known virus or neutralization standard. If you  have a lot of reads of this type, it is probably a good idea to look at the invalid barcode CSVs (in the `./results/barcode_invalid/` subdirectory created by the pipeline) to see what these invalid barcodes are.
 - *unparseable barcode*: could not parse a barcode from this read as there was not a sequence of the correct length with the appropriate flanking sequence.
 - *low quality barcode*: low-quality or `N` nucleotides in barcode, could indicate problem with sequencing.
 - *failed chastity filter*: reads that failed the Illumina chastity filter, if these are reported in the FASTQ (they may not be).

Also, if the number of reads per sample is very uneven, that could indicate that you did not do a good job of balancing the different samples in the Illumina sequencing.

In [5]:
fates = (
    pd.concat([pd.read_csv(f).assign(well=f.strip(platedir).strip('_fates.csv')) for f, s in zip(fate_csvs, samples)])
    .merge(samples_df, validate="many_to_one", on="well")
    .assign(
        fate_counts=lambda x: x.groupby("fate")["count"].transform("sum"),
        sample_well=lambda x: x["sample"] + " (" + x["well"] + ")",
    )
    .query("fate_counts > 0")[  # only keep fates with at least one count
        ["fate", "count", "well", "sample_well", "dilution_factor"]
    ]
)

assert len(fates) == len(fates.drop_duplicates())


sample_wells = list(
    fates.sort_values(["dilution_factor"])["sample_well"]
)



fates_chart = (
    alt.Chart(fates)
    .encode(
        alt.X("count", scale=alt.Scale(nice=False, padding=3)),
        alt.Y(
            "sample_well",
            title=None,
            sort=sample_wells,
        ),
        alt.Color("fate", sort=sorted(fates["fate"].unique(), reverse=True)),
        alt.Order("fate", sort="descending"),
        tooltip=fates.columns.tolist(),
    )
    .mark_bar(height={"band": 0.85})
    .properties(
        height=alt.Step(10),
        width=200,
        title=f"Barcode parsing for initial titering plate",
    )
    .configure_axis(grid=False)
)

fates_chart

## Read barcode counts
Read the counts per barcode:

In [6]:
# get barcode counts
counts = (
    pd.concat([pd.read_csv(c).assign(well=c.strip(platedir).strip('_counts.csv')) for c, s in zip(count_csvs, samples)])
    .merge(samples_df, validate="many_to_one", on="well")
    .drop(columns=["replicate"])
    .assign(sample_well=lambda x: x["sample"] + " (" + x["well"] + ")")
)

# classify barcodes as viral or neut standard
barcode_class = pd.concat(
    [
        pd.read_csv(viral_library_csv)[["barcode", "strain"]].assign(
            neut_standard=False,
        ),
        pd.read_csv(neut_standard_set_csv)[["barcode"]].assign(
            neut_standard=True,
            strain=pd.NA,
        ),
    ],
    ignore_index=True,
)

# merge counts and classification of barcodes
assert set(counts["barcode"]) == set(barcode_class["barcode"])
counts = counts.merge(barcode_class, on="barcode", validate="many_to_one")
assert set(sample_wells) == set(counts["sample_well"])

In [7]:
counts

Unnamed: 0,barcode,count,well,serum,dilution_factor,sample,sample_well,strain,neut_standard
0,ACTGTCTAGAAATTTT,225762,A8,none,512,A8-none-512-1,A8-none-512-1 (A8),A/France/PAC-RELAB-HCL024172122101/2024_H3N2,False
1,ATTATCATATCTAATA,178552,A8,none,512,A8-none-512-1,A8-none-512-1 (A8),A/India/Pune-NIV24_3439/2024_H3N2,False
2,AAGTTAAGAGAAAGTT,136484,A8,none,512,A8-none-512-1,A8-none-512-1 (A8),A/Victoria/4897/2022_IVR-238_H1N1,False
3,CAGAACCTCGTTGTCT,117826,A8,none,512,A8-none-512-1,A8-none-512-1 (A8),A/Switzerland/47775/2024_H3N2,False
4,GTACAAACCTGCAAAT,114924,A8,none,512,A8-none-512-1,A8-none-512-1 (A8),,True
...,...,...,...,...,...,...,...,...,...
5243,GCTTGTCGCAAACAGC,0,A2,none,8,A2-none-8-1,A2-none-8-1 (A2),A/Texas/15550/2024_H3N2,False
5244,GTCTCCTGACTAAAAA,0,A2,none,8,A2-none-8-1,A2-none-8-1 (A2),A/Brisbane/02/2018_IVR-190_H1N1,False
5245,TAGGCATCGCTGTGTC,0,A2,none,8,A2-none-8-1,A2-none-8-1 (A2),A/New_York/191/2024_H3N2,False
5246,TATATAAGCCTGTGAG,0,A2,none,8,A2-none-8-1,A2-none-8-1 (A2),A/Brisbane/02/2018_IVR-190_H1N1,False


## Average counts per barcode in each well

Plot average counts per barcode.
If a sample has inadequate barcode counts, it may not have good enough statistics for accurate analysis, and a QC-threshold is applied:

In [8]:
avg_barcode_counts = (
    counts.groupby(
        ["well", "sample_well"],
        dropna=False,
        as_index=False,
    )
    .aggregate(avg_count=pd.NamedAgg("count", "mean"))
    .assign(
        fails_qc=lambda x: (
            x["avg_count"] < 500
        ),
    )
)

avg_barcode_counts_chart = (
    alt.Chart(avg_barcode_counts)
    .encode(
        alt.X(
            "avg_count",
            title="average barcode counts per well",
            scale=alt.Scale(nice=False, padding=3),
        ),
        alt.Y("sample_well", sort=sample_wells),
        alt.Color(
            "fails_qc",
            title=f"fails {'min barcode count threshold'=}",
            legend=alt.Legend(titleLimit=500),
        ),
        tooltip=[
            alt.Tooltip(c, format=".3g") if avg_barcode_counts[c].dtype == float else c
            for c in avg_barcode_counts.columns
        ],
    )
    .mark_bar(height={"band": 0.85})
    .properties(
        height=alt.Step(10),
        width=250,
        title=f"Average barcode counts per well for titering plate",
    )
    .configure_axis(grid=False)
)

display(avg_barcode_counts_chart)

# drop wells failing QC
avg_barcode_counts_per_well_drops = list(avg_barcode_counts.query("fails_qc")["well"])

## Fraction of counts from neutralization standard
Determine the fraction of counts from the neutralization standard in each sample, and make sure this fraction passess the QC threshold.

In [9]:
neut_standard_fracs = (
    counts.assign(
        neut_standard_count=lambda x: x["count"] * x["neut_standard"].astype(int)
    )
    .groupby(
        ["well", "sample_well", 'dilution_factor'],
        dropna=False,
        as_index=False,
    )
    .aggregate(
        total_count=pd.NamedAgg("count", "sum"),
        neut_standard_count=pd.NamedAgg("neut_standard_count", "sum"),
    )
    .assign(
        neut_standard_frac=lambda x: x["neut_standard_count"] / x["total_count"],
        fails_qc=lambda x: (
            x["neut_standard_frac"] < 0.001
        ),
    )
)

neut_standard_fracs_chart = (
    alt.Chart(neut_standard_fracs)
    .encode(
        alt.X(
            "neut_standard_frac",
            title="frac counts from neutralization standard per well",
            scale=alt.Scale(nice=False, padding=3),
        ),
        alt.Y("sample_well", sort=sample_wells),
        alt.Color(
            "fails_qc",
            title=f"fails {'min_neut_standard_frac_per_well'=}",
            legend=alt.Legend(titleLimit=500),
        ),
        tooltip=[
            alt.Tooltip(c, format=".3g") if neut_standard_fracs[c].dtype == float else c
            for c in neut_standard_fracs.columns
        ],
    )
    .mark_bar(height={"band": 0.85})
    .properties(
        height=alt.Step(10),
        width=250,
        title=f"Neutralization-standard fracs per well for titering plate, initial pool",
    )
    .configure_axis(grid=False)
    .configure_legend(titleLimit=1000)
)

display(neut_standard_fracs_chart)

# drop wells failing QC
min_neut_standard_frac_per_well_drops = list(
    neut_standard_fracs.query("fails_qc")["well"]
)

In [10]:
# Scatterplot of the same data as above, plotted by dilution factor
alt.Chart(neut_standard_fracs).mark_circle(size=60).encode(
    alt.X('dilution_factor:Q', 
          scale=alt.Scale(type='log'),
          title='library pool reciprocal dilution factor'),
    alt.Y('neut_standard_frac:Q', 
          scale=alt.Scale(type='log'),
          title='fraction of reads = neutralization standard'),
    color='fails_qc',
    tooltip=['well', 'dilution_factor', 'neut_standard_frac', 'total_count']
).interactive()

## Rebalancing strains contained in the library
Viruses were rescued and blind passaged individually. To make the initial pool, we added equal volumes of all strains together and re-infected MDCK-SIAT1 cells. Now we can assess the contribution of each strain to the pool, and determine how much should be added of each virus to achieve more equal balancing. 

Each of the 3 viral barcodes associated with each strain were pooled prior to rescue, so they cannot be balanced. 

In [11]:
# Get summed barcode counts for all strains across all wells
straincounts_allbarcodes = (counts.groupby(['sample','sample_well','strain','dilution_factor','serum','well'])
                          .sum()
                          .reset_index()
                          .drop(columns = ['sample_well', 'neut_standard', 'barcode'])
                         )

# Get sum of all virus/barcode counts per well
sumperwell = (straincounts_allbarcodes.groupby(['sample','dilution_factor','serum','well'])
              .sum()
              .drop(columns=['strain'])
              .reset_index()
              .rename(columns={'count':'counts_perwell'})
             )

# Merge dataframes and calculate fraction of each well devoted to each strain
merged_df = straincounts_allbarcodes.merge(sumperwell, on=['sample','dilution_factor','serum','well'])
merged_df['fraction_strain'] = merged_df['count'] /merged_df['counts_perwell'] / 2
merged_df

Unnamed: 0,sample,strain,dilution_factor,serum,well,count,counts_perwell,fraction_strain
0,A1-none-4-1,A/Amapa/021563-IEC/2024_H3N2,4,none,A1,40645,5016449,0.004051
1,A1-none-4-1,A/Badajoz/18680568/2025_H3N2,4,none,A1,53312,5016449,0.005314
2,A1-none-4-1,A/Bangkok/P176/2025_H1N1,4,none,A1,343,5016449,0.000034
3,A1-none-4-1,A/Brisbane/02/2018_H1N1,4,none,A1,3508,5016449,0.000350
4,A1-none-4-1,A/Brisbane/02/2018_IVR-190_H1N1,4,none,A1,0,5016449,0.000000
...,...,...,...,...,...,...,...,...
2315,B8-none-512-2,A/Wisconsin/588/2019_H1N1,512,none,B8,2256,2958109,0.000381
2316,B8-none-512-2,A/Wisconsin/67/2022_H1N1,512,none,B8,9602,2958109,0.001623
2317,B8-none-512-2,A/Wisconsin/NIRC-IS-1028/2024_H3N2,512,none,B8,1421,2958109,0.000240
2318,B8-none-512-2,A/Wisconsin/NIRC-IS-1111/2025_H1N1,512,none,B8,884,2958109,0.000149


We now have this fraction of reads devoted to all strains calculated for all wells. However, ideally we should just focus on those wells containing dilutions that we would use for actual neutralization assays. We should choose a set of replicate wells where the fraction of neutralization standard reads begins to increase linearly with the increasing reciprocal dilution factor. See plots above for choosing these wells. 

In [12]:
# Using A9 and B9, corresponding to reciprocal dilution factor = 256
single_well = merged_df.loc[merged_df['sample'].str.contains('A4-|B4-')]

In [13]:
# Calculate mean fraction strain across both wells
mean_df = single_well.groupby(['strain'])['fraction_strain'].mean().to_frame().rename(columns = {'fraction_strain': 'mean_fraction_strains'}).reset_index()
mean_single_well = single_well.merge(mean_df, on = 'strain', how = 'left')

# calcualte ratios to add for equal pool
num_strains = len(mean_single_well.strain.unique())
mean_single_well['ratio_to_add'] = (1/num_strains)/mean_single_well['fraction_strain']
mean_single_well['mean_ratio_to_add'] = (1/num_strains)/mean_single_well['mean_fraction_strains']

mean_single_well['est_tcid50'] = (mean_single_well['mean_fraction_strains']*25000)*76

print(f'this library has {num_strains} total strains')
print('stats where there isnt 0 of a virus...')
print(mean_single_well.query('mean_ratio_to_add != inf')[['mean_ratio_to_add']].describe())

print('\nviruses with 0 titer...')
print(mean_single_well.query('mean_ratio_to_add == inf').strain.unique())

ratio_cutoff = 250
print(f'\nviruses with >0 titer but ratio >={ratio_cutoff} to increase...')
print(mean_single_well.query('mean_ratio_to_add != inf').query(f'mean_ratio_to_add >= {ratio_cutoff}').strain.unique())

this library has 145 total strains
stats where there isnt 0 of a virus...
       mean_ratio_to_add
count         284.000000
mean           49.490931
std           121.586638
min             0.309454
25%             1.330844
50%             4.286480
75%            46.681513
max          1139.195387

viruses with 0 titer...
['A/Brisbane/02/2018_IVR-190_H1N1' 'A/Brisbane/10/2007_H3N2'
 'A/Brisbane/59/2007_H1N1']

viruses with >0 titer but ratio >=250 to increase...
['A/Colorado/218/2024_H1N1' 'A/Madagascar/00003/2025_H1N1'
 'A/Maryland/64/2024_H1N1' 'A/Oregon/11/2025_H1N1'
 'A/Queensland/IN000684/2024_H1N1' 'A/Santiago/101713/2024_H1N1'
 'A/Ufa/CRIE/47/2024_H1N1']


In [14]:
print('Adding 4x of each strain ratio...')
print(sum(
    mean_single_well.query('mean_ratio_to_add != inf').query('well == "B4"')
    .mean_ratio_to_add
) * 4
     )
print('Assuming worse case scenario of 1:16 on 150k cells...')
print((50/16)*100)
print('How many plates can I run?')
print(28110/312)

Adding 4x of each strain ratio...
28110.849013062932
Assuming worse case scenario of 1:16 on 150k cells...
312.5
How many plates can I run?
90.09615384615384


## Re-pooling calculations

In [15]:
# Get library IDs
ordersheetsdir = '../../library_design/results/ordersheets'

lib_id_df = pd.concat([
    pd.read_csv(os.path.join(ordersheetsdir, 'h1_inserts.csv')),
    pd.read_csv(os.path.join(ordersheetsdir, 'h3_inserts.csv')),
    pd.read_csv(os.path.join(ordersheetsdir, 'vaccine_inserts.csv')),
    pd.read_csv(os.path.join(ordersheetsdir, 'lisboa_inserts.csv')),
]).reset_index(drop=True)


# Combine with repool calculations
repool_df = (mean_single_well
             .query('mean_ratio_to_add != inf')
             .query('well == "B4"')
             [['strain','fraction_strain','mean_ratio_to_add']]
             .assign(x4_volume_to_add = lambda x: x['mean_ratio_to_add'] * 4)
             .merge(lib_id_df, how='outer')
             .assign(
                 subtype = lambda x: x['name'].str.split('_').str[0],
                 number = lambda x: pd.to_numeric(
                     x['name'].str.split('_').str[1], errors='coerce'
                 ).fillna(1e6).astype(int),  # use a big number to push NaNs to bottom
                 strain_id = lambda x: x['subtype'] + '_' + x['number'].astype(str)
             )
             .sort_values(by=['subtype', 'number'], ascending=True)
             .drop(columns=['sequence', 'genbank', 'subtype', 'number', 'name'])  # number is now temporary
             .drop_duplicates()
             .reset_index(drop=True)
)

pooling_mathdir = '../results/pooling_math'
os.makedirs(pooling_mathdir, exist_ok=True)
repool_df.to_csv(os.path.join(pooling_mathdir, '2025-07-16_repooling_math.csv'), index=False)
repool_df

Unnamed: 0,strain,fraction_strain,mean_ratio_to_add,x4_volume_to_add,strain_id
0,A/Maryland/64/2024_H1N1,0.000018,265.917212,1063.668847,H1N1_1
1,A/Qinghai-Chengzhong/SWL1410/2024_H1N1,0.000069,97.558635,390.234539,H1N1_2
2,A/Ulsan/492/2025_H1N1,0.000064,112.602363,450.409452,H1N1_3
3,A/Massachusetts/ISC-1679/2025_H1N1,0.000125,46.681513,186.726051,H1N1_4
4,A/Singapore/MOH0547/2024_H1N1,0.000100,59.931104,239.724416,H1N1_5
...,...,...,...,...,...
137,A/Texas/50/2012_H3N2,0.006121,1.330844,5.323377,
138,A/Thailand/8/2022_H3N2,0.000470,13.985544,55.942176,
139,A/Victoria/4897/2022_IVR-238_H1N1,0.021529,0.309454,1.237815,
140,A/Wisconsin/588/2019_H1N1,0.000728,11.848132,47.392528,


## Visualize barcode- and strain-level balancing in the current pool

In [16]:
# Plot the current fraction of each strain in the pool
strains_chart = (
    alt.Chart(mean_single_well)
    .encode(
        alt.X(
            "fraction_strain",
            scale=alt.Scale(nice=False, padding=3),
        ),
        alt.Y("strain"),
        
        tooltip = ['strain', 'fraction_strain', 'est_tcid50'],
    )
).mark_bar(height={"band": 0.85}).properties(
        height=alt.Step(10),
        width=250,
        title="",
    ).properties(
        height = alt.Step(10),
        width = 200,
        title = "Strain representation, initial pool")

# add veritcal line where we would expect equal representation of all barcodes in pool
expected_line = alt.Chart(
    pd.DataFrame({'x': [1/num_strains]})
).mark_rule(strokeDash = [2,2], strokeWidth = 2).encode(x = 'x')

# plot both barcode counts and expected line
strains_chart + expected_line

In [17]:
# Each barcode fraction across strains
all_barcode_counts = counts[['strain', 'barcode', 'count', 'well']].dropna()
single_well_all_barcode_counts = all_barcode_counts[all_barcode_counts['well'].isin(['A5','B5'])]

# Get tidy single well means
tidy_single_well = single_well_all_barcode_counts[['strain','barcode','count']].groupby(['strain', 'barcode']).mean().reset_index()
# Get sums for each strain
strain_sums_df = tidy_single_well.groupby('strain').sum().rename(columns = {'count': 'strain_count_sum'}).reset_index()
# Merge and calculate per strain the fraction represented by each barcode
tidy_single_well = tidy_single_well.merge(strain_sums_df[['strain', 'strain_count_sum']], 
                       on = ['strain'],
                       validate="many_to_one",
                      )
tidy_single_well['per_strain_fraction_barcode'] = tidy_single_well['count'] / tidy_single_well['strain_count_sum']
tidy_single_well['barcode_letter'] = tidy_single_well.groupby('strain').cumcount().apply(lambda x: string.ascii_uppercase[x])

# Plot as colored bar chart
bar_chart = alt.Chart(tidy_single_well).mark_bar(height={"band": 0.85}).encode(
    x = 'per_strain_fraction_barcode',
    y = 'strain',
    color=alt.Color('barcode_letter', legend=None).scale(range=color_palette),
    tooltip = ['strain', 'per_strain_fraction_barcode', 'barcode'],
).configure_axis(grid=False).properties(
        height = alt.Step(10),
        width = 200,
        title = "Barcode fraction for each strain, initial pool")

bar_chart