# Summarize titers from experimental replicates using the same library and different pools of library strains
Make summary plots of the titers from all experiments to compare the effect of library pooling.

In [100]:
import altair as alt

import neutcurve
import numpy as np
import pandas as pd
from scipy import stats
import pickle

# Turn off Altair's limit on the number of rows a chart's data may contain so that
# larger tables can be plotted. Assigning to `_` keeps the return value out of the cell output.
_ = alt.data_transformers.disable_max_rows()

In [136]:
# Input and output paths supplied through the `snakemake` object.
# TODO: consider adding a file describing the conditions so that plots can be made per condition.
input_titers = snakemake.input.input_titers
viral_strain_plot_order = snakemake.input.viral_strain_plot_order
curvefits_pickle = snakemake.input.curvefits_pickle
titers_chart_variation_html = snakemake.output.titers_chart_html

# Titer table, plus the list of strains that is used to sort the virus axis in the final chart.
titers = pd.read_csv(input_titers)
viruses_to_plot = pd.read_csv(viral_strain_plot_order)
viruses = viruses_to_plot["strain"].tolist()

# Curve fits, used to assess differences in titers between replicates and barcodes.
# NOTE: unpickling can execute arbitrary code, so this file must come from a trusted source.
with open(curvefits_pickle, "rb") as pickle_file:
    fits_seqbasedneut = pickle.load(pickle_file)

In [172]:
# Build the tables used to compare titers between replicate barcodes and between replicate
# dilution series, starting from the individual (non-averaged) curve fits.
fitsdfseqneut = fits_seqbasedneut.fitParams(no_average=True, average_only=False)

# Keep only sera whose name matches one of the four library-pooling conditions.
sera_withreps = fitsdfseqneut.loc[
    fitsdfseqneut['serum'].str.contains('fulllibconc|halflibconc|no5a1|no5a2')
]

# Fit-parameter columns that are not needed for the replicate 1 vs replicate 2 comparison.
_REPLICATE_DROP_COLUMNS = [
    'nreplicates', 'slope', 'rmsd', 'ic50_bound', 'ic50_str',
    'midpoint_bound', 'midpoint_bound_type', 'top', 'bottom', 'r2',
]


def _fits_for_replicate(fits, replicate_tag, nt50_column):
    """Select the fits of one replicate and add its NT50 and barcode columns.

    Parameters
    ----------
    fits : pandas.DataFrame
        Per-curve fit parameters; must contain `replicate`, `midpoint` and every
        column listed in `_REPLICATE_DROP_COLUMNS`.
    replicate_tag : str
        Pattern searched for in the `replicate` label (for example '-1-').
    nt50_column : str
        Name of the new column that receives `1 / midpoint`.

    Returns
    -------
    pandas.DataFrame
        A new frame; `fits` itself is not modified.
    """
    selected = fits.loc[fits['replicate'].str.contains(replicate_tag)].drop(
        columns=_REPLICATE_DROP_COLUMNS
    )
    # NT50 is computed as the reciprocal of the fitted midpoint.
    selected[nt50_column] = 1 / selected['midpoint']
    # Third '-'-separated field of the replicate label
    # (presumably the label is <prefix>-<replicate number>-<barcode>; confirm against the fits).
    selected['barcode'] = selected['replicate'].str.split("-").str[2]
    return selected


sera_with_rep1 = _fits_for_replicate(sera_withreps, '-1-', 'NT50_rep1')
sera_with_rep2 = _fits_for_replicate(sera_withreps, '-2-', 'NT50_rep2')

# One row per (serum, virus, barcode): replicate 1 values carry the `_x` suffix / NT50_rep1,
# replicate 2 values the `_y` suffix / NT50_rep2. The default inner join drops barcodes that
# were fit in only one of the two replicates.
merged_replicates = sera_with_rep1.merge(sera_with_rep2, on=['serum','virus','barcode'])

# Long-format table with one row per curve, keeping only the midpoint, the NT50 and the
# barcode / replicate number parsed from the replicate label.
sera_withreps_simplified = (
    sera_withreps
    .drop(columns=['ic50','ic50_bound','ic50_str','midpoint_bound','midpoint_bound_type','slope','top','bottom','r2','rmsd'])
    .assign(
        barcode=lambda fits: fits['replicate'].str.split("-").str[2],
        replicate_n=lambda fits: fits['replicate'].str.split("-").str[1],
        NT50=lambda fits: 1 / fits['midpoint'],
    )
    .drop(columns=['nreplicates','replicate'])
)

# Median over the curves of each replicate, then median over the replicates of each serum/virus pair.
sera_withreps_simplified_NT50perrep = sera_withreps_simplified.groupby(['serum','virus','replicate_n']).median(numeric_only=True).reset_index()
sera_withreps_simplified_NT50all = sera_withreps_simplified_NT50perrep.groupby(['serum','virus']).median(numeric_only=True).reset_index()
sera_withreps_simplified_NT50all

Unnamed: 0,serum,virus,midpoint,NT50
0,fulllibconc,A/Bangladesh/2221/2021,0.000147,7007.838864
1,fulllibconc,A/Bangladesh/3210810034/2021,0.000112,9320.383223
2,fulllibconc,A/Bangladesh/8002/2021,0.000070,14381.443883
3,fulllibconc,A/Bangladesh/8036/2021,0.000074,13635.912710
4,fulllibconc,A/Belgium/H0017/2022,0.000440,2350.561455
...,...,...,...,...
137,no5a2,A/Togo/0304/2021,0.000027,42465.106915
138,no5a2,A/Togo/845/2020,0.000026,51344.262555
139,no5a2,A/Utah/27/2022,0.000059,17067.909278
140,no5a2,A/Washington/23/2020,0.000051,19578.451486


In [138]:
# Keep the barcode-level merged data frame under its own name, then collapse it to the
# median per (serum, virus) for plotting.
# NOTE(review): this cell rebinds `merged_replicates`, so running it a second time without
# first re-running the cell that builds the merged table makes `drop` raise a KeyError.
merged_replicates_withbarcodes = merged_replicates
barcode_level_columns = ['replicate_x', 'replicate_y', 'barcode']
merged_replicates = (
    merged_replicates
    .drop(columns=barcode_level_columns)
    .groupby(['serum', 'virus'])
    .median()
    .reset_index()
)
merged_replicates

Unnamed: 0,serum,virus,ic50_x,midpoint_x,NT50_rep1,ic50_y,midpoint_y,NT50_rep2
0,fulllibconc,A/Bangladesh/2221/2021,0.000158,0.000172,5821.014761,0.000105,0.000122,8194.662968
1,fulllibconc,A/Bangladesh/3210810034/2021,0.000109,0.000111,9099.161318,0.000112,0.000112,9541.605128
2,fulllibconc,A/Bangladesh/8002/2021,0.000052,0.000064,15600.939188,0.000076,0.000076,13161.948579
3,fulllibconc,A/Bangladesh/8036/2021,0.000068,0.000069,14400.355875,0.000077,0.000078,12871.469546
4,fulllibconc,A/Belgium/H0017/2022,0.000329,0.000359,2784.615546,0.000521,0.000522,1916.507364
...,...,...,...,...,...,...,...,...
137,no5a2,A/Togo/0304/2021,0.000012,0.000017,58521.914311,0.000034,0.000038,26408.299519
138,no5a2,A/Togo/845/2020,0.000013,0.000013,76619.488900,0.000037,0.000038,26069.036211
139,no5a2,A/Utah/27/2022,0.000042,0.000056,17895.964908,0.000060,0.000062,16239.853648
140,no5a2,A/Washington/23/2020,0.000043,0.000056,17911.507539,0.000047,0.000047,21245.395434


In [139]:
# Replicate 1 vs replicate 2 NT50 by strain, one facet per serum.
# Both axes use the same log scale so the panels can be read against the diagonal.
strain_nt50_domain = [400, 400000]
(
    alt.Chart(merged_replicates)
    .mark_point(filled=True, size=75, opacity=0.4)
    .encode(
        x=alt.X(
            'NT50_rep1',
            scale=alt.Scale(type="log", domain=strain_nt50_domain),
            axis=alt.Axis(title="NT50 (replicate 1, by strain)"),
        ),
        y=alt.Y(
            'NT50_rep2',
            scale=alt.Scale(type="log", domain=strain_nt50_domain),
            axis=alt.Axis(title="NT50 (replicate 2, by strain)"),
        ),
        facet=alt.Facet('serum:N'),
    )
    .properties(width=200, height=200)
    .configure_axis(grid=False, domain=False, labelFontSize=14, titleFontSize=12)
    .configure_legend(titleAlign='left', labelLimit=0, titleFontSize=10, labelFontSize=10)
)

In [140]:
# Replicate 1 vs replicate 2 NT50 by barcode (no aggregation), one facet per serum.
# Both axes use the same log scale so the panels can be read against the diagonal.
barcode_nt50_domain = [400, 400000]
(
    alt.Chart(merged_replicates_withbarcodes)
    .mark_point(filled=True, size=75, opacity=0.4)
    .encode(
        x=alt.X(
            'NT50_rep1',
            scale=alt.Scale(type="log", domain=barcode_nt50_domain),
            axis=alt.Axis(title="NT50 (replicate 1, by barcode)"),
        ),
        y=alt.Y(
            'NT50_rep2',
            scale=alt.Scale(type="log", domain=barcode_nt50_domain),
            axis=alt.Axis(title="NT50 (replicate 2, by barcode)"),
        ),
        facet=alt.Facet('serum:N'),
    )
    .properties(width=200, height=200)
    .configure_axis(grid=False, domain=False, labelFontSize=14, titleFontSize=12)
    .configure_legend(titleAlign='left', labelLimit=0, titleFontSize=10, labelFontSize=10)
)

In [141]:
# Pearson correlation of the NT50s between replicate 1 and replicate 2, printed per serum.
# Half library is particularly bad (may be due to less coverage).
# `sort=False` keeps the sera in their order of first appearance.
for serum_name, serum_rows in merged_replicates.groupby('serum', sort=False):
    print(serum_name)
    # Look the coefficient up by label so the result does not depend on column order.
    print(serum_rows.corr(method="pearson", numeric_only=True).loc['NT50_rep1', 'NT50_rep2'])

fulllibconc
0.6518267802837482
halflibconc
0.23129989590045516
no5a1
0.7401895966195636
no5a2
0.7214486427043775


In [142]:
# Reshape to one row per virus with one titer column per serum, so that NT50s from the
# different library designs can be correlated and plotted against each other.
# pivot_table aggregates with its default (mean) if a virus/serum pair occurs more than once.
titers_conditions = titers.pivot_table(
    values='titer',
    index=['virus'],
    columns='serum',
).reset_index()

In [143]:
# Pairwise Pearson correlation of the titers between the different library designs.
library_design_corr = titers_conditions.corr(method="pearson", numeric_only=True)
library_design_corr.reset_index()

serum,serum.1,fulllibconc,halflibconc,no5a1,no5a2
0,fulllibconc,1.0,0.638607,0.811835,0.601052
1,halflibconc,0.638607,1.0,0.721149,0.67431
2,no5a1,0.811835,0.721149,1.0,0.783435
3,no5a2,0.601052,0.67431,0.783435,1.0


In [144]:
# Heatmap of the pairwise Pearson correlations of the titers between library designs.
corr_df = titers_conditions.corr(method="pearson", numeric_only=True)

# Record the columns to fold before adding the row labels as an ordinary column (`cat`).
pivot_cols = list(corr_df.columns)
corr_df = corr_df.assign(cat=corr_df.index)

# Folding produces the `key`/`value` fields: `key` is the folded column name and
# `value` the correlation, coloured on a fixed 0-1 scale.
correlation_color = alt.Color("value:Q", scale=alt.Scale(domain=[0, 1]))
(
    alt.Chart(corr_df)
    .mark_rect(tooltip=True)
    .transform_fold(pivot_cols)
    .encode(
        x=alt.X("cat:N", title=None),
        y=alt.Y('key:N', title=None),
        color=correlation_color,
    )
    .properties(width=200, height=200)
)


In [145]:
# Full library vs half-concentration library NT50, both on the same log scale.
full_vs_half_domain = [1000, 100000]
(
    alt.Chart(titers_conditions)
    .mark_point(filled=True, size=75, opacity=0.4)
    .encode(
        x=alt.X(
            'fulllibconc',
            scale=alt.Scale(type="log", domain=full_vs_half_domain),
            axis=alt.Axis(title="Full Library NT50"),
        ),
        y=alt.Y(
            'halflibconc',
            scale=alt.Scale(type="log", domain=full_vs_half_domain),
            axis=alt.Axis(title="Half Concentration Library NT50"),
        ),
    )
    .properties(width=200, height=200)
    .configure_axis(grid=False, domain=False, labelFontSize=10, titleFontSize=12)
    .configure_legend(titleAlign='left', labelLimit=0, titleFontSize=10, labelFontSize=10)
)

In [146]:
# Full library vs library without 5a1 NT50, both on the same log scale.
full_vs_no5a1_domain = [1000, 100000]
(
    alt.Chart(titers_conditions)
    .mark_point(filled=True, size=75, opacity=0.4)
    .encode(
        x=alt.X(
            'fulllibconc',
            scale=alt.Scale(type="log", domain=full_vs_no5a1_domain),
            axis=alt.Axis(title="Full Library NT50"),
        ),
        y=alt.Y(
            'no5a1',
            scale=alt.Scale(type="log", domain=full_vs_no5a1_domain),
            axis=alt.Axis(title="No 5a1 NT50"),
        ),
    )
    .properties(width=200, height=200)
    .configure_axis(grid=False, domain=False, labelFontSize=10, titleFontSize=12)
    .configure_legend(titleAlign='left', labelLimit=0, titleFontSize=10, labelFontSize=10)
)

In [147]:
# Full library vs library without 5a2 NT50, both on the same log scale.
full_vs_no5a2_domain = [1000, 100000]
(
    alt.Chart(titers_conditions)
    .mark_point(filled=True, size=75, opacity=0.4)
    .encode(
        x=alt.X(
            'fulllibconc',
            scale=alt.Scale(type="log", domain=full_vs_no5a2_domain),
            axis=alt.Axis(title="Full Library NT50"),
        ),
        y=alt.Y(
            'no5a2',
            scale=alt.Scale(type="log", domain=full_vs_no5a2_domain),
            axis=alt.Axis(title="No 5a2 NT50"),
        ),
    )
    .properties(width=200, height=200)
    .configure_axis(grid=False, domain=False, labelFontSize=10, titleFontSize=12)
    .configure_legend(titleAlign='left', labelLimit=0, titleFontSize=10, labelFontSize=10)
)

In [148]:
# Library without 5a1 vs library without 5a2 NT50, both on the same log scale.
no5a1_vs_no5a2_domain = [1000, 100000]
(
    alt.Chart(titers_conditions)
    .mark_point(filled=True, size=75, opacity=0.4)
    .encode(
        x=alt.X(
            'no5a1',
            scale=alt.Scale(type="log", domain=no5a1_vs_no5a2_domain),
            axis=alt.Axis(title="No 5a1 NT50"),
        ),
        y=alt.Y(
            'no5a2',
            scale=alt.Scale(type="log", domain=no5a1_vs_no5a2_domain),
            axis=alt.Axis(title="No 5a2 NT50"),
        ),
    )
    .properties(width=200, height=200)
    .configure_axis(grid=False, domain=False, labelFontSize=10, titleFontSize=12)
    .configure_legend(titleAlign='left', labelLimit=0, titleFontSize=10, labelFontSize=10)
)

In [194]:
# Layered chart of NT50 per virus for each library-pooling condition. It shows the variation
# between the titers calculated for different library compositions, and also the variation
# between replicates of the same library (shaded band).
source = sera_withreps_simplified_NT50perrep      # per-replicate medians, drawn as the band
median_source = sera_withreps_simplified_NT50all  # median across replicates, drawn as line and points
range_ = ['steelblue','goldenrod','mediumseagreen','firebrick']

domain_ = ['fulllibconc','halflibconc','no5a1','no5a2']
plot_range = [200,100000]


def _nt50_scale():
    """Return a new log scale for the NT50 axis; each layer gets its own object."""
    return alt.Scale(domain=plot_range, type="log", nice=False)


def _group_color():
    """Return a new colour encoding that maps each serum to its fixed colour."""
    return alt.Color("serum", title="group", scale=alt.Scale(domain=domain_, range=range_))


def _virus_axis():
    """Return a new x encoding with the viruses sorted in the order given by `viruses`."""
    return alt.X("virus", sort=viruses, title="virus")


# Band spanning the standard deviation of the per-replicate NT50s.
NT50s_forselections_chart = alt.Chart(source).mark_errorband(opacity=0.2, extent="stdev").encode(
    x=alt.X("virus", sort=viruses, axis=alt.Axis(title=None, labelFontSize=11, labelLimit=300)),
    y=alt.Y('NT50', scale=_nt50_scale(), axis=alt.Axis(grid=False, titleFontSize=12, labelFontSize=12)),
    color=_group_color(),
).properties(
    height=150,
    width=alt.Step(15))

# Line through the median NT50 of each serum (the data are medians, despite the variable name).
meanline_chart = alt.Chart(median_source).mark_line(point=True, strokeWidth=1).encode(
    x=_virus_axis(),
    y=alt.Y('NT50', scale=_nt50_scale()),
    color=_group_color(),
).properties(
    height=150,
    width=alt.Step(15))

# Filled, fully opaque points on top of the median line.
meanline_chart_point = alt.Chart(median_source).mark_point(filled=True).encode(
    x=_virus_axis(),
    y=alt.Y("NT50", scale=_nt50_scale()),
    color=_group_color(),
    size=alt.value(45),
    opacity=alt.value(1),
).properties(
    height=150,
    width=alt.Step(15))

chart = meanline_chart_point + NT50s_forselections_chart + meanline_chart
# The titer title belongs on the y channel (NT50); the x channel encodes the virus names.
chart.layer[0].encoding.y.title = ' titer'
chart.layer[0].title = 'NT50s from experiments using different concentrations of library strains'
# `properties` returns a new chart, so the result has to be kept for the sizing to apply.
chart = chart.properties(
    width=alt.Step(15),
    height=150)
chart.save(titers_chart_variation_html)
chart