## Plot variance of NT50s

I can do this a few ways, but am thinking I'll start with variance per individual, plotted per cohort (same axes)

I'll estimate variance across these groupings by a few methods, but I'll just use `pandas` built-in (`ddof` value of 1)

In general, I'll want to exclude vaccine strains from this analysis since I mostly care about variation in neutralizing titers against currently circulating strains. I'll *only* include the `A/Massachusetts/18/2022` cell-passaged vaccine strain, since this virus was technically circulating in 2023. 

In [2]:
# Import packages
import os
import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Ignore error message from Altair about large dataframes
_ = alt.data_transformers.disable_max_rows()

# Color scheme
palette = [
    '#345995', #blue
    '#03cea4', #teal
    '#ca1551', #red
    '#eac435', #yellow
               ]

In [3]:
# define inputs
datadir = '../data'
resultsdir = '../results'
os.makedirs(datadir, exist_ok = True)
os.makedirs(resultsdir, exist_ok = True)

# Define SCH titers
SCH_titers = (pd.read_csv('../../../results/aggregated_titers/titers_SCH.csv')
             .assign(
                 barcode = lambda x: x['serum'].str.split('_').str[2],)
             )


# Define Penn titers
titers_PennVaccineCohort = (pd.read_csv('../../../results/aggregated_titers/titers_PennVaccineCohort.csv')
                            .assign(
                                barcode = lambda x: x['serum'].str.split('_').str[2],
                                timepoint = lambda x: 'd' + x['serum'].str.split('d').str[1])
                           )

# Define Australian MA22 vaccine cohort titers
Australia_MA22_titers = (pd.read_csv('../../../results/aggregated_titers/titers_AusVaccineCohort.csv')
                         .assign(
                             barcode = lambda x: x['serum'].str.split('_').str[1],
                             timepoint = lambda x: x['serum'].str.split('_').str[2])
                        )


In [4]:
# Define virus order
viral_plot_order = pd.read_csv('../../../data/H3N2library_2023-2024_strain_order.csv')
viruses = [v for v in viral_plot_order.strain]

# Define vaccine strains
vaccine_strains = []
with open('../data/vaccine_strains.csv') as f:
    for line in f:
        line = line.strip('\n')
        if 'strain' not in line:
            vaccine_strains.append(line)

# Define separate list where Massachusetts/18/2022 is reclassified as a 2023-circulating strains
vaccine_strains_no_Massachusetts = [item for item in vaccine_strains if item != 'A/Massachusetts/18/2022']

In [5]:
# concatenate dfs
# facet by clade, color by variant subclade -- need to pull this metadata 

all_titers_df = (pd.concat([SCH_titers, 
                           titers_PennVaccineCohort.rename(columns = {'day': 'timepoint'}), 
                           Australia_MA22_titers])
                .rename(columns = {'virus': 'strain'}))


all_titers_df['group_detail'] = np.where(all_titers_df['group'] == 'SCH', 
                                         all_titers_df['group'],  # If the value matches the string
                                         all_titers_df['group'].astype(str) + '_' + all_titers_df['timepoint'])  # Otherwise, combination of col2 and col3



# Make all serum identifiers strings
all_titers_df.loc[:, 'serum'] = all_titers_df['serum'].astype(str)

all_titers_df

Unnamed: 0,group,serum,strain,titer,titer_bound,titer_sem,n_replicates,titer_as,barcode,timepoint,group_detail
0,SCH,SCH23_y2009_s001,A/AbuDhabi/6753/2023,81.66,interpolated,6.532,3,midpoint,s001,,SCH
1,SCH,SCH23_y2009_s001,A/Bangkok/P3599/2023,105.50,interpolated,28.440,3,midpoint,s001,,SCH
2,SCH,SCH23_y2009_s001,A/Bangkok/P3755/2023,78.17,interpolated,3.420,3,midpoint,s001,,SCH
3,SCH,SCH23_y2009_s001,A/Bhutan/0006/2023,73.65,interpolated,8.163,3,midpoint,s001,,SCH
4,SCH,SCH23_y2009_s001,A/Bhutan/0845/2023,40.00,upper,5.460,3,midpoint,s001,,SCH
...,...,...,...,...,...,...,...,...,...,...,...
1242,AusVaccineCohort,AUS24_s010_prevax,A/Thailand/8/2022,655.40,interpolated,86.520,3,midpoint,s010,prevax,AusVaccineCohort_prevax
1243,AusVaccineCohort,AUS24_s010_prevax,A/Townsville/68/2023,86.20,interpolated,10.880,3,midpoint,s010,prevax,AusVaccineCohort_prevax
1244,AusVaccineCohort,AUS24_s010_prevax,A/Victoria/1033/2023,94.25,interpolated,8.144,3,midpoint,s010,prevax,AusVaccineCohort_prevax
1245,AusVaccineCohort,AUS24_s010_prevax,A/Wisconsin/27/2023,146.40,interpolated,19.080,3,midpoint,s010,prevax,AusVaccineCohort_prevax


## Calculate variance across individuals

In [6]:
# In general, we'll want to exclude vaccine strains from this analysis
data = all_titers_df[~all_titers_df['strain'].isin(vaccine_strains_no_Massachusetts)]

In [7]:
# Initialize empty list for serum variance calculations
serum_variance_list = []

for serum in data.serum.unique(): 

    # Get reduced dataframe of just serum in question
    serum_df = (data
              .query(f'serum == "{serum}"'))

    # Calculate serum arithmetic mean
    serum_mean = serum_df['titer'].mean()
    # Calculate overall serum variance
    serum_variance = serum_df['titer'].var(ddof=1)

    # Append to list
    serum_variance_list.append([serum, serum_mean, serum_variance])

serum_variance_df = pd.DataFrame(serum_variance_list, columns = ['serum', 'serum_mean', 'serum_var'])

In [8]:
# Also calculate standard deviation and coefficient of variance

# Standard deviation is the square root of variance
serum_variance_df['serum_stdev'] = np.sqrt(serum_variance_df['serum_var'])

# Coeffecient of variance is the stdev divided by the mean
serum_variance_df['serum_coeff-of-var'] = serum_variance_df['serum_stdev'] / serum_variance_df['serum_mean']


serum_variance_df

Unnamed: 0,serum,serum_mean,serum_var,serum_stdev,serum_coeff-of-var
0,SCH23_y2009_s001,96.277419,1.689514e+03,41.103694,0.426930
1,SCH23_y2009_s002,5319.645161,1.653839e+06,1286.016573,0.241749
2,SCH23_y2009_s004,54.360000,3.254909e+02,18.041367,0.331887
3,SCH23_y2009_s005,105.848387,1.350579e+03,36.750230,0.347197
4,SCH23_y2009_s006,102.257419,1.624719e+03,40.307798,0.394180
...,...,...,...,...,...
145,AUS24_s008_prevax,54.934032,2.908532e+02,17.054420,0.310453
146,AUS24_s009_postvax,2904.270968,6.651110e+05,815.543355,0.280808
147,AUS24_s009_prevax,559.224194,9.531644e+04,308.733611,0.552075
148,AUS24_s010_postvax,1298.155000,6.462358e+05,803.887915,0.619254


In [9]:
# Make reduced input dataframe
data = (all_titers_df
        .merge(serum_variance_df, on = 'serum')
        [['group', 'group_detail', 'serum', 'serum_mean',
          'serum_stdev', 'serum_var', 'serum_coeff-of-var']]
        .drop_duplicates()
        # .query('group != "SCH"')
       )

# Initialize list to store difference group variance calcualtions
plots = []

# Configure plots
width = 250
point_size = 60
point_opacity = 0.6
box_size = 40
box_opacity = 0.8

# Define sort order for x-axis
sort_order = ['SCH', 
              'PennVaccineCohort_d0', 'PennVaccineCohort_d28',
              'AusVaccineCohort_prevax', 'AusVaccineCohort_postvax']

# Define list of variances to plot for
var_list = ['serum_mean', 'serum_stdev', 'serum_coeff-of-var']

# Iterate through above list and produce plots
for var in var_list:

    if var == 'serum_coeff-of-var':
        _range = [0,1]
        scale_param = alt.Scale(nice=True, domain=_range)
    else:
        _range = [4,4000]
        scale_param = alt.Scale(type='log', nice=True, domain=_range)
    
    # Make plot
    color_scheme = alt.Color("group_detail:N").legend(None).scale(scheme='set2')

    points = (alt.Chart(data, width = width)
            .mark_point(size = point_size, opacity = point_opacity, filled=True, stroke='black')
            .encode(
                alt.X('group_detail', 
                          axis = alt.Axis(grid=False, 
                                          titleFontSize=18, 
                                          labelFontSize=16,
                                          labelLimit = 1000, 
                                          labelAlign = 'right'),
                      title = '',
                       sort = sort_order
                         ),
                alt.Y(f'{var}', 
                      axis=alt.Axis(grid=False, titleFontSize=18, labelFontSize=16, title=f"{var}"),
                      scale=scale_param,
                     ),
                detail = ['serum'],
                color = color_scheme,
                xOffset="jitter:Q",
                tooltip=['serum']
            )
              .transform_calculate(
        # Generate Gaussian jitter with a Box-Muller transform
        jitter="sqrt(-4*log(random()))*cos(2*PI*random())"
    ).interactive()
           )
    
    boxes = (alt.Chart(data, width = width)
             .mark_boxplot(
                 extent="min-max", 
                 opacity = box_opacity, size = box_size, color = 'white').encode(
        alt.X("group_detail:N",
              sort = sort_order),
        alt.Y(f'{var}', 
              axis=alt.Axis(title=f'{var}'),
             scale = scale_param,
             )
                 .scale(zero=False),
                 stroke = alt.value('black'),
                  strokeWidth=alt.value(2)
             )
            )
        
    layered = (alt.layer(points, boxes, title=f'per serum {var.split('_')[1]}')
               # .properties(title = f'{var}')
               # .configure_title(fontSize=22, offset=1, dx = -10, dy = -10,)
              )

    # Add final layered plot to plots list
    plots.append(layered)

# Concat plots in plots list
(alt.concat(*plots, title = '', columns = 4)
 .configure_title(fontSize=18, 
                  anchor = 'middle',
                  dy = 0,)
 .configure_header(labelFontSize=16)
 .configure_legend(titleFontSize=18, 
                   labelFontSize = 18,
                   strokeColor='gray',
                   padding=10,
                   cornerRadius=10,)
)

## Identify and plot the individuals in the 10th percentile for coefficient of variation
These are the individuals actually putting differential selective pressures on circualting variants, and should be highlighted by our analysis. 

In [10]:
# Make reduced input dataframe
merged_data = (all_titers_df
        .merge(serum_variance_df, on = 'serum')
        [['group', 'group_detail', 'serum',
          'serum_stdev', 'serum_var', 'serum_coeff-of-var']]
        .drop_duplicates()
       )

frames = []

for group in merged_data.group_detail.unique():
    df = merged_data.query(f'group_detail == "{group}"')

    df = df.copy()
    df['serum_cov_percentile'] = df['serum_coeff-of-var'].rank(pct=True)

    top_ten_df = (df.query('serum_cov_percentile > 0.9'))
    frames.append(top_ten_df)

high_variance_df = pd.concat(frames).reset_index(drop=True)

In [11]:
# Get list of sera with high variance
high_variance_sera = high_variance_df.serum.unique()
high_variance_sera

array(['SCH23_y2009_s007', 'SCH23_y2012_s014', 'SCH23_y2013_s020',
       'SCH23_y2018_s043', 'SCH23_y2018_s047', 'SCH23_y2022_s062',
       'PENN23_y1968_s033_d0', 'PENN23_y1987_s010_d0',
       'PENN23_y1988_s031_d0', 'PENN23_y1996_s043_d0',
       'PENN23_y1985_s015_d28', 'PENN23_y1987_s010_d28',
       'PENN23_y1988_s031_d28', 'PENN23_y1999_s046_d28',
       'AUS24_s010_postvax', 'AUS24_s010_prevax'], dtype=object)

In [12]:
# Configure the plot
color_scheme = alt.Color('group_detail', 
                         title = 'Cohort', 
                         sort=sort_order,
                         legend=None
                        ).scale(range=palette)
titer_range = [30, 16000]
titleFontSize=19
labelFontSize=19
lineOpacity = 0.3
lineSize = 3
markerOpacity = 0.8
markerSize = 150
width = 1100
height = 180

# Get data
data = all_titers_df
# Only plot select sera
data = data[data['serum'].isin(high_variance_sera)]
# Only plot a few cohorts
cohorts_to_plot = ['SCH', 'PennVaccineCohort_d0']
data = data[data['group_detail'].isin(cohorts_to_plot)]
# Only plot 2023-circulating viruses
data = (data[~data['strain'].isin(vaccine_strains_no_Massachusetts)]
        # .replace({'group_detail': group_detail_dict}) # Get nicer labels
       )

# Make the plot
line = (alt.Chart(data, width = width,height=height)
        .mark_line(size = lineSize, point = False, opacity = lineOpacity)
        .encode(
            alt.X('strain', 
                      axis = alt.Axis(grid=False, titleFontSize=titleFontSize, labelFontSize=labelFontSize,
                                      title = None,labelLimit = 1000, labelAlign = 'right',
                                     ),             
                      sort = viruses
                     ),
            alt.Y('titer', 
                      scale =alt.Scale(type='log',domain=titer_range, nice=False), 
                      axis=alt.Axis(grid=False, titleFontSize=titleFontSize, labelFontSize=labelFontSize, title="NT50")
                 ),
            detail = 'serum',
            color = color_scheme,
        )
       )

layered = (alt.layer(line)
           .facet(
               row = alt.Row('group_detail:N', sort = sort_order, 
                             title = None,
                            ),
               config = alt.Config(
                        legend = alt.LegendConfig(titleFontSize=titleFontSize, labelFontSize = labelFontSize,
                            strokeColor='gray',padding=10,cornerRadius=10,
                            labelLimit = 1000 # Let legend labels be as long as they want
                        )
                       )
           ).configure_header(labelFontSize=labelFontSize,
                              labelFontWeight='bold',
                             labelOrient = 'top')
)

# Show the plot
layered


In [13]:
# In general, we'll want to exclude vaccine strains from this analysis
data = all_titers_df[~all_titers_df['strain'].isin(vaccine_strains_without_Mass)]

# Initialize empty list for strain and variance calculations
strain_variance_list = []

# Group names list
group_names = ['strain']

for strain in data.strain.unique():

    temp_list = []
    temp_list.append(strain)

    # Overall, all-group variance
    # Only represent the UPenn individuals once
    # Choosing the preVax for now
    all_df = (data
              .query(f'strain == "{strain}"')
              .query('timepoint != "28"'))

    all_variance = all_df['titer'].var(ddof=1)
    temp_list.append(all_variance)

    # Per-group variance
    for group_detail in data.group_detail.sort_values(ascending = False).unique():
        group_df = (data
                    .query(f'strain == "{strain}"')
                    .query(f'group_detail == "{group_detail}"'))
        group_variance = group_df['titer'].var(ddof=1)

        temp_list.append(group_variance)

        if group_detail not in group_names:
            group_names.append(group_detail)
        
    strain_variance_list.append(temp_list)   


print(group_names)


strain_variance_df = pd.DataFrame(strain_variance_list, columns = group_names)
strain_variance_df

NameError: name 'vaccine_strains_without_Mass' is not defined

In [None]:
# Also calculate standard deviation
strain_stdev_df = strain_variance_df

# Standard deviation is the square root of variance
strain_stdev_df['SCH_Penn_stdev'] = np.sqrt(strain_variance_df['SCH_Penn_var'])
strain_stdev_df['SCH_stdev'] = np.sqrt(strain_variance_df['SCH_var'])
strain_stdev_df['Penn_preVax_stdev'] = np.sqrt(strain_variance_df['Penn_preVax_var'])
strain_stdev_df['Penn_postVax_stdev'] = np.sqrt(strain_variance_df['Penn_postVax_var'])

In [None]:
# Make standard deviation plots
# Make reduced input dataframe
data = (all_titers_merge_df
        .merge(strain_stdev_df)
        [['strain', 'subclade',
          'SCH_Penn_stdev', 'SCH_stdev', 'Penn_preVax_stdev', 'Penn_postVax_stdev']]
        .drop_duplicates()
       )

# Initialize list to store difference group variance calcualtions
plots = []

# Define width of plots
width = 200

# Define list of variances to plot for
var_list = ['SCH_Penn_stdev', 'SCH_stdev', 'Penn_preVax_stdev', 'Penn_postVax_stdev']

# Iterate through above list and produce plots
for var in var_list:
    
    # Make plot
    color_scheme = alt.Color("subclade:N").legend(None).scale(scheme='magma')

    points = (alt.Chart(data, width = width)
            .mark_point(size = 40, opacity = 0.6, filled=True, stroke='black')
            .encode(
                alt.X('subclade', 
                          axis = alt.Axis(grid=False, 
                                          titleFontSize=18, 
                                          labelFontSize=16,
                                          # title = None,
                                          # titleY = 330,
                                          labelLimit = 1000, 
                                          labelAlign = 'right'), 
                         ),
                alt.Y(f'{var}', 
                      axis=alt.Axis(grid=False, titleFontSize=18, labelFontSize=16, title="stdev"),
                      # scale =alt.Scale(type='log', nice=False),
                     ),
                detail = ['strain'],
                color = color_scheme,
                xOffset="jitter:Q",
                # tooltip=['patient_barcode', 'virus']
            )
              .transform_calculate(
        # Generate Gaussian jitter with a Box-Muller transform
        jitter="sqrt(-4*log(random()))*cos(2*PI*random())"
    )
           )
    
    boxes = (alt.Chart(data, width = width)
             .mark_boxplot(
                 extent="min-max", 
                 opacity = 0.8, size = 20, color = 'white').encode(
        alt.X("subclade:N"),
        alt.Y(f'{var}', 
              axis=alt.Axis(title=''),
             # scale =alt.Scale(type='log', nice=False),
             )
                 .scale(zero=False),
                 stroke = alt.value('black'),
                  strokeWidth=alt.value(2)
             )
            )
        
    layered = (alt.layer(points, boxes)
               .properties(title = f'{var}')
               # .configure_title(fontSize=22, offset=1, dx = -10, dy = -10,)
              )

    # Add final layered plot to plots list
    plots.append(layered)

# Concat plots in plots list
(alt.concat(*plots, title = '', columns = 4)
 .resolve_scale(y='shared')
 .configure_title(fontSize=18, 
                  # offset=20, 
                  # dx = 50,
                  anchor = 'middle',
                  dy = 0,)
 .configure_header(labelFontSize=16)
 .configure_legend(titleFontSize=18, 
                   labelFontSize = 18,
                   strokeColor='gray',
                   padding=10,
                   cornerRadius=10,)
)

It does seem like virus standard deviations across kids + prevax adults sometimes group by subclade, but this is heavily influenced by the the sort of imperfect genetic groupings and uneven sampling of subclades across the library. 

While children tend to have larger standard deviations from the mean, this is somewhat influenced by the actual serum titers. Some of the children have the highest titers across the cohorts. Hence, normalization with the mean (ie the coefficient of variance). When we plot these values, we see no obvious difference between cohorts. 