# Make plots showing NT50 barcode replicate and experimental replicate correlations 

In [2]:
import os
import altair as alt
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error 
import numpy as np
import pandas as pd

# Basic color palette
color_palette = [
    '#345995', #blue
    '#03cea4', #teal
    '#ca1551', #red
    '#eac435', #yellow
    'grey'
               ]

In [3]:
# Find relative 'sera' directory in seqneut-pipeline results
seqneut_sera_dir = '../../../results/sera'

resultsdir = '../results'
os.makedirs(resultsdir, exist_ok = True)

In [4]:

# replicate sera to concat titer data from
sera = [
    'PennVaccineCohort_PENN23_y1974_s034_d0',
    'PennVaccineCohort_PENN23_y1974_s034_d28',
    'PennVaccineCohort_PENN23_y1981_s053_d0',
    'PennVaccineCohort_PENN23_y1981_s053_d28',
    'PooledSera_SCHPennPrePost_pool'
]

# initialize empty df
replicate_titers = pd.DataFrame()

# iterate through sera and concat titer data
for serum in sera:

    replicate_titers = pd.concat([replicate_titers, (pd.read_csv(os.path.join(seqneut_sera_dir, serum, 'titers_per_replicate.csv'))
                        .assign(plate = lambda x: x['replicate'].str.split('-').str[0],
                                barcode = lambda x: x['replicate'].str.split('-').str[1]
                               )
                       )])

replicate_titers

Unnamed: 0,group,serum,virus,replicate,titer,titer_bound,titer_as,nt50,midpoint,top,bottom,slope,plate,barcode
0,PennVaccineCohort,PENN23_y1974_s034_d0,A/Sydney/749/2023,plate27-AGCAGACACTTTACAT,987.9,interpolated,midpoint,1018.0,987.9,0.9818,0,1.236,plate27,AGCAGACACTTTACAT
1,PennVaccineCohort,PENN23_y1974_s034_d0,A/Sydney/749/2023,plate27-TCGTCCTAGAACCTAA,790.4,interpolated,midpoint,800.6,790.4,0.9899,0,1.602,plate27,TCGTCCTAGAACCTAA
2,PennVaccineCohort,PENN23_y1974_s034_d0,A/Sydney/749/2023,plate27-GTAGAAACTAGGAGTT,530.7,interpolated,midpoint,584.3,530.7,0.8999,0,2.322,plate27,GTAGAAACTAGGAGTT
3,PennVaccineCohort,PENN23_y1974_s034_d0,A/Sydney/749/2023,plate29-TCGTCCTAGAACCTAA,799.7,interpolated,midpoint,1057.0,799.7,0.8373,0,1.411,plate29,TCGTCCTAGAACCTAA
4,PennVaccineCohort,PENN23_y1974_s034_d0,A/Sydney/749/2023,plate29-GTAGAAACTAGGAGTT,654.6,interpolated,midpoint,1014.0,654.6,0.7614,0,1.482,plate29,GTAGAAACTAGGAGTT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462,PooledSera,SCHPennPrePost_pool,A/Sydney/710/2023,plate38-GCCGTAGCGAAATCTT,373.0,interpolated,midpoint,382.8,373.0,0.9693,0,2.433,plate38,GCCGTAGCGAAATCTT
463,PooledSera,SCHPennPrePost_pool,A/Sydney/710/2023,plate38-TGATCCGCAAGCTTAG,376.4,interpolated,midpoint,416.5,376.4,0.8968,0,2.283,plate38,TGATCCGCAAGCTTAG
464,PooledSera,SCHPennPrePost_pool,A/Sydney/710/2023,plate39-GCCGTAGCGAAATCTT,250.8,interpolated,midpoint,317.2,250.8,0.8329,0,1.734,plate39,GCCGTAGCGAAATCTT
465,PooledSera,SCHPennPrePost_pool,A/Sydney/710/2023,plate39-CTTGTAAAACTATGAT,296.7,interpolated,midpoint,318.3,296.7,0.9480,0,1.563,plate39,CTTGTAAAACTATGAT


In [5]:
replicate_titers_pivot = (replicate_titers
     .pivot(index = ['serum', 'virus', 'barcode'],
                           columns = 'plate',
                           values = 'titer'
                          )
     .reset_index()
     .assign(log_fold_change_2729 = lambda x: np.log(x['plate27']/x['plate29']),
             log_fold_change_2830 = lambda x: np.log(x['plate28']/x['plate30']))
    )

print(replicate_titers_pivot.serum.unique())

replicate_titers_pivot.query('serum =="PENN23_y1981_s053_d28"')

['PENN23_y1974_s034_d0' 'PENN23_y1974_s034_d28' 'PENN23_y1981_s053_d0'
 'PENN23_y1981_s053_d28' 'SCHPennPrePost_pool']


plate,serum,virus,barcode,plate27,plate28,plate29,plate30,plate38,plate39,log_fold_change_2729,log_fold_change_2830
702,PENN23_y1981_s053_d28,A/AbuDhabi/6753/2023,GCTGGTGCACAAGATT,,236.5,,417.6,,,,-0.568576
703,PENN23_y1981_s053_d28,A/AbuDhabi/6753/2023,TATCGCAATATGATAA,,219.1,,348.9,,,,-0.465257
704,PENN23_y1981_s053_d28,A/AbuDhabi/6753/2023,TCTTGAATTTCATGGA,,214.2,,358.4,,,,-0.514740
705,PENN23_y1981_s053_d28,A/Bangkok/P3599/2023,AGGTGCGAGCCATCAG,,287.5,,652.1,,,,-0.818975
706,PENN23_y1981_s053_d28,A/Bangkok/P3599/2023,GAAAGAAAGCTATATG,,269.6,,595.5,,,,-0.792462
...,...,...,...,...,...,...,...,...,...,...,...
931,PENN23_y1981_s053_d28,A/Wisconsin/27/2023,CAAGACAAGCCCTATA,,300.5,,589.3,,,,-0.673488
932,PENN23_y1981_s053_d28,A/Wisconsin/27/2023,CCTATAAGGCCTTACG,,324.2,,679.8,,,,-0.740438
933,PENN23_y1981_s053_d28,A/YAMAGATA/98/2023,CAACGTGATGAGGAAG,,151.4,,342.0,,,,-0.814885
934,PENN23_y1981_s053_d28,A/YAMAGATA/98/2023,CCCGCTAACCCTGTCT,,136.7,,327.9,,,,-0.874920


## Calculate R2 correlation and RMSD

In [6]:
# write values to dictionary
# initialize empty dictionary

corr_values = {}
corr_values_string = {}

# iterate through sera and calculate R2 across experimental replicate titers

for s in replicate_titers_pivot.serum.unique():
    if s == 'PENN23_y1974_s034_d0':
        col1 = 'plate27'
        col2 = 'plate29'
    elif s == 'PENN23_y1974_s034_d28':
        col1 = 'plate27'
        col2 = 'plate29'
    elif s == 'SCHPennPrePost_pool':
        col1 = 'plate38'
        col2 = 'plate39'
    else:
        col1 = 'plate28'
        col2 = 'plate30'

    # reduce dataframe to relevant sera, plates
    df = replicate_titers_pivot.query(f'serum == "{s}"')[[col1, col2]].dropna(axis=0)

    #initiate linear regression model
    model = LinearRegression()    
    #define predictor and response variables
    titer1, titer2 = df[[col1]], df[[col2]]   
    #fit regression model
    model.fit(titer1, titer2)  
    #calculate R-squared of regression model
    r_squared = model.score(titer1, titer2)
    
    # calcualte mse
    rmse = root_mean_squared_error(titer1, titer2)

    corr_values[s] = r_squared
    corr_values_string[s] = s + ', r2=' + str(r_squared)[0:5]

print('saving dictionary of sera matched with R2...')
print(corr_values)

saving dictionary of sera matched with R2...
{'PENN23_y1974_s034_d0': 0.8243265483863605, 'PENN23_y1974_s034_d28': 0.88246973928398, 'PENN23_y1981_s053_d0': 0.8772506836368261, 'PENN23_y1981_s053_d28': 0.8744876895874067, 'SCHPennPrePost_pool': 0.9249854975446554}


## Produce per-barcode correlation scatter plot

In [7]:
# # color params
# color_list = ['green', 'white', 'blue']
# color_scheme = 'redblue'

In [8]:
# # add serum, R2 column
# df = (replicate_titers_pivot
#         .replace({'serum': corr_values_string})
#        )

# barcode_scatter_1 = (
#     alt.Chart(df)
#     .mark_circle(size=60, filled=False)
#     .encode(
#         alt.X('plate27:Q', 
#               title = 'NT50, plate 27',
#               scale = alt.Scale(nice=False, padding=6, type="log"),
#               axis = alt.Axis(grid=False, titleFontSize=14, labelFontSize=12)
#              ),
#         alt.Y('plate29:Q',
#               title = 'NT50, plate 29',
#               scale = alt.Scale(nice=False, padding=6, type="log"),
#               axis = alt.Axis(grid=False, titleFontSize=14, labelFontSize=12)
#              ),
#         alt.Facet('serum:N',
#               header=alt.Header(
#                   title=None, labelFontSize=14, labelFontStyle="bold", labelPadding=0),
#                   columns = 4,
#                   bounds = 'full'
#                  ),
#         tooltip=['serum', 'virus', 'barcode', 'plate27', 'plate29', 'log_fold_change_2729'])
#     .interactive()
#     .properties(
#         title=alt.TitleParams(
#             "Neutralizing titer correlation across biological replicate experiments",
#             fontSize=14,
#             dx=10,
#             dy=-10,
#         )
#     )
# )



# barcode_scatter_2 = (
#     alt.Chart(df)
#     .mark_circle(size=60, filled=False)
#     .encode(
#         alt.X('plate28:Q', 
#               title = 'NT50, plate 28',
#               scale = alt.Scale(nice=False, padding=6, type="log"),
#               axis = alt.Axis(grid=False, titleFontSize=14, labelFontSize=12)
#              ),
#         alt.Y('plate30:Q',
#               title = 'NT50, plate 30',
#               scale = alt.Scale(nice=False, padding=6, type="log"),
#               axis = alt.Axis(grid=False, titleFontSize=14, labelFontSize=12)
#              ),
#         alt.Facet('serum:N',
#               header=alt.Header(
#                   title=None, labelFontSize=14, labelFontStyle="bold", labelPadding=0),
#                   columns = 4,
#                   bounds = 'full'
#                  ),
#         tooltip=['serum', 'virus', 'barcode', 'plate28', 'plate30', 'log_fold_change_2830'])
#     .interactive()
# )

# # dummy line plot
# line = pd.DataFrame({
#     'Goals Conceded': [30, 30000],
#     'Goals': [30, 30000],})

# line_plot = alt.Chart(line).mark_line(color= 'black', strokeDash = [8,8]).encode(
#     x= 'Goals Conceded',
#     y= 'Goals'
# )


# # barcode_scatter_1 = (barcode_scatter_1 + line_plot)
# # barcode_scatter_2 = (barcode_scatter_2 + line_plot)

# concat = alt.concat(barcode_scatter_1, barcode_scatter_2, columns = 1)

# # save
# # concat_png = os.path.join(resultsdir, 'replicate_NT50_correlation.png')
# # # print(f"Saving chart to {concat_png}")
# # # # concat.save(concat_png, ppi=200)

# concat 

In [9]:
# # add serum, R2 column
# df = (replicate_titers_pivot
#         .replace({'serum': corr_values_string})
#        )

# barcode_scatter_1 = (
#     alt.Chart(df)
#     .mark_circle(size=60, filled=False)
#     .encode(
#         alt.X('plate27:Q', 
#               title = 'per barcode NT50, plate 27',
#               scale = alt.Scale(nice=False, padding=6, type="log"),
#               axis = alt.Axis(grid=False, titleFontSize=14, labelFontSize=12)
#              ),
#         alt.Y('plate29:Q',
#               title = 'per barcode NT50, plate 29',
#               scale = alt.Scale(nice=False, padding=6, type="log"),
#               axis = alt.Axis(grid=False, titleFontSize=14, labelFontSize=12)
#              ),
#         alt.Color('serum:N',
#                  ),
#         tooltip=['serum', 'virus', 'barcode', 'plate27', 'plate29', 'log_fold_change_2729'])
#     .interactive()
#     # .properties(
#     #     title=alt.TitleParams(
#     #         "Neutralizing titer correlation across biological replicate experiments",
#     #         fontSize=16,
#     #         dx=50,
#     #         dy=-10,
#     #     )
#     # )
# )



# barcode_scatter_2 = (
#     alt.Chart(df)
#     .mark_circle(size=60, filled=False)
#     .encode(
#         alt.X('plate28:Q', 
#               title = 'per barcode NT50, plate 28',
#               scale = alt.Scale(nice=False, padding=6, type="log"),
#               axis = alt.Axis(grid=False, titleFontSize=14, labelFontSize=12)
#              ),
#         alt.Y('plate30:Q',
#               title = 'per barcode NT50, plate 30',
#               scale = alt.Scale(nice=False, padding=6, type="log"),
#               axis = alt.Axis(grid=False, titleFontSize=14, labelFontSize=12)
#              ),
#         alt.Color('serum:N',
#                  ),
#         tooltip=['serum', 'virus', 'barcode', 'plate28', 'plate30', 'log_fold_change_2830'])
#     .interactive()
# )

# # dummy line plot
# line = pd.DataFrame({
#     'Goals Conceded': [50, 30000],
#     'Goals': [50, 30000],})

# line_plot = alt.Chart(line).mark_line(color= 'black', strokeDash = [8,8]).encode(
#     x= 'Goals Conceded',
#     y= 'Goals'
# )

# barcode_scatter_1 = (barcode_scatter_1 + line_plot)
# barcode_scatter_2 = (barcode_scatter_2 + line_plot)

# concat = alt.concat(barcode_scatter_1, barcode_scatter_2, 
#                     columns = 2, 
#                     spacing = 10
#                    ).configure_legend(
#     titleFontSize=16,
#     labelFontSize=14
#     )

# # save
# concat_png = os.path.join(resultsdir, 'replicate_NT50_correlation.png')
# print(f"Saving chart to {concat_png}")
# concat.save(concat_png, ppi=200)

# concat 

## Within variant per-barcode replicate correlations


In [10]:
# Add barcode numbers
barcode_numbering_list = []

for pl in replicate_titers.plate.sort_values().unique():
    plate_df = replicate_titers.query(f'plate == "{pl}"')

    for serum in plate_df.serum.sort_values().unique():
        serum_df = (plate_df.query(f'serum == "{serum}"'))

        for vir in serum_df.virus.sort_values().unique():
            if (len(serum_df.query(f'virus == "{vir}"'))) == 3:
                barcode_numbering_list.extend(['barcode1','barcode2','barcode3'])
            
            elif (len(serum_df.query(f'virus == "{vir}"'))) == 2:
                barcode_numbering_list.extend(['barcode1','barcode2'])
                print('Just 2 barcode-replicates found for this virus...')
                print(vir)
                print(serum_df.query(f'virus == "{vir}"'))
                print()

            
            else:
                print('Unexpected barcode-replicate found, heres the data...')
                print(serum_df.query(f'virus == "{vir}"'))
                print()

Just 2 barcode-replicates found for this virus...
A/Switzerland/9715293/2013NIB-88
                 group                 serum  \
399  PennVaccineCohort  PENN23_y1974_s034_d0   
400  PennVaccineCohort  PENN23_y1974_s034_d0   

                                virus                 replicate   titer  \
399  A/Switzerland/9715293/2013NIB-88  plate29-CGGGAAATGTAAATGA  4557.0   
400  A/Switzerland/9715293/2013NIB-88  plate29-ATAGGATATATGGCTG  4570.0   

      titer_bound  titer_as    nt50  midpoint    top  bottom  slope    plate  \
399  interpolated  midpoint  4853.0    4557.0  0.928       0  2.464  plate29   
400  interpolated  midpoint  4570.0    4570.0  1.000       0  1.663  plate29   

              barcode  
399  CGGGAAATGTAAATGA  
400  ATAGGATATATGGCTG  

Just 2 barcode-replicates found for this virus...
A/Switzerland/9715293/2013NIB-88
                 group                  serum  \
399  PennVaccineCohort  PENN23_y1974_s034_d28   
400  PennVaccineCohort  PENN23_y1974_s034_d28   

 

In [11]:
replicate_titers_tidy = replicate_titers.sort_values(by = ['plate', 'serum', 'virus']).reset_index(drop=True)
replicate_titers_tidy['barcode_n'] = barcode_numbering_list
replicate_titers_tidy

replicate_titers_tidy_pivot = (replicate_titers_tidy
     .pivot(index = ['serum', 'virus','plate'],
                           columns = 'barcode_n',
                           values = 'titer'
                          )
     .reset_index()
     # .assign(log_fold_change_2729 = lambda x: np.log(x['plate27']/x['plate29']),
     #         log_fold_change_2830 = lambda x: np.log(x['plate28']/x['plate30']))
    )

replicate_titers_tidy_pivot

barcode_n,serum,virus,plate,barcode1,barcode2,barcode3
0,PENN23_y1974_s034_d0,A/AbuDhabi/6753/2023,plate27,683.2,600.4,604.9
1,PENN23_y1974_s034_d0,A/AbuDhabi/6753/2023,plate29,439.3,519.0,373.7
2,PENN23_y1974_s034_d0,A/Bangkok/P3599/2023,plate27,597.2,572.2,663.4
3,PENN23_y1974_s034_d0,A/Bangkok/P3599/2023,plate29,656.3,494.2,789.3
4,PENN23_y1974_s034_d0,A/Bangkok/P3755/2023,plate27,503.7,580.9,721.5
...,...,...,...,...,...,...
775,SCHPennPrePost_pool,A/Victoria/1033/2023,plate39,426.9,357.7,328.2
776,SCHPennPrePost_pool,A/Wisconsin/27/2023,plate38,642.1,345.8,729.3
777,SCHPennPrePost_pool,A/Wisconsin/27/2023,plate39,239.3,403.8,498.6
778,SCHPennPrePost_pool,A/YAMAGATA/98/2023,plate38,720.0,511.5,378.6


### Calculate R2 and RMSD

In [12]:
# write values to dictionary
# initialize empty dictionary

corr_values_bc_1v2 = {}
corr_values_bc_1v3 = {}
corr_values_string_bc_1v2 = {}
corr_values_string_bc_1v3 = {}

# iterate through plate+serum combinations and calculate R2 across experimental replicate titers

for p in replicate_titers_tidy_pivot.plate.unique():
    for s in replicate_titers_tidy_pivot.query(f'plate == "{p}"').serum.unique():

        df = replicate_titers_tidy_pivot.query(f'plate == "{p}"').query(f'serum == "{s}"').dropna().reset_index(drop = True)

        ## Compare barcode1 and barcode2
        #initiate linear regression model
        model = LinearRegression()   
        #define predictor and response variables
        bc1, bc2, bc3 = df[['barcode1']], df[['barcode2']], df[['barcode3']]     
        #fit regression model
        model.fit(bc1, bc2)  
        #calculate R-squared of regression model
        r_squared = model.score(bc1, bc2)
        # calcualte mse
        rmse = root_mean_squared_error(bc1, bc2)
        # Save as dictionary 
        corr_values_bc_1v2[s] = r_squared
        corr_values_string_bc_1v2[s] = s + ', r2=' + str(r_squared)[0:5]

        ## Compare barcode1 and BARCODE3
        #initiate linear regression model
        model = LinearRegression()   
        #define predictor and response variables
        bc1, bc2, bc3 = df[['barcode1']], df[['barcode2']], df[['barcode3']]     
        #fit regression model
        model.fit(bc1, bc3)  
        #calculate R-squared of regression model
        r_squared = model.score(bc1, bc3)
        # calcualte mse
        rmse = root_mean_squared_error(bc1, bc3)
        # Save as dictionary 
        corr_values_bc_1v3[s] = r_squared
        corr_values_string_bc_1v3[s] = s + ', r2=' + str(r_squared)[0:5]

    
print('saving dictionary of barcode1 vs barcode2 R2...')
print(corr_values_bc_1v2)

print('saving dictionary of barcode1 vs barcode3 R2...')
print(corr_values_bc_1v3)


saving dictionary of barcode1 vs barcode2 R2...
{'PENN23_y1974_s034_d0': 0.8796382148953947, 'PENN23_y1974_s034_d28': 0.8722878386345697, 'PENN23_y1981_s053_d0': 0.9182050175831972, 'PENN23_y1981_s053_d28': 0.9171806091043391, 'SCHPennPrePost_pool': 0.8133570932588776}
saving dictionary of barcode1 vs barcode3 R2...
{'PENN23_y1974_s034_d0': 0.854087242625549, 'PENN23_y1974_s034_d28': 0.8429245309215396, 'PENN23_y1981_s053_d0': 0.8885379591407941, 'PENN23_y1981_s053_d28': 0.9469438648552035, 'SCHPennPrePost_pool': 0.9455412025029746}


In [13]:
# Produce plot with plate 30 only for main text figure
charts = []

# mark config
fill = True
opacity = 0.6
stroke = 'grey'
strokeWidth = 2
color = alt.Color('serum').scale(scheme='magma')

for plate in replicate_titers_tidy_pivot.plate.unique():
    # add serum, R2 column
    df = (replicate_titers_tidy_pivot.query(f'plate == "{plate}"')
           )
    
    barcode_scatter_1 = (
        alt.Chart(df.replace({'serum': corr_values_string_bc_1v2}),
                 title = f'{plate}')
        .mark_circle(size=60, filled=fill, opacity = opacity, stroke = stroke, strokeWidth = strokeWidth)
        .encode(
            alt.X('barcode1:Q', 
                  title = 'barcode-replicate1',
                  scale = alt.Scale(nice=False, padding=6, type="log"),
                  axis = alt.Axis(grid=False, titleFontSize=14, labelFontSize=12)
                 ),
            alt.Y('barcode2:Q',
                  title = 'barcode-replicate2',
                  scale = alt.Scale(nice=False, padding=6, type="log"),
                  axis = alt.Axis(grid=False, titleFontSize=14, labelFontSize=12)
                 ),
            color = color,
        )
    )
    
    barcode_scatter_2 = (
        alt.Chart(df.replace({'serum': corr_values_string_bc_1v3}),
                 title = f'{plate}')
        .mark_circle(size=60, filled=fill, opacity = opacity, stroke = stroke, strokeWidth = strokeWidth)
        .encode(
            alt.X('barcode1:Q', 
                  title = 'barcode-replicate1',
                  scale = alt.Scale(nice=False, padding=6, type="log"),
                  axis = alt.Axis(grid=False, titleFontSize=14, labelFontSize=12)
                 ),
            alt.Y('barcode3:Q',
                  title = 'barcode-replicate3',
                  scale = alt.Scale(nice=False, padding=6, type="log"),
                  axis = alt.Axis(grid=False, titleFontSize=14, labelFontSize=12)
                 ),
            color = color,
        )
    )
    
    # dummy line plot
    line = pd.DataFrame({
        'Goals Conceded': [50, 30000],
        'Goals': [50, 30000],})
    
    line_plot = alt.Chart(line).mark_line(color= 'black', strokeDash = [8,8]).encode(
        x= 'Goals Conceded',
        y= 'Goals'
    )
    
    barcode_scatter_1 = (barcode_scatter_1 + line_plot)
    barcode_scatter_2 = (barcode_scatter_2 + line_plot)
    
    concat = (alt.concat(barcode_scatter_1, barcode_scatter_2, 
                        columns = 2, 
                        spacing = 10
                       )
              # .configure_legend(
              #     titleFontSize=16,
              #     labelFontSize=14)
             )

    charts.append(concat)
    
    # save
    # concat_png = os.path.join(resultsdir, 'within_variant_barcode_correlation.png')
    # print(f"Saving chart to {concat_png}")
    # concat.save(concat_png, ppi=200)
    
(alt.concat(*charts, title = '', columns = 1)
 # .resolve_scale(y='shared')
 .configure_title(fontSize=18)
 .configure_legend(titleFontSize=20, 
                   labelFontSize = 18,
                   strokeColor='gray',
                   # fillColor='#EEEEEE',
                   padding=10,
                   cornerRadius=10,
                   labelLimit = 500)
)

In [14]:
df = replicate_titers_tidy_pivot

df['participant'] = df['serum'].str.split('_').str[:-1].apply(lambda l: "_".join(l))
df

barcode_n,serum,virus,plate,barcode1,barcode2,barcode3,participant
0,PENN23_y1974_s034_d0,A/AbuDhabi/6753/2023,plate27,683.2,600.4,604.9,PENN23_y1974_s034
1,PENN23_y1974_s034_d0,A/AbuDhabi/6753/2023,plate29,439.3,519.0,373.7,PENN23_y1974_s034
2,PENN23_y1974_s034_d0,A/Bangkok/P3599/2023,plate27,597.2,572.2,663.4,PENN23_y1974_s034
3,PENN23_y1974_s034_d0,A/Bangkok/P3599/2023,plate29,656.3,494.2,789.3,PENN23_y1974_s034
4,PENN23_y1974_s034_d0,A/Bangkok/P3755/2023,plate27,503.7,580.9,721.5,PENN23_y1974_s034
...,...,...,...,...,...,...,...
775,SCHPennPrePost_pool,A/Victoria/1033/2023,plate39,426.9,357.7,328.2,SCHPennPrePost
776,SCHPennPrePost_pool,A/Wisconsin/27/2023,plate38,642.1,345.8,729.3,SCHPennPrePost
777,SCHPennPrePost_pool,A/Wisconsin/27/2023,plate39,239.3,403.8,498.6,SCHPennPrePost
778,SCHPennPrePost_pool,A/YAMAGATA/98/2023,plate38,720.0,511.5,378.6,SCHPennPrePost


In [22]:
# Produce plot with plate 30 only for main text figure
plate = 'plate30'

# configure color scheme
fill = True
opacity = 0.7
stroke = 'black'
strokeWidth = 1.2
markSize = 120
color = alt.Color('serum').scale(range=color_palette)

titleFontSize = 16
labelFontSize = 16

width = 200
height = width
_range = [60, 30000]

# identify data
df = replicate_titers_tidy_pivot.replace({'serum': corr_values_string_bc_1v2})
df['participant'] = df['serum'].str.split('_').str[:-1].apply(lambda l: "_".join(l))

participant0 = df['participant'].unique()[0]
participant1 = df['participant'].unique()[1]
participant2 = df['participant'].unique()[2]

barcode_scatter_1 = (
    alt.Chart(df.query(f'participant == "{participant0}"'), width = width, height = height)
    .mark_circle(size=markSize, stroke=stroke, strokeWidth = strokeWidth, filled=fill)
    .encode(
        alt.X('barcode1:Q', 
              title = ['neutralization titer', 'barcode 1'],
              scale = alt.Scale(nice=False, padding=6, type="log", domain=_range),
              axis = alt.Axis(grid=False, titleFontSize=titleFontSize, labelFontSize=labelFontSize)
             ),
        alt.Y('barcode2:Q',
              title = ['neutralization titer', 'barcode 2'],
              scale = alt.Scale(nice=False, padding=6, type="log", domain=_range),
              axis = alt.Axis(grid=False, titleFontSize=titleFontSize, labelFontSize=labelFontSize)
             ),
        color = color,
))

barcode_scatter_2 = (
    alt.Chart(df.query(f'participant == "{participant1}"'), width = width, height = height)
    .mark_circle(size=markSize, stroke=stroke, strokeWidth = strokeWidth, filled=fill)
    .encode(
        alt.X('barcode1:Q', 
              title = ['neutralization titer', 'barcode 1'],
              scale = alt.Scale(nice=False, padding=6, type="log", domain=_range),
              axis = alt.Axis(grid=False, titleFontSize=titleFontSize, labelFontSize=labelFontSize)
             ),
        alt.Y('barcode2:Q',
              title = ['neutralization titer', 'barcode 2'],
              scale = alt.Scale(nice=False, padding=6, type="log", domain=_range),
              axis = alt.Axis(grid=False, titleFontSize=titleFontSize, labelFontSize=labelFontSize)
             ),
        color = color,
))

barcode_scatter_3 = (
    alt.Chart(df.query(f'participant == "{participant2}"'), width = width, height = width)
    .mark_circle(size=markSize, stroke=stroke, strokeWidth = strokeWidth, filled=fill)
    .encode(
        alt.X('barcode1:Q', 
              title = ['neutralization titer', 'barcode 1'],
              scale = alt.Scale(nice=False, padding=6, type="log", domain=_range),
              axis = alt.Axis(grid=False, titleFontSize=titleFontSize, labelFontSize=labelFontSize)
             ),
        alt.Y('barcode2:Q',
              title = ['neutralization titer', 'barcode 2'],
              scale = alt.Scale(nice=False, padding=6, type="log", domain=_range),
              axis = alt.Axis(grid=False, titleFontSize=titleFontSize, labelFontSize=labelFontSize)
             ),
        color = color,
))

# dummy line plot
line = pd.DataFrame({
    'x': _range,
    'y': _range,})
line_plot = alt.Chart(line).mark_line(color= 'black', strokeDash = [8,8]).encode(
    x= 'x',
    y= 'y')

barcode_scatter_1 = (line_plot + barcode_scatter_1)
barcode_scatter_2 = (line_plot + barcode_scatter_2)
barcode_scatter_3 = (line_plot + barcode_scatter_3)


concat = (alt.concat(barcode_scatter_1, barcode_scatter_2,
                     barcode_scatter_3, 
                    columns = 3, 
                    spacing = 10
                   )
          .configure_legend(titleFontSize=labelFontSize, 
                            labelFontSize = labelFontSize,
                            symbolStrokeWidth = strokeWidth,
                            strokeColor='gray',
                            padding=10,
                            cornerRadius=10,
                            labelLimit = 500)
    )

# barcode_scatter_1 = (
#     alt.Chart(df.replace({'serum': corr_values_string_bc_1v2}), width=width, height=height)
#     .mark_circle(size=markSize, filled=fill, opacity = opacity, stroke = stroke, strokeWidth = strokeWidth)
#     .encode(
#         alt.X('barcode1:Q', 
#               title = 'NT50, Barcode 1',
#               scale = alt.Scale(nice=False, padding=6, type="log", domain=_range),
#               axis = alt.Axis(grid=False, titleFontSize=titleFontSize, labelFontSize=labelFontSize)
#              ),
#         alt.Y('barcode2:Q',
#               title = 'NT50, Barcode 2',
#               scale = alt.Scale(nice=False, padding=6, type="log", domain=_range),
#               axis = alt.Axis(grid=False, titleFontSize=titleFontSize, labelFontSize=labelFontSize)
#              ),
#         color = color,
#     )
# )

# # dummy line plot
# line = pd.DataFrame({
#     'dummy_x': _range,
#     'dummy_y': _range,})

# line_plot = alt.Chart(line).mark_line(color= 'black', strokeDash = [8,8]).encode(
#     x= 'dummy_x',
#     y= 'dummy_y'
# )

# layer = (alt.layer(barcode_scatter_1, 
#                    # line_plot, 
#                    )
#          .facet('participant')
#           .configure_title(fontSize=titleFontSize)
#          .configure_header(title=None, labelFontSize=0)
#           .configure_legend(titleFontSize=titleFontSize, 
#                labelFontSize = labelFontSize,
#                strokeColor='gray',
#                # fillColor='#EEEEEE',
#                padding=10,
#                cornerRadius=10,
#                labelLimit = 500)
#          )

# Save
outfile = os.path.join(resultsdir, 'per_barcode_replicate_correlations.pdf')
concat.save(outfile, dpi = 600)
concat

## Correlations on viral strain level

In [16]:
replicate_NT50_pivot = (replicate_titers
     .pivot(index = ['serum', 'virus', 'barcode'],
                           columns = 'plate',
                           values = 'nt50'
                          )
     .reset_index()
     .assign(log_fold_change_2729 = lambda x: np.log(x['plate27']/x['plate29']),
             log_fold_change_2830 = lambda x: np.log(x['plate28']/x['plate30']))
    )


# intialize empty list for median values
median_titer_ls = []

# get medium titers per virus per sera
for v in replicate_NT50_pivot.virus.unique():
    for s in replicate_NT50_pivot.serum.unique():

        df = replicate_NT50_pivot.query(f'virus == "{v}"').query(f'serum == "{s}"')
        
        if s == 'PENN23_y1974_s034_d0':
            plate27_median = (df.plate27.median())
            plate29_median = (df.plate29.median())
            plate28_median = np.nan
            plate30_median = np.nan
            plate38_median = np.nan
            plate39_median = np.nan

        elif s == 'PENN23_y1974_s034_d28':
            plate27_median = (df.plate27.median())
            plate29_median = (df.plate29.median())
            plate28_median = np.nan
            plate30_median = np.nan
            plate38_median = np.nan
            plate39_median = np.nan
       
        elif s == 'SCHPennPrePost_pool':
            plate38_median = (df.plate38.median())
            plate39_median = (df.plate39.median())
            plate28_median = np.nan
            plate30_median = np.nan
            plate27_median = np.nan
            plate29_median = np.nan
            
        else:
            plate28_median = (df.plate28.median())
            plate30_median = (df.plate30.median())
            plate27_median = np.nan
            plate29_median = np.nan
            plate38_median = np.nan
            plate39_median = np.nan

        median_titer_ls.append([s, v, 
                                plate27_median, plate28_median, 
                                plate29_median, plate30_median,
                                plate38_median, plate39_median])

# make df
median_titer_df = pd.DataFrame(median_titer_ls, columns = ['serum', 'virus', 
                                                           'plate27_median', 'plate28_median', 
                                                           'plate29_median', 'plate30_median',
                                                           'plate38_median', 'plate39_median'])
# merge df with pivot
replicate_NT50_median_merge = replicate_NT50_pivot.merge(median_titer_df, how = 'left', on = ['serum', 'virus'])

replicate_NT50_median_merge

Unnamed: 0,serum,virus,barcode,plate27,plate28,plate29,plate30,plate38,plate39,log_fold_change_2729,log_fold_change_2830,plate27_median,plate28_median,plate29_median,plate30_median,plate38_median,plate39_median
0,PENN23_y1974_s034_d0,A/AbuDhabi/6753/2023,GCTGGTGCACAAGATT,620.5,,393.4,,,,0.455699,,623.7,,515.2,,,
1,PENN23_y1974_s034_d0,A/AbuDhabi/6753/2023,TATCGCAATATGATAA,727.3,,515.2,,,,0.344784,,623.7,,515.2,,,
2,PENN23_y1974_s034_d0,A/AbuDhabi/6753/2023,TCTTGAATTTCATGGA,623.7,,622.1,,,,0.002569,,623.7,,515.2,,,
3,PENN23_y1974_s034_d0,A/Bangkok/P3599/2023,AGGTGCGAGCCATCAG,650.0,,817.4,,,,-0.229156,,650.0,,817.4,,,
4,PENN23_y1974_s034_d0,A/Bangkok/P3599/2023,GAAAGAAAGCTATATG,643.8,,615.5,,,,0.044953,,650.0,,817.4,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1165,SCHPennPrePost_pool,A/Wisconsin/27/2023,CAAGACAAGCCCTATA,,,,,351.3,437.9,,,,,,,642.1,437.9
1166,SCHPennPrePost_pool,A/Wisconsin/27/2023,CCTATAAGGCCTTACG,,,,,784.1,541.9,,,,,,,642.1,437.9
1167,SCHPennPrePost_pool,A/YAMAGATA/98/2023,CAACGTGATGAGGAAG,,,,,727.5,369.6,,,,,,,527.1,416.8
1168,SCHPennPrePost_pool,A/YAMAGATA/98/2023,CCCGCTAACCCTGTCT,,,,,382.9,538.1,,,,,,,527.1,416.8


In [17]:
# for median only
temp_df = replicate_NT50_median_merge[['serum', 'virus', 
                                       'plate27_median', 'plate28_median', 
                                       'plate29_median', 'plate30_median',
                                       'plate38_median', 'plate39_median']].drop_duplicates()

# write values to dictionary
# initialize empty dictionary
virus_corr_values = {}
virus_corr_values_string = {}

# iterate through sera and calculate R2 across experimental replicate titers

for s in temp_df.serum.unique():
    if s == 'PENN23_y1974_s034_d0':
        col1 = 'plate27_median'
        col2 = 'plate29_median'
    elif s == 'PENN23_y1974_s034_d28':
        col1 = 'plate27_median'
        col2 = 'plate29_median'
    elif s == 'SCHPennPrePost_pool':
        col1 = 'plate38_median'
        col2 = 'plate39_median' 
    else:
        col1 = 'plate28_median'
        col2 = 'plate30_median'

    # reduce dataframe to relevant sera, plates
    df = temp_df.query(f'serum == "{s}"')[[col1, col2]].dropna().reset_index(drop = True)

    #initiate linear regression model
    model = LinearRegression()    
    #define predictor and response variables
    titer1, titer2 = df[[col1]], df[[col2]]   
    #fit regression model
    model.fit(titer1, titer2)  
    #calculate R-squared of regression model
    r_squared = model.score(titer1, titer2)
    
    # calcualte mse
    rmse = root_mean_squared_error(titer1, titer2)

    virus_corr_values[s] = r_squared
    virus_corr_values_string[s] = s + ', r2=' + str(r_squared)[0:5]

print('saving dictionary of sera matched with R2...')
print(virus_corr_values)

saving dictionary of sera matched with R2...
{'PENN23_y1974_s034_d0': 0.8894039425107969, 'PENN23_y1974_s034_d28': 0.9607798462555565, 'PENN23_y1981_s053_d0': 0.951414874555066, 'PENN23_y1981_s053_d28': 0.9790168157691079, 'SCHPennPrePost_pool': 0.9760074566988571}


In [20]:
# configure color scheme
fill = True
opacity = 0.7
stroke = 'black'
strokeWidth = 1.2
markSize = 120
color = alt.Color('serum').scale(range=color_palette)

titleFontSize = 16
labelFontSize = 16

width = 200
height = width
_range = [60, 30000]


# add serum, R2 column
df = (temp_df
        .replace({'serum': virus_corr_values_string})
       )

barcode_scatter_1 = (
    alt.Chart(df, width = width, height = height)
    .mark_circle(size=markSize, stroke=stroke, strokeWidth = strokeWidth, filled=fill)
    .encode(
        alt.X('plate27_median:Q', 
              title = ['neutralization titer', 'experiment 1'],
              scale = alt.Scale(nice=False, padding=6, type="log", domain=_range),
              axis = alt.Axis(grid=False, titleFontSize=titleFontSize, labelFontSize=labelFontSize)
             ),
        alt.Y('plate29_median:Q',
              title = ['neutralization titer', 'experiment 2'],
              scale = alt.Scale(nice=False, padding=6, type="log", domain=_range),
              axis = alt.Axis(grid=False, titleFontSize=titleFontSize, labelFontSize=labelFontSize)
             ),
        color = color,
        tooltip=['serum', 'virus', 'plate27_median', 'plate29_median'])
)

barcode_scatter_2 = (
    alt.Chart(df, width = width, height = height)
    .mark_circle(size=markSize, stroke=stroke, strokeWidth = strokeWidth, filled=fill)
    .encode(
        alt.X('plate28_median:Q', 
              title = ['neutralization titer', 'experiment 1'],
              scale = alt.Scale(nice=False, padding=6, type="log", domain=_range),
              axis = alt.Axis(grid=False, titleFontSize=titleFontSize, labelFontSize=labelFontSize)
             ),
        alt.Y('plate30_median:Q',
              title = ['neutralization titer', 'experiment 2'],
              scale = alt.Scale(nice=False, padding=6, type="log", domain=_range),
              axis = alt.Axis(grid=False, titleFontSize=titleFontSize, labelFontSize=labelFontSize)
             ),
        color = color,
        tooltip=['serum', 'virus', 'plate27_median', 'plate29_median'])
)

barcode_scatter_3 = (
    alt.Chart(df, width = width, height = width)
    .mark_circle(size=markSize, stroke=stroke, strokeWidth = strokeWidth, filled=fill)
    .encode(
        alt.X('plate38_median:Q', 
              title = ['neutralization titer', 'experiment 3'],
              scale = alt.Scale(nice=False, padding=6, type="log", domain=_range),
              axis = alt.Axis(grid=False, titleFontSize=titleFontSize, labelFontSize=labelFontSize)
             ),
        alt.Y('plate39_median:Q',
              title = ['neutralization titer', 'experiment 4'],
              scale = alt.Scale(nice=False, padding=6, type="log", domain=_range),
              axis = alt.Axis(grid=False, titleFontSize=titleFontSize, labelFontSize=labelFontSize)
             ),
        color = color,
        tooltip=['serum', 'virus', 'plate28_median', 'plate30_median'])
)

# dummy line plot
line = pd.DataFrame({
    'x': _range,
    'y': _range,})
line_plot = alt.Chart(line).mark_line(color= 'black', strokeDash = [8,8]).encode(
    x= 'x',
    y= 'y')

barcode_scatter_1 = (line_plot + barcode_scatter_1)
barcode_scatter_2 = (line_plot + barcode_scatter_2)
barcode_scatter_3 = (line_plot + barcode_scatter_3)


concat = (alt.concat(barcode_scatter_1, barcode_scatter_2,
                     barcode_scatter_3, 
                    columns = 3, 
                    spacing = 10
                   )
          .configure_legend(titleFontSize=labelFontSize, 
                            labelFontSize = labelFontSize,
                            symbolStrokeWidth = strokeWidth,
                            strokeColor='gray',
                            padding=10,
                            cornerRadius=10,
                            labelLimit = 500)
    )

# Save
outfile = os.path.join(resultsdir, 'per_strain_replicate_correlations.pdf')
concat.save(outfile, dpi = 600)
concat

The non-vaccine strains cluster a lot more tightly in the `SCHPennPrePost_pool` serum relative to the other individually-run sera. With vaccine strains included, correlations are quite strong.