# Calculate fraction infectivity for each of the barcodes included in the library at each concentration selected
Here we compute the  fraction infectivity for each well for the selections that we performed in a 96-well plate

First, import Python modules:

In [4]:
import os
import pandas as pd
import yaml

Change working directory to top directory of repo:

In [2]:
os.chdir('../../')
os.getcwd()

'/fh/fast/bloom_j/computational_notebooks/aloes/2023/NGS_Neut_Assay'

## Read input data for generating dataframe with sample metadata and barcode counts
Read configuration:

In [3]:
with open('config.yml') as f:
    config = yaml.safe_load(f)

Read the barcode run information:
First, we read in the barcodes that correspond to the neutralization standard, then we make separate dataframes for the counts for the controls and for the target strains that we are interested in assessing neutralization potency.

In [4]:
#Here, I am reading in a file that contains all of the barcode counts and a file that contains the linkage between the barcodes and the variants, such that we can link this information later on
variant_counts_samples = pd.read_csv(config["barcode_counts_file"])
variants_in_pool = pd.read_csv(config["strain_to_barcode"]).drop(columns=['library'])

#create a dataframe that merges the barcode counts with the variant names and has the amino acid information
variants_named_counts = pd.merge(variant_counts_samples, variants_in_pool, on="barcode", how="outer", validate="many_to_one")

#Pull the list of neutralization standard barcodes from the vaariant_in_pool file
neut_standard_barcodes =pd.read_csv(config["neut_standards"])
neut_standard_barcodes_list = neut_standard_barcodes['barcode'].tolist()

In [5]:
#Converting replicate to a string, not essential
variants_named_counts['replicate'] = "rep" + variants_named_counts['replicate'].astype(str)

In [6]:
#create a dataframe that is the count for the neut standards, summed over all of the barcodes for the neut standard
variants_named_counts_neut_standard = variants_named_counts.loc[variants_named_counts['barcode'].isin(neut_standard_barcodes_list)].groupby(by=['sample','replicate'], as_index=False).sum(numeric_only=True).rename(columns={"count":"neutstandard_count"}).drop(columns=['concentration','replicate','retain'])

In [7]:
#Add the column containing the neut standard summed value for each of the files to the dataframes as the neutstandard_count_pre column, and rename the other columns so that they are now correct for transformations
#This is done for each sample and barcode independently, I am using separate dataframes for the controls and the selections as I will want to group the controls by plate and then merge that information back into the selections dataframe later on
variants_named_counts_withneut = pd.merge(variants_named_counts, variants_named_counts_neut_standard, on="sample", how="inner", validate="many_to_one")

# We are going to drop the lines that are for the neutstandard barcodes from each file, as this will just get confusing
variants_named_counts_withneut = variants_named_counts_withneut.loc[~variants_named_counts_withneut['barcode'].str.contains('|'.join(neut_standard_barcodes_list))]

#create dataframes for selections and controls, use names that can be used every time, and rename columns
variants_named_counts_selections_withneut = variants_named_counts_withneut.loc[~variants_named_counts_withneut['sample'].str.contains('Noselection|CellsOnly|Noserum')].rename(columns={"sample": "antibody_sample", "count": "postselection_count", "neutstandard_count":"neutstandard_count_post"}).drop(columns=['library'])
variants_named_counts_controls_withneut = variants_named_counts_withneut.loc[variants_named_counts_withneut['sample'].str.contains('Noselection|Noserum')].rename(columns={"sample": "no-antibody_sample", "count": "preselection_count", "neutstandard_count":"neutstandard_count_pre"}).drop(columns=['library'])

### As we have multiple control wells per plate, we need to merge these replicate control wells into a single ratio of each barcode to the neut standard barcodes for the whole plate
#### To do this, I am creating a dataframe where I first calculate the ratio of counts of each barcode to the sum of all neutralization standard barcodes for each of the control wells and then take the median value of this for all wells on the plates, I can use the plate name control for normalization

In [8]:
#Divide the preselection_count by the neutralization standard count for each well for each barcode
variants_named_counts_controls_withneut['preselection_count_normalized'] = variants_named_counts_controls_withneut['preselection_count'] / variants_named_counts_controls_withneut['neutstandard_count_pre']

In [14]:
# Remove any samples where neut standard counts is too low
variants_named_counts_controls_withneut = variants_named_counts_controls_withneut.loc[variants_named_counts_controls_withneut['neutstandard_count_pre']>1000]

In [15]:
#I am currently taking the median of the normalized count for all the control wells for each barcode count/the summed counts for the neut-standard barcodes for each plate
variants_named_counts_controls_withneut_median = variants_named_counts_controls_withneut.groupby(by=['barcode','plate'], as_index=False).median(numeric_only=True)
variants_named_counts_controls_withneut_median

Unnamed: 0,barcode,plate,preselection_count,concentration,retain,neutstandard_count_pre,preselection_count_normalized
0,AAAAAACGCATGTAGA,Plate1,9959.5,,1.0,54892.5,0.224402
1,AAAAAACGCATGTAGA,Plate10,7537.5,,1.0,53418.5,0.136950
2,AAAAAACGCATGTAGA,Plate11,8395.0,,1.0,48472.0,0.200168
3,AAAAAACGCATGTAGA,Plate12,6498.5,,1.0,38216.5,0.174120
4,AAAAAACGCATGTAGA,Plate2,6473.0,,1.0,50031.5,0.119030
...,...,...,...,...,...,...,...
1351,TTGTCCCGAGACAACA,Plate5,7824.5,,1.0,70551.5,0.118054
1352,TTGTCCCGAGACAACA,Plate6,7811.0,,1.0,158061.0,0.051128
1353,TTGTCCCGAGACAACA,Plate7,6932.0,,1.0,22089.5,0.330257
1354,TTGTCCCGAGACAACA,Plate8,6662.5,,1.0,37927.5,0.161156


### Now we want to merge the dataframes so that we can link the pre-selection counts for each barcode and neut-standard with the post selection counts for each barcode and neut standards

#### Calculate the normalized counts for each of the variants post selection

In [16]:
#We are just dividing the count for the barcode by the summed counts for the neutstandard
variants_named_counts_selections_withneut['postselection_count_normalized'] = variants_named_counts_selections_withneut['postselection_count'] / variants_named_counts_selections_withneut['neutstandard_count_post'] 

### Now, map the preselection counts with the post selection counts:

In [17]:
#Merge selections and controls on barcode and dated_controls
variant_counts_samples_mapped = pd.merge(variants_named_counts_selections_withneut, variants_named_counts_controls_withneut_median, on=["barcode","plate"],how="left", validate="many_to_one")
variant_counts_withpreselection = variant_counts_samples_mapped.loc[variant_counts_samples_mapped['preselection_count'] > 10]

variant_counts_withpreselection

Unnamed: 0,barcode,postselection_count,antibody_sample,antibody,concentration_x,replicate,date,plate,standard_set,retain_x,strain,neutstandard_count_post,postselection_count_normalized,preselection_count,concentration_y,retain_y,neutstandard_count_pre,preselection_count_normalized
0,CGTTTAAACAATGAAG,39064,pdmH1N1_lib2022-D002d0-393660.0-1-230801-Plate1,D002d0,393660.0,rep1,2023-08-01,Plate1,pdmH1N1_std,True,A/India-Pune-Nivcov2221170/2022,26829,1.456036,81067.0,,1.0,54892.5,1.551538
1,AGTGTCCCTAAGAGGC,20134,pdmH1N1_lib2022-D002d0-393660.0-1-230801-Plate1,D002d0,393660.0,rep1,2023-08-01,Plate1,pdmH1N1_std,True,A/Bangladesh/8002/2021,26829,0.750457,25668.5,,1.0,54892.5,0.560747
2,CTGCACGAGAGACTTC,18873,pdmH1N1_lib2022-D002d0-393660.0-1-230801-Plate1,D002d0,393660.0,rep1,2023-08-01,Plate1,pdmH1N1_std,True,A/Perth/1/2022,26829,0.703455,31058.0,,1.0,54892.5,0.554462
3,GTCCGTTGATAAAGAG,18188,pdmH1N1_lib2022-D002d0-393660.0-1-230801-Plate1,D002d0,393660.0,rep1,2023-08-01,Plate1,pdmH1N1_std,True,A/Cote_DIvoire/1448/2021,26829,0.677923,21210.5,,1.0,54892.5,0.414267
4,ATACCTCAACCTTGAA,17964,pdmH1N1_lib2022-D002d0-393660.0-1-230801-Plate1,D002d0,393660.0,rep1,2023-08-01,Plate1,pdmH1N1_std,True,A/Bangladesh/8036/2021,26829,0.669574,28446.0,,1.0,54892.5,0.611516
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108472,AACGAATGAATTTCTT,5130,pdmH1N1_lib2022-Y184d30-131220.0-2-230926-Plate12,Y184d30,131220.0,rep2,2023-09-26,Plate12,pdmH1N1_std,True,A/Togo/0274/2021,37520,0.136727,4532.5,,1.0,38216.5,0.109649
108473,ACGGAATCCCCTGAGA,937,pdmH1N1_lib2022-Y184d30-131220.0-2-230926-Plate12,Y184d30,131220.0,rep2,2023-09-26,Plate12,pdmH1N1_std,True,A/Washington/23/2020,37520,0.024973,2119.5,,1.0,38216.5,0.050860
108474,GCAATCCCGCAATTTG,2726,pdmH1N1_lib2022-Y184d30-131220.0-2-230926-Plate12,Y184d30,131220.0,rep2,2023-09-26,Plate12,pdmH1N1_std,True,A/Ghana/2080/2020,37520,0.072655,3569.5,,1.0,38216.5,0.088467
108475,GCCGGAGGGCATTTTC,3476,pdmH1N1_lib2022-Y184d30-131220.0-2-230926-Plate12,Y184d30,131220.0,rep2,2023-09-26,Plate12,pdmH1N1_std,True,A/Belgium/H0038/2022,37520,0.092644,2714.5,,1.0,38216.5,0.069555


### Calculate the normalized count for each barcoded variant by dividing the count preselection by the count post selection

In [18]:
variant_counts_withpreselection['normalized_count'] = variant_counts_withpreselection['postselection_count_normalized'] / variant_counts_withpreselection['preselection_count_normalized']

#Remove unnecessary columns to get the simplified dataframe
variant_counts_normalized = variant_counts_withpreselection.drop(columns = ['postselection_count','neutstandard_count_post','postselection_count_normalized','neutstandard_count_pre','preselection_count','preselection_count_normalized','retain_x','retain_y','standard_set'])
variant_counts_normalized

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  variant_counts_withpreselection['normalized_count'] = variant_counts_withpreselection['postselection_count_normalized'] / variant_counts_withpreselection['preselection_count_normalized']


Unnamed: 0,barcode,antibody_sample,antibody,concentration_x,replicate,date,plate,strain,concentration_y,normalized_count
0,CGTTTAAACAATGAAG,pdmH1N1_lib2022-D002d0-393660.0-1-230801-Plate1,D002d0,393660.0,rep1,2023-08-01,Plate1,A/India-Pune-Nivcov2221170/2022,,0.938447
1,AGTGTCCCTAAGAGGC,pdmH1N1_lib2022-D002d0-393660.0-1-230801-Plate1,D002d0,393660.0,rep1,2023-08-01,Plate1,A/Bangladesh/8002/2021,,1.338317
2,CTGCACGAGAGACTTC,pdmH1N1_lib2022-D002d0-393660.0-1-230801-Plate1,D002d0,393660.0,rep1,2023-08-01,Plate1,A/Perth/1/2022,,1.268718
3,GTCCGTTGATAAAGAG,pdmH1N1_lib2022-D002d0-393660.0-1-230801-Plate1,D002d0,393660.0,rep1,2023-08-01,Plate1,A/Cote_DIvoire/1448/2021,,1.636438
4,ATACCTCAACCTTGAA,pdmH1N1_lib2022-D002d0-393660.0-1-230801-Plate1,D002d0,393660.0,rep1,2023-08-01,Plate1,A/Bangladesh/8036/2021,,1.094941
...,...,...,...,...,...,...,...,...,...,...
108472,AACGAATGAATTTCTT,pdmH1N1_lib2022-Y184d30-131220.0-2-230926-Plate12,Y184d30,131220.0,rep2,2023-09-26,Plate12,A/Togo/0274/2021,,1.246951
108473,ACGGAATCCCCTGAGA,pdmH1N1_lib2022-Y184d30-131220.0-2-230926-Plate12,Y184d30,131220.0,rep2,2023-09-26,Plate12,A/Washington/23/2020,,0.491026
108474,GCAATCCCGCAATTTG,pdmH1N1_lib2022-Y184d30-131220.0-2-230926-Plate12,Y184d30,131220.0,rep2,2023-09-26,Plate12,A/Ghana/2080/2020,,0.821261
108475,GCCGGAGGGCATTTTC,pdmH1N1_lib2022-Y184d30-131220.0-2-230926-Plate12,Y184d30,131220.0,rep2,2023-09-26,Plate12,A/Belgium/H0038/2022,,1.331956


In [19]:
#Add columns which correspond to antibody concentration from dilution factor
variant_counts_normalized = variant_counts_normalized.rename(columns={"concentration_x": "antibody_dilution"})
variant_counts_normalized['antibody_dilution'] = pd.to_numeric(variant_counts_normalized['antibody_dilution'], errors='ignore')
variant_counts_normalized['antibody_concentration'] = 1/variant_counts_normalized['antibody_dilution']

#rename columns for loading into neutcurve to fit Hill curve and calculate NT50
variant_counts_test = variant_counts_normalized[['antibody','barcode','antibody_concentration','normalized_count','strain','antibody_sample','replicate']].copy()
variant_counts_test['virus'] = variant_counts_test['barcode']+"_"+variant_counts_test['replicate'].astype(str)
variant_counts_test = variant_counts_test.rename(columns={"antibody": "serum","antibody_concentration":"concentration","normalized_count":"fraction infectivity","antibody_sample":"sample"})

In [21]:
#Save fraction infectivity to file
os.mkdir(config["selection_dir"])
variant_counts_test.to_csv(config["fraction_infectivity_file"])