# Subset barcode runs
This Python Jupyter notebook creates a copy of the barcode runs file that contains just specific samples.

First, import Python modules:

In [1]:
import os

import pandas as pd

import yaml

Read in the "master" barcode runs CSV file and the runs used to compute the escape scores for each sample:

In [2]:
# Load the pipeline configuration; it supplies the locations of the input CSVs.
with open('../config.yaml') as f:
    config = yaml.safe_load(f)

# "Master" table of barcode runs. The configured path is joined onto '../',
# the same parent directory that holds `config.yaml`.
barcode_runs_file = os.path.join('../', config['barcode_runs'])
print(f"Reading barcode runs from {barcode_runs_file}")
barcode_runs = pd.read_csv(barcode_runs_file)

# Table of the samples used to compute escape scores, resolved the same way.
escape_score_samples_file = os.path.join('../', config['escape_score_samples'])
print(f"Reading escape score samples from {escape_score_samples_file}")
escape_score_samples = pd.read_csv(escape_score_samples_file)

Reading barcode runs from ../data/barcode_runs.csv
Reading escape score samples from ../results/escape_scores/samples.csv


Now read in the samples to subset:

In [3]:
# Read the table listing which samples to keep, and show it for the record.
samples_to_subset_df = pd.read_csv('samples_to_subset.csv')
print('Here are the samples we will subset to:')
display(samples_to_subset_df)

samples_to_subset = samples_to_subset_df['sample'].tolist()

# Each sample may be listed only once. On failure, name the repeated entries
# so the input file can be fixed without further digging.
duplicated_samples = (
    samples_to_subset_df['sample']
    [samples_to_subset_df['sample'].duplicated()]
    .unique()
    .tolist()
)
assert not duplicated_samples, f"duplicate samples to subset: {duplicated_samples}"

Here are the samples we will subset to:


Unnamed: 0,sample
0,S309_421
1,S304_46
2,S2X35_70
3,S2H97_58
4,S2E12_56
5,S2H58_46
6,S2X16_54
7,S2D106_68
8,S2X58_18
9,S2H13_56


Now get all the escape-score samples of interest:

In [4]:
# Fail early if any requested sample has no escape-score entry, and report
# exactly which ones are missing (in the order they were requested).
escape_score_sample_names = set(escape_score_samples['name'])
missing_samples = [sample for sample in samples_to_subset
                   if sample not in escape_score_sample_names]
if missing_samples:
    raise ValueError('Not all samples to subset are in the escape score samples. '
                     f"Missing samples: {missing_samples}")

# Keep only the escape-score rows for the requested samples. A sample name can
# appear on several rows (e.g. once per library), so this may return more rows
# than there are requested samples.
samples_subset = (
    escape_score_samples
    .query('name in @samples_to_subset')
    .reset_index(drop=True)
    )

print('Here are the samples for which we are subsetting barcode runs:')
display(samples_subset)

Here are the samples for which we are subsetting barcode runs:


Unnamed: 0,name,library,antibody,concentration,concentration_units,date,pre_sample,post_sample,frac_escape,pre_cells_sorted,post_cells_sorted
0,CB6_400,lib1,CB6,400,ng_per_mL,200904,expt_24-33-none-0-reference,expt_27-CB6-400-escape,0.222,1907893.0,160000000.0
1,CB6_400,lib2,CB6,400,ng_per_mL,200904,expt_24-33-none-0-reference,expt_27-CB6-400-escape,0.225,927804.0,160000000.0
2,COV2-2050_400,lib1,COV2-2050,400,ng_per_mL,200720,expt_13-16-none-0-reference,expt_13-COV2-2050-400-escape,0.078,1253185.0,160000000.0
3,COV2-2050_400,lib2,COV2-2050,400,ng_per_mL,200720,expt_13-16-none-0-reference,expt_13-COV2-2050-400-escape,0.104,1588138.0,160000000.0
4,COV2-2082_400,lib1,COV2-2082,400,ng_per_mL,200616,expt_7-11-none-0-reference,expt_11-COV2-2082-400-escape,0.057,522474.0,160000000.0
5,COV2-2082_400,lib2,COV2-2082,400,ng_per_mL,200616,expt_7-11-none-0-reference,expt_11-COV2-2082-400-escape,0.086,512138.0,160000000.0
6,COV2-2094_400,lib1,COV2-2094,400,ng_per_mL,200720,expt_13-16-none-0-reference,expt_14-COV2-2094-400-escape,0.069,853420.0,160000000.0
7,COV2-2094_400,lib2,COV2-2094,400,ng_per_mL,200720,expt_13-16-none-0-reference,expt_14-COV2-2094-400-escape,0.071,954885.0,160000000.0
8,COV2-2096_400,lib1,COV2-2096,400,ng_per_mL,200810,expt_22-23-none-0-reference,expt_23-COV2-2096-400-escape,0.155,1018697.0,160000000.0
9,COV2-2096_400,lib2,COV2-2096,400,ng_per_mL,200810,expt_22-23-none-0-reference,expt_23-COV2-2096-400-escape,0.11,927621.0,160000000.0


Now unfold the samples of interest into the actual relevant barcode runs (this requires getting both the pre- and post-selection run for each sample):

In [5]:
# Stack the pre- and post-selection sample columns into one long `sample`
# column, then reduce to the distinct (library, sample) pairs: a pre-selection
# sample can be shared by several rows, hence the de-duplication.
library_sample_pairs = (
    samples_subset
    .melt(id_vars=['name', 'library'],
          value_vars=['pre_sample', 'post_sample'],
          var_name='sample_type',
          value_name='sample')
    .loc[:, ['library', 'sample']]
    .drop_duplicates()
    .reset_index(drop=True)
    )

# Attach the run information for every pair. `validate` makes pandas raise if
# a (library, sample) pair is not unique on either side of the merge.
barcode_runs_subset = library_sample_pairs.merge(
    barcode_runs,
    on=['library', 'sample'],
    how='left',
    validate='one_to_one',
    )

# With a left merge, a pair absent from `barcode_runs` shows up as a null `R1`.
assert not barcode_runs_subset['R1'].isnull().any(), 'some barcode runs missing R1'

barcode_runs_subset_file = 'barcode_runs_subset.csv'
print(f"Here are the subsetted barcode runs. Writing to {barcode_runs_subset_file}")
barcode_runs_subset.to_csv(barcode_runs_subset_file, index=False)
display(barcode_runs_subset)

Here are the subsetted barcode runs. Writing to barcode_runs_subset.csv


Unnamed: 0,library,sample,date,experiment,antibody,concentration,concentration_units,group,selection,frac_escape,cells_sorted,R1
0,lib1,expt_24-33-none-0-reference,200904,expt_24-33,none,0,ng_per_mL,clinical_serum,reference,,160000000.0,/shared/ngs/illumina/agreaney/200914_D00300_10...
1,lib2,expt_24-33-none-0-reference,200904,expt_24-33,none,0,ng_per_mL,clinical_serum,reference,,160000000.0,/shared/ngs/illumina/agreaney/200914_D00300_10...
2,lib1,expt_13-16-none-0-reference,200720,expt_13-16,none,0,ng_per_mL,none,reference,,160000000.0,/shared/ngs/illumina/agreaney/200727_D00300_10...
3,lib2,expt_13-16-none-0-reference,200720,expt_13-16,none,0,ng_per_mL,none,reference,,160000000.0,/shared/ngs/illumina/agreaney/200727_D00300_10...
4,lib1,expt_7-11-none-0-reference,200616,expt_7-11,none,0,ng_per_mL,none,reference,,160000000.0,/shared/ngs/illumina/agreaney/200618_D00300_09...
...,...,...,...,...,...,...,...,...,...,...,...,...
73,lib2,expt_72-S2X58-18-escape,201106,expt_72,S2X58,18,ng_per_mL,Vir,escape,0.141,1391912.0,/shared/ngs/illumina/tstarr/201123_D00300_1119...
74,lib1,expt_73-S304-46-escape,201106,expt_73,S304,46,ng_per_mL,Vir,escape,0.123,1121146.0,/shared/ngs/illumina/tstarr/201123_D00300_1119...
75,lib2,expt_73-S304-46-escape,201106,expt_73,S304,46,ng_per_mL,Vir,escape,0.155,1300000.0,/shared/ngs/illumina/tstarr/201123_D00300_1119...
76,lib1,expt_68-S309-421-escape,201106,expt_68,S309,421,ng_per_mL,Vir,escape,0.100,1000774.0,/shared/ngs/illumina/tstarr/201123_D00300_1119...
