# Subset barcode runs
This Python Jupyter notebook creates a copy of the barcode runs file that contains just specific samples.

First, import Python modules:

In [1]:
import os

import pandas as pd

import yaml

Read in the main barcode runs CSV file and the runs used to compute the escape scores for each sample:

In [2]:
# Load the pipeline configuration. The file paths it stores are joined onto
# the parent directory ('../') below.
with open('../config.yaml') as config_handle:
    config = yaml.safe_load(config_handle)

# Main table of barcode runs.
barcode_runs_file = os.path.join('../', config['barcode_runs'])
print(f"Reading barcode runs from {barcode_runs_file}")
barcode_runs = pd.read_csv(barcode_runs_file)

# Table of the samples used to compute the escape scores.
escape_score_samples_file = os.path.join('../', config['escape_score_samples'])
print(f"Reading escape score samples from {escape_score_samples_file}")
escape_score_samples = pd.read_csv(escape_score_samples_file)

Reading barcode runs from ../data/barcode_runs.csv
Reading escape score samples from ../results/escape_scores/samples.csv


Now read in the samples to subset:

In [3]:
# Read the table naming the samples to keep, and show it for reference.
samples_to_subset_df = pd.read_csv('samples_to_subset.csv')
print('Here are the samples we will subset to:')
display(samples_to_subset_df)

# Pull the sample names out as a plain list; each name may be listed only once.
samples_to_subset = samples_to_subset_df['sample'].tolist()
n_distinct_samples = len(set(samples_to_subset))
assert n_distinct_samples == len(samples_to_subset), 'duplicate samples to subset'

Here are the samples we will subset to:


Unnamed: 0,sample
0,267C_200
1,268C_500
2,273C_500
3,274C_500
4,276C_500
5,277C_500
6,278C_1250
7,279C_1250
8,Delta_1_500
9,Delta_3_350


Now get all the escape-score samples of interest:

In [4]:
# Every requested sample must be present in the escape-score samples table.
# Name the missing ones in the error so a bad `samples_to_subset.csv` can be
# fixed without re-running the comparison by hand.
missing_samples = set(samples_to_subset) - set(escape_score_samples['name'])
if missing_samples:
    raise ValueError(
        "Not all samples to subset are in the escape score samples. "
        # key=str keeps the sort well-defined even if names have mixed types
        f"Missing: {sorted(missing_samples, key=str)}"
    )

# Keep only the rows for the requested samples. A sample name can match more
# than one row (for example one row per library), so the result may have more
# rows than there are requested samples.
samples_subset = (
    escape_score_samples
    .query('name in @samples_to_subset')
    .reset_index(drop=True)
    )

print("Here are the samples for which we are subsetting barcode runs:")
display(samples_subset)

Here are the samples for which we are subsetting barcode runs:


Unnamed: 0,name,library,antibody,concentration,date,pre_sample,post_sample,frac_escape,pre_cells_sorted,post_cells_sorted
0,267C_200,lib1,267C,200,211119,delta_21-26-none-0-ref,delta_21-267C-200-abneg,0.112,1206433.0,
1,267C_200,lib2,267C,200,211119,delta_21-26-none-0-ref,delta_21-267C-200-abneg,0.094,1078707.0,
2,268C_500,lib1,268C,500,211112,delta_17-20-none-0-ref,delta_19-268C-500-abneg,0.107,1012056.0,
3,268C_500,lib2,268C,500,211112,delta_17-20-none-0-ref,delta_19-268C-500-abneg,0.119,1023050.0,
4,273C_500,lib1,273C,500,211119,delta_21-26-none-0-ref,delta_22-273C-500-abneg,0.126,1269137.0,
5,273C_500,lib2,273C,500,211119,delta_21-26-none-0-ref,delta_22-273C-500-abneg,0.125,1249629.0,
6,274C_500,lib1,274C,500,211119,delta_21-26-none-0-ref,delta_23-274C-500-abneg,0.122,1233005.0,
7,274C_500,lib2,274C,500,211119,delta_21-26-none-0-ref,delta_23-274C-500-abneg,0.108,1094894.0,
8,276C_500,lib1,276C,500,211119,delta_21-26-none-0-ref,delta_24-276C-500-abneg,0.126,1263014.0,
9,276C_500,lib2,276C,500,211119,delta_21-26-none-0-ref,delta_24-276C-500-abneg,0.108,1091168.0,


Now unfold the samples of interest into the actual relevant barcode runs (this requires getting both the pre- and post-selection run for each sample):

In [5]:
# Step 1: stack the pre- and post-selection sample columns into a single
# 'sample' column, so every row is one (library, sample) barcode run.
runs_long = samples_subset.melt(
    id_vars=['name', 'library'],
    value_vars=['pre_sample', 'post_sample'],
    value_name='sample',
    var_name='sample_type',
)

# Step 2: several selections can share the same run (e.g. a common
# pre-selection sample), so collapse to the unique (library, sample) pairs.
library_sample_pairs = (
    runs_long[['library', 'sample']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Step 3: attach the full barcode-run record to each pair. The
# `validate='one_to_one'` option makes pandas raise if the merge keys are
# duplicated on either side.
barcode_runs_subset = library_sample_pairs.merge(
    barcode_runs,
    how='left',
    on=['library', 'sample'],
    validate='one_to_one',
)

# With a left merge, a pair that has no match in `barcode_runs` gets nulls in
# the merged-in columns; a null R1 is what this check catches.
assert not barcode_runs_subset['R1'].isnull().any(), 'some barcode runs missing R1'

barcode_runs_subset_file = 'barcode_runs_subset.csv'
print(f"Here are the subsetted barcode runs. Writing to {barcode_runs_subset_file}")
barcode_runs_subset.to_csv(barcode_runs_subset_file, index=False)
display(barcode_runs_subset)

Here are the subsetted barcode runs. Writing to barcode_runs_subset.csv


Unnamed: 0,library,sample,date,experiment,antibody,concentration,sort_bin,HutchBase,experiment_type,number_cells,frac_escape,R1
0,lib1,delta_21-26-none-0-ref,211119,delta_21-26,none,0.0,ref,delta_21-26_lib1_ref,ab_selection,,,/shared/ngs/illumina/agreaney/211130_VH00699_2...
1,lib2,delta_21-26-none-0-ref,211119,delta_21-26,none,0.0,ref,delta_21-26_lib2_ref,ab_selection,,,/shared/ngs/illumina/agreaney/211130_VH00699_2...
2,lib1,delta_17-20-none-0-ref,211112,delta_17-20,none,0.0,ref,delta_17-20_lib1_ref,ab_selection,,,/shared/ngs/illumina/agreaney/211117_D00300_13...
3,lib2,delta_17-20-none-0-ref,211112,delta_17-20,none,0.0,ref,delta_17-20_lib2_ref,ab_selection,,,/shared/ngs/illumina/agreaney/211117_D00300_13...
4,lib1,delta_27-34-none-0-ref,211122,delta_27-34,none,0.0,ref,delta_27-34_lib1_ref,ab_selection,,,/shared/ngs/illumina/agreaney/211130_VH00699_2...
5,lib2,delta_27-34-none-0-ref,211122,delta_27-34,none,0.0,ref,delta_27-34_lib2_ref,ab_selection,,,/shared/ngs/illumina/agreaney/211130_VH00699_2...
6,lib1,delta_35-40-none-0-ref,211124,delta_35-40,none,0.0,ref,delta_35-40_lib1_ref,ab_selection,,,/shared/ngs/illumina/agreaney/211130_VH00699_2...
7,lib2,delta_35-40-none-0-ref,211124,delta_35-40,none,0.0,ref,delta_35-40_lib2_ref,ab_selection,,,/shared/ngs/illumina/agreaney/211130_VH00699_2...
8,lib1,delta_21-267C-200-abneg,211119,delta_21,267C,200.0,abneg,delta_21_lib1_abneg,ab_selection,1206433.0,0.112,/shared/ngs/illumina/agreaney/211130_VH00699_2...
9,lib2,delta_21-267C-200-abneg,211119,delta_21,267C,200.0,abneg,delta_21_lib2_abneg,ab_selection,1078707.0,0.094,/shared/ngs/illumina/agreaney/211130_VH00699_2...
