# Build `barcode_runs.csv` file
This notebook takes the file labeled `{date}_barcode_runs.csv` for each day that an Illumina sequencing run was done, and maps it to the files that contain the Illumina R1 reads from that day.
Sometimes samples are sequenced on multiple days. So next, we concatenate the files for each sample, separated by a semicolon.

The final data frame should have only one row for each `sample` x `library`. 

In [1]:
import glob
import itertools
import numpy
import pandas as pd
from IPython.display import display, HTML

List of Illumina sequencing runs and the subdirectory that holds each run's FASTQ files.

The file names are like this:

`expt_0_lib1_trans_S1_R1_001.fastq.gz`

which is a concatenation, separated by "_", of the columns:
* `experiment`
* `library`
* `sort_bin`

`sample` is a concatenation, separated by "-" of the columns:
* `experiment`
* `antibody`
* `concentration`
* `sort_bin`

Sometimes samples get mixed up during the whole sorting and/or library prep process.
This is easiest to detect (and fix) when lib1 and lib2 are simply swapped.

I am basically going to hardcode this in when this happens. 

In [2]:
# Map each run label (the prefix of that run's `{label}_barcode_runs.csv`
# sample sheet) to the directory holding the run's demultiplexed FASTQ files.
# Every directory shares the same prefix and suffix, so only the run-specific
# folder name is listed per entry.
# NOTE(review): label '211118' points at a folder dated 211117 -- confirm this
# one-day offset is intentional.
runs = {
    run_label: f'/shared/ngs/illumina/agreaney/{run_folder}/Unaligned/Project_agreaney'
    for run_label, run_folder in (
        ('211020', '211020_D00300_1350_BHN3NYBCX3'),
        ('211018', '211018_D00300_1346_AHMTMLBCX3'),
        ('211118', '211117_D00300_1372_BHNH7WBCX3'),
    )
}

In [3]:
# Build one table covering every sequencing run, then collapse it to a single
# row per library/sample whose `R1` column lists all of that sample's R1 glob
# patterns joined by '; '.
run_dfs = []

for run, run_dir in runs.items():
    print(run)
    df = (
        pd.read_csv(f'{run}_barcode_runs.csv')
        .assign(
            # glob pattern (not a resolved path) for this sample's R1 file(s)
            R1=lambda x: run_dir+'/'+x['HutchBase']+'_*R1*.fastq.gz',
            )
        )

    # within a single run, library + sample must identify exactly one row
    assert len(df.groupby(['library', 'sample'])) == len(df), (
        f"'library' + 'sample' do not uniquely identify rows in {run}_barcode_runs.csv"
        )
    run_dfs.append(df)

# Concatenate once. Growing a frame inside the loop re-copies the accumulated
# rows on every pass, and seeding it with an empty `pd.DataFrame()` relies on
# concat's handling of empty entries, which recent pandas releases deprecate.
barcode_runs = pd.concat(run_dfs, ignore_index=True)

# One row per library/sample holding every run's pattern, '; '-separated.
R1_df = (
    barcode_runs[['library', 'sample', 'R1']]
    .groupby(['library', 'sample'])['R1']
    .apply('; '.join)
    .reset_index()
    )

# With R1 removed, rows for a sample sequenced in several runs are expected to
# be identical and collapse under drop_duplicates(); if any other column
# differs between runs, validate='one_to_one' makes the merge raise instead of
# silently keeping multiple rows per sample.
barcode_runs = (barcode_runs
                .drop(columns=['R1'])
                .drop_duplicates()
                .merge(R1_df,
                       how='left',
                       on=['library', 'sample'],
                       validate='one_to_one',
                      )
               )

assert len(barcode_runs.groupby(['library', 'sample'])) == len(barcode_runs), (
    "expected exactly one row per 'library' + 'sample' after merging runs"
    )
barcode_runs.to_csv('barcode_runs_new.csv', index=False)

display(HTML(barcode_runs.head().to_html()))

211020
211018
211118


Unnamed: 0,date,experiment,library,antibody,concentration,sort_bin,HutchBase,experiment_type,number_cells,frac_escape,sample,R1
0,210930,TiteSeq,lib1,monomeric_ACE2,1.0,1,210930_s01-b1,TiteSeq,1051589,,TiteSeq_01_bin1,/shared/ngs/illumina/agreaney/211020_D00300_1350_BHN3NYBCX3/Unaligned/Project_agreaney/210930_s01-b1_*R1*.fastq.gz; /shared/ngs/illumina/agreaney/211018_D00300_1346_AHMTMLBCX3/Unaligned/Project_agreaney/210930_s01-b1_*R1*.fastq.gz
1,210930,TiteSeq,lib1,monomeric_ACE2,1.0,2,210930_s01-b2,TiteSeq,624665,,TiteSeq_01_bin2,/shared/ngs/illumina/agreaney/211020_D00300_1350_BHN3NYBCX3/Unaligned/Project_agreaney/210930_s01-b2_*R1*.fastq.gz; /shared/ngs/illumina/agreaney/211018_D00300_1346_AHMTMLBCX3/Unaligned/Project_agreaney/210930_s01-b2_*R1*.fastq.gz
2,210930,TiteSeq,lib1,monomeric_ACE2,1.0,3,210930_s01-b3,TiteSeq,1279752,,TiteSeq_01_bin3,/shared/ngs/illumina/agreaney/211020_D00300_1350_BHN3NYBCX3/Unaligned/Project_agreaney/210930_s01-b3_*R1*.fastq.gz; /shared/ngs/illumina/agreaney/211018_D00300_1346_AHMTMLBCX3/Unaligned/Project_agreaney/210930_s01-b3_*R1*.fastq.gz
3,210930,TiteSeq,lib1,monomeric_ACE2,1.0,4,210930_s01-b4,TiteSeq,8085751,,TiteSeq_01_bin4,/shared/ngs/illumina/agreaney/211020_D00300_1350_BHN3NYBCX3/Unaligned/Project_agreaney/210930_s01-b4_*R1*.fastq.gz; /shared/ngs/illumina/agreaney/211018_D00300_1346_AHMTMLBCX3/Unaligned/Project_agreaney/210930_s01-b4_*R1*.fastq.gz
4,210930,TiteSeq,lib1,monomeric_ACE2,2.0,1,210930_s02-b1,TiteSeq,1295531,,TiteSeq_02_bin1,/shared/ngs/illumina/agreaney/211020_D00300_1350_BHN3NYBCX3/Unaligned/Project_agreaney/210930_s02-b1_*R1*.fastq.gz; /shared/ngs/illumina/agreaney/211018_D00300_1346_AHMTMLBCX3/Unaligned/Project_agreaney/210930_s02-b1_*R1*.fastq.gz


Here's how I had it for a previous (antibody mapping?) repo: 

In [4]:
# barcode_runs = pd.DataFrame()

# for run in runs:
#     print(run)
#     df = (
#         pd.read_csv(f'{run}_barcode_runs.csv')
#         .assign(
#             sample=lambda x: x['experiment']+'-'+x['antibody']+'-'+x['concentration']+'-'+x['sort_bin'],
#             R1=lambda x: runs[run]+'/'+x['experiment']+'_'+x['library']+'_'+x['sort_bin']+'_*R1*.fastq.gz',
#             )
#         )
    
#     assert len(df.groupby(['library', 'sample'])) == len(df)
#     barcode_runs = pd.concat([barcode_runs, df], ignore_index=True)
    
# R1_df=(barcode_runs[['library', 'sample', 'R1']])
# R1_df=R1_df.groupby(['library', 'sample'])['R1'].apply(lambda x: '; '.join(x)).reset_index()

# barcode_runs = (barcode_runs
#                 .drop(columns=['R1'])
#                 .drop_duplicates()
#                 .merge(R1_df,
#                        how='left',
#                        on=['library', 'sample'],
#                        validate='one_to_one',
#                       )
#                )

# assert len(barcode_runs.groupby(['library', 'sample'])) == len(barcode_runs)
# barcode_runs.to_csv('../barcode_runs.csv', index=False)

# display(HTML(barcode_runs.head().to_html()))

Test that this will not break the `snakemake` pipeline.

In [5]:
# Resolve every '; '-separated glob pattern in `R1` to the files it matches on
# disk, count the matches per row, and build the `sample_lib` identifier.
barcode_runs_expandR1 = barcode_runs.assign(
    R1=lambda df: df['R1'].str.split('; ').map(
        lambda patterns: [
            match
            for pattern in patterns
            for match in glob.glob(pattern)
        ]
    ),
    # later keyword arguments see the columns assigned above, so `R1` is
    # already the list of matched files here
    n_R1=lambda df: df['R1'].map(len),
    sample_lib=lambda df: df['sample'] + '_' + df['library'],
)

# every sample/library combination must map to exactly one row
assert barcode_runs_expandR1['sample_lib'].nunique() == len(barcode_runs_expandR1)

# a row whose patterns matched no file at all is an error
if (barcode_runs_expandR1['n_R1'] < 1).any():
    raise ValueError(f"no R1 for {barcode_runs_expandR1.query('n_R1 < 1')}")

display(HTML(barcode_runs_expandR1.head().to_html()))

Unnamed: 0,date,experiment,library,antibody,concentration,sort_bin,HutchBase,experiment_type,number_cells,frac_escape,sample,R1,n_R1,sample_lib
0,210930,TiteSeq,lib1,monomeric_ACE2,1.0,1,210930_s01-b1,TiteSeq,1051589,,TiteSeq_01_bin1,"[/shared/ngs/illumina/agreaney/211020_D00300_1350_BHN3NYBCX3/Unaligned/Project_agreaney/210930_s01-b1_S1_R1_001.fastq.gz, /shared/ngs/illumina/agreaney/211018_D00300_1346_AHMTMLBCX3/Unaligned/Project_agreaney/210930_s01-b1_S1_R1_001.fastq.gz]",2,TiteSeq_01_bin1_lib1
1,210930,TiteSeq,lib1,monomeric_ACE2,1.0,2,210930_s01-b2,TiteSeq,624665,,TiteSeq_01_bin2,"[/shared/ngs/illumina/agreaney/211020_D00300_1350_BHN3NYBCX3/Unaligned/Project_agreaney/210930_s01-b2_S2_R1_001.fastq.gz, /shared/ngs/illumina/agreaney/211018_D00300_1346_AHMTMLBCX3/Unaligned/Project_agreaney/210930_s01-b2_S2_R1_001.fastq.gz]",2,TiteSeq_01_bin2_lib1
2,210930,TiteSeq,lib1,monomeric_ACE2,1.0,3,210930_s01-b3,TiteSeq,1279752,,TiteSeq_01_bin3,"[/shared/ngs/illumina/agreaney/211020_D00300_1350_BHN3NYBCX3/Unaligned/Project_agreaney/210930_s01-b3_S3_R1_001.fastq.gz, /shared/ngs/illumina/agreaney/211018_D00300_1346_AHMTMLBCX3/Unaligned/Project_agreaney/210930_s01-b3_S3_R1_001.fastq.gz]",2,TiteSeq_01_bin3_lib1
3,210930,TiteSeq,lib1,monomeric_ACE2,1.0,4,210930_s01-b4,TiteSeq,8085751,,TiteSeq_01_bin4,"[/shared/ngs/illumina/agreaney/211020_D00300_1350_BHN3NYBCX3/Unaligned/Project_agreaney/210930_s01-b4_S4_R1_001.fastq.gz, /shared/ngs/illumina/agreaney/211018_D00300_1346_AHMTMLBCX3/Unaligned/Project_agreaney/210930_s01-b4_S4_R1_001.fastq.gz]",2,TiteSeq_01_bin4_lib1
4,210930,TiteSeq,lib1,monomeric_ACE2,2.0,1,210930_s02-b1,TiteSeq,1295531,,TiteSeq_02_bin1,"[/shared/ngs/illumina/agreaney/211020_D00300_1350_BHN3NYBCX3/Unaligned/Project_agreaney/210930_s02-b1_S5_R1_001.fastq.gz, /shared/ngs/illumina/agreaney/211018_D00300_1346_AHMTMLBCX3/Unaligned/Project_agreaney/210930_s02-b1_S5_R1_001.fastq.gz]",2,TiteSeq_02_bin1_lib1
