# Extract viral barcodes from progeny viral barcode sequencing data
This Python Jupyter notebook parses viral barcodes from progeny sequencing data sources (e.g. supernatant or second infection). For each sequencing sample, the viral barcodes are parsed and tallied.

Import Python modules:

In [None]:
import pandas as pd

import Bio.SeqIO

from dms_variants import illuminabarcodeparser

Get `snakemake` variables [as described here](https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#jupyter-notebook-integration):

In [None]:
# `snakemake` is injected into the notebook namespace by Snakemake's
# notebook integration; alias its three sections once, then unpack them.
_smk_input = snakemake.input
_smk_params = snakemake.params
_smk_output = snakemake.output

# Rule input
viral_genbank = _smk_input.viral_genbank

# Rule params
fastq_df = _smk_params.fastq_df
viral_barcode_upstream_length = _smk_params.viral_barcode_upstream_length
viral_barcode_mismatch_threshold = _smk_params.viral_barcode_mismatch_threshold
barcoded_viral_genes = _smk_params.barcoded_viral_genes

# Rule outputs
viral_bc_in_progeny_csv = _smk_output.viral_bc_in_progeny_csv
viral_bc_fates_csv = _smk_output.viral_bc_fates_csv

Get the barcode length and the upstream sequence for each barcoded gene:

In [None]:
# Collect one (gene, barcode length, upstream sequence) record for every
# GenBank record whose id is listed in `barcoded_viral_genes`.
barcode_details = []
for s in Bio.SeqIO.parse(viral_genbank, 'genbank'):
    if s.id not in barcoded_viral_genes:
        continue

    bc_features = [f for f in s.features if f.type == 'viral_barcode']
    # Explicit raise rather than `assert`: asserts are stripped under `-O`
    # and give no hint about which gene is malformed.
    if len(bc_features) != 1:
        raise ValueError(f"expected exactly one 'viral_barcode' feature in "
                         f"{s.id}, found {len(bc_features)}")
    bc_feature = bc_features[0]

    bc_start = bc_feature.location.start
    bc_len = bc_feature.location.end - bc_start

    # The upstream sequence is the `viral_barcode_upstream_length` bases that
    # immediately precede the barcode feature.
    upstream_seq_start = bc_start - viral_barcode_upstream_length
    upstream_seq_end = bc_start
    # A negative start would make the slice below wrap around from the end of
    # the sequence and silently return a wrong (often empty) upstream sequence.
    if upstream_seq_start < 0:
        raise ValueError(f"barcode in {s.id} starts at position {int(bc_start)}, "
                         f"which leaves fewer than {viral_barcode_upstream_length} "
                         "upstream bases")

    barcode_details.append((s.id,
                            bc_len,
                            str(s.seq[upstream_seq_start:upstream_seq_end])))

barcode_details_df = pd.DataFrame.from_records(barcode_details,
                                               columns=['gene', 'bc_len', 'upstream_seq'])

display(barcode_details_df)

# Later lookups are keyed by gene, so each gene may appear only once.
if len(barcode_details_df) != barcode_details_df['gene'].nunique():
    raise ValueError('code assumes at most one barcode per gene')

Iterate through the sequencing samples, parse the viral barcodes from each sample's FASTQ files, and tally the barcode counts and read fates:

In [None]:
# Explicit raise rather than `assert`, which is stripped under `-O`.
if len(fastq_df['experiment'].unique()) != 1:
    raise ValueError("code assumes dataframe contains 1 experiment")

# Index the per-gene barcode details once instead of re-indexing the
# dataframe twice for every sample group.
details_by_gene = barcode_details_df.set_index('gene')

barcodes_records = []
fates_records = []
for (source, tag, gene, replicate), df in fastq_df.groupby(['source', 'tag', 'gene', 'replicate']):
    # Fail with a readable message instead of an opaque KeyError from `.at`.
    if gene not in details_by_gene.index:
        raise ValueError(f"no viral barcode details found for gene {gene!r}")
    upstream_seq = details_by_gene.at[gene, 'upstream_seq']
    viral_barcode_length = details_by_gene.at[gene, 'bc_len']
    viral_barcode_parser = illuminabarcodeparser.IlluminaBarcodeParser(bclen=viral_barcode_length,
                                                                       upstream=upstream_seq,
                                                                       upstream_mismatch=viral_barcode_mismatch_threshold,
                                                                       bc_orientation='R2')
    r1files = df['fastq_path'].tolist()

    print(f"Parsing files for {source} {tag} {gene} {replicate}.")
    ibarcodes, ifates = viral_barcode_parser.parse(r1files)

    # Label every parsed row with the sample it came from.
    sample_labels = dict(source=source, tag=tag, gene=gene, replicate=replicate)
    barcodes_records.append(ibarcodes.assign(**sample_labels))
    fates_records.append(ifates.assign(**sample_labels))


# Stack the per-sample tables and keep only the labelled columns, in a fixed order.
barcodes_df = pd.concat(barcodes_records).reset_index()
barcodes_df = barcodes_df[['source','tag','gene','replicate','barcode','count']]
fates_df = pd.concat(fates_records).reset_index()
fates_df = fates_df[['source','tag','gene','replicate','fate','count']]
print('Done.')

Write the viral barcodes and the barcode-parsing fates to gzip-compressed CSV files:

In [None]:
# Write each table as a gzip-compressed CSV without the row index,
# announcing the destination before each write.
for description, frame, csv_path in [
    ('viral barcodes', barcodes_df, viral_bc_in_progeny_csv),
    ('viral barcode fates', fates_df, viral_bc_fates_csv),
]:
    print(f"Writing {description} to {csv_path}")
    frame.to_csv(csv_path, index=False, compression='gzip')