# Extract viral barcodes from progeny viral barcode sequencing data
This Python Jupyter notebook parses viral barcodes from progeny sequencing data sources (e.g. supernatant or second infection). For each sequencing sample, the viral barcodes are parsed and tallied.

Import Python modules:

In [None]:
import pandas as pd

import Bio.SeqIO

from dms_variants import illuminabarcodeparser

Get `snakemake` variables [as described here](https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#jupyter-notebook-integration):

In [None]:
# Input files provided by the Snakemake rule.
viral_genbank = snakemake.input.viral_genbank
viral_bc_locs = snakemake.input.viral_bc_locs

# Output file written at the end of this notebook.
viral_bc_in_progeny_csv = snakemake.output.viral_bc_in_progeny_csv

# Rule parameters: sample table, barcoded genes, and barcode parsing settings.
fastq_df = snakemake.params.fastq_df
barcoded_viral_genes = snakemake.params.barcoded_viral_genes
viral_barcode_length = snakemake.params.viral_barcode_length
viral_barcode_upstream_length = snakemake.params.viral_barcode_upstream_length
viral_barcode_mismatch_threshold = snakemake.params.viral_barcode_mismatch_threshold

Read the viral barcode locations:

In [None]:
# Load the table of viral barcode locations and validate it before use.
print(f"Reading viral barcode locations from {viral_bc_locs}")
bc_locs_df = pd.read_csv(viral_bc_locs)
display(bc_locs_df)

# Later cells look up a single barcode start position per gene.
if len(bc_locs_df) != bc_locs_df['gene'].nunique():
    raise ValueError('code assumes at most one barcode per gene')

# Every row must span exactly `viral_barcode_length` sites, counting both
# `start` and `end`. Raise explicitly (rather than `assert`) so the check
# still runs when Python is started with `-O`.
bc_spans = bc_locs_df['end'] - (bc_locs_df['start'] - 1)
if (bc_spans != viral_barcode_length).any():
    raise ValueError('barcode start and end sites do not match expected length')

Get the upstream sequence for each barcoded gene:

In [None]:
# For each barcoded gene, extract the `viral_barcode_upstream_length`
# nucleotides that immediately precede the barcode in the GenBank record.
upstream_seqs = []
for s in Bio.SeqIO.parse(viral_genbank, 'genbank'):
    if s.id not in barcoded_viral_genes:
        continue

    # Each barcoded gene must have exactly one annotated barcode feature.
    bc_features = [f for f in s.features if f.type == 'viral_barcode']
    if len(bc_features) != 1:
        raise ValueError(f"expected exactly one viral_barcode feature for "
                         f"{s.id}, found {len(bc_features)}")

    # Look up the barcode start once and take the scalar explicitly;
    # calling `int()` on a Series is deprecated in recent pandas.
    gene_bc_starts = bc_locs_df.loc[bc_locs_df['gene'] == s.id, 'start']
    if len(gene_bc_starts) != 1:
        raise ValueError(f"expected exactly one barcode location for "
                         f"{s.id}, found {len(gene_bc_starts)}")
    bc_start = int(gene_bc_starts.iloc[0])

    # `start` is shifted down by one to give a Python slice index; the slice
    # end is exclusive, so the first barcode site itself is left out.
    upstream_seq_end = bc_start - 1
    upstream_seq_start = upstream_seq_end - viral_barcode_upstream_length
    if upstream_seq_start < 0:
        # A negative slice start would silently wrap to the end of the sequence.
        raise ValueError(f"barcode in {s.id} starts at {bc_start}, too close "
                         f"to the sequence start for an upstream region of "
                         f"length {viral_barcode_upstream_length}")

    upstream_seqs.append((s.id,
                          str(s.seq[upstream_seq_start:upstream_seq_end])))

upstream_seqs_df = pd.DataFrame.from_records(upstream_seqs,
                                             columns=['gene', 'upstream_seq'])

display(upstream_seqs_df)

if len(upstream_seqs_df) != upstream_seqs_df['gene'].nunique():
    raise ValueError('code assumes at most one barcode per gene')

Iterate through the sequencing samples, collect the FASTQ files for each one, and parse and tally the viral barcodes they contain:

In [None]:
# Parse and tally the viral barcodes for every sequencing sample in `fastq_df`.
if len(fastq_df['experiment'].unique()) != 1:
    raise ValueError("code assumes dataframe contains 1 experiment")

# The parser settings vary only by gene (through its upstream sequence), so
# build one parser per gene up front instead of once per sample.
viral_barcode_parsers = {}
for gene in fastq_df['gene'].unique():
    # Read the sequence value directly so that a gene with no upstream
    # sequence fails here instead of yielding a bogus search string.
    gene_upstream_seqs = upstream_seqs_df.loc[upstream_seqs_df['gene'] == gene,
                                              'upstream_seq']
    if len(gene_upstream_seqs) != 1:
        raise ValueError(f"expected exactly one upstream sequence for gene "
                         f"{gene}, found {len(gene_upstream_seqs)}")
    viral_barcode_parsers[gene] = illuminabarcodeparser.IlluminaBarcodeParser(
        bclen=viral_barcode_length,
        upstream=gene_upstream_seqs.iloc[0],
        upstream_mismatch=viral_barcode_mismatch_threshold,
        bc_orientation='R2',
    )

barcodes_records = []
fates_records = []
for source in fastq_df['source'].unique():
    for tag in fastq_df['tag'].unique():
        for gene in fastq_df['gene'].unique():
            viral_barcode_parser = viral_barcode_parsers[gene]
            for replicate in fastq_df['replicate'].unique():
                # FASTQ files belonging to this exact sample.
                r1files = (fastq_df.query('(source == @source) and '
                                          '(tag == @tag) and '
                                          '(gene == @gene) and '
                                          '(replicate == @replicate)')
                           ['fastq_path']
                           .tolist())

                print(f"Parsing files for {source} {tag} {gene} {replicate}.")
                ibarcodes, ifates = viral_barcode_parser.parse(r1files)

                # Label both result tables with the sample they came from.
                sample_labels = dict(source=source,
                                     tag=tag,
                                     gene=gene,
                                     replicate=replicate)
                barcodes_records.append(ibarcodes.assign(**sample_labels))
                fates_records.append(ifates.assign(**sample_labels))

# Stack the per-sample tables and keep the columns in a fixed order.
barcodes_df = pd.concat(barcodes_records).reset_index(drop=True)
barcodes_df = barcodes_df[['source', 'tag', 'gene', 'replicate', 'barcode', 'count']]
fates_df = pd.concat(fates_records).reset_index(drop=True)
fates_df = fates_df[['source', 'tag', 'gene', 'replicate', 'fate', 'count']]
print('Done.')

Write the viral barcodes to the output CSV file:

In [None]:
# Save the tallied barcodes as a gzip-compressed CSV without the row index.
print(f"Writing viral barcodes to {viral_bc_in_progeny_csv}")

barcodes_df.to_csv(
    viral_bc_in_progeny_csv,
    compression='gzip',
    index=False,
)