# Extract viral barcodes from progeny viral barcode sequencing data
This Python Jupyter notebook parses viral barcodes from progeny sequencing data sources (e.g. supernatant or second infection). For each sequencing sample, the viral barcodes are parsed and tallied.

Import Python modules:

In [None]:
import pandas as pd

import Bio.SeqIO

Get `snakemake` variables [as described here](https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#jupyter-notebook-integration):

In [None]:
# Bind the inputs and parameters injected by Snakemake to plain notebook
# variables so the cells below do not reference `snakemake` directly.

# CSV of barcode locations; read below with `pd.read_csv` and expected to
# provide `gene`, `start`, and `end` columns.
viral_bc_locs = snakemake.input.viral_bc_locs
# GenBank file of viral genes; parsed below with `Bio.SeqIO`.
viral_genbank = snakemake.input.viral_genbank
# NOTE(review): not used by any cell visible here; presumably describes the
# FASTQ files for each sequencing sample -- confirm against the Snakefile.
fastq_df = snakemake.params.fastq_df
# Number of nucleotides immediately upstream of the barcode to extract.
viral_barcode_upstream_length = snakemake.params.viral_barcode_upstream_length
# Expected barcode length; checked against `end - (start - 1)` below.
viral_barcode_length = snakemake.params.viral_barcode_length
# Collection of GenBank record IDs for the genes that carry a barcode.
barcoded_viral_genes = snakemake.params.barcoded_viral_genes

Read the viral barcode locations:

In [None]:
# Load the table of barcode locations (one row per barcoded gene, with
# `gene`, `start`, and `end` columns) and validate it before use.
print(f"Reading viral barcode locations from {viral_bc_locs}")
bc_locs_df = pd.read_csv(viral_bc_locs)
display(bc_locs_df)

# Later cells look up a single barcode `start` per gene, so each gene may
# appear on at most one row.
if len(bc_locs_df) != bc_locs_df['gene'].nunique():
    raise ValueError('code assumes at most one barcode per gene')

# `start` and `end` are both inclusive, so the barcode spans
# `end - (start - 1)` sites. Raise explicitly (rather than `assert`, which
# is stripped under `python -O`) and report which genes are inconsistent.
bc_lengths = bc_locs_df['end'] - (bc_locs_df['start'] - 1)
bad_length_genes = bc_locs_df.loc[bc_lengths != viral_barcode_length, 'gene']
if len(bad_length_genes) > 0:
    raise ValueError(
        'barcode start and end sites do not match expected length '
        f"of {viral_barcode_length} for gene(s): {bad_length_genes.tolist()}"
    )

Get the sequence (of length `viral_barcode_upstream_length`) immediately upstream of the barcode for each barcoded gene:

In [None]:
# For every barcoded gene in the GenBank file, extract the
# `viral_barcode_upstream_length` nucleotides immediately 5' of the barcode.
# Barcode coordinates come from `bc_locs_df`, where `start` is 1-based.
upstream_seqs = []
for s in Bio.SeqIO.parse(viral_genbank, 'genbank'):
    if s.id not in barcoded_viral_genes:
        continue

    # Sanity check the annotation: exactly one `viral_barcode` feature.
    bc_features = [f for f in s.features if f.type == 'viral_barcode']
    if len(bc_features) != 1:
        raise ValueError(
            f"expected exactly one viral_barcode feature in {s.id}, "
            f"found {len(bc_features)}"
        )

    # Look the barcode start up once, and require exactly one match so a
    # missing or duplicated gene gives a clear error instead of the opaque
    # failure of calling `int()` on a Series that is not length one.
    gene_bc_starts = bc_locs_df.loc[bc_locs_df['gene'] == s.id, 'start']
    if len(gene_bc_starts) != 1:
        raise ValueError(
            f"expected exactly one barcode location for {s.id}, "
            f"found {len(gene_bc_starts)}"
        )
    bc_start = int(gene_bc_starts.iloc[0])

    # Convert to a 0-based, end-exclusive slice: the slice stops at the
    # first barcode site (so the barcode itself is excluded) and reaches
    # back `viral_barcode_upstream_length` sites from there.
    upstream_seq_end = bc_start - 1
    upstream_seq_start = upstream_seq_end - viral_barcode_upstream_length
    if upstream_seq_start < 0:
        # A negative slice start would silently wrap around to the end of
        # the sequence and return the wrong (often empty) string.
        raise ValueError(
            f"barcode in {s.id} starts at site {bc_start}, which leaves "
            f"fewer than {viral_barcode_upstream_length} upstream sites"
        )

    upstream_seqs.append((s.id,
                          str(s.seq[upstream_seq_start:upstream_seq_end])))

upstream_seqs_df = pd.DataFrame.from_records(upstream_seqs,
                                             columns=['gene', 'upstream_seq'])

display(upstream_seqs_df)

# Each gene should contribute a single upstream sequence.
if len(upstream_seqs_df) != upstream_seqs_df['gene'].nunique():
    raise ValueError('code assumes at most one barcode per gene')

For now, just touch the intended output file so that the `Snakefile` can run via `snakemake`:

In [None]:
from pathlib import Path

# Placeholder: create the expected output CSV as an empty file so that the
# workflow finds the declared output.
progeny_bc_csv = Path(snakemake.output.viral_bc_in_progeny_csv)
progeny_bc_csv.touch()