# Filter progeny viral barcodes
This notebook filters viral barcodes in progeny samples to identify high-confidence, valid viral barcodes linked to infected cells.

Import Python modules:

In [None]:
import gzip

from IPython.display import display

from dms_variants.constants import CBPALETTE

from scipy.stats.mstats import gmean

import numpy as np

import pandas as pd

import plotnine as p9

Hardcode variables for now:

In [None]:
# Input: per-cell annotations that carry the infecting viral tag (hardcoded path).
cell_annotations_csv = 'results/viral_tags_bcs_in_cells/scProgenyProduction_trial3_cell_barcodes_with_viral_tags.csv.gz'
# Input: corrected viral barcode counts in progeny, supplied by snakemake.
viral_bc_in_progeny_corrected_csv = snakemake.input.viral_bc_in_progeny_corrected_csv
# Input: valid viral barcodes for each cell (hardcoded path).
valid_viral_bc_csv = 'results/viral_fastq10x/scProgenyProduction_trial3_valid_viral_bc_by_cell.csv.gz'
# Output: CSV of averaged progeny barcode frequencies (hardcoded path).
filtered_progeny_viral_bc_csv = 'results/viral_fastq10x/scProgenyProduction_trial3_filtered_progeny_viral_bc.csv.gz'
# Output: file the exported plot is saved to, supplied by snakemake.
plot = snakemake.output.plot
# Experiment name from the snakemake wildcard; it is printed in the plot titles.
expt = snakemake.wildcards.expt
# NOTE(review): the three hardcoded paths name scProgenyProduction_trial3 while
# `expt` is a wildcard -- confirm whether they should become snakemake
# inputs/outputs so the notebook works for other experiments.

### Load data
Load viral barcode counts in progeny.

In [None]:
# Load per-barcode counts in progeny and use the column name `viral_barcode`
# so it matches the other tables in this notebook.
viral_progeny = pd.read_csv(viral_bc_in_progeny_corrected_csv)
viral_progeny = viral_progeny.rename(columns={'barcode': 'viral_barcode'})

# Total counts for each sample (source / tag / gene / replicate).
# numeric_only=True keeps string columns such as `viral_barcode` out of the
# sum. Without it, pandas >= 2.0 concatenates the strings, the merge below
# then suffixes `viral_barcode` as well, and later merges on `viral_barcode`
# can no longer find the column.
sample_keys = ['source', 'tag', 'gene', 'replicate']
sample_totals = (viral_progeny
                 .groupby(sample_keys)
                 .sum(numeric_only=True)
                 .reset_index())

# Attach the sample total to every barcode row; overlapping numeric columns
# get the suffixes `_barcode` (per-barcode value) and `_sample` (sample total).
viral_progeny = pd.merge(viral_progeny,
                         sample_totals,
                         on=sample_keys,
                         suffixes=('_barcode', '_sample'))

# Frequency of each barcode within its sample.
viral_progeny['freq'] = (viral_progeny['count_barcode'] /
                         viral_progeny['count_sample'])
display(viral_progeny)

Load viral tags for each cell

In [None]:
# Read the cell annotations and expose the infecting viral tag under the
# shorter column name `tag` used by the merges further down.
viral_tags = (
    pd.read_csv(cell_annotations_csv)
    .rename(columns={'infecting_viral_tag': 'tag'})
)
display(viral_tags)

 Load list of valid viral barcodes and merge in tag info for each cell

In [None]:
# Read the valid viral barcodes and attach the tag of the cell each one was
# found in. No `on=` is given, so pandas joins on the columns the two frames
# share; validate='many_to_one' raises if the keys are duplicated in viral_tags.
valid_viral_barcodes = pd.read_csv(valid_viral_bc_csv).merge(
    viral_tags[['cell_barcode', 'tag']],
    how='left',
    validate='many_to_one',
)

# Repeat the table once for every progeny source and then once for every
# replicate, so each valid barcode has a row for each source/replicate pair.
for label_column, label_values in [
    ('source', ['supernatant', 'second_infection']),
    ('replicate', ['replicate_1', 'replicate_2']),
]:
    valid_viral_barcodes = pd.concat(
        [valid_viral_barcodes.assign(**{label_column: label_value})
         for label_value in label_values]
    )
display(valid_viral_barcodes)

### Integrate dataframes
Merge progeny barcodes with valid barcodes

In [None]:
# Columns that identify one barcode within one progeny sample.
barcode_keys = ['source', 'gene', 'tag', 'replicate', 'viral_barcode']

# One row per key combination together with its valid flag.
valid_lookup = (
    valid_viral_barcodes[barcode_keys + ['valid_viral_bc']]
    .drop_duplicates()
)

# The outer join keeps barcodes that appear in only one of the two tables.
# Rows without a valid flag are marked False; every remaining missing value
# (e.g. counts and frequency of valid barcodes absent from progeny) becomes 0.
all_progeny_barcodes = (
    viral_progeny
    .merge(valid_lookup,
           how='outer',
           on=barcode_keys,
           validate='many_to_one')
    .assign(valid_viral_bc=lambda df: df['valid_viral_bc'].fillna(False))
    .fillna(0)
)

display(all_progeny_barcodes)

Set theme for plots.

In [None]:
# Make the classic theme the default for every plotnine figure created below.
p9.theme_set(p9.theme_classic())

### Valid viral barcodes from infected cells
Plot and filter viral barcodes so only valid viral barcodes found in infected cells are analyzed.

In [None]:
# Jittered barcode frequencies (log10 y axis) split by valid status on the
# x axis, with one panel per progeny source (rows) and gene (columns).
all_barcodes_title = (
    'all viral barcodes\n'
    'faceted by progeny source, valid status in infected cells\n'
    f'{expt}'
)
all_barcodes_theme = p9.theme(
    figure_size=(3, 4),
    plot_title=p9.element_text(size=9),
    axis_title=p9.element_text(size=9),
    legend_title=p9.element_text(size=9),
    legend_title_align='center',
)
fig = (
    p9.ggplot(all_progeny_barcodes, p9.aes(x='valid_viral_bc', y='freq'))
    + p9.geom_jitter(alpha=0.1, width=0.1, height=0)
    + p9.facet_grid('source~gene')
    + p9.ggtitle(all_barcodes_title)
    + p9.scale_y_log10()
    + all_barcodes_theme
)
display(fig)

In [None]:
# Keep only the barcodes flagged as valid viral barcodes.
valid_progeny_barcodes = all_progeny_barcodes.query('valid_viral_bc == True')

In [None]:
# Jittered frequencies of the valid barcodes (log10 y axis) for each progeny
# source, with one panel per gene.
valid_barcodes_title = (
    'valid viral barcodes\n'
    f'{expt}'
)
valid_barcodes_theme = p9.theme(
    figure_size=(3, 2),
    plot_title=p9.element_text(size=9),
    axis_title=p9.element_text(size=9),
    axis_text_x=p9.element_text(rotation=45, ha='right'),
    legend_title=p9.element_text(size=9),
    legend_title_align='center',
)
fig = (
    p9.ggplot(valid_progeny_barcodes, p9.aes(x='source', y='freq'))
    + p9.geom_jitter(alpha=0.1, width=0.1, height=0)
    + p9.facet_grid('~gene')
    + p9.ggtitle(valid_barcodes_title)
    + p9.labs(y='progeny frequency')
    + p9.scale_y_log10()
    + valid_barcodes_theme
)
display(fig)

### Define limit of detection
Some very low frequency viral barcodes in the sequencing samples are likely to be spurious. Here, I will set a detection limit value and make knee plots.

In [None]:
# Detection limit on barcode frequency. Frequencies must be strictly greater
# than this value to count as "above the limit" in the cells below, and
# replicate frequencies at or below it are later set to this value.
detection_limit = 1e-5

In [None]:
# One row per valid barcode and sample, restricted to rows with a source.
rank_columns = ['source', 'gene', 'tag', 'replicate', 'viral_barcode', 'freq']
freq_rank = (
    valid_progeny_barcodes
    .query('source.notnull()', engine='python')
    [rank_columns]
    .drop_duplicates()
)

# Rank by frequency (highest first) inside each source/gene group; ties are
# broken by order of appearance. Also flag frequencies above the limit.
freq_rank = freq_rank.assign(
    rank=lambda df: (df.groupby(['source', 'gene'])['freq']
                     .rank(ascending=False, method='first')),
    above_detection_limit=lambda df: df['freq'] > detection_limit,
)

display(freq_rank)

# Knee plot: frequency (log10) against rank, coloured by whether the barcode
# is above the detection limit, which is drawn as a dashed horizontal line.
knee_title = (
    'valid viral barcodes in progeny\n'
    f'{expt}'
)
knee_theme = p9.theme(
    figure_size=(5, 3),
    plot_title=p9.element_text(size=9),
    axis_title=p9.element_text(size=9),
    legend_title=p9.element_text(size=9),
    legend_title_align='center',
)
fig = (
    p9.ggplot(freq_rank,
              p9.aes(x='rank', y='freq', color='above_detection_limit'))
    + p9.geom_point(alpha=0.1)
    + p9.geom_hline(yintercept=detection_limit,
                    linetype='dashed',
                    color=CBPALETTE[2])
    + p9.facet_grid('source~gene')
    + p9.ggtitle(knee_title)
    + p9.scale_y_log10()
    + knee_theme
    + p9.scale_color_manual(CBPALETTE[0:])
)
display(fig)

### Replicate correlation
Plot correlation of replicates

In [None]:
# Reshape to one row per barcode with one frequency column per replicate;
# a barcode missing from a replicate gets frequency 0.
replicates = (
    valid_progeny_barcodes
    .query('source.notnull()', engine='python')
    .pivot(index=['source','tag','gene','viral_barcode'],
           columns=['replicate'],
           values='freq')
    .reset_index()
    .fillna(0)
)

# Flag barcodes above the detection limit in both replicates, and barcodes
# with a non-zero frequency in at least one replicate.
replicates = replicates.assign(
    above_limit_both_replicates=lambda df: (
        df['replicate_1'].gt(detection_limit)
        & df['replicate_2'].gt(detection_limit)
    ),
    contributes_progeny=lambda df: (
        df['replicate_1'].gt(0) | df['replicate_2'].gt(0)
    ),
)
display(replicates)

In [None]:
# Replicate 1 against replicate 2 on log10 axes, coloured by whether the
# barcode is above the detection limit in both replicates. Dashed lines mark
# the detection limit on each axis.
scatter_title = (
    'valid viral barcodes in progeny\n'
    f'detection limit = {detection_limit}\n'
    f'{expt}'
)
scatter_theme = p9.theme(
    figure_size=(4, 4),
    plot_title=p9.element_text(size=9),
    axis_title=p9.element_text(size=9),
    legend_title=p9.element_text(size=9),
    legend_title_align='center',
)
fig = (
    p9.ggplot(replicates,
              p9.aes(x='replicate_1',
                     y='replicate_2',
                     color='above_limit_both_replicates'))
    + p9.geom_point(alpha=0.2)
    + p9.geom_hline(yintercept=detection_limit,
                    linetype='dashed',
                    color=CBPALETTE[2])
    + p9.geom_vline(xintercept=detection_limit,
                    linetype='dashed',
                    color=CBPALETTE[2])
    + p9.facet_grid('source~gene')
    + p9.ggtitle(scatter_title)
    + p9.scale_x_log10()
    + p9.scale_y_log10()
    + scatter_theme
    + p9.scale_color_manual([CBPALETTE[1], CBPALETTE[0]])
)
display(fig)

Calculate the correlation of progeny barcodes with both replicates found above limit of detection.

In [None]:
# Correlation between the two replicates within each source/gene group,
# using only barcodes above the detection limit in both replicates.
above_limit_in_both = replicates.query('above_limit_both_replicates == True')
replicate_correlation = (
    above_limit_in_both
    .groupby(['source', 'gene'])
    [['replicate_1', 'replicate_2']]
    .corr()
    .reset_index()
)
replicate_correlation

### Average replicates
First, apply quality control to very low frequency sequences. Then, take the geometric mean of the two replicates. The geometric mean is a better representation of our samples than the arithmetic mean, since they demonstrate bottlenecking.

Quality control very low frequency measurements:  
    1. Assign all frequencies below detection limit to the detection limit value.  
    2. Assign all viral barcodes that are missing from a replicate the detection limit value.

In [None]:
# Raise every replicate frequency that is not strictly above the detection
# limit (including the zeros of missing barcodes) to the detection limit.
for replicate in ['replicate_1', 'replicate_2']:
    observed = replicates[replicate]
    # Series.where keeps values where the condition holds, else substitutes.
    replicates[replicate] = observed.where(observed > detection_limit,
                                           detection_limit)
display(replicates)

Calculate geometric mean.

In [None]:
# Geometric mean of the two replicate frequencies, computed per barcode row.
replicate_pair = [replicates['replicate_1'], replicates['replicate_2']]
replicates = replicates.assign(average_freq=gmean(replicate_pair))

In [None]:
# Show the table, which now includes the `average_freq` column.
replicates

In [None]:
# Histogram of the averaged barcode frequencies (log10 x axis), filled by
# whether the barcode had a non-zero frequency in at least one replicate.
# The dashed vertical line marks the detection limit.
histogram_title = (
    'valid viral barcodes in progeny\n'
    'averaged technical replicates\n'
    f'detection limit = {detection_limit}\n'
    f'{expt}'
)
histogram_theme = p9.theme(
    figure_size=(6, 4),
    plot_title=p9.element_text(size=10),
    axis_title=p9.element_text(size=10),
    legend_title=p9.element_text(size=10),
    legend_title_align='center',
)
output_fig = (
    p9.ggplot(replicates,
              p9.aes(x='average_freq', fill='contributes_progeny'))
    + p9.geom_histogram(bins=20)
    + p9.geom_vline(xintercept=detection_limit,
                    linetype='dashed',
                    color=CBPALETTE[2])
    + p9.facet_grid('source~gene')
    + p9.ggtitle(histogram_title)
    + p9.labs(x='average frequency',
              y='n viral barcodes')
    + p9.scale_x_log10()
    + histogram_theme
    + p9.scale_fill_manual([CBPALETTE[1], CBPALETTE[0]])
)
display(output_fig)

### Export results
Export histogram and CSV of average barcode frequency in each progeny sample.

In [None]:
# Save the histogram of averaged barcode frequencies (`output_fig`).
# `fig` must not be used here: at this point it still holds the
# replicate-correlation scatter plot, not the histogram being exported.
print(f"Saving plot to {plot}")
p9.ggsave(plot=output_fig, filename=plot, verbose=False)

In [None]:
# Save the averaged frequency of every valid barcode as CSV.
# to_csv is called with its defaults, so the row index is written as an
# extra unnamed first column.
export_columns = ['source', 'tag', 'gene', 'viral_barcode', 'average_freq']
replicates[export_columns].to_csv(filtered_progeny_viral_bc_csv)