# Contributes progeny by cell
This notebook annotates each infected cell by whether it contributes any progeny to the supernatant and second_infection

Import python modules:


In [None]:
from IPython.display import display

from dms_variants.constants import CBPALETTE

import pandas as pd

import plotnine as p9

Read input/output paths, wildcards, and parameters from the `snakemake` object:

In [None]:
# Wildcard and parameter supplied by the Snakemake rule.
expt = snakemake.wildcards.expt
# Cast to float; used below as the threshold for calling progeny contribution.
progeny_detection_limit = float(snakemake.params.progeny_detection_limit)

# Input CSV paths.
transcription_progeny_csv = snakemake.input.transcription_progeny_csv
viral_genes_by_cell_csv = snakemake.input.viral_genes_by_cell_csv

# Output paths written at the end of the notebook.
plot = snakemake.output.plot
contributes_progeny_by_cell_csv = snakemake.output.contributes_progeny_by_cell_csv

Style parameters. *N.b.* `CBPALETTE` is defined in imports above.

In [None]:
# Make plotnine's classic theme the default for every figure drawn below.
default_theme = p9.theme_classic()
p9.theme_set(default_theme)

## Load data
Load data on viral barcodes in each infected cell and frequency of those viral barcodes in each progeny sample.  

In [None]:
# Read the viral barcode / progeny frequency table from the input CSV.
viral_barcode_freqs = pd.read_csv(filepath_or_buffer=transcription_progeny_csv)
display(viral_barcode_freqs)

Load viral gene presence/absence data

In [None]:
# Read the per-cell viral gene table from the input CSV.
viral_genes_by_cell = pd.read_csv(filepath_or_buffer=viral_genes_by_cell_csv)
display(viral_genes_by_cell)

Merge viral gene data into viral progeny data:

In [None]:
# Left join keeps every row of the barcode table. `validate` makes pandas
# raise if the gene table has more than one row per (cell_barcode, gene) key.
viral_barcode_freqs = viral_barcode_freqs.merge(
    viral_genes_by_cell,
    how='left',
    on=['cell_barcode', 'gene'],
    validate='many_to_one',
)
display(viral_barcode_freqs)

## Detect viral barcodes in cells

Annotate whether each cell has a viral barcode detected on each barcoded gene:

In [None]:
# A row counts as "detected" when its `viral_barcode` entry is not missing.
viral_barcode_freqs = viral_barcode_freqs.assign(
    viral_bc_detected=viral_barcode_freqs['viral_barcode'].notna()
)
display(viral_barcode_freqs)

In [None]:
# Count distinct cells for each gene, split by barcode detection status.
viral_bc_detected_counts_df = (
    viral_barcode_freqs
    .groupby(['gene', 'viral_bc_detected'])['cell_barcode']
    .nunique()
    .rename('n_cells')
    .reset_index()
)

# Stacked bar chart: one bar per gene, filled by detection status.
bc_detected_aes = p9.aes(x='gene', y='n_cells', fill='viral_bc_detected')
bc_detected_theme = p9.theme(
    figure_size=(3, 3),
    plot_title=p9.element_text(size=9),
    axis_title=p9.element_text(size=9),
    legend_title=p9.element_text(size=9),
    legend_title_align='center',
)
viral_bc_detected_counts = (
    p9.ggplot(viral_bc_detected_counts_df, bc_detected_aes)
    + p9.geom_bar(stat='identity')
    + p9.ggtitle('Valid viral barcode detected in each infected cell\n'
                 f'{expt}')
    + bc_detected_theme
    + p9.scale_fill_manual(values=[CBPALETTE[1], CBPALETTE[0]])
)
display(viral_bc_detected_counts)

Plot whether missing viral barcodes are the result of that specific barcoded segment being absent from the cell:

In [None]:
# Among rows lacking a viral barcode, count distinct cells per gene, split by
# whether the gene itself was called present.
missing_genes_df = (
    viral_barcode_freqs
    .loc[lambda df: ~df['viral_bc_detected']]
    .groupby(['gene', 'gene_present'])['cell_barcode']
    .nunique()
    .rename('n_cells')
    .reset_index()
)

# Stacked bar chart: one bar per gene, filled by gene presence.
missing_genes_aes = p9.aes(x='gene', y='n_cells', fill='gene_present')
missing_genes_theme = p9.theme(
    figure_size=(3, 1.5),
    plot_title=p9.element_text(size=9),
    axis_title=p9.element_text(size=9),
    legend_title=p9.element_text(size=9),
    legend_title_align='center',
)
missing_genes = (
    p9.ggplot(missing_genes_df, missing_genes_aes)
    + p9.geom_bar(stat='identity')
    + p9.ggtitle('Missing viral genes in cells without a valid viral barcode detected\n'
                 f'{expt}')
    + missing_genes_theme
    + p9.scale_fill_manual(values=[CBPALETTE[1], CBPALETTE[0]])
)
display(missing_genes)

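Annotate whether each cell contributes progeny. For each source, a cell is called as contributing progeny if the maximum progeny frequency among its detected viral barcodes exceeds `progeny_detection_limit`.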

In [None]:
# For each (cell, source) pair, take the highest progeny frequency over the
# rows with a detected viral barcode, then flag pairs above the detection limit.
contributes_progeny_by_cell_df = (
    viral_barcode_freqs
    .loc[lambda df: df['viral_bc_detected']]
    .groupby(['cell_barcode', 'source'])['progeny_freq']
    .max()
    .rename('max_progeny_freq')
    .reset_index()
    .assign(
        contributes_progeny=lambda df: (
            df['max_progeny_freq'] > progeny_detection_limit
        )
    )
)

# Figure width scales with the number of sources on the x-axis.
n_progeny_sources = contributes_progeny_by_cell_df['source'].nunique()
contributes_progeny_theme = p9.theme(
    figure_size=(1.5*n_progeny_sources, 3),
    plot_title=p9.element_text(size=9),
    axis_title=p9.element_text(size=9),
    legend_title=p9.element_text(size=9),
    legend_title_align='center',
)
contributes_progeny_by_cell = (
    p9.ggplot(contributes_progeny_by_cell_df,
              p9.aes(x='source', fill='contributes_progeny'))
    + p9.geom_bar(stat='count')
    + p9.ggtitle('Infected cells with valid viral barcode detected\n'
                 'contributes any progeny\n'
                 f'{expt}')
    + contributes_progeny_theme
    + p9.scale_fill_manual(values=[CBPALETTE[1], CBPALETTE[0]])
)
display(contributes_progeny_by_cell)

Merge back into `viral_barcode_freqs` dataframe. Cell/source combinations with no valid viral barcode detected on any gene have no match in the merge and are annotated as "valid viral bc not detected".

In [None]:
# Attach the per-(cell, source) call to every row. Pairs absent from
# `contributes_progeny_by_cell_df` come back as missing and are relabelled
# with an explicit string category.
viral_barcode_freqs = (
    viral_barcode_freqs
    .merge(
        contributes_progeny_by_cell_df,
        how='left',
        on=['cell_barcode', 'source'],
        validate='many_to_one',
    )
    .assign(
        contributes_progeny=lambda df: (
            df['contributes_progeny'].fillna("valid viral bc not detected")
        )
    )
)

display(viral_barcode_freqs)

Plot outcome for every cell, ranked by max viral progeny contribution:

In [None]:
# One row per unique (cell, source, max frequency, call) combination.
# Missing frequencies are set to the detection limit, then cells are ranked
# within each source from highest to lowest frequency (ties broken by order).
rank_columns = ['cell_barcode', 'source', 'max_progeny_freq', 'contributes_progeny']
max_progeny_rank_df = (
    viral_barcode_freqs[rank_columns]
    .drop_duplicates()
    .assign(
        max_progeny_freq=lambda df: (
            df['max_progeny_freq'].fillna(progeny_detection_limit)
        )
    )
    .assign(
        rank=lambda df: (
            df.groupby('source')['max_progeny_freq']
            .rank(ascending=False, method='first')
        )
    )
)

# Rank plot faceted by source; the dashed line marks the detection limit.
n_rank_sources = max_progeny_rank_df['source'].nunique()
max_progeny_rank_theme = p9.theme(
    figure_size=(4*n_rank_sources, 2),
    plot_title=p9.element_text(size=9),
    axis_title=p9.element_text(size=9),
    legend_title=p9.element_text(size=9),
    legend_title_align='center',
)
max_progeny_rank = (
    p9.ggplot(max_progeny_rank_df,
              p9.aes(x='rank',
                     y='max_progeny_freq',
                     color='factor(contributes_progeny)'))
    + p9.geom_point(alpha=0.3)
    + p9.facet_grid('~source')
    + p9.geom_hline(yintercept=progeny_detection_limit,
                    linetype='dashed',
                    color=CBPALETTE[2])
    + p9.ggtitle('Infected cells contributes any progeny\n'
                 f'{expt}')
    + p9.scale_y_log10()
    + max_progeny_rank_theme
    + p9.scale_color_manual(values=CBPALETTE[:])
)

display(max_progeny_rank)

### Relationship between viral burden and progeny production

In [None]:
# One row per unique (cell, call, source, viral UMI fraction) combination.
viral_burden_df = (
    viral_barcode_freqs
    [['cell_barcode', 'contributes_progeny', 'source', 'frac_viral_UMIs']]
    .drop_duplicates()
)

# Box plot of viral UMI fraction by progeny call, faceted by source.
viral_burden_theme = p9.theme(
    figure_size=(3*viral_barcode_freqs['source'].nunique(), 3),
    plot_title=p9.element_text(size=9),
    axis_title=p9.element_text(size=9),
    axis_text_x=p9.element_text(rotation=45),
    legend_title=p9.element_text(size=9),
    legend_title_align='center',
)
viral_burden_contributes_progeny = (
    p9.ggplot(viral_burden_df,
              p9.aes(x='factor(contributes_progeny)',
                     y='frac_viral_UMIs',
                     fill='factor(contributes_progeny)'))
    + p9.geom_boxplot()
    + p9.facet_grid('~source')
    + p9.ggtitle('Infected cells with valid viral barcode detected\n'
                 'contributes any progeny\n'
                 f'{expt}')
    + p9.scale_y_log10()
    + viral_burden_theme
    + p9.scale_fill_manual(values=CBPALETTE[:])
)
display(viral_burden_contributes_progeny)

Repeat the plot, subset to cells with all 8 viral genes:

In [None]:
# Keep only rows where `n_viral_genes` equals 8; the filter is applied once
# and reused for both the plot data and the figure width.
# NOTE(review): `n_viral_genes` presumably comes from the viral gene table
# merged in earlier — confirm.
all_genes_rows = viral_barcode_freqs.query('n_viral_genes == 8')
all_genes_burden_df = (
    all_genes_rows
    [['cell_barcode', 'contributes_progeny', 'source', 'frac_viral_UMIs']]
    .drop_duplicates()
)

# Same box plot as above, drawn for the filtered rows only.
all_genes_burden_theme = p9.theme(
    figure_size=(3*all_genes_rows['source'].nunique(), 3),
    plot_title=p9.element_text(size=9),
    axis_title=p9.element_text(size=9),
    axis_text_x=p9.element_text(rotation=45),
    legend_title=p9.element_text(size=9),
    legend_title_align='center',
)
viral_burden_contributes_progeny = (
    p9.ggplot(all_genes_burden_df,
              p9.aes(x='factor(contributes_progeny)',
                     y='frac_viral_UMIs',
                     fill='factor(contributes_progeny)'))
    + p9.geom_boxplot()
    + p9.facet_grid('~source')
    + p9.ggtitle('Infected cells with 8 viral genes\n'
                 'contributes any progeny\n'
                 f'{expt}')
    + p9.scale_y_log10()
    + all_genes_burden_theme
    + p9.scale_fill_manual(values=CBPALETTE[:])
)
display(viral_burden_contributes_progeny)

## Output

Save output figure

In [None]:
# Save the "contributes progeny" bar chart to the Snakemake output path.
# `ggplot.save` is called directly instead of going through the legacy
# `p9.ggsave(plot=...)` wrapper, so the cell does not depend on how that
# wrapper names its first argument.
print(f"Saving plot to {plot}")
contributes_progeny_by_cell.save(filename=plot, verbose=False)

Export whether each infected cell contributes viral progeny for each source (one row per unique cell/source annotation):

In [None]:
# Reduce to the per-(cell, source) annotation columns, drop repeated rows,
# and write the table to the output CSV without the index.
output_columns = [
    'cell_barcode',
    'source',
    'max_progeny_freq',
    'contributes_progeny',
]
output_df = viral_barcode_freqs.loc[:, output_columns].drop_duplicates()

output_df.to_csv(contributes_progeny_by_cell_csv, index=False)