# Process viral barcode replicates
This notebook plots the correlation between technical replicates of raw viral barcode sequencing data. Then, it calculates the mean frequency of each barcode and exports this value to a CSV. Data source is supernatant or second infection.

Import Python modules:

In [None]:
import gzip
import itertools
import random

from IPython.display import display

from dms_variants.constants import CBPALETTE

import numpy as np

import editdistance

import pandas as pd

import plotnine as p9

Get input and output file paths and the experiment name from Snakemake:

In [None]:
# The `snakemake` object is not defined in this notebook; it must be provided
# by the environment that runs it.
smk_input, smk_output = snakemake.input, snakemake.output

# Experiment name, used in plot titles.
expt = snakemake.wildcards.expt

# Input: gzip-compressed CSV of viral barcode counts.
viral_bc_in_progeny_csv = smk_input.viral_bc_in_progeny_csv

# Outputs: CSV of mean barcode frequencies and the replicate-correlation plot.
viral_bc_in_progeny_freq_csv = smk_output.viral_bc_in_progeny_freq_csv
plot = smk_output.plot

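Load viral barcode counts and calculate the frequency of each barcode within its sample (source, tag, gene, replicate). The dataframe is pivoted so replicates are in two columns later, when plotting the replicate correlation.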

In [None]:
# Read the gzip-compressed CSV of barcode counts. The `with` block closes the
# file handle as soon as pandas has finished reading.
with gzip.open(viral_bc_in_progeny_csv) as counts_file:
    viral_bc = pd.read_csv(counts_file)

# Columns that together identify one sequenced sample.
sample_cols = ['source', 'tag', 'gene', 'replicate']

# Total counts per sample. `numeric_only=True` restricts the sum to numeric
# columns; without it, recent pandas versions also "sum" (concatenate)
# string columns, which would add a second `barcode` column to the merge
# below and rename the original one.
sample_totals = (viral_bc
                 .groupby(sample_cols)
                 .sum(numeric_only=True)
                 .reset_index())

# After the merge, `count_barcode` is the count of one barcode and
# `count_total` is the summed count of its sample.
viral_bc = pd.merge(viral_bc,
                    sample_totals,
                    on=sample_cols,
                    suffixes=('_barcode', '_total'))

# Frequency of each barcode among all barcode counts in its sample.
viral_bc['freq'] = (viral_bc['count_barcode'] /
                    viral_bc['count_total'])
display(viral_bc)

### Plots
Set theme for plots.

In [None]:
# Make plotnine's classic theme the default for every figure below.
classic_theme = p9.theme_classic()
p9.theme_set(classic_theme)

Plot number of barcodes identified for each sample:

In [None]:
# Count the distinct barcodes observed in each sample.
n_barcodes_df = (viral_bc
                 .groupby(['source', 'tag', 'gene', 'replicate'])['barcode']
                 .nunique()
                 .reset_index()
                 .rename(columns={'barcode': 'n_unique_barcodes'}))

display(n_barcodes_df)

# `facet_grid('gene~source')` lays genes out in rows and sources in columns,
# so the figure width scales with the number of sources and the height with
# the number of genes.
fig = (p9.ggplot(n_barcodes_df, p9.aes(x='tag',
                                       y='n_unique_barcodes',
                                       fill='replicate')) +
       p9.geom_bar(stat='identity', position='position_dodge') +
       p9.facet_grid('gene~source') +
       p9.theme(figure_size=(3*n_barcodes_df['source'].nunique(),
                             1.7*n_barcodes_df['gene'].nunique()),
                plot_title=p9.element_text(size=10),
                axis_title=p9.element_text(size=10),
                legend_title=p9.element_text(size=9),
                legend_title_align='center') +
       p9.scale_fill_manual(CBPALETTE[0:]))
fig

Plot histogram of barcode frequencies for each sample:

In [None]:
# Figure width follows the number of sources, height the number of genes.
freq_hist_size = (3*viral_bc['source'].nunique(),
                  2*viral_bc['gene'].nunique())
freq_hist_theme = p9.theme(figure_size=freq_hist_size,
                           plot_title=p9.element_text(size=10),
                           axis_title=p9.element_text(size=10),
                           legend_title=p9.element_text(size=9),
                           legend_title_align='center')

# Histogram of barcode frequencies with both axes on a log10 scale, one panel
# per gene/tag (rows) and source (columns), bars filled by replicate.
fig = p9.ggplot(viral_bc, p9.aes(x='freq', fill='replicate'))
fig = fig + p9.geom_histogram(bins=100)
fig = fig + p9.facet_grid('gene+tag~source')
fig = fig + p9.scale_y_log10()
fig = fig + p9.scale_x_log10()
fig = fig + freq_hist_theme
fig = fig + p9.ylab('n_barcodes')
fig = fig + p9.scale_fill_manual(CBPALETTE[0:])
fig

Plot barcode sharing across tags:

In [None]:
def _join_sorted_tags(tags):
    """Return the distinct tags in `tags`, sorted and joined by ' and '."""
    return ' and '.join(sorted(set(tags)))


# For every barcode (within a source and gene), record which tags it was
# observed in, e.g. 'syn', 'wt' or 'syn and wt'.
barcodes_by_source_gene = viral_bc.groupby(['source', 'gene', 'barcode'])
shared_tags = (barcodes_by_source_gene
               .agg({'tag': _join_sorted_tags})
               .reset_index())

display(shared_tags)

# One row of panels per source; fixed width.
shared_tags_theme = p9.theme(figure_size=(7,
                                          2*shared_tags['source'].nunique()),
                             plot_title=p9.element_text(size=10),
                             axis_title=p9.element_text(size=10),
                             legend_title=p9.element_text(size=9),
                             legend_title_align='center')

# Horizontal bars: number of barcodes per gene, filled by tag combination.
fig = p9.ggplot(shared_tags, p9.aes(x='gene', fill='tag'))
fig = fig + p9.geom_bar(stat='count')
fig = fig + p9.ggtitle(f'barcodes identified in each tag for {expt}')
fig = fig + p9.ylab('n_barcodes')
fig = fig + p9.coord_flip()
fig = fig + p9.facet_grid('source~')
fig = fig + shared_tags_theme
fig = fig + p9.scale_fill_manual(CBPALETTE[0:])

# show plot
fig

Plot barcode frequency by sharing:

In [None]:
# Annotate every barcode row with the tag combination the barcode was seen
# in. Both frames have a `tag` column: the left one keeps its name (suffix
# None) and the right one gets the suffix 's_present', which turns it into
# `tags_present`.
shared_tags_freq = viral_bc.merge(shared_tags,
                                  on=['source', 'gene', 'barcode'],
                                  suffixes=(None, 's_present'))

display(shared_tags_freq)

# Figure width follows the number of sources, height the number of genes.
sharing_hist_theme = p9.theme(
    figure_size=(3*viral_bc['source'].nunique(),
                 2*viral_bc['gene'].nunique()),
    plot_title=p9.element_text(size=10),
    axis_title=p9.element_text(size=10),
    legend_title=p9.element_text(size=9),
    legend_title_align='center')

# Histogram of barcode frequencies (log10 axes), filled by tag combination.
fig = p9.ggplot(shared_tags_freq, p9.aes(x='freq', fill='tags_present'))
fig = fig + p9.geom_histogram(bins=100)
fig = fig + p9.facet_grid('gene~source')
fig = fig + p9.scale_y_log10()
fig = fig + p9.scale_x_log10()
fig = fig + sharing_hist_theme
fig = fig + p9.ylab('n_barcodes')
fig = fig + p9.scale_fill_manual(CBPALETTE[0:])
fig

Plot correlation of barcodes in supernatant and in second infection. Color by presence in each (or both) tags.

In [None]:
# Mean frequency of each barcode in each source, one column per source.
# The aggregation is named by string ('mean'): this is what the other
# pivot tables in this notebook use by default, and it avoids the
# FutureWarning recent pandas versions emit for `np.mean`.
# NOTE(review): barcodes missing from a source are filled with 0, which has
# no finite position on a log10 axis - confirm how such points are rendered.
tag_freqs = (shared_tags_freq
             .pivot_table(index=['gene',
                                 'tag',
                                 'barcode',
                                 'tags_present'],
                          columns=['source'],
                          values='freq',
                          aggfunc='mean')
             .fillna(0)
             .reset_index())

display(tag_freqs)

# Scatter of frequency in supernatant vs. second infection, colored by the
# tag combination each barcode was observed in. The title says
# "frequencies" because the plotted values come from the `freq` column.
fig = (p9.ggplot(tag_freqs,
                 p9.aes(x='supernatant',
                        y='second_infection',
                        color='tags_present')) +
       p9.ggtitle(f'{expt}\n'
                  'barcode frequencies in supernatant '
                  'and second infection for shared libraries') +
       p9.geom_point(alpha=0.4) +
       p9.facet_grid('gene~tag') +
       p9.scale_x_log10() +
       p9.scale_y_log10() +
       p9.scale_color_manual(CBPALETTE[0:]))
fig

Plot frequencies of barcodes and color by which tag (or both) they are found in:

In [None]:
# Mean frequency of each barcode in each tag, one column per tag.
# The aggregation is named by string ('mean'): this is what the other
# pivot tables in this notebook use by default, and it avoids the
# FutureWarning recent pandas versions emit for `np.mean`.
# NOTE(review): barcodes missing from a tag are filled with 0, which has no
# finite position on a log10 axis - confirm how such points are rendered.
tag_freqs = (shared_tags_freq
             .pivot_table(index=['source',
                                 'gene',
                                 'barcode',
                                 'tags_present'],
                          columns=['tag'],
                          values='freq',
                          aggfunc='mean')
             .fillna(0)
             .reset_index())

display(tag_freqs)

# Scatter of frequency in the wt tag vs. the syn tag, colored by the tag
# combination each barcode was observed in. The title says "frequencies"
# because the plotted values come from the `freq` column.
fig = (p9.ggplot(tag_freqs,
                 p9.aes(x='wt',
                        y='syn',
                        color='tags_present')) +
       p9.ggtitle(f'{expt}\n'
                  'barcode frequencies in wt and syn for shared libraries') +
       p9.geom_point() +
       p9.facet_grid('gene~source') +
       p9.scale_x_log10() +
       p9.scale_y_log10() +
       p9.scale_color_manual(CBPALETTE[1:]))
fig

Plot frequencies of barcodes that are found in both tags, colored by replicate:

In [None]:
# Keep only barcodes observed in both tags. The label 'syn and wt' is the
# sorted, ' and '-joined tag list stored in `tags_present`.
# `pivot_table` averages `freq` (its default aggregation) into one column
# per tag, keeping replicates as separate rows.
tag_freqs = (shared_tags_freq
             .query('tags_present == "syn and wt"')
             .pivot_table(index=['source',
                                 'gene',
                                 'barcode',
                                 'replicate'],
                          columns=['tag'],
                          values='freq')
             .fillna(0)
             .reset_index())

display(tag_freqs)

# Scatter of frequency in the wt tag vs. the syn tag, colored by replicate.
# The title says "frequencies" because the plotted values come from the
# `freq` column.
fig = (p9.ggplot(tag_freqs,
                 p9.aes(x='wt',
                        y='syn',
                        color='replicate')) +
       p9.ggtitle(f'{expt}\n'
                  'barcode frequencies in wt and syn for shared libraries') +
       p9.geom_point() +
       p9.facet_grid('gene~source') +
       p9.scale_x_log10() +
       p9.scale_y_log10() +
       p9.scale_color_manual(CBPALETTE[1:]))
fig

Plot correlation of technical replicates and save the plot:

In [None]:
# The wide table below gets one frequency column per replicate, so every
# sample must contain exactly these replicates.
replicate_names = ['replicate_1', 'replicate_2']
n_replicates = len(replicate_names)
expected_replicates = set(replicate_names)
samples = viral_bc.groupby(['source', 'tag', 'gene'])
for (_source, _tag, _gene), df in samples:
    observed_replicates = set(df['replicate'].unique())
    assert observed_replicates == expected_replicates, \
        f"Code assumes exactly {n_replicates} replicates per sample."

# One row per barcode and sample, one frequency column per replicate.
# A barcode missing from a replicate gets frequency 0.
freq_by_replicate = viral_bc.pivot_table(index=['source',
                                                'tag',
                                                'gene',
                                                'barcode'],
                                         columns='replicate',
                                         values='freq')
viral_bc_wide = (freq_by_replicate
                 .reset_index()
                 .fillna(0)
                 .rename(columns={'replicate_1': 'replicate_1_freq',
                                  'replicate_2': 'replicate_2_freq'}))
# Drop the leftover 'replicate' label on the column index.
viral_bc_wide.columns.name = None

# Flag barcodes with a non-zero frequency in both replicates.
replicate_freq_cols = ['replicate_1_freq', 'replicate_2_freq']
viral_bc_wide['in_both'] = (viral_bc_wide[replicate_freq_cols]
                            .gt(0)
                            .all(axis=1))

display(viral_bc_wide)

replicate_theme = p9.theme(
    figure_size=(3.5*viral_bc_wide['gene'].nunique(),
                 2*viral_bc_wide['source'].nunique()),
    plot_title=p9.element_text(size=10),
    axis_title=p9.element_text(size=10),
    legend_title=p9.element_text(size=9),
    legend_title_align='center')

# Replicate 1 vs. replicate 2 frequency on log10 axes, colored by whether
# the barcode was seen in both replicates.
fig = p9.ggplot(viral_bc_wide,
                p9.aes(x='replicate_1_freq',
                       y='replicate_2_freq',
                       color='in_both'))
fig = fig + p9.ggtitle(f'viral barcode replicates for experiment {expt}')
fig = fig + p9.geom_point(alpha=0.1)
fig = fig + p9.facet_grid('source~gene+tag')
fig = fig + p9.scale_x_log10()
fig = fig + p9.scale_y_log10()
fig = fig + replicate_theme
fig = fig + p9.scale_color_manual([CBPALETTE[1], '#000000'])

# save plot
print(f"Saving plot to {plot}")
p9.ggsave(plot=fig, filename=plot, verbose=False)

# show plot
fig

Average frequencies and export CSV:

In [None]:
# Mean of the two replicate frequencies for every barcode.
replicate_freq_sum = (viral_bc_wide['replicate_1_freq']
                      .add(viral_bc_wide['replicate_2_freq']))
viral_bc_wide['mean_freq'] = replicate_freq_sum.div(2)

# Write only the sample identifiers, the barcode and its mean frequency.
export_columns = ['source', 'tag', 'gene', 'barcode', 'mean_freq']
(viral_bc_wide[export_columns]
 .to_csv(viral_bc_in_progeny_freq_csv, index=False))

Calculate and plot edit distance between observed barcodes:

In [None]:
# Maximum number of barcodes sampled per (source, tag, gene) group. All
# pairs of sampled barcodes are compared, so cost grows quadratically.
sample_size = 1000

# Seeded generator so the sampled barcodes, and therefore the histogram,
# are identical on every run of the notebook.
sampling_seed = 0
rng = random.Random(sampling_seed)

pair_columns = ['source',
                'tag',
                'gene',
                'seq_1',
                'seq_2',
                'edit_distance']

records = []
for (source, tag, gene), df in (viral_bc
                                .groupby(['source',
                                          'tag',
                                          'gene'])):
    unique_barcodes = list(df['barcode'].dropna().unique())
    # `random.sample` raises ValueError when asked for more items than the
    # population holds, so cap the request at the number available.
    n_sampled = min(sample_size, len(unique_barcodes))
    print(f'Calculating edit distance for {source} {tag} {gene}.'
          f'\n\tThere are {len(unique_barcodes)} '
          'unique barcodes in dataset.'
          f'\n\tSampling {n_sampled} barcodes for distribution.')
    sample = rng.sample(unique_barcodes, n_sampled)
    for seq_1, seq_2 in itertools.combinations(sample, 2):
        records.append((source,
                        tag,
                        gene,
                        seq_1,
                        seq_2,
                        editdistance.eval(seq_1, seq_2)))

# Passing `columns` here (rather than assigning them afterwards) also works
# when no pairs were generated.
pairs_df = pd.DataFrame.from_records(records, columns=pair_columns)
display(pairs_df)

# Distribution of pairwise edit distances, one panel per gene/tag and source.
fig = (p9.ggplot(pairs_df, p9.aes(x='edit_distance')) +
       p9.ggtitle(f'edit distance between barcodes in {expt}') +
       p9.geom_histogram(binwidth=1) +
       p9.facet_grid('gene+tag~source'))

fig