# Viral barcodes by cell from 10x transcriptomics
This Python Jupyter notebook counts the viral barcodes by cell in the 10x transcriptomic data, and drops any viral barcodes that are ambiguous (either called as ambiguous due to lack of consensus for the UMI, or containing an `N`).

Import Python modules:

In [None]:
import matplotlib.pyplot as plt

import pandas as pd

Get `snakemake` variables [as described here](https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#jupyter-notebook-integration):

In [None]:
# The `snakemake` object is provided by Snakemake's notebook integration.

# wildcard: experiment identifier (shown in the plot title)
expt = snakemake.wildcards.expt

# input: CSV of viral barcodes by cell and UMI
viral_bc_by_cell_umi_csv = snakemake.input.viral_bc_by_cell_umi_csv

# outputs: CSV of per-cell viral barcode counts, and the summary plot
viral_bc_by_cell_csv = snakemake.output.viral_bc_by_cell_csv
plot = snakemake.output.plot

Read data frame of viral barcodes by cell and UMI:

In [None]:
# Load the viral-barcode-by-cell-and-UMI table; downstream cells use its
# `gene`, `cell_barcode`, `viral_barcode` and `UMI` columns.
# NOTE(review): with default `read_csv` NA parsing, a field that is literally
# "NA" (e.g. a gene named just "NA") is read as NaN, and the later `groupby`
# presumably drops such rows -- confirm gene names cannot collide with NA markers.
viral_bc_by_cell_umi = pd.read_csv(viral_bc_by_cell_umi_csv)

Create a data frame that counts the UMIs for each viral barcode by cell (and gene), and then annotate any viral barcodes with `N` nucleotides as ambiguous:

In [None]:
# Count UMIs for every (gene, cell barcode, viral barcode) combination, then
# relabel any viral barcode containing an `N` as 'ambiguous' and flag every
# row whose barcode is 'ambiguous' (whether relabelled here or already so).
viral_bc_by_cell = (
    viral_bc_by_cell_umi
    .groupby(['gene', 'cell_barcode', 'viral_barcode'], as_index=False)
    .agg(count=('UMI', 'count'))
    .assign(
        viral_barcode=lambda df: df['viral_barcode'].mask(
            df['viral_barcode'].str.contains('N'), 'ambiguous'),
        is_ambiguous=lambda df: df['viral_barcode'].eq('ambiguous'),
    )
)

# sanity check: every barcode not flagged as ambiguous consists only of A, C, G, T
assert (
    viral_bc_by_cell['is_ambiguous']
    | viral_bc_by_cell['viral_barcode'].str.fullmatch('[ACGT]+')
).all()

Write output CSV file with the viral barcode counts per cell, dropping ambiguous barcodes:

In [None]:
print(f"Writing per-cell viral barcode counts to {viral_bc_by_cell_csv}")
# keep only unambiguous barcodes; the helper flag column is not written out
(
    viral_bc_by_cell
    .loc[lambda df: ~df['is_ambiguous']]
    .drop(columns=['is_ambiguous'])
    .to_csv(viral_bc_by_cell_csv, index=False, compression='gzip')
)

Make summary plots:

In [None]:
viral_genes = viral_bc_by_cell['gene'].unique()

# panels: UMI totals, knee plot, then one per-cell purity panel for each viral gene
fig, axes = plt.subplots(
    nrows=1,
    ncols=len(viral_genes) + 2,
    figsize=(3.25 * len(viral_genes) + 6.5, 4),
)
fig.suptitle(f"viral barcodes in 10x transcriptomics for experiment {expt}")

# total UMIs with a called viral barcode per gene, split by whether the
# barcode is ambiguous; result has genes as rows and one column per category
tot_umi_counts = (
    viral_bc_by_cell
    .groupby(['gene', 'is_ambiguous'], as_index=False)
    .agg(UMIs=('count', 'sum'))
    .assign(
        is_ambiguous=lambda df: df['is_ambiguous'].map(
            {True: 'ambiguous (discarded)',
             False: 'unambiguous (retained)'}),
    )
    .pivot_table(index='gene', columns='is_ambiguous')
    .fillna(0)
    # pivoting without `values` leaves a ('UMIs', category) column MultiIndex;
    # keep only the category level and drop its name so the legend has no title
    .droplevel(0, axis='columns')
    .rename_axis(columns=None)
)
_ = (
    tot_umi_counts
    .plot.bar(ax=axes[0])
    .legend(loc='lower center', bbox_to_anchor=(0.5, 1))
)
axes[0].set_ylabel('UMIs with called viral barcode')
# leave headroom above the tallest bar for the rotated value labels
ymax = tot_umi_counts.max().max()
axes[0].set_ylim(bottom=0, top=1.5 * ymax)
for patch in axes[0].patches:
    bar_height = patch.get_height()
    axes[0].annotate(
        f"{bar_height:.2g}",
        xy=(patch.get_x() + 0.1, bar_height + 0.05 * ymax),
        rotation=90,
    )

# knee plot of number of times each viral barcode is observed (excluding ambiguous barcodes)
# `n_umis` has barcode rank as rows (1 = most UMIs within the gene), genes as
# columns, and the barcode's UMI count summed over all cells as values
n_umis = (
    viral_bc_by_cell
    .query('not is_ambiguous')
    .groupby(['gene', 'viral_barcode'], as_index=False)
    .aggregate(count=pd.NamedAgg('count', 'sum'))
    .sort_values('count', ascending=False)
    .assign(rank=lambda x: x.groupby('gene').cumcount() + 1)
    .pivot_table(index='rank',
                 columns='gene',
                 values='count')
    )
n_umis.columns.name = None
# Pick each line's color from the gene's position in `viral_genes`, which is
# how the per-gene purity panels are colored. A gene with only ambiguous
# barcodes is absent from `n_umis`; assigning colors by column position would
# then shift the colors of the remaining genes relative to the purity panels.
# The first two cycle colors are skipped because the bar plot uses them.
gene_palette = plt.rcParams['axes.prop_cycle'].by_key()['color'][2:]
gene_colors = {gene_name: gene_palette[idx % len(gene_palette)]
               for idx, gene_name in enumerate(viral_genes)}
_ = n_umis.plot(kind='line',
                logx=True,
                logy=True,
                ax=axes[1],
                color=[gene_colors[gene_name] for gene_name in n_umis.columns],
                )
axes[1].set_ylabel('UMIs for viral barcode (all cells)')
axes[1].set_xlabel('viral barcode rank')
axes[1].set_title('knee plot for viral barcodes')

# "purity" (freq of most abundant viral barcode) vs viral barcode UMIs across cells (excluding ambiguous barcodes)
purity = (
    viral_bc_by_cell
    .query('not is_ambiguous')
    .assign(cell_count=lambda x: x.groupby(['gene', 'cell_barcode'])['count'].transform('sum'),
            cell_freq=lambda x: x['count'] / x['cell_count'],
            )
    .groupby(['gene', 'cell_barcode', 'cell_count'], as_index=False)
    .aggregate(purity=pd.NamedAgg('cell_freq', 'max'))
    )
for i, (gene, color) in enumerate(zip(viral_genes,
                                      plt.rcParams['axes.prop_cycle'].by_key()['color'][2:])
                                  ):
    _ = purity.query('gene == @gene').plot(x='cell_count',
                                           y='purity',
                                           alpha=0.3,
                                           kind='scatter',
                                           c=color,
                                           ax=axes[2 + i],
                                           )
    axes[2 + i].set_xlabel('UMIs with viral barcode in cell')
    axes[2 + i].set_ylabel('freq most abundant barcode')
    axes[2 + i].set_title(f"per-cell purity for {gene}")
    
# save plot
# the layout is tightened before saving so the written file uses the adjusted spacing
fig.tight_layout()
print(f"Saving plot to {plot}")
# output path comes from `snakemake.output.plot`
fig.savefig(plot)