# Pacbio consensus UMI
This notebook calls consensus mutations on PacBio data grouped by cell_barcode, gene, and UMI. Then, it exports a processed CSV with the following columns:
* cell_barcode
* gene
* UMI
* mutation
* mutation_CCS

In [None]:
from IPython.display import display

from dms_variants.constants import CBPALETTE

import pandas as pd

import plotnine as p9

In [None]:
# Paths and wildcards are read from the `snakemake` object. This notebook does
# not define it, so it is presumably injected when Snakemake runs the notebook.
UMI_mutations_csv = snakemake.input.UMI_mutations_csv  # input CSV read below
expt = snakemake.wildcards.expt  # experiment label shown in plot titles
consensus_UMI_csv = snakemake.output.consensus_UMI_csv  # output CSV path

Style parameters:

In [None]:
# Make plotnine's classic theme the default for every plot in this notebook.
p9.theme_set(p9.theme_classic())

## Load Data

In [None]:
# Read the mutation table. Later cells rely on the columns `cell_barcode`,
# `gene`, `UMI`, `query_name` (counted as the CCS identifier), and `mutations`.
UMI_mutations = pd.read_csv(UMI_mutations_csv)
display(UMI_mutations)

## Process Data

Split each mutation into its own row:

In [None]:
# Turn the space-delimited `mutations` string into a list, then give every
# list element its own row. Values are passed through `str` first, so a
# missing entry ends up as the single token 'nan' rather than raising.
UMI_mutations = (
    UMI_mutations
    .assign(
        muts_split=lambda df: df['mutations'].map(
            lambda muts: str(muts).split(' ')
        )
    )
    .explode('muts_split')
)

display(UMI_mutations)

Count the total number of CCS for each cell_barcode-gene-UMI, and the number of CCS supporting each mutation within each cell_barcode-gene-UMI:

In [None]:
def _count_unique_CCS(frame, group_cols, count_name):
    """Count distinct `query_name` values within each group of `group_cols`.

    Returns a flat dataframe holding the grouping columns plus one column,
    named `count_name`, with the number of unique CCS in that group.
    """
    return (
        frame
        .groupby(group_cols)['query_name']
        .nunique()
        .rename(count_name)
        .reset_index()
    )


# CCS supporting each individual mutation of a cell_barcode-gene-UMI
mutation_CCS_df = _count_unique_CCS(
    UMI_mutations,
    ['cell_barcode', 'gene', 'UMI', 'muts_split'],
    'mutation_CCS',
)
# All CCS observed for a cell_barcode-gene-UMI
total_CCS_df = _count_unique_CCS(
    UMI_mutations,
    ['cell_barcode', 'gene', 'UMI'],
    'total_CCS',
)
display(mutation_CCS_df)
display(total_CCS_df)

Merge into UMI_mutations df:

In [None]:
# Attach both counts to every row with left joins so that no row of
# UMI_mutations is dropped. `validate='many_to_one'` makes pandas raise if
# the join keys are not unique in the right-hand count table.
UMI_mutations = (
    UMI_mutations
    .merge(
        mutation_CCS_df,
        how='left',
        on=['cell_barcode', 'gene', 'UMI', 'muts_split'],
        validate='many_to_one',
    )
    .merge(
        total_CCS_df,
        how='left',
        on=['cell_barcode', 'gene', 'UMI'],
        validate='many_to_one',
    )
)
display(UMI_mutations)

Calculate fraction of total CCS with mutation:

In [None]:
# Share of the cell_barcode-gene-UMI's CCS that carry this mutation.
UMI_mutations = UMI_mutations.assign(
    frac_CCS=lambda df: df['mutation_CCS'].div(df['total_CCS'])
)
display(UMI_mutations)

## Call consensus
Label a mutation as consensus if it is found in >50% of the CCS for its cell_barcode-gene-UMI. This corresponds to `frac_CCS > 0.5` in our dataframe.

In [None]:
# Strict majority: a mutation seen in exactly half of the CCS is not consensus.
UMI_mutations = UMI_mutations.assign(
    consensus=lambda df: df['frac_CCS'].gt(0.5)
)
display(UMI_mutations)

## Plot outcomes

Plot distribution of total CCS per cell_barcode-gene-UMI:

In [None]:
# One row per cell_barcode-gene-UMI, so each one is counted a single time.
total_CCS_per_UMI = (
    UMI_mutations
    [['cell_barcode', 'gene', 'UMI', 'total_CCS']]
    .drop_duplicates()
)

total_CCS_histo = (
    p9.ggplot(total_CCS_per_UMI, p9.aes(x='total_CCS'))
    + p9.geom_histogram(stat='count')
    + p9.ggtitle('n CCS per cell_barcode-gene-UMI\n'
                 f'{expt}')
    + p9.labs(x='n CCS',
              y='n cell_barcode-gene-UMI')
    + p9.theme(
        figure_size=(4, 3),
        plot_title=p9.element_text(size=9),
        axis_title=p9.element_text(size=9),
        legend_title=p9.element_text(size=9),
        legend_title_align='center',
    )
)

display(total_CCS_histo)

Plot distribution of CCS per mutation:

In [None]:
# One row per cell_barcode-gene-UMI-mutation, so each mutation is counted a
# single time no matter how many CCS carry it.
CCS_per_mutation = (
    UMI_mutations
    [['cell_barcode', 'gene', 'UMI', 'muts_split', 'mutation_CCS']]
    .drop_duplicates()
)

mutation_CCS_histo = (
    p9.ggplot(CCS_per_mutation, p9.aes(x='mutation_CCS'))
    + p9.geom_histogram(stat='count')
    + p9.ggtitle('n CCS per mutation\n'
                 f'{expt}')
    + p9.labs(x='n CCS',
              y='n cell_barcode-gene-UMI-mutation')
    + p9.theme(
        figure_size=(4, 3),
        plot_title=p9.element_text(size=9),
        axis_title=p9.element_text(size=9),
        legend_title=p9.element_text(size=9),
        legend_title_align='center',
    )
)

display(mutation_CCS_histo)

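Plot the number of CCS supporting each mutation against the total number of CCS for its cell_barcode-gene-UMI, colored by whether the mutation was called consensus: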

In [None]:
# One point per cell_barcode-gene-UMI-mutation with its two CCS counts and
# the consensus call.
consensus_scatter_data = (
    UMI_mutations
    [['cell_barcode', 'gene', 'UMI', 'muts_split', 'mutation_CCS', 'total_CCS', 'consensus']]
    .drop_duplicates()
)

consensus_mutations_scatter = (
    p9.ggplot(
        consensus_scatter_data,
        p9.aes(x='total_CCS', y='mutation_CCS', color='consensus'),
    )
    # Low alpha keeps overplotted points distinguishable.
    + p9.geom_point(alpha=0.1)
    + p9.ggtitle('Consensus mutations \n'
                 f'{expt}')
    + p9.theme(
        figure_size=(4, 3),
        plot_title=p9.element_text(size=9),
        axis_title=p9.element_text(size=9),
        legend_title=p9.element_text(size=9),
        legend_title_align='center',
    )
    + p9.scale_color_manual([CBPALETTE[1], CBPALETTE[0]])
)

display(consensus_mutations_scatter)

In [None]:
# Export one row per consensus mutation.
#
# UMI_mutations still holds one row per CCS per mutation, so without
# `drop_duplicates` the same cell_barcode-gene-UMI-mutation would be written
# once for every CCS that supports it. `mutation_CCS` is carried along because
# it is one of the output columns listed in the notebook introduction.
output_df = (
    UMI_mutations
    .query('consensus==True')
    [['cell_barcode', 'gene', 'UMI', 'muts_split', 'mutation_CCS']]
    .drop_duplicates()
    .rename(columns={'muts_split': 'mutation'})
    .reset_index(drop=True)
)

# Each consensus mutation must appear exactly once per cell_barcode-gene-UMI.
assert not output_df.duplicated(
    ['cell_barcode', 'gene', 'UMI', 'mutation']
).any(), 'duplicate cell_barcode-gene-UMI-mutation rows in output'

display(output_df)

print(f'Saving UMI consensus mutations to {consensus_UMI_csv}')
output_df.to_csv(consensus_UMI_csv, index=False)