# Pacbio consensus UMI
This notebook calls consensus mutations on PacBio data grouped by cell_barcode, gene, and UMI. Then, it exports a processed CSV with the following columns:
* cell_barcode
* gene
* UMI
* mutation
* mutation_CCS

In [None]:
from IPython.display import display

from dms_variants.constants import CBPALETTE

import numpy as np

import pandas as pd

import plotnine as p9

In [None]:
# Pipeline parameters read from the `snakemake` object.
# NOTE(review): `snakemake` is not defined anywhere in this notebook;
# presumably it is injected by Snakemake when the notebook is run as a rule.
# Input: CSV of per-CCS mutation calls.
CCS_mutations_csv = snakemake.input.CCS_mutations_csv
# Experiment wildcard, used below in plot titles.
expt = snakemake.wildcards.expt
# Output: path the consensus-per-UMI CSV is written to at the end.
consensus_UMI_csv = snakemake.output.consensus_UMI_csv

Style parameters:

In [None]:
# Use plotnine's classic theme as the default for every plot in this notebook.
p9.theme_set(p9.theme_classic())

## Load Data

In [None]:
# Read the per-CCS mutation table. Later cells rely on the columns
# `cell_barcode`, `gene`, `UMI`, `query_name` and `mutations`.
CCS_mutations = pd.read_csv(CCS_mutations_csv)
display(CCS_mutations)

## Process Data

Generate list of all cell_barcode-gene-UMI:

In [None]:
# One row per distinct cell_barcode-gene-UMI combination seen in the CCS table.
cb_gene_UMI = (
    CCS_mutations
    .loc[:, ['cell_barcode', 'gene', 'UMI']]
    .drop_duplicates()
)
display(cb_gene_UMI)

Count total CCS for cell_barcode-gene-UMI:

In [None]:
# Number of distinct CCS reads (`query_name`) observed for each
# cell_barcode-gene-UMI, stored in the `total_CCS` column.
total_CCS_df = (
    CCS_mutations
    .groupby(['cell_barcode', 'gene', 'UMI'])['query_name']
    .nunique()
    .rename('total_CCS')
    .reset_index()
)
display(total_CCS_df)

Exclude WT CCS and split each mutation into its own row:

In [None]:
# Keep only CCS with a non-null `mutations` entry (i.e. drop WT CCS), split the
# space-delimited mutation string into a list, and explode that list so each
# mutation gets its own row.
#
# This is written as one chain using `.assign` so the new `muts_split` column is
# added to a fresh frame. The previous form assigned a column onto the result
# of `.query(...)`, which triggers pandas' SettingWithCopyWarning.
CCS_mutations_noWT = (
    CCS_mutations
    .query('mutations.notnull()', engine='python')
    .assign(muts_split=lambda df: (df['mutations']
                                   .apply(lambda x: str(x).split(' '))))
    .explode('muts_split')
)

display(CCS_mutations_noWT)

Count number of CCS supporting each mutation for cell_barcode-gene-UMI:

In [None]:
# Number of distinct CCS reads (`query_name`) carrying each individual mutation
# within a cell_barcode-gene-UMI, stored in the `mutation_CCS` column.
mutation_CCS_df = (
    CCS_mutations_noWT
    .groupby(['cell_barcode', 'gene', 'UMI', 'muts_split'])['query_name']
    .nunique()
    .rename('mutation_CCS')
    .reset_index()
)
display(mutation_CCS_df)

Bring in total CCS counts and calculate fraction of total CCS with each mutation:

In [None]:
# Attach each UMI's total CCS count to its per-mutation CCS counts
# (many mutations -> one UMI total), then express the per-mutation count as a
# fraction of that total.
mutation_frac_df = (
    mutation_CCS_df
    .merge(total_CCS_df,
           on=['cell_barcode', 'gene', 'UMI'],
           how='left',
           validate='many_to_one')
    .assign(frac_CCS=lambda df: df['mutation_CCS'] / df['total_CCS'])
)
display(mutation_frac_df)

**Call Consensus Mutations**  
Label mutation as consensus if it is found in >50% of CCS. This corresponds to `frac_CCS > 0.5` in our dataframe.

In [None]:
# A mutation is consensus when strictly more than half of the UMI's CCS carry it.
mutation_frac_df['consensus'] = mutation_frac_df['frac_CCS'].gt(0.5)
display(mutation_frac_df)

In [None]:
# Histogram of the fraction of CCS supporting each mutation within its
# cell_barcode-gene-UMI, filled by whether the mutation was called consensus.
mutation_frac_histo = (
    p9.ggplot(
        (mutation_frac_df
         [['cell_barcode', 'gene', 'UMI', 'muts_split', 'frac_CCS', 'consensus']]
         .drop_duplicates()),
         p9.aes(x='frac_CCS',
                fill='consensus')) +
    p9.geom_histogram(bins=20) +
    p9.ggtitle('Mutation fractions\n'
               '(excludes WT UMI)\n'
               f'{expt}') +
    # frac_CCS is mutation_CCS / total_CCS for a cell_barcode-gene-UMI, so the
    # axis is a fraction of CCS, not a fraction of UMIs.
    p9.labs(x='fraction of total CCS for cell_barcode-gene-UMI') +
    p9.theme(figure_size=(4, 3),
             plot_title=p9.element_text(size=9),
             axis_title=p9.element_text(size=9),
             legend_title=p9.element_text(size=9),
             legend_title_align='center') +
    p9.scale_fill_manual([CBPALETTE[1], CBPALETTE[0]]))

display(mutation_frac_histo)

## Merge mutant and WT UMIs
Merge data into single dataframe, `UMI_mutations`.  
Steps:  
1. Filter `mutation_frac_df` for only consensus mutations
2. Merge with `cb_gene_UMI` dataframe so every `cell_barcode-gene-UMI` has at least one row.
3. Fill `muts_split` column with WT if a `cell_barcode-gene-UMI` does not have any consensus mutations. 

In [None]:
# Left-join the consensus mutations onto the full cell_barcode-gene-UMI list so
# that every UMI keeps at least one row, then label UMIs that have no consensus
# mutation (null `muts_split` after the join) as WT.
UMI_mutations = (
    cb_gene_UMI
    .merge(mutation_frac_df.query('consensus == True'),
           on=['cell_barcode', 'gene', 'UMI'],
           how='left',
           validate='one_to_many')
    .assign(muts_split=lambda df: df['muts_split'].fillna('WT'))
)

display(UMI_mutations)

Check that every `cell_barcode-gene-UMI` is represented in final `UMI_mutations` dataframe.

In [None]:
# The number of distinct cell_barcode-gene-UMI keys after the merge must equal
# the number of keys we started with.
assert (
    UMI_mutations[['cell_barcode', 'gene', 'UMI']].drop_duplicates().shape[0]
    == cb_gene_UMI.shape[0]
), "Missing cell_barcode-gene-UMI from df"

## Plot outcomes

Plot distribution of total CCS per cell_barcode-gene-UMI:

In [None]:
# Bar-style histogram: how many cell_barcode-gene-UMI have each total CCS count.
total_CCS_histo = (
    p9.ggplot(total_CCS_df, p9.aes(x='total_CCS'))
    + p9.geom_histogram(stat='count')
    + p9.ggtitle('n CCS per cell_barcode-gene-UMI\n'
                 f'{expt}')
    + p9.labs(x='n CCS',
              y='n cell_barcode-gene-UMI')
    + p9.theme(figure_size=(4, 3),
               plot_title=p9.element_text(size=9),
               axis_title=p9.element_text(size=9),
               legend_title=p9.element_text(size=9),
               legend_title_align='center')
)

display(total_CCS_histo)

Plot distribution of CCS per mutation:

In [None]:
# Distribution of the number of CCS supporting each consensus mutation.
# UMIs labelled WT have no supporting-CCS count (NaN `mutation_CCS` from the
# left merge), so they are dropped explicitly here. This makes the data match
# the "(excludes WT CCS)" title instead of relying on the plotting library to
# discard missing values.
mutation_CCS_histo = (
    p9.ggplot(
        (UMI_mutations
         .dropna(subset=['mutation_CCS'])
         [['cell_barcode', 'gene', 'UMI', 'muts_split', 'mutation_CCS']]
         .drop_duplicates()),
         p9.aes(x='mutation_CCS')) +
    p9.geom_histogram(stat='count') +
    p9.ggtitle('n CCS per mutation\n'
               '(excludes WT CCS)\n'
               f'{expt}') +
    p9.labs(x='n CCS',
            y='n cell_barcode-gene-UMI-mutation') +
    p9.theme(figure_size=(4, 3),
             plot_title=p9.element_text(size=9),
             axis_title=p9.element_text(size=9),
             legend_title=p9.element_text(size=9),
             legend_title_align='center'))

display(mutation_CCS_histo)

Plot distribution of the fraction of CCS supporting each consensus mutation:

In [None]:
# Distribution of the CCS fraction supporting each consensus mutation.
# UMIs labelled WT have NaN `frac_CCS` (no consensus mutation was merged in),
# so they are dropped explicitly to match the "(excludes WT CCS)" title.
# The former `scale_color_manual` layer was removed: no colour aesthetic is
# mapped in this plot, so it had no effect.
consensus_mutations_histo = (
    p9.ggplot(
        (UMI_mutations
         .dropna(subset=['frac_CCS'])
         [['cell_barcode', 'gene', 'UMI', 'muts_split', 'frac_CCS']]
         .drop_duplicates()),
         p9.aes(x='frac_CCS')) +
    p9.geom_histogram(bins=20) +
    p9.ggtitle('Consensus mutation fractions\n'
               '(excludes WT CCS)\n'
               f'{expt}') +
    p9.labs(x='fraction of total CCS for UMI') +
    p9.theme(figure_size=(4, 3),
             plot_title=p9.element_text(size=9),
             axis_title=p9.element_text(size=9),
             legend_title=p9.element_text(size=9),
             legend_title_align='center'))

display(consensus_mutations_histo)

## Output

In [None]:
# Final table: one row per cell_barcode-gene-UMI-mutation, with the split
# mutation column exposed under the name `mutation`.
output_df = (
    UMI_mutations
    .loc[:, ['cell_barcode', 'gene', 'UMI', 'muts_split']]
    .rename(columns={'muts_split': 'mutation'})
)

display(output_df)

Double check that every cell_barcode-gene-UMI is represented in final `output_df`:

In [None]:
# The exported table must still contain every cell_barcode-gene-UMI key.
assert (
    output_df[['cell_barcode', 'gene', 'UMI']].drop_duplicates().shape[0]
    == cb_gene_UMI.shape[0]
), "Missing cell_barcode-gene-UMI from df"

Make sure no na values are included in final `output_df`. Everything should either have a consensus mutation or be annotated as `"WT"`

In [None]:
# Every value in every column must be non-null, hence `.all().all()`.
# The previous `.any().any()` passed as soon as a single non-null value
# existed anywhere in the frame, so it could never detect a missing value.
assert output_df.notnull().all().all(), \
    "Found null value in output_df"

In [None]:
# Announce the destination, then write the consensus table as CSV without the
# DataFrame index column.
print(f'Saving UMI consensus mutations to {consensus_UMI_csv}')
output_df.to_csv(consensus_UMI_csv, index=False)