# Pacbio consensus gene
This notebook calls consensus mutations on PacBio data grouped by cell_barcode and gene. Then, it exports a processed CSV with the following columns:
* cell_barcode
* gene
* consensus_mutations

In [None]:
from IPython.display import display

from dms_variants.constants import CBPALETTE

import pandas as pd

import plotnine as p9

In [None]:
# `snakemake` is not defined in this notebook; it is expected to be provided
# by the workflow runner that executes it.
# Experiment wildcard, used to label the plot titles.
expt = snakemake.wildcards.expt
# Input CSV that is loaded into the `mutations` dataframe.
consensus_UMI_csv = snakemake.input.consensus_UMI_csv
# Output CSV that receives the cell_barcode-gene consensus table.
consensus_gene_csv = snakemake.output.consensus_gene_csv

Style parameters:

In [None]:
# Make plotnine's classic theme the default for the plots drawn below.
p9.theme_set(p9.theme_classic())

## Load Data

In [None]:
# Load the input CSV and give rows whose `mutation` value is missing the
# literal label 'None', so they remain their own category in the groupbys
# below (pandas drops missing group keys by default).
mutations = (
    pd.read_csv(consensus_UMI_csv)
    .assign(mutation=lambda df: df['mutation'].fillna('None'))
)
display(mutations)

## Process Data

Count total UMIs for cell_barcode-gene and count number of UMIs supporting each mutation for cell_barcode-gene:

In [None]:
# Number of distinct UMIs carrying each mutation within a cell_barcode-gene.
# The Series is named before `reset_index` so the count column comes out as
# `mutation_UMIs` directly.
mutation_UMIs_df = (
    mutations
    .groupby(['cell_barcode', 'gene', 'mutation'])['UMI']
    .nunique()
    .rename('mutation_UMIs')
    .reset_index()
)
# Number of distinct UMIs observed overall for each cell_barcode-gene.
total_UMIs_df = (
    mutations
    .groupby(['cell_barcode', 'gene'])['UMI']
    .nunique()
    .rename('total_UMIs')
    .reset_index()
)
display(mutation_UMIs_df)
display(total_UMIs_df)

Merge into mutations df:

In [None]:
# Attach both count tables to every row of `mutations`. Left joins keep all
# original rows; `validate='many_to_one'` raises if a count table has
# duplicate keys, which would otherwise silently multiply rows.
mutations = (
    mutations
    .merge(
        mutation_UMIs_df,
        how='left',
        on=['cell_barcode', 'gene', 'mutation'],
        validate='many_to_one',
    )
    .merge(
        total_UMIs_df,
        how='left',
        on=['cell_barcode', 'gene'],
        validate='many_to_one',
    )
)
display(mutations)

Calculate fraction of total UMIs with mutation:

In [None]:
# Share of a cell_barcode-gene's UMIs that carry this row's mutation.
mutations['frac_UMIs'] = mutations['mutation_UMIs'].div(mutations['total_UMIs'])
display(mutations)

## Call consensus
Label consensus mutations if found in >50% of UMIs. This corresponds to `frac_UMIs > 0.5` in our dataframe.

In [None]:
# A mutation is consensus only when strictly more than half of the
# cell_barcode-gene's UMIs support it; exactly 50% does not qualify.
mutations['consensus'] = mutations['frac_UMIs'].gt(0.5)
display(mutations)

## Plot outcomes

Plot distribution of total UMIs per cell_barcode-gene:

In [None]:
# Collapse to one row per distinct (cell_barcode, gene, total_UMIs) so each
# cell_barcode-gene contributes a single value to the histogram.
total_UMIs_per_pair = (
    mutations
    .loc[:, ['cell_barcode', 'gene', 'total_UMIs']]
    .drop_duplicates()
)

total_UMIs_histo = (
    p9.ggplot(total_UMIs_per_pair, p9.aes(x='total_UMIs'))
    + p9.geom_histogram(bins=20)
    + p9.ggtitle('n UMIs per cell_barcode-gene\n'
                 f'{expt}')
    + p9.labs(x='n UMIs', y='n cell_barcode-gene')
    + p9.theme(
        figure_size=(4, 3),
        plot_title=p9.element_text(size=9),
        axis_title=p9.element_text(size=9),
        legend_title=p9.element_text(size=9),
        legend_title_align='center',
    )
)

display(total_UMIs_histo)

Plot distribution of UMIs per mutation:

In [None]:
# Collapse to one row per distinct cell_barcode-gene-mutation so each
# mutation call contributes a single mutation_UMIs value to the histogram.
mutation_UMIs_per_call = (
    mutations
    .loc[:, ['cell_barcode', 'gene', 'mutation', 'mutation_UMIs']]
    .drop_duplicates()
)

mutation_UMIs_histo = (
    p9.ggplot(mutation_UMIs_per_call, p9.aes(x='mutation_UMIs'))
    + p9.geom_histogram(bins=20)
    + p9.ggtitle('n UMIs per mutation\n'
                 f'{expt}')
    + p9.labs(x='n UMIs', y='n cell_barcode-gene-mutation')
    + p9.theme(
        figure_size=(4, 3),
        plot_title=p9.element_text(size=9),
        axis_title=p9.element_text(size=9),
        legend_title=p9.element_text(size=9),
        legend_title_align='center',
    )
)

display(mutation_UMIs_histo)

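Plot the number of UMIs supporting each mutation against the total UMIs for its cell_barcode-gene (the dashed line marks the 50% consensus threshold):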

In [None]:
# Rows labelled 'None' carry no mutation, so they are left out of the
# scatter; the rest is collapsed to one point per distinct mutation call.
scatter_columns = [
    'cell_barcode', 'gene', 'mutation',
    'mutation_UMIs', 'total_UMIs', 'consensus',
]
mutation_calls = (
    mutations
    .query('mutation != "None"')
    .loc[:, scatter_columns]
    .drop_duplicates()
)

consensus_mutations_scatter = (
    p9.ggplot(
        mutation_calls,
        p9.aes(x='total_UMIs', y='mutation_UMIs', color='consensus'),
    )
    + p9.geom_point(alpha=0.2)
    # Dashed line y = 0.5 * x: the frac_UMIs = 0.5 consensus threshold.
    + p9.geom_abline(
        intercept=0,
        slope=0.5,
        linetype='dashed',
        color=CBPALETTE[2],
    )
    + p9.ggtitle('Consensus mutations \n'
                 f'{expt}')
    + p9.theme(
        figure_size=(4, 3),
        plot_title=p9.element_text(size=9),
        axis_title=p9.element_text(size=9),
        legend_title=p9.element_text(size=9),
        legend_title_align='center',
    )
    + p9.scale_color_manual([CBPALETTE[1], CBPALETTE[0]])
)

display(consensus_mutations_scatter)

## Integrate mutations into genotype

In [None]:
# Keep only consensus calls that are actual mutations ('None' is the label
# given to rows without a mutation).
consensus_calls = mutations.query('consensus == True and mutation != "None"')

# Gather the unique consensus mutations of each cell_barcode-gene into one
# array; `unique()` keeps them in order of first appearance.
genotypes = (
    consensus_calls
    .groupby(['cell_barcode', 'gene'])['mutation']
    .unique()
    .rename('consensus_mutations')
    .reset_index()
)

# Store each array as its str() text so the genotype is a plain string.
# NOTE(review): str() of a NumPy array follows NumPy's print options, so
# long arrays may be line-wrapped or summarized, and the text depends on row
# order -- confirm downstream readers handle that.
genotypes = genotypes.assign(
    consensus_mutations=genotypes['consensus_mutations'].apply(str))

display(genotypes)

Add the genotypes back into the mutations df:

In [None]:
# Left-join the per-pair genotype onto every row; `validate` guards against
# duplicate (cell_barcode, gene) keys in `genotypes`.
mutations = mutations.merge(
    genotypes,
    how='left',
    on=['cell_barcode', 'gene'],
    validate='many_to_one',
)
# Pairs absent from `genotypes` had no consensus mutation; label them 'None'.
mutations = mutations.assign(
    consensus_mutations=mutations['consensus_mutations'].fillna('None'))
display(mutations)

Plot the number of cell_barcode-gene pairs with or without a consensus mutation identified:

In [None]:
# Collapse to one row per distinct (cell_barcode, gene, consensus_mutations)
# so each cell_barcode-gene is counted once in the bar chart.
genotype_per_pair = (
    mutations
    .loc[:, ['cell_barcode', 'gene', 'consensus_mutations']]
    .drop_duplicates()
)

# Bars count cell_barcode-gene pairs per gene. The fill separates pairs with
# no consensus mutation (`consensus_mutations == "None"` is True) from pairs
# with at least one.
consensus_mutations_per_gene = (
    p9.ggplot(
        genotype_per_pair,
        p9.aes(x='gene',
               fill='factor(consensus_mutations == "None")'),
    )
    + p9.geom_bar(stat='count')
    # Title typo fixed: "gentotypes" -> "genotypes".
    + p9.ggtitle('WT and mutant genotypes\n'
                 'in infected cells\n'
                 f'{expt}')
    + p9.labs(y='n cell_barcode-gene')
    + p9.theme(
        figure_size=(4, 3),
        plot_title=p9.element_text(size=9),
        axis_title=p9.element_text(size=9),
        legend_title=p9.element_text(size=9),
        legend_title_align='center',
    )
    + p9.scale_fill_manual([CBPALETTE[1], CBPALETTE[0]])
)

display(consensus_mutations_per_gene)

Export results

In [None]:
# The exported table has one row per distinct
# (cell_barcode, gene, consensus_mutations) combination.
export_columns = ['cell_barcode', 'gene', 'consensus_mutations']
output_df = mutations.loc[:, export_columns].drop_duplicates()
display(output_df)

# Write without the dataframe index so the CSV holds only the three columns.
print(f'Saving UMI consensus mutations to {consensus_gene_csv}')
output_df.to_csv(consensus_gene_csv, index=False)