# Process pacbio mutations

This notebook groups PacBio data by cell_barcode, gene, and UMI and plots the number of CCSs per cell_barcode-gene-UMI. It then deduplicates mutations in the same CCS and combines adjacent indels. Finally, it exports a processed CSV with the following columns:
* cell_barcode
* gene
* UMI
* mutations
* query_name (PacBio CCS ID)

In [None]:
from Bio.Seq import Seq

import alignparse.ccs
import alignparse.consensus
import alignparse.utils

import numpy as np

import pandas as pd

import plotnine as p9

In [None]:
# `snakemake` is not defined anywhere in this notebook; presumably it is
# injected by Snakemake when the notebook runs as a rule (TODO confirm).
# Despite its name, `mutation_df` holds a path that is later passed to
# pd.read_csv, not a loaded DataFrame.
mutation_df, cell_annotations_csv = (
    snakemake.input.mutation_df,
    snakemake.input.cell_annotations,
)
# Destination path for the processed CSV written at the end of the notebook.
CCS_mutations_csv = snakemake.output.CCS_mutations_csv

In [None]:
# Load both gzip-compressed CSV inputs. `low_memory=False` makes pandas read
# each file in one pass instead of inferring dtypes chunk by chunk.
mutations = pd.read_csv(
    mutation_df,
    low_memory=False,
    compression='gzip',
)
cell_barcodes = pd.read_csv(
    cell_annotations_csv,
    low_memory=False,
    compression='gzip',
)

In [None]:
# Collect every column whose label starts with 'variant_tag' ...
filter_col = list(
    filter(lambda label: label.startswith('variant_tag'), mutations.columns)
)

# ... and drop them together with the `name` column in a single pass.
mutations = mutations.drop(columns=[*filter_col, 'name'])

In [None]:
# Plain Python list of the cell-barcode sequences, in row order.
col_one_list = mutations.loc[:, 'cellbarcode_sequence'].to_list()

In [None]:
# Reverse-complement every cell-barcode sequence. The result is stored in
# `cellbarcode_rv`, which a later cell uses as the key for joining against
# `cell_barcode` in the cell annotations.
barcode_list = [
    str(Seq(barcode).reverse_complement())
    for barcode in col_one_list
]

# Assign the list directly: list assignment is positional, whereas wrapping it
# in a temporary DataFrame would be aligned on index labels and would silently
# yield NaN for any row of `mutations` that does not carry a default
# 0..n-1 index.
mutations['cellbarcode_rv'] = barcode_list

In [None]:
# Attach the per-cell annotations to every CCS row. A left join keeps CCSs
# whose reverse-complemented barcode has no match in `cell_barcodes`; their
# annotation columns are filled with NaN.
df_merged = mutations.merge(
    cell_barcodes,
    left_on='cellbarcode_rv',
    right_on='cell_barcode',
    how='left',
)

We only want to look at CCSs that don't have strand-exchange issues, so we filter out CCSs with chimeric tags.

In [None]:
# Keep every row whose tag_status is anything other than 'chimeric tags'
# (rows with a missing tag_status are kept as well).
df_merged = df_merged[df_merged['tag_status'].ne('chimeric tags')]

## CCSs in infected and uninfected cells

In [None]:
print('How many CCSs are called in infected vs uninfected cells:')
# Number of CCS rows per value of `infected`. Rows with a missing `infected`
# value are not counted because groupby drops NaN keys by default.
df_merged.groupby('infected').agg(infected=('infected', 'count'))

### CCSs in uninfected cells

In [None]:
# CCS rows whose cell is annotated as uninfected.
uninfected = df_merged[df_merged['infected'].eq('uninfected')]

# One row per cell barcode; `count` is the number of CCS rows carrying it.
uninfected_barcode_count = (
    uninfected
    .groupby('cell_barcode')
    .agg({'cell_barcode': 'count'})
    .rename(columns={'cell_barcode': 'count'})
)

In [None]:
# Histogram of the number of CCSs per uninfected cell barcode.
#
# theme_bw() is a complete theme, so it has to be added BEFORE the
# element-level theme(...) tweaks: adding a complete theme afterwards replaces
# the plot's theme wholesale and would discard the rotated x tick labels and
# the blanked vertical grid lines.
p = (
    p9.ggplot(uninfected_barcode_count,
              p9.aes(x='count'))
    + p9.geom_histogram(bins=80)
    + p9.theme_bw()
    + p9.theme(axis_text_x=p9.element_text(angle=90),
               panel_grid_major_x=p9.element_blank()
               )
    + p9.xlab("CCS count")
    + p9.ylab("n cell barcodes")
)
_ = p.draw()
print('In total we find', len(uninfected_barcode_count.index),
      'unique barcodes that are counted',
      'as uninfected based on illumina sequencing ')
print('This is how CCSs are distributed in uninfected cells:')

### CCSs in ambiguous cells

In [None]:
# CCS rows whose cell has an ambiguous infection call.
is_ambiguous = df_merged['infected'] == 'ambiguous'
ambiguous = df_merged[is_ambiguous]

# One row per cell barcode; `count` is the number of CCS rows carrying it.
ambiguous_barcode_count = (
    ambiguous
    .groupby('cell_barcode')
    .agg({'cell_barcode': 'count'})
    .rename(columns={'cell_barcode': 'count'})
)

In [None]:
# Histogram of the number of CCSs per ambiguous cell barcode.
#
# theme_bw() is a complete theme, so it has to be added BEFORE the
# element-level theme(...) tweaks: adding a complete theme afterwards replaces
# the plot's theme wholesale and would discard the rotated x tick labels and
# the blanked vertical grid lines.
p = (
    p9.ggplot(ambiguous_barcode_count,
              p9.aes(x='count'))
    + p9.geom_histogram(bins=80)
    + p9.theme_bw()
    + p9.theme(axis_text_x=p9.element_text(angle=90),
               panel_grid_major_x=p9.element_blank()
               )
    + p9.xlab("CCS count")
    + p9.ylab("n cell barcodes")
)
_ = p.draw()
print('In total we find', len(ambiguous_barcode_count.index),
      'unique barcodes that are counted',
      'as ambiguous based on illumina sequencing ')
print('This is how CCSs are distributed in ambiguous cells:')

### CCSs in infected cells

In [None]:
# CCS rows whose cell is annotated as infected; this subset is also the one
# that gets processed and exported further down.
infected = df_merged[df_merged['infected'].eq('infected')]

# One row per cell barcode; `count` is the number of CCS rows carrying it.
infected_barcode_count = (
    infected
    .groupby('cell_barcode')
    .agg({'cell_barcode': 'count'})
    .rename(columns={'cell_barcode': 'count'})
)

In [None]:
# Histogram of the number of CCSs per infected cell barcode.
#
# theme_bw() is a complete theme, so it has to be added BEFORE the
# element-level theme(...) tweaks: adding a complete theme afterwards replaces
# the plot's theme wholesale and would discard the rotated x tick labels and
# the blanked vertical grid lines.
p = (
    p9.ggplot(infected_barcode_count,
              p9.aes(x='count'))
    + p9.geom_histogram(bins=100)
    + p9.theme_bw()
    + p9.theme(axis_text_x=p9.element_text(angle=90),
               panel_grid_major_x=p9.element_blank()
               )
    + p9.xlab("CCS count")
    + p9.ylab("n cell barcodes")
)
_ = p.draw()

print('In total we find', len(infected_barcode_count.index),
      'unique infected barcodes')
print('This is how CCSs are distributed per cell barcode:')

## UMI counts

Here we count the number of CCSs belonging to each cell_barcode-gene-UMI. This represents an individual transcript in a cell, and more CCSs provide stronger support for the sequence of that transcript.

In [None]:
# Number of CCS rows for each (cell_barcode, gene, UMI_sequence) combination,
# flattened back into ordinary columns with the tally stored in `count`.
# Rows with a missing value in any of the three keys are left out, because
# groupby drops NaN keys by default.
UMI_count = (
    df_merged
    .groupby(['cell_barcode', 'gene', 'UMI_sequence'])
    .agg({'UMI_sequence': 'count'})
    .rename(columns={'UMI_sequence': 'count'})
    .reset_index()
)
UMI_count.head()

In [None]:
# Histogram of the number of CCSs supporting each cell_barcode-gene-UMI.
#
# theme_bw() is a complete theme, so it has to be added BEFORE the
# element-level theme(...) tweaks: adding a complete theme afterwards replaces
# the plot's theme wholesale and would discard the rotated x tick labels and
# the blanked vertical grid lines.
p = (
    p9.ggplot(UMI_count,
              p9.aes(x='count'))
    + p9.geom_histogram(bins=80)
    + p9.theme_bw()
    + p9.theme(axis_text_x=p9.element_text(angle=90),
               panel_grid_major_x=p9.element_blank()
               )
    + p9.xlab("CCSs per cell_barcode-gene-UMI")
    + p9.ylab("number of UMIs")
)
_ = p.draw()

print('max number of CCSs per UMI is', UMI_count['count'].max())

In [None]:
# Tabulate how many cell_barcode-gene-UMI combinations are supported by
# exactly 1, 2, 3, ... CCSs. The index is the CCS count; the single column
# holds the number of combinations with that count.
UMI_count_table = (
    UMI_count
    .rename(columns={'count': 'no_of_CCSs'})
    .groupby('no_of_CCSs')
    .agg(no_of_CCSs=('no_of_CCSs', 'count'))
)
print('Here\'s how many CCSs each cell_barcode-gene-UMI has:')
# Relabel the column for display only; `UMI_count_table` itself is unchanged.
UMI_count_table.rename(columns={'no_of_CCSs': 'no of UMIs'})

## ONLY OUTPUT Data from infected cells

# Combine adjacent indels
Sometimes consecutive deletions are called as separate mutations by alignparse, so we apply the `alignparse.utils.merge_dels` function to the `all_mutations_orf_numbered` column to merge consecutive deletions into a single deletion. The result is stored in the `all_mutations_delsMerge` column.

In [None]:
infected = (
    infected
    # Blank out missing values in every column so the string operations below
    # never see NaN.
    .replace(np.nan, '', regex=True)
    # NOTE(review): NaNs have already been turned into '' by the step above,
    # so this substitution can only hit a literal "nan" substring inside a
    # mutation string -- confirm whether it is still needed.
    .assign(
        all_mutations_orf_numbered=lambda frame: (
            frame['all_mutations_orf_numbered'].str.replace('nan', '[]')
        ),
    )
    # Run alignparse's merge_dels on each row's mutation string and keep the
    # result in a new column, leaving the un-merged column in place.
    .assign(
        all_mutations_delsMerge=lambda frame: (
            frame['all_mutations_orf_numbered']
            .apply(alignparse.utils.merge_dels)
        ),
    )
)

# Organize output and save

In [None]:
# Keep only the exported columns, drop exact duplicate rows, then switch to
# the column names used in the output file.
output_df = (
    infected
    .loc[:, ['cell_barcode',
             'gene',
             'UMI_sequence',
             'all_mutations_delsMerge',
             'query_name']]
    .drop_duplicates()
    .rename(columns={'UMI_sequence': 'UMI',
                     'all_mutations_delsMerge': 'mutations'})
)
display(output_df)

In [None]:
# Write the processed table as a gzip-compressed CSV without the row index.
output_df.to_csv(
    CCS_mutations_csv,
    compression='gzip',
    index=False,
)