# Call pacbio consensus UMI sequences

This notebook creates consensus sequences for individual UMIs and exports a table with the consensus mutations in each UMI.

In [None]:
from Bio.Seq import Seq

import alignparse.ccs
import alignparse.consensus
import alignparse.utils

import numpy as np

import pandas as pd

import plotnine as p9

In [None]:
# Input and output paths are read from the `snakemake` object.
# NOTE(review): `snakemake` is not defined anywhere in this notebook; it is
# presumably injected by Snakemake when the notebook is run from the
# pipeline - confirm before running it interactively.
mutation_df = snakemake.input.mutation_df  # per-CCS mutation table (gzip CSV)
cell_df = snakemake.input.cell_df  # per-cell-barcode annotation table (gzip CSV)
# destination of the consensus table written by the final cell
consensus_UMI_mutations = snakemake.output.consensus_UMI_mutations

In [None]:
# Both inputs are gzip-compressed CSV files and are read with the same options.
csv_options = {'compression': 'gzip', 'low_memory': False}
mutations = pd.read_csv(mutation_df, **csv_options)
cell_barcodes = pd.read_csv(cell_df, **csv_options)

I'm removing mutations in the 3' termini because they are non-protein-coding and are mostly due to misalignment of the poly-d(T) primer to the poly(A) tail.

In [None]:
# Create a column holding all ORF-numbered mutations with the 3' termini
# mutations removed. Both values are cast to `str` and the termini string is
# cut out as a plain substring, so surrounding whitespace is left untouched.
mutations['all_mutations_orf_numbered_noT3'] = [
    str(all_muts).replace(str(termini3_muts), '')
    for all_muts, termini3_muts in zip(
        mutations['all_mutations_orf_numbered'],
        mutations['termini3_mutations_orf_numbered'],
    )
]

In [None]:
# Columns whose name starts with `variant_tag`, plus the `name` column,
# are not used downstream, so drop them in one go.
filter_col = [col for col in mutations.columns
              if col.startswith('variant_tag')]

mutations = mutations.drop(columns=filter_col + ['name'])

In [None]:
# Cell barcode sequences as a plain Python list, one entry per row
col_one_list = mutations.loc[:, 'cellbarcode_sequence'].to_list()

In [None]:
# Reverse-complement every cell barcode. The reverse-complemented sequence
# is what gets matched against `cell_barcode` when merging with the cell
# annotation table.
barcode_list = [
    str(Seq(barcode).reverse_complement())
    for barcode in col_one_list
]

# Assign the list itself (positional, one value per row) instead of wrapping
# it in a DataFrame, which would silently depend on index alignment with
# `mutations`.
mutations['cellbarcode_rv'] = barcode_list

In [None]:
# Left join: keep every CCS row and attach the cell annotation whose
# `cell_barcode` equals the reverse-complemented barcode of the CCS.
df_merged = mutations.merge(
    cell_barcodes,
    how='left',
    left_on='cellbarcode_rv',
    right_on='cell_barcode',
)

We only want to look at CCSs that don't have strand-exchange issues, so we filter out CCSs with chimeric tags.

In [None]:
# Keep only CCSs whose tag status is not 'chimeric tags'.
# `.copy()` makes the result an independent frame: a column is assigned on
# `df_merged` right after this step, which would otherwise be a write into a
# slice of the unfiltered frame (SettingWithCopyWarning).
df_merged = df_merged.loc[df_merged['tag_status'] != 'chimeric tags'].copy()

In [None]:
# Recode the boolean `infected` flag as 'Yes'/'No' strings; per the original
# note, flake8 complains when the raw booleans are used for filtering later.
infected_labels = {True: 'Yes', False: 'No'}
df_merged['infected'] = df_merged['infected'].replace(infected_labels)

## CCSs in infected and uninfected cells

In [None]:
print('How many CCSs are called in infected vs uninfected cells:')
# Number of CCS rows per infection status, displayed as the cell output
ccs_by_infection = df_merged.groupby(['infected'])
ccs_by_infection.agg({'infected': 'count'})

### CCSs in uninfected cells

In [None]:
# CCSs from cells labelled as uninfected
is_uninfected = df_merged['infected'] == 'No'
uninfected = df_merged.loc[is_uninfected]

# Number of CCSs per cell barcode, stored in a single `count` column
uninfected_barcode_count = (
    uninfected
    .groupby(['cell_barcode'])
    .agg({'cell_barcode': 'count'})
    .rename(columns={'cell_barcode': 'count'})
)

In [None]:
# Histogram of the number of CCSs per cell barcode in uninfected cells.
p = (
    p9.ggplot(uninfected_barcode_count,
              p9.aes(x='count'))
    + p9.geom_histogram(bins=80)
    # The complete theme must come first: adding `theme_bw()` after
    # `theme(...)` replaces the theme and discards the customisations below.
    + p9.theme_bw()
    + p9.theme(axis_text_x=p9.element_text(angle=90),
               panel_grid_major_x=p9.element_blank()
               )
    # x is the per-barcode CCS count, y is how many barcodes have that count
    + p9.xlab("CCSs per cell barcode")
    + p9.ylab("number of cell barcodes")
)
_ = p.draw()
print('In total we find', len(uninfected_barcode_count.index),
      'unique barcodes that are counted',
      'as uninfected based on illumina sequencing ')
print('This is how CCSs are distributed in uninfected cells:')
print('This is how CCSs are distributed in uninfected cells:')

### CCSs in infected cells

In [None]:
# CCSs from cells labelled as infected.
# `.copy()` gives an independent frame: new columns are added to `infected`
# further down, which would otherwise be writes into a slice of `df_merged`
# (SettingWithCopyWarning).
infected = df_merged.loc[df_merged['infected'] == 'Yes'].copy()

# Number of CCSs per cell barcode, stored in a single `count` column
infected_barcode_count = (
    infected
    .groupby(['cell_barcode'])
    .agg({'cell_barcode': 'count'})
    .rename(columns={'cell_barcode': 'count'})
)

In [None]:
# Histogram of the number of CCSs per cell barcode in infected cells.
p = (
    p9.ggplot(infected_barcode_count,
              p9.aes(x='count'))
    + p9.geom_histogram(bins=100)
    # The complete theme must come first: adding `theme_bw()` after
    # `theme(...)` replaces the theme and discards the customisations below.
    + p9.theme_bw()
    + p9.theme(axis_text_x=p9.element_text(angle=90),
               panel_grid_major_x=p9.element_blank()
               )
    # x is the per-barcode CCS count, y is how many barcodes have that count
    + p9.xlab("CCSs per cell barcode")
    + p9.ylab("number of cell barcodes")
)
_ = p.draw()

print('In total we find', len(infected_barcode_count.index),
      'unique infected barcodes')
print('This is how CCSs are distributed per cell barcode:')

## UMI counts

Here we count unique UMIs. We group by cell barcode + gene + UMI to identify unique UMIs.

In [None]:
# Number of CCSs for every unique (UMI, cell barcode, gene) combination
umi_key_cols = ['UMI_sequence', 'cell_barcode', 'gene']
UMI_count = (
    df_merged
    .groupby(umi_key_cols)
    .agg({'UMI_sequence': 'count'})
    .rename(columns={'UMI_sequence': 'count'})
)
UMI_count.head()

In [None]:
# Histogram of the number of CCSs supporting each UMI.
p = (
    p9.ggplot(UMI_count,
              p9.aes(x='count'))
    + p9.geom_histogram(bins=80)
    # The complete theme must come first: adding `theme_bw()` after
    # `theme(...)` replaces the theme and discards the customisations below.
    + p9.theme_bw()
    + p9.theme(axis_text_x=p9.element_text(angle=90),
               panel_grid_major_x=p9.element_blank()
               )
    + p9.xlab("CCSs per UMI")
    + p9.ylab("number of UMIs")
)
_ = p.draw()

print('max number of CCSs per UMI is', UMI_count['count'].max())

In [None]:
# Tabulate how many UMIs are supported by 1, 2, 3, ... CCSs
ccs_per_umi = UMI_count.rename(columns={'count': 'no_of_CCSs'})
UMI_count_table = (
    ccs_per_umi
    .groupby(['no_of_CCSs'])
    .agg({'no_of_CCSs': 'count'})
)
print('Here\'s how many CCSs each UMI has:')
# Relabel the count column for display only; `UMI_count_table` is unchanged
UMI_count_table.rename(columns={'no_of_CCSs': 'no of UMIs'})

# Call consensus per UMI

Here we'll call the per-UMI consensus only in infected cells. We also remove any mutations that are duplicated within the same UMI (this is because we used overlapping primers for the linearization PCR).

In [None]:
# Remove duplicate mutation strings: a token that is immediately repeated
# (separated only by whitespace) is collapsed to a single occurrence.
# `regex=True` is required: the pattern uses a back-reference, and since
# pandas 2.0 `Series.str.replace` treats the pattern as a literal string by
# default, which would silently leave every duplicate in place.
all_mutations_dupRM = (
    infected['all_mutations_orf_numbered_noT3']
    .str.replace(r'\b(\w+)(\s+\1)+\b', r'\1', regex=True)
)

# `.assign` returns a new frame, so this never writes into a slice of
# another DataFrame.
infected = infected.assign(
    all_mutations_orf_numbered_noT3_dupRM=all_mutations_dupRM
)

Sometimes consecutive deletions are called as separate mutations by alignparse, so we apply the `merge_dels` function to the `all_mutations_orf_numbered_noT3_dupRM` column to merge consecutive deletions.

In [None]:
# Replace missing values (NaN) in every column of `infected` with ''.
infected = infected.replace(np.nan, '', regex=True)
# Replace any literal 'nan' text left in the mutation strings with '[]'.
# NOTE(review): the 'nan' text presumably comes from the earlier `str()`
# conversion of missing mutation values - confirm that '[]' is the
# placeholder expected by the downstream alignparse calls.
infected['all_mutations_orf_numbered_noT3_dupRM'] = (
    infected['all_mutations_orf_numbered_noT3_dupRM']
    .str.replace('nan', '[]')
)

# Apply `alignparse.utils.merge_dels` to each mutation string; the result is
# stored in a new column so the un-merged strings stay available.
infected['all_mutations_delsMerge'] = (
    infected['all_mutations_orf_numbered_noT3_dupRM']
    .apply(alignparse.utils.merge_dels)
)

Call the consensus on the mutations in the `all_mutations_delsMerge` column, which has mutations in the 3' termini removed, duplicated mutations removed, and consecutive deletions merged.

In [None]:
# Call one consensus per group of CCSs sharing the same cell barcode,
# transcript and UMI. Two frames come back: `consensus` (exported at the end
# of the notebook) and `dropped` (summarised in the next cell).
# NOTE(review): the four `max_*` thresholds control when a group is dropped
# instead of getting a consensus - see the alignparse documentation of
# `simple_mutconsensus` for their exact meaning; the values are not
# justified anywhere in this notebook.
# NOTE(review): groups are keyed on `transcript` here while the UMI counts
# earlier in the notebook are keyed on `gene` - confirm this is intended.
consensus, dropped = alignparse.consensus.simple_mutconsensus(
    infected,
    group_cols=['cellbarcode_sequence', 'transcript', 'UMI_sequence'],
    mutation_col='all_mutations_delsMerge',
    max_sub_diffs=5,
    max_indel_diffs=10,
    max_minor_sub_frac=0.3,
    max_minor_indel_frac=0.3,
    support_col='variant_call_support'
)

In [None]:
# Summarise what was lost during consensus calling: the number of dropped
# UMI groups and the total number of CCSs (`nseqs`) behind them.
# The message previously misspelled the abbreviation as 'CSSs'.
print('We dropped', len(dropped.index),
      'UMIs, which were supported by',
      dropped['nseqs'].sum(), 'CCSs')

In [None]:
# Reverse complement the cell barcode (overwrites `cellbarcode_sequence`)
consensus['cellbarcode_sequence'] = [
    str(Seq(barcode).reverse_complement())
    for barcode in consensus['cellbarcode_sequence']
]

Finally, we export the consensus table.

In [None]:
# Write the consensus table as a gzip-compressed CSV without the row index
consensus.to_csv(
    consensus_UMI_mutations,
    compression='gzip',
    index=False,
)