# PacBio consensus UMI
This notebook calls consensus mutations on PacBio data grouped by cell_barcode, gene, and UMI. Then, it exports a processed CSV with the following columns:
* cell_barcode
* gene
* UMI
* mutations

In [1]:
from IPython.display import display

import pandas as pd

import plotnine as p9

In [2]:
# Input table of per-read mutation calls. Hard-coded here; the corresponding
# Snakemake value is snakemake.input.UMI_mutations_csv.
UMI_mutations_csv = "UMI_mutations.csv.gz"

# Path for the processed consensus CSV. Hard-coded here; the corresponding
# Snakemake value is snakemake.output.consensus_UMI_csv.
consensus_UMI_csv = "consensus_UMI.csv.gz"

Read in mutations data:

In [3]:
# Load the mutation table from the configured input path, then render it so
# the raw columns can be inspected before any reshaping.
UMI_mutations = pd.read_csv(filepath_or_buffer=UMI_mutations_csv)
display(UMI_mutations)

Unnamed: 0,cell_barcode,gene,UMI,mutations,query_name
0,TATTTCGGTGCCTAAT,fluHA,TGGGCGTGAATA,,m64272e_210730_193026/322/ccs
1,TATTTCGGTGCCTAAT,fluHA,ACACGTCCTTGA,,m64272e_210730_193026/620/ccs
2,TGATTTCCATTGCAAC,fluHA,GACGATCTTGGA,T578C,m64272e_210730_193026/1250/ccs
3,GTCGTTCTCGTTTACT,fluHA,ATTCTAGGGGTC,C63T G1907A ins1907CA G1909A,m64272e_210730_193026/2656/ccs
4,GTCATTTGTCTACACA,fluHA,AATAGAAAGATG,ins495CAA del1014to1014,m64272e_210730_193026/2675/ccs
...,...,...,...,...,...
1179473,AACCAACAGGGTCTTT,fluPB2,ATACTAACTCGA,C812T C1636T,m64272e_210730_193026/180553635/ccs
1179474,CGATGGCCAAAGAGTT,fluPB2,CTAACCAGGGGG,,m64272e_210730_193026/180553782/ccs
1179475,GTAGAAACACCTAAAC,fluPB2,ATGGAGCCGTGA,A556G del558to2280,m64272e_210730_193026/180553898/ccs
1179476,GCCATTCAGGTGGCTA,fluPB2,GGATTGATCTTT,del1015to2280,m64272e_210730_193026/180554062/ccs


Split each mutation into its own row:

In [4]:
# Turn the space-delimited `mutations` string into a list per row, then explode
# so that every individual mutation gets its own row in `muts_split`.
# NOTE(review): every value is passed through str() before splitting, so a
# missing `mutations` entry (presumably NaN after read_csv) becomes the single
# token 'nan' rather than being dropped -- confirm later cells expect this.
UMI_mutations = (
    UMI_mutations
    .assign(
        muts_split=UMI_mutations['mutations'].map(
            lambda mutation_field: str(mutation_field).split(' ')
        )
    )
    .explode('muts_split')
)

display(UMI_mutations)

Unnamed: 0,cell_barcode,gene,UMI,mutations,query_name,muts_split
0,TATTTCGGTGCCTAAT,fluHA,TGGGCGTGAATA,,m64272e_210730_193026/322/ccs,
1,TATTTCGGTGCCTAAT,fluHA,ACACGTCCTTGA,,m64272e_210730_193026/620/ccs,
2,TGATTTCCATTGCAAC,fluHA,GACGATCTTGGA,T578C,m64272e_210730_193026/1250/ccs,T578C
3,GTCGTTCTCGTTTACT,fluHA,ATTCTAGGGGTC,C63T G1907A ins1907CA G1909A,m64272e_210730_193026/2656/ccs,C63T
3,GTCGTTCTCGTTTACT,fluHA,ATTCTAGGGGTC,C63T G1907A ins1907CA G1909A,m64272e_210730_193026/2656/ccs,G1907A
...,...,...,...,...,...,...
1179475,GTAGAAACACCTAAAC,fluPB2,ATGGAGCCGTGA,A556G del558to2280,m64272e_210730_193026/180553898/ccs,A556G
1179475,GTAGAAACACCTAAAC,fluPB2,ATGGAGCCGTGA,A556G del558to2280,m64272e_210730_193026/180553898/ccs,del558to2280
1179476,GCCATTCAGGTGGCTA,fluPB2,GGATTGATCTTT,del1015to2280,m64272e_210730_193026/180554062/ccs,del1015to2280
1179477,GTCATTTGTCTACACA,fluPB2,TAAGAACCGAAT,del2221to2280 ins2221TCTAGCATACTTACTGACAGCCAGA...,m64272e_210730_193026/180554193/ccs,del2221to2280


Count the total number of CCSs for each cell_barcode-gene-UMI combination, and the number of CCSs supporting each mutation within each cell_barcode-gene-UMI combination:

In [7]:
# Number of distinct reads (unique `query_name` values) for each
# cell_barcode / gene / UMI group. Counting unique names keeps the total
# unaffected by a read appearing on several rows after the explode step.
total_CCS = (
    UMI_mutations
    .groupby(['cell_barcode', 'gene', 'UMI'])
    ['query_name']
    .nunique()
    .reset_index(name='total_CCS')
)
display(total_CCS)

Unnamed: 0,cell_barcode,gene,UMI,total_CCS
0,AAACGAAGTACTTCCC,fluM,AAAAAGACGGAA,1
1,AAACGAAGTACTTCCC,fluM,AAAAATACTACG,1
2,AAACGAAGTACTTCCC,fluM,AAAAATTAATGC,1
3,AAACGAAGTACTTCCC,fluM,AAAACAAATATA,1
4,AAACGAAGTACTTCCC,fluM,AAAAGTCGTGGT,1
...,...,...,...,...
549150,TTTGGAGGTATCGTTG,fluPB2,CTATCCGTTCTT,7
549151,TTTGGAGGTATCGTTG,fluPB2,GTGATTCATGAA,1
549152,TTTGGAGGTATCGTTG,fluPB2,TAATCAACAAGG,6
549153,TTTGGAGGTATCGTTG,fluPB2,TAGTTTACACGT,2


In [8]:
# Number of distinct reads (unique `query_name` values) carrying each
# individual mutation within a cell_barcode / gene / UMI group.
mutation_CCS = (
    UMI_mutations
    .groupby(['cell_barcode', 'gene', 'UMI', 'muts_split'])
    ['query_name']
    .nunique()
    .reset_index(name='mutation_CCS')
)
display(mutation_CCS)

Unnamed: 0,cell_barcode,gene,UMI,muts_split,mutation_CCS
0,AAACGAAGTACTTCCC,fluM,AAAAAGACGGAA,,1
1,AAACGAAGTACTTCCC,fluM,AAAAATACTACG,A623G,1
2,AAACGAAGTACTTCCC,fluM,AAAAATTAATGC,,1
3,AAACGAAGTACTTCCC,fluM,AAAACAAATATA,T451C,1
4,AAACGAAGTACTTCCC,fluM,AAAACAAATATA,del969to980,1
...,...,...,...,...,...
1276793,TTTGGAGGTATCGTTG,fluPB2,TATGATCTAGGG,C1865T,1
1276794,TTTGGAGGTATCGTTG,fluPB2,TATGATCTAGGG,del1126to1755,1
1276795,TTTGGAGGTATCGTTG,fluPB2,TATGATCTAGGG,del17to17,1
1276796,TTTGGAGGTATCGTTG,fluPB2,TATGATCTAGGG,ins504C,1
