# Quantify strand exchange

This notebook examines strand exchange that might happen during PCR or SMRTbell library preparation. We do so by looking at the mixing of wildtype and synonymous viral tags in PacBio CCSs.

In [1]:
from alignparse.constants import CBPALETTE

import pandas as pd

import plotnine as p9

import yaml

Import snakemake variables.

In [2]:
# Input paths for this notebook.
#
# When the notebook is executed by Snakemake, a `snakemake` object is injected
# into the namespace and its declared inputs are used. When it is run
# interactively (no `snakemake` object, or the rule does not declare these
# inputs) the repository-relative development paths below are used instead,
# which is what this cell previously hard-coded unconditionally.
#
# NOTE(review): the output path is still not bound; restore the line below
# once a cell actually writes the strand-exchange plot.
# plot_strand_exchange = snakemake.output.plot_strand_exchange
try:
    viral_tags = snakemake.input.viral_tags
    ccs_alignments = snakemake.input.ccs_alignments
except (NameError, AttributeError):
    viral_tags = '../data/flu_sequences/flu-CA09_viral_tags.yaml'
    ccs_alignments = '../results/pacbio/hashing_highMOI_mutations_by_ccs.csv.gz'

Here we load the viral tag YAML file and the alignment file, and add the identities of the expected `wt` and `syn` tags as new columns at the end of the alignments DataFrame.

In [3]:
# Parse the viral-tag definitions from the YAML file at `viral_tags`.
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's locale default.
# NOTE(review): `yaml.FullLoader` can construct more than plain scalars,
# lists and mappings; if this file only holds plain data, `yaml.safe_load`
# would be the narrower choice — confirm before switching.
with open(viral_tags, encoding='utf-8') as file:
    tags = yaml.load(file, Loader=yaml.FullLoader)

In [4]:
# Read the gzip-compressed CSV at `ccs_alignments` into a DataFrame.
alignments = pd.read_csv(
    ccs_alignments,
    compression='gzip',
)

In [5]:
# Attach to every row the `tags` entry of the gene named in its `gene` column.
# Assuming `tags` is a plain dict, genes that are not keys of `tags` map to NaN.
alignments['variant_tag'] = alignments['gene'].map(tags)

# Expand each per-row entry into one column per key of that entry.
# NOTE(review): `apply(pd.Series)` builds one Series per row, which can be slow
# on large tables; expanding once per unique gene may be worth considering.
viral_tags_df = alignments['variant_tag'].apply(pd.Series)

# Keep only the tag columns. A row whose entry is a bare NaN expands to a
# one-element Series labelled 0, which adds an integer column label; the
# `isinstance` guard keeps `.startswith` from raising AttributeError on it.
filter_col = [
    col for col in viral_tags_df
    if isinstance(col, str) and col.startswith('viral_tag')
]

for col in filter_col:
    # `reindex` always yields exactly the `syn` and `wt` columns, in that
    # order, NaN-filled if a key never occurs. `filter(items=...)` drops a
    # missing label instead, which makes the two-column assignment fail on a
    # width mismatch. When both labels exist the result is the same.
    alignments[[col + '_syn', col + '_wt']] = (
        viral_tags_df[col].apply(pd.Series).reindex(columns=['syn', 'wt'])
    )
alignments.head()

Unnamed: 0,name,query_name,query_clip5,query_clip3,termini5_mutations,termini5_accuracy,termini3_mutations,termini3_accuracy,sequenced_ORF_1_mutations,sequenced_ORF_1_accuracy,...,viral_tag_4_syn,viral_tag_4_wt,viral_tag_5_syn,viral_tag_5_wt,viral_tag_6_syn,viral_tag_6_wt,viral_tag_7_syn,viral_tag_7_wt,viral_tag_8_syn,viral_tag_8_wt
0,hashing_highMOI_2020-10-20_all_segments_run1,m54228_201020_194205/4194462/ccs,0,0,,1.0,ins209GAAAA,1.0,ins422A,0.999817,...,,,,,,,,,,
1,hashing_highMOI_2020-10-20_all_segments_run1,m54228_201020_194205/4194571/ccs,0,0,,1.0,del1to208,,del559to870,1.0,...,,,,,,,,,,
2,hashing_highMOI_2020-10-20_all_segments_run1,m54228_201020_194205/4194630/ccs,0,0,,1.0,ins209AAAAA,0.999409,T345C,0.999998,...,,,,,,,,,,
3,hashing_highMOI_2020-10-20_all_segments_run1,m54228_201020_194205/4194734/ccs,0,0,,1.0,del1to208,,T365C ins411CAACCACCTGGAAAAAAAAAAAAAAAAAAAAAAA...,0.99921,...,,,,,,,,,,
4,hashing_highMOI_2020-10-20_all_segments_run1,m54228_201020_194205/4194868/ccs,0,1,,1.0,del1to46 ins207ACA G208A,1.0,ins411CAACCACCTGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...,0.999988,...,,,,,,,,,,
