# Quantify strand exchange

This notebook examines strand exchange that might happen during PCR or SMRTbell library preparation. We do so by looking at mixing of wildtype and synonymous viral tags in PacBio CCSs.

In [None]:
from alignparse.constants import CBPALETTE

import pandas as pd

import plotnine as p9

import yaml

Import snakemake variables.

In [None]:
# `snakemake` is not defined in this notebook; presumably it is injected by
# Snakemake when the notebook is run as part of a rule -- confirm against the
# workflow.
# Path to the viral tag file (opened and parsed as YAML below).
viral_tags = snakemake.input.viral_tags
# Path to the CCS alignment table (read below as a gzip-compressed CSV).
ccs_alignments = snakemake.input.ccs_alignments
# Output file name the final strand exchange plot is saved to.
plot_strand_exchange = snakemake.output.plot_strand_exchange

Here we load the viral tag YAML file and make a dataframe each for `variant_tag_1` and `variant_tag_2` that holds the identities of the `wt` and `syn` tags.

In [None]:
# Parse the viral tag definitions.
# NOTE(review): `yaml.load` with `FullLoader` can construct more than plain
# data; if this file only holds plain mappings and strings, `yaml.safe_load`
# would be the safer choice -- confirm before switching.
with open(viral_tags) as tags_handle:
    tags = yaml.load(tags_handle, Loader=yaml.FullLoader)

# One row per top-level key of the YAML mapping.
pdtags = pd.DataFrame.from_dict(tags, orient='index')

# Expand each nested per-tag entry into its own columns (`wt` and `syn` are
# the ones looked up later), keeping the same row index as `pdtags`.
variant_tag_1, variant_tag_2 = (
    pdtags[tag_column].apply(pd.Series)
    for tag_column in ('viral_tag_1', 'viral_tag_2')
)

Open CCS alignment file generated in 'align_pacbio' rule.

In [None]:
# Read the gzip-compressed CSV of CCS alignments into a dataframe.
alignments = pd.read_csv(ccs_alignments, compression='gzip')
# Show the first rows as the cell output for a quick sanity check.
alignments.head()

For each row in the alignment file, we look at which gene the row holds and extract the relevant `wt` and `syn` tags from the `variant_tag_1` and `variant_tag_2` dataframes generated above. We then attach those tags to the end of the `alignments` frame.

In [None]:
# Attach the expected `wt` / `syn` tag sequences for each CCS's gene.
#
# The lookups are vectorised with `Series.map` instead of a Python-level loop
# over `iterrows()`. The resulting `tag_df` shares the index of `alignments`,
# so the column-wise concat below cannot misalign rows even if `alignments`
# does not carry a default RangeIndex.

# A per-row `.loc[gene, ...]` lookup raises KeyError for a gene with no tag
# definition; `map` would silently yield NaN, so check explicitly.
known_genes = variant_tag_1.index.intersection(variant_tag_2.index)
unknown_genes = alignments.loc[~alignments['gene'].isin(known_genes),
                               'gene'].unique()
if len(unknown_genes) > 0:
    raise KeyError('genes without viral tag definitions: '
                   + ', '.join(str(gene) for gene in unknown_genes))

genes = alignments['gene']
tag_df = pd.DataFrame({"variant_tag_1_wt": genes.map(variant_tag_1['wt']),
                       "variant_tag_2_wt": genes.map(variant_tag_2['wt']),
                       "variant_tag_1_syn": genes.map(variant_tag_1['syn']),
                       "variant_tag_2_syn": genes.map(variant_tag_2['syn'])},
                      index=alignments.index)

# Drop any copies of these columns left by a previous run of this cell so
# that re-running it does not produce duplicate column names.
alignments = pd.concat(
    [alignments.drop(columns=list(tag_df.columns), errors='ignore'), tag_df],
    axis=1,
    sort=False,
)

Now we will compare tags found in CCSs to the expected `wt` and `syn` tags and generate a list of labels based on the tags found. We will attach this list to the end of `alignments` frame. 

We use the following tag labels:

`wildtype`: CCS has both `wt` tags  
`synonymous tags`: CCS has both `syn` tags  
`mixed`: CCS has a mixture of `wt` and `syn` tags  
`partial wildtype`: CCS has one `wt` tag and the other tag is missing in the read  
`partial synonymous`: CCS has one `syn` tag and the other tag is missing in the read  
`missing tag`: CCS has no detectable tags  
`other`: CCS has tags that match neither the `wt` nor the `syn` tags at any position  
  
Note that for NEP we only examine variant tag 2 because this transcript has only one tag; NEP is therefore not useful for assessing strand exchange.

In [None]:
def _label_ccs_tags(row):
    """Return the tag label for a single row of `alignments`.

    The tags observed in the CCS (`variant_tag_1_sequence`,
    `variant_tag_2_sequence`) are compared with the expected `wt` and `syn`
    tags stored in the same row. Checks run in a fixed order and the first
    match wins.

    Parameters
    ----------
    row : pandas.Series
        One row of `alignments`, including the `transcript` column and the
        observed and expected tag columns.

    Returns
    -------
    str
        One of "wildtype", "synonymous tags", "mixed", "partial wildtype",
        "partial synonymous", "missing tag" or "other".

    """
    tag2 = row['variant_tag_2_sequence']
    tag2_is_wt = tag2 == row['variant_tag_2_wt']
    tag2_is_syn = tag2 == row['variant_tag_2_syn']
    # a missing tag is read in as a null value, which never compares equal
    # to an expected tag, so it needs its own check
    tag2_missing = pd.isnull(tag2)

    # fluNEP rows are labelled from variant tag 2 alone
    if row['transcript'] == 'fluNEP':
        if tag2_is_wt:
            return "wildtype"
        if tag2_is_syn:
            return "synonymous tags"
        if tag2_missing:
            return "missing tag"
        return "other"

    tag1 = row['variant_tag_1_sequence']
    tag1_is_wt = tag1 == row['variant_tag_1_wt']
    tag1_is_syn = tag1 == row['variant_tag_1_syn']
    tag1_missing = pd.isnull(tag1)

    if tag1_is_wt and tag2_is_wt:
        return "wildtype"
    if tag1_is_syn and tag2_is_syn:
        return "synonymous tags"
    if (tag1_is_wt and tag2_is_syn) or (tag1_is_syn and tag2_is_wt):
        return "mixed"
    if (tag1_is_wt and tag2_missing) or (tag2_is_wt and tag1_missing):
        return "partial wildtype"
    if (tag1_is_syn and tag2_missing) or (tag2_is_syn and tag1_missing):
        return "partial synonymous"
    if tag1_missing and tag2_missing:
        return "missing tag"
    # at least one tag is present but matches neither expected sequence
    return "other"


strand_exchange_list = [_label_ccs_tags(row)
                        for _index, row in alignments.iterrows()]

# check that we generated exactly one label
# per row of the `alignments` frame
assert len(strand_exchange_list) == len(alignments.index)

# bind to alignments
alignments['tag_in_css'] = strand_exchange_list

Now we will count how many tags fall in each tag label group.

In [None]:
# Group the CCSs by (tag label, gene). `.count()` reports, for every
# remaining column, how many non-null entries each group has.
counts_by_label_and_gene = alignments.groupby(['tag_in_css', 'gene']).count()

# Order the groups by their `transcript` count, largest first, then turn the
# group keys back into ordinary columns.
tag_in_css_df = (
    counts_by_label_and_gene
    .sort_values(by='transcript', ascending=False)
    .reset_index()
)

And finally we plot how many CCS fall in each tag group.

In [None]:
# Stacked bar chart: one bar per gene, filled by tag label. The bar height is
# read from the `name` column of `tag_in_css_df`.
p = (
    p9.ggplot(tag_in_css_df,
              p9.aes('gene', 'name', fill='tag_in_css'))
    + p9.geom_bar(stat='identity')
    + p9.scale_fill_manual(values=CBPALETTE)
    + p9.xlab('flu gene')
    + p9.ylab('ccs count')
    # single theme layer: rotated, centred x tick labels and no vertical
    # grid lines (previously split over two theme() calls that both set
    # axis_text_x)
    + p9.theme(axis_text_x=p9.element_text(angle=90, hjust=0.5),
               panel_grid_major_x=p9.element_blank())
)
_ = p.draw()

# save through the plot's own method rather than the module-level
# `ggsave` wrapper
p.save(filename=plot_strand_exchange, verbose=False)