# Align flu pacbio reads

This notebook imports, aligns, and filters PacBio amplicons.

In [None]:
import os
import warnings

from IPython.display import HTML, display

import alignparse.ccs
import alignparse.consensus
import alignparse.minimap2
import alignparse.targets
from alignparse.constants import CBPALETTE

import dms_variants.plotnine_themes
import dms_variants.utils

import pandas as pd

import plotnine as p9

In [None]:
# Silence every warning category for the rest of the notebook session.
warnings.simplefilter(action='ignore')

In [None]:
# Use the gray-grid theme from dms_variants as the default for all plotnine plots.
graygrid_theme = dms_variants.plotnine_themes.theme_graygrid()
p9.theme_set(graygrid_theme)

Import snakemake variables

In [None]:
# Input files handed over by the `snakemake` object.
amplicons = snakemake.input.amplicons
features = snakemake.input.features
ccs_report = snakemake.input.ccs_report
ccs_fastq = snakemake.input.ccs_fastq

# Output paths for the figures written by this notebook.
plot_amplicons = snakemake.output.plot_amplicons
plot_alignment = snakemake.output.plot_alignment

# Additional parameters.
runs = snakemake.params.runs
alignment_stats = snakemake.params.alignment_stats

## Import target amplicons
Here we import target files for amplicons that were made using primers that either align to the ends of flu sequences ('termini') or align approximately to the middle of a segment ('mid').

In [None]:
targetfile = amplicons

# Preview at most this many lines of the target file.
nlines_to_show = 100
with open(targetfile) as f:
    # Pair lines with `range` so the preview stops cleanly at end-of-file.
    # Calling `next(f)` inside a generator expression would raise
    # `RuntimeError: generator raised StopIteration` (PEP 479) whenever the
    # file has fewer than `nlines_to_show` lines.
    preview_lines = [line for _, line in zip(range(nlines_to_show), f)]
print(''.join(preview_lines))

The feature file defines the features in the amplicon GenBank file. After splicing, NEP mRNA ends up containing only a single variant tag, whereas the other segments have two tags. I don't think the M segment primers capture M2 very well, because part of the primers used to linearize M2 reads overlaps the 5' splice site and so does not align fully. I therefore did not include an M2 amplicon reference.

In [None]:
feature_parse_specs_file = features

# Read the whole feature-spec file, then echo it into the notebook output.
with open(feature_parse_specs_file) as specs_handle:
    feature_specs_text = specs_handle.read()
print(feature_specs_text)

In [None]:
# Keys of the feature-spec file that `Targets` is told to ignore
# (presumably shared defaults rather than real targets -- confirm against
# the feature file printed above).
ignored_spec_keys = ['default_2tags', 'default_1tag']

# Build the alignment targets from the amplicon sequences and feature specs.
targets = alignparse.targets.Targets(
    seqsfile=targetfile,
    feature_parse_specs=feature_parse_specs_file,
    ignore_feature_parse_specs_keys=ignored_spec_keys,
    allow_extra_features=True,
    allow_clipped_muts_seqs=True,
)

Now let's plot all segment amplicons and save plots as a figure. 

In [None]:
# Draw the targets held in `targets`; the result is kept in `plot` so it can
# be saved to disk in the next cell.
plot = targets.plot(ax_width=10)

In [None]:
# Announce the destination, then write the amplicon figure as SVG.
save_message = f"Saving figure to {plot_amplicons}"
print(save_message)
plot.savefig(plot_amplicons, format="svg")

## Align pacbio reads

Create the PacBio run dataframe.

In [None]:
# One column per run attribute: run name, CCS FASTQ file, and CCS report.
run_columns = {'name': runs, 'fastq': ccs_fastq, 'report': ccs_report}
pacbio_runs = pd.DataFrame(run_columns)

# Render the table as HTML without the index column.
runs_table_html = pacbio_runs.to_html(index=False)
display(HTML(runs_table_html))

Now align all ccs to amplicons and filter for the ones that meet criteria defined in `fluCA09_features.yaml`.

Create an `alignparse.minimap2.Mapper` to run `minimap2` with [options](https://jbloomlab.github.io/alignparse/alignparse.minimap2.html#alignparse.minimap2.OPTIONS_VIRUS_W_DEL) that allow for large deletions in viral sequences.

In [None]:
# Configure the mapper with the predefined OPTIONS_VIRUS_W_DEL option set.
minimap2_options = alignparse.minimap2.OPTIONS_VIRUS_W_DEL
mapper = alignparse.minimap2.Mapper(minimap2_options)

# Report the minimap2 version and the options actually in use.
options_text = ' '.join(mapper.options)
print(f"Using `minimap2` {mapper.version} with these options:\n"
      + options_text)

In [None]:
# Output directory passed to `targets.align_and_parse` below. `os.path.join`
# with a single argument adds no path components, so this is just the
# `alignment_stats` value taken from the snakemake params.
align_and_parse_outdir = os.path.join(alignment_stats)

In [None]:
# Settings for the align-and-parse step, collected in one place.
align_and_parse_kwargs = dict(
    df=pacbio_runs,
    mapper=mapper,
    outdir=align_and_parse_outdir,
    name_col='name',
    queryfile_col='fastq',
    overwrite=True,  # overwrite any existing output
    ncpus=-1,  # use all available CPUs
)

# Align every run to the targets and parse the alignments.
readstats, aligned, filtered = targets.align_and_parse(**align_and_parse_kwargs)

Here are read alignment statistics telling how many reads have been aligned or filtered out for each reference amplicon.

In [None]:
# Show the `readstats` table returned by `targets.align_and_parse`.
readstats

In [None]:
# Prepare the plotting frame: keep categories in their order of appearance
# and flag the rows whose category name contains 'aligned'.
readstats_for_plot = readstats.assign(
    category=lambda x: pd.Categorical(x['category'],
                                      x['category'].unique(),
                                      ordered=True),
    is_aligned=lambda x: x['category'].str.contains('aligned'),
)

# Bar chart of read counts per category, one facet per run name.
readstats_plot = (
    p9.ggplot(readstats_for_plot,
              p9.aes('category', 'count', fill='is_aligned'))
    + p9.geom_bar(stat='identity')
    + p9.facet_wrap('~ name', nrow=1)
    + p9.theme(axis_text_x=p9.element_text(angle=90),
               panel_grid_major_x=p9.element_blank())
    + p9.scale_fill_manual(values=CBPALETTE)
)
_ = readstats_plot.draw()

# Write the figure to the output path supplied by snakemake.
readstats_plot.save(filename=plot_alignment, verbose=False)

Now let's look at the most common reasons why some reads have been filtered out.

In [None]:
# Only preview the first target to keep the output short.
targets_to_preview = targets.target_names[:1]
for target in targets_to_preview:
    print(f"First few lines of `filtered` for {target}:")
    filtered_head = filtered[target].head()
    display(filtered_head)

In [None]:
# Stack the per-target `filtered` frames, labelling each row with its gene.
filtered_all_genes = pd.concat(
    [reads.assign(gene=gene_name) for gene_name, reads in filtered.items()]
)

# Keep genes in their order of appearance so the facets follow that order.
filtered_all_genes = filtered_all_genes.assign(
    gene=lambda x: pd.Categorical(x['gene'],
                                  x['gene'].unique(),
                                  ordered=True)
)

# Count of filtered reads per filter reason, one facet per gene.
p = (
    p9.ggplot(filtered_all_genes, p9.aes('filter_reason'))
    + p9.geom_bar()
    + p9.facet_wrap('~ gene', ncol=5)
    + p9.theme(axis_text_x=p9.element_text(angle=90),
               figure_size=(12, 8),
               panel_grid_major_x=p9.element_blank())
)
_ = p.draw()