# Align flu pacbio reads

This notebook imports, aligns and filters pacbio amplicons. 

In [1]:
import os
import re
import warnings

from IPython.display import HTML, display

import alignparse.ccs
import alignparse.consensus
import alignparse.minimap2
import alignparse.targets
from alignparse.constants import CBPALETTE

import dms_variants.plotnine_themes
import dms_variants.utils

import pandas as pd

import plotnine as p9

In [2]:
# Suppress every warning for the remainder of the notebook run.
warnings.simplefilter('ignore')

In [3]:
# Make the gray-grid theme from dms_variants the default for all plotnine plots.
p9.theme_set(dms_variants.plotnine_themes.theme_graygrid())

Import snakemake variables

In [5]:
# Input files
amplicons = snakemake.input.amplicons
features = snakemake.input.features
ccs_report = snakemake.input.ccs_report
ccs_fastq = snakemake.input.ccs_fastq
# path of the amplicon-to-reference lookup CSV (read with pandas further down)
amplicon_to_reference_df = snakemake.input.amplicon_to_reference_df

# Output files
plot_amplicons = snakemake.output.plot_amplicons
plot_alignment = snakemake.output.plot_alignment
mutation_df = snakemake.output.mutation_df

# Parameters and resources
runs = snakemake.params.runs
# used below as the output directory for `targets.align_and_parse`
alignment_stats = snakemake.params.alignment_stats
threads = snakemake.threads

## Import target amplicons
Here we import target files for amplicons that were made using primers that either align to the ends of flu sequences ('termini') or align approximately to the middle of a segment ('mid').

In [None]:
targetfile = amplicons

# Preview at most this many lines of the target file.
nlines_to_show = 100
with open(targetfile) as f:
    # `zip` stops at whichever iterable is exhausted first, so a file shorter
    # than `nlines_to_show` is simply printed in full. Calling `next(f)` inside
    # a generator expression would instead raise
    # "RuntimeError: generator raised StopIteration" on short files.
    # `range` is listed first so no extra line is consumed from the file.
    print(''.join(line for _, line in zip(range(nlines_to_show), f)))

Feature file defines features in amplicon genbank file.

In [None]:
feature_parse_specs_file = features

# Read the whole specs file first, then echo it once the handle is closed.
with open(feature_parse_specs_file) as specs_handle:
    specs_text = specs_handle.read()
print(specs_text)

In [None]:
# Build the alignment targets from the amplicon sequence file and the
# feature parse specs shown above.
targets = alignparse.targets.Targets(
    seqsfile=targetfile,
    feature_parse_specs=feature_parse_specs_file,
    ignore_feature_parse_specs_keys=['default_2tags', 'default_1tag'],
    allow_extra_features=True,
    allow_clipped_muts_seqs=True,
)

Now let's plot all segment amplicons and save plots as a figure. 

In [None]:
# Draw the targets defined above; the returned figure is saved in the next cell.
plot = targets.plot(ax_width=10)

In [None]:
# Report the destination, then write the amplicon figure there as SVG.
print(f"Saving figure to {plot_amplicons}")
plot.savefig(plot_amplicons, format="svg")

## Align pacbio reads

Create pacbio run dataframe

In [None]:
# One row per run: its name, its CCS FASTQ file and its CCS report.
run_columns = {'name': runs, 'fastq': ccs_fastq, 'report': ccs_report}
pacbio_runs = pd.DataFrame(run_columns)

# Render as an HTML table without the index column.
runs_table_html = pacbio_runs.to_html(index=False)
display(HTML(runs_table_html))

Now align all ccs to amplicons and filter for the ones that meet criteria defined in `fluCA09_features.yaml`.

Create an `alignparse.minimap2.Mapper` to run `minimap2` with [options](https://jbloomlab.github.io/alignparse/alignparse.minimap2.html#alignparse.minimap2.OPTIONS_VIRUS_W_DEL) that allow for large deletions in viral sequences.

In [None]:
mapper = alignparse.minimap2.Mapper(alignparse.minimap2.OPTIONS_VIRUS_W_DEL)

# Echo the minimap2 version and the exact options the mapper will use.
minimap2_options = ' '.join(mapper.options)
print(f"Using `minimap2` {mapper.version} with these options:\n{minimap2_options}")

In [None]:
# Directory handed to `targets.align_and_parse` as `outdir`.
# `os.path.join` with a single argument returns that path unchanged.
align_and_parse_outdir = os.path.join(alignment_stats)

In [None]:
# Align every run in `pacbio_runs` to the targets and parse the alignments.
# Three objects come back:
#   readstats - per-run read counts by category (plotted below)
#   aligned   - indexed by target name below to get the CSV of aligned reads
#   filtered  - indexed by target name below to get the CSV of filtered reads
readstats, aligned, filtered = targets.align_and_parse(
    df=pacbio_runs,
    mapper=mapper,
    outdir=align_and_parse_outdir,
    name_col='name',  # column of `pacbio_runs` holding the run name
    queryfile_col='fastq',  # column of `pacbio_runs` holding the CCS FASTQ path
    overwrite=True,  # overwrite any existing output
    to_csv=True,
    ncpus=threads,
)

Here are read alignment statistics telling how many reads have been aligned or filtered out for each reference amplicon.

In [None]:
# Preview the first rows of the read statistics.
readstats.head()

In [None]:
# Prepare the plotting frame: keep categories in their order of appearance
# and flag the categories whose name contains 'aligned'.
readstats_for_plot = readstats.assign(
    category=lambda d: pd.Categorical(d['category'],
                                      d['category'].unique(),
                                      ordered=True),
    is_aligned=lambda d: d['category'].str.contains('aligned'),
)

alignment_theme = p9.theme(
    axis_text_x=p9.element_text(angle=90),
    panel_grid_major_x=p9.element_blank(),
)

# One facet per run, bars coloured by whether the category is an aligned one.
p = (
    p9.ggplot(readstats_for_plot,
              p9.aes('category', 'count', fill='is_aligned'))
    + p9.geom_bar(stat='identity')
    + p9.facet_wrap('~ name', nrow=1)
    + alignment_theme
    + p9.scale_fill_manual(values=CBPALETTE)
)
_ = p.draw()

p9.ggsave(plot=p, filename=plot_alignment, verbose=False)

Now let's look at the most common reasons why some reads have been filtered out.

In [None]:
# Read the filtered-read CSV of every target and label its rows with the
# target name in a `gene` column. The frames are collected in a list and
# concatenated once: growing a DataFrame with `pd.concat` inside the loop
# re-copies all previous rows on every iteration.
filtered_frames = []
for target_name in filtered:
    filtered_reads = pd.read_csv(filtered[target_name], na_filter=False)
    filtered_reads['gene'] = target_name
    filtered_frames.append(filtered_reads)

# `pd.concat` rejects an empty list, so fall back to an empty frame.
concatenated_ff_df = (
    pd.concat(filtered_frames, axis=0) if filtered_frames else pd.DataFrame()
)
concatenated_ff_df.head()

In [None]:
filter_reason_theme = p9.theme(
    axis_text_x=p9.element_text(angle=90),
    figure_size=(12, 8),
    panel_grid_major_x=p9.element_blank(),
)

# Count filtered reads per filter reason, one facet per gene.
p = (
    p9.ggplot(concatenated_ff_df, p9.aes('filter_reason'))
    + p9.geom_bar()
    + p9.facet_wrap('~ gene', ncol=5)
    + filter_reason_theme
)
_ = p.draw()

## Convert amplicon sequence positions to wt

Here we import a file that contains a lookup table. We will make a dictionary out of it that tells which position in the reference sequence is equivalent to each position in an amplicon sequence.

In [None]:
# na_filter=False stops pandas from converting blank cells to NaN, so they
# stay as empty strings.
amplicon_to_reference = pd.read_csv(amplicon_to_reference_df, na_filter=False)
amplicon_to_reference.head()

We extract feature names from `amplicon_to_reference` file. The first four columns are `target`, `gene`, `ORF_position`, and `wt_nt` and are not relevant to features, so we will not include them. 

In [None]:
column_names = list(amplicon_to_reference.columns)

# Skip the four leading non-feature columns; every remaining column is a feature.
n_leading_columns = 4
amplicon_features = column_names[n_leading_columns:]
print(amplicon_features)

Here are the aligned files made by `targets.align_and_parse`.

In [None]:
# List the aligned-read CSV recorded for each target.
for target_name in aligned:
    aligned_csv = aligned[target_name]
    print(aligned_csv)

Here we read in each aligned segment csv file that was made by `targets.align_and_parse`. Then for each amplicon feature we make a dictionary from the `amplicon_to_reference` file and use regex to change each nucleotide position in an amplicon to the equivalent position in the reference sequence.

In [None]:
def _amplicon_to_orf_positions(reference, feature):
    """Build a lookup from amplicon positions to reference ORF positions.

    Parameters
    ----------
    reference : pandas.DataFrame
        Rows of the amplicon-to-reference table for a single target. Must
        contain the `feature` column and an ``ORF_position`` column.
    feature : str
        Name of the column holding the amplicon positions of one feature.

    Returns
    -------
    dict
        ``{amplicon position: ORF position}``, keys and values both ``str``.
        Rows whose `feature` cell is blank are left out. The dict is empty
        when the feature has no positions for this target.

    """
    position_map = dict(zip(reference[feature].astype(str),
                            reference['ORF_position'].astype(str)))
    # Blank cells all collapse onto the '' key. Drop it if present; a column
    # without blanks has no such key, so a plain `del` would raise KeyError.
    position_map.pop('', None)
    return position_map


def _renumber_mutations(mutations, position_map):
    """Rewrite amplicon positions inside mutation strings as ORF positions.

    Parameters
    ----------
    mutations : iterable of str
        Mutation strings numbered by amplicon position.
    position_map : dict
        Non-empty ``{amplicon position: ORF position}`` lookup of ``str``.

    Returns
    -------
    list of str
        The mutation strings with every mapped position replaced.

    """
    # The lookarounds make sure only whole numbers are replaced, never a run
    # of digits inside a longer number. The pattern is compiled once rather
    # than once per row.
    pattern = re.compile(
        r'(?<!\d)('
        + '|'.join(re.escape(key) for key in position_map)
        + r')(?!\d)'
    )
    return [pattern.sub(lambda match: position_map[match.group()], mutation)
            for mutation in mutations]


# Collect one renumbered frame per target and concatenate once at the end.
renumbered_alignments = []

for alignment in aligned:
    af = pd.read_csv(aligned[alignment], na_filter=False)
    print('analyzing ' + alignment + ' target using '
          + aligned[alignment] + ' dataframe')

    if af.empty:
        # in case some amplicons have no mutations print warning
        print('no mutations found in ' + alignment)
        continue

    # lookup rows that belong to this target
    reference = amplicon_to_reference.loc[
        amplicon_to_reference['target'] == alignment
    ]

    for ap_feature in amplicon_features:
        position_map = _amplicon_to_orf_positions(reference, ap_feature)
        if not position_map:
            print("no " + ap_feature + " in " + alignment)
            continue

        # attach reference-numbered mutations as a plain column of strings
        af[ap_feature + '_reference'] = _renumber_mutations(
            af[ap_feature + '_mutations'], position_map
        )

    af['target'] = alignment
    renumbered_alignments.append(af)

# `pd.concat` rejects an empty list, so fall back to an empty frame.
concatenated_amplicons = (
    pd.concat(renumbered_alignments, axis=0)
    if renumbered_alignments else pd.DataFrame()
)
concatenated_amplicons.to_csv(mutation_df, index=False)
