# Align and call mutations in PacBio sequencing of viral genes.

This Python Jupyter notebook aligns the PacBio sequencing of the viral genes, calls mutations, and then puts them in numbering based on the viral ORF.

In [None]:
import os
import warnings

from IPython.display import HTML, display

import alignparse.ccs
import alignparse.consensus
import alignparse.minimap2
import alignparse.targets
import alignparse.utils
from alignparse.constants import CBPALETTE

import dms_variants.plotnine_themes
import dms_variants.utils

import pandas as pd

import plotnine as p9

In [None]:
# Silence all warnings for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — confirm that is intended.
warnings.simplefilter('ignore')

In [None]:
# Make the gray-grid theme from `dms_variants` the default plotnine theme.
p9.theme_set(dms_variants.plotnine_themes.theme_graygrid())

Import snakemake variables

In [None]:
# --- inputs ---
amplicons = snakemake.input.amplicons
features = snakemake.input.features
ccs_report = snakemake.input.ccs_report
ccs_fastq = snakemake.input.ccs_fastq
amplicon_to_reference_df = snakemake.input.amplicon_to_reference_df

# --- params ---
runs = snakemake.params.runs
alignment_stats = snakemake.params.alignment_stats

# --- outputs ---
plot_amplicons = snakemake.output.plot_amplicons
plot_alignment = snakemake.output.plot_alignment
mutation_df = snakemake.output.mutation_df

# --- resources ---
threads = snakemake.threads

## Import target amplicons
Here we import target files for amplicons that were made using primers that either aligned to the ends of flu sequences ('termini') or aligned approximately to the middle of a gene ('mid').

In [None]:
targetfile = amplicons

# Preview at most this many lines of the target file.
nlines_to_show = 100
with open(targetfile) as f:
    # `zip` stops as soon as either the counter or the file is exhausted, so
    # a file shorter than `nlines_to_show` is printed in full. Calling
    # `next(f)` inside a generator expression would instead raise
    # `RuntimeError` once the file ran out of lines.
    print(''.join(line for _, line in zip(range(nlines_to_show), f)))

Feature file defines features in amplicon genbank file.

In [None]:
# Print the full contents of the feature parse specs file for inspection.
feature_parse_specs_file = features
with open(feature_parse_specs_file) as specs_handle:
    specs_text = specs_handle.read()
print(specs_text)

In [None]:
# Build the alignment targets from the amplicon sequence file and the
# feature parse specs, skipping the two `default_*` keys in the specs.
targets = alignparse.targets.Targets(
    seqsfile=targetfile,
    feature_parse_specs=feature_parse_specs_file,
    ignore_feature_parse_specs_keys=['default_2tags', 'default_1tag'],
    allow_extra_features=True,
    allow_clipped_muts_seqs=True,
)

Now let's plot all transcript amplicons and save plots as a figure. 

In [None]:
# Plot the targets; the returned object is written to file in the next cell.
plot = targets.plot(ax_width=10)

In [None]:
# Write the amplicon plot to the snakemake output path in SVG format.
print(f"Saving figure to {plot_amplicons}")
plot.savefig(plot_amplicons, format="svg")

## Align pacbio reads

Create pacbio run dataframe

In [None]:
# One row per PacBio run: run name, CCS FASTQ file, and CCS report file.
pacbio_runs = pd.DataFrame({
    'name': runs,
    'fastq': ccs_fastq,
    'report': ccs_report,
})
display(HTML(pacbio_runs.to_html(index=False)))

Now align all ccs to amplicons and filter for the ones that meet criteria defined in `fluCA09_features.yaml`.

Create an `alignparse.minimap2.Mapper` to run `minimap2` with [options](https://jbloomlab.github.io/alignparse/alignparse.minimap2.html#alignparse.minimap2.OPTIONS_VIRUS_W_DEL) that allow for large deletions in viral sequences.

In [None]:
# Mapper configured with the option set for viral sequences with deletions.
mapper = alignparse.minimap2.Mapper(alignparse.minimap2.OPTIONS_VIRUS_W_DEL)

# Report the minimap2 version and the exact options in use.
print(
    f"Using `minimap2` {mapper.version} with these options:\n{' '.join(mapper.options)}"
)

In [None]:
# Output directory passed as `outdir` to `targets.align_and_parse` below.
# NOTE(review): `os.path.join` with a single argument just returns that path;
# presumably a leftover from joining several components — confirm.
align_and_parse_outdir = os.path.join(alignment_stats)

In [None]:
# Align the CCSs of every run in `pacbio_runs` to the targets and parse them.
# `readstats` is displayed and plotted below; `aligned` and `filtered` are
# used in later cells as mappings from a key to the path of a CSV file.
readstats, aligned, filtered = targets.align_and_parse(
    df=pacbio_runs,
    mapper=mapper,
    outdir=align_and_parse_outdir,
    name_col='name',  # column of `pacbio_runs` holding the run name
    queryfile_col='fastq',  # column of `pacbio_runs` holding the CCS FASTQ
    overwrite=True,  # overwrite any existing output
    to_csv=True,
    ncpus=threads,
)

Here are read alignment statistics telling how many reads have been aligned or filtered out for each reference amplicon.

In [None]:
# Show the first rows of the read statistics table.
readstats.head()

In [None]:
# Prepare the read stats for plotting: keep categories in their order of
# appearance on the x-axis, and flag categories whose name contains 'aligned'
# so that they get a different fill color.
readstats_plot_df = readstats.assign(
    category=lambda x: pd.Categorical(
        x['category'],
        x['category'].unique(),
        ordered=True,
    ),
    is_aligned=lambda x: x['category'].str.contains('aligned'),
)

# Bar chart of read counts per category, one facet per run name.
p = (
    p9.ggplot(readstats_plot_df,
              p9.aes('category', 'count', fill='is_aligned'))
    + p9.geom_bar(stat='identity')
    + p9.facet_wrap('~ name', nrow=1)
    + p9.theme(
        axis_text_x=p9.element_text(angle=90),
        panel_grid_major_x=p9.element_blank(),
    )
    + p9.scale_fill_manual(values=CBPALETTE)
)
_ = p.draw()

# Save the figure to the snakemake output path.
p9.ggsave(plot=p, filename=plot_alignment, verbose=False)

Now let's look at the most common reasons why some reads have been filtered out.

In [None]:
# Read the filtered-read CSV for each key of `filtered` and record that key
# in a `gene` column. The frames are collected in a list and concatenated
# once: calling `pd.concat` inside the loop re-copies all earlier rows on
# every iteration and starts from an empty frame, which newer pandas warns
# about.
filtered_frames = [
    pd.read_csv(filtered[filtered_file], na_filter=False)
    .assign(gene=filtered_file)
    for filtered_file in filtered
]
concatenated_ff_df = (
    pd.concat(filtered_frames, axis=0)
    if filtered_frames
    else pd.DataFrame()  # nothing was filtered: keep the empty-frame result
)
concatenated_ff_df.head()

In [None]:
# Count of filtered reads per filter reason, one facet per `gene` value.
filter_reason_theme = p9.theme(
    axis_text_x=p9.element_text(angle=90),
    figure_size=(12, 8),
    panel_grid_major_x=p9.element_blank(),
)
p = (
    p9.ggplot(concatenated_ff_df, p9.aes('filter_reason'))
    + p9.geom_bar()
    + p9.facet_wrap('~ gene', ncol=5)
    + filter_reason_theme
)
_ = p.draw()

## Get all mutations in a common ORF numbering system
The above code has called mutations in key features.
Now we need to actually convert these into the ORF-based numbering scheme that we care about.

First, we read a CSV file that maps the numbering of sites in each feature to the ORF numbering:

In [None]:
# Load the table that maps site numbering in each feature to ORF positions.
# Later cells require its first five columns to be `target`, `transcript`,
# `gene`, `wt_nt`, and `ORF_position`, followed by one column per feature.
amplicon_to_reference = pd.read_csv(amplicon_to_reference_df)
amplicon_to_reference.head()

We extract the names of the features of interest from `amplicon_to_reference`.
The first five columns are `target`, `transcript`, `gene`, `wt_nt`, and `ORF_position`; they are not features, so we exclude them.

In [None]:
column_names = amplicon_to_reference.columns.tolist()

# The leading five columns are identifiers, not features; check that they
# are present and in the expected order before slicing them off.
expected_id_columns = ['target', 'transcript', 'gene', 'wt_nt', 'ORF_position']
assert column_names[:5] == expected_id_columns, amplicon_to_reference

# Every remaining column names a feature.
amplicon_features = column_names[5:]
print(amplicon_features)

Here are the aligned files made by `targets.align_and_parse`.

In [None]:
# List the path of each aligned-reads CSV, one per line.
for aligned_csv in aligned.values():
    print(aligned_csv)

Here we read in each aligned transcript CSV file that was made by `targets.align_and_parse`.
Then we re-number the mutations from the feature-level numbering in that file to an ORF-specific numbering scheme.
Next, we combine all of the mutations from each feature into a single column (`all_mutations_orf_numbered`) that gives all mutations in that CCS, and also add the names of the flu genes and transcripts.
Then we write the resulting data frame of mutations to a CSV.

In [None]:
# Per-target data frames of renumbered alignments, concatenated once below.
# `DataFrame.append` was removed in pandas 2.0 (and copied all accumulated
# rows on every call), so frames are collected in a list instead.
orf_numbered_by_target = []

for target, alignments_csv in aligned.items():

    alignments = pd.read_csv(alignments_csv, na_filter=False)

    print(f"Analyzing {len(alignments)} aligned CCSs for {target}")

    # rows of the numbering table for this target; same for every feature
    target_mapping = amplicon_to_reference.query('target == @target')

    for feature in amplicon_features:
        # keep only sites that have a number for this feature, and store
        # that number as an integer in `feature_num`
        number_mapping = (
                target_mapping
                .query(f"{feature}.notnull()", engine='python')
                .assign(feature_num=lambda x: x[feature].astype(int))
                )

        # converts mutation strings from feature numbering to ORF numbering
        renumberer = alignparse.utils.MutationRenumber(
                                      number_mapping=number_mapping,
                                      old_num_col='feature_num',
                                      new_num_col='ORF_position',
                                      wt_nt_col='wt_nt',
                                      err_suffix=f" for {feature} in {target}",
                                      )

        alignments[f"{feature}_mutations_orf_numbered"] = (
                    alignments[f"{feature}_mutations"]
                    .map(renumberer.renumber_muts)
                    )

    alignments['target'] = target
    orf_numbered_by_target.append(alignments)

orf_numbered_alignments = (
    # the merge below returns a fresh index, so the per-file indices that
    # `ignore_index=True` discards never reach the output
    pd.concat(orf_numbered_by_target, ignore_index=True)
    # aggregate mutations across all features to get all mutations in ORF
    .assign(all_mutations_orf_numbered=lambda x: x.apply(
                    lambda row: alignparse.utils.sort_mutations([
                            row[f"{feature}_mutations_orf_numbered"]
                            for feature in amplicon_features
                            ]),
                    axis=1,
                    )
            )
    # merge in the viral transcript and gene names
    .merge(amplicon_to_reference[['target', 'gene', 'transcript']].drop_duplicates(),
           how='left', on='target')
    )
# every target must have matched a row of `amplicon_to_reference`
assert orf_numbered_alignments['gene'].notnull().all()

print(f"\nWriting ORF-numbered alignments to {mutation_df}")
orf_numbered_alignments.to_csv(mutation_df, index=False, compression='gzip',)