## CCS stats for PacBio runs

This notebook summarizes the CCS statistics for the PacBio runs from the hashing experiments.

In [None]:
import re

from IPython.display import HTML, display

import alignparse
import alignparse.ccs

import pandas as pd

import plotnine as p9

Get snakemake variables.

In [None]:
# `snakemake` is not defined in this notebook; it is expected to be injected
# by Snakemake when the notebook is run as part of the workflow.

# Inputs: CCS FASTQ files and the matching CCS report files.
ccs_fastq = snakemake.input.ccs_fastq
ccs_report = snakemake.input.ccs_report

# Output: path the summary plot is saved to at the end of the notebook.
summary = snakemake.output.summary

# Run names, the experiment wildcard, and the number of CPUs to use.
runs = snakemake.params.runs
expt = snakemake.wildcards.expt
threads = snakemake.threads

Create a data frame of the PacBio runs.

In [None]:
# Strip the leading "<expt>_" from each run name to get a short display name.
# `re.escape` makes the experiment name match literally, so regex
# metacharacters in it (e.g. "." or "+") cannot change what is stripped.
expt_prefix = re.compile(f"^{re.escape(expt)}_")

# One row per run: short name, CCS FASTQ file, and CCS report file.
run_df = pd.DataFrame({'name': [expt_prefix.sub('', run) for run in runs],
                       'fastq': ccs_fastq,
                       'report': ccs_report})

# Render the table as HTML without the index column.
display(HTML(run_df.to_html(index=False)))

In [None]:
# Build the CCS summaries for every run in `run_df`, using the number of
# CPUs that Snakemake assigned to this job.
ccs_summaries = alignparse.ccs.Summaries(
    run_df,
    ncpus=threads,
)

Plot statistics for each ccs run.

In [None]:
# ZMW statistics plot, with vertical major grid lines removed and titled with
# the experiment name. `plot` is kept because it is saved at the end of the
# notebook.
plot = (
    ccs_summaries.plot_zmw_stats()
    + p9.theme(panel_grid_major_x=p9.element_blank())
    + p9.ggtitle(expt)
)

# Assign the result to `_` so only the rendered figure is shown, not a repr.
_ = plot.draw()

Plot statistics on the generated CCSs: their length, accuracy (as reported by the `ccs` program), and number of passes, for each statistic that is available:

In [None]:
# Draw one plot per CCS statistic, skipping (with a message) any statistic
# that the summaries do not contain.
for variable in ('length', 'accuracy', 'passes'):
    if not ccs_summaries.has_stat(variable):
        print(f"No {variable} statistics available.")
        continue

    # Same styling as the ZMW plot: no vertical major grid lines, titled
    # with the experiment name.
    stat_plot = (
        ccs_summaries.plot_ccs_stats(variable, maxcol=7, bins=25, panelsize=2.5)
        + p9.theme(panel_grid_major_x=p9.element_blank())
        + p9.ggtitle(expt)
    )
    _ = stat_plot.draw()

Finally, save the ZMW statistics plot to the summary output file.

In [None]:
# Write the ZMW statistics figure (`plot`) to the path given by the Snakemake
# `summary` output; `verbose=False` is passed through unchanged.
plot.save(filename=summary, verbose=False)