# Analyze BAM pileups

Import Python modules:

In [None]:
import altair as alt

import pandas as pd

# Turn off Altair's max-rows check so charts can embed the full pileup
# table; the return value is assigned to `_` only to keep it from being
# displayed as the cell output.
_ = alt.data_transformers.disable_max_rows()

Get key variables from `Snakemake`:

In [None]:
# --- input files -----------------------------------------------------------
pileup_files = snakemake.input.pileups

# --- output files ----------------------------------------------------------
pileup_csv = snakemake.output.pileup_csv
diffs_from_ref_file = snakemake.output.diffs_from_ref
frac_coverage_file = snakemake.output.frac_coverage
chart_file = snakemake.output.chart

# --- parameters ------------------------------------------------------------
# Thresholds used when calling reference / consensus differences.
consensus_min_coverage = snakemake.params.consensus_min_coverage
consensus_min_frac = snakemake.params.consensus_min_frac
# Depth cutoffs at which the fraction of covered sites is reported.
report_frac_coverage = snakemake.params.report_frac_coverage
# One dict of descriptor values per pileup file, plus the chart title.
descriptors = snakemake.params.descriptors
chart_title = snakemake.params.chart_title

Read the data:

In [None]:
# There must be exactly one descriptor dict per pileup file (and at least
# one file), and every descriptor must define the same set of keys so the
# combined table has a consistent set of descriptor columns.
assert len(pileup_files) == len(descriptors) >= 1
assert all(set(descriptors[0]) == set(descriptor)
           for descriptor in descriptors)

# Read each file, tag its rows with that file's descriptor values, and stack
# everything in a single `pd.concat` call.  `DataFrame.append` was deprecated
# in pandas 1.4 and removed in 2.0, and appending inside a loop re-copies the
# accumulated frame on every iteration.
pileup = pd.concat(
    [pd.read_csv(pileup_file).assign(**descriptor)
     for pileup_file, descriptor in zip(pileup_files, descriptors)],
    ignore_index=True,
    )

nts = ['A', 'C', 'G', 'T']
req_cols = ['reference', 'site'] + nts
descriptor_cols = list(descriptors[0])
# Descriptor names must not clash with the columns read from the files, and
# all required + descriptor columns must be present after concatenation.
assert not set(req_cols).intersection(set(descriptor_cols))
assert set(req_cols + descriptor_cols).issubset(set(pileup.columns))

# No fully duplicated rows.
assert len(pileup) == len(pileup.drop_duplicates())
# The number of distinct sites must equal the largest site number
# (presumably sites are numbered 1..N with no gaps -- confirm).
assert pileup['site'].nunique() == pileup['site'].max()

print(f"Writing pileup to {pileup_csv}")
pileup.to_csv(pileup_csv, index=False)

pileup

Identify differences between reference and consensus:

In [None]:
# Per-site summary: total depth over the four nucleotides, the most common
# nucleotide (the consensus), and the fraction of the depth supporting it.
nt_counts = pileup[nts]
site_calls = pileup.assign(
    depth=nt_counts.sum(axis=1),
    consensus=nt_counts.idxmax(axis=1),
    )
site_calls['consensus_frac'] = nt_counts.max(axis=1) / site_calls['depth']

# Keep only sites that are sufficiently covered, whose consensus differs from
# the reference, and whose consensus is supported by a large enough fraction.
is_diff = (
    (site_calls['depth'] >= consensus_min_coverage)
    & (site_calls['reference'] != site_calls['consensus'])
    & (site_calls['consensus_frac'] > consensus_min_frac)
    )
diffs_from_ref = site_calls[is_diff]

print(f"Writing reference / consensus differences to {diffs_from_ref_file}")
diffs_from_ref.to_csv(diffs_from_ref_file, index=False)

diffs_from_ref

Get fraction of sites above coverage cutoffs:

In [None]:
# NOTE(review): this checks that no cutoff value coincides with a column
# name in `pileup`; kept as-is from the original -- confirm it is still needed.
assert not set(report_frac_coverage).intersection(pileup.columns)

# Columns that identify one pileup (one group of rows).  The same list is
# used both for grouping and for the output columns, so every descriptor
# (not just 'aligner') is kept and rows in the result are unambiguous.
group_cols = list(descriptors[0])

# Per-site sequencing depth summed over the four nucleotide count columns.
frac_coverage = (
    pileup
    .assign(depth=lambda x: x[nts].sum(axis=1))
    )

# For each depth cutoff, count per group how many distinct sites exist and
# how many rows have depth at or above the cutoff.
frac_coverage_list = []
for cutoff in report_frac_coverage:
    frac_coverage_list.append(
        frac_coverage
        .groupby(group_cols)
        .aggregate(
            total_sites=pd.NamedAgg('site', 'nunique'),
            # The lambda is evaluated inside this iteration, so it sees the
            # current value of `cutoff`.
            sites_above_cutoff=pd.NamedAgg('depth', lambda s: (s >= cutoff).sum()),
            )
        .reset_index()
        .assign(depth_cutoff=cutoff)
        )

frac_coverage = (
    pd.concat(frac_coverage_list)
    .assign(frac_above_cutoff=lambda x: x['sites_above_cutoff'] / x['total_sites'])
    [group_cols + ['depth_cutoff', 'frac_above_cutoff',
                   'sites_above_cutoff', 'total_sites']]
    .reset_index(drop=True)
    )

print(f"Saving to {frac_coverage_file}")
frac_coverage.to_csv(frac_coverage_file, index=False)

frac_coverage

Make interactive pileup plot (this plot will be large in terms of file size):

In [None]:
# Shared pixel width of the zoom bar and the pileup chart.
width = 800

# Name given to the selection built for each descriptor column.
descriptor_names = {'aligner': 'read',
                    'sample': 'viral'}

# One single-value dropdown selection per descriptor column, initialised to
# the first value of that column seen in `pileup`.
selections = {}
for descriptor in descriptor_cols:
    if descriptor in descriptor_names:
        options = pileup[descriptor].unique()
        selections[descriptor] = alt.selection_single(
                name=descriptor_names[descriptor],
                fields=[descriptor],
                bind=alt.binding_select(options=options),
                init={descriptor: options[0]}
                )
    else:
        raise ValueError(f"no name for descriptor {descriptor}")

# Interval selection along the x-axis only, drawn with a black outline; it
# is used below to filter the pileup chart to the brushed range of sites.
zoom_brush = alt.selection_interval(
                encodings=['x'],
                mark=alt.BrushConfig(stroke='black', strokeWidth=2)
                )

# Thin (15 px high) bar with one light-gray rectangle per distinct site.
# The brush is attached here, and this chart also carries the overall title.
site_zoom = (
    alt.Chart(pileup[['site']].drop_duplicates())
    .mark_rect(color='lightgray')
    .encode(x=alt.X('site:O',
                    axis=alt.Axis(grid=False,
                                  ticks=False,
                                  ),
                    title='site zoom bar')
            )
    .add_selection(zoom_brush)
    .properties(width=width,
                height=15,
                title=chart_title,
                )
    )

# Bar chart of sequencing depth at each site.  `depth` is not a column of
# `pileup`; it is computed inside the chart by `transform_calculate` as the
# sum of the A, C, G and T counts.  Rows are filtered by the zoom brush, and
# the tooltip shows the site, the depth, and the four nucleotide counts.
pileup_chart = (
    alt.Chart(pileup)
    .encode(x=alt.X('site:O',
                    axis=alt.Axis(grid=False,
                                  ticks=False,
                                  ),
                    ),
            y=alt.Y('depth:Q',
                    title='sequencing depth'),
            tooltip=['site', 'depth:Q', 'A', 'C', 'G', 'T'],
            )
    .mark_bar()
    .add_selection(zoom_brush)
    .transform_filter(zoom_brush)
    .transform_calculate(
        depth=alt.datum.A + alt.datum.C + alt.datum.G + alt.datum.T
        )
    .properties(height=250,
                width=width,
                )
    )
# Attach every descriptor selection to the chart and filter on it, so only
# rows matching the currently selected descriptor values are drawn.
for selection in selections.values():
    pileup_chart = (pileup_chart
                    .add_selection(selection)
                    .transform_filter(selection)
                    )

# Stack the zoom bar above the pileup chart (`&` is Altair's vertical
# concatenation operator).
zoomable_pileup_chart = site_zoom & pileup_chart

print(f"Saving chart to {chart_file}")
zoomable_pileup_chart.save(chart_file)

# Last expression in the cell, so the chart is displayed as the cell output.
zoomable_pileup_chart