# Analyze BAM pileups

Import Python modules:

In [1]:
import altair as alt

import pandas as pd

# Turn off Altair's row limit so the full pileup table can be charted.
# The return value is bound to `_`, presumably only to keep it out of the
# notebook cell output.
_ = alt.data_transformers.disable_max_rows()

Get key variables (normally supplied by `Snakemake`; the values are hard-coded below for interactive use):

In [5]:
# Inputs, outputs, and parameters as they are taken from the `snakemake`
# object. These lines are commented out here; the hard-coded values below
# stand in for them.
#pileup_file = snakemake.input.pileup
#aligner_key_file = snakemake.input.aligner_key
#genome_key_file = snakemake.input.genome_key
#sample_key_file = snakemake.input.sample_key
#chart_file = snakemake.output.chart
#diffs_from_reference_file = snakemake.output.diffs_from_reference
#consensus_min_coverage = snakemake.params.consensus_min_coverage
#consensus_min_frac = snakemake.params.consensus_min_frac

# Hard-coded equivalents. The names deliberately match the Snakemake-driven
# assignments above so the rest of the notebook works in either mode.
pileup_file = '../results/pileup/merged_pileup.csv'
aligner_key_file = '../results/pileup/merged_aligner_key.csv'
genome_key_file = '../results/pileup/merged_genome_key.csv'
sample_key_file = '../results/pileup/merged_sample_key.csv'

# Output paths. These use the `_file` suffix (as in the Snakemake lines) so
# that `diffs_from_reference_file` is not clobbered when the
# `diffs_from_reference` data frame is built later in the notebook.
chart_file = '../results/pileup/merged_interactive_pileup_chart.html'
diffs_from_reference_file = '../results/pileup/merged_diffs_from_reference.csv'

# Backward-compatible alias for any cell that still refers to `chart`.
chart = chart_file

consensus_min_coverage = 5  # require >= this coverage
consensus_min_frac = 0.75  # sites whose consensus fraction is below this are reported

Read the data:

In [6]:
# Load the per-site nucleotide counts and the three key tables that map the
# integer aligner / genome / sample identifiers to their names.
pileup = pd.read_csv(pileup_file)
aligner_key = pd.read_csv(aligner_key_file)
genome_key = pd.read_csv(genome_key_file)
sample_key = pd.read_csv(sample_key_file)

# The identifiers used in the pileup must be exactly those listed in each key.
for key_df, field in [(aligner_key, 'aligner'),
                      (genome_key, 'genome'),
                      (sample_key, 'sample')]:
    assert set(pileup[field]) == set(key_df[field])

# No row may appear twice.
assert not pileup.duplicated().any()
# The number of distinct sites must equal the largest site number.
assert pileup['site'].max() == pileup['site'].nunique()

pileup

Unnamed: 0,site,ref_nt,A,C,G,T,aligner,genome,sample
0,1,A,35,0,1,0,1,1,1
1,2,T,0,0,0,39,1,1,1
2,3,T,0,0,0,41,1,1,1
3,4,A,41,0,0,0,1,1,1
4,5,A,36,0,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...
1015575,29866,A,0,0,0,0,2,1,17
1015576,29867,T,0,0,0,0,2,1,17
1015577,29868,G,0,0,0,0,2,1,17
1015578,29869,A,0,0,0,0,2,1,17


Identify differences between reference and consensus:

In [13]:
nts = ['A', 'C', 'G', 'T']

# Annotate every pileup row with its total depth, its most common nucleotide
# (the consensus), and the fraction of reads supporting that nucleotide.
# Then keep rows that have enough coverage and either disagree with the
# reference or have a consensus supported by too small a fraction of reads.
diffs_from_reference = (
    pileup
    .assign(
        depth=pileup[nts].sum(axis=1),
        consensus=pileup[nts].idxmax(axis=1),
        consensus_frac=lambda df: df[nts].max(axis=1) / df['depth'],
    )
    .loc[lambda df: df['depth'] >= consensus_min_coverage]
    .loc[lambda df: (df['ref_nt'] != df['consensus'])
                    | (df['consensus_frac'] < consensus_min_frac)]
)

diffs_from_reference

Unnamed: 0,site,ref_nt,A,C,G,T,aligner,genome,sample,depth,consensus,consensus_frac
12312,12313,T,226,273,57,1131,1,1,1,1687,T,0.670421
20976,20977,G,60,53,421,35,1,1,1,569,G,0.739895
26523,26524,T,249,202,335,1113,1,1,1,1899,T,0.586098
26524,26525,G,246,157,1378,119,1,1,1,1900,G,0.725263
26525,26526,G,198,217,1184,140,1,1,1,1739,G,0.680851
...,...,...,...,...,...,...,...,...,...,...,...,...
911573,15474,T,3,2,0,13,2,1,14,18,T,0.722222
913243,17144,G,2,0,5,0,2,1,14,7,G,0.714286
923207,27108,T,0,0,4,5,2,1,14,9,T,0.555556
980162,24323,A,197,126,1,0,2,1,16,324,A,0.608025


Make interactive pileup plot (this may be very large, and won't scale well if many more samples are added):

In [None]:
width = 900

# One drop-down selection per identifier column of the pileup. Each drop-down
# offers the identifiers from the matching key table, labels them with the
# corresponding `<field>_name` values, and starts on the first entry.
selections = {}
for key_df, field, name in (
    (aligner_key, 'aligner', 'read'),
    (genome_key, 'genome', 'reference'),
    (sample_key, 'sample', 'viral'),
):
    option_ids = key_df[field].tolist()
    option_labels = key_df[f"{field}_name"].tolist()
    selections[field] = alt.selection_single(
        name=name,
        fields=[field],
        bind=alt.binding_select(options=option_ids, labels=option_labels),
        init={field: option_ids[0]},
    )

# Interval selection along the x-axis only, drawn with a black outline.
zoom_brush = alt.selection_interval(
    encodings=['x'],
    mark=alt.BrushConfig(stroke='black', strokeWidth=2),
)

# Thin gray bar with one rectangle per distinct site; brushing over it sets
# `zoom_brush`.
site_zoom = (
    alt.Chart(pileup[['site']].drop_duplicates())
    .mark_rect(color='lightgray')
    .encode(
        x=alt.X(
            'site:O',
            title=None,
            axis=alt.Axis(grid=False, ticks=False),
        ),
    )
    .add_selection(zoom_brush)
    .properties(title='site zoom bar', width=width, height=15)
)

# Bar chart of read depth per site for the aligner / genome / sample chosen
# in the drop-downs, restricted to the sites inside the zoom brush.
# NOTE(review): `zoom_brush` is registered on this chart as well as on the
# zoom bar, so brushing directly on the bars also changes the zoom; confirm
# that this is intended.
pileup_chart = (
    alt.Chart(pileup)
    .mark_bar()
    .encode(
        x=alt.X('site:O', axis=alt.Axis(grid=False, ticks=False)),
        y='depth:Q',
        tooltip=[
            'site',
            'depth:Q',
            'A',
            'C',
            'G',
            'T',
            alt.Tooltip('ref_nt', title='reference'),
        ],
    )
    .add_selection(
        selections['aligner'],
        selections['genome'],
        selections['sample'],
        zoom_brush,
    )
    # Filter down to the selected rows first, then derive the depth from the
    # four nucleotide counts of the rows that remain.
    .transform_filter(selections['aligner'])
    .transform_filter(selections['genome'])
    .transform_filter(selections['sample'])
    .transform_filter(zoom_brush)
    .transform_calculate(
        depth=alt.datum.A + alt.datum.C + alt.datum.G + alt.datum.T,
    )
    .properties(title='read coverage over genome', width=width, height=250)
)

# Stack the zoom bar above the coverage chart.
zoomable_pileup_chart = alt.vconcat(site_zoom, pileup_chart)