# Diffs from reference from pileup files

Import Python modules:

In [None]:
import itertools

import altair as alt

import pandas as pd

Get variables from `snakemake`:

In [None]:
# Input paths, output paths and parameters are read from the `snakemake`
# object, which is not defined in this notebook (presumably injected by
# Snakemake when it runs the notebook -- confirm against the workflow rule).
diffs_from_ref_files = snakemake.input.diffs_from_ref
comparator_map_file = snakemake.input.comparator_map
diffs_from_ref_stats_file = snakemake.output.diffs_from_ref_stats
diffs_from_ref_chart_file = snakemake.output.diffs_from_ref_chart
samples = snakemake.params.samples

# The files are paired with `samples` by position via `zip` further down, and
# `zip` silently truncates to the shorter input, so fail loudly on a mismatch.
# An explicit raise (rather than `assert`) is not stripped under `python -O`
# and reports the two counts.
if len(diffs_from_ref_files) != len(samples):
    raise ValueError(
        f"got {len(diffs_from_ref_files)} diffs_from_ref files but "
        f"{len(samples)} samples; expected exactly one file per sample"
    )

Get data frame of differences from reference, adding in the comparator identities:

In [None]:
# Nucleotide columns carried through from the per-sample CSVs.
nts = ['A', 'C', 'G', 'T']

comparator_map = pd.read_csv(comparator_map_file)

# One data frame per input file, each tagged with the sample it belongs to.
per_sample_frames = [
    pd.read_csv(diffs_file).assign(sample=sample_name)
    for diffs_file, sample_name in zip(diffs_from_ref_files, samples)
]

# Columns retained (in this order) before joining on the comparator map.
kept_columns = ['sample', 'aligner', 'site', 'reference', 'consensus', *nts]

stacked_diffs = pd.concat(per_sample_frames, ignore_index=False)
stacked_diffs = stacked_diffs.drop(columns=['depth', 'consensus_frac'])

# Left merge keeps every difference row; `validate` makes pandas raise if
# `comparator_map` has more than one row for a (site, reference) pair.
diffs_from_ref = stacked_diffs[kept_columns].merge(
    comparator_map,
    on=['site', 'reference'],
    validate='many_to_one',
    how='left',
)

# No cell may be null after the merge (a left-merge row with no match in
# `comparator_map` would leave nulls in the merged-in columns).
assert diffs_from_ref.notnull().all().all()

In [None]:
# Save the merged table as CSV without the index column.
print(f"Writing to {diffs_from_ref_stats_file}")
diffs_from_ref.to_csv(path_or_buf=diffs_from_ref_stats_file, index=False)

# Last expression of the cell: shows the table in the notebook output.
diffs_from_ref

Now plot differences from the reference including the comparator genomes:

In [None]:
# first make data frame to plot
# Categories to tally: the literal 'any_mutation' followed by every column of
# `comparator_map` from the third onward (presumably the first two columns are
# the 'site' and 'reference' merge keys -- confirm against the map file).
comparators = ['any_mutation', *comparator_map.columns[2:]]

def format_mut(r):
    """Return a text label for one difference, e.g. ``A10G (G=5, T=1)``.

    ``r`` is a mapping-like row (a data frame row or a dict) with entries
    'reference', 'site', 'consensus' and one entry per nucleotide
    'A', 'C', 'G', 'T'.  Nucleotides whose entry is falsy (such as a zero
    count) are left out of the parenthesised list.
    """
    mutation_name = f"{r['reference']}{r['site']}{r['consensus']}"
    shown_counts = [f"{nt}={r[nt]}" for nt in 'ACGT' if r[nt]]
    return f"{mutation_name} ({', '.join(shown_counts)})"

# Build one row per (sample, aligner, towards) holding the number of
# differences and a '; '-joined description of them.
plot_df = (
    diffs_from_ref
    .assign(mutation=lambda x: x.apply(format_mut, axis=1),
            # categorical with all of `samples` as categories, in that order
            sample=lambda x: pd.Categorical(x['sample'], samples, ordered=True),
            # placeholder column so that `melt` emits an 'any_mutation' row
            # for every difference ('any_mutation' is in `comparators`)
            any_mutation=pd.NA,
            )
    .melt(id_vars=['sample', 'aligner', 'site', 'consensus', 'mutation'],
          value_vars=comparators,
          var_name='towards',
          value_name='comparator_nt')
    # keep every 'any_mutation' row; for the other columns keep only rows
    # where the consensus equals that column's nucleotide
    .assign(keep=lambda x: (x['towards'] == 'any_mutation') | (x['consensus'] == x['comparator_nt']))
    .query('keep')
    # `sample` is categorical: observed=False is passed explicitly so that
    # combinations with no rows are still emitted (count 0).  The length
    # assertion below depends on this, and the implicit default is deprecated
    # and differs between pandas versions.
    .groupby(['sample', 'aligner', 'towards'], observed=False)
    .aggregate(n_mutations=pd.NamedAgg('site', 'count'),
               mutations=pd.NamedAgg('mutation', lambda s: '; '.join(s)))
    .reset_index()
    # combinations with no rows may have a missing description; show ''
    .assign(mutations=lambda x: x['mutations'].fillna(''))
    )
# Sanity check: exactly one row per sample x aligner x comparator combination.
assert len(plot_df) == len(samples) * diffs_from_ref['aligner'].nunique() * len(comparators)

# make selectors
# Multi-selection over the `towards` field, driven by the chart legend and
# starting with only 'any_mutation' selected.
comparator_selection = alt.selection_multi(
    fields=['towards'],
    bind='legend',
    init=[{'towards': 'any_mutation'}],
)

# Single-selection over the `aligner` field, bound to a select widget listing
# every aligner present in `plot_df`; the first one is the initial choice.
aligners = plot_df['aligner'].unique()
aligner_dropdown = alt.binding_select(options=aligners)
aligner_selection = alt.selection_single(
    name='read',
    fields=['aligner'],
    bind=aligner_dropdown,
    init={'aligner': aligners[0]},
)
# make chart
base_chart = alt.Chart(plot_df)

# x axis: number of differences, with a minimum tick step of 1.
n_mutations_x = alt.X(
    'n_mutations:Q',
    title='number of mismatch mutations relative to reference',
    axis=alt.Axis(tickMinStep=1),
)
# Points inside the legend selection are colored by `towards`; points outside
# it get the fallback value `None`.
towards_color = alt.condition(comparator_selection, 'towards:N', alt.value(None))
tooltip_fields = ['sample', 'n_mutations', 'mutations']

encoded_chart = base_chart.encode(
    x=n_mutations_x,
    y='sample:N',
    color=towards_color,
    shape='towards:N',
    tooltip=tooltip_fields,
)

# Filtering on `aligner_selection` restricts the chart to the aligner chosen
# in the select widget.
diffs_from_ref_chart = (
    encoded_chart
    .mark_point(size=75, filled=True)
    .add_selection(aligner_selection, comparator_selection)
    .transform_filter(aligner_selection)
)

print(f"Saving to {diffs_from_ref_chart_file}")
diffs_from_ref_chart.save(diffs_from_ref_chart_file)

# Last expression of the cell: renders the chart in the notebook output.
diffs_from_ref_chart