# Analyze the plasmid control sequences

Get variables from `snakemake`:

In [None]:
# Input: the pileup CSV files to analyze.
pileup_csvs = snakemake.input.pileups

# Output: path of the CSV that will hold the mutation counts.
plasmid_muts_csv = snakemake.output.plasmid_muts

# Parameters: one descriptor per pileup CSV (paired by position), plus the
# depth and consensus-fraction thresholds used to decide if a site is called.
descriptors = snakemake.params.descriptors
consensus_min_coverage = snakemake.params.consensus_min_coverage
consensus_min_frac = snakemake.params.consensus_min_frac

Import Python modules:

In [None]:
import altair as alt

import altair_saver

import pandas as pd

# Turn off Altair's limit on the number of data rows a chart may contain.
# The return value is bound to `_`, presumably so the notebook cell does not
# echo it as output.
_ = alt.data_transformers.disable_max_rows()

Read the data:

In [None]:
# Columns of the pileup CSVs holding the per-nucleotide counts.
nts = ['A', 'C', 'G', 'T']

# Each pileup file must have exactly one matching descriptor.
assert len(descriptors) == len(pileup_csvs)

# Read every pileup CSV, attaching its descriptor's key/value pairs as extra
# columns, and stack the results into a single data frame.
pileups = pd.concat(
    [
        pd.read_csv(csv_file).assign(**descriptor)
        for csv_file, descriptor in zip(pileup_csvs, descriptors)
    ]
)

# Add per-row summary columns:
#   depth          - total count over the four nucleotides
#   consensus      - nucleotide with the highest count
#   consensus_frac - fraction of the depth supporting the consensus
#   site_called    - True when depth is at least `consensus_min_coverage` and
#                    consensus_frac is strictly above `consensus_min_frac`
pileups = pileups.assign(
    depth=lambda df: df[nts].sum(axis=1),
    consensus=lambda df: df[nts].idxmax(axis=1),
    consensus_frac=lambda df: df[nts].max(axis=1) / df['depth'],
    site_called=lambda df: (
        (df['consensus_frac'] > consensus_min_frac)
        & (df['depth'] >= consensus_min_coverage)
    ),
)

Get all sites mutated in at least one sample:

In [None]:
# Unique sites, in order of first appearance, where a called consensus
# nucleotide differs from the reference in at least one row of `pileups`.
mutated_sites = (
    pileups
    .loc[lambda df: df['site_called']]
    .loc[lambda df: df['consensus'] != df['reference']]
    ['site']
    .unique()
    .tolist()
)

print(mutated_sites)

Get counts at mutated sites.
Shows that most samples have the reference nucleotide at all sites, except that reference has G28085T and C28144T (the latter being consistent with aligning against proCoV2 rather than Wuhan-Hu-1).

In [None]:
# For each mutated site, count how many called rows (non-null `sample`
# entries) share each combination of reference, aligner, and consensus.
mut_site_counts = (
    pileups
    .loc[lambda df: df['site'].isin(mutated_sites)]
    .loc[lambda df: df['site_called']]
    .sort_values('site')
    .groupby(['site', 'reference', 'aligner', 'consensus'])
    .agg(nsample=('sample', 'count'))
)

display(mut_site_counts)

# Save the counts table to the output path.
print(f"Writing to {plasmid_muts_csv}")
mut_site_counts.to_csv(plasmid_muts_csv)