In [None]:
import pandas as pd
import sc2ts
import tszip
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# UShER inputs, taken positionally from the Snakemake rule's input list.
# Base UShER ARG.
usher_arg_path = snakemake.input[0]
tsu = tszip.load(usher_arg_path)

# UShER ARG used in the "all sites parsimony" comparison further down.
usher_parsimony_arg_path = snakemake.input[1]
tsup = tszip.load(usher_parsimony_arg_path)

# Parsimony report for UShER; the first CSV column becomes the index.
usher_report_path = snakemake.input[2]
dfup = pd.read_csv(usher_report_path, index_col=0)

In [None]:
# sc2ts inputs, taken positionally from the Snakemake rule's input list.
# Base sc2ts ARG.
sc2ts_arg_path = snakemake.input[3]
tss = tszip.load(sc2ts_arg_path)

# sc2ts ARG used in the "all sites parsimony" comparison further down.
sc2ts_parsimony_arg_path = snakemake.input[4]
tssp = tszip.load(sc2ts_parsimony_arg_path)

# Parsimony report for sc2ts; the first CSV column becomes the index.
sc2ts_report_path = snakemake.input[5]
dfsp = pd.read_csv(sc2ts_report_path, index_col=0)

# Base ARGs

Comparison of the two base ARGs (tsu is Usher, tss is sc2ts)

In [None]:
# Display the base UShER ARG using its notebook representation.
tsu

In [None]:
# Display the base sc2ts ARG using its notebook representation.
tss

In [None]:
# Headline size statistics for each base ARG: one table row per tool.
stat_labels = ("trees", "nodes", "edges", "mutations", "samples")
data = []
for name, ts in (("usher", tsu), ("sc2ts", tss)):
    row = {"name": name}
    # Each column holds the matching `num_<label>` attribute of the ARG.
    row.update({label: getattr(ts, f"num_{label}") for label in stat_labels})
    data.append(row)
pd.DataFrame(data)

In [None]:
# Sanity check: both ARGs must carry the same reference sequence (metadata and
# data), otherwise the comparisons below would not be like-for-like.
usher_reference = tsu.reference_sequence
sc2ts_reference = tss.reference_sequence
assert usher_reference.metadata == sc2ts_reference.metadata
assert usher_reference.data == sc2ts_reference.data


In [None]:
# Excess mutations in the base sc2ts ARG relative to UShER (sc2ts minus UShER).
# NOTE: `diff` is reassigned by later cells, so run the cells in order.
diff = tss.num_mutations - tsu.num_mutations
diff

In [None]:
# Express the mutation difference as a percentage of the sc2ts mutation count.
# Relies on `diff` still holding the value from the preceding cell.
f"Difference in mutations = {diff /  tss.num_mutations:.2%}"

The differences are not uniform along the genome, with sc2ts having significantly more mutations at a handful of positions

In [None]:
# Per-site difference (sc2ts minus UShER) in the `old` column of the two
# parsimony reports, drawn against the `site` column.
# NOTE(review): the subtraction aligns on the DataFrame index while the x values
# come from dfup.site; this assumes both reports list the same sites in the
# same order - confirm.
_, ax = plt.subplots(1, figsize=(16, 4))
ax.plot(dfup.site, dfsp.old - dfup.old)
ax.set_xlabel("Genome position")
# Trailing semicolon keeps the Text(...) repr out of the cell output.
ax.set_ylabel("Difference between mutation counts");

In [None]:
# Excess nodes in the base UShER ARG relative to sc2ts (UShER minus sc2ts).
# NOTE: this overwrites the mutation `diff` computed earlier; the sign
# convention is also reversed relative to that cell.
diff = tsu.num_nodes - tss.num_nodes
diff

In [None]:
# Express the node difference as a percentage of the UShER node count.
# Relies on `diff` still holding the value from the preceding cell.
f"Difference in nodes = {diff /  tsu.num_nodes:.2%}"

There is a significant difference in the number of nodes, although this is at least partially explained by sc2ts allowing samples to be internal:

In [None]:
# Count UShER sample nodes that have at least one child, i.e. samples that sit
# at internal positions. Only the first tree is inspected.
tree = tsu.first()
usher_sample_child_counts = tree.num_children_array[tsu.samples()]
np.sum(usher_sample_child_counts > 0)

In [None]:
# Count sc2ts sample nodes that have at least one child, i.e. samples that sit
# at internal positions. Only the first tree is inspected.
tree = tss.first()
sc2ts_sample_child_counts = tree.num_children_array[tss.samples()]
np.sum(sc2ts_sample_child_counts > 0)

# All sites parsimony ARGs

Comparison of the ARGs we get when we map all the deletions back on (this only changes the mutations).

In [None]:
# Total mutations in the UShER all-sites parsimony ARG.
tsup.num_mutations

In [None]:
# Total mutations in the sc2ts all-sites parsimony ARG.
tssp.num_mutations

In [None]:
# Excess mutations in the sc2ts all-sites parsimony ARG relative to UShER
# (sc2ts minus UShER). NOTE: overwrites the `diff` from the base-ARG section.
diff = tssp.num_mutations - tsup.num_mutations
diff

In [None]:
# Express the difference as a percentage of the sc2ts all-sites mutation count.
# Relies on `diff` still holding the value from the preceding cell.
f"Difference in mutations = {diff /  tssp.num_mutations:.2%}"

There is a significant difference between the two - what is this driven by? Let's look at the parsimony reports

In [None]:
# UShER parsimony report (columns used below: site, old, new, intersection).
dfup

In [None]:
# sc2ts parsimony report (columns used below: site, old, new, intersection).
dfsp

In [None]:
# Summarise each parsimony report by counting rows in four categories.
# NOTE(review): `old`/`new` are presumably the per-site mutation counts before
# and after remapping, and `intersection` the mutations common to both - confirm
# against the code that writes the reports.
data = []
for name, df in (("sc2ts", dfsp), ("usher", dfup)):
    # Rows where the intersection equals the old count.
    recaptured = df.old == df.intersection
    # Change in count between new and old.
    added = df.new - df.old
    row = {
        "name": name,
        "identical": np.sum(df.old == df.new),
        "exact_inter": np.sum(recaptured),
        "exact_inter10": np.sum(recaptured & (added < 10)),
        "exact_inter100": np.sum(recaptured & (added < 100)),
    }
    data.append(row)
pd.DataFrame(data)

A large majority of the sites are identical before and after remapping with parsimony, and about 25k sites recapture the existing mutations exactly with a small number of additional mutations.

The differences seem to come from a small number of sites doing very badly. We can see this by plotting the difference in the number of remapped mutations (sc2ts minus usher) along the genome:

In [None]:
# Per-site difference (sc2ts minus UShER) in the `new` column of the two
# parsimony reports, drawn against the `site` column.
# NOTE(review): the subtraction aligns on the DataFrame index while the x values
# come from dfup.site; this assumes both reports list the same sites in the
# same order - confirm.
_, ax = plt.subplots(1, figsize=(16, 4))
ax.plot(dfup.site, dfsp.new - dfup.new)
ax.set_xlabel("Genome position")
ax.set_ylabel("Difference between remapped mutation counts")
# Fixed y range: any difference beyond +/-2000 is clipped by the axes.
ax.set_ylim((-2000, 2000));

Sc2ts seems to do particularly badly in the right-hand flank, which accounts for a large fraction of the excess mutations:

In [None]:
# sc2ts report rows for sites beyond 29600 (the right-hand flank discussed above).
dfsp[dfsp.site > 29600]

In [None]:
# Excess of the `new` column (sc2ts minus UShER), summed over sites beyond 29600.
flank_start = 29600
sc2ts_flank_total = dfsp.loc[dfsp.site > flank_start, "new"].sum()
usher_flank_total = dfup.loc[dfup.site > flank_start, "new"].sum()
sc2ts_flank_total - usher_flank_total

Usher seems to do badly in one particular place:

In [None]:
# Rows of the UShER report where its `new` value exceeds the sc2ts one by more
# than 1000 (differences are sc2ts minus UShER, aligned on the index).
diffs = dfsp.new - dfup.new
usher_much_worse = diffs < -1000
dfup[usher_much_worse]