# Plot frequencies of clade founders at 4-fold degenerate sites

In [1]:
# Path to the CSV of clade-founder nucleotides. It is read below to build the
# nucleotide-frequency table, and the output chart is saved in its directory.
clade_founder_nts_csv = "results/clade_founder_nts/clade_founder_nts.csv"

In [2]:
import os

import altair as alt

import pandas as pd

import yaml

In [3]:
# Parse the YAML configuration (safe loader) and pull out the
# `clade_synonyms` entry, which is used below to filter and label clades.
with open("config.yaml") as config_file:
    config = yaml.safe_load(config_file)

clade_synonyms = config["clade_synonyms"]

In [4]:
def _nucleotide_fractions(csv_path, clade_synonyms):
    """Tabulate nucleotide fractions at 4-fold degenerate sites per clade.

    Reads the CSV at `csv_path`, keeps rows whose `clade` is a key of
    `clade_synonyms` and whose `four_fold_degenerate` column is true, relabels
    each clade as "<clade> (<synonym>)", and returns one row per
    (clade, nt) pair with columns `clade`, `nt`, `count`, and `frac`
    (that nucleotide's share of the clade's total count).
    """
    founder_nts = pd.read_csv(csv_path, low_memory=False)

    # `@clade_synonyms` inside the query string resolves to the parameter of
    # this function; membership in a mapping tests its keys.
    four_fold_sites = founder_nts.query("clade in @clade_synonyms").query(
        "four_fold_degenerate"
    )

    # Append the synonym to the clade name so both appear in the chart labels.
    clade_labels = (
        four_fold_sites["clade"]
        + " ("
        + four_fold_sites["clade"].map(clade_synonyms)
        + ")"
    )
    labeled_sites = four_fold_sites.assign(clade=clade_labels)

    # Number of sites carrying each nucleotide in each clade.
    nt_counts = labeled_sites.groupby(["clade", "nt"], as_index=False).aggregate(
        count=pd.NamedAgg("nt", "count")
    )

    # Normalize by the per-clade total so fractions sum to one within a clade.
    clade_totals = nt_counts.groupby("clade")["count"].transform("sum")
    return nt_counts.assign(frac=nt_counts["count"] / clade_totals)


df = _nucleotide_fractions(clade_founder_nts_csv, clade_synonyms)

# Horizontal bar chart: one row per clade, bar length is the fraction of
# 4-fold degenerate sites, colored by nucleotide. Every column of `df` is
# shown in the tooltip.
tooltip_columns = list(df.columns)

chart = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X("frac", title="fraction of 4-fold degenerate sites"),
        y=alt.Y("clade", title=None),
        color=alt.Color("nt", title="nucleotide"),
        tooltip=tooltip_columns,
    )
    .properties(width=250)
)

# Save the chart as HTML in the same directory as the input CSV.
output_dir = os.path.dirname(clade_founder_nts_csv)
chart_html = os.path.join(output_dir, "nucleotide_freqs.html")
chart.save(chart_html)

# Bare expression so the notebook renders the chart as the cell output.
chart