# Merge expected and actual counts

First, read the data:

In [1]:
import pandas as pd


# Expected counts: rename columns to the names used in this notebook, then
# build an `nt_mutation` label by concatenating the clade founder nucleotide,
# the site number, and the last character of `mut_type`
# (for example "A" + "266" + "C" -> "A266C").
expected = pd.read_csv("../results/expected_mut_counts/expected_mut_counts.csv")
expected = expected.rename(columns={"site": "nt_site", "codon": "clade_founder_codon"})
expected = expected.assign(
    nt_mutation=(
        expected["clade_founder_nt"]
        + expected["nt_site"].astype(str)
        + expected["mut_type"].str[-1]
    )
)

# Actual counts: keep only the columns used downstream (in this order) and
# rename `count` so it cannot be confused with the expected counts.
actual_columns = [
    "clade",
    "subset",
    "nt_site",
    "nt_mutation",
    "codon_change",
    "aa_mutation",
    "count",
]
actual = pd.read_csv("../results/mutation_counts/aggregated.csv")
actual = actual[actual_columns].rename(columns={"count": "actual_count"})

# Mutations to exclude: renamed so its columns line up with `nt_site` and
# `nt_mutation` in the frames above.
muts_to_exclude = pd.read_csv(
    "../results/expected_vs_actual_mut_counts/mutations_to_exclude.csv"
)
muts_to_exclude = muts_to_exclude.rename(
    columns={"site": "nt_site", "mutation": "nt_mutation"}
)

Merge the expected and actual counts, setting the actual count to zero for any expected mutation that is not observed:

In [2]:
# For each subset, left-merge that subset's actual counts onto the full table
# of expected mutations (on the columns the two frames share), so every
# expected mutation is present for every subset.
per_subset_frames = []
for subset, subset_actual in actual.groupby("subset"):
    subset_merged = expected.merge(subset_actual, how="left", validate="one_to_many")
    per_subset_frames.append(
        subset_merged.assign(
            # rows with no match have a missing subset label; set it explicitly
            subset=subset,
            # expected mutations with no observed row get a count of zero
            actual_count=subset_merged["actual_count"].fillna(0).astype(int),
        )
    )

merged = pd.concat(per_subset_frames, ignore_index=True)

Now exclude any mutations that do not start from the clade founder codon. These arise when there are multiple substitutions at a site, so that the clade founder identity differs from the reference:

In [3]:
def _start_codons(codon_change):
    """Return the first three characters of each ';'-separated entry, re-joined with ';'.

    Null input yields `pd.NA`. The first three characters of an entry are
    taken to be the codon the mutation starts from.
    """
    if pd.isnull(codon_change):
        return pd.NA
    return ";".join(entry[:3] for entry in codon_change.split(";"))


merged = merged.assign(
    start_codon=lambda df: df["codon_change"].map(_start_codons)
)
merged = merged.assign(
    # rows with no observed codon change (null start codon) are kept
    from_founder_codon=lambda df: (
        (df["start_codon"] == df["clade_founder_codon"]) | df["start_codon"].isnull()
    )
)
merged = merged.query("from_founder_codon")

# Filtering on the founder codon can remove every row for an expected
# mutation. Rebuild the full expected-by-subset table and outer-merge it back
# in so those mutations reappear, then give them an actual count of zero.
expected_by_subset = pd.concat(
    [expected.assign(subset=subset) for subset in merged["subset"].unique()]
)
merged = merged.merge(expected_by_subset, how="outer")
merged = merged.assign(
    actual_count=lambda df: df["actual_count"].fillna(0).astype(int)
)

Now flag the mutations to exclude:

In [4]:
# Flag every (clade, nt_site, nt_mutation) listed in `muts_to_exclude`.
# `validate="many_to_one"` makes the merge fail loudly if the exclusion table
# lists the same key more than once.
merged = (
    merged
    .merge(
        muts_to_exclude.assign(exclude=True),
        on=["clade", "nt_site", "nt_mutation"],
        how="left",
        validate="many_to_one",
    )
    # After the left merge, matched rows hold True and unmatched rows hold NaN.
    # `notna()` converts that to a real boolean column. `fillna(False)` would
    # give the same values but relies on downcasting an object column, which
    # recent pandas versions deprecate, so the resulting dtype depends on the
    # pandas version.
    .assign(exclude=lambda x: x["exclude"].notna())
)

merged

Unnamed: 0,clade,nt_site,clade_founder_nt,gene,clade_founder_codon,codon_position,mut_type,four_fold_degenerate,expected_count,nt_mutation,subset,codon_change,aa_mutation,actual_count,start_codon,from_founder_codon,exclude
0,20A,266,A,ORF1a;ORF1ab,ATG;ATG,1;1,AtoC,False,0.23089,A266C,England,,,0,,True,False
1,20A,266,A,ORF1a;ORF1ab,ATG;ATG,1;1,AtoG,False,1.30730,A266G,England,,,0,,True,False
2,20A,266,A,ORF1a;ORF1ab,ATG;ATG,1;1,AtoT,False,0.41545,A266T,England,,,0,,True,False
3,20A,270,A,ORF1a;ORF1ab,GAG;GAG,2;2,AtoC,False,0.23089,A270C,England,,,0,,True,False
4,20A,270,A,ORF1a;ORF1ab,GAG;GAG,2;2,AtoG,False,1.30730,A270G,England,,,0,,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3419905,22B,28679,G,N,GAG,1,GtoA,False,11.40400,G28679A,all,,,0,,,False
3419906,22B,28679,G,N,GAG,1,GtoT,False,15.64200,G28679T,all,,,0,,,False
3419907,22B,395,C,ORF1a;ORF1ab,CAA;CAA,1;1,CtoT,False,33.66100,C395T,all,,,0,,,False
3419908,22B,1625,C,ORF1a;ORF1ab,CTC;CTC,1;1,CtoA,False,2.17220,C1625A,all,,,0,,,False


Finally, clean up the data frame and add amino-acid mutations:

In [6]:
# Keep only the columns of interest, in the order they should be displayed.
columns_of_interest = [
    "clade",
    "subset",
    "nt_site",
    "nt_mutation",
    "exclude",
    "expected_count",
    "actual_count",
    "clade_founder_nt",
    "gene",
    "clade_founder_codon",
    "codon_position",
    "four_fold_degenerate",
]
merged = merged[columns_of_interest]

merged

Unnamed: 0,clade,subset,nt_site,nt_mutation,exclude,expected_count,actual_count,clade_founder_nt,gene,clade_founder_codon,codon_position,four_fold_degenerate
0,20A,England,266,A266C,False,0.23089,0,A,ORF1a;ORF1ab,ATG;ATG,1;1,False
1,20A,England,266,A266G,False,1.30730,0,A,ORF1a;ORF1ab,ATG;ATG,1;1,False
2,20A,England,266,A266T,False,0.41545,0,A,ORF1a;ORF1ab,ATG;ATG,1;1,False
3,20A,England,270,A270C,False,0.23089,0,A,ORF1a;ORF1ab,GAG;GAG,2;2,False
4,20A,England,270,A270G,False,1.30730,0,A,ORF1a;ORF1ab,GAG;GAG,2;2,False
...,...,...,...,...,...,...,...,...,...,...,...,...
3419905,22B,all,28679,G28679A,False,11.40400,0,G,N,GAG,1,False
3419906,22B,all,28679,G28679T,False,15.64200,0,G,N,GAG,1,False
3419907,22B,all,395,C395T,False,33.66100,0,C,ORF1a;ORF1ab,CAA;CAA,1;1,False
3419908,22B,all,1625,C1625A,False,2.17220,0,C,ORF1a;ORF1ab,CTC;CTC,1;1,False


In [13]:
# Pairwise correlation between actual and expected counts, computed separately
# for each combination of `exclude` and `four_fold_degenerate`.
group_columns = ["exclude", "four_fold_degenerate"]
count_columns = ["actual_count", "expected_count"]
merged[group_columns + count_columns].groupby(group_columns).corr()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,actual_count,expected_count
exclude,four_fold_degenerate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,False,actual_count,1.0,0.432518
False,False,expected_count,0.432518,1.0
False,True,actual_count,1.0,0.574913
False,True,expected_count,0.574913,1.0
True,False,actual_count,1.0,0.198621
True,False,expected_count,0.198621,1.0
True,True,actual_count,1.0,0.258195
True,True,expected_count,0.258195,1.0
