# Get clade founder amino acids

In [6]:
import Bio.Seq

import pandas as pd

import yaml

# Workflow parameters taken from the `snakemake` object.
clade_synonyms = snakemake.params.clade_synonyms
orf1ab_to_nsps = snakemake.params.orf1ab_to_nsps

# Clade-founder nucleotide table, loaded from the input CSV.
clade_founder_nts = pd.read_csv(
    filepath_or_buffer=snakemake.input.clade_founder_nts,
)

In [15]:
def clade_label(clade):
    """Return the label to use for `clade`.

    A clade that has an entry in `clade_synonyms` is labeled as
    "<clade> (<synonym>)"; any other clade is returned unchanged.
    """
    if clade not in clade_synonyms:
        return clade
    return f"{clade} ({clade_synonyms[clade]})"

# Codon translation table: maps each of the 64 codons made only of A/C/G/T
# to the string form of its Biopython translation.
codon_table = {
    codon: str(Bio.Seq.Seq(codon).translate())
    for codon in (
        first + second + third
        for first in "ACGT"
        for second in "ACGT"
        for third in "ACGT"
    )
}

# Get clade founder amino acids: one row per clade, gene and codon site,
# with the amino acid obtained by looking the founder codon up in
# `codon_table`.
clade_founder_aas = (
    clade_founder_nts
    # noncoding rows are excluded before anything else
    .loc[lambda df: df["gene"] != "noncoding"]
    [["clade", "gene", "codon", "codon_site"]]
    .drop_duplicates()
    # these three columns hold ";"-separated values that line up with each
    # other; split them into lists so they can be exploded together
    .assign(
        **{
            column: (lambda df, column=column: df[column].str.split(";"))
            for column in ("gene", "codon", "codon_site")
        }
    )
    .explode(["gene", "codon", "codon_site"])
    .assign(
        codon_site=lambda df: df["codon_site"].astype(int),
        clade=lambda df: df["clade"].map(clade_label),
        # codons absent from `codon_table` map to missing values
        **{"amino acid": lambda df: df["codon"].map(codon_table)},
    )
    .drop(columns="codon")
    .rename(columns={"codon_site": "site"})
    .loc[lambda df: df["gene"] != "ORF1a"]  # this is just subset of ORF1ab
)

# Lookup table used to convert ORF1ab numbers to nsp numbers. Each entry of
# `orf1ab_to_nsps` is read as nsp -> (start, end) in ORF1ab numbering with
# both ends included; sites within an nsp are numbered from 1.
orf1ab_to_nsps_df = pd.concat(
    [
        pd.DataFrame(
            [
                (orf1ab_site, nsp_site, nsp)
                for nsp_site, orf1ab_site in enumerate(
                    range(start, end + 1), start=1
                )
            ],
            columns=["ORF1ab_site", "nsp_site", "nsp"],
        )
        for nsp, (start, end) in orf1ab_to_nsps.items()
    ],
    ignore_index=True,
)

# Extend the table with nsp-numbered copies of the ORF1ab rows: each ORF1ab
# row whose site appears in `orf1ab_to_nsps_df` is re-expressed with the nsp
# as its gene and the nsp-relative number as its site. The original ORF1ab
# rows are kept, and exact duplicate rows are removed at the end.
clade_founder_aas = pd.concat(
    [
        clade_founder_aas,
        clade_founder_aas
        .loc[lambda df: df["gene"] == "ORF1ab"]
        .merge(
            orf1ab_to_nsps_df,
            how="inner",  # ORF1ab sites without an nsp entry are not copied
            left_on="site",
            right_on="ORF1ab_site",
            validate="m:1",  # each ORF1ab site may belong to at most one nsp
        )
        # remove the ORF1ab gene/site columns first so that the nsp columns
        # can take over the names "gene" and "site"
        .drop(columns=["gene", "ORF1ab_site", "site"])
        .rename(columns={"nsp_site": "site", "nsp": "gene"}),
    ],
).drop_duplicates()

# Write the amino-acid table to the workflow's output CSV without the index.
clade_founder_aas.to_csv(
    path_or_buf=snakemake.output.clade_founder_aas,
    index=False,
)

array(['ORF1ab', 'S', 'ORF3a', 'E', 'M', 'ORF6', 'ORF7a', 'ORF7b', 'ORF8',
       'N', 'ORF9b', 'ORF10', 'nsp1', 'nsp2', 'nsp3', 'nsp4',
       'nsp5 (Mpro)', 'nsp6', 'nsp7', 'nsp8', 'nsp9', 'nsp10',
       'nsp12 (RdRp)', 'nsp13', 'nsp14', 'nsp15', 'nsp16'], dtype=object)