# Get amino-acid mutations in clade founders

In [1]:
# Input file: CSV of clade-founder nucleotides. Downstream code reads its
# `clade`, `gene`, `site`, and `nt` columns. The path is relative, so it
# resolves against the directory the notebook is executed from.
clade_founder_nts_csv = "../results/clade_founder_nts/clade_founder_nts.csv"

In [40]:
import itertools

import altair as alt

import Bio.Seq
import Bio.SeqIO

import pandas as pd

Build the coding sequence of each gene for every clade founder and translate it to get the protein sequences:

In [20]:
def _join_nts(nts):
    """Concatenate one group's nucleotides (already in site order) into a string."""
    return "".join(nts)


def _translate(nt_seq):
    """Translate a nucleotide string and return the protein as a plain string."""
    return str(Bio.Seq.Seq(nt_seq).translate())


clade_founder_seqs = (
    pd.read_csv(clade_founder_nts_csv)
    # `gene` may hold several ";"-separated names; give each gene its own row
    .assign(gene=lambda df: df["gene"].str.split(";"))
    .explode("gene")
    # order by site so the per-gene join below assembles nucleotides in order
    .sort_values(["clade", "gene", "site"])
    .groupby(["clade", "gene"], as_index=False)
    .aggregate(gene_seq=pd.NamedAgg(column="nt", aggfunc=_join_nts))
    .assign(prot_seq=lambda df: df["gene_seq"].map(_translate))
)

# Sanity check: every gene sequence is exactly three nucleotides per
# translated residue (i.e., a whole number of codons was translated).
assert (
    clade_founder_seqs["gene_seq"].map(len)
    == 3 * clade_founder_seqs["prot_seq"].map(len)
).all()

clade_founder_seqs

Unnamed: 0,clade,gene,gene_seq,prot_seq
0,19A,E,ATGTACTCATTCGTTTCGGAAGAGACAGGTACGTTAATAGTTAATA...,MYSFVSEETGTLIVNSVLLFLAFVVFLLVTLAILTALRLCAYCCNI...
1,19A,M,ATGGCAGATTCCAACGGTACTATTACCGTTGAAGAGCTTAAAAAGC...,MADSNGTITVEELKKLLEQWNLVIGFLFLTWICLLQFAYANRNRFL...
2,19A,N,ATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTA...,MSDNGPQNQRNAPRITFGGPSDSTGSNQNGERSGARSKQRRPQGLP...
3,19A,ORF10,ATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTC...,MGYINVFAFPFTIYSLLLCRMNSRNYIAQVDVVNFNLT
4,19A,ORF1a,ATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAAC...,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...
...,...,...,...,...
223,22C,ORF6,ATGTTTCATCTCGTTGACTTTCAGGTTACTATAGCAGAGATATTAC...,MFHLVDFQVTIAEILLIIMRTFKVSIWNLDYIINLIIKNLSKSLTE...
224,22C,ORF7a,ATGAAAATTATTCTTTTCTTGGCACTGATAACACTCGCTACTTGTG...,MKIILFLALITLATCELYHYQECVRGTTVLLKEPCSSGTYEGNSPF...
225,22C,ORF7b,ATGATTGAACTTTCATTAATTGACTTCTATTTGTGCTTTTTAGCCT...,MIELSLIDFYLCFLAFLLFLVLIMLIIFWFSLELQDHNETCHA
226,22C,ORF8,ATGAAATTTCTTGTTTTCTTAGGAATCATCACAACTGTAGCTGCAT...,MKFLVFLGIITTVAAFHQECSLQSCTQHQPYVVDDPCPIHFYSKWY...


Get amino-acid mutations between each pair of clades:

In [39]:
def get_muts(row):
    """List the amino-acid differences between a row's two protein sequences.

    Parameters
    ----------
    row : mapping
        Must provide equal-length strings under the keys ``"prot_seq_1"``
        and ``"prot_seq_2"`` (e.g., a row of a merged data frame).

    Returns
    -------
    list of str
        One entry per differing position, in sequence order, formatted as
        ``<residue in 1><1-based position><residue in 2>`` (e.g., ``"K2R"``).
        Empty if the sequences are identical.

    Raises
    ------
    AssertionError
        If the two sequences differ in length.

    """
    seq_1 = row["prot_seq_1"]
    seq_2 = row["prot_seq_2"]
    assert len(seq_1) == len(seq_2)

    muts = []
    # number sites from 1, the usual convention for protein positions
    for site, (aa_1, aa_2) in enumerate(zip(seq_1, seq_2), start=1):
        if aa_1 != aa_2:
            muts.append(f"{aa_1}{site}{aa_2}")
    return muts

# Split the founder table by clade once, so each clade's rows are selected a
# single time instead of being re-filtered for every pair of clades.
# `sort=False` keeps the clades in their order of first appearance.
seqs_by_clade = {
    clade: clade_seqs
    for clade, clade_seqs in clade_founder_seqs.groupby("clade", sort=False)
}

# One data frame per ordered (clade_1, clade_2) pair, self-pairs included.
# Merging on gene lines up the two clades' sequences for each shared gene;
# the suffixes turn `clade` / `prot_seq` into `clade_1`, `prot_seq_1`, etc.
# The list has its own name so that `aa_muts_df` is only ever a data frame.
pairwise_aa_muts = [
    seqs_1
    .merge(seqs_2, on="gene", suffixes=("_1", "_2"))
    .assign(
        mutations_1_to_2=lambda x: x.apply(get_muts, axis=1),
        n_mutations=lambda x: x["mutations_1_to_2"].map(len),
    )
    [["clade_1", "clade_2", "gene", "mutations_1_to_2", "n_mutations"]]
    for seqs_1, seqs_2 in itertools.product(seqs_by_clade.values(), repeat=2)
]

aa_muts_df = pd.concat(pairwise_aa_muts, ignore_index=True)

aa_muts_df

Unnamed: 0,clade_1,clade_2,gene,mutations_1_to_2,n_mutations
0,19A,19A,E,[],0
1,19A,19A,M,[],0
2,19A,19A,N,[],0
3,19A,19A,ORF10,[],0
4,19A,19A,ORF1a,[],0
...,...,...,...,...,...
4327,22C,22C,ORF6,[],0
4328,22C,22C,ORF7a,[],0
4329,22C,22C,ORF7b,[],0
4330,22C,22C,ORF8,[],0


Plot the number of amino-acid mutations separating each pair of clades, with one panel per gene:

In [49]:
# Heatmap of pairwise founder differences: one cell per (clade_1, clade_2)
# pair, colored by the number of amino-acid mutations, one panel per gene.
# The tooltip shows every column of `aa_muts_df` for the hovered cell.
chart = (
    alt.Chart(aa_muts_df)
    .mark_rect(stroke="black")
    .encode(
        alt.X("clade_1"),
        alt.Y("clade_2"),
        alt.Color("n_mutations"),
        tooltip=list(aa_muts_df.columns),
    )
    # fixed step size per clade so cells stay square in every panel
    .properties(width=alt.Step(11), height=alt.Step(11))
    .facet(facet="gene", columns=3)
)

chart

In [44]:
# NOTE(review): interactive help lookup for `alt.Chart.facet`, presumably left
# over from writing the plotting cell; it is not part of the analysis and can
# likely be dropped from the finished notebook -- confirm before removing.
?alt.Chart.facet

[0;31mSignature:[0m
[0malt[0m[0;34m.[0m[0mChart[0m[0;34m.[0m[0mfacet[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mself[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfacet[0m[0;34m=[0m[0mUndefined[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrow[0m[0;34m=[0m[0mUndefined[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcolumn[0m[0;34m=[0m[0mUndefined[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdata[0m[0;34m=[0m[0mUndefined[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcolumns[0m[0;34m=[0m[0mUndefined[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0mkwargs[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Create a facet chart from the current chart.

Faceted charts require data to be specified at the top level; if data
is not specified, the data from the current chart will be used at the
top level.

Parameters
----------
facet : string or alt.Facet (optional)
    The data column to use as an encodin