# Get amino-acid mutations in clade founders

In [1]:
# input files
# CSV of clade-founder nucleotides; the cells below read its `clade`, `gene`,
# `site` and `nt` columns (`gene` may hold several ";"-separated gene names)
clade_founder_nts_csv = "../results/clade_founder_nts/clade_founder_nts.csv"

In [2]:
import itertools

import altair as alt

import Bio.Seq
import Bio.SeqIO

import pandas as pd

import yaml

In [3]:
# read the YAML configuration (relative path, resolved from the working directory)
with open("../config.yaml") as f:
    config = yaml.safe_load(f)
    
# mapping of nsp name -> start site within ORF1ab; `gene_mutation` uses it to
# re-number ORF1ab mutations as mutations in the individual nsps
orf1ab_to_nsps = config["orf1ab_to_nsps"]

Get protein sequences for all genes:

In [4]:
# One row per (clade, gene, site): a site annotated with several
# ";"-separated genes is duplicated so that it contributes to each of them.
nts_per_gene_site = (
    pd.read_csv(clade_founder_nts_csv)
    .assign(gene=lambda df: df["gene"].str.split(";"))
    .explode("gene")
    .sort_values(["clade", "gene", "site"])
)

# Concatenate the site-ordered nucleotides into one coding sequence per
# clade and gene.
clade_founder_seqs = (
    nts_per_gene_site
    .groupby(["clade", "gene"], as_index=False)
    .agg(gene_seq=("nt", lambda nts: "".join(nts)))
)

# Translate every coding sequence to its protein sequence.
clade_founder_seqs = clade_founder_seqs.assign(
    prot_seq=clade_founder_seqs["gene_seq"].map(
        lambda gene_seq: str(Bio.Seq.Seq(gene_seq).translate())
    )
)

# Sanity check: each translation has exactly one residue per codon.
n_nts = clade_founder_seqs["gene_seq"].map(len)
n_aas = clade_founder_seqs["prot_seq"].map(len)
assert (n_nts == 3 * n_aas).all()

clade_founder_seqs

Unnamed: 0,clade,gene,gene_seq,prot_seq
0,19A,E,ATGTACTCATTCGTTTCGGAAGAGACAGGTACGTTAATAGTTAATA...,MYSFVSEETGTLIVNSVLLFLAFVVFLLVTLAILTALRLCAYCCNI...
1,19A,M,ATGGCAGATTCCAACGGTACTATTACCGTTGAAGAGCTTAAAAAGC...,MADSNGTITVEELKKLLEQWNLVIGFLFLTWICLLQFAYANRNRFL...
2,19A,N,ATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTA...,MSDNGPQNQRNAPRITFGGPSDSTGSNQNGERSGARSKQRRPQGLP...
3,19A,ORF10,ATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTC...,MGYINVFAFPFTIYSLLLCRMNSRNYIAQVDVVNFNLT
4,19A,ORF1a,ATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAAC...,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...
...,...,...,...,...
223,22C,ORF6,ATGTTTCATCTCGTTGACTTTCAGGTTACTATAGCAGAGATATTAC...,MFHLVDFQVTIAEILLIIMRTFKVSIWNLDYIINLIIKNLSKSLTE...
224,22C,ORF7a,ATGAAAATTATTCTTTTCTTGGCACTGATAACACTCGCTACTTGTG...,MKIILFLALITLATCELYHYQECVRGTTVLLKEPCSSGTYEGNSPF...
225,22C,ORF7b,ATGATTGAACTTTCATTAATTGACTTCTATTTGTGCTTTTTAGCCT...,MIELSLIDFYLCFLAFLLFLVLIMLIIFWFSLELQDHNETCHA
226,22C,ORF8,ATGAAATTTCTTGTTTTCTTAGGAATCATCACAACTGTAGCTGCAT...,MKFLVFLGIITTVAAFHQECSLQSCTQHQPYVVDDPCPIHFYSKWY...


Get amino-acid mutations between each pair of clades:

In [5]:
def get_muts(row):
    """List the amino-acid differences between two equal-length proteins.

    Parameters
    ----------
    row : mapping
        Provides the two sequences as ``row["prot_seq_1"]`` and
        ``row["prot_seq_2"]``.

    Returns
    -------
    list of str
        One ``<aa in seq 1><1-based site><aa in seq 2>`` string per differing
        position, in sequence order; empty if the sequences are identical.
    """
    seq_1 = row["prot_seq_1"]
    seq_2 = row["prot_seq_2"]
    # positions are compared one-to-one, so the sequences must be the same length
    assert len(seq_1) == len(seq_2)

    muts = []
    for site, (aa_1, aa_2) in enumerate(zip(seq_1, seq_2), start=1):
        if aa_1 == aa_2:
            continue
        muts.append(f"{aa_1}{site}{aa_2}")
    return muts

# Split the founder sequences by clade once, keeping clades in order of
# first appearance.
clades = clade_founder_seqs["clade"].unique()
seqs_by_clade = {
    clade: clade_founder_seqs[clade_founder_seqs["clade"] == clade]
    for clade in clades
}

# For every ordered pair of clades (including a clade with itself), line up
# the shared genes and list the amino-acid differences going from clade_1
# to clade_2.
pairwise_muts = []
for clade_1, clade_2 in itertools.product(clades, repeat=2):
    paired_genes = seqs_by_clade[clade_1].merge(
        seqs_by_clade[clade_2],
        on="gene",
        suffixes=["_1", "_2"],
    )
    paired_genes = paired_genes.assign(
        mutations=lambda x: x.apply(get_muts, axis=1)
    )
    pairwise_muts.append(
        paired_genes[["clade_1", "clade_2", "gene", "mutations"]]
    )

aa_muts_df = pd.concat(pairwise_muts, ignore_index=True)

aa_muts_df

Unnamed: 0,clade_1,clade_2,gene,mutations
0,19A,19A,E,[]
1,19A,19A,M,[]
2,19A,19A,N,[]
3,19A,19A,ORF10,[]
4,19A,19A,ORF1a,[]
...,...,...,...,...
4327,22C,22C,ORF6,[]
4328,22C,22C,ORF7a,[]
4329,22C,22C,ORF7b,[]
4330,22C,22C,ORF8,[]


Re-assign ORF1ab mutations to the individual nsps (with nsp-based site numbering):

In [21]:
# One row per individual mutation; rows whose mutation list was empty
# explode to a null entry and are discarded.
exploded_muts = aa_muts_df.explode("mutations")
aa_muts_nsp_df = exploded_muts[exploded_muts["mutations"].notnull()]

# ORF1a rows are dropped further down in favour of ORF1ab, so first check
# that every ORF1a mutation also occurs among the ORF1ab mutations.
# NOTE: the comparison pools mutations over all clade pairs rather than
# checking each clade pair separately.
orf1a_muts = set(
    aa_muts_nsp_df.loc[aa_muts_nsp_df["gene"] == "ORF1a", "mutations"]
)
orf1ab_muts = set(
    aa_muts_nsp_df.loc[aa_muts_nsp_df["gene"] == "ORF1ab", "mutations"]
)
assert orf1a_muts <= orf1ab_muts

def gene_mutation(row, nsp_starts=None):
    """Return a ``"<gene> <mutation>"`` label, mapping ORF1ab sites onto nsps.

    For genes other than ORF1ab the label is just the gene name and the
    mutation as given. For ORF1ab, the mutation is re-assigned to the nsp
    that contains the site (the nsp with the largest start that is not beyond
    the site) and the site is re-numbered relative to that nsp's start.

    Parameters
    ----------
    row : mapping
        Provides ``row["gene"]`` and ``row["mutations"]``; the mutation is a
        string ``<wildtype><1-based site><mutant>`` such as ``"P4715L"``.
    nsp_starts : mapping or None
        Maps nsp name to its start site in ORF1ab numbering. Defaults to the
        module-level ``orf1ab_to_nsps``. The mapping need not be ordered.

    Returns
    -------
    str
        The label, e.g. ``"nsp12 P323L"`` or ``"S D614G"``.

    Raises
    ------
    ValueError
        If an ORF1ab site lies before the start of every nsp (or the mapping
        is empty), so that no nsp can be assigned.
    """
    gene = row["gene"]
    mutation = row["mutations"]
    if gene == "ORF1ab":
        if nsp_starts is None:
            nsp_starts = orf1ab_to_nsps
        wildtype, mutant = mutation[0], mutation[-1]
        site = int(mutation[1:-1])
        # candidate nsps are those starting at or before the site; choosing
        # the one with the largest start makes the result independent of the
        # order in which the mapping lists the nsps
        candidates = [
            (start, nsp) for nsp, start in nsp_starts.items() if start <= site
        ]
        if not candidates:
            raise ValueError(
                f"ORF1ab site {site} precedes the start of every nsp"
            )
        nsp_start, gene = max(candidates, key=lambda candidate: candidate[0])
        mutation = f"{wildtype}{site - nsp_start + 1}{mutant}"
    return f"{gene} {mutation}"

# ORF1a is covered by ORF1ab, so only the ORF1ab rows are kept for that region.
muts_without_orf1a = aa_muts_nsp_df.query("gene != 'ORF1a'")

# Build the "<gene> <mutation>" label for every row (ORF1ab rows become nsp
# labels), then split it once into its gene and mutation parts.
gene_mutation_labels = muts_without_orf1a.apply(gene_mutation, axis=1)
label_parts = gene_mutation_labels.str.split()

# Replace `gene` with the (possibly nsp) gene name and swap the original
# `mutations` column for the re-numbered `mutation`.
aa_muts_nsp_df = (
    muts_without_orf1a
    .assign(
        gene_mutation=gene_mutation_labels,
        gene=label_parts.str[0],
        mutation=label_parts.str[1],
    )
    .drop(columns="mutations")
)

aa_muts_nsp_df

Unnamed: 0,clade_1,clade_2,gene,gene_mutation,mutation
17,19A,20A,nsp12,nsp12 P323L,P323L
23,19A,20A,S,S D614G,D614G
26,19A,20B,N,N R203K,R203K
26,19A,20B,N,N G204R,G204R
29,19A,20B,nsp12,nsp12 P323L,P323L
...,...,...,...,...,...
4315,22C,22B,ORF6,ORF6 L61D,L61D
4319,22C,22B,S,S Q452R,Q452R
4319,22C,22B,S,S F486V,F486V
4319,22C,22B,S,S R493Q,R493Q


Get the non-spike mutations in each clade relative to clade 20A:

In [16]:
# Keep mutations outside spike, measured from clade 20A (as clade_1) to every
# other clade (clade_2).
is_non_spike = aa_muts_nsp_df["gene"] != "S"
is_from_20a = aa_muts_nsp_df["clade_1"] == "20A"

# Per target clade: a comma-separated list of its mutations and their number.
non_spike_muts = (
    aa_muts_nsp_df[is_non_spike & is_from_20a]
    .groupby("clade_2")
    .agg(
        mutations=("gene_mutation", lambda labels: ", ".join(labels)),
        n_mutations=("gene_mutation", "count"),
    )
)

non_spike_muts

Unnamed: 0_level_0,mutations,n_mutations
clade_2,Unnamed: 1_level_1,Unnamed: 2_level_1
19A,nsp12 L323P,1
20B,"N R203K, N G204R",2
20C,"nsp2 T85I, ORF3a Q57H",2
20E,"N A220V, ORF10 V30L",2
20F,"N R203K, N G204R, nsp2 I120F",3
20G,"N P67S, N P199L, nsp2 T85I, nsp5 L89F, nsp14 N...",9
20I,"N D3L, N R203K, N G204R, N S235F, nsp3 T183I, ...",10
20J,"N P80R, N R203K, N G204R, nsp3 S370L, nsp3 K97...",8
21C,"N T205I, nsp2 T85I, nsp13 D260Y, ORF3a Q57H",4
21F,"nsp2 T85I, nsp4 L438P, nsp13 Q88H, ORF3a P42L,...",6


In [15]:
# NOTE(review): leftover inspection cell. Its saved output still shows a
# `mutations` column and un-mapped ORF1ab gene names, which differs from the
# frame displayed right after `aa_muts_nsp_df` is built above, so the output
# is stale; re-run the notebook top to bottom or remove this cell.
aa_muts_nsp_df

Unnamed: 0,clade_1,clade_2,gene,mutations,gene_mutation
17,19A,20A,ORF1ab,P4715L,nsp12 P323L
23,19A,20A,S,D614G,S D614G
26,19A,20B,N,R203K,N R203K
26,19A,20B,N,G204R,N G204R
29,19A,20B,ORF1ab,P4715L,nsp12 P323L
...,...,...,...,...,...
4315,22C,22B,ORF6,L61D,ORF6 L61D
4319,22C,22B,S,Q452R,S Q452R
4319,22C,22B,S,F486V,S F486V
4319,22C,22B,S,R493Q,S R493Q


In [9]:
# NOTE(review): leftover inspection cell; `aa_muts_df` is already displayed
# at the end of the cell that builds it, so this repeat can be removed.
aa_muts_df

Unnamed: 0,clade_1,clade_2,gene,mutations
0,19A,19A,E,[]
1,19A,19A,M,[]
2,19A,19A,N,[]
3,19A,19A,ORF10,[]
4,19A,19A,ORF1a,[]
...,...,...,...,...
4327,22C,22C,ORF6,[]
4328,22C,22C,ORF7a,[]
4329,22C,22C,ORF7b,[]
4330,22C,22C,ORF8,[]
