# Plot "clock" for SARS-CoV-2 clades: number of mutations relative to Wuhan-Hu-1 versus clade designation date

In [1]:
import json
import urllib
import urllib.request

import altair as alt
import pandas as pd

# Lift Altair's cap on the number of data rows a chart may embed so the full
# clade tables can be plotted; assigning to `_` keeps the return value from
# being echoed as cell output.
_ = alt.data_transformers.disable_max_rows()

Get latest Pango clade JSON from Cornelius Roemer's repo:

In [2]:
# Summary JSON of Pango consensus sequences (one record per Pango clade).
url_link = "https://raw.githubusercontent.com/corneliusroemer/pango-sequences/main/data/pango-consensus-sequences_summary.json"

# Use an explicit timeout so a stalled connection raises instead of hanging
# the notebook forever during "Restart & Run All".
with urllib.request.urlopen(url_link, timeout=60) as response:
    pango_data = json.load(response)

# Build one row per Pango clade with its designation date and mutation counts.
pango_df = (
    pd.DataFrame.from_dict(pango_data, orient="index")
    .rename(columns={"lineage": "Pango clade", "nextstrainClade": "Nextstrain clade", "designationDate": "date"})
    # Drop records with an empty designation date; they cannot be placed on
    # the time axis (presumably clades without a recorded designation).
    .query("date != ''")
    .assign(
        # Counts are the number of entries in each substitution collection
        # (assumes these fields are lists of mutation strings -- TODO confirm).
        nt_mutations=lambda df: df["nucSubstitutions"].map(len),
        aa_mutations=lambda df: df["aaSubstitutions"].map(len),
        # Spike mutations are the amino-acid entries prefixed with "S:".
        spike_aa_mutations=lambda df: df["aaSubstitutions"].map(
            lambda muts: sum(mut.startswith("S:") for mut in muts)
        ),
        # Relies on `assign` evaluating keyword arguments in order, so the
        # two columns computed above are already available here.
        nonspike_aa_mutations=lambda df: df["aa_mutations"] - df["spike_aa_mutations"],
        date=lambda df: pd.to_datetime(df["date"]),
    )
    .sort_values("date")
    .reset_index(drop=True)
    [["date", "Pango clade", "Nextstrain clade", "nt_mutations", "spike_aa_mutations", "nonspike_aa_mutations"]]
)

pango_df

Unnamed: 0,date,Pango clade,Nextstrain clade,nt_mutations,spike_aa_mutations,nonspike_aa_mutations
0,2021-02-18,A.27,19B,24,7,7
1,2021-02-18,A.28,19B,20,2,9
2,2021-02-21,R.2,20B,22,5,8
3,2021-02-21,P.2,20B,17,3,7
4,2021-02-21,P.1,20J,33,12,10
...,...,...,...,...,...,...
2918,2023-12-02,KF.2,23D,111,41,37
2919,2023-12-02,HK.34.1,23F,115,42,34
2920,2023-12-02,KE.3,22F,111,43,35
2921,2023-12-02,HV.1.5,23F,109,43,33


Also make a Nextstrain clade data frame that, for each Nextstrain clade containing more than 10 Pango clades, gives the median of each value (date and mutation counts) across the Pango clades it contains:

In [3]:
# Per-Nextstrain-clade view of the Pango clade names, reused for both the
# member count and the first-listed member (rows are in `pango_df` order).
pango_by_nextstrain = pango_df.groupby("Nextstrain clade")["Pango clade"]

# Collapse Pango clades to one row per Nextstrain clade: keep only Nextstrain
# clades with more than 10 Pango clades and take the median of every
# remaining column within each clade.
nextstrain_df = (
    pango_df
    .assign(
        n_Pango_clades=pango_by_nextstrain.transform("count"),
        first_Pango_clade=pango_by_nextstrain.transform("first"),
    )
    .loc[lambda df: df["n_Pango_clades"] > 10]
    .drop(columns=["Pango clade"])
    .groupby(["Nextstrain clade", "first_Pango_clade"], as_index=False)
    .agg("median")
)
nextstrain_df

Unnamed: 0,Nextstrain clade,first_Pango_clade,date,nt_mutations,spike_aa_mutations,nonspike_aa_mutations,n_Pango_clades
0,20A,B.1.214.1,2021-03-15 00:00:00,19.0,2.0,8.0,117.0
1,20B,R.2,2021-03-15 00:00:00,12.0,1.0,5.0,231.0
2,20C,B.1.595.1,2021-03-15 00:00:00,12.0,1.0,6.0,74.0
3,20D,C.36.2,2021-03-15 00:00:00,15.5,1.5,6.5,30.0
4,20E,B.1.177.44,2021-03-15 00:00:00,17.0,2.0,3.5,80.0
5,20J,P.1,2021-08-12 00:00:00,36.0,12.0,11.0,23.0
6,21A,B.1.617.2,2021-09-02 00:00:00,34.0,8.0,19.0,21.0
7,21I,AY.2,2021-10-22 00:00:00,33.0,9.0,17.0,40.0
8,21J,AY.1,2021-11-04 00:00:00,37.0,9.0,22.0,183.0
9,21K,BA.1,2022-02-23 00:00:00,57.0,31.0,17.0,56.0


Now make plots for both Pango and Nextstrain clades:

In [4]:
# Wide-format count columns mapped to the labels shown as facet headers.
melt_cols = {
    "nt_mutations": "all nucleotide mutations",
    "spike_aa_mutations": "spike amino-acid mutations",
    "nonspike_aa_mutations": "non-spike amino-acid mutations",
}

# Relative marker scale per clade type; it multiplies both point size and
# fill opacity below.
point_scales = {"Pango": 1, "Nextstrain": 2}

# One interactive chart (displayed inline and saved as HTML) per clade type.
clade_frames = {"Pango": pango_df, "Nextstrain": nextstrain_df}

for clade_type, wide_df in clade_frames.items():

    # Reshape to long form: one row per (clade, mutation type) pair, keeping
    # every other column as an identifier so it can appear in the tooltip.
    id_cols = [col for col in wide_df.columns if col not in melt_cols]
    tidy_df = (
        wide_df
        .rename(columns=melt_cols)
        .melt(
            id_vars=id_cols,
            value_vars=list(melt_cols.values()),
            var_name="mutation type",
            value_name="mutations from Wuhan-Hu-1",
        )
    )

    # Hovering a point selects its clade; with empty=False nothing is
    # highlighted until the pointer is over a point.
    clade_selection = alt.selection_point(
        fields=[f"{clade_type} clade"],
        on="mouseover",
        empty=False,
    )

    point_scale = point_scales[clade_type]

    chart_title = alt.TitleParams(
        f"Mutations of different types relative to Wuhan-Hu-1 for SARS-CoV-2 {clade_type} clades",
        subtitle="The plot is interactive, mouseover points for clade details",
        fontSize=16,
        fontWeight="normal",
        dy=-10,
    )

    chart = (
        alt.Chart(tidy_df)
        .mark_circle(stroke="red", strokeOpacity=1, fill="black", fillOpacity=0.25 * point_scale)
        .encode(
            x=alt.X("date", axis=alt.Axis(titleFontSize=13, format="%b-%Y", labelAngle=-90)),
            y=alt.Y("mutations from Wuhan-Hu-1", axis=alt.Axis(titleFontSize=13)),
            column=alt.Column(
                "mutation type",
                title=None,
                header=alt.Header(labelFontStyle="bold", labelFontSize=14),
                sort="descending",
            ),
            # The selected clade gets a visible red outline and a larger marker.
            strokeWidth=alt.condition(clade_selection, alt.value(2), alt.value(0)),
            size=alt.condition(clade_selection, alt.value(70 * point_scale), alt.value(45 * point_scale)),
            tooltip=list(tidy_df.columns),
        )
        .add_params(clade_selection)
        # Each mutation-type panel gets its own y-axis range.
        .resolve_scale(y="independent")
        .configure_axis(grid=False)
        .properties(width=210, height=210, title=chart_title)
    )

    display(chart)

    outfile = f"{clade_type}.html"
    chart.save(outfile)
        