# Tabulate phenotypes of SARS-CoV-2 spike that might have predictive power in understanding evolution for different mutations and clades

Import Python modules:

In [1]:
import json
import os
import urllib.request

import altair as alt

import numpy

import pandas as pd

import ruamel.yaml as yaml

Read the configuration YAML:

In [2]:
# This cell is tagged as `parameters`, so when the notebook is run with papermill
# (https://papermill.readthedocs.io/en/latest/usage-cli.html) using
# `-p config_yaml <config_yaml_file>`, the default path defined here is replaced
# by the one you pass.
config_yaml = "config.yaml"

In [3]:
# Parse the YAML file named by `config_yaml`; the resulting `config` mapping is
# read by the cells below.
yaml_loader = yaml.YAML()
with open(config_yaml) as f:
    config = yaml_loader.load(f)

## Mutation effects on phenotypes
Read all the specified mutation-level phenotypic effects into a Data Frame.
Write that data frame to a file.
Also create a data frame and file in which the mutation-level phenotypic data are randomized among mutations.

In [4]:
# Columns every phenotype CSV must provide to identify a mutation.
req_cols = ["site", "wildtype", "mutant"]

# One tidy (long-format) data frame per phenotype category; concatenated after the loop.
mutation_phenotypes = []

for phenotype_category, category_d in config["phenotype_csvs"].items():
    pheno_cols = category_d["columns"]
    print(
        f"Reading {phenotype_category} phenotypes from {category_d['csv']}\n  "
        + "\n  ".join(pheno_cols)
    )

    # A phenotype column may not reuse the name of an identifier column.
    if not set(req_cols).isdisjoint(pheno_cols):
        raise ValueError(f"{phenotype_category} columns {pheno_cols} include {req_cols}")

    df = pd.read_csv(category_d["csv"])
    if not set(req_cols + pheno_cols).issubset(df.columns):
        raise ValueError(f"cannot find expected columns for {phenotype_category}")

    # Reshape to one row per (mutation, phenotype) and tag with the category.
    long_df = df.melt(
        id_vars=req_cols,
        value_vars=pheno_cols,
        var_name="phenotype",
        value_name="mutation_effect",
    )
    long_df = long_df.assign(phenotype_category=phenotype_category)
    long_df = long_df[["phenotype_category", "phenotype", *req_cols, "mutation_effect"]]

    # Keep only measured effects, and drop any wildtype to wildtype mutations.
    long_df = long_df.query("mutation_effect.notnull()")
    long_df = long_df.query("wildtype != mutant")

    mutation_phenotypes.append(long_df)

mutation_phenotypes = pd.concat(mutation_phenotypes, ignore_index=True)

# Write the combined mutation-level phenotypes to the CSV path given in the config.
mutation_phenotypes_csv = config["mutation_phenotypes_csv"]
# `os.path.dirname` returns "" for a bare file name and `os.makedirs("")` raises,
# so fall back to the current directory in that case.
os.makedirs(os.path.dirname(mutation_phenotypes_csv) or ".", exist_ok=True)
print(f"\nWriting mutation phenotypes to {mutation_phenotypes_csv}")
mutation_phenotypes.to_csv(mutation_phenotypes_csv, index=False, float_format="%.4g")

# Build randomized controls: within each (phenotype_category, phenotype) group the
# measured effects are permuted among the mutations, once per random seed.
n_randomizations = config["n_randomizations"]
print(f"\nNow randomizing each mutation phenotype {n_randomizations} times")
mutation_phenotypes_randomized = []
for _, df in mutation_phenotypes.groupby(["phenotype_category", "phenotype"]):
    for random_seed in range(n_randomizations):
        # Re-seed so each (group, seed) permutation is reproducible.
        numpy.random.seed(random_seed)
        mutation_phenotypes_randomized.append(
            df
            .assign(
                # `numpy.random.shuffle` works in place and returns None, which would
                # blank out the whole column; `permutation` returns a shuffled copy.
                mutation_effect=numpy.random.permutation(df["mutation_effect"].to_numpy()),
                random_seed=random_seed,
            )
            [["random_seed"] + df.columns.tolist()]
        )
mutation_phenotypes_randomized = pd.concat(mutation_phenotypes_randomized, ignore_index=True)
mutation_phenotypes_randomized_csv = config["mutation_phenotypes_randomized_csv"]
# `os.path.dirname` returns "" for a bare file name and `os.makedirs("")` raises,
# so fall back to the current directory in that case.
os.makedirs(os.path.dirname(mutation_phenotypes_randomized_csv) or ".", exist_ok=True)
print(f"Writing randomized mutation phenotypes to {mutation_phenotypes_randomized_csv}")
mutation_phenotypes_randomized.to_csv(mutation_phenotypes_randomized_csv, index=False, float_format="%.4g")

Reading XBB.1.5 spike pseudovirus DMS phenotypes from data/spike_pseudovirus_DMS_XBB.1.5.csv
  human sera escape
  spike mediated entry
  ACE2 binding
Reading XBB.1.5 RBD yeast-display DMS phenotypes from data/yeast_RBD_DMS_XBB.1.5.csv
  ACE2 affinity
  RBD expression
  escape
Reading EVEscape phenotypes from data/EVEscape_XBB_single_mutation_predictions.csv
  EVEscape
  fitness_evol_indices
  dissimilarity_charge_hydrophobicity
  accessibility_wcn

Writing mutation phenotypes to results/mutation_phenotypes.csv

Now randomizing each mutation phenotype 10 times
Writing randomized mutation phenotypes to results/mutation_phenotypes_randomized.csv


## Read the Pango clades
These come from Cornelius Roemer's GitHub repo ([https://github.com/corneliusroemer/pango-sequences](https://github.com/corneliusroemer/pango-sequences)):

In [5]:
# Download the Pango clade summary JSON from the URL given in the config and
# parse it into `pango_clades`.
pango_json = config["pango_json"]
print(f"Reading Pango clade definitions from {pango_json}")
with urllib.request.urlopen(pango_json) as url:
    pango_clades = json.loads(url.read())
print(f"Read definitions for {len(pango_clades)} clades")

Reading Pango clade definitions from https://raw.githubusercontent.com/corneliusroemer/pango-sequences/main/data/pango-consensus-sequences_summary.json
Read definitions for 3810 clades


Get a data frame of all Pango clades along with other relevant information:

In [27]:
# Per-clade records keyed by clade name. Filled in by the loop further down this
# cell and then converted to a data frame under the same name.
clades_df = {}

def is_descendant_of(clade, ancestor):
    """Return True iff `ancestor` appears among the parents of `clade`.

    Walks up the "parent" entries of the module-level `pango_clades` mapping,
    starting from `clade`, until `ancestor` is found or a clade with no parent
    is reached. A clade is not counted as a descendant of itself.
    """
    current = clade
    while True:
        parent = pango_clades[current]["parent"]
        if parent == ancestor:
            return True
        if not parent:
            # reached the top of the lineage without meeting `ancestor`
            return False
        current = parent

def relative_mutations(clade_muts, reference_muts):
    """Get the mutations in `clade_muts` relative to `reference_muts`.

    Both arguments are lists of `(wildtype, site, mutant)` tuples. The result is
    a list of `(wildtype, site, mutant)` tuples sorted by site, where
    "wildtype" is the state in the reference and "mutant" the state in the clade:

    - mutations present in both lists are dropped;
    - a site mutated differently in both goes from the reference's mutant
      to the clade's mutant;
    - a site mutated only in the clade is kept as is;
    - a site mutated only in the reference is reported as a reversion.
    """
    common = set(clade_muts).intersection(reference_muts)

    # Index the non-shared mutations of each list by site.
    clade_only = {site: (wt, mut) for (wt, site, mut) in clade_muts if (wt, site, mut) not in common}
    ref_only = {site: (wt, mut) for (wt, site, mut) in reference_muts if (wt, site, mut) not in common}

    # Collected as (site, wildtype, mutant) so that sorting orders by site first.
    relative = []
    for site, (clade_wt, clade_mut) in clade_only.items():
        if site not in ref_only:
            relative.append((site, clade_wt, clade_mut))
            continue
        ref_wt, ref_mut = ref_only[site]
        # both lists must agree on the original residue at a site
        assert clade_wt == ref_wt
        relative.append((site, ref_mut, clade_mut))

    # Sites mutated only in the reference: the clade has reverted them.
    relative.extend(
        (site, ref_mut, ref_wt)
        for site, (ref_wt, ref_mut) in ref_only.items()
        if site not in clade_only
    )

    relative.sort()
    return [(wt, site, mut) for (site, wt, mut) in relative]

def parse_spike_muts(clade_d):
    """Parse the spike mutations from the dict for a clade.

    Reads the "aaSubstitutions" and "aaDeletions" lists of `clade_d`, keeps the
    non-empty entries that start with "S:", and returns them as a list of
    `(wildtype, site, mutant)` tuples with `site` converted to an int.
    """
    parsed = []
    for mut in clade_d["aaSubstitutions"] + clade_d["aaDeletions"]:
        if not mut or not mut.startswith("S:"):
            continue
        # e.g. "S:A27S" -> "A27S": first char, integer middle, last char
        aa_change = mut.split(":")[1]
        parsed.append((aa_change[0], int(aa_change[1:-1]), aa_change[-1]))
    return parsed

# Spike mutations of each reference clade that other clades are compared against.
compute_relative_to = config["compute_relative_to"]
compute_relative_to_spike_muts = dict(
    (ref_clade, parse_spike_muts(pango_clades[ref_clade]))
    for ref_clade in compute_relative_to
)

# Build one record per clade: date, parent, mutations relative to each reference
# clade, and a descendant flag for each configured ancestor.
for clade, clade_d in pango_clades.items():
    spike_muts = parse_spike_muts(clade_d)
    record = {
        # empty / missing values become pd.NA
        "date": clade_d["designationDate"] or pd.NA,
        "parent": clade_d["parent"] or pd.NA,
    }
    record.update(
        (
            f"spike_muts_relative_to_{ref_clade}",
            relative_mutations(spike_muts, compute_relative_to_spike_muts[ref_clade]),
        )
        for ref_clade in compute_relative_to
    )
    record.update(
        (f"is_descendant_of_{ancestor}", is_descendant_of(clade, ancestor))
        for ancestor in config["classify_descendants_of"]
    )
    clades_df[clade] = record

# Convert the dict of records into a data frame with the clade name as a column.
clades_df = pd.DataFrame.from_dict(clades_df, orient="index").reset_index(names="clade")

clades_df

Unnamed: 0,clade,date,parent,spike_muts_relative_to_XBB.1.5,spike_muts_relative_to_BA.2.86,is_descendant_of_XBB,is_descendant_of_BA.2,is_descendant_of_BA.2.86
0,A,,,"[(I, 19, T), (-, 24, L), (-, 25, P), (-, 26, P...","[(I, 19, T), (T, 21, R), (-, 24, L), (-, 25, P...",False,False,False
1,A.1,,A,"[(I, 19, T), (-, 24, L), (-, 25, P), (-, 26, P...","[(I, 19, T), (T, 21, R), (-, 24, L), (-, 25, P...",False,False,False
2,A.2,,A,"[(I, 19, T), (-, 24, L), (-, 25, P), (-, 26, P...","[(I, 19, T), (T, 21, R), (-, 24, L), (-, 25, P...",False,False,False
3,A.2.2,,A.2,"[(I, 19, T), (-, 24, L), (-, 25, P), (-, 26, P...","[(I, 19, T), (T, 21, R), (-, 24, L), (-, 25, P...",False,False,False
4,A.2.3,,A.2,"[(I, 19, T), (-, 24, L), (-, 25, P), (-, 26, P...","[(I, 19, T), (T, 21, R), (-, 24, L), (-, 25, P...",False,False,False
...,...,...,...,...,...,...,...,...
3805,XU,2022-04-19,,"[(A, 83, V), (-, 144, Y), (Q, 146, H), (E, 183...","[(T, 21, R), (L, 50, S), (-, 69, H), (-, 70, V...",False,False,False
3806,XV,2022-05-11,,"[(-, 24, L), (-, 25, P), (-, 26, P), (S, 27, A...","[(T, 21, R), (-, 24, L), (-, 25, P), (-, 26, P...",False,False,False
3807,XW,2022-05-12,,"[(A, 83, V), (-, 144, Y), (Q, 146, H), (E, 183...","[(T, 21, R), (L, 50, S), (-, 69, H), (-, 70, V...",False,False,False
3808,XY,2022-05-18,,"[(A, 83, V), (-, 144, Y), (Q, 146, H), (E, 183...","[(T, 21, R), (L, 50, S), (-, 69, H), (-, 70, V...",False,False,False


In [7]:
# Display the parsed configuration for reference.
config

{'phenotype_csvs': {'XBB.1.5 spike pseudovirus DMS': {'csv': 'data/spike_pseudovirus_DMS_XBB.1.5.csv', 'columns': ['human sera escape', 'spike mediated entry', 'ACE2 binding']}, 'XBB.1.5 RBD yeast-display DMS': {'csv': 'data/yeast_RBD_DMS_XBB.1.5.csv', 'columns': ['ACE2 affinity', 'RBD expression', 'escape']}, 'EVEscape': {'csv': 'data/EVEscape_XBB_single_mutation_predictions.csv', 'columns': ['EVEscape', 'fitness_evol_indices', 'dissimilarity_charge_hydrophobicity', 'accessibility_wcn']}}, 'include_hamming_distance': True, 'pango_json': 'https://raw.githubusercontent.com/corneliusroemer/pango-sequences/main/data/pango-consensus-sequences_summary.json', 'compute_relative_to': ['XBB.1.5', 'BA.2.86'], 'classify_descendants_of': ['XBB', 'BA.2', 'BA.2.86'], 'n_randomizations': 10, 'mutation_phenotypes_csv': 'results/mutation_phenotypes.csv', 'mutation_phenotypes_randomized_csv': 'results/mutation_phenotypes_randomized.csv', 'clade_phenotypes_csv': 'results/clade_phenotypes.csv', 'clade_phe