# Tabulate phenotypes of SARS-CoV-2 spike that might have predictive power in understanding evolution for different mutations and clades

Import Python modules:

In [1]:
import json
import os

import altair as alt

import numpy

import pandas as pd

import ruamel.yaml as yaml

Read the configuration YAML:

In [2]:
# Path of the configuration YAML file that is opened and parsed below.
#
# This cell is tagged as parameters, so if you run the notebook using papermill
# (https://papermill.readthedocs.io/en/latest/usage-cli.html) with
# `-p config_yaml <config_yaml_file>` then the file defined here will be replaced
# with the one you pass.
config_yaml = "config.yaml"

In [3]:
# Parse the configuration YAML into `config`, which is indexed by key below.
with open(config_yaml) as config_file:
    config = yaml.YAML().load(config_file)

Read all the specified mutation-level phenotypic data into a Data Frame.
Write that data frame to a file.
Also create a data frame and a file in which the mutation-level phenotypic data are randomized among mutations.

In [5]:
# Columns that identify a mutation. Every phenotype CSV must contain them, and
# none of them may also be listed as a phenotype column.
req_cols = ["site", "wildtype", "mutant"]
tidy_cols = ["phenotype_category", "phenotype", *req_cols, "mutation_effect"]

# One tidy data frame per phenotype category, concatenated after the loop.
category_frames = []
for phenotype_category, category_d in config["phenotype_csvs"].items():
    pheno_cols = category_d["columns"]
    print(
        f"Reading {phenotype_category} phenotypes from {category_d['csv']}\n  "
        + "\n  ".join(pheno_cols)
    )

    # Refuse phenotype columns whose names collide with the identifier columns.
    if not set(req_cols).isdisjoint(pheno_cols):
        raise ValueError(f"{phenotype_category} columns {pheno_cols} include {req_cols}")

    wide = pd.read_csv(category_d["csv"])
    if set(req_cols + pheno_cols) - set(wide.columns):
        raise ValueError(f"cannot find expected columns for {phenotype_category}")

    # Reshape from one column per phenotype to one row per (mutation, phenotype).
    tidy = wide.melt(
        id_vars=req_cols,
        value_vars=pheno_cols,
        var_name="phenotype",
        value_name="mutation_effect",
    )
    tidy["phenotype_category"] = phenotype_category
    tidy = tidy[tidy_cols]

    # Keep only measured effects, and drop any wildtype to wildtype mutations.
    has_effect = tidy["mutation_effect"].notnull()
    is_mutation = tidy["wildtype"] != tidy["mutant"]
    category_frames.append(tidy[has_effect & is_mutation])

mutation_phenotypes = pd.concat(category_frames, ignore_index=True)

# Write the combined mutation-level phenotypes to the CSV named in the config.
mutation_phenotypes_csv = config["mutation_phenotypes_csv"]
# `os.path.dirname` returns "" for a bare file name and `os.makedirs("")` raises
# FileNotFoundError, so only create the output directory when the path has one.
mutation_phenotypes_dir = os.path.dirname(mutation_phenotypes_csv)
if mutation_phenotypes_dir:
    os.makedirs(mutation_phenotypes_dir, exist_ok=True)
print(f"\nWriting mutation phenotypes to {mutation_phenotypes_csv}")
mutation_phenotypes.to_csv(mutation_phenotypes_csv, index=False, float_format="%.4g")

# Build randomized controls: within each (phenotype_category, phenotype) group,
# shuffle the mutation effects among that group's mutations, once per random
# seed in `range(n_randomizations)`. The result has the same columns as
# `mutation_phenotypes`, preceded by a `random_seed` column.
n_randomizations = config["n_randomizations"]
print(f"\nNow randomizing each mutation phenotype {n_randomizations} times")
randomized_cols = ["random_seed", *mutation_phenotypes.columns]
randomized_dfs = []
for _, df in mutation_phenotypes.groupby(["phenotype_category", "phenotype"]):
    effects = df["mutation_effect"].to_numpy()
    for random_seed in range(n_randomizations):
        # Re-seed for every replicate so each randomization is reproducible.
        numpy.random.seed(random_seed)
        randomized_dfs.append(
            df
            .assign(
                # `numpy.random.permutation` returns a shuffled copy.
                # `numpy.random.shuffle` must not be used here: it shuffles in
                # place and returns None, which would blank out the column.
                mutation_effect=numpy.random.permutation(effects),
                random_seed=random_seed,
            )
            [randomized_cols]
        )
mutation_phenotypes_randomized = pd.concat(randomized_dfs, ignore_index=True)

# Write the randomized phenotypes to the CSV named in the config.
mutation_phenotypes_randomized_csv = config["mutation_phenotypes_randomized_csv"]
# `os.makedirs("")` raises FileNotFoundError, so skip directory creation when
# the configured path is a bare file name with no directory component.
mutation_phenotypes_randomized_dir = os.path.dirname(mutation_phenotypes_randomized_csv)
if mutation_phenotypes_randomized_dir:
    os.makedirs(mutation_phenotypes_randomized_dir, exist_ok=True)
print(f"\nWriting randomized mutation phenotypes to {mutation_phenotypes_randomized_csv}")
mutation_phenotypes_randomized.to_csv(mutation_phenotypes_randomized_csv, index=False, float_format="%.4g")

Reading XBB.1.5 spike pseudovirus DMS phenotypes from data/spike_pseudovirus_DMS_XBB.1.5.csv
  human sera escape
  spike mediated entry
  ACE2 binding
Reading XBB.1.5 RBD yeast-display DMS phenotypes from data/yeast_RBD_DMS_XBB.1.5.csv
  ACE2 affinity
  RBD expression
  escape
Reading EVEscape phenotypes from data/EVEscape_XBB_single_mutation_predictions.csv
  EVEscape
  fitness_evol_indices
  dissimilarity_charge_hydrophobicity
  accessibility_wcn

Writing mutation phenotypes to results/mutation_phenotypes.csv

Now randomizing each mutation phenotype 10 times

Writing randomized mutation phenotypes to results/mutation_phenotypes_randomized.csv
