# Process RBD deep mutational scanning
Process the raw measurements into mutation effect estimates.

The next cell is tagged `parameters`, so its values will be replaced by `papermill` parameterization:

In [None]:
# Default path of the input CSV with the raw measurements; read with
# `pd.read_csv` further down in this notebook.
raw_data_csv = "../results/dms/starr_rbd/raw.csv"
# Default path of the output CSV; the processed mutation effects are written
# here with `DataFrame.to_csv` at the end of this notebook.
processed_csv = "../results/dms/starr_rbd/processed.csv"

Import Python modules:

In [None]:
import numpy

import pandas as pd

Read the raw data and convert to mutation effects.
We do the following to average across the homologs, as we have measurements in several different homolog (clade) backgrounds:

 - For Wuhan-Hu-1, only keep the `Wuhan-Hu-1_v2` but not the `Wuhan-Hu-1_v1` dataset.
 - We call the Wuhan-Hu-1 wildtype identity the reference wildtype.
 - For any homolog that has a wildtype different than the reference wildtype at a site, we compute the effect of the mutation (on binding and expression) from the reference wildtype to the homolog wildtype at that site.
 - We then adjust the measured mutational effects on binding and expression to be relative to the reference wildtype by adding, to the effect of each mutation measured in the homolog, the effect of the mutation from the reference wildtype to the homolog wildtype at that site.
 - We then average the binding and expression effects of mutations across all homologs.
 - We compute an "effect" of mutations as just the average of the effect on binding and expression.

In [None]:
# Read the raw per-homolog measurements. The pipeline below uses the columns
# `target`, `position`, `wildtype`, `mutant`, `delta_bind` and `delta_expr`.
raw_data = pd.read_csv(raw_data_csv)

processed_data = (
    raw_data
    # For Wuhan-Hu-1 only the v2 dataset is kept; drop v1 so it is not
    # averaged in alongside the other homologs.
    .query("target != 'Wuhan-Hu-1_v1'")
    # Attach the reference (Wuhan-Hu-1_v2) wildtype identity at each position.
    .merge(
        (
            raw_data
            .query("target == 'Wuhan-Hu-1_v2'")
            [["position", "wildtype"]]
            .drop_duplicates()
            .rename(columns={"wildtype": "ref_wildtype"})
        ),
        on="position",
        how="left",
        validate="many_to_one",
    )
    .rename(columns={"position": "site"})
    # On the row where the mutant is the reference wildtype, the negated
    # measurement is the effect of going from the reference wildtype to this
    # homolog's wildtype. Use float NaN (not `pd.NA`) for all other rows so
    # the columns stay float dtype rather than becoming object dtype.
    .assign(
        ref_to_wildtype_bind=lambda x: numpy.where(
            x["mutant"] == x["ref_wildtype"], -x["delta_bind"], numpy.nan,
        ),
        ref_to_wildtype_expr=lambda x: numpy.where(
            x["mutant"] == x["ref_wildtype"], -x["delta_expr"], numpy.nan,
        ),
    )
    .assign(
        # Broadcast the single non-null value to every row of the same
        # (target, site) group: max skips NaN, and there is just one non-null.
        ref_to_wildtype_bind=lambda x: (
            x.groupby(["target", "site"])
            ["ref_to_wildtype_bind"]
            .transform("max")
        ),
        ref_to_wildtype_expr=lambda x: (
            x.groupby(["target", "site"])
            ["ref_to_wildtype_expr"]
            .transform("max")
        ),
        # Re-express each measured effect relative to the reference wildtype.
        # These lambdas see the broadcast columns assigned just above.
        delta_bind=lambda x: x["delta_bind"] + x["ref_to_wildtype_bind"],
        delta_expr=lambda x: x["delta_expr"] + x["ref_to_wildtype_expr"],
    )
    # From here on "wildtype" means the reference wildtype.
    .drop(columns="wildtype")
    .rename(columns={"ref_wildtype": "wildtype"})
    .query("wildtype != mutant")
    # Average each mutation's effects across all homologs.
    .groupby(["wildtype", "site", "mutant"], as_index=False)
    .aggregate({"delta_bind": "mean", "delta_expr": "mean"})
    # Overall effect is the mean of the binding and expression effects.
    .assign(effect=lambda x: (x["delta_bind"] + x["delta_expr"]) / 2)
    # NOTE(review): these keys contain spaces and so match no column (the
    # columns are `delta_bind` / `delta_expr`); `rename` silently ignores
    # missing labels, so this is currently a no-op. Left unchanged so the
    # output CSV schema stays as it is -- confirm whether downstream readers
    # expect the underscore names before fixing the keys.
    .rename(
        columns={
            "delta bind": "effect on ACE2 affinity",
            "delta expr": "effect on RBD expression"
        }
    )
)

# Sanity check: exactly one row per (site, mutant) after averaging.
assert len(processed_data) == len(processed_data.groupby(["site", "mutant"])), (
    "expected exactly one row per (site, mutant)"
)

In [None]:
# Write the processed effects without the index column, formatting floats
# with 5 significant digits.
processed_data.to_csv(
    path_or_buf=processed_csv,
    float_format="%.5g",
    index=False,
)