# Real serum example
In this notebook we fit real data from a deep mutational scanning experiment that used an anti-HIV serum sample, IDC508.

First, import the Python modules:

In [1]:
import requests
import tempfile
import time

import altair as alt

import polyclonal
import polyclonal.pdb_utils

import pandas as pd

## Get the data to fit
Now we read the deep mutational scanning measurements, which quantify the "probability of escape" (fraction not neutralized) for each variant. For a description of key columns, see [here](https://jbloomlab.github.io/polyclonal/real_LyCoV1404.html). 
We have data for two libraries against the same serum:

In [2]:
# CSV of probability-of-escape measurements for each library, keyed by the
# library name that gets attached to its rows.
prob_escape_files = {
    "libA": "A_2022-10-17_rescue-4_IDC508_1_prob_escape.csv",
    "libB": "B_2022-09-27_rescue-3_IDC508_1_prob_escape.csv",
}

# Read one frame per library. With `keep_default_na=False` and
# `na_values="nan"`, only the literal string "nan" is parsed as missing, so
# empty strings are kept as empty strings rather than turned into NaN.
library_frames = []
for library_name, csv_file in prob_escape_files.items():
    library_frame = pd.read_csv(csv_file, keep_default_na=False, na_values="nan")
    library_frames.append(library_frame.assign(library=library_name))

# Neither name in the query has an `@` prefix, so pandas resolves both as
# columns: each row is kept only if its own `no-antibody_count` reaches that
# row's `no_antibody_count_threshold`.
sufficient_counts = "`no-antibody_count` >= no_antibody_count_threshold"
prob_escape = pd.concat(library_frames, ignore_index=True).query(sufficient_counts)

# Sanity check: no missing values remain anywhere in the filtered data.
assert prob_escape.notnull().all().all()

Display the number of variants per concentration.

In [3]:
# Count the distinct barcodes measured for each library at each antibody
# concentration, and show the result as a table.
variants_per_concentration = prob_escape.groupby(
    ["library", "antibody_concentration"]
).agg(n_variants=("barcode", "nunique"))

display(variants_per_concentration)

Unnamed: 0_level_0,Unnamed: 1_level_0,n_variants
library,antibody_concentration,Unnamed: 2_level_1
libA,1.0,17778
libA,2.0,17778
libA,3.0,17778
libB,1.0,15476
libB,2.0,15476
libB,3.0,15476


Plot mean probability of escape across all variants with the indicated number of mutations. Note that this plot weights each variant the same in the means regardless of how many barcode counts it has. We plot means for both censored (set to between 0 and 1) and uncensored probabilities of escape. Also, note it uses a symlog scale for the y-axis. Mouseover points for values:

In [4]:
# NBVAL_IGNORE_OUTPUT
max_aa_subs = 4  # group if >= this many substitutions

# Lookup from (clipped) substitution count to its display label: counts below
# `max_aa_subs` are shown as-is, and the clipped top bin is shown as e.g. ">3".
n_subs_labels = {count: str(count) for count in range(max_aa_subs)}
n_subs_labels[max_aa_subs] = f">{max_aa_subs - 1}"

# Mean censored and uncensored probability of escape for every combination of
# concentration, library, and substitution-count bin. Each variant contributes
# equally to the mean. The output columns are named directly with the labels
# used in the plot.
mean_by_n_subs = (
    prob_escape.assign(
        # number of whitespace-separated substitutions, capped at `max_aa_subs`
        n_subs=prob_escape["aa_substitutions_sequential"]
        .str.split()
        .map(len)
        .clip(upper=max_aa_subs)
        .map(n_subs_labels)
    )
    .groupby(["antibody_concentration", "library", "n_subs"], as_index=False)
    .agg(
        **{
            "censored to [0, 1]": ("prob_escape", "mean"),
            "not censored": ("prob_escape_uncensored", "mean"),
        }
    )
)

# Reshape to long form: one row per (concentration, n_subs, library, censoring).
mean_prob_escape = mean_by_n_subs.melt(
    id_vars=["antibody_concentration", "n_subs", "library"],
    var_name="censored",
    value_name="probability escape",
)

# Tooltip entries for every column: float columns are formatted to three
# significant figures, all other columns are shown unformatted.
chart_tooltips = []
for column in mean_prob_escape.columns:
    if mean_prob_escape[column].dtype == float:
        chart_tooltips.append(alt.Tooltip(column, format=".3g"))
    else:
        chart_tooltips.append(column)

# The symlog y-scale keeps values at or below zero plottable while still
# spreading out small probabilities.
escape_axis = alt.Y(
    "probability escape",
    scale=alt.Scale(type="symlog", constant=0.05),
)

# Line plot of mean escape versus concentration, faceted by library (rows)
# and censoring (columns), with one colored line per substitution-count bin.
mean_prob_escape_chart = (
    alt.Chart(mean_prob_escape)
    .mark_line(point=True, size=0.5)
    .encode(
        x=alt.X("antibody_concentration"),
        y=escape_axis,
        column=alt.Column("censored", title=None),
        row=alt.Row("library", title=None),
        color=alt.Color("n_subs", title="n substitutions"),
        tooltip=chart_tooltips,
    )
    .properties(width=200, height=125)
    .configure_axis(grid=False)
)

mean_prob_escape_chart

  for col_name, dtype in df.dtypes.iteritems():


Read the spatial distances to use for regularization:

In [5]:
# Structure file and the chains passed as `target_chains` for the distance
# calculation.
env_structure_file = "Env_trimer.pdb"
env_target_chains = ["A", "B", "C"]

# Inter-residue distances from the structure; the displayed table has one row
# per site pair with the distance and the chain each site was taken from.
spatial_distances = polyclonal.pdb_utils.inter_residue_distances(
    env_structure_file, target_chains=env_target_chains
)

spatial_distances

Unnamed: 0,site_1,site_2,distance,chain_1,chain_2
0,32,33,1.330031,C,C
1,32,34,3.524719,C,C
2,32,35,5.966030,A,A
3,32,36,9.465096,A,A
4,32,37,10.110476,C,C
...,...,...,...,...,...
159890,662,664,3.405216,C,C
159891,662,701,29.860090,B,A
159892,663,664,1.328199,A,A
159893,663,701,26.849056,A,C


Now fit the model to each replicate for the serum, tuning the regularization parameters to get a better fit:

In [6]:
# NBVAL_IGNORE_OUTPUT

# Sites (in reference numbering) passed to the model via `sites`.
reference_sites = pd.read_csv("Env_site_numbering_map.csv")["reference_site"].tolist()

# Column renames applied to the data before it is handed to the model
# (presumably the names `polyclonal.Polyclonal` expects; confirm in its docs).
model_column_names = {
    "antibody_concentration": "concentration",
    "aa_substitutions_reference": "aa_substitutions",
}

# Keyword arguments for `model.fit`: log every 200 steps, plus the
# regularization weights chosen for this serum.
fit_settings = dict(
    logfreq=200,
    reg_escape_weight=0.1,
    reg_uniqueness_weight=0,
    reg_uniqueness2_weight=1,
    reg_spatial_weight=0.0,
    reg_spatial2_weight=0.0005,
)

# Fit a separate two-epitope model to each library's measurements.
for library, df in prob_escape.groupby("library"):
    print(f"\n\nFitting for {library}")

    model = polyclonal.Polyclonal(
        n_epitopes=2,
        data_to_fit=df.rename(columns=model_column_names),
        alphabet=polyclonal.AAS_WITHSTOP_WITHGAP,
        sites=reference_sites,
        spatial_distances=spatial_distances,
    )

    # fit model
    opt_res = model.fit(**fit_settings)

    # display results: wildtype activities, then per-mutation escape with a
    # `times_seen` slider initialized at 3
    display(model.activity_wt_barplot())
    display(model.mut_escape_plot(addtl_slider_stats={"times_seen": 3}))



Fitting for libA
# First fitting site-level model.
# Starting optimization of 1682 parameters at Sun Nov 13 05:55:56 2022.
        step    time_sec        loss    fit_loss  reg_escape  reg_spread reg_spatial reg_uniqueness reg_uniqueness2 reg_activity
           0     0.05512      1215.6      1213.6           0           0           0              0               0       2.0059
          60       3.372      904.54      889.22      4.0825           0      9.4639              0         0.35786       1.4207
# Successfully finished at Sun Nov 13 05:55:59 2022.
# Starting optimization of 12628 parameters at Sun Nov 13 05:55:59 2022.
        step    time_sec        loss    fit_loss  reg_escape  reg_spread reg_spatial reg_uniqueness reg_uniqueness2 reg_activity
           0     0.13282      1262.3      1177.1      31.416  1.9765e-32      9.4639              0          42.897       1.4207
         200      54.822      1132.1      1087.1      29.769     0.79351      9.9782              0     

  for col_name, dtype in df.dtypes.iteritems():


  for col_name, dtype in df.dtypes.iteritems():




Fitting for libB
# First fitting site-level model.
# Starting optimization of 1684 parameters at Sun Nov 13 05:57:28 2022.
        step    time_sec        loss    fit_loss  reg_escape  reg_spread reg_spatial reg_uniqueness reg_uniqueness2 reg_activity
           0    0.056544      378.53      376.53           0           0           0              0               0       2.0059
         179      10.813      295.99      285.58      3.1997           0      4.7397              0        0.095018       2.3777
# Successfully finished at Sun Nov 13 05:57:38 2022.
# Starting optimization of 12288 parameters at Sun Nov 13 05:57:39 2022.
        step    time_sec        loss    fit_loss  reg_escape  reg_spread reg_spatial reg_uniqueness reg_uniqueness2 reg_activity
           0      0.1359      406.55      370.03      25.195  2.0631e-32      4.7397              0          4.2116       2.3777
         200      54.465      361.59       331.8      19.667     0.94181      5.4303              0     

  for col_name, dtype in df.dtypes.iteritems():


  for col_name, dtype in df.dtypes.iteritems():
