# Binding affinity and expression from DMS of anti-CGG antibodies

This notebook contains code to make interactive heatmaps for binding, expression, and polyspecificity measured in the deep mutational scanning (DMS) of anti-CGG antibodies as part of a collaboration with the Matsen and Victora labs.

Most of this code was inspired by, and in some cases repurposed from, Sarah Hilton's work found [here](https://github.com/jbloomlab/SARS-CoV-2-RBD_DMS/blob/master/interactive_heatmap.ipynb).

To get this to work I had to use the following environment build:
```
mamba create --name Ab-CGGnaive-docs pandas=1.4.4 altair=4.1 jsonschema=3.2 python=3.8 notebook git-lfs
```

In [1]:
import itertools 
import pandas as pd
import numpy as np
import altair as alt

# Altair refuses to embed datasets above 5000 rows by default; lift that
# limit so the full variant-score table can be embedded in the charts.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Process the data

Import and format the data for the interactive `Altair` heat maps. 

In [2]:
# ## == Filepaths == ##  

# Input: CSV of per-mutation variant scores (loaded with `pd.read_csv`)
variant_scores_path = "results/final_variant_scores/final_variant_scores.csv"

# Output: HTML file that the final interactive plot is saved to
output_html = "docs/_includes/heatmap.html"

In [3]:
# Load the raw variant-score table and preview its first rows
scores_df = pd.read_csv(variant_scores_path)
scores_df.head()

Unnamed: 0,target,wildtype,position,position_IMGT,chain,annotation,mutant,mutation,codon,single_nt,bind_CGG,delta_bind_CGG,n_bc_bind_CGG,n_libs_bind_CGG,expr,delta_expr,n_bc_expr,n_libs_expr
0,CGG_naive,E,1,1.0,H,FWRH1,A,E1(H)A,GAG,True,8.74239,0.11933,20,2,10.45318,-0.00246,20,2
1,CGG_naive,E,1,1.0,H,FWRH1,C,E1(H)C,GAG,False,8.60813,-0.01492,22,2,10.33348,-0.12216,22,2
2,CGG_naive,E,1,1.0,H,FWRH1,D,E1(H)D,GAG,True,8.63554,0.01249,18,2,10.50438,0.04874,18,2
3,CGG_naive,E,1,1.0,H,FWRH1,E,E1(H)E,GAG,True,8.62305,0.0,23285,2,10.45565,0.0,23285,2
4,CGG_naive,E,1,1.0,H,FWRH1,F,E1(H)F,GAG,False,8.75738,0.13433,29,2,10.34185,-0.11379,29,2


In [4]:
# Per-mutation expression and binding scores, reshaped for the heatmap plots.
# Three columns are renamed so that the plot-facing names read well.
scores_df = (
    pd.read_csv(variant_scores_path)
    .rename(
        columns={
            "position": "site",
            "delta_expr": "Expression",   # rename for the selection menus
            "delta_bind_CGG": "Binding",  # rename for the selection menus
        }
    )
)

# Drop the unused columns - shrinks the size of the altair plot
columns_to_keep = {
    'target', 'wildtype', 'site', 'mutant',
    'Expression', 'Binding',  # 'Polyspecificty',
    'n_bc_expr', 'n_bc_bind_CGG',
    'chain', 'annotation', 'mutation', 'position_IMGT',
}
scores_df = scores_df.drop(
    columns=[column for column in scores_df.columns
             if column not in columns_to_keep]
)

# Set a character, `x`, to appear in the wildtype cells (mutant == wildtype)
# and an empty string everywhere else. The comparison is vectorized rather
# than a row-wise `apply` with positional `x[0]` / `x[1]` lookups, which
# pandas deprecates for label-indexed rows and which is slow on large tables.
scores_df['wildtype_code'] = np.where(
    scores_df['wildtype'] == scores_df['mutant'], 'x', ''
)

# Format the x-axis labels. Two label styles are built per row:
#   IMGT_Site  -> "<site> (<IMGT position>)"
#   Chain_Site -> "<site> (<chain>)"
scores_df = (
    scores_df
    .assign(
        # Rows without an IMGT position are labelled "link"
        # (presumably linker residues - confirm against the input table).
        IMGT_Site_tmp=lambda df: df['position_IMGT'].map(
            lambda pos: "link" if pd.isna(pos) else str(int(pos))
        )
    )
    .assign(
        IMGT_Site=lambda df: [
            f"{site} ({IMGT})"
            for IMGT, site in zip(df['IMGT_Site_tmp'], df['site'])
        ]
    )
    .assign(
        Chain_Site=lambda df: [
            f"{site} ({chain})"
            for chain, site in zip(df['chain'], df['site'])
        ]
    )
)



## Define plot-wide parameters

Define the parameters that carry through to multiple plots for easy adjustment. 

In [5]:
# Shared width, used by the zoom bar
width = 1000

# Shared height, used by each heatmap
height = 300

# First and last site in the data, plus the tick spacing for the zoom bar x axis
min_site = min(set(scores_df['site']))
max_site = max(set(scores_df['site']))
x_axis_spacing = 5

# Top-to-bottom order of the amino acids (and stop codon `*`) on the y axis
aa_order = list("RKHDEQNSTYWFAILMVGPC*")

# Tooltip fields shown when hovering a heatmap cell: (field shorthand, display options)
# A ('Polyspecificty:Q', "Change in Polyspecificty", ".2f") entry is currently left out.
heatmap_tooltips = [
    alt.Tooltip(field, **options)
    for field, options in (
        ('target:N', {'title': "Variant"}),
        ('mutation:N', {'title': "Mutation"}),
        ('Expression:Q', {'title': "Change in Expression", 'format': ".2f"}),
        ('Binding:Q', {'title': "Change in CGG Binding Affinity", 'format': ".2f"}),
        ('annotation:N', {'title': "Annotation"}),
        ('chain:N', {'title': ""}),
    )
]



## Define selections for plots 

Define the selection objects that define interaction. Many of these are shared between plots and datasets, so it's helpful to define these at the top. 

In [6]:
# Interval brush along the x axis (black outline) used to pick a range of sites
zoom_selection = alt.selection_interval(
    encodings=['x'],
    mark=alt.BrushConfig(stroke='black', strokeWidth=2),
)

# Single-cell selection that follows the mouse; empty when no cell is hovered
amino_acid_selection = alt.selection_single(
    encodings=['x', 'y'],
    on='mouseover',
    empty='none',
)

# Drop down that chooses which x-axis label style is displayed:
# the IMGT-numbered label or the chain-name label
metric_dropdown = alt.binding_select(
    options=['IMGT_Site', 'Chain_Site'],
    labels=['IMGT Position', 'Chain Name'],
    name="Select x-axis annotation: ",
)
metric_selection = alt.selection_single(
    fields=['metric'],
    bind=metric_dropdown,
    init={'metric': 'IMGT_Site'},
)



## Define the plot objects

The final plot is composed of multiple **heatmaps** that display the binding and expression measurements from the anti-CGG antibody DMS. 

### Zoom Bar

In [7]:
## == Zoom bar for the heatmap plot == ##
# One light-gray rectangle per unique site; brushing over the bar drives
# `zoom_selection`.
zoom_bar = (
    alt.Chart(scores_df[['site']].drop_duplicates())
    .mark_rect(color='lightgray')
    .encode(
        x=alt.X(
            'site:O',
            title=None,
            # `range` excludes its stop value, so stop at `max_site + 1` to
            # keep a tick on the last site when it falls on the spacing grid.
            axis=alt.Axis(
                values=list(range(min_site, max_site + 1, x_axis_spacing))
            ),
        )
    )
    .add_selection(zoom_selection)
    .properties(
        width=width,
        height=15,
        title="site zoom bar",
    )
)

# Display the zoom bar on its own in the notebook
zoom_bar


### Heatmaps

In [8]:
## == Heatmaps plots with annotations == ## 
def heatmap(data, metric: str, reverse_scale: bool = False, title_prefix: str = ""):
    """
    Build one layered, interactive heatmap of `metric` (site x mutant amino acid).

    The chart is made of three layers sharing one base encoding:
    colored cells for `metric`, semi-transparent grey cells where `metric`
    is not valid, and a text layer showing `wildtype_code`.

    Relies on the module-level `aa_order`, `heatmap_tooltips`, `height`,
    `amino_acid_selection`, `zoom_selection` and `metric_selection`.

    Parameters
    ----------
    data
        Table handed to `alt.Chart`. The encodings and transforms below
        reference the columns `IMGT_Site`, `Chain_Site`, `site`, `mutant`,
        `wildtype_code`, the column named by `metric`, and the fields
        listed in `heatmap_tooltips`.
    metric
        Name of the column used to color the cells; also used in the title.
    reverse_scale
        Passed to the color scale's `reverse` option.
    title_prefix
        Text placed in front of `metric` in the chart title.

    Returns
    -------
    The layered chart with the three selections added, filtered by
    `zoom_selection`, with `height` and the title applied.
    """
    
    # Define the input dataset once in the base plot
    heatmapbase = (
        alt.Chart(data)
        # Fold the two x-axis label columns into long form: `metric` holds the
        # source column name, `measurement` holds the label text
        .transform_fold(
            ['IMGT_Site', 'Chain_Site'],
            as_=['metric', 'measurement']
        )
        .encode(
            # x: the label text, ordered by the `site` field
            x=alt.X(
                'measurement:O',
                axis=alt.Axis(titleFontSize=15),
                sort=alt.EncodingSortField(field="site", order ='ascending')
            ),
            # y: the mutant amino acid, in the fixed `aa_order`
            y=alt.Y(
                'mutant:O',
                sort=aa_order,
                axis=alt.Axis(
                    labelFontSize=12,
                    titleFontSize=15
                )
            )
        )
        # Filter the folded rows by the drop-down selection (bound to the
        # `metric` field), i.e. by the chosen x-axis label style
        .transform_filter(
            metric_selection
        )
    )


    # Define the metric by which the plot is colored - i.e. (delta) expression or binding
    # Diverging red/blue scale centered on 0; out-of-domain values are clamped
    coloring = heatmapbase.mark_rect(
    ).encode(
        color= alt.Color(f'{metric}:Q',
                         type='quantitative',
                         scale=alt.Scale(scheme="redblue",
                                         reverse=reverse_scale,
#                                          domain=[minimum_domain, maximum_domain],
                                         domainMid=0, 
                                         clamp=True
                                        ),
                           legend=alt.Legend(orient='left',
                                             title='grey is n.d.',
                                             gradientLength=100)),
        # Black outline of width 2 on the cell in `amino_acid_selection`, width 0 otherwise
        stroke=alt.value('black'),
        strokeWidth=alt.condition(amino_acid_selection,
                                  alt.value(2),
                                  alt.value(0)),
        tooltip=heatmap_tooltips
    )

    
    # Add a black 'x' to the wildtype amino acids (text comes from `wildtype_code`)
    wildtype = heatmapbase.mark_text(
        color='black'
    ).encode(
        text=alt.Text('wildtype_code:N')
    )

    
    # Color the empty measurements gray: keep only rows where `metric` is not valid
    # NOTE(review): the first `mark_rect()` looks redundant - the mark is set
    # again, with opacity, right after the filter.
    nulls = heatmapbase.mark_rect(
    ).transform_filter(
        f"!isValid(datum.{metric})"
    ).mark_rect(
        opacity=0.5
    ).encode(
        alt.Color(f"{metric}:N",
                  scale=alt.Scale(scheme='greys'),
                  legend=None)
    ) 

    # Return the final heatmap along with annotations:
    # layer the three charts, attach the selections, and restrict the
    # displayed rows to the range brushed with `zoom_selection`
    return (coloring + nulls + wildtype 
    ).interactive(
    ).add_selection(
        amino_acid_selection,
        zoom_selection,
        metric_selection
    ).transform_filter(
        zoom_selection
    ).properties(height=height,
                 title=alt.TitleParams(text = f"{title_prefix}{metric}", 
                                       anchor='start',
                                       dx=120)
    )



In [9]:
# Build one heatmap per metric; the binding panel's title gets the "anti-CGG " prefix
binding_heatmap = heatmap(data=scores_df, metric="Binding", title_prefix="anti-CGG ")
expression_heatmap = heatmap(data=scores_df, metric="Expression")
# polyspecificity_heatmap = heatmap(scores_df, "Polyspecificty", reverse_scale=True)


In [10]:
# Stack the zoom bar above the binding and expression heatmaps
# (`&` is Altair's vertical concatenation operator).
# Original author's note: "you have to add the selections here backwards to get
# the right order - why!?" - NOTE(review): cause unconfirmed.
final_plot = (zoom_bar & binding_heatmap & expression_heatmap)


# Display the combined chart in the notebook
final_plot

In [11]:
# Write the combined chart to the HTML file at `output_html`
final_plot.save(output_html)