# Visualize the relationship between measured metrics

We want to visualize the relationship between the measured metrics (binding, expression, and specificity) after selecting for specific substitutions or aspects of the antibody structure.

In [1]:
import itertools 
import pandas as pd
import numpy as np
import altair as alt

# Remove altair's default limit of ~5000 rows so the full variant table
# can be passed to the charts below
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
# ## == Filepaths == ##  

# Input path to the variant scores CSV; it is loaded with `pd.read_csv`
# in the data-processing cell
variant_scores_path = "results/final_variant_scores/final_variant_scores.csv"


## Process the input data

Transform the data into the structure necessary to make the plots. 

In [3]:
# Load the per-variant scores and give the metric columns the display names
# that are shown in the selection menus and plot titles
scores_df = pd.read_csv(variant_scores_path).rename(
    columns={
        "position": "site",
        "delta_expr": "Expression",
        "delta_bind": "Binding",
        "delta_psr": "Polyspecificty",
    }
)

# Discard every column that the plots do not use - this shrinks the data
# embedded in the final altair chart. Columns missing from the file are
# simply ignored, exactly as listed columns that are present are kept.
scores_df = scores_df.drop(
    columns=scores_df.columns.difference(
        [
            'target', 'wildtype', 'site', 'mutant',
            'Expression', 'Binding', 'Polyspecificty',
            'n_bc_expr', 'n_bc_bind', 'n_bc_psr',
            'chain', 'annotation', 'mutation',
        ]
    )
)

# Sites without an annotation get an empty string instead of 'NaN'
scores_df['annotation'] = scores_df['annotation'].fillna('')


## Define plot-wide parameters

Define the parameters that carry through to multiple plots for easy adjustment. 

In [34]:
# Side length of each (square) scatter plot
height = 350

# Width of the selection bars drawn underneath the scatter plots
width = (3 * height) * 1.13


# Tooltip shown when hovering a point in the scatter plots: the mutation,
# the three measured metrics (two decimals each), and the region
scatter_tooltip = [
    alt.Tooltip('mutation:N', title="Mutation"),
    *(
        alt.Tooltip(f'{metric}:Q', title=label, format=".2f")
        for metric, label in (
            ('Expression', "Change in Expression"),
            ('Binding', "Change in Binding Affinity"),
            ('Polyspecificty', "Change in Polyspecificty"),
        )
    ),
    alt.Tooltip('region:N', title="Annotation"),
]


## Define shared selections

Define the selections that will be shared by the final linked plots. 

In [35]:

# Amino acid selections: multi-selections keyed on the 'wildtype' and
# 'mutant' fields respectively
wildtype_amino_selector = alt.selection_multi(fields=['wildtype'])
mutant_amino_selector = alt.selection_multi(fields=['mutant'])

# Annotation selection: multi-selection keyed on the color encoding of the
# chart it is attached to
annotation_selector = alt.selection_multi(encodings=['color'])




## Amino Acid Selection Bars 

Bars to select the wildtype and mutant amino acids to show in the plot.

In [36]:
def amino_acid_selection(identity, selection, scores):
    """
    Build a strip of clickable amino-acid boxes bound to a selection.

    Parameters
    ----------
    identity : str
        Name of the field the strip is drawn over; this notebook passes
        "wildtype" or "mutant". It is also used in the strip's title.
    selection
        Altair selection attached to the boxes; boxes matching it are
        drawn fully opaque, the others at opacity 0.2.
    scores : pandas.DataFrame
        Table with a 'mutant' column listing the amino acids to show.

    Returns
    -------
    The layered chart of the boxes with the amino-acid letters on top.
    """

    # The amino acids are always taken from 'mutant' (wildtype is missing 'H')
    # and mirrored into a 'wildtype' column so either identity can be drawn
    residues = scores[['mutant']].drop_duplicates().dropna()
    residues = residues.assign(wildtype=residues['mutant'])

    def hidden_axis():
        # Both layers sit on the same nominal axis with labels and ticks
        # switched off; the letters drawn inside the boxes act as labels
        return alt.X(
            f'{identity}:N',
            title=None,
            axis=alt.Axis(labels=False, ticks=False),
        )

    # Clickable boxes, faded unless they are part of the selection
    boxes = (
        alt.Chart(residues[[identity]])
        .mark_rect(color="darkgray", stroke='black')
        .encode(
            x=hidden_axis(),
            opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
        )
        .add_selection(selection)
        .properties(
            width=width,
            height=15,
            title=f"Select {identity.capitalize()} Amino Acid",
        )
    )

    # Amino-acid letters written over the boxes
    letters = (
        alt.Chart(residues[[identity]])
        .mark_text(color="white")
        .encode(x=hidden_axis(), text=f'{identity}:N')
    )

    return boxes + letters


In [37]:
# Amino acid selection strips: one for the wildtype residue and one for the
# mutant residue, each bound to its own selector
wildtype_bar, mutant_bar = (
    amino_acid_selection(identity, selector, scores_df)
    for identity, selector in (
        ("wildtype", wildtype_amino_selector),
        ("mutant", mutant_amino_selector),
    )
)

## Annotation Selection Plot

Make a scaled plot to select regions in the antibody.

In [38]:
## ==== Get all of the annotations and their corresponding order in the structure ==== ##

def _region_order(scores, column):
    """
    Rank the distinct values of `column` by the largest site they cover.

    Returns a dataframe with one row per region and the columns 'order'
    (0 for the region whose last site comes first), 'region' (the value
    taken from `column`) and 'type' (the name of `column`, matching the
    'type' produced when the scores are melted below).
    """
    # A plain max-aggregation gives the last site of every region without
    # relying on `groupby.apply` handing the grouping column to the applied
    # function (deprecated in pandas 2.2)
    last_site = (
        scores[['site', column]]
        .groupby(column, as_index=False)['site']
        .max()
        .sort_values('site')
    )
    return pd.DataFrame({
        "order": range(len(last_site)),
        "region": last_site[column].to_numpy(),
        "type": column,
    })


# Order of the chains
chain_order = _region_order(scores_df, 'chain')

# Order of the 'annotations'
annotation_order = _region_order(scores_df, 'annotation')

# Combine the orders together
order_df = pd.concat([annotation_order, chain_order])

# One row per site and region type ('annotation' or 'chain')
region_sites = (
    scores_df[['site', 'annotation', 'chain']]
    .drop_duplicates()
    .melt(id_vars=['site'], value_vars=['annotation', 'chain'], var_name='type', value_name='region')
)

# Make the final dataframe to plot the annotations: the number of sites in
# each region ('length') together with the region's rank ('order').
# Merging on both 'type' and 'region' keeps a chain and an annotation that
# happen to share a name from being matched to each other's order.
annotation_df = (
    region_sites
    .groupby(['type', 'region'])
    .count()
    .reset_index()
    .merge(order_df, on=['type', 'region'], how='left')
    .rename(columns={"site": "length"})
)

# Calculate the position of the text to center it in each annotation:
# the midpoint between the first and last site of the region
position = region_sites.copy()
position['length'] = (
    position
    .groupby(['type', 'region'])['site']  # select the column so transform returns a Series
    .transform(lambda x: (x.max() + x.min()) / 2)
)

# One label row per region, carrying the region's site count as 'size'
position_df = (position[['length', 'type', 'region']]
               .drop_duplicates()
               .merge(annotation_df.rename(columns={"length": "size"})[['type', 'region', 'order', 'size']])
              )

In [55]:
# Region names sorted by type and then by their order along the structure;
# passed to the color channel as its explicit sort
site_order = annotation_df.sort_values(['type', 'order'])['region'].tolist()


# Base of the plot to select annotations: one stacked bar per type, with one
# clickable segment per region that fades when it is not selected
bars = (
    alt.Chart(annotation_df)
    .mark_bar()
    .encode(
        x=alt.X('length:Q', title="Position", scale=alt.Scale(padding=0, nice=False)),
        y=alt.Y("type:N", title=None),
        color=alt.Color("region:N", sort=site_order, legend=None),
        opacity=alt.condition(annotation_selector, alt.value(1), alt.value(0.2)),
        # NOTE(review): presumably the sort-index field generated for the
        # sorted color channel, used to stack segments in `site_order` - confirm
        order=alt.Order('color_region_sort_index:Q'),
    )
    .add_selection(annotation_selector)
)

# Names of the annotations, placed at the x position stored in
# `position_df` and sized by the 'size' column
text = (
    alt.Chart(position_df)
    .mark_text(color='white', dx=-3, dy=1)
    .encode(
        x=alt.X('length:Q', scale=alt.Scale(padding=0, nice=False)),
        y=alt.Y("type:N"),
        detail='region:N',
        text='region:N',
        size=alt.Size('size:Q', legend=None, scale=alt.Scale(domain=[-140, 550])),
    )
)

# Layer the labels over the bars and size the strip like the other selectors
annotation_bar = (bars + text).properties(
    width=width,
    height=30,
    title="Select Region of Antibody",
)

annotation_bar

## Mutation Effect Scatter Plot

Make the main scatter plots. 

In [56]:
# Long-format copy of the scores for the scatter plots: every variant row is
# repeated once with its annotation and once with its chain, stored in
# 'region' and labelled by 'type'
scatter_scores_df = pd.melt(
    scores_df,
    id_vars=[
        'target', 'wildtype', 'site', 'mutant',
        'Expression', 'Binding', 'Polyspecificty',
        'n_bc_expr', 'n_bc_bind', 'n_bc_psr', 'mutation',
    ],
    value_vars=['annotation', 'chain'],
    var_name='type',
    value_name='region',
)

In [57]:
# Chart that every scatter layer is derived from
base = alt.Chart(scatter_scores_df)

def comparison_plot(metric_one, metric_two):
    """
    Make the scatter plot comparing two metrics against each other.

    Parameters
    ----------
    metric_one : str
        Column plotted on the x axis.
    metric_two : str
        Column plotted on the y axis.

    Returns
    -------
    A layered chart: small gray points for every row, overlaid with larger
    black points (with tooltips) for the rows that pass the wildtype,
    mutant, and annotation selections.
    """

    def axes():
        # Same quantitative x/y encoding for both layers
        return {"x": alt.X(f'{metric_one}:Q'), "y": alt.Y(f'{metric_two}:Q')}

    # Both layers are drawn as a square of side `height`
    square = {"width": height, "height": height}

    # Selections shared with the selector strips
    selectors = (wildtype_amino_selector, mutant_amino_selector, annotation_selector)

    # Unfiltered backdrop
    all_points = (
        base
        .mark_circle(size=15, color="darkgray", opacity=0.5)
        .encode(**axes())
        .properties(**square)
    )

    # Highlighted points: keep only rows that satisfy every selection
    chosen_points = (
        base
        .mark_circle(size=30, color="black", opacity=0.7)
        .encode(**axes(), tooltip=scatter_tooltip)
    )
    for selector in selectors:
        chosen_points = chosen_points.transform_filter(selector)
    chosen_points = (
        chosen_points
        .add_selection(*selectors)
        .properties(**square, title=f"{metric_one} vs. {metric_two}")
    )

    return all_points + chosen_points



In [58]:

# One scatter plot per pair of metrics (x metric, y metric)
Binding_Expression, Polyspecificty_Expression, Binding_Polyspecificty = (
    comparison_plot(x_metric, y_metric)
    for x_metric, y_metric in (
        ("Binding", "Expression"),
        ("Polyspecificty", "Expression"),
        ("Binding", "Polyspecificty"),
    )
)


In [59]:
# Put the three scatter plots side by side, then stack the region selector
# and the wildtype / mutant amino-acid selectors underneath them
final_plot = (Binding_Expression | Polyspecificty_Expression | Binding_Polyspecificty) & annotation_bar & wildtype_bar & mutant_bar 


In [60]:
# Show the assembled chart as the output of this cell
final_plot

In [61]:
# Write the assembled chart to scatterplot.html
final_plot.save("scatterplot.html")