In [9]:
import pandas as pd
import numpy as np
import altair as alt

from itertools import combinations
from natsort import natsort_keygen

import theme

In [5]:
nat_key = natsort_keygen()

# Load the structural alignment, keeping the shared structural site label and
# the wild-type residue of each HA (H3, H5, H7) at that site.
aln_df = (
    pd.read_csv('../results/structural_alignment/structural_alignment.csv')
    .loc[:, ['struct_site', 'h3_wt_aa', 'h5_wt_aa', 'h7_wt_aa']]
    .assign(
        # A site is labelled HA1 when its label sorts at or before '329' in
        # natural order; every other site is labelled HA2.
        # NOTE(review): '329' is presumably the last HA1 site in this
        # numbering scheme; confirm against the alignment.
        ha_region=lambda df: pd.Categorical(
            np.where(
                [
                    nat_key(str(site)) <= nat_key('329')
                    for site in df['struct_site']
                ],
                'HA1',
                'HA2',
            ),
            categories=['HA1', 'HA2'],
        )
    )
)

aln_df.head()

Unnamed: 0,struct_site,h3_wt_aa,h5_wt_aa,h7_wt_aa,ha_region
0,9,S,K,,HA1
1,10,T,S,,HA1
2,11,A,D,D,HA1
3,12,T,Q,K,HA1
4,13,L,I,I,HA1


In [12]:
# Pairwise amino acid identity between HAs over the whole alignment

columns = ['h3_wt_aa', 'h5_wt_aa', 'h7_wt_aa']
results = []

# Every ordered pair is emitted (both directions plus the diagonal), giving
# one row per (ha_x, ha_y) combination.
for col_x in columns:
    label_x = col_x.replace('_wt_aa', '').upper()

    for col_y in columns:
        label_y = col_y.replace('_wt_aa', '').upper()

        if col_x != col_y:
            # Alignable residues: sites where neither HA is NaN
            shared = aln_df[[col_x, col_y]].dropna()
            n_sites = len(shared)
            n_same = (shared[col_x] == shared[col_y]).sum()

            # An empty overlap is reported as 0 rather than dividing by zero
            identity = (n_same / n_sites * 100) if n_sites > 0 else 0
        else:
            # An HA against itself: identity is fixed at 100% and the
            # counts are left missing
            n_same = np.nan
            n_sites = np.nan
            identity = 100.0

        results.append({
            'ha_x': label_x,
            'ha_y': label_y,
            'matches': n_same,
            'alignable_residues': n_sites,
            'percent_identity': identity,
        })

prot_identity_df = pd.DataFrame(results)
prot_identity_df

Unnamed: 0,ha_x,ha_y,matches,alignable_residues,percent_identity
0,H3,H3,,,100.0
1,H3,H5,194.0,479.0,40.501044
2,H3,H7,226.0,482.0,46.887967
3,H5,H3,194.0,479.0,40.501044
4,H5,H5,,,100.0
5,H5,H7,201.0,473.0,42.494715
6,H7,H3,226.0,482.0,46.887967
7,H7,H5,201.0,473.0,42.494715
8,H7,H7,,,100.0


In [13]:
# Pairwise amino acid identity between HAs, computed separately per HA domain

columns = ['h3_wt_aa', 'h5_wt_aa', 'h7_wt_aa']
results = []

for region in ['HA1', 'HA2']:
    # Keep only the alignment rows assigned to this domain
    region_df = aln_df.loc[aln_df['ha_region'] == region]

    # Ordered pairs (both directions plus the diagonal), first column varying
    # slowest
    for col_x, col_y in [(a, b) for a in columns for b in columns]:
        row = {
            'ha_region': region,
            'ha_x': col_x.replace('_wt_aa', '').upper(),
            'ha_y': col_y.replace('_wt_aa', '').upper(),
        }

        if col_x == col_y:
            # An HA against itself: identity is fixed at 100% and the
            # counts are left missing
            row.update(
                matches=np.nan,
                alignable_residues=np.nan,
                percent_identity=100.0,
            )
        else:
            # Alignable residues: sites where neither HA is NaN
            shared = region_df[[col_x, col_y]].dropna()
            n_sites = len(shared)
            n_same = (shared[col_x] == shared[col_y]).sum()

            # An empty overlap is reported as 0 rather than dividing by zero
            row.update(
                matches=n_same,
                alignable_residues=n_sites,
                percent_identity=(n_same / n_sites * 100) if n_sites > 0 else 0,
            )

        results.append(row)

domain_identity_df = pd.DataFrame(results)
domain_identity_df

Unnamed: 0,ha_region,ha_x,ha_y,matches,alignable_residues,percent_identity
0,HA1,H3,H3,,,100.0
1,HA1,H3,H5,110.0,315.0,34.920635
2,HA1,H3,H7,113.0,313.0,36.102236
3,HA1,H5,H3,110.0,315.0,34.920635
4,HA1,H5,H5,,,100.0
5,HA1,H5,H7,114.0,312.0,36.538462
6,HA1,H7,H3,113.0,313.0,36.102236
7,HA1,H7,H5,114.0,312.0,36.538462
8,HA1,H7,H7,,,100.0
9,HA2,H3,H3,,,100.0


In [52]:
# Create heatmap with Altair

def plot_identity_heatmap(df, title):
    """Draw a labelled heatmap of pairwise percent identity.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per cell, with the fields ``ha_x``, ``ha_y`` and
        ``percent_identity`` read by the encodings below.
    title : str
        Title placed above the panel, anchored in the middle.

    Returns
    -------
    A layered Altair chart: coloured rectangles with the rounded identity
    value written on top of each cell.
    """
    # Fill colour follows percent identity on the 'blues' scheme
    identity_fill = alt.Color(
        'percent_identity:Q',
        scale=alt.Scale(scheme='blues'),
        title=['Amino Acid', 'Identity (%)']
    )

    tiles = (
        alt.Chart(df)
        .mark_rect(opacity=0.9, stroke='black', strokeWidth=1)
        .encode(
            x=alt.X('ha_x:N', title=None, axis=alt.Axis(labelAngle=0)),
            y=alt.Y('ha_y:N', title=None),
            color=identity_fill
        )
        .properties(
            width=150,
            height=150,
            title=alt.Title(title, anchor='middle')
        )
    )

    # Labels are white on cells above 70% identity and black otherwise
    label_colour = alt.condition(
        alt.datum.percent_identity > 70,
        alt.value('white'),
        alt.value('black')
    )

    labels = (
        alt.Chart(df)
        .mark_text(baseline='middle')
        .encode(
            x=alt.X('ha_x:N'),
            y=alt.Y('ha_y:N'),
            text=alt.Text('percent_identity:Q', format='.0f'),
            color=label_colour
        )
    )

    return alt.layer(tiles, labels)

# Three panels side by side: the identities from prot_identity_df, then the
# HA1 and HA2 subsets of domain_identity_df
alt.hconcat(
    plot_identity_heatmap(prot_identity_df, 'Full ectodomain'),
    plot_identity_heatmap(domain_identity_df.query('ha_region == "HA1"'), 'HA1'),
    plot_identity_heatmap(domain_identity_df.query('ha_region == "HA2"'), 'HA2'),
)