In [14]:
import pandas as pd
import numpy as np
import re
import string

from Bio import SeqIO
from natsort import natsort_keygen
from pathlib import Path

### Format structural alignment

In [26]:
def alignment_to_df(fasta_path, ref_id, start_site=1):
    """Turn an aligned FASTA into a per-site table numbered against a reference.

    Every non-reference sequence is walked column by column alongside the
    reference sequence:

    * a column where the reference has a residue gets the next integer site
      number (starting at ``start_site``), whether or not the other sequence
      has a residue there;
    * a column where the reference has a gap but the other sequence has a
      residue is labelled as an insertion after the last numbered site
      (``"10a"``, ``"10b"``, ...);
    * a column where both have a gap is skipped.

    The per-sequence tables are outer-merged on ``struct_site`` and the
    reference residue column.

    Parameters
    ----------
    fasta_path : str or path-like
        Aligned FASTA file, read with ``SeqIO.parse(fasta_path, "fasta")``.
    ref_id : str
        Reference sequence ID. Record IDs are truncated at the first ``"_"``
        before being compared with this value.
    start_site : int, default 1
        Site number assigned to the first reference residue.

    Returns
    -------
    pandas.DataFrame
        Columns ``struct_site`` (str), ``{ref_id}_aa`` and one ``{id}_aa``
        column per non-reference sequence, naturally sorted by
        ``struct_site`` with a fresh integer index.

    Raises
    ------
    ValueError
        If ``ref_id`` is absent, if the file holds no sequence other than the
        reference, if a sequence differs in length from the reference, or if
        more than 26 consecutive insertions follow one numbered site.
    """
    # IDs are cut at the first underscore; if two records share the truncated
    # ID, the later record replaces the earlier one.
    seq_dict = {
        rec.id.split("_")[0]: str(rec.seq)
        for rec in SeqIO.parse(fasta_path, "fasta")
    }
    if ref_id not in seq_dict:
        raise ValueError(f"ref_id '{ref_id}' not found. Available IDs: {list(seq_dict.keys())}")

    other_ids = [rid for rid in seq_dict if rid != ref_id]
    if not other_ids:
        # Without this guard the function would fail later with an opaque
        # AttributeError on a None result.
        raise ValueError(f"No sequences other than ref_id '{ref_id}' were found to align against.")
    ref_seq = seq_dict[ref_id]

    letters = list(string.ascii_lowercase)

    def insertion_label(base_num, k):
        """Label the k-th (0-based) insertion after numbered site ``base_num``."""
        if k < len(letters):
            return f"{base_num}{letters[k]}"
        else:
            raise ValueError(f"Insertion index {k} exceeds available letters ({len(letters)})")

    # NOTE(review): insertion labels are derived per sequence, so insertions
    # from two sequences that sit in different alignment columns after the same
    # numbered site can receive the same label and be merged into one row.
    # Confirm this is acceptable for the alignments in use.
    final_df = None

    for pdb in other_ids:
        seq = seq_dict[pdb]
        if len(ref_seq) != len(seq):
            raise ValueError("Aligned sequences must be the same length.")

        rows = []
        last_numeric = start_site - 1
        n_inserted = 0  # insertions seen since the last numbered site

        for ref_aa, aa in zip(ref_seq, seq):
            if ref_aa == "-":
                if aa == "-":
                    # Gap in both sequences: nothing to record.
                    continue
                # Insertion relative to the reference.
                rows.append((insertion_label(last_numeric, n_inserted), "-", aa))
                n_inserted += 1
            else:
                # Reference residue present; ``aa`` may be a residue or "-".
                last_numeric += 1
                n_inserted = 0
                rows.append((str(last_numeric), ref_aa, aa))

        # Build the frame once per sequence instead of once per column.
        df = pd.DataFrame(rows, columns=["struct_site", f"{ref_id}_aa", f"{pdb}_aa"])

        if final_df is None:
            final_df = df
        else:
            final_df = pd.merge(final_df, df, on=["struct_site", f"{ref_id}_aa"], how="outer")

    return final_df.sort_values("struct_site", key=natsort_keygen()).reset_index(drop=True)

# Build the HA1 and HA2 alignment tables against reference '4o5n'; reference
# residues are numbered from 9 (HA1) and 330 (HA2) respectively.
ha1_aln, ha2_aln = (
    alignment_to_df(fasta, ref_id='4o5n', start_site=first_site)
    for fasta, first_site in (
        ('../results/foldmason/ha1/result_aa.fa', 9),
        ('../results/foldmason/ha2/result_aa.fa', 330),
    )
)

ha1_aln.head()

Unnamed: 0,struct_site,4o5n_aa,4r8w_aa,4kwm_aa
0,9,P,-,P
1,10,G,-,G
2,11,A,D,D
3,12,T,K,Q
4,13,L,I,I


In [27]:
from pathlib import Path

def parse_rmsd_txt(path, id=None):
    """Read an ``index: value`` text file into a two-column DataFrame.

    The first line of the file is always skipped. Each remaining line that
    looks like ``<integer>: <value>`` yields one row; lines that do not match
    are ignored. The literal value ``None`` is stored as a missing value and
    anything else is converted with ``float``.

    Parameters
    ----------
    path : str or path-like
        Text file to read.
    id : optional
        Tag interpolated into the value column name (``rmsd_{id}``).

    Returns
    -------
    pandas.DataFrame
        Columns ``aln_idx`` and ``rmsd_{id}``, one row per parsed line.
    """
    row_pattern = re.compile(r"\s*(\d+):\s*(.*)\s*$")
    body_lines = Path(path).read_text().splitlines()[1:]

    parsed = []
    for text in body_lines:
        hit = row_pattern.match(text)
        if hit is None:
            continue
        position = int(hit.group(1))
        raw_value = hit.group(2).strip()
        parsed.append((position, float(raw_value) if raw_value != "None" else None))

    return pd.DataFrame(parsed, columns=["aln_idx", f"rmsd_{id}"])

# Load one RMSD table per file; the id tags the value column
# (rmsd_h3h5, rmsd_h3h7, rmsd_h5h7).
h3_h5_ha1_rmsd, h3_h5_ha2_rmsd = (
    parse_rmsd_txt(txt, id='h3h5')
    for txt in (
        '../results/rmsd/h3_h5_ha1_rmsd.txt',
        '../results/rmsd/h3_h5_ha2_rmsd.txt',
    )
)

h3_h7_ha1_rmsd, h3_h7_ha2_rmsd = (
    parse_rmsd_txt(txt, id='h3h7')
    for txt in (
        '../results/rmsd/h3_h7_ha1_rmsd.txt',
        '../results/rmsd/h3_h7_ha2_rmsd.txt',
    )
)

h5_h7_ha1_rmsd, h5_h7_ha2_rmsd = (
    parse_rmsd_txt(txt, id='h5h7')
    for txt in (
        '../results/rmsd/h5_h7_ha1_rmsd.txt',
        '../results/rmsd/h5_h7_ha2_rmsd.txt',
    )
)

In [28]:
def merge_aln_rmsd(aln_df, rmsd_df):
    """Append the columns of ``rmsd_df`` to ``aln_df`` row by row.

    The two frames are paired positionally: row *i* of ``rmsd_df`` is attached
    to row *i* of ``aln_df``. The ``aln_idx`` column is dropped from the
    result, which keeps the index of ``aln_df``.

    Parameters
    ----------
    aln_df : pandas.DataFrame
        Alignment table.
    rmsd_df : pandas.DataFrame
        Table with an ``aln_idx`` column plus value column(s); must have the
        same number of rows as ``aln_df``.

    Returns
    -------
    pandas.DataFrame
        ``aln_df`` with the non-``aln_idx`` columns of ``rmsd_df`` appended.

    Raises
    ------
    AssertionError
        If the two frames differ in length.
    """
    assert len(aln_df) == len(rmsd_df)
    # pd.concat(axis=1) aligns on index labels, not on position. Re-label the
    # RMSD rows with the alignment index so that frames whose indices differ
    # are still paired row by row instead of producing a NaN-padded union.
    rmsd_positional = rmsd_df.copy()
    rmsd_positional.index = aln_df.index
    return pd.concat(
        [aln_df, rmsd_positional], axis=1
    ).drop(columns=['aln_idx'])

# Attach the three RMSD columns to each alignment table in turn, stack the
# HA1 and HA2 tables, then order the rows naturally by struct_site.
struct_align_df = (
    pd.concat(
        [
            ha1_aln
            .pipe(merge_aln_rmsd, h3_h5_ha1_rmsd)
            .pipe(merge_aln_rmsd, h3_h7_ha1_rmsd)
            .pipe(merge_aln_rmsd, h5_h7_ha1_rmsd),
            ha2_aln
            .pipe(merge_aln_rmsd, h3_h5_ha2_rmsd)
            .pipe(merge_aln_rmsd, h3_h7_ha2_rmsd)
            .pipe(merge_aln_rmsd, h5_h7_ha2_rmsd),
        ],
        ignore_index=True,
    )
    .sort_values("struct_site", key=natsort_keygen())
    .reset_index(drop=True)
)

struct_align_df.head()

### Add DMS background residues and numbering

In [77]:
# Load the (site, wildtype) pairs of each DMS background and drop the sites
# that are missing in the structural alignment.
# NOTE(review): h3_missing holds ints while h5_missing / h7_missing hold
# strings, so each filter relies on the dtype of that CSV's `site` column —
# confirm the dtypes match, otherwise a filter silently removes nothing.

h3_missing = list(range(1, 9)) + list(range(326, 330)) + list(range(503, 505))
h3_wt = (
    pd.read_csv('../data/MDCKSIAT1_entry_func_effects.csv')
    .loc[:, ['site', 'wildtype']]
    .drop_duplicates()
    .reset_index(drop=True)
    .query("site not in @h3_missing")
)

h5_missing = [
    str(num) for num in (*range(1, 9), *range(325, 339), *range(503, 552))
] + ["328a", "328b", "328c", "510a"]
h5_wt = (
    pd.read_csv('../data/293T_entry_func_effects.csv')
    .loc[:, ['site', 'wildtype']]
    .drop_duplicates()
    .reset_index(drop=True)
    .query(
        "site not in @h5_missing and ~site.str.contains('-', na=False)",
        engine="python",
    )
)

h7_missing = [str(num) for num in (*range(326, 331), *range(500, 515))] + ["328a"]
h7_wt = (
    pd.read_csv('../data/293_2-6_entry_func_effects.csv')
    .loc[:, ['site', 'wildtype']]
    .drop_duplicates()
    .reset_index(drop=True)
    .query("site not in @h7_missing")
)

In [82]:
def add_wt_cols(
    aln_df, wt_df,
    ref_col="4o5n_aa",      # col to align against with letters or '-'
    site_col="site",        # col in wt_df with site numbering
    aa_col="wildtype",      # column in wt_df with wt residue
    out_site_col="wt_site",
    out_aa_col="wt_aa",
):
    """Attach wildtype site labels and residues to an alignment table.

    ``wt_df`` is naturally sorted by ``site_col``. The k-th row of ``aln_df``
    whose ``ref_col`` value is a single letter then receives the k-th sorted
    wildtype row. Rows where ``ref_col`` is not a single letter (for example
    ``"-"``), and letter rows beyond the end of ``wt_df``, are left missing.

    Parameters
    ----------
    aln_df : pandas.DataFrame
        Alignment table containing ``ref_col``.
    wt_df : pandas.DataFrame
        Table containing ``site_col`` and ``aa_col``.
    ref_col : str
        Column of ``aln_df`` holding one residue letter or a gap per row.
    site_col, aa_col : str
        Columns of ``wt_df`` with the site label and the wildtype residue.
    out_site_col, out_aa_col : str
        Names of the two columns added to the returned frame.

    Returns
    -------
    (pandas.DataFrame, dict)
        A copy of ``aln_df`` with the two new columns, and a summary with
        ``n_compared`` (rows that received a wildtype entry), ``n_match``
        (of those, rows whose residue equals ``ref_col`` ignoring case) and
        ``pct_match`` (percentage rounded to 2 decimals, NaN if nothing was
        compared).
    """
    # Sort on the caller-supplied site column; this was previously hard-coded
    # to "site", which ignored the ``site_col`` argument.
    wt = wt_df.reset_index(drop=True).sort_values(site_col, key=natsort_keygen())
    is_letter = aln_df[ref_col].astype(str).str.fullmatch(r"[A-Za-z]")

    # 0-based rank of each letter row among all letter rows.
    idx = is_letter.cumsum() - 1
    take = is_letter & (idx < len(wt))

    # prefill outputs as NA
    out_site = pd.Series(np.nan, index=aln_df.index, dtype="object")
    out_aa = pd.Series(np.nan, index=aln_df.index, dtype="object")

    # fill where we have letters and WT left
    pos = idx[take].to_numpy()
    out_site.loc[take] = wt[site_col].to_numpy()[pos]
    out_aa.loc[take] = wt[aa_col].to_numpy()[pos]

    # summary stats
    n_compared = int(take.sum())
    if n_compared:
        ref_up = aln_df.loc[take, ref_col].astype("string").str.upper()
        wt_up = out_aa.loc[take].astype("string").str.upper()
        # A missing wildtype residue compares as NA; count it as a mismatch
        # rather than letting the NA break the sum.
        n_match = int((ref_up == wt_up).fillna(False).sum())
        pct_match = float(np.round(100.0 * n_match / n_compared, 2))
    else:
        n_match = 0
        pct_match = np.nan

    out_df = aln_df.assign(**{out_site_col: out_site, out_aa_col: out_aa})
    return out_df, {"n_compared": n_compared, "n_match": n_match, "pct_match": pct_match}

# Add the wildtype site/residue columns of each DMS background in turn,
# matching each background against its own structure column, and print the
# agreement summary after every step.
aln_out = struct_align_df
for wt_table, ref_column, aa_column, site_column in (
    (h3_wt, "4o5n_aa", "h3_wt_aa", "h3_site"),
    (h5_wt, "4kwm_aa", "h5_wt_aa", "h5_site"),
    (h7_wt, "4r8w_aa", "h7_wt_aa", "h7_site"),
):
    aln_out, stats = add_wt_cols(
        aln_out,
        wt_table,
        ref_col=ref_column,
        out_aa_col=aa_column,
        out_site_col=site_column,
    )
    print(stats)

{'n_compared': 490, 'n_match': 457, 'pct_match': 93.27}
{'n_compared': 487, 'n_match': 453, 'pct_match': 93.02}
{'n_compared': 485, 'n_match': 485, 'pct_match': 100.0}


In [84]:
# Write the combined alignment table to disk (without the index column) and
# preview its first rows.
aln_out.to_csv(
    path_or_buf='../results/structural_alignment/structural_alignment.csv',
    index=False,
)
aln_out.head()

Unnamed: 0,struct_site,4o5n_aa,4r8w_aa,4kwm_aa,rmsd_h3h5,rmsd_h3h7,rmsd_h5h7,h3_site,h3_wt_aa,h5_site,h5_wt_aa,h7_site,h7_wt_aa
0,9,P,-,P,9.178066,,,9,S,9,K,,
1,10,G,-,G,8.183151,,,10,T,10,S,,
2,11,A,D,D,5.04805,1.735252,4.172437,11,A,11,D,11.0,D
3,12,T,K,Q,3.93908,1.489412,4.694308,12,T,12,Q,12.0,K
4,13,L,I,I,3.725425,1.019553,3.75921,13,L,13,I,13.0,I
