In [5]:
import pandas as pd
import re

In [6]:
def load_and_merge_mut_rsa(
    mutation_csv,
    rsa_csv,
    gene_name,
    check_aa_match=True
):
    """
    Load mutation and RSA files for a gene and merge on residue position.

    Parameters
    ----------
    mutation_csv : str, path object or file-like
        CSV with a "Mutation" column (e.g. "R35Q"). Columns "aaref" and
        "aaalt", when present, are renamed to "WT_AA" and "Mut_AA".
    rsa_csv : str, path object or file-like
        CSV with "AA" and "RSA" columns plus a "ResNum" column, which is
        renamed to "Position" and used as the merge key.
    gene_name : str
        Label written to the "Gene" column of every returned row.
    check_aa_match : bool, default True
        If True, add an "AA_match" column comparing "WT_AA" with the "AA"
        value taken from the RSA table.

    Returns
    -------
    pandas.DataFrame
        The mutation table (left side of a left merge) with "Position",
        "AA", "RSA", "Gene" and, optionally, "AA_match" added.

    Raises
    ------
    ValueError
        If any "Mutation" value contains no digits (or is missing), so no
        residue position can be extracted from it.
    pandas.errors.MergeError
        If the RSA table has more than one row for the same position.
    """

    # Load data
    mut = pd.read_csv(mutation_csv)
    rsa = pd.read_csv(rsa_csv)

    # Extract residue position from Mutation (e.g. R35Q -> 35).
    # expand=False returns a Series; the first run of digits is used.
    position = mut["Mutation"].str.extract(r"(\d+)", expand=False)

    # Fail with an actionable message instead of a bare integer-cast error.
    unparsed = position.isna()
    if unparsed.any():
        bad_values = mut.loc[unparsed, "Mutation"].tolist()
        raise ValueError(
            "Cannot extract a residue position from Mutation value(s): "
            f"{bad_values!r}"
        )
    mut["Position"] = position.astype(int)

    # Rename columns for clarity
    mut = mut.rename(columns={
        "aaref": "WT_AA",
        "aaalt": "Mut_AA"
    })

    rsa = rsa.rename(columns={"ResNum": "Position"})

    # Merge mutation with RSA. Several mutations may share one residue, but
    # each residue must have a single RSA row; otherwise the left merge would
    # silently duplicate mutation rows, so validate the key relationship.
    merged = mut.merge(
        rsa[["Position", "AA", "RSA"]],
        on="Position",
        how="left",
        validate="many_to_one"
    )

    # Add gene label
    merged["Gene"] = gene_name

    # Optional AA consistency check.
    # Rows whose position has no RSA entry have AA = NaN and compare as False.
    if check_aa_match:
        merged["AA_match"] = merged["WT_AA"] == merged["AA"]

    return merged


def summarize_by_amino_acid(merged_df):
    """
    Summarize RSA statistics by wild-type amino acid.

    Rows are grouped by ("Gene", "WT_AA"). For each group the result holds
    the count of non-null "Mutation" values (n) and the mean, median,
    minimum and maximum of "RSA". The result is sorted by the grouping
    columns, which are returned as ordinary columns.
    """

    group_keys = ["Gene", "WT_AA"]

    # Output column name -> (source column, aggregation)
    stat_spec = {
        "n": ("Mutation", "count"),
        "mean_RSA": ("RSA", "mean"),
        "median_RSA": ("RSA", "median"),
        "min_RSA": ("RSA", "min"),
        "max_RSA": ("RSA", "max"),
    }

    per_group = merged_df.groupby(group_keys).agg(**stat_spec)
    flat = per_group.reset_index()

    return flat.sort_values(group_keys)


In [7]:
# Input files per gene, given as (mutation table, RSA table) pairs.

# ---- PSEN1 (Sun) ----
psen1_mut, psen1_rsa = (
    "../raw_data/PSEN1_Sun.csv",
    "../processed_data/PSEN1_AF3_RSA.csv",
)

# ---- PSEN2 ----
psen2_mut, psen2_rsa = (
    "../raw_data/PSEN2_Pillai.csv",
    "../processed_data/PSEN2_AF3_RSA.csv",
)

# ---- APP ----
app_mut, app_rsa = (
    "../raw_data/APP_Pillai.csv",
    "../processed_data/APP_AF3_RSA.csv",
)


In [8]:
# Build one merged mutation/RSA table per gene.
psen1_merged = load_and_merge_mut_rsa(
    mutation_csv=psen1_mut,
    rsa_csv=psen1_rsa,
    gene_name="PSEN1",
)
psen2_merged = load_and_merge_mut_rsa(
    mutation_csv=psen2_mut,
    rsa_csv=psen2_rsa,
    gene_name="PSEN2",
)
app_merged = load_and_merge_mut_rsa(
    mutation_csv=app_mut,
    rsa_csv=app_rsa,
    gene_name="APP",
)


In [9]:
# Stack the per-gene tables into one frame with a fresh 0..N-1 index.
# Columns present in only some of the tables are filled with NaN elsewhere.
all_merged = pd.concat(
    objs=(psen1_merged, psen2_merged, app_merged),
    ignore_index=True,
)

# Preview the first rows of the combined table.
all_merged.head()


Unnamed: 0,Variants,Mutation,AAO,Total Activity,SD of Total Activity,Ab42/Ab40 ratio,SD of ratio,Amount of Ab40,SD of Ab40,Amount of Ab42,...,AA,RSA,Gene,AA_match,Aβ40 (relative to WT),Significant?,Aβ42 (relative to WT),Significant?.1,Aβ42/40,Significant?.2
0,PSEN1:p.Arg35Gln,R35Q,59.0,0.43,0.031,1.449,0.267,0.423,0.032,0.673,...,R,0.751,PSEN1,True,,,,,,
1,PSEN1:p.Ala79Val,A79V,61.2,0.013,0.002,,,0.008,0.001,0.056,...,A,0.421,PSEN1,True,,,,,,
2,PSEN1:p.Val82Leu,V82L,55.0,0.596,0.018,0.813,0.07,0.608,0.016,0.516,...,V,0.194,PSEN1,True,,,,,,
3,PSEN1:p.Leu85Pro,L85P,26.0,0.02,0.001,,,0.011,0.001,0.177,...,L,0.005,PSEN1,True,,,,,,
4,PSEN1:p.Val89Leu,V89L,48.6,0.188,0.009,2.526,0.235,0.162,0.006,0.417,...,V,0.006,PSEN1,True,,,,,,


In [10]:
# RSA statistics per (Gene, wild-type amino acid) over the combined table.
summary_by_aa = summarize_by_amino_acid(merged_df=all_merged)

# Display the summary table as the cell output.
summary_by_aa


Unnamed: 0,Gene,WT_AA,n,mean_RSA,median_RSA,min_RSA,max_RSA
0,APP,A,6,0.42,0.421,0.025,0.843
1,APP,D,1,0.701,0.701,0.701,0.701
2,APP,E,3,0.825667,0.883,0.678,0.916
3,APP,H,1,0.866,0.866,0.866,0.866
4,APP,I,1,0.456,0.456,0.456,0.456
5,APP,K,3,0.526,0.539,0.287,0.752
6,APP,L,2,0.607,0.607,0.607,0.607
7,APP,P,2,0.649,0.649,0.649,0.649
8,APP,R,1,0.547,0.547,0.547,0.547
9,APP,S,1,0.671,0.671,0.671,0.671
