### Convert Per Interaction/Feature Scores numbering to TEM1 and MSA Numbering

In [1]:
import re
import pandas as pd
from key_interactions_finder.pymol_projections import project_pymol_top_features
from key_interactions_finder import pymol_projections

from analysis_tools.msa_tools import parse_fasta, create_pdb_to_msa_converter

# All eight per-feature score files share the same file name; only the run
# directory (system, PDB id, substrate) differs.
_SCORES_CSV = "Linear_Correlations_Per_Feature_Scores.csv"

# System label -> PDB identifier used in the run directory names.
_SYSTEM_PDB_IDS = {"TEM1": "1M40", "ENCA": "3ZDJ", "GNCA": "4B88", "PNCA": "4C6Y"}

# KIF_FILES[system][substrate] -> path of that run's per-feature scores CSV.
KIF_FILES = {
    system: {
        substrate: f"outputs/{system}_{pdb_id}_{substrate}/{_SCORES_CSV}"
        for substrate in ("benzyl", "cefo")
    }
    for system, pdb_id in _SYSTEM_PDB_IDS.items()
}

# Multiple sequence alignment file (name kept as spelt; later cells read it).
ALINGMENT_FILE = r"raw_data/align1d.ali"

# Grouping of the four systems used for the specialist-vs-generalist comparison.
GENERALISTS = ["GNCA", "PNCA"]
SPECIALISTS = ["TEM1", "ENCA"]

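Load the multiple sequence alignment (MSA) and build, for each system, a converter from its PDB residue numbering to the shared MSA numbering.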

In [2]:
# Parse the alignment once; records are looked up below by their identifier.
msa_seqs = parse_fasta(ALINGMENT_FILE)

# One converter per system, built from that system's record in the alignment.
# presumably each converter maps a PDB residue number to its MSA position
# (see how it is indexed in pdb_feats_to_msa_feats) -- TODO confirm.
msa_converters = {
    system: create_pdb_to_msa_converter(msa_seqs[record_id])
    for system, record_id in (
        ("TEM1", "1M40_TEM-1"),
        ("ENCA", "3ZDJ_ENCA"),
        ("GNCA", "4B88_GNCA"),
        ("PNCA", "4C6Y_PNCA"),
    )
}

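Prepare one dataframe per system that holds each feature's benzylpenicillin and cefotaxime scores and the absolute difference between them.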

In [3]:
# all_dfs[system] -> dataframe with one row per interaction ("Feature") holding
# its score for each substrate and the absolute difference between them.
all_dfs = {}
for system, substrates in KIF_FILES.items():
    benzyl_df = pd.read_csv(substrates["benzyl"])
    cefo_df = pd.read_csv(substrates["cefo"])

    # Outer merge on the feature name so interactions present for only one
    # substrate are kept; their missing score is filled with 0.
    merged_df = benzyl_df.merge(cefo_df, how="outer", on="Feature")
    # Positional rename: assumes each CSV holds exactly "Feature" plus one
    # score column (any other column count raises a length-mismatch error).
    merged_df.columns = ["Feature", "Score_Benzyl", "Score_Cefo"]
    merged_df = merged_df.fillna(0.0)

    merged_df["Absolute Delta"] = (merged_df["Score_Benzyl"] - merged_df["Score_Cefo"]).abs()
    # The index is deliberately not reset, so rows keep their merge labels
    # after sorting (the displayed index is therefore not 0, 1, 2, ...).
    merged_df = merged_df.sort_values("Absolute Delta", ascending=False)

    all_dfs[system] = merged_df

In [4]:
# Inspect the merged TEM1 scores (sorted by "Absolute Delta", largest first).
all_dfs["TEM1"]

Unnamed: 0,Feature,Score_Benzyl,Score_Cefo,Absolute Delta
21,105Ser 209Lys Hbond,-0.285994,0.379887,0.665881
36,112Leu 116Thr Hbond,-0.248764,0.278497,0.527262
32,18Arg 40Arg Hbond,-0.253587,0.265151,0.518738
398,111Asn 141Glu Hbond,-0.009367,-0.519482,0.510115
227,132Asp 167Lys Saltbr,-0.064222,0.445340,0.509562
...,...,...,...,...
402,247Glu 250Arg Saltbr,0.007732,0.006639,0.001093
332,112Leu 115Thr Hbond,-0.026361,-0.025557,0.000805
34,199Ala 253Ala Hydrophobic,-0.251479,-0.252210,0.000731
253,251Gln 254Glu Hbond,-0.054056,-0.054730,0.000674


In [5]:
# Exploratory cell: grab one TEM1 converter and one example interaction name.
msa_converter = msa_converters["TEM1"]
# NOTE(review): `[0]` is a label lookup on the (un-reset) index, so this is the
# row labelled 0, not the top-ranked row of the sorted frame -- the output below
# differs from the first row displayed above. Use `.iloc[0]` if the top row was meant.
interaction_name = all_dfs["TEM1"]["Feature"][0]
interaction_name

'43Met 46Thr Hbond'

In [6]:
def pdb_feats_to_msa_feats(int_names: list[str], msa_converter: dict[int, int]) -> list[str]:
    """
    Convert a list of KIF labelled interaction names to have msa numbering.

    Each name is expected to look like "<res1 number><res1 name> <res2 number><res2 name> <type>",
    e.g. "105Ser 209Lys Hbond". Only the two residue numbers are replaced; all other
    text in the name is kept as is.

    Parameters
    ----------
    int_names : list[str]
        Interaction names using the PDB residue numbering.
    msa_converter : dict[int, int]
        Maps each PDB residue number to its MSA residue number.

    Returns
    -------
    list[str]
        The interaction names, in the same order, with both residue numbers
        replaced by their MSA equivalents.

    Raises
    ------
    KeyError
        If a residue number is not present in `msa_converter`.
    IndexError
        If a name does not contain two residue numbers.
    """
    msa_names = []
    for int_name in int_names:
        # maxsplit=2 splits on the two residue numbers only, so digits that appear
        # later in the name (e.g. in the interaction type) are not cut off.
        int_split = re.split(r"(\d+)", int_name, maxsplit=2)
        res1_numb, res2_numb = int(int_split[1]), int(int_split[3])

        msa_res1, msa_res2 = msa_converter[res1_numb], msa_converter[res2_numb]
        msa_names.append(f"{msa_res1}{int_split[2]}{msa_res2}{int_split[4]}")
    return msa_names

In [7]:
# Add the MSA-numbered interaction names as a new column on every system's
# dataframe (the frames stored in all_dfs are modified in place).
for system, df in all_dfs.items():
    msa_converter = msa_converters[system]
    df["Feature, MSA numbered"] = pdb_feats_to_msa_feats(
        int_names=df["Feature"].tolist(),
        msa_converter=msa_converter,
    )

In [8]:
# Inspect TEM1 again to check the new "Feature, MSA numbered" column.
all_dfs["TEM1"]

Unnamed: 0,Feature,Score_Benzyl,Score_Cefo,Absolute Delta,"Feature, MSA numbered"
21,105Ser 209Lys Hbond,-0.285994,0.379887,0.665881,107Ser 211Lys Hbond
36,112Leu 116Thr Hbond,-0.248764,0.278497,0.527262,114Leu 118Thr Hbond
32,18Arg 40Arg Hbond,-0.253587,0.265151,0.518738,20Arg 42Arg Hbond
398,111Asn 141Glu Hbond,-0.009367,-0.519482,0.510115,113Asn 143Glu Hbond
227,132Asp 167Lys Saltbr,-0.064222,0.445340,0.509562,134Asp 169Lys Saltbr
...,...,...,...,...,...
402,247Glu 250Arg Saltbr,0.007732,0.006639,0.001093,250Glu 253Arg Saltbr
332,112Leu 115Thr Hbond,-0.026361,-0.025557,0.000805,114Leu 117Thr Hbond
34,199Ala 253Ala Hydrophobic,-0.251479,-0.252210,0.000731,201Ala 256Ala Hydrophobic
253,251Gln 254Glu Hbond,-0.054056,-0.054730,0.000674,254Gln 257Glu Hbond


### Now Calculate the delta between generalists and specialists

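We start from every interaction found in TEM1. For each one, the delta is the summed score over the specialists minus the summed score over the generalists, matched by MSA-numbered name; a system that lacks the interaction contributes 0. Interactions that are absent from TEM1 are not considered.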

In [9]:
# Display the two groups of systems that are compared below.
SPECIALISTS, GENERALISTS

(['TEM1', 'ENCA'], ['GNCA', 'PNCA'])

In [10]:
# TEM1 interaction names in both numbering schemes. Both lists come from the
# same dataframe, so they are aligned element by element.
tem1_msa_interactions = list(all_dfs["TEM1"]["Feature, MSA numbered"])
tem1_pdb_interactions = list(all_dfs["TEM1"]["Feature"])
tem1_msa_interactions[0:2]


['107Ser 211Lys Hbond', '114Leu 118Thr Hbond']

In [11]:
# Per-system lookup: MSA-numbered interaction -> (benzyl score, cefo score).
# Built once so each interaction below is a dictionary lookup rather than a
# boolean scan over every dataframe. keep="first" preserves the behaviour of
# taking the first matching row should a name occur more than once.
score_lookups = {}
for system, df in all_dfs.items():
    first_rows = df.drop_duplicates(subset="Feature, MSA numbered", keep="first")
    score_lookups[system] = dict(zip(
        first_rows["Feature, MSA numbered"],
        zip(first_rows["Score_Benzyl"], first_rows["Score_Cefo"]),
    ))

# For every TEM1 interaction: (sum of specialist scores) - (sum of generalist scores),
# computed separately for each substrate and keyed by the MSA-numbered name.
delta_benzyl_scores, delta_cefo_scores = {}, {}
for msa_interaction in tem1_msa_interactions:
    benzyl_special_sum, benzyl_general_sum = 0.0, 0.0
    cefo_special_sum, cefo_general_sum = 0.0, 0.0
    for system, lookup in score_lookups.items():
        # An interaction that a system does not have contributes a score of 0.
        benzyl_score, cefo_score = lookup.get(msa_interaction, (0.0, 0.0))
        if system in SPECIALISTS:
            benzyl_special_sum += benzyl_score
            cefo_special_sum += cefo_score
        else:
            # Any system not listed in SPECIALISTS is treated as a generalist.
            benzyl_general_sum += benzyl_score
            cefo_general_sum += cefo_score

    delta_benzyl_scores[msa_interaction] = benzyl_special_sum - benzyl_general_sum
    delta_cefo_scores[msa_interaction] = cefo_special_sum - cefo_general_sum

In [17]:
def modify_column_residue_numbers(interaction_names: list[str], constant_to_add: int = 1) ->list[str]:
    """
    Shift both residue numbers in each interaction name by a constant.

    Each name is expected to look like "<res1 number><res1 name> <res2 number><res2 name> <type>",
    e.g. "43Met 46Thr Hbond" becomes "68Met 71Thr Hbond" with `constant_to_add=25`.

    Parameters
    ----------
    interaction_names : list[str]
        Interaction names to renumber.
    constant_to_add : int, default 1
        Value added to both residue numbers (may be negative).

    Returns
    -------
    list[str]
        The renumbered interaction names, in the same order as the input.

    Raises
    ------
    IndexError
        If a name does not contain two residue numbers.
    """
    updated_names = []
    for column_name in interaction_names:
        # maxsplit=2 splits on the two residue numbers only, so digits that appear
        # later in the name (e.g. in the interaction type) are not cut off.
        res_split = re.split(r"(\d+)", column_name, maxsplit=2)

        res1_numb = int(res_split[1]) + constant_to_add
        res1_name = res_split[2]
        res2_numb = int(res_split[3]) + constant_to_add
        remainder = res_split[4]

        updated_names.append(f"{res1_numb}{res1_name}{res2_numb}{remainder}")

    return updated_names

# Shift the TEM1 interaction names by a constant +25; the result is stored in
# the "Feature, 1M40 numbered" column of delta_df further down.
# NOTE(review): a single constant offset presumably assumes the 1M40 numbering has
# no gaps or insertions relative to the numbering in the KIF output -- confirm.
tem1_1m40_numbering = modify_column_residue_numbers(
    interaction_names=tem1_pdb_interactions, 
    constant_to_add=25
)

In [13]:
# Two-row frame (one row per substrate) transposed so that each interaction is a row;
# reset_index() moves the MSA-numbered names (the dict keys) into a regular column.
delta_df = pd.DataFrame([delta_benzyl_scores, delta_cefo_scores]).T.reset_index()
delta_df.columns = ["Feature, MSA numbered", "delta_benzyl", "delta_cefo"]
# Positional inserts: these rely on the delta dicts having been filled in the same
# order as tem1_pdb_interactions / tem1_1m40_numbering, and on every MSA-numbered
# name being unique (duplicate keys would collapse and the lengths would not match).
delta_df.insert(loc=0, column="Feature, PDB numbered", value=tem1_pdb_interactions)
delta_df.insert(loc=1, column="Feature, 1M40 numbered", value=tem1_1m40_numbering)
# Rank interactions by the larger magnitude of the two substrate deltas.
delta_df["Max_diff"] = delta_df[["delta_benzyl", "delta_cefo"]].abs().max(axis=1)
delta_df = delta_df.sort_values(by="Max_diff", ascending=False)
delta_df

Unnamed: 0,"Feature, PDB numbered","Feature, 1M40 numbered","Feature, MSA numbered",delta_benzyl,delta_cefo,Max_diff
129,43Met 46Thr Hbond,68Met 71Thr Hbond,45Met 48Thr Hbond,-0.890221,-0.682542,0.890221
255,139Arg 146Glu Saltbr,164Arg 171Glu Saltbr,141Arg 148Glu Saltbr,0.698251,0.777444,0.777444
13,138Asp 153Arg Saltbr,163Asp 178Arg Saltbr,140Asp 155Arg Saltbr,-0.399970,-0.722645,0.722645
55,146Glu 153Arg Saltbr,171Glu 178Arg Saltbr,148Glu 155Arg Saltbr,0.108028,0.657483,0.657483
179,102Ile 209Lys Hbond,127Ile 234Lys Hbond,104Ile 211Lys Hbond,-0.638560,-0.246965,0.638560
...,...,...,...,...,...,...
400,150Asn 215Arg Hbond,175Asn 240Arg Hbond,152Asn 217Arg Hbond,0.011340,0.000000,0.011340
440,247Glu 250Arg Saltbr,272Glu 275Arg Saltbr,250Glu 253Arg Saltbr,0.007732,0.006639,0.007732
361,178Ser 181Gln Hbond,203Ser 206Gln Hbond,180Ser 183Gln Hbond,0.005416,0.000000,0.005416
420,258Ser 261Lys Hbond,283Ser 286Lys Hbond,261Ser 264Lys Hbond,-0.005079,0.000000,0.005079


Save top interaction differences for an SI table

In [15]:
# Take the first 40 rows of delta_df (the largest "Max_diff" values, provided
# delta_df is still sorted by that column in descending order), round to
# 2 decimals and keep only the 1M40-numbered name plus the two delta columns.
top_deltas_df = (
    delta_df.head(40)
    .round(2)
    .drop(["Feature, PDB numbered", "Feature, MSA numbered", "Max_diff"], axis=1)
)
# Positional rename of the three remaining columns to their table headings.
top_deltas_df.columns = ["Interaction", "Delta Benzyl Penicillin", "Delta Cefotaxime"]
# The file name previously ended in a stray ")" and had no extension.
top_deltas_df.to_csv(r"peter_outputs/max_per_interaction_diffs.csv", index=False)

### Make pymol projections (onto TEM1) of these results

In [19]:
# get scores and output as pymol results files
benzyl_dict = dict(zip(
    delta_df["Feature, PDB numbered"].values,
    delta_df["delta_benzyl"].values
))

cefo_dict = dict(zip(
    delta_df["Feature, PDB numbered"].values,
    delta_df["delta_cefo"].values
))

project_pymol_top_features(
        per_feature_scores=benzyl_dict,
        model_name=f"benzyl",
        numb_features=100, 
        out_dir=rf"outputs/delta_special_general/"
    )

project_pymol_top_features(
        per_feature_scores=cefo_dict,
        model_name=f"cefotaxmine",
        numb_features=100, 
        out_dir=rf"outputs/delta_special_general/"
    )

The file: outputs/delta_special_general/benzyl_Pymol_Per_Feature_Scores.py was written to disk.
The file: outputs/delta_special_general/cefotaxmine_Pymol_Per_Feature_Scores.py was written to disk.
