### Workup KIF results.

First thing to do is calculate the delta of the benzyl and cefo versions for all.

In [20]:
import pandas as pd
import numpy as np
from key_interactions_finder.pymol_projections import project_pymol_top_features

kif_files = {
    "TEM1": 
        {
            "benzyl": r"outputs/TEM1_1M40_benzyl/Linear_Correlations_Per_Feature_Scores.csv",
            "cefo": r"outputs/TEM1_1M40_cefo/Linear_Correlations_Per_Feature_Scores.csv"
        },
    "ENCA": 
        {
            "benzyl": r"outputs/ENCA_3ZDJ_benzyl/Linear_Correlations_Per_Feature_Scores.csv",
            "cefo": r"outputs/ENCA_3ZDJ_cefo/Linear_Correlations_Per_Feature_Scores.csv"
        },
    "GNCA": 
        {
            "benzyl": r"outputs/GNCA_4B88_benzyl/Linear_Correlations_Per_Feature_Scores.csv",
            "cefo": r"outputs/GNCA_4B88_cefo/Linear_Correlations_Per_Feature_Scores.csv"
        },
    "PNCA": 
        {
            "benzyl": r"outputs/PNCA_4C6Y_benzyl/Linear_Correlations_Per_Feature_Scores.csv",
            "cefo": r"outputs/PNCA_4C6Y_cefo/Linear_Correlations_Per_Feature_Scores.csv"
        }
}


In [3]:
all_dfs = {}
for system, substrates in kif_files.items():
    benzyl_df = pd.read_csv(substrates["benzyl"])
    cefo_df = pd.read_csv(substrates["cefo"])

    # merge on feature name, calc delta
    merged_df = benzyl_df.merge(cefo_df, how="outer", on="Feature")
    merged_df.columns = ["Feature", "Score_Benzyl", "Score_Cefo"]
    merged_df = merged_df.fillna(0.0)

    merged_df["Absolute Delta"] = abs(merged_df["Score_Benzyl"] - merged_df["Score_Cefo"])
    merged_df = merged_df.sort_values("Absolute Delta", ascending=False)

    # get scores and output as pymol results files
    delta_dict = dict(zip(
        merged_df["Feature"].values,
        merged_df["Absolute Delta"].values
    ))
    
    project_pymol_top_features(
            per_feature_scores=delta_dict,
            model_name=f"delta_{system}",
            numb_features=150, 
            out_dir=rf"outputs/{system}_Deltas"
        )
    
    all_dfs[system] = merged_df

The file: outputs/TEM1_Deltas/delta_TEM1_Pymol_Per_Feature_Scores.py was written to disk.
The file: outputs/ENCA_Deltas/delta_ENCA_Pymol_Per_Feature_Scores.py was written to disk.
The file: outputs/GNCA_Deltas/delta_GNCA_Pymol_Per_Feature_Scores.py was written to disk.
The file: outputs/PNCA_Deltas/delta_PNCA_Pymol_Per_Feature_Scores.py was written to disk.


### Print top results

In [8]:
all_dfs["TEM1"].round(decimals=2).reset_index(drop=True).head(20)

Unnamed: 0,Feature,Score_Benzyl,Score_Cefo,Absolute Delta
0,138Asp 153Arg Saltbr,-0.03,-0.61,0.58
1,112Leu 116Thr Hbond,-0.31,0.26,0.58
2,105Ser 209Lys Hbond,-0.04,0.53,0.57
3,50Leu 123Leu Hydrophobic,-0.25,0.25,0.5
4,111Asn 141Glu Hbond,-0.02,-0.52,0.5
5,135Thr 156Thr Hbond,-0.02,-0.49,0.48
6,47Phe 123Leu Hydrophobic,-0.08,0.39,0.48
7,132Asp 167Lys Saltbr,-0.04,0.43,0.47
8,50Leu 126Phe Hydrophobic,-0.17,0.28,0.45
9,18Arg 40Arg Hbond,-0.19,0.24,0.43


In [9]:
all_dfs["ENCA"].round(decimals=2).reset_index(drop=True).head(20)

Unnamed: 0,Feature,Score_Benzyl,Score_Cefo,Absolute Delta
0,79Glu 107Asn Hbond,-0.33,0.19,0.52
1,147Ala 215Arg Hbond,-0.31,0.03,0.34
2,81Ser 106Asp Hbond,-0.18,0.12,0.3
3,80Tyr 104Met Hbond,-0.2,-0.5,0.29
4,144Leu 147Ala Hydrophobic,0.29,0.0,0.29
5,81Ser 108Thr Hbond,-0.2,0.07,0.27
6,48Lys 107Asn Hbond,-0.23,0.03,0.26
7,145Asn 212Ala Hbond,-0.31,-0.05,0.26
8,139Arg 154Asp Saltbr,-0.28,-0.03,0.24
9,44Met 145Asn Hbond,-0.24,0.0,0.24


In [7]:
all_dfs["PNCA"].round(decimals=2).reset_index(drop=True).head(20)

Unnamed: 0,Feature,Score_Benzyl,Score_Cefo,Absolute Delta
0,79Tyr 106Asn Hbond,-0.18,0.15,0.33
1,25Asp 228Arg Saltbr,0.25,-0.07,0.32
2,78Asp 106Asn Hbond,-0.22,0.09,0.31
3,100Ala 208Lys Hbond,-0.19,0.1,0.29
4,194Arg 210Gly Hbond,0.0,0.28,0.28
5,80Ser 105Asp Hbond,-0.15,0.1,0.25
6,102Thr 184Trp Hbond,0.06,0.3,0.24
7,47Lys 104Ser Hbond,-0.54,-0.3,0.24
8,188Asn 207Asp Hbond,0.11,-0.13,0.24
9,101Ile 185Leu Hydrophobic,-0.19,0.04,0.23


In [5]:
all_dfs["GNCA"].round(decimals=2).reset_index(drop=True).head(20)

Unnamed: 0,Feature,Score_Benzyl,Score_Cefo,Absolute Delta
0,45Lys 102Ser Hbond,-0.49,0.01,0.5
1,75Val 78Ser Hbond,0.11,-0.14,0.25
2,78Ser 104Asn Hbond,-0.05,0.17,0.22
3,209Thr 245Arg Hbond,0.27,0.06,0.21
4,78Ser 103Asp Hbond,-0.32,-0.11,0.21
5,240Val 245Arg Hbond,-0.2,0.0,0.2
6,77Tyr 104Asn Hbond,-0.32,-0.12,0.2
7,213Gly 245Arg Hbond,-0.19,0.01,0.2
8,192Arg 246Asp Saltbr,-0.19,0.01,0.2
9,125Arg 132Thr Hbond,-0.06,0.13,0.19


### Calc per residue substrate and delta scores 

In [22]:
def prep_per_res_scores(feat_importances: dict) -> pd.DataFrame:
    """
    Prep dataset for per residue importance calculation. 
    """
    df_feat_import = pd.DataFrame(feat_importances.items())
    df_feat_import_res = df_feat_import[0].str.split(" +", expand=True)

    res1, res2, values = [], [], []
    res1 = (df_feat_import_res[0].str.extract(r"(\d+)")).astype(int)
    res2 = (df_feat_import_res[1].str.extract(r"(\d+)")).astype(int)
    # absolute values required as want to be able to sum linear correlations.
    values = df_feat_import[1].abs()

    per_res_df = pd.concat(
        [res1, res2, values], axis=1, join="inner")
    per_res_df.columns = ["Res1", "Res2", "Score"]

    return per_res_df

def calc_per_res_scores(per_res_df: pd.DataFrame) -> dict:
    """
    Sums all per features scores to determine the per residue score for each residue.

    Parameters
    ----------
    per_res_df : pd.DataFrame
        Dataframe with columns of both residues numbers and their
        corresponding per feature score.

    Returns
    ----------
    dict
        Keys are each residue, values are the residue's relative score.
    """
    max_res = max(per_res_df[["Res1", "Res2"]].max())
    res_ids = []
    tot_scores = []
    for i in range(1, max_res+1, 1):
        res_ids.append(i + 1)
        tot_scores.append(
            per_res_df.loc[per_res_df["Res1"] == i, "Score"].sum() +
            per_res_df.loc[per_res_df["Res2"] == i, "Score"].sum())

    # Rescale scores so that new largest has size 1.0
    # (good for PyMOL sphere representation as well).
    max_ori_score = max(tot_scores)
    tot_scores_scaled = []
    for i in range(1, max_res, 1):
        tot_scores_scaled.append(tot_scores[i] / max_ori_score)

    spheres = dict(sorted(zip(
        res_ids, tot_scores_scaled), key=lambda x: x[1], reverse=True))

    spheres = {keys: np.around(values, 5)
                for keys, values in spheres.items()}

    return spheres

In [48]:
benzyl_dicts, cefo_dicts, delta_dicts = {}, {}, {}
for system, df in all_dfs.items():

    benzyl_scores = dict(zip(df["Feature"].values, df["Score_Benzyl"]))
    cefo_scores = dict(zip(df["Feature"].values, df["Score_Cefo"]))
    delta_scores = dict(zip(df["Feature"].values, df["Absolute Delta"]))

    per_res_df_benzyl = prep_per_res_scores(benzyl_scores)
    per_res_df_cefo = prep_per_res_scores(cefo_scores)
    per_res_df_delta = prep_per_res_scores(delta_scores)

    benzyl_dicts[system] = calc_per_res_scores(per_res_df_benzyl)
    cefo_dicts[system] = calc_per_res_scores(per_res_df_cefo)
    delta_dicts[system] = calc_per_res_scores(per_res_df_delta)


In [49]:
delta_dicts["TEM1"]

{123: 1.0,
 135: 0.74389,
 168: 0.70411,
 50: 0.69489,
 126: 0.66914,
 40: 0.59812,
 18: 0.57161,
 145: 0.5481,
 137: 0.54294,
 47: 0.50616,
 114: 0.49433,
 196: 0.48751,
 58: 0.47675,
 116: 0.46612,
 209: 0.46209,
 156: 0.45547,
 132: 0.43994,
 153: 0.43803,
 197: 0.43694,
 54: 0.43371,
 215: 0.41814,
 241: 0.4128,
 167: 0.40599,
 113: 0.39417,
 214: 0.39361,
 117: 0.39311,
 139: 0.39025,
 141: 0.38022,
 136: 0.37195,
 237: 0.36068,
 111: 0.34903,
 105: 0.33582,
 120: 0.33011,
 147: 0.32536,
 155: 0.31904,
 112: 0.30363,
 218: 0.30023,
 8: 0.29984,
 224: 0.28957,
 165: 0.28443,
 239: 0.27885,
 255: 0.27641,
 195: 0.27209,
 146: 0.26671,
 248: 0.25782,
 142: 0.25452,
 144: 0.25451,
 221: 0.25207,
 233: 0.25206,
 179: 0.25077,
 182: 0.25007,
 162: 0.24951,
 19: 0.2494,
 189: 0.24621,
 138: 0.24371,
 252: 0.23692,
 220: 0.23585,
 12: 0.23524,
 258: 0.23313,
 4: 0.23223,
 15: 0.23072,
 103: 0.23047,
 234: 0.21738,
 152: 0.21681,
 174: 0.21625,
 159: 0.21579,
 51: 0.21317,
 22: 0.2128,
 17

In [None]:
# TODO - make plots of per residue differences with pymol. 

### Do an MSA Alingment to compare residues

In [38]:
def parse_fasta(file_path: str) -> dict[str, str]:
    """
    Takes in a msa alignment produced by modeller in fasta formating and
    outputs a sequence as string with '-' for the gaps introduced by msa
    """
    sequences = {}
    current_name = None
    current_sequence = []

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()

            if line.startswith(">"):
                if current_name is not None:
                    sequences[current_name] = "".join(current_sequence)
                    current_sequence = []

                current_name = line[1:]
            else:
                current_sequence.append(line)

        # Add the last sequence to the dictionary
        if current_name is not None:
            sequences[current_name] = "".join(current_sequence)

    return sequences

def create_pdb_to_msa_converter(msa_sequence: str) -> dict[int, int]:
    """
    Create a dictionary that can enable easily converting
    between pdb and msa numbers given an msa_sequence.

    Parameters
    ----------
    msa_sequence: str
        msa sequence of the target protein.

    Returns
    ----------
    dict[int, int]
        key is the pdb residue number, value is the msa residue number.
    """
    curr_msa_number, curr_pdb_numb = 0, 0
    index_pdb_msa = {}
    for msa_residue in msa_sequence:
        if msa_residue == "*":
            continue
        if msa_residue == "-":
            curr_msa_number += 1
        else:
            curr_msa_number += 1
            curr_pdb_numb += 1

            index_pdb_msa[curr_pdb_numb] = curr_msa_number
    return index_pdb_msa

In [39]:
ALINGMENT_FILE = r"raw_data/align1d.ali"
msa_seqs = parse_fasta(ALINGMENT_FILE)
msa_seqs

{'4C6Y_PNCA': '--AAALSEQLAELEKRSGGRLGVAVLDTATGRRI-AYRGDERFPMCSTFKALLAAAVLAQVDQGKERLDRRITYSKADLVDYSPVTEKHVGGGMTVAELCEAAITYSDNTAANLLLEALGGPAALTAFLRSIGDEVTRLDRWEPELNEALPGDPRDTTTPRAMAATLRKLLLGDALSPASREQLVDWLLANKTGDKRLRAGLPADWRVGDKTGTGGHGTTNDIAVIWPPNRAP-IVVAVYYTESQADAEARDAVIAEVGRLVAEAF*',
 '3ZDJ_ENCA': 'MHPQTL-EQIKESESQLSGRVGMVELDLASGRTL-SYRADERFPMMSTFKVLLCGAVLARVDAGLEQLDRRIHYRQQDLVEYSPVTEKHLADGMTVAELCAAAITMSDNTAANLLLATIGGPAGLTAFLRNIGDNVTRLDRWETELNEALPGDERDTTTPAAMAATLRKLLTGEILSAASRQQLITWMVADKVAGPLLRSVLPAGWFIADKTGAGERGSRGIIAVLGPDGKPSRIVV-IYLTETQASMDERNQQIAEIGAALIEHW*',
 '4B88_GNCA': '---Q-LSEQLAELEKRSGGRLGVAVLDTATGRRI-AYRGDERFPMCSTFKALLAAAVLARVDQGKERLDRRITYGKEDLVDYSPVTEKHVGDGMTVAELCEAAITLSDNTAANLLLEALGGPAALTAFLRSIGDEVTRLDRWEPELNEAAPGDPRDTTTPAAMAATLRTLLLGDALSPASRQQLVDWLVANKTGDKRLRAGLPADWRVGDKTGTGGHGTTNDIAVIWPPGRAP-IVVTVYLTESQVDADARDAVIAEVGRLVVEAF*',
 '1M40_TEM-1': '-HPETL-VKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFL

In [45]:
msa_converters = {
    "TEM1": create_pdb_to_msa_converter(msa_seqs["1M40_TEM-1"]),
    "ENCA": create_pdb_to_msa_converter(msa_seqs["3ZDJ_ENCA"]),
    "GNCA": create_pdb_to_msa_converter(msa_seqs["4B88_GNCA"]),
    "PNCA": create_pdb_to_msa_converter(msa_seqs["4C6Y_PNCA"])
}

In [76]:
msa_benzyl_per_res = {}
for system, benzyl_dict in benzyl_dicts.items():
    msa_converter = msa_converters[system]
    
    # convert per residue scores to msa_indexed per res scores 
    msa_per_res_dict = {msa_converter[key]: value for key, value in benzyl_dict.items() if key in msa_converter}

    msa_benzyl_per_res[system] = msa_per_res_dict

msa_benzyl_df = pd.DataFrame(msa_benzyl_per_res).sort_index().fillna(0.0)
msa_benzyl_df.head(2)

Unnamed: 0,TEM1,ENCA,GNCA,PNCA
2,0.0,0.04836,0.0,0.0
3,0.013,0.07195,0.0,0.0


In [77]:
msa_cefo_per_res = {}
for system, cefo_dict in cefo_dicts.items():
    msa_converter = msa_converters[system]
    
    # convert per residue scores to msa_indexed per res scores 
    msa_per_res_dict = {msa_converter[key]: value for key, value in cefo_dict.items() if key in msa_converter}

    msa_cefo_per_res[system] = msa_per_res_dict

msa_cefo_df = pd.DataFrame(msa_cefo_per_res).sort_index().fillna(0.0)
msa_cefo_df.head(2)

Unnamed: 0,TEM1,ENCA,GNCA,PNCA
2,0.0,0.19539,0.0,0.0
3,0.06723,0.11242,0.0,0.0


In [78]:
msa_delta_per_res = {}
for system, delta_dict in delta_dicts.items():
    msa_converter = msa_converters[system]
    
    # convert per residue scores to msa_indexed per res scores 
    msa_per_res_dict = {msa_converter[key]: value for key, value in delta_dict.items() if key in msa_converter}

    msa_delta_per_res[system] = msa_per_res_dict

msa_delta_df = pd.DataFrame(msa_delta_per_res).sort_index().fillna(0.0)
msa_delta_df.head(2)

Unnamed: 0,TEM1,ENCA,GNCA,PNCA
2,0.0,0.22505,0.0,0.0
3,0.0492,0.1022,0.0,0.0


### Make scatter plot of delta values and other ones on msa index

In [79]:
# delta per residue
import plotly.graph_objects as go
import plotly.io as pio

fig = go.Figure()

# Add traces
fig.add_trace(go.Scatter(x=msa_delta_df.index, y=msa_delta_df["TEM1"],
                    mode='markers', name="TEM1"))
fig.add_trace(go.Scatter(x=msa_delta_df.index, y=msa_delta_df["ENCA"],
                    mode='markers', name="ENCA"))
fig.add_trace(go.Scatter(x=msa_delta_df.index, y=msa_delta_df["GNCA"],
                    mode='markers', name="GNCA"))
fig.add_trace(go.Scatter(x=msa_delta_df.index, y=msa_delta_df["PNCA"],
                    mode='markers', name="PNCA"))


fig.update_layout(
    template="plotly_white",
    xaxis=dict(title="MSA Residue Number", titlefont=dict(size=32)),
    yaxis=dict(title="Absolute Delta Per Residue Score", titlefont=dict(size=32)),
    margin=dict(l=20, r=20, t=20, b=20),
    font_family="Arial",
    width=1000,
    height=600
)
fig.update_xaxes(showline=True, linewidth=2, linecolor='black', mirror=True,
                 ticks="outside", tickwidth=2, tickcolor='black', ticklen=10,
                 tickfont=dict(color='black', size=16))
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', mirror=True,
                 ticks="outside", tickwidth=2, tickcolor='black', ticklen=10,
                 tickfont=dict(color='black', size=22))

fig.show()

# fig.show("svg") # remove "svg" to make the figure interactive.

pio.write_image(fig, r"pics/msa_per_res_deltas.png", scale=6)

In [71]:
msa_delta_df.corr().round(2)

Unnamed: 0,TEM1,ENCA,GNCA,PNCA
TEM1,1.0,0.61,0.31,0.31
ENCA,0.61,1.0,0.48,0.51
GNCA,0.31,0.48,1.0,0.69
PNCA,0.31,0.51,0.69,1.0


In [80]:
msa_benzyl_df.corr().round(2)

Unnamed: 0,TEM1,ENCA,GNCA,PNCA
TEM1,1.0,0.67,0.48,0.47
ENCA,0.67,1.0,0.4,0.42
GNCA,0.48,0.4,1.0,0.7
PNCA,0.47,0.42,0.7,1.0


In [81]:
msa_cefo_df.corr().round(2)

Unnamed: 0,TEM1,ENCA,GNCA,PNCA
TEM1,1.0,0.65,0.51,0.43
ENCA,0.65,1.0,0.52,0.59
GNCA,0.51,0.52,1.0,0.7
PNCA,0.43,0.59,0.7,1.0
