### Running Key Interactions Finder (KIF) on both substrates for all four enzymes

In [1]:
from pathlib import Path

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

from key_interactions_finder import data_preperation
from key_interactions_finder import post_proccessing
from key_interactions_finder import pymol_projections
from key_interactions_finder import stat_modelling

In [2]:
# Input folders (paths are relative; no leading slash).
RAW_DATA = "raw_data"            # per-protein "*_all_contacts.csv" feature files
TARGET_DATA = "target_variable"  # per-protein "*_Regress.txt" target files
PDB_FILES = "pdb_files"

# Output folder: one sub-folder per protein/substrate pair is written below it.
OUT_DIR = "outputs"

# Enzymes analysed; each name is used as the prefix of the input file names.
PROTEINS = [
    "TEM1_1M40",
    "ENCA_3ZDJ",
    "GNCA_4B88",
    "PNCA_4C6Y",
]

### Prepare benzyl penicillin datasets


In [3]:
def _load_benzyl_dataset(protein):
    """Build the filtered benzyl penicillin dataset for one protein.

    Reads the protein's contacts CSV and regression target file, applies the
    occupancy and interaction-type filters, and returns a deep copy of the
    filtered features with the target values inserted as the first column
    (named "Target").
    """
    target_path = fr"{TARGET_DATA}/{protein}_Benzyl_Pen_Regress.txt"
    contacts_path = fr"{RAW_DATA}/{protein}_Benzyl_Pen_all_contacts.csv"

    feature_data = data_preperation.SupervisedFeatureData(
        input_df=pd.read_csv(contacts_path),
        target_file=target_path,
        is_classification=False,
        header_present=False,
    )

    # The target file has no header row, so it is read with header=None.
    target_values = pd.read_csv(target_path, header=None).values

    # Filters are applied in this order: occupancy first, then interaction type.
    feature_data.filter_by_occupancy(min_occupancy=25)
    feature_data.filter_by_interaction_type(
        interaction_types_included=["Hbond", "Saltbr", "Hydrophobic"]
    )

    # Deep copy so that adding the target column leaves `df_filtered` untouched.
    dataset = feature_data.df_filtered.copy(deep=True)
    dataset.insert(loc=0, column="Target", value=target_values)
    return dataset


# Maps each protein name to its prepared benzyl penicillin dataset.
preped_benzyl_prots = {
    protein: _load_benzyl_dataset(protein) for protein in PROTEINS
}

Your PyContact features and target variable have been succesufully merged.
You can access this dataset through the class attribute: '.df_processed'.
Your PyContact features and target variable have been succesufully merged.
You can access this dataset through the class attribute: '.df_processed'.
Your PyContact features and target variable have been succesufully merged.
You can access this dataset through the class attribute: '.df_processed'.
Your PyContact features and target variable have been succesufully merged.
You can access this dataset through the class attribute: '.df_processed'.


### Run the stats analysis and save main results on the benzyl dataset

In [4]:
# For every protein: score each feature against the target with mutual
# information and linear correlation, summarise the scores per residue, and
# write PyMOL projection scripts for both the per-feature and per-residue results.
for protein, filtered_df in preped_benzyl_prots.items():
    out_dir = fr"{OUT_DIR}/{protein}_benzyl"

    stat_model = stat_modelling.RegressionStatModel(
        dataset=filtered_df,
        out_dir=out_dir,
        interaction_types_included=["Hbond", "Hydrophobic", "Saltbr"],
    )
    stat_model.calc_mutual_info_to_target()
    stat_model.calc_linear_correl_to_target()

    post_proc = post_proccessing.StatRegressorPostProcessor(
        stat_model=stat_model,
        out_dir=out_dir,
    )

    # Per-residue scores, keyed by the stat method that produced them.
    per_res_scores = {
        stat_method: post_proc.get_per_res_scores(stat_method=stat_method)
        for stat_method in ("mutual_information", "linear_correlation")
    }

    # Per-feature projections (all features), one script per stat method.
    for feature_scores, model_name in (
        (stat_model.mutual_infos, "mutual_info"),
        (stat_model.linear_correlations, "linear_correls"),
    ):
        pymol_projections.project_pymol_top_features(
            per_feature_scores=feature_scores,
            model_name=model_name,
            numb_features="all",
            out_dir=out_dir,
        )

    # Per-residue projections, one script per stat method.
    # NOTE(review): "linnear_correls" is misspelt, but it is part of the output
    # file name, so it is kept as-is here; rename only together with any
    # consumers of the written files.
    for stat_method, model_name in (
        ("mutual_information", "mutual_infos"),
        ("linear_correlation", "linnear_correls"),
    ):
        pymol_projections.project_pymol_per_res_scores(
            per_res_scores=per_res_scores[stat_method],
            model_name=model_name,
            out_dir=out_dir,
        )


Mutual information scores calculated.
outputs/TEM1_1M40_benzyl/Mutual_Information_Per_Feature_Scores.csv written to disk.
You can also access these results via the class attribute: 'mutual_infos'.
Linear correlations calculated.
outputs/TEM1_1M40_benzyl/Linear_Correlations_Per_Feature_Scores.csv written to disk.
You can also access these results via the class attribute: 'linear_correlations'.
outputs/TEM1_1M40_benzyl/Mutual_Information_Scores_Per_Residue.csv written to disk.
outputs/TEM1_1M40_benzyl/Linear_Correlation_Scores_Per_Residue.csv written to disk.
The file: outputs/TEM1_1M40_benzyl/mutual_info_Pymol_Per_Feature_Scores.py was written to disk.
The file: outputs/TEM1_1M40_benzyl/linear_correls_Pymol_Per_Feature_Scores.py was written to disk.
The file: outputs/TEM1_1M40_benzyl/mutual_infos_Pymol_Per_Res_Scores.py was written to disk.
The file: outputs/TEM1_1M40_benzyl/linnear_correls_Pymol_Per_Res_Scores.py was written to disk.
Mutual information scores calculated.
outputs/ENCA_3

### Prepare cefotaxime datasets

In [5]:
def _load_cefo_dataset(protein):
    """Build the filtered cefotaxime dataset for one protein.

    Reads the protein's contacts CSV and regression target file, applies the
    occupancy and interaction-type filters, and returns a deep copy of the
    filtered features with the target values inserted as the first column
    (named "Target").
    """
    target_path = fr"{TARGET_DATA}/{protein}_Cefo_Regress.txt"
    contacts_path = fr"{RAW_DATA}/{protein}_Cefo_all_contacts.csv"

    feature_data = data_preperation.SupervisedFeatureData(
        input_df=pd.read_csv(contacts_path),
        target_file=target_path,
        is_classification=False,
        header_present=False,
    )

    # The target file has no header row, so it is read with header=None.
    target_values = pd.read_csv(target_path, header=None).values

    # Filters are applied in this order: occupancy first, then interaction type.
    feature_data.filter_by_occupancy(min_occupancy=25)
    feature_data.filter_by_interaction_type(
        interaction_types_included=["Hbond", "Saltbr", "Hydrophobic"]
    )

    # Deep copy so that adding the target column leaves `df_filtered` untouched.
    dataset = feature_data.df_filtered.copy(deep=True)
    dataset.insert(loc=0, column="Target", value=target_values)
    return dataset


# Maps each protein name to its prepared cefotaxime dataset.
preped_cefo_prots = {
    protein: _load_cefo_dataset(protein) for protein in PROTEINS
}

Your PyContact features and target variable have been succesufully merged.
You can access this dataset through the class attribute: '.df_processed'.
Your PyContact features and target variable have been succesufully merged.
You can access this dataset through the class attribute: '.df_processed'.
Your PyContact features and target variable have been succesufully merged.
You can access this dataset through the class attribute: '.df_processed'.
Your PyContact features and target variable have been succesufully merged.
You can access this dataset through the class attribute: '.df_processed'.


In [6]:
# For every protein: score each feature against the target with mutual
# information and linear correlation, summarise the scores per residue, and
# write PyMOL projection scripts for both the per-feature and per-residue results.
for protein, filtered_df in preped_cefo_prots.items():
    out_dir = fr"{OUT_DIR}/{protein}_cefo"

    stat_model = stat_modelling.RegressionStatModel(
        dataset=filtered_df,
        out_dir=out_dir,
        interaction_types_included=["Hbond", "Hydrophobic", "Saltbr"],
    )
    stat_model.calc_mutual_info_to_target()
    stat_model.calc_linear_correl_to_target()

    post_proc = post_proccessing.StatRegressorPostProcessor(
        stat_model=stat_model,
        out_dir=out_dir,
    )

    # Per-residue scores, keyed by the stat method that produced them.
    per_res_scores = {
        stat_method: post_proc.get_per_res_scores(stat_method=stat_method)
        for stat_method in ("mutual_information", "linear_correlation")
    }

    # Per-feature projections (all features), one script per stat method.
    for feature_scores, model_name in (
        (stat_model.mutual_infos, "mutual_info"),
        (stat_model.linear_correlations, "linear_correls"),
    ):
        pymol_projections.project_pymol_top_features(
            per_feature_scores=feature_scores,
            model_name=model_name,
            numb_features="all",
            out_dir=out_dir,
        )

    # Per-residue projections, one script per stat method.
    # NOTE(review): "linnear_correls" is misspelt, but it is part of the output
    # file name, so it is kept as-is here; rename only together with any
    # consumers of the written files.
    for stat_method, model_name in (
        ("mutual_information", "mutual_infos"),
        ("linear_correlation", "linnear_correls"),
    ):
        pymol_projections.project_pymol_per_res_scores(
            per_res_scores=per_res_scores[stat_method],
            model_name=model_name,
            out_dir=out_dir,
        )


Mutual information scores calculated.
outputs/TEM1_1M40_cefo/Mutual_Information_Per_Feature_Scores.csv written to disk.
You can also access these results via the class attribute: 'mutual_infos'.
Linear correlations calculated.
outputs/TEM1_1M40_cefo/Linear_Correlations_Per_Feature_Scores.csv written to disk.
You can also access these results via the class attribute: 'linear_correlations'.
outputs/TEM1_1M40_cefo/Mutual_Information_Scores_Per_Residue.csv written to disk.
outputs/TEM1_1M40_cefo/Linear_Correlation_Scores_Per_Residue.csv written to disk.
The file: outputs/TEM1_1M40_cefo/mutual_info_Pymol_Per_Feature_Scores.py was written to disk.
The file: outputs/TEM1_1M40_cefo/linear_correls_Pymol_Per_Feature_Scores.py was written to disk.
The file: outputs/TEM1_1M40_cefo/mutual_infos_Pymol_Per_Res_Scores.py was written to disk.
The file: outputs/TEM1_1M40_cefo/linnear_correls_Pymol_Per_Res_Scores.py was written to disk.
Mutual information scores calculated.
outputs/ENCA_3ZDJ_cefo/Mutual_

### Make violin plots of the regression target variable

In [7]:
# Spot-check the target column of one prepared cefotaxime dataset.
enca_cefo_dataset = preped_cefo_prots["ENCA_3ZDJ"]
enca_cefo_dataset["Target"]

0       16.01
1       17.95
2       16.96
3       17.79
4       15.64
        ...  
9995    17.75
9996    17.18
9997    16.45
9998    16.49
9999    16.85
Name: Target, Length: 10000, dtype: float64

In [8]:
# Spot-check which proteins have a prepared cefotaxime dataset.
preped_cefo_prots.keys()

dict_keys(['TEM1_1M40', 'ENCA_3ZDJ', 'GNCA_4B88', 'PNCA_4C6Y'])

In [9]:
# Collect every target value into long format: one row per value, labelled
# with the substrate and protein it belongs to. Benzyl penicillin rows come
# first, then cefotaxime; within a substrate, proteins follow dict order.
datasets_by_substrate = (
    ("Benzyl Penicillin", preped_benzyl_prots),
    ("Cefotaxime", preped_cefo_prots),
)

substrate_list, protein_list, target_list = [], [], []
for substrate_name, prepared_prots in datasets_by_substrate:
    for protein, df in prepared_prots.items():
        target_values = df["Target"]
        n_values = len(target_values)

        target_list.extend(target_values)
        protein_list.extend([protein] * n_values)
        substrate_list.extend([substrate_name] * n_values)

regress_df = pd.DataFrame(
    {
        "Substrate": substrate_list,
        "Protein": protein_list,
        "Target Value": target_list,
    }
)
regress_df

Unnamed: 0,Substrate,Protein,Target Value
0,Benzyl Penicillin,TEM1_1M40,16.5814
1,Benzyl Penicillin,TEM1_1M40,18.0127
2,Benzyl Penicillin,TEM1_1M40,18.9981
3,Benzyl Penicillin,TEM1_1M40,18.4842
4,Benzyl Penicillin,TEM1_1M40,19.1135
...,...,...,...
79995,Cefotaxime,PNCA_4C6Y,17.1000
79996,Cefotaxime,PNCA_4C6Y,17.8300
79997,Cefotaxime,PNCA_4C6Y,17.0200
79998,Cefotaxime,PNCA_4C6Y,17.0100


In [15]:
# Grouped violin plot of the regression target: one violin per protein and
# substrate, with the two substrates drawn side by side for each protein.
substrate_colors = {
    "Benzyl Penicillin": "mediumpurple",
    "Cefotaxime": "lightseagreen",
}

fig = go.Figure()
for substrate, line_color in substrate_colors.items():
    # Select this substrate's rows once and reuse them for both axes.
    substrate_rows = regress_df.loc[regress_df["Substrate"] == substrate]
    fig.add_trace(
        go.Violin(
            x=substrate_rows["Protein"],
            y=substrate_rows["Target Value"],
            legendgroup=substrate,
            scalegroup=substrate,
            name=substrate,
            line_color=line_color,
        )
    )

# Characteristics shared by all traces.
fig.update_traces(meanline_visible=True)

fig.update_layout(
    violinmode="group",
    template="plotly_white",
    yaxis=dict(
        # `title.font` replaces the deprecated `titlefont` shorthand, which
        # newer plotly releases no longer accept.
        title=dict(text="Sum of Catalytic Distances (Å)", font=dict(size=32)),
        range=[14, 28],
    ),
    margin=dict(l=20, r=20, t=20, b=20),
    showlegend=True,
    font_family="Arial",
    width=1000,
    height=600,
    legend=dict(
        x=0.7, y=0.97,
        font=dict(family="Arial", size=20, color="black"),
        bordercolor="Black",
        borderwidth=1.5,
    ),
)

# Both axes share the same boxed, outward-ticked style; only the tick label
# size differs between them.
axis_style = dict(
    showline=True, linewidth=2, linecolor="black", mirror=True,
    ticks="outside", tickwidth=2, tickcolor="black", ticklen=10,
    showgrid=False,
)
fig.update_xaxes(tickfont=dict(color="black", size=16), **axis_style)
fig.update_yaxes(tickfont=dict(color="black", size=22), **axis_style)

fig.show()

# Create the output folder first so the export also works on a fresh checkout.
image_path = Path("pics") / "compare_regress_dists.png"
image_path.parent.mkdir(parents=True, exist_ok=True)
pio.write_image(fig, str(image_path), scale=6)