### Notebook script to create the target variable for each system for KIF

The target variable is the sum of six catalytic distances. The distances were precalculated with CPPTRAJ and are loaded from the `target_variable/` directory in the cells below.

In [1]:
import pandas as pd
# Systems to process. Each name is substituted into the distance-table file
# names when the data are loaded.
PROTEINS = [
    "TEM1_1M40",
    "ENCA_3ZDJ",
    "GNCA_4B88",
    "PNCA_4C6Y",
]

In [2]:
# Load the precalculated distance tables for every entry in PROTEINS and
# collect them in `dfs` under the keys "benzyl_<protein>" and "cefo_<protein>".
dfs = {}
for protein in PROTEINS:
    # Read both whitespace-separated tables before storing either of them.
    benzyl_df = pd.read_csv(rf"target_variable/{protein}_Benzyl_Pen_dists.dat", sep=r"\s+")
    cefo_df = pd.read_csv(rf"target_variable/{protein}_Cefo_dists.dat", sep=r"\s+")

    # For each row keep the smallest of the four wat_glu_* values in a new
    # "min_wat_glu" column; the same step is applied to both tables.
    for table in (benzyl_df, cefo_df):
        table["min_wat_glu"] = table[
            ["wat_glu_1", "wat_glu_2", "wat_glu_3", "wat_glu_4"]
        ].min(axis=1)

    dfs.update({f"benzyl_{protein}": benzyl_df, f"cefo_{protein}": cefo_df})


In [3]:
# Show the first three rows of the "benzyl_TEM1_1M40" table.
dfs["benzyl_TEM1_1M40"].head(3)

Unnamed: 0,#Frame,wat_glu_1,wat_glu_2,wat_glu_3,wat_glu_4,wat_sub,ser130_sub,lys73_sub,asn132_lys73,a237_sub,ser70_sub,min_wat_glu
0,1,2.7292,3.2782,1.5387,2.931,3.4647,4.247,2.9703,2.6982,3.0044,2.9051,1.5387
1,2,2.6014,3.6341,2.3339,3.8409,3.2838,4.5521,3.8695,2.6282,3.0248,2.8725,2.3339
2,3,1.9322,2.9753,1.9513,3.4573,4.1776,4.4883,4.3973,3.129,2.7006,2.6614,1.9322


Calculate the target variable and append it to each dataframe as a new column.

In [4]:
# The six distance columns that are summed into the target variable.
# "min_wat_glu" is the per-row minimum of the four wat_glu_* columns;
# "ser130_sub" is present in the tables but is not part of the sum.
TARGET_DISTANCE_COLUMNS = [
    "wat_sub",
    "min_wat_glu",
    "lys73_sub",
    "asn132_lys73",
    "a237_sub",
    "ser70_sub",
]

for system_name, df in dfs.items():
    # Add whole columns instead of looping over rows with iterrows().
    # The columns are added left to right, in the same order the per-row
    # sum used, so the resulting values are unchanged. A missing value in
    # any of the six columns still yields NaN for that row, and a missing
    # column still raises KeyError.
    df["Target Variable"] = sum(df[column] for column in TARGET_DISTANCE_COLUMNS)

In [5]:
# Show the first three rows again; once the target-variable cell has run,
# the table also contains the "Target Variable" column.
dfs["benzyl_TEM1_1M40"].head(3)

Unnamed: 0,#Frame,wat_glu_1,wat_glu_2,wat_glu_3,wat_glu_4,wat_sub,ser130_sub,lys73_sub,asn132_lys73,a237_sub,ser70_sub,min_wat_glu,Target Variable
0,1,2.7292,3.2782,1.5387,2.931,3.4647,4.247,2.9703,2.6982,3.0044,2.9051,1.5387,16.5814
1,2,2.6014,3.6341,2.3339,3.8409,3.2838,4.5521,3.8695,2.6282,3.0248,2.8725,2.3339,18.0127
2,3,1.9322,2.9753,1.9513,3.4573,4.1776,4.4883,4.3973,3.129,2.7006,2.6614,1.9322,18.9981


Write the target variable out for each system (one value per line).

In [6]:
# Write each system's "Target Variable" column to
# target_variable/<system_name>_Regress.txt, one value per line.
# Opening with "w" replaces any existing file of the same name.
for system_name, df in dfs.items():
    with open(rf"target_variable/{system_name}_Regress.txt", "w") as handle:
        handle.writelines(f"{value}\n" for value in df["Target Variable"])