### Comparisons to relevant external data

#### Part 1: First to test is the tem1_fitness_data


TODO - look at unobserved contacts with high conservation and compare them to the fitness data.


In [1]:
import json
import numpy as np
import pandas as pd
from scipy.stats import spearmanr, pearsonr
import plotly.express as px

from tools_proj.utils import normalise_dict_values
from tools_proj.pymol_projections import project_pymol_per_res_scores

In [2]:
# Per-residue fitness scores for TEM-1, stored on disk as a JSON object.
TEM1_DATA = r"tem1_fitness_data/per_res_fitness_scores.json"

# Read the scores from disk (nothing is written by this cell).
with open(TEM1_DATA, "r", encoding="utf-8") as fitness_file:
    fitness_by_res_str = json.load(fitness_file)

# JSON object keys are always strings, so turn the residue numbers into ints.
tem1_fitness_data = {
    int(res_numb): fitness for res_numb, fitness in fitness_by_res_str.items()
}
print(tem1_fitness_data)

{1: 18.036298153152153, 2: 19.088503571727024, 3: 19.18729778432305, 4: 13.055777177182305, 5: 15.530248192415675, 6: 15.98892164800085, 7: 16.307660560842073, 8: 7.283753938308355, 9: 17.959080225465684, 10: 17.77356066587851, 11: 12.604555670016731, 12: 9.60842517226543, 13: 17.518344605223238, 14: 19.108812964771367, 15: 18.52885969077448, 16: 16.871478380317203, 17: 12.949976516018966, 18: 18.350437003118046, 19: 9.626997801042519, 20: 3.7038516974963547, 21: 6.639720366414989, 22: 11.492548634343676, 23: 17.021092974193273, 24: 11.687420455013871, 25: 12.782299084654516, 26: 13.40580973452623, 27: 19.681490946613874, 28: 13.902983033544452, 29: 15.264542371255716, 30: 18.78892091462137, 31: 17.87428464294577, 32: 16.872858648778788, 33: 16.611167174991138, 34: 18.56946629021208, 35: 13.721766977376808, 36: 8.816286288996901, 37: 19.558122960069156, 38: 18.670717716784793, 39: 19.019322962125276, 40: 17.184303765182065, 41: 4.602471273274842, 42: 12.611231396004513, 43: 9.850341187

#### Now extract the res-res scores

In [3]:
TEM1_RES_RES_SCORES_FILE = r"../contact_analysis/multi_structure_test/conservation_2_dict_tem1.txt"

# Every line holds three whitespace-separated fields: residue 1, residue 2 and
# the score of that residue pair. The pair becomes the dict key.
tem1_res_res_data = {}
with open(TEM1_RES_RES_SCORES_FILE, "r", encoding="utf-8") as scores_file:
    for record in scores_file:
        fields = record.split()
        res_pair = (int(fields[0]), int(fields[1]))
        tem1_res_res_data[res_pair] = float(fields[2])
print(tem1_res_res_data)

{(1, 4): 0.043478260869565216, (2, 6): 0.17391304347826086, (3, 7): 0.0, (4, 8): 0.13043478260869565, (5, 9): 0.43478260869565216, (6, 10): 0.5652173913043478, (7, 10): 0.10144927536231885, (7, 11): 0.463768115942029, (7, 254): 0.08695652173913043, (8, 12): 0.014492753623188406, (8, 19): 0.0, (8, 35): 0.15942028985507245, (9, 13): 1.0, (10, 14): 1.0, (11, 15): 0.927536231884058, (12, 16): 1.0, (12, 19): 0.10144927536231885, (12, 36): 0.10144927536231885, (15, 241): 0.08695652173913043, (18, 238): 0.37681159420289856, (19, 35): 0.10144927536231885, (20, 237): 0.7101449275362319, (21, 35): 0.9710144927536232, (22, 31): 0.08695652173913043, (22, 158): 0.855072463768116, (22, 162): 0.08695652173913043, (22, 235): 0.6666666666666666, (23, 33): 0.9855072463768116, (23, 232): 0.08695652173913043, (24, 233): 0.6666666666666666, (25, 28): 0.07246376811594203, (25, 29): 0.2463768115942029, (25, 30): 0.6086956521739131, (26, 166): 0.2318840579710145, (26, 231): 0.7681159420289855, (26, 233): 0.40

### Convert the res-res scores to per res scores

In [4]:
# Collapse the pairwise (res-res) scores into two per-residue metrics:
#   metric 1 - product of the folded scores of every contact the residue is part of
#   metric 2 - metric 1 multiplied by the number of those contacts
all_per_res_conserv_1, all_per_res_conserv_2 = {}, {}
for res_numb in range(1, 263 + 1):
    # Fold each score around 0.5 so that very rarely observed and very highly
    # observed contacts are treated as the same.
    folded_scores = [
        max(1 - score, score)
        for res_pair, score in tem1_res_res_data.items()
        if res_numb in res_pair
    ]

    # np.prod of an empty list is 1.0, so a residue without any contact gets
    # 1.0 for metric 1 and 0.0 for metric 2.
    score_product = np.prod(folded_scores)
    all_per_res_conserv_1[res_numb] = score_product
    all_per_res_conserv_2[res_numb] = score_product * len(folded_scores)

print(all_per_res_conserv_1)

{1: 0.9565217391304348, 2: 0.8260869565217391, 3: 1.0, 4: 0.831758034026465, 5: 0.5652173913043479, 6: 0.46691871455576556, 7: 0.4399331525163694, 8: 0.7203455613088225, 9: 0.5652173913043479, 10: 0.5078764965343415, 11: 0.4973745011552194, 12: 0.7956920510549178, 13: 1.0, 14: 1.0, 15: 0.8468809073724008, 16: 1.0, 17: 1.0, 18: 0.6231884057971014, 19: 0.8073934047469019, 20: 0.7101449275362319, 21: 0.9710144927536232, 22: 0.4752198569902195, 23: 0.899810964083176, 24: 0.6666666666666666, 25: 0.42548605974265546, 26: 0.35058095820814644, 27: 1.0, 28: 0.927536231884058, 29: 0.7536231884057971, 30: 0.6086956521739131, 31: 0.9130434782608696, 32: 1.0, 33: 0.9855072463768116, 34: 1.0, 35: 0.7334106523717767, 36: 0.8855282503675699, 37: 0.8695652173913043, 38: 1.0, 39: 0.9855072463768116, 40: 0.5614458051377587, 41: 0.1903938307824801, 42: 0.45368620037807184, 43: 0.8443604284814115, 44: 0.7971014492753623, 45: 1.0, 46: 0.914723797521529, 47: 0.4614181042224109, 48: 0.831758034026465, 49: 0.7

In [5]:
# Replace metric 2 with the output of the project's normalisation helper.
# NOTE(review): presumably min-max scaling to [0, 1] - confirm in tools_proj.utils.
# The name is rebound to the normalised dict, so re-running this cell alone feeds
# already-normalised values back into the helper.
all_per_res_conserv_2 = normalise_dict_values(original_dict=all_per_res_conserv_2)
print(all_per_res_conserv_2)

{1: 0.2603061782394751, 2: 0.2248098812068194, 3: 0.2721382772503603, 4: 0.4527063969382175, 5: 0.15381728714150802, 6: 0.25413290919031756, 7: 0.47889060092449914, 8: 0.7841344003181071, 9: 0.30763457428301605, 10: 0.41463790446841287, 11: 0.27070927978527753, 12: 0.8661530559835643, 13: 0.2721382772503603, 14: 0.2721382772503603, 15: 0.4609374223370943, 16: 0.2721382772503603, 17: 0.0, 18: 0.16959341915602166, 19: 0.6591679506933743, 20: 0.19325761717779213, 21: 0.2642502112431035, 22: 0.5173020527859238, 23: 0.4897460112331627, 24: 0.18142551816690689, 25: 0.34737312987723046, 26: 0.2862194940106366, 27: 0.0, 28: 0.2524181122322183, 29: 0.20508971618867736, 30: 0.16564938615239325, 31: 0.24847407922858988, 32: 0.0, 33: 0.26819424424673194, 34: 0.0, 35: 0.5987673343605546, 36: 0.48197226502311247, 37: 0.23664198021770463, 38: 0.0, 39: 0.26819424424673194, 40: 0.4583726825388934, 41: 0.20725379643296302, 42: 0.24693076196630048, 43: 0.45956558477061477, 44: 0.21692181519956258, 45: 0.

### Make some plots to compare to experiment

In [6]:
# One dict per metric gives one row per metric; transposing puts the residue
# numbers on the index and the metrics in the columns.
per_res_metric_dicts = [tem1_fitness_data, all_per_res_conserv_1, all_per_res_conserv_2]
per_res_metric_names = ["fitness", "conserv score1", "conserv score2"]

per_res_df = pd.DataFrame(per_res_metric_dicts).T
per_res_df.columns = per_res_metric_names
per_res_df

Unnamed: 0,fitness,conserv score1,conserv score2
1,18.036298,0.956522,0.260306
2,19.088504,0.826087,0.224810
3,19.187298,1.000000,0.272138
4,13.055777,0.831758,0.452706
5,15.530248,0.565217,0.153817
...,...,...,...
259,10.214448,0.739130,0.402291
260,13.961234,0.589792,0.481515
261,18.816735,1.000000,0.272138
262,17.459732,1.000000,0.272138


In [43]:
# Correlate experimental fitness with the second conservation metric.
fitness_values = per_res_df["fitness"]
conserv_values = per_res_df["conserv score2"]

# Both scipy functions return (statistic, p-value); only the statistic is reported.
pearson, p = pearsonr(fitness_values, conserv_values)
spearman, p = spearmanr(fitness_values, conserv_values)

print(f"Pearson r: {pearson:.3f}")
print(f"Spearmans correlation coefficient: {spearman:.3f}")

Pearson r: -0.385
Spearmans correlation coefficient: -0.372


In [42]:
# Scatter plot of experimental fitness against the second conservation metric.
fitness_scatter_layout = {
    "xaxis_title": "Fitness (from Experiment)",
    "yaxis_title": "Residue Conservation Score",
    "legend_title": "Legend Title",
    "font": {"size": 18},
}

fig = px.scatter(per_res_df, x="fitness", y="conserv score2")
fig.update_layout(**fitness_scatter_layout)
fig.show()

### Make some pymol figures of conservation scores

In [9]:
# Hand the (normalised) metric-2 scores to the project's PyMOL helper, which
# writes its result to the given .py file.
# NOTE(review): presumably the output is a PyMOL script that colours residues by
# score - confirm in tools_proj.pymol_projections.
project_pymol_per_res_scores(
    per_res_scores=all_per_res_conserv_2,
    out_file= r"comparison_results/tem1_per_res_scores.py"
)

The file: comparison_results/tem1_per_res_scores.py was written to disk.


## Part 2: Next is the coupling strengths data comparisons 

Question is, how do I do the comparisons here? 

In [10]:
COUPLING_STRENGTHS_FILE = r"coupling_strengths_data/TARGET_b0.5_CouplingScores.csv"

# Columns that are not needed for the comparison.
unused_columns = ["A_i", "A_j", "fn", "cn", "segment_i", "segment_j", "score", "mad_score"]

coupling_df = pd.read_csv(COUPLING_STRENGTHS_FILE).drop(columns=unused_columns)
# Positional rename: exactly three columns must remain after the drop, in the
# order residue 1, residue 2, probability.
coupling_df.columns = ["res1", "res2", "probability"]
coupling_df

Unnamed: 0,res1,res2,probability
0,68,116,1.000000
1,34,37,1.000000
2,187,205,1.000000
3,58,63,1.000000
4,24,166,1.000000
...,...,...,...
30623,130,243,0.008726
30624,148,162,0.008361
30625,155,243,0.008062
30626,156,243,0.006281


## Part 3: Compare fitness data to contacts with high conservation that are not in TEM1

- First, need to look at structure, determine what is going on with missing ones.
- Then need to consider what would happen if a mutation occurred between them.
- Can we reason something from this?

In [11]:
CONSERVED_NOT_IN_TEM1_FILE = r"../contact_analysis/multi_structure_test/int_type_not_in_target_pdb.txt"

# Brackets and commas are removed from every line before it is split on
# whitespace into residue 1, residue 2 and a score.
punctuation_remover = str.maketrans("", "", "(),")

tem1_conserved_not_present = {}
with open(CONSERVED_NOT_IN_TEM1_FILE, "r", encoding="utf-8") as contacts_file:
    for record in contacts_file:
        fields = record.translate(punctuation_remover).split()
        res_pair = (int(fields[0]), int(fields[1]))
        tem1_conserved_not_present[res_pair] = float(fields[2])
print(tem1_conserved_not_present)

{(13, 20): 61.0, (19, 239): 59.0, (22, 35): 63.0, (23, 235): 56.0, (24, 33): 77.0, (25, 233): 64.0, (27, 231): 71.0, (43, 219): 56.0, (48, 107): 65.0, (50, 123): 60.0, (54, 117): 58.0, (58, 62): 60.0, (79, 107): 61.0, (100, 104): 71.0, (143, 146): 68.0, (13, 18): 57.0, (13, 36): 56.0, (26, 30): 62.0, (53, 168): 58.0, (78, 142): 62.0, (124, 137): 65.0, (166, 171): 63.0, (186, 189): 61.0, (189, 191): 65.0, (189, 208): 60.0, (200, 204): 60.0, (204, 224): 61.0}


## Part 4: ASR lactamases contact conservation 

Not sure what the question is here. Do the major contacts get preserved in the ASR lactamases?

In [12]:
from tools_proj.utils import open_many_single_frame_contacts_files
# Names of the ASR lactamase structures to load (presumably PDB IDs - confirm).
ASR_PROTEINS = ["3zdj", "4b88", "4c6y", "4c75"]
# Folder handed to the loader below as the location of the contacts files.
ASR_CONTACTS_FOLDER = r"asr_lactamases/crystal_contacts"
# Alignment file; defined here but not used in this cell.
ASR_SEQUENCES = "asr_lactamases/sequences/tem1_asr_alingments.ali"

# Load the contacts of every listed protein with the project helper. The printed
# result is a dict keyed by protein name, each value a list of contact strings
# such as 'MET1 GLU261 vdw wc-sc'.
all_asr_protein_contacts = open_many_single_frame_contacts_files(
    folder_path=ASR_CONTACTS_FOLDER, protein_names=ASR_PROTEINS
)
print(all_asr_protein_contacts)

{'3zdj': ['MET1 GLU261 vdw wc-sc', 'HIP2 GLN8 vdw sc-sc', 'HIP2 GLU254 hbond sc-sc', 'HIP2 ALA257 vdw sc-wc', 'HIP2 ALA258 vdw sc-wc', 'HIP2 GLU261 hbond mc-sc', 'HIP2 HIP262 hbond mc-sc', 'PRO3 GLU7 vdw sc-sc', 'PRO3 GLN8 vdw wc-wc', 'PRO3 ALA258 vdw mc-wc', 'PRO3 HIP262 vdw mc-sc', 'GLN4 GLN8 hbond mc-mc', 'GLN4 HIP262 vdw wc-sc', 'THR5 ILE9 hbond mc-mc', 'THR5 MET22 vdw sc-sc', 'THR5 LEU33 vdw sc-sc', 'THR5 ILE255 vdw sc-mc', 'THR5 ALA258 vdw wc-wc', 'THR5 LEU259 vdw sc-wc', 'THR5 HIP262 vdw wc-sc', 'LEU6 LYS10 hbond mc-mc', 'LEU6 LEU33 hydrophobic sc-sc', 'LEU6 SER34 vdw sc-mc', 'LEU6 TYR35 vdw sc-sc', 'GLU7 LYS10 saltbridge sc-sc', 'GLU7 GLU11 hbond mc-mc', 'GLN8 SER12 hbond mc-mc', 'GLN8 SER12 hbond sc-sc', 'GLN8 GLU254 vdw sc-wc', 'GLN8 ILE255 vdw sc-wc', 'GLN8 ALA258 vdw sc-sc', 'ILE9 GLU13 hbond mc-mc', 'ILE9 VAL20 hydrophobic sc-sc', 'ILE9 MET22 hydrophobic sc-sc', 'ILE9 TYR35 vdw sc-sc', 'ILE9 ILE255 hydrophobic sc-sc', 'LYS10 SER14 hbond mc-mc', 'LYS10 SER14 hbond mc-sc', '

In [13]:
from tools_proj.sequences import seq_align_file_to_sequences

# Read the alignment file with the project helper. The printed result is a dict
# keyed by sequence name, each value a list of one-letter residue codes in which
# '-' appears as an entry (presumably an alignment gap - confirm).
asr_tem1_alinged = seq_align_file_to_sequences(alignment_file=ASR_SEQUENCES, output_msa_style=True)
print(asr_tem1_alinged)

{'1M40_TEM-1': ['H', '-', 'P', 'E', 'T', 'L', 'V', 'K', 'V', 'K', 'D', 'A', 'E', 'D', 'Q', 'L', 'G', 'A', 'R', 'V', 'G', 'Y', 'I', 'E', 'L', 'D', 'L', 'N', 'S', 'G', 'K', 'I', 'L', 'E', 'S', 'F', 'R', 'P', 'E', 'E', 'R', 'F', 'P', 'M', 'M', 'S', 'T', 'F', 'K', 'V', 'L', 'L', 'C', 'G', 'A', 'V', 'L', 'S', 'R', 'V', 'D', 'A', 'G', 'Q', 'E', 'Q', 'L', 'G', 'R', 'R', 'I', 'H', 'Y', 'S', 'Q', 'N', 'D', 'L', 'V', 'E', 'Y', 'S', 'P', 'V', 'T', 'E', 'K', 'H', 'L', 'T', 'D', 'G', 'M', 'T', 'V', 'R', 'E', 'L', 'C', 'S', 'A', 'A', 'I', 'T', 'M', 'S', 'D', 'N', 'T', 'A', 'A', 'N', 'L', 'L', 'L', 'T', 'T', 'I', 'G', 'G', 'P', 'K', 'E', 'L', 'T', 'A', 'F', 'L', 'H', 'N', 'M', 'G', 'D', 'H', 'V', 'T', 'R', 'L', 'D', 'R', 'W', 'E', 'P', 'E', 'L', 'N', 'E', 'A', 'I', 'P', 'N', 'D', 'E', 'R', 'D', 'T', 'T', 'M', 'P', 'A', 'A', 'M', 'A', 'T', 'T', 'L', 'R', 'K', 'L', 'L', 'T', 'G', 'E', 'L', 'L', 'T', 'L', 'A', 'S', 'R', 'Q', 'Q', 'L', 'I', 'D', 'W', 'M', 'E', 'A', 'D', 'K', 'V', 'A', 'G', 'P', 'L', 'L',

In [14]:
# Show which sequence names the alignment dict contains.
asr_tem1_alinged.keys()

dict_keys(['1M40_TEM-1', '3zdj', '4c75', '4b88', '4c6y'])

In [15]:
TEM1_RES_RES_SCORES_FILE = r"../contact_analysis/multi_structure_test/conservation_2_dict_tem1.txt"

# Re-read the pair scores: three whitespace-separated fields per line
# (residue 1, residue 2, score).
tem1_res_res_data = {}
with open(TEM1_RES_RES_SCORES_FILE, "r", encoding="utf-8") as scores_file:
    for record in scores_file:
        fields = record.split()
        res_pair = (int(fields[0]), int(fields[1]))
        tem1_res_res_data[res_pair] = float(fields[2])

# Highest score first. NOTE(review): from here on `tem1_res_res_data` is a list
# of ((res1, res2), score) tuples and no longer a dict, so any cell that calls
# `.items()` on it must be run before this one.
tem1_res_res_data = sorted(
    tem1_res_res_data.items(), key=lambda pair_and_score: pair_and_score[1], reverse=True
)
print(tem1_res_res_data)

[((9, 13), 1.0), ((10, 14), 1.0), ((12, 16), 1.0), ((45, 48), 1.0), ((47, 51), 1.0), ((48, 52), 1.0), ((49, 53), 1.0), ((50, 54), 1.0), ((51, 55), 1.0), ((52, 56), 1.0), ((92, 97), 1.0), ((93, 97), 1.0), ((94, 98), 1.0), ((95, 99), 1.0), ((96, 100), 1.0), ((97, 101), 1.0), ((98, 102), 1.0), ((100, 106), 1.0), ((106, 110), 1.0), ((107, 111), 1.0), ((108, 112), 1.0), ((109, 113), 1.0), ((110, 114), 1.0), ((111, 115), 1.0), ((112, 116), 1.0), ((113, 117), 1.0), ((123, 127), 1.0), ((124, 128), 1.0), ((157, 161), 1.0), ((158, 162), 1.0), ((159, 163), 1.0), ((160, 164), 1.0), ((161, 165), 1.0), ((162, 166), 1.0), ((163, 167), 1.0), ((175, 179), 1.0), ((176, 180), 1.0), ((177, 181), 1.0), ((178, 182), 1.0), ((179, 183), 1.0), ((180, 184), 1.0), ((252, 256), 1.0), ((253, 257), 1.0), ((254, 258), 1.0), ((255, 259), 1.0), ((256, 260), 1.0), ((257, 261), 1.0), ((258, 262), 1.0), ((23, 33), 0.9855072463768116), ((36, 39), 0.9855072463768116), ((53, 57), 0.9855072463768116), ((54, 58), 0.9855072463

## Part 5: Sequence conservation score 

In [21]:
MSA_SCORES_FILE = r"../msa_scores/align_and_score.out"
# A row of dashes marks the start and the end of the section of interest.
SECTION_SEPARATOR = "---------------------------------------------------------------------------------------------------------"

# Take the tem1 and score results, and can use that to align...
# Collect the lines that sit between the first and the second separator row.
# The separator check runs before the append so that neither separator row
# ends up in `summary_section` (previously the closing row was appended).
separators_seen = 0
summary_section = []
with open(MSA_SCORES_FILE, "r", encoding="utf-8") as file_in:
    for line in file_in:
        if SECTION_SEPARATOR in line:
            separators_seen += 1
            continue
        # only lines after the first and before the second separator are kept
        if separators_seen == 1:
            summary_section.append(line)

print(summary_section)

['   1    0.00    0.00    0.00    0.00   -       -       N       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       A       -       -       -       -       -       -       -       -       -       -       -       N       -       -       N       R       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       \n', '   2    0.00    0.00    0.00    0.00   -       -       K       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       -       S       -       -       -       -       -       -       -       -       -       -       -       A       -       -       R 

In [36]:
# Tab-separated table of alignment scores, indexed by the "msa_numb" column.
df = pd.read_csv(
    "../comparitive_data/msa_scores/alignment_scores.csv", sep="\t"
).set_index("msa_numb")

# "pos_scr" holds the score of each row; "1M40_TEM-1" holds the TEM-1 entry of
# that row, which the next cell compares against "-".
msa_scores = list(df["pos_scr"])
tem1_seq_msa_scores = list(df["1M40_TEM-1"])
print(msa_scores)

[0.0, 0.0, 0.28, 0.0, 0.27, 0.0, 0.35, 0.0, 0.5, 0.0, 0.0, 0.55, 0.46, 0.0, 0.0, 0.59, 0.0, 0.61, 0.59, 0.55, 0.5, 0.82, 0.47, 0.54, 0.5, 0.7, 0.7, 0.75, 0.7, 0.96, 0.42, 0.46, 0.46, 0.71, 0.76, 0.45, 0.5, 0.64, 0.76, 0.61, 0.44, 0.67, 0.13, 0.56, 0.56, 0.81, 0.51, 0.58, 0.78, 0.82, 0.9, 0.73, 0.64, 0.28, 0.89, 0.83, 0.7, 0.86, 0.62, 0.64, 0.7, 0.38, 0.65, 0.75, 0.83, 0.87, 0.54, 0.61, 0.51, 0.0, 0.62, 0.58, 0.56, 0.49, 0.57, 0.43, 0.81, 0.5, 0.61, 0.61, 0.76, 0.45, 0.0, 0.72, 0.57, 0.53, 0.56, 0.73, 0.83, 0.0, 0.0, 0.76, 0.0, 0.53, 0.0, 0.7, 0.79, 0.0, 0.0, 0.92, 0.74, 0.74, 0.75, 0.76, 0.67, 0.68, 0.46, 0.0, 0.0, 0.56, 0.87, 0.0, 0.0, 0.75, 0.83, 0.76, 0.47, 0.79, 0.82, 0.51, 0.56, 0.83, 0.75, 0.72, 0.53, 0.39, 0.89, 0.89, 0.8, 0.72, 0.85, 0.65, 0.8, 0.75, 0.86, 0.78, 0.53, 0.0, 0.49, 0.73, 0.94, 0.94, 0.87, 0.0, 0.0, 0.0, 0.0, 0.57, 0.52, 0.71, 0.76, 0.66, 0.74, 0.67, 0.44, 0.55, 0.58, 0.95, 0.84, 0.42, 0.64, 0.58, 0.76, 0.78, 0.79, 0.82, 0.44, 0.85, 0.71, 0.73, 0.84, 0.77, 0.55, 0.

In [37]:
# Keep the MSA score of every position where the TEM-1 entry is not "-", so
# that one score is left per TEM-1 residue.
tem1_per_res_seq_score = [
    msa_score
    for msa_score, tem1_res in zip(msa_scores, tem1_seq_msa_scores)
    if tem1_res != "-"
]
print(len(tem1_per_res_seq_score))

263


In [39]:
# Attach the per-residue MSA scores as a new column. A plain list is assigned by
# position, so its length has to equal the number of rows in per_res_df and its
# order has to follow the residue order of the index.
# Note that this modifies per_res_df in place, so earlier displays of the frame
# no longer show all of its columns.
per_res_df["seq_align_score"] = tem1_per_res_seq_score
per_res_df

Unnamed: 0,fitness,conserv score1,conserv score2,seq_align_score
1,18.036298,0.956522,0.260306,0.28
2,19.088504,0.826087,0.224810,0.27
3,19.187298,1.000000,0.272138,0.35
4,13.055777,0.831758,0.452706,0.50
5,15.530248,0.565217,0.153817,0.55
...,...,...,...,...
259,10.214448,0.739130,0.402291,0.62
260,13.961234,0.589792,0.481515,0.56
261,18.816735,1.000000,0.272138,0.58
262,17.459732,1.000000,0.272138,0.41


In [41]:
# Scatter plot of the per-residue MSA score against the second conservation metric.
msa_scatter_layout = {
    "xaxis_title": "MSA score (seq conservation)",
    "yaxis_title": "Residue Conservation Score",
    "legend_title": "Legend Title",
    "font": {"size": 18},
}

fig = px.scatter(per_res_df, x="seq_align_score", y="conserv score2")
fig.update_layout(**msa_scatter_layout)
fig.show()
# TODO - make figure for this.

In [45]:
# Correlate the second conservation metric with the per-residue MSA score.
conserv_values = per_res_df["conserv score2"]
seq_align_values = per_res_df["seq_align_score"]

# Both scipy functions return (statistic, p-value); only the statistic is reported.
pearson, p = pearsonr(conserv_values, seq_align_values)
spearman, p = spearmanr(conserv_values, seq_align_values)

print(f"Pearson r: {pearson:.3f}")
print(f"Spearmans correlation coefficient: {spearman:.3f}")

Pearson r: 0.206
Spearmans correlation coefficient: 0.190


In [47]:
from typing import Optional
import numpy as np
from MDAnalysis.analysis import distances
import MDAnalysis as mda
import csv

def per_residue_distance_to_site(pdb_file: str,
                                 site_defintion: str,
                                 first_residue: int,
                                 last_residue: int,
                                 side_chain_only: bool = False,
                                 out_file: Optional[str] = None,
                                 ) -> dict:
    """
    Calculate the closest heavy atom distance of each residue to an MDAnalysis
    defined selection of a site of interest. You can write the results to file
    if desired.

    Parameters
    ----------
    pdb_file : str
        Path to pdb file to use for the distance calculation.

    site_defintion : str
        MDAnalysis compatible selection string for the site of interest
        (i.e. binding site, active site etc..), e.g. "resid 45 and name OG".
        See here for the selection syntax:
        https://docs.mdanalysis.org/stable/documentation_pages/selections.html

    first_residue : int
        First residue number (resid) to measure a distance for (inclusive).

    last_residue : int
        Last residue number (resid) to measure a distance for (inclusive).

    side_chain_only: bool = False,
        Choose whether you want to measure the minimum distance using only the
        side chain of each residue. If true, only the non-backbone heavy atoms
        are used. For residues without any such atom (i.e. glycines), the CA
        of the residue is used instead.

    out_file : Optional[str]
        Path to a csv file to write the results to. Nothing is written if None.

    Returns
    ----------
    dict
        Residue numbers are the keys and minimum distances (rounded to 2
        decimal places) are the values.

    Raises
    ----------
    ValueError
        If the site selection matches no atoms, or if no atoms can be found
        for one of the residues in the requested range.
    """
    universe = mda.Universe(pdb_file)
    site_atoms = universe.select_atoms(site_defintion)

    # Fail early with a clear message; otherwise every residue would end in an
    # opaque "zero-size array" error from the minimum below.
    if site_atoms.n_atoms == 0:
        raise ValueError(
            f"The site definition '{site_defintion}' did not match any atoms in {pdb_file}."
        )

    # Hydrogens are always excluded; the backbone only in side chain mode.
    if side_chain_only:
        selection_prefix = "not backbone and not name H* and resid "
    else:
        selection_prefix = "not name H* and resid "

    min_dists = {}
    for residue in range(first_residue, last_residue + 1):
        residue_atoms = universe.select_atoms(selection_prefix + str(residue))

        # This happens for glycines which have no side chain...
        if side_chain_only and residue_atoms.n_atoms == 0:
            residue_atoms = universe.select_atoms("name CA and resid " + str(residue))

        if residue_atoms.n_atoms == 0:
            raise ValueError(
                f"No atoms found for residue {residue} in {pdb_file}."
            )

        res_dist_arr = distances.distance_array(
            residue_atoms.positions, site_atoms.positions, box=universe.dimensions)
        min_dists[residue] = np.round(res_dist_arr.min(), 2)

    if out_file is None:
        return min_dists

    with open(out_file, "w", newline="", encoding="utf-8") as file_out:
        csv_out = csv.writer(file_out)
        csv_out.writerow(["Residue Number", "Minimum Distance"])
        csv_out.writerows(min_dists.items())
        print(f"{out_file} written to disk.")
    return min_dists


In [48]:
# Minimum distance of residues 1-263 (side chain and main chain atoms) to the
# atom selected by "resid 45 and name OG". The result is returned as a dict and
# also written to a csv file in the current working directory.
# NOTE(review): presumably this OG atom belongs to the catalytic serine and is
# used as the active site reference - confirm against the numbering of this PDB.
min_dists = per_residue_distance_to_site(
    pdb_file="../protein_prep/5_tleap/1BTL_tem1_apo_postleap.pdb",
    site_defintion="resid 45 and name OG",
    first_residue=1,
    last_residue=263,
    side_chain_only=False,
    out_file=r"per_res_dist_to_active_site.csv")
min_dists


Element information is missing, elements attribute will not be populated. If needed these can be guessed using MDAnalysis.topology.guessers.



per_res_dist_to_active_site.csv written to disk.


{1: 28.97,
 2: 30.22,
 3: 28.76,
 4: 24.56,
 5: 25.98,
 6: 27.74,
 7: 24.02,
 8: 22.27,
 9: 25.37,
 10: 25.18,
 11: 21.56,
 12: 22.43,
 13: 25.16,
 14: 23.62,
 15: 18.37,
 16: 22.42,
 17: 19.54,
 18: 17.95,
 19: 18.64,
 20: 17.05,
 21: 19.12,
 22: 19.14,
 23: 21.82,
 24: 22.26,
 25: 25.14,
 26: 22.17,
 27: 26.91,
 28: 28.54,
 29: 26.4,
 30: 26.59,
 31: 25.59,
 32: 26.77,
 33: 24.09,
 34: 24.26,
 35: 21.54,
 36: 20.8,
 37: 20.63,
 38: 21.77,
 39: 18.43,
 40: 16.52,
 41: 12.79,
 42: 9.71,
 43: 6.5,
 44: 4.11,
 45: 0.0,
 46: 4.84,
 47: 5.63,
 48: 2.9,
 49: 8.58,
 50: 10.77,
 51: 9.78,
 52: 11.81,
 53: 14.3,
 54: 16.0,
 55: 15.35,
 56: 17.66,
 57: 20.27,
 58: 20.52,
 59: 20.48,
 60: 24.16,
 61: 26.25,
 62: 27.58,
 63: 24.91,
 64: 20.96,
 65: 22.23,
 66: 20.08,
 67: 23.49,
 68: 21.21,
 69: 22.4,
 70: 18.45,
 71: 20.87,
 72: 17.2,
 73: 19.12,
 74: 18.87,
 75: 18.29,
 76: 14.55,
 77: 14.29,
 78: 10.33,
 79: 8.91,
 80: 6.86,
 81: 10.33,
 82: 10.17,
 83: 12.8,
 84: 12.66,
 85: 15.27,
 86: 17.61