In [1]:
#option to be by residue
import numpy as np
import scipy
from scipy import stats

In [2]:
#Input:
#(a) clash_dict: path to the .npy file holding the clash dictionary saved by the clashes run
#(b) clash_key: which clash counts to analyze - atom level ("clashes") or residue level ("clashes_res_level")
def analyze_res_clashes(clash_dict, clash_key):
    """Print per-amino-acid clash statistics aggregated over every entry in a clash file.

    The ``.npy`` file at ``clash_dict`` is loaded as a pickled dictionary. For each
    of the 20 standard amino acids, the counts stored under ``clash_key`` and under
    ``"binding_site_comp"`` are summed over all top-level entries. One header line
    is printed, followed by one row per amino acid (alphabetical order) with:

    * fraction of all clashes attributed to the residue,
    * fraction of all binding-site residues that are this residue,
    * clash / binding-site ratio ("c/b ratio"),
    * c/b ratio divided by the largest c/b ratio over all residues.

    Parameters
    ----------
    clash_dict : str or path-like
        Path handed to ``np.load``. The file must hold a dict shaped like
        ``{entry_id: {clash_key: {residue: count},
        "binding_site_comp": {residue: count}}}``.
    clash_key : str
        Key selecting which per-residue clash counts to aggregate.

    Returns
    -------
    None
        All results are written to stdout.

    Raises
    ------
    KeyError
        If an entry lacks ``clash_key``, ``"binding_site_comp"``, or one of the
        20 residue names.

    Notes
    -----
    Any value whose denominator is zero (a residue that never appears in a
    binding site, a file with no clashes at all, or a maximum ratio of zero) is
    printed as ``nan`` instead of raising ``ZeroDivisionError``.
    """
    # SECURITY NOTE: allow_pickle=True unpickles arbitrary Python objects, which can
    # execute code. Only load clash files that come from a trusted source.
    clash_d = np.load(clash_dict, allow_pickle=True).item()

    aa_list = sorted(["ASP", "ALA", "ARG", "CYS", "GLU",
                      "PHE", "GLY", "HIS", "ILE", "LYS",
                      "LEU", "MET", "ASN", "PRO", "GLN",
                      "SER", "THR", "VAL", "TRP", "TYR"])

    def ratio_or_nan(numerator, denominator):
        # An undefined ratio is reported as nan rather than aborting the whole report.
        return numerator / denominator if denominator != 0 else float("nan")

    # Sum clash counts and binding-site composition over every entry in the file.
    clash_totals = {aa: 0 for aa in aa_list}
    binding_totals = {aa: 0 for aa in aa_list}
    for entry in clash_d.values():
        for aa in aa_list:
            clash_totals[aa] += entry[clash_key][aa]
            binding_totals[aa] += entry["binding_site_comp"][aa]

    # Grand totals used as denominators for the two fraction columns.
    all_clashes = sum(clash_totals.values())
    all_binding_site_particip = sum(binding_totals.values())

    # Largest defined clash / binding-site ratio; it normalizes the last column.
    # Residues with no binding-site occurrences have no defined ratio and are skipped.
    defined_ratios = [clash_totals[aa] / binding_totals[aa]
                      for aa in aa_list if binding_totals[aa] != 0]
    max_clash_per_binding = max(defined_ratios, default=0)

    print("amino acid percent clashes percent binding site c/b ratio c/b ratio norm.")
    for aa_report in aa_list:
        # Share of all clashes / of all binding-site residues for this amino acid.
        pct_clashes = ratio_or_nan(clash_totals[aa_report], all_clashes)
        pct_binding = ratio_or_nan(binding_totals[aa_report], all_binding_site_particip)

        # Clashes relative to binding-site prevalence, raw and scaled by the maximum.
        clash_bind_ratio = ratio_or_nan(clash_totals[aa_report], binding_totals[aa_report])
        clash_bind_ratio_norm = ratio_or_nan(clash_bind_ratio, max_clash_per_binding)
        print(f"{aa_report}  {pct_clashes} {pct_binding} {clash_bind_ratio} {clash_bind_ratio_norm}")

In [3]:
# DiffDock clash file (m_1.0): report the "clashes" counts first, then the
# "clashes_res_level" counts, both read from the same .npy file.
diffdock_m_1_0_npy = "/Users/dsharon/Documents/MIT/6.8701/Project/Analysis/RMSD_and_Chem_Feats/Clashes_231210_1900/Clash_dictionary_DiffDock_Test_Set_only_231210_1920_m_1.0.npy"
for report_title, counts_key in (
    ("DiffDock, 1.0 clashes", "clashes"),
    ("DiffDock, 1.0 clashes residue level", "clashes_res_level"),
):
    print(report_title)
    analyze_res_clashes(diffdock_m_1_0_npy, counts_key)

DiffDock, 1.0 clashes
amino acid percent clashes percent binding site c/b ratio c/b ratio norm.
ALA  0.028700972839723556 0.04334438654666035 3.5628415300546448 0.31538772299121853
ARG  0.07135625302636792 0.05139744197063003 7.470046082949309 0.6612589431404543
ASN  0.04208302152572963 0.04097584083372809 5.526011560693641 0.48917028406918417
ASP  0.04930228463265396 0.08266224538133586 3.2091690544412605 0.2840801400332244
CYS  0.020601311792930406 0.022027475130270015 5.032258064516129 0.44546253294841215
GLN  0.03288286305410045 0.02297489341544292 7.701030927835052 0.6817060451682779
GLU  0.03446757934586433 0.05542396968261488 3.3461538461538463 0.29620622568093385
GLY  0.058854602280230664 0.09545239223117007 3.3176178660049627 0.29368018074557545
HIS  0.04080644451291984 0.03837044054950261 5.722222222222222 0.5065391266753134
ILE  0.04569265307919179 0.0556608242539081 4.417021276595745 0.3910009106714132
LEU  0.08222916758374785 0.0665561345333965 6.647686832740214 0.58846255

In [4]:
# HF clash file (m_1.0): report the "clashes" counts first, then the
# "clashes_res_level" counts, both read from the same .npy file.
hf_m_1_0_npy = "/Users/dsharon/Documents/MIT/6.8701/Project/Analysis/RMSD_and_Chem_Feats/Clashes_231210_1900/Clash_dictionary_HF_Test_Set_only_231210_1920_m_1.0.npy"
for report_title, counts_key in (
    ("HF, 1.0 clashes", "clashes"),
    ("HF, 1.0 clashes residue level", "clashes_res_level"),
):
    print(report_title)
    analyze_res_clashes(hf_m_1_0_npy, counts_key)

HF, 1.0 clashes
amino acid percent clashes percent binding site c/b ratio c/b ratio norm.
ALA  0.011423055265063148 0.033164128595600674 1.4489795918367347 0.16356064857201924
ARG  0.07843294988335613 0.05549915397631134 5.945121951219512 0.6710846775616816
ASN  0.03443005389751428 0.03824027072758037 3.7876106194690267 0.42754504821792194
ASP  0.05655216796717883 0.09813874788494077 2.424137931034483 0.27363640900244524
CYS  0.013353712492961146 0.018612521150592216 3.018181818181818 0.34069201420865675
GLN  0.028557638162657873 0.02707275803722504 4.4375 0.500904486251809
GLU  0.027753197651033706 0.06125211505922166 1.9060773480662982 0.21515779037506694
GLY  0.0437615638323546 0.10964467005076142 1.6790123456790123 0.18952671919386824
HIS  0.03475183010216395 0.04094754653130288 3.5702479338842976 0.4030091734341175
ILE  0.040704689888182766 0.05313028764805414 3.2229299363057327 0.36380395807792637
LEU  0.07473252352988496 0.05854483925549915 5.369942196531792 0.6061584534435307
L

In [5]:
# DiffDock clash file (m_0.87): report the "clashes" counts first, then the
# "clashes_res_level" counts, both read from the same .npy file.
diffdock_m_0_87_npy = "/Users/dsharon/Documents/MIT/6.8701/Project/Analysis/RMSD_and_Chem_Feats/Clashes_231210_1900/Clash_dictionary_DiffDock_Test_Set_only_231210_1920_m_0.87.npy"
for report_title, counts_key in (
    ("DiffDock, 0.87 clashes", "clashes"),
    ("DiffDock, 0.87 clashes residue level", "clashes_res_level"),
):
    print(report_title)
    analyze_res_clashes(diffdock_m_0_87_npy, counts_key)

DiffDock, 0.87 clashes
amino acid percent clashes percent binding site c/b ratio c/b ratio norm.
ALA  0.02328299172959367 0.04334438654666035 1.4153005464480874 0.24649253536225063
ARG  0.07182668104998202 0.05139744197063003 3.6820276497695854 0.6412718012038895
ASN  0.043329737504494786 0.04097584083372809 2.786127167630058 0.4852393727355699
ASP  0.0457569219705142 0.08266224538133586 1.4584527220630372 0.25400803389040455
CYS  0.020406328658755843 0.022027475130270015 2.4408602150537635 0.425106755157689
GLN  0.04270046745774901 0.02297489341544292 4.896907216494846 0.8528584817244612
GLU  0.03281193815174398 0.05542396968261488 1.5598290598290598 0.2716640085061138
GLY  0.051869830996044586 0.09545239223117007 1.4317617866004964 0.24935946905386636
HIS  0.04306005034160374 0.03837044054950261 2.95679012345679 0.5149624904010869
ILE  0.042071197411003236 0.0556608242539081 1.9914893617021276 0.34684312328209305
LEU  0.09007551240560949 0.0665561345333965 3.565836298932384 0.6210356

In [6]:
# HarmonicFlow clash file (m_0.87): report the "clashes" counts first, then the
# "clashes_res_level" counts, both read from the same .npy file.
hf_m_0_87_npy = "/Users/dsharon/Documents/MIT/6.8701/Project/Analysis/RMSD_and_Chem_Feats/Clashes_231210_1900/Clash_dictionary_HF_Test_Set_only_231210_1920_m_0.87.npy"
for report_title, counts_key in (
    ("HarmonicFlow, 0.87 clashes", "clashes"),
    ("HarmonicFlow, 0.87 clashes residue level", "clashes_res_level"),
):
    print(report_title)
    analyze_res_clashes(hf_m_0_87_npy, counts_key)

HarmonicFlow, 0.87 clashes
amino acid percent clashes percent binding site c/b ratio c/b ratio norm.
ALA  0.006905812392096681 0.033164128595600674 0.3673469387755102 0.08281231567771617
ARG  0.07289468636102052 0.05549915397631134 2.317073170731707 0.5223459749048357
ASN  0.03568003069249952 0.03824027072758037 1.6460176991150441 0.37106757378894056
ASP  0.051601764818722425 0.09813874788494077 0.9275862068965517 0.2091090293003787
CYS  0.015346249760214847 0.018612521150592216 1.4545454545454546 0.32790331056227007
GLN  0.031076155764435065 0.02707275803722504 2.025 0.45650289017341034
GLU  0.023786687128333015 0.06125211505922166 0.6850828729281768 0.15444064765432888
GLY  0.028582390178400155 0.10964467005076142 0.45987654320987653 0.10367159066581032
HIS  0.024553999616343757 0.04094754653130288 1.0578512396694215 0.23847513495437825
ILE  0.038173796278534435 0.05313028764805414 1.267515923566879 0.28574058392548135
LEU  0.07903318626510647 0.05854483925549915 2.3815028901734103 0

In [7]:
# Small hand-checkable sample in the same layout analyze_res_clashes reads:
# {entry: {"clashes": {residue: count}, "binding_site_comp": {residue: count}}}.
# Each table starts from a uniform default and overrides only the residues that
# differ; overriding an existing key keeps its position, so key order is fixed
# by sample_residues.
sample_residues = ("ALA", "CYS", "ASP", "GLU", "PHE", "GLY", "HIS", "ILE", "LYS", "LEU",
                   "MET", "ASN", "PRO", "GLN", "ARG", "SER", "THR", "VAL", "TRP", "TYR")

test_d = {
    "pdb1": {
        # One clash each on ALA, CYS, ASP and GLU; none elsewhere.
        "clashes": {**dict.fromkeys(sample_residues, 0),
                    "ALA": 1, "CYS": 1, "ASP": 1, "GLU": 1},
        # Every residue appears once in the binding site.
        "binding_site_comp": dict.fromkeys(sample_residues, 1),
    },
    "pdb2": {
        "clashes": {**dict.fromkeys(sample_residues, 0),
                    "ALA": 1, "CYS": 3, "ASP": 3, "GLU": 10, "TRP": 5},
        # THR, VAL, TRP and TYR appear twice; every other residue once.
        "binding_site_comp": {**dict.fromkeys(sample_residues, 1),
                              "THR": 2, "VAL": 2, "TRP": 2, "TYR": 2},
    },
}

# Persist the sample as a pickled object array so it can be loaded like a real clash file.
np.save("/Users/dsharon/Documents/MIT/6.8701/Project/Analysis/RMSD_and_Chem_Feats/Clash_check_sample.npy", test_d)

In [9]:
# Run the report on the hand-built sample file so the printed values can be verified by hand.
sample_npy = "/Users/dsharon/Documents/MIT/6.8701/Project/Analysis/RMSD_and_Chem_Feats/Clash_check_sample.npy"
analyze_res_clashes(sample_npy, "clashes")

amino acid percent clashes percent binding site c/b ratio c/b ratio norm.
ALA  0.07692307692307693 0.045454545454545456 1.0 0.18181818181818182
ARG  0.0 0.045454545454545456 0.0 0.0
ASN  0.0 0.045454545454545456 0.0 0.0
ASP  0.15384615384615385 0.045454545454545456 2.0 0.36363636363636365
CYS  0.15384615384615385 0.045454545454545456 2.0 0.36363636363636365
GLN  0.0 0.045454545454545456 0.0 0.0
GLU  0.4230769230769231 0.045454545454545456 5.5 1.0
GLY  0.0 0.045454545454545456 0.0 0.0
HIS  0.0 0.045454545454545456 0.0 0.0
ILE  0.0 0.045454545454545456 0.0 0.0
LEU  0.0 0.045454545454545456 0.0 0.0
LYS  0.0 0.045454545454545456 0.0 0.0
MET  0.0 0.045454545454545456 0.0 0.0
PHE  0.0 0.045454545454545456 0.0 0.0
PRO  0.0 0.045454545454545456 0.0 0.0
SER  0.0 0.045454545454545456 0.0 0.0
THR  0.0 0.06818181818181818 0.0 0.0
TRP  0.19230769230769232 0.06818181818181818 1.6666666666666667 0.30303030303030304
TYR  0.0 0.06818181818181818 0.0 0.0
VAL  0.0 0.06818181818181818 0.0 0.0


In [10]:
# The sample has 44 binding-site entries in total, so a residue counted twice
# should show 2/44 and a residue counted three times should show 3/44.
for residue_entries in (2, 3):
    print(residue_entries / 44)

0.045454545454545456
0.06818181818181818


In [11]:
# Binding-site fractions match. Next, the clash fractions: the sample has 26
# clashes in total - ALA 2, CYS and ASP 4 each, GLU 11, TRP 5.
expected_clash_counts = (2, 4, 11, 5)
print(" ".join(str(count / 26) for count in expected_clash_counts))

0.07692307692307693 0.15384615384615385 0.4230769230769231 0.19230769230769232


In [12]:
# Clash fractions match. GLU should have the largest clash / binding-site ratio:
# 11 clashes over 2 binding-site entries.
glu_ratio = 11 / 2
print(str(glu_ratio))

5.5


In [13]:
# Normalized ratios, i.e. each c/b ratio divided by GLU's maximum of 5.5:
#   ALA      - 2 clashes / 2 entries = 1
#   TRP      - 5 clashes / 3 entries ~ 1.666
#   ASP, CYS - 4 clashes / 2 entries = 2
for hand_ratio in (1, 1.666, 2):
    print(hand_ratio / 5.5)

0.18181818181818182
0.3029090909090909
0.36363636363636365


In [None]:
#great, code works as expected on this example