In [1]:
#12/8/23 find clashes
#12/10/23 by res option
import MDAnalysis
from MDAnalysis.analysis import distances
from MDAnalysis.analysis.hydrogenbonds.hbond_analysis import HydrogenBondAnalysis
import numpy as np
import matplotlib.pyplot as plt
import os
import scipy
from scipy import stats

  from .autonotebook import tqdm as notebook_tqdm
  import xdrlib


In [2]:
#Check for clashes and report per residue type

#The 20 standard residue names tallied in every per-complex count dictionary
_STANDARD_RESNAMES = ("ALA", "CYS", "ASP", "GLU", "PHE", "GLY", "HIS", "ILE", "LYS", "LEU",
                      "MET", "ASN", "PRO", "GLN", "ARG", "SER", "THR", "VAL", "TRP", "TYR")

#A ligand whose resname is one of these (standard residues plus MSE) is excluded
_PROTEIN_LIKE_RESNAMES = frozenset(_STANDARD_RESNAMES + ("MSE",))


def _empty_residue_counts():
    """Return a fresh {resname: 0} dictionary over the 20 standard residue names."""
    return {resname: 0 for resname in _STANDARD_RESNAMES}


def clash_checking(pdb_list,
                   l_path,
                   protein_path,
                   hf_run,
                   cutoff_d,
                   odir,
                   jstr,
                   multip):
    """Count protein-ligand clashes per residue type for each docked complex.

    For every complex the protein and the docked ligand are loaded, merged into one
    universe, and every ligand atom is tested against every protein atom: a clash is a
    protein atom closer than multip * (ligand vdW radius + protein vdW radius).
    Only elements listed in the radius table below are tested; other elements are skipped.

    Parameters
    ----------
    pdb_list : iterable of str
        Complex names; used to build the input file paths.
    l_path : str
        Directory holding the ligand files.
    protein_path : str
        Directory holding {name}/{name}_protein_processed.pdb.
    hf_run : bool
        True reads {name}_x20.pdb (HarmonicFlow), False reads {name}_ligand_dd.pdb (DiffDock).
    cutoff_d : float
        Distance passed to the "around" selection that defines the binding site
        (heavy atoms only). It is NOT used for the clash test.
    odir : str
        Output directory. It is created if it does not exist.
    jstr : str
        Job string embedded in the output file names.
    multip : float
        Multiplier applied to the vdW radius sum (values below 1 make the test less strict).

    Returns
    -------
    None
        Results are written to disk:
        {odir}/Clash_dictionary_{jstr}_m_{multip}.npy holds a dict (numpy stores it as a
        pickled object array) mapping each kept complex to "clashes" (one count per
        clashing atom pair), "binding_site_comp" (residues near the ligand) and
        "clashes_res_level" (each clashing residue counted once).
        {odir}/REMOVALS_{jstr}_{multip}.npy holds the names of the excluded complexes.

    A complex is excluded when the ligand has no atoms, more than one resname, a
    protein-like resname, or no protein residue within cutoff_d.
    """

    #Create the output directory up front so a missing folder cannot waste the whole run
    if odir:
        os.makedirs(odir, exist_ok=True)

    #Removing from analysis
    remove_list = []

    #Clashing residue dictionary
    clash_res_dist = {}

    #See Obtain_Radii_231208.ipynb, also:
    #Ref https://pubs.acs.org/doi/10.1021/jp8111556
    #Ref http://ursula.chem.yale.edu/~chem220/chem220js/STUDYAIDS/vanderwaalsradius.html
    vdw_radius_dict = { "H" : 1.2,
                        "C" : 1.7,
                        "N" :  1.6,
                        "O" :  1.55,
                        "F" :  1.5,
                        "S" :  1.8,
                        "Cl" :  1.8}

    #Iterate over each pdb
    for ipdb, pdb_analyze in enumerate(pdb_list):

        print(f"On {pdb_analyze} index {ipdb}")

        #############
        #Locate docking output files, make universes
        #############
        #The protein file is the same for both methods; only the ligand file name differs
        protein_file = f"{protein_path}/{pdb_analyze}/{pdb_analyze}_protein_processed.pdb"
        ligand_suffix = "_x20.pdb" if hf_run else "_ligand_dd.pdb"
        ligand_file = f"{l_path}/{pdb_analyze}{ligand_suffix}"

        u_prot = MDAnalysis.Universe(protein_file)
        u_ligand = MDAnalysis.Universe(ligand_file)

        #Ref https://userguide.mdanalysis.org/stable/universe.html
        #Ref https://docs.mdanalysis.org/2.6.1/documentation_pages/core/universe.html#MDAnalysis.core.universe.Merge
        #Merge
        u_pl = MDAnalysis.core.universe.Merge(u_prot.atoms, u_ligand.atoms)

        #############
        #Protein and ligand info setup
        #############
        #For now- all segids allowed (sorted so the selection string is reproducible)
        psegids = sorted({a.segid for a in u_prot.atoms})
        psegid_str = " ".join(psegids)
        lresnames = sorted({a.resname for a in u_ligand.atoms})

        #A ligand file without atoms has no resname to select on
        if len(lresnames) == 0:
            print("no ligand resnames REMOVAL")
            remove_list.append(pdb_analyze)
            continue

        #for now let only 1 ligand resname be used, simplifies
        if len(lresnames) > 1:
            print("over 1 ligand resnames REMOVAL")
            print(lresnames)
            #Report the resnames of THIS complex; no single resname exists on this path
            lresname_str = " ".join(lresnames)
            print(f"resname string {lresname_str}")
            remove_list.append(pdb_analyze)
            continue

        lresname = lresnames[0]

        #If ligand resname is a protein resname- exclude
        if lresname in _PROTEIN_LIKE_RESNAMES:
            remove_list.append(pdb_analyze)
            print(f"Ligand resname {lresname} is protein resname: REMOVAL")
            continue

        #############
        #Clash checking
        #############
        #Each clash (one count per clashing protein atom / ligand atom pair)
        res_clash_dict = _empty_residue_counts()
        #Residue level (each clashing residue counted once)
        res_clash_dict_by_res = _empty_residue_counts()
        #Binding site composition
        bsite_comp_dict = _empty_residue_counts()

        #12/10/23 clashing residues, so each residue is recorded once not for each clash
        clash_residues = set()

        #For each ligand atom - clash check
        for l_atom in u_pl.select_atoms(f"resname {lresname}"):

            #Elements without a tabulated radius cannot be tested and are skipped
            l_element = l_atom.element
            if l_element not in vdw_radius_dict:
                continue

            #Ligand radius
            l_radius = vdw_radius_dict[l_element]

            #Now find whether any protein atoms are below the vdw distance sum with this ligand atom
            for e_check, e_radius in vdw_radius_dict.items():
                sum_vdw = multip * (l_radius + e_radius)

                #Find atoms in protein within that cutoff
                p_clash_l = u_pl.select_atoms(f"protein and element {e_check} and around {sum_vdw} (resname {lresname} and index {l_atom.index})")
                for p_clash_l_atom in p_clash_l:
                    #Non-standard residues (e.g. MSE) are not tallied
                    if p_clash_l_atom.resname in res_clash_dict:
                        res_clash_dict[p_clash_l_atom.resname] += 1
                        clash_residues.add(p_clash_l_atom.residue)

        #Also find binding site composition- this is approx.
        #Which protein heavy atoms are near the ligand heavy atoms?
        p_near_l = u_pl.select_atoms(f"protein and not element H and segid {psegid_str} and around {cutoff_d} (resname {lresname} and not element H)")

        #############
        #Find residues near the ligand
        #############
        unique_resids = {a.residue for a in p_near_l}
        if len(unique_resids) == 0:
            #Nothing near the ligand: the complex is excluded and its clashes are not stored
            remove_list.append(pdb_analyze)
            continue

        for near_res in unique_resids:
            if near_res.resname in bsite_comp_dict:
                bsite_comp_dict[near_res.resname] += 1

        #12/10/23 also record on a residue level
        #Every stored residue has a standard resname, see the membership test above
        for c_res in clash_residues:
            res_clash_dict_by_res[c_res.resname] += 1

        #Add to overall dictionary
        clash_res_dist[pdb_analyze] = {"clashes" : res_clash_dict,
                                       "binding_site_comp" : bsite_comp_dict,
                                       "clashes_res_level" : res_clash_dict_by_res}

    np.save(f"{odir}/Clash_dictionary_{jstr}_m_{multip}.npy", clash_res_dist)
    np.save(f"{odir}/REMOVALS_{jstr}_{multip}.npy", remove_list)

In [3]:
#HarmonicFlow test set, vdW multiplier 1.00
#Input locations (complex names and predictions from Hannes)
npy_pdbs = "/Users/dsharon/Documents/MIT/6.8701/Project/Data/From_Hannes/HarmonicFlow_complex_names.npy"
l_pa = "/Users/dsharon/Documents/MIT/6.8701/Project/Data/From_Hannes/HarmonicFlow/inference_output_last_xt/"
protein_pa = "/Users/dsharon/Documents/MIT/6.8701/Project/Code/HarmonicFlow/FlowSite/data/PDBBind_processed"

#Run settings
hf_r = True
multval = 1.00
j_str = "HF_Test_Set_only_231210_1920"
output_dir = "/Users/dsharon/Documents/MIT/6.8701/Project/Analysis/RMSD_and_Chem_Feats/Clashes_231210_1900"

#Complex names to analyze
with open(npy_pdbs, "rb") as f:
    pdb_l = np.load(f)

#3.70 is the binding-site distance cutoff
clash_checking(pdb_list=pdb_l,
               l_path=l_pa,
               protein_path=protein_pa,
               hf_run=hf_r,
               cutoff_d=3.70,
               odir=output_dir,
               jstr=j_str,
               multip=multval)

On 6jt3 index 0
On 6jbb index 1
On 6ufo index 2
On 6os6 index 3
On 6jbe index 4
On 6qra index 5
On 6qsz index 6
On 6jam index 7
On 6a87 index 8
On 6oie index 9
On 6n4b index 10
On 6dyz index 11
On 6hhh index 12
On 6i5p index 13
On 6e6v index 14
On 6o9c index 15
On 6c85 index 16
On 6e13 index 17
On 6cjj index 18
On 6nri index 19
On 6mja index 20
On 6qr7 index 21
On 6o0h index 22
On 6qmt index 23
On 6ibz index 24
On 6d3y index 25
On 6qr0 index 26
On 6s9w index 27
On 6p8x index 28
On 6s07 index 29
On 6fe5 index 30
On 6oxp index 31
On 5zlf index 32
On 6n8x index 33
On 6qtw index 34
On 6mhd index 35
On 6jut index 36
On 6uhu index 37
On 6i8m index 38
On 6ckl index 39
On 6e6j index 40
On 5zjz index 41
On 6qts index 42
On 6h12 index 43
On 6pno index 44
On 6mo2 index 45
On 6oxv index 46
On 5zxk index 47
On 6cjs index 48
On 6mjq index 49
On 6mo0 index 50
On 6ahs index 51
On 6gj7 index 52
On 6cyh index 53
On 6gzy index 54
On 6oxy index 55
On 6oxr index 56
On 6qlr index 57
On 6i8t index 58
On 6qtr

In [4]:
#HarmonicFlow test set, vdW multiplier 0.87
#Input locations (complex names and predictions from Hannes)
npy_pdbs = "/Users/dsharon/Documents/MIT/6.8701/Project/Data/From_Hannes/HarmonicFlow_complex_names.npy"
l_pa = "/Users/dsharon/Documents/MIT/6.8701/Project/Data/From_Hannes/HarmonicFlow/inference_output_last_xt/"
protein_pa = "/Users/dsharon/Documents/MIT/6.8701/Project/Code/HarmonicFlow/FlowSite/data/PDBBind_processed"

#Run settings
hf_r = True
multval = 0.87
j_str = "HF_Test_Set_only_231210_1920"
output_dir = "/Users/dsharon/Documents/MIT/6.8701/Project/Analysis/RMSD_and_Chem_Feats/Clashes_231210_1900"

#Complex names to analyze
with open(npy_pdbs, "rb") as f:
    pdb_l = np.load(f)

#3.70 is the binding-site distance cutoff
clash_checking(pdb_list=pdb_l,
               l_path=l_pa,
               protein_path=protein_pa,
               hf_run=hf_r,
               cutoff_d=3.70,
               odir=output_dir,
               jstr=j_str,
               multip=multval)

On 6jt3 index 0
On 6jbb index 1
On 6ufo index 2
On 6os6 index 3
On 6jbe index 4
On 6qra index 5
On 6qsz index 6
On 6jam index 7
On 6a87 index 8
On 6oie index 9
On 6n4b index 10
On 6dyz index 11
On 6hhh index 12
On 6i5p index 13
On 6e6v index 14
On 6o9c index 15
On 6c85 index 16
On 6e13 index 17
On 6cjj index 18
On 6nri index 19
On 6mja index 20
On 6qr7 index 21
On 6o0h index 22
On 6qmt index 23
On 6ibz index 24
On 6d3y index 25
On 6qr0 index 26
On 6s9w index 27
On 6p8x index 28
On 6s07 index 29
On 6fe5 index 30
On 6oxp index 31
On 5zlf index 32
On 6n8x index 33
On 6qtw index 34
On 6mhd index 35
On 6jut index 36
On 6uhu index 37
On 6i8m index 38
On 6ckl index 39
On 6e6j index 40
On 5zjz index 41
On 6qts index 42
On 6h12 index 43
On 6pno index 44
On 6mo2 index 45
On 6oxv index 46
On 5zxk index 47
On 6cjs index 48
On 6mjq index 49
On 6mo0 index 50
On 6ahs index 51
On 6gj7 index 52
On 6cyh index 53
On 6gzy index 54
On 6oxy index 55
On 6oxr index 56
On 6qlr index 57
On 6i8t index 58
On 6qtr

In [5]:
#12/9/23 DiffDock test set, vdW multiplier 1.00
#Input locations (complex names and predictions from Hannes)
npy_pdbs_dd = "/Users/dsharon/Documents/MIT/6.8701/Project/Data/From_Hannes/user_predictions_testset/complex_names.npy"
l_pa_dd = "/Users/dsharon/Documents/MIT/6.8701/Project/Data/From_Hannes/user_predictions_testset/mol_files_231209/"
protein_pa_dd = "/Users/dsharon/Documents/MIT/6.8701/Project/Code/HarmonicFlow/FlowSite/data/PDBBind_processed"

#Run settings
hf_r = False
multval = 1.00
j_str = "DiffDock_Test_Set_only_231210_1920"
output_dir = "/Users/dsharon/Documents/MIT/6.8701/Project/Analysis/RMSD_and_Chem_Feats/Clashes_231210_1900"

#Complex names to analyze
with open(npy_pdbs_dd, "rb") as f:
    pdb_l_dd = np.load(f)

#Keep the first four characters of the last "/"-separated component of each entry
pdb_l_dd_cleaned = [entry.split("/")[-1][:4] for entry in pdb_l_dd]

#3.70 is the binding-site distance cutoff
clash_checking(pdb_list=pdb_l_dd_cleaned,
               l_path=l_pa_dd,
               protein_path=protein_pa_dd,
               hf_run=hf_r,
               cutoff_d=3.70,
               odir=output_dir,
               jstr=j_str,
               multip=multval)

On 6qqw index 0
On 6d08 index 1
On 6jap index 2
On 6np2 index 3
On 6uvp index 4
On 6oxq index 5
On 6jsn index 6
On 6hzb index 7
On 6qrc index 8
On 6oio index 9
On 6jag index 10
On 6moa index 11
On 6hld index 12
On 6i9a index 13
On 6e4c index 14
On 6g24 index 15
On 6jb4 index 16
On 6s55 index 17
On 6seo index 18
On 6dyz index 19
On 5zk5 index 20
On 6jid index 21
On 5ze6 index 22
On 6qlu index 23
On 6a6k index 24
On 6qgf index 25
On 6e3z index 26
On 6te6 index 27
On 6pka index 28
On 6g2o index 29
On 6jsf index 30
On 5zxk index 31
On 6qxd index 32
On 6n97 index 33
On 6jt3 index 34
On 6qtr index 35
On 6oy1 index 36
On 6n96 index 37
On 6qzh index 38
On 6qqz index 39
On 6qmt index 40
On 6ibx index 41
On 6hmt index 42
On 5zk7 index 43
On 6k3l index 44
On 6cjs index 45
On 6n9l index 46
On 6ibz index 47
On 6ott index 48
On 6gge index 49
On 6hot index 50
On 6e3p index 51
On 6md6 index 52
On 6hlb index 53
On 6fe5 index 54
On 6uwp index 55
On 6npp index 56
On 6g2f index 57
On 6mo7 index 58
On 6bqd

In [6]:
#12/9/23 DiffDock test set, vdW multiplier 0.87
#Input locations (complex names and predictions from Hannes)
npy_pdbs_dd = "/Users/dsharon/Documents/MIT/6.8701/Project/Data/From_Hannes/user_predictions_testset/complex_names.npy"
l_pa_dd = "/Users/dsharon/Documents/MIT/6.8701/Project/Data/From_Hannes/user_predictions_testset/mol_files_231209/"
protein_pa_dd = "/Users/dsharon/Documents/MIT/6.8701/Project/Code/HarmonicFlow/FlowSite/data/PDBBind_processed"

#Run settings
hf_r = False
multval = 0.87
j_str = "DiffDock_Test_Set_only_231210_1920"
output_dir = "/Users/dsharon/Documents/MIT/6.8701/Project/Analysis/RMSD_and_Chem_Feats/Clashes_231210_1900"

#Complex names to analyze
with open(npy_pdbs_dd, "rb") as f:
    pdb_l_dd = np.load(f)

#Keep the first four characters of the last "/"-separated component of each entry
pdb_l_dd_cleaned = [entry.split("/")[-1][:4] for entry in pdb_l_dd]

#3.70 is the binding-site distance cutoff
clash_checking(pdb_list=pdb_l_dd_cleaned,
               l_path=l_pa_dd,
               protein_path=protein_pa_dd,
               hf_run=hf_r,
               cutoff_d=3.70,
               odir=output_dir,
               jstr=j_str,
               multip=multval)

On 6qqw index 0
On 6d08 index 1
On 6jap index 2
On 6np2 index 3
On 6uvp index 4
On 6oxq index 5
On 6jsn index 6
On 6hzb index 7
On 6qrc index 8
On 6oio index 9
On 6jag index 10
On 6moa index 11
On 6hld index 12
On 6i9a index 13
On 6e4c index 14
On 6g24 index 15
On 6jb4 index 16
On 6s55 index 17
On 6seo index 18
On 6dyz index 19
On 5zk5 index 20
On 6jid index 21
On 5ze6 index 22
On 6qlu index 23
On 6a6k index 24
On 6qgf index 25
On 6e3z index 26
On 6te6 index 27
On 6pka index 28
On 6g2o index 29
On 6jsf index 30
On 5zxk index 31
On 6qxd index 32
On 6n97 index 33
On 6jt3 index 34
On 6qtr index 35
On 6oy1 index 36
On 6n96 index 37
On 6qzh index 38
On 6qqz index 39
On 6qmt index 40
On 6ibx index 41
On 6hmt index 42
On 5zk7 index 43
On 6k3l index 44
On 6cjs index 45
On 6n9l index 46
On 6ibz index 47
On 6ott index 48
On 6gge index 49
On 6hot index 50
On 6e3p index 51
On 6md6 index 52
On 6hlb index 53
On 6fe5 index 54
On 6uwp index 55
On 6npp index 56
On 6g2f index 57
On 6mo7 index 58
On 6bqd

In [None]:
#Single-complex test run, vdW multiplier 1.00
#Input locations
l_pa = "/Users/dsharon/Documents/MIT/6.8701/Project/Data/From_Hannes/HarmonicFlow/inference_output_last_xt/"
protein_pa = "/Users/dsharon/Documents/MIT/6.8701/Project/Data/From_Hannes/HarmonicFlow/inference_output"

#Run settings
pdb_l = ["6d08"]
hf_r = True
j_str = "HF_Test_231210"
output_dir = "/Users/dsharon/Documents/MIT/6.8701/Project/Analysis/RMSD_and_Chem_Feats/Clash_Check_231208_Test"

#3.70 is the binding-site distance cutoff
clash_checking(pdb_list=pdb_l,
               l_path=l_pa,
               protein_path=protein_pa,
               hf_run=hf_r,
               cutoff_d=3.70,
               odir=output_dir,
               jstr=j_str,
               multip=1.00)

In [None]:
#indeed val 4 and 5 appear
#glu 1sx, asp 1x, gly 1x, lys 1x, asn 1x, trp 1x