In [3]:
#12/8/23 find clashes
import MDAnalysis
from MDAnalysis.analysis import distances
from MDAnalysis.analysis.hydrogenbonds.hbond_analysis import HydrogenBondAnalysis
import numpy as np
import matplotlib.pyplot as plt
import os
import scipy
from scipy import stats

  from .autonotebook import tqdm as notebook_tqdm
  import xdrlib


In [32]:
#Check for clashes and report per residue type
def clash_checking(pdb_list, 
                   l_path, 
                   protein_path, 
                   hf_run, 
                   cutoff_d,
                   odir,
                   jstr):
    """Tally protein-ligand clashes and binding-site composition per complex.

    For every PDB id the protein and docked-ligand files are loaded with
    MDAnalysis and merged into one universe. A protein atom is counted as
    clashing with a ligand atom when it lies within the sum of the two atoms'
    van der Waals radii (hydrogens included). Counts are protein-atom /
    ligand-atom pairs, tallied by protein residue name. Ligand atoms whose
    element is not in the radius table are skipped, and protein residues
    whose name is not one of the 20 standard amino acids are not tallied.

    Residues having a heavy atom within ``cutoff_d`` of a ligand heavy atom
    are tallied by residue name as the (approximate) binding-site composition.

    Input:
    (a) pdb_list      iterable of PDB ids to analyze
    (b) l_path        directory holding the ligand files ``{pdb}_x20.pdb``
    (c) protein_path  directory holding ``{pdb}/{pdb}_protein_processed.pdb``
    (d) hf_run        boolean, True if HarmonicFlow, False if DiffDock
                      (only the HarmonicFlow file layout is implemented)
    (e) cutoff_d      cutoff distance for the binding-site selection, passed
                      to the MDAnalysis ``around`` keyword (Angstrom)
    (f) odir          output directory, created if it does not exist
    (g) jstr          job string used in the output file name

    Output:
    Writes ``{odir}/Clash_dictionary_{jstr}.npy``: a dict keyed by PDB id,
    each value holding a "clashes" and a "binding_site_comp" dict keyed by
    residue name. The dict is pickled by np.save, so read it back with
    ``np.load(path, allow_pickle=True).item()``. Returns None.

    Complexes are dropped from the output (and listed at the end) when the
    ligand has more than one resname, when the ligand resname is a protein
    resname, or when no protein residue is near the ligand.

    Raises:
    NotImplementedError if ``hf_run`` is False and there is something to
    analyze, instead of failing later on undefined file paths.
    """

    #Standard amino acids tallied in the output; the order fixes the dictionary key order
    aa_resnames = ("ALA", "CYS", "ASP", "GLU", "PHE",
                   "GLY", "HIS", "ILE", "LYS", "LEU",
                   "MET", "ASN", "PRO", "GLN", "ARG",
                   "SER", "THR", "VAL", "TRP", "TYR")

    #A ligand named like a protein residue (incl. MSE) cannot be selected separately from the protein
    protein_like_resnames = aa_resnames + ("MSE",)

    #Ref https://pubs.acs.org/doi/10.1021/jp8111556
    #Ref http://ursula.chem.yale.edu/~chem220/chem220js/STUDYAIDS/vanderwaalsradius.html
    vdw_radius_dict = {"H": 1.2,
                       "C": 1.7,
                       "N": 1.6,
                       "O": 1.55,
                       "F": 1.5,
                       "S": 1.8,
                       "Cl": 1.8}

    #Removing from analysis
    remove_list = []

    #Clashing residue dictionary
    clash_res_dist = {}

    #Create the output directory up front so a bad path fails before the analysis, not after it
    os.makedirs(odir, exist_ok=True)

    #Iterate over each pdb
    for ipdb, pdb_analyze in enumerate(pdb_list):

        print(f"On {pdb_analyze} index {ipdb}")

        #############
        #Locate docking output files, make universes
        #############
        if hf_run:
            protein_file = f"{protein_path}/{pdb_analyze}/{pdb_analyze}_protein_processed.pdb"
            ligand_file = f"{l_path}/{pdb_analyze}_x20.pdb"
        else:
            #Previously fell through with the file names undefined (or left over from the last complex)
            raise NotImplementedError(
                "clash_checking only implements the HarmonicFlow file layout (hf_run=True); "
                "DiffDock input paths are not defined"
            )

        u_prot = MDAnalysis.Universe(protein_file)
        u_ligand = MDAnalysis.Universe(ligand_file)

        #Ref https://userguide.mdanalysis.org/stable/universe.html
        #Ref https://docs.mdanalysis.org/2.6.1/documentation_pages/core/universe.html#MDAnalysis.core.universe.Merge
        #Merge
        u_pl = MDAnalysis.core.universe.Merge(u_prot.atoms, u_ligand.atoms)

        #############
        #Protein and ligand info setup
        #############
        #For now- all segids allowed
        #Empty segids are dropped because they would leave a dangling "segid" keyword in the selection
        psegids = sorted({str(segid) for segid in u_prot.atoms.segids if segid})
        segid_clause = f"segid {' '.join(psegids)} and " if psegids else ""

        lresnames = sorted({str(resname) for resname in u_ligand.atoms.resnames})

        #for now let only 1 ligand resname be used, simplifies
        if len(lresnames) > 1:
            print("over 1 ligand resnames REMOVAL")
            print(lresnames)
            print(f"resname string {' '.join(lresnames)}")
            remove_list.append(pdb_analyze)
            continue

        lresname = lresnames[0]

        #If ligand resname is a protein resname- exclude
        if lresname in protein_like_resnames:
            remove_list.append(pdb_analyze)
            print(f"Ligand resname {lresname} is protein resname: REMOVAL")
            continue

        #############
        #Clash checking
        #############
        #Clashing residues
        res_clash_dict = dict.fromkeys(aa_resnames, 0)

        #Binding site composition
        bsite_comp_dict = dict.fromkeys(aa_resnames, 0)

        #For each ligand atom - clash check
        for l in u_pl.select_atoms(f"resname {lresname}"):

            #Record element
            l_element = l.element
            print(f"{l_element} {l.name}")

            #No radius known for this element- cannot clash check this atom
            if l_element not in vdw_radius_dict:
                continue

            #Ligand radius
            l_radius = vdw_radius_dict[l_element]

            #Now find whether any protein atoms are below vdw distance sum with this ligand atom
            #One selection per protein element because the cutoff depends on the element pair
            for e_check, e_radius in vdw_radius_dict.items():
                sum_vdw = l_radius + e_radius

                #Find atoms in protein within that cutoff
                #Record resnames
                p_clash_l = u_pl.select_atoms(f"protein and element {e_check} and around {sum_vdw} (resname {lresname} and index {l.index})")
                if len(p_clash_l) > 0:
                    print(p_clash_l)
                    print(f"vdw cutoff for above was {sum_vdw}")
                for p_clash_l_atom in p_clash_l:
                    print(p_clash_l_atom.resname)
                    #"protein" also matches non-standard resnames (MSE, HIE, ...); only tally the standard ones
                    if p_clash_l_atom.resname in res_clash_dict:
                        res_clash_dict[p_clash_l_atom.resname] += 1

        #Also find binding site composition- this is approx.
        #Which protein atoms are near the ligand?
        p_near_l = u_pl.select_atoms(f"protein and not element H and {segid_clause}around {cutoff_d} (resname {lresname} and not element H)")

        #############
        #Find resids near the ligand
        #############
        #.residues is already unique, and ordered, so the printout is reproducible
        unique_resids = list(p_near_l.residues)
        print("nearby residues")
        print(unique_resids)
        if len(unique_resids) == 0:
            remove_list.append(pdb_analyze)
            print("NO NEARBY RESIDS REMOVAL")
            continue

        for u in unique_resids:
            u_resname = u.resname
            if u_resname in bsite_comp_dict:
                bsite_comp_dict[u_resname] += 1

        #Add to overall dictionary
        clash_res_dist[pdb_analyze] = {"clashes": res_clash_dict,
                                       "binding_site_comp": bsite_comp_dict}

    print("clash dictionary")
    print(clash_res_dist)

    #Summarize dropped complexes; the per-complex REMOVAL messages get buried in the per-atom output
    print(f"removed from analysis: {remove_list}")

    #A dict is stored as a pickled 0-d object array
    np.save(f"{odir}/Clash_dictionary_{jstr}.npy", clash_res_dist)

In [33]:
#Single-complex test run of clash_checking on HarmonicFlow output
#Run settings
pdb_l = ["6d08"]
hf_r = True
j_str = "HF_Test_1"

#Input and output locations
l_pa = "/Users/dsharon/Documents/MIT/6.8701/Project/Data/From_Hannes/HarmonicFlow/inference_output_last_xt/"
protein_pa = "/Users/dsharon/Documents/MIT/6.8701/Project/Data/From_Hannes/HarmonicFlow/inference_output"
output_dir = "/Users/dsharon/Documents/MIT/6.8701/Project/Analysis/RMSD_and_Chem_Feats/Clash_Check_231208_Test"

#Keyword arguments so each value is visibly tied to its parameter
clash_checking(pdb_list=pdb_l,
               l_path=l_pa,
               protein_path=protein_pa,
               hf_run=hf_r,
               cutoff_d=3.70,
               odir=output_dir,
               jstr=j_str)

On 6d08 index 0
N N1
C C1
C C2
O O1
C C3
N N2
C C4
C C5
<AtomGroup [<Atom 68: H of type H of resname VAL, resid 5 and segid A and altLoc >]>
vdw cutoff for above was 2.9
VAL
O O2
<AtomGroup [<Atom 53: HA of type H of resname VAL, resid 4 and segid A and altLoc >, <Atom 68: H of type H of resname VAL, resid 5 and segid A and altLoc >]>
vdw cutoff for above was 2.75
VAL
VAL
<AtomGroup [<Atom 46: CA of type C of resname VAL, resid 4 and segid A and altLoc >]>
vdw cutoff for above was 3.25
VAL
<AtomGroup [<Atom 61: N of type N of resname VAL, resid 5 and segid A and altLoc >]>
vdw cutoff for above was 3.1500000000000004
VAL
C C6
<AtomGroup [<Atom 68: H of type H of resname VAL, resid 5 and segid A and altLoc >]>
vdw cutoff for above was 2.9
VAL
<AtomGroup [<Atom 61: N of type N of resname VAL, resid 5 and segid A and altLoc >]>
vdw cutoff for above was 3.3
VAL
C C7
<AtomGroup [<Atom 57: HG13 of type H of resname VAL, resid 4 and segid A and altLoc >, <Atom 68: H of type H of resname VAL, r

O O12
nearby residues
[<Residue GLY, 1>, <Residue GLU, 2>, <Residue PHE, 3>, <Residue VAL, 4>, <Residue VAL, 5>, <Residue GLU, 6>, <Residue ASN, 39>, <Residue ASP, 41>, <Residue TRP, 24>, <Residue LYS, 25>]
clash dictionary
{'6d08': {'clashes': {'ALA': 0, 'CYS': 0, 'ASP': 1, 'GLU': 19, 'PHE': 5, 'GLY': 38, 'HIS': 0, 'ILE': 0, 'LYS': 3, 'LEU': 0, 'MET': 0, 'ASN': 16, 'PRO': 0, 'GLN': 0, 'ARG': 0, 'SER': 0, 'THR': 0, 'VAL': 45, 'TRP': 7, 'TYR': 0}, 'binding_site_comp': {'ALA': 0, 'CYS': 0, 'ASP': 1, 'GLU': 2, 'PHE': 1, 'GLY': 1, 'HIS': 0, 'ILE': 0, 'LYS': 1, 'LEU': 0, 'MET': 0, 'ASN': 1, 'PRO': 0, 'GLN': 0, 'ARG': 0, 'SER': 0, 'THR': 0, 'VAL': 2, 'TRP': 1, 'TYR': 0}}}
