In [2]:
#Find the binding-site residue composition and the average Kyte-Doolittle hydropathy for a set of PDBBind entries of interest.
#Runs over all PDBs of interest and saves the results to a .npy file.
import MDAnalysis
from MDAnalysis.analysis import distances
from MDAnalysis.analysis.hydrogenbonds.hbond_analysis import HydrogenBondAnalysis
import numpy as np
import matplotlib.pyplot as plt
import os
import scipy
from scipy import stats

  from .autonotebook import tqdm as notebook_tqdm
  import xdrlib


In [18]:
#############
#Run a binding site analysis for a set of structures
#Inputs:
#(a) pdb_path path to the pdb overall dir
#(b) list_pdbs list of pdb structures to work with
#(c) output_dir where to place output.npy
#(d) cutoff_d cutoff for contact
#############

#Kyte-Doolittle hydropathy value for each standard amino-acid residue name.
#Ref https://resources.qiagenbioinformatics.com/manuals/clcgenomicsworkbench/650/Hydrophobicity_scales.html
_KD_HYDROPATHY = {
    "ALA": 1.8,
    "CYS": 2.5,
    "ASP": -3.5,
    "GLU": -3.5,
    "PHE": 2.8,
    "GLY": -0.4,
    "HIS": -3.2,
    "ILE": 4.5,
    "LYS": -3.9,
    "LEU": 3.8,
    "MET": 1.9,
    "ASN": -3.5,
    "PRO": -1.6,
    "GLN": -3.5,
    "ARG": -4.5,
    "SER": -0.8,
    "THR": -0.7,
    "VAL": 4.2,
    "TRP": -0.9,
    "TYR": -1.3,
}


def _find_ligand_mol2(complex_dir):
    """Return the path of the ligand .mol2 file stored in ``complex_dir``.

    Only .mol2 files are considered because MDAnalysis can read them (it cannot
    read .sdf); files with 'rdkit' in their name are ignored (file filter
    adapted from https://github.com/gcorso/DiffDock/blob/main/datasets/pdbbind.py).

    Raises:
        FileNotFoundError: if the directory holds no matching .mol2 file.
    """
    candidates = sorted(
        name for name in os.listdir(complex_dir)
        if name.endswith(".mol2") and 'rdkit' not in name
    )
    if not candidates:
        #Fail loudly instead of hitting a NameError or silently reusing the
        #ligand file of the previously processed structure.
        raise FileNotFoundError(f"No ligand .mol2 file found in {complex_dir}")
    if len(candidates) > 1:
        #os.listdir order is arbitrary, so pick deterministically and say so.
        print(f"Several ligand .mol2 files in {complex_dir}: {candidates}; using {candidates[0]}")
    return os.path.join(complex_dir, candidates[0])


def _average_kd(resnames):
    """Average the Kyte-Doolittle values of ``resnames``.

    Residue names missing from the KD table are reported and left out of the
    average. Returns NaN when no residue has a KD value.
    """
    kd_of_aas = []
    for aa_add in resnames:
        if aa_add in _KD_HYDROPATHY:
            kd_of_aas.append(_KD_HYDROPATHY[aa_add])
        else:
            print(f"PROBELM {aa_add} not in dictionary")
    if not kd_of_aas:
        #np.average of an empty list would only produce NaN plus a RuntimeWarning.
        return np.nan
    return np.average(kd_of_aas)


def run_for_pdbs(pdb_path, list_pdbs, output_dir, cutoff_d):
    """Compute binding-site composition and average KD hydropathy per structure.

    For every entry of ``list_pdbs`` the protein
    (``{pdb_path}/{pdb}/{pdb}_protein_processed.pdb``) and the ligand .mol2 file
    from the same directory are loaded and merged with MDAnalysis. Protein heavy
    atoms within ``cutoff_d`` of any ligand heavy atom define the binding site.
    The residue names of the site and their average Kyte-Doolittle hydropathy
    are stored per structure.

    Args:
        pdb_path: directory containing one sub-directory per structure.
        list_pdbs: iterable of structure names (sub-directory names).
        output_dir: directory for the output file; created if it is missing.
        cutoff_d: contact cutoff handed to the MDAnalysis ``around`` selection
            (MDAnalysis distances are in Angstrom).

    Returns:
        None. The result, a dict ``{pdb: {"average_kd": float, "residues": [names]}}``,
        is written to ``{output_dir}/Binding_site_composition_Cutoff_{cutoff_d}.npy``.
        Because it is a pickled dict, read it back with
        ``np.load(path, allow_pickle=True).item()``.

    Raises:
        FileNotFoundError: if a structure directory has no ligand .mol2 file.
    """
    #Keys- pdbs, values- resid lists and also average KD hydropathies
    pdb_binding_site_info = {}

    #Iterate over each pdb
    for ipdb, pdb_analyze in enumerate(list_pdbs):

        print(f"On {pdb_analyze} index {ipdb}")

        #############
        #Load PDBBind files from disk, make universes
        #############
        protein_file = f"{pdb_path}/{pdb_analyze}/{pdb_analyze}_protein_processed.pdb"
        u_prot = MDAnalysis.Universe(protein_file)

        ligand_file = _find_ligand_mol2(os.path.join(pdb_path, pdb_analyze))
        u_ligand = MDAnalysis.Universe(ligand_file)

        #Ref https://userguide.mdanalysis.org/stable/universe.html
        #Ref https://docs.mdanalysis.org/2.6.1/documentation_pages/core/universe.html#MDAnalysis.core.universe.Merge
        #Merge puts the protein atoms first and the ligand atoms after them
        u_pl = MDAnalysis.core.universe.Merge(u_prot.atoms, u_ligand.atoms)

        #############
        #Protein and ligand info setup
        #############

        #For now- all protein segids allowed
        lresnames = sorted({str(name) for name in u_ligand.atoms.resnames})
        if len(lresnames) > 1:
            print("over 1 ligand resnames")
            print(lresnames)

        #Take the ligand by its position in the merged universe rather than by
        #residue name: this covers ligands with several residue names and can
        #never pick up protein residues that share a name with the ligand.
        n_protein_atoms = len(u_prot.atoms)
        ligand_heavy = u_pl.atoms[n_protein_atoms:].select_atoms("not element H")

        #Which protein atoms are near the ligand?
        p_near_l = u_pl.select_atoms(
            f"protein and not element H and around {cutoff_d} group ligand",
            ligand=ligand_heavy,
        )

        #############
        #Find resids near the ligand and average KD
        #############
        #AtomGroup.residues is unique and ordered, so the saved list is reproducible
        unique_resid_names = [r.resname for r in p_near_l.residues]

        kd_average = _average_kd(unique_resid_names)
        print(f"average {kd_average}")

        pdb_binding_site_info[pdb_analyze] = {"average_kd" : kd_average,
                                              "residues" : unique_resid_names}

    #Save this info (make sure the target directory exists first)
    os.makedirs(output_dir, exist_ok=True)
    np.save(f"{output_dir}/Binding_site_composition_Cutoff_{cutoff_d}.npy", pdb_binding_site_info)

In [20]:
#Inputs for this run: contact cutoff, PDBBind structure directory,
#the .npy file listing the complexes to analyze (from Hannes), and the output directory
dcutoff = 3.70
pdb_dir = "/Users/dsharon/Documents/MIT/6.8701/Project/Code/HarmonicFlow/FlowSite/data/PDBBind_processed"
pdb_list = "/Users/dsharon/Documents/MIT/6.8701/Project/Data/From_Hannes/TEST3_top40_epoch75_FILTER_restart_cacheNewRestart_big_ema_ESM2emb_tr34_WITH_fixedSamples28_id1_FILTERFROM_temp_restart_ema_ESM2emb_tr34/complex_names.npy"
odir = "/Users/dsharon/Documents/MIT/6.8701/Project/Analysis/RMSD_and_Chem_Feats/Site_Comp_231128"

#Read the array of complex names
with open(pdb_list, mode='rb') as names_file:
    complex_names = np.load(names_file)

#Analyze every listed complex and write the results into odir
run_for_pdbs(
    pdb_path=pdb_dir,
    list_pdbs=complex_names,
    output_dir=odir,
    cutoff_d=dcutoff,
)

On 6qqw index 0
average -0.49375
On 6d08 index 1
average -0.17368421052631577
On 6jap index 2
average -1.3999999999999995
On 6np2 index 3
average -1.791666666666667
On 6uvp index 4
average -0.31428571428571433
On 6oxq index 5
average -0.4555555555555555
On 6jsn index 6
average -0.40625000000000006
On 6hzb index 7
average -0.4914772727272727
On 6qrc index 8
average -0.8066666666666665
On 6oio index 9
average -0.13999999999999999
On 6jag index 10
average -1.59375
On 6moa index 11
average 0.17142857142857132
On 6hld index 12
average -1.2476190476190476
On 6i9a index 13
average -0.7533333333333334
On 6e4c index 14
average -2.68
On 6g24 index 15
average -0.675
On 6jb4 index 16
average -1.81875
On 6s55 index 17
average 0.31250000000000006
On 6seo index 18
average -1.1111111111111112
On 6dyz index 19
average -0.20000000000000007
On 5zk5 index 20
average -0.7199999999999999
On 6jid index 21
average -0.7500000000000001
On 5ze6 index 22
average -2.211111111111111
On 6qlu index 23
average -2.3
On

average -0.7354838709677418
On 6jse index 192
average -0.4062500000000001
On 5zjy index 193
average -1.4142857142857144
On 6o3y index 194
average -0.24181818181818182
On 6rpg index 195
average -1.0238095238095237
On 6rr0 index 196
average -1.5775510204081635
On 6gzy index 197
average 0.2454545454545454
On 6qlt index 198
average -2.511111111111111
On 6ufo index 199
average -2.064285714285714
On 6o0h index 200
average 0.19230769230769232
On 6o3x index 201
average -0.1589041095890411
On 5zjz index 202
average -1.5090909090909093
On 6i8t index 203
average -1.0125
On 6ooy index 204
average -0.050000000000000086
On 6oiq index 205
average -0.47142857142857136
On 6od6 index 206
average -0.1461538461538462
On 6nrh index 207
average -1.0444444444444443
On 6qra index 208
average -0.975
On 6hhh index 209
average -0.888235294117647
On 6m7h index 210
PROBELM MSE not in dictionary
PROBELM MSE not in dictionary
average -1.7333333333333334
On 6ufn index 211
average -1.68
On 6qr0 index 212
average -0.28