In [1]:
from Bio.PDB import PDBParser, NeighborSearch
from rdkit import Chem

In [2]:
def parse_ligand_sdf(ligand_path):
    mol = Chem.MolFromMolFile(str(ligand_path))
    if mol is None:
        logger.warning(f"Could not parse ligand: {ligand_path}")
    return mol

In [3]:
parser = PDBParser(QUIET=True)

In [6]:
pdb_file = "../../../Downloads/refined-set/1a1e/1a1e_protein.pdb"
ligand_file = "../../../Downloads/refined-set/1a1e/1a1e_ligand.sdf"

In [7]:
structure = parser.get_structure("complex", pdb_file)

In [8]:
all_atoms = [atom for atom in structure.get_atoms()]

In [9]:
all_atoms[0]

<Atom N>

In [82]:
ns = NeighborSearch(all_atoms)

In [83]:
ligand = parse_ligand_sdf(ligand_file)



In [84]:
ligand_atoms = ligand.GetConformers()[0].GetPositions()

In [85]:
radius = 6

In [86]:
pocket_residues = set()
for atom in ligand_atoms:
    close_atoms = ns.search(atom, radius)
    for close_atom in close_atoms:
        pocket_residues.add(close_atom.get_parent().get_id())

In [87]:
pocket_residues

{(' ', 156, ' '),
 (' ', 157, ' '),
 (' ', 158, ' '),
 (' ', 178, ' '),
 (' ', 179, ' '),
 (' ', 180, ' '),
 (' ', 181, ' '),
 (' ', 182, ' '),
 (' ', 183, ' '),
 (' ', 186, ' '),
 (' ', 187, ' '),
 (' ', 188, ' '),
 (' ', 189, ' '),
 (' ', 190, ' '),
 (' ', 203, ' '),
 (' ', 204, ' '),
 (' ', 205, ' '),
 (' ', 206, ' '),
 (' ', 217, ' '),
 (' ', 218, ' '),
 (' ', 233, ' '),
 (' ', 238, ' '),
 (' ', 239, ' '),
 (' ', 240, ' '),
 (' ', 241, ' '),
 ('W', 28, ' '),
 ('W', 31, ' '),
 ('W', 32, ' '),
 ('W', 43, ' '),
 ('W', 44, ' ')}

In [88]:
from pathlib import Path

In [93]:
list(Path("../../../Downloads/refined-set").glob("*/*_protein.pdb"))

[PosixPath('../../../Downloads/refined-set/6ugp/6ugp_protein.pdb'),
 PosixPath('../../../Downloads/refined-set/4rdn/4rdn_protein.pdb'),
 PosixPath('../../../Downloads/refined-set/4mo4/4mo4_protein.pdb'),
 PosixPath('../../../Downloads/refined-set/3s0b/3s0b_protein.pdb'),
 PosixPath('../../../Downloads/refined-set/6r1d/6r1d_protein.pdb'),
 PosixPath('../../../Downloads/refined-set/3fwv/3fwv_protein.pdb'),
 PosixPath('../../../Downloads/refined-set/1qin/1qin_protein.pdb'),
 PosixPath('../../../Downloads/refined-set/4b5d/4b5d_protein.pdb'),
 PosixPath('../../../Downloads/refined-set/3sus/3sus_protein.pdb'),
 PosixPath('../../../Downloads/refined-set/3b26/3b26_protein.pdb'),
 PosixPath('../../../Downloads/refined-set/6o0m/6o0m_protein.pdb'),
 PosixPath('../../../Downloads/refined-set/2v25/2v25_protein.pdb'),
 PosixPath('../../../Downloads/refined-set/6g3a/6g3a_protein.pdb'),
 PosixPath('../../../Downloads/refined-set/4oc2/4oc2_protein.pdb'),
 PosixPath('../../../Downloads/refined-set/4f5y/

In [104]:
pdb_files = list(Path("../../../Downloads/refined-set").glob("*/*_protein.pdb"))
for pdb_file in pdb_files:
    ligand_path = pdb_file.parent / f"{pdb_file.parts[-2]}_ligand.sdf"
    print(ligand_path)

../../../Downloads/refined-set/6ugp/6ugp_ligand.sdf
../../../Downloads/refined-set/4rdn/4rdn_ligand.sdf
../../../Downloads/refined-set/4mo4/4mo4_ligand.sdf
../../../Downloads/refined-set/3s0b/3s0b_ligand.sdf
../../../Downloads/refined-set/6r1d/6r1d_ligand.sdf
../../../Downloads/refined-set/3fwv/3fwv_ligand.sdf
../../../Downloads/refined-set/1qin/1qin_ligand.sdf
../../../Downloads/refined-set/4b5d/4b5d_ligand.sdf
../../../Downloads/refined-set/3sus/3sus_ligand.sdf
../../../Downloads/refined-set/3b26/3b26_ligand.sdf
../../../Downloads/refined-set/6o0m/6o0m_ligand.sdf
../../../Downloads/refined-set/2v25/2v25_ligand.sdf
../../../Downloads/refined-set/6g3a/6g3a_ligand.sdf
../../../Downloads/refined-set/4oc2/4oc2_ligand.sdf
../../../Downloads/refined-set/4f5y/4f5y_ligand.sdf
../../../Downloads/refined-set/1bai/1bai_ligand.sdf
../../../Downloads/refined-set/2avo/2avo_ligand.sdf
../../../Downloads/refined-set/1ppi/1ppi_ligand.sdf
../../../Downloads/refined-set/5c28/5c28_ligand.sdf
../../../Dow

In [None]:
import urllib
from tqdm import tqdm

url = "https://pdbbind-1301734146.cos.ap-shanghai.myqcloud.com/subscribe/v2020/PDBbind_v2020_refined.tar.gz?sign=q-sign-algorithm%3Dsha1%26q-ak%3DAKIDzoOinb9RTFkyvc3D6j5AxVmmVyAyVllV%26q-sign-time%3D1760291725%3B1760295385%26q-key-time%3D1760291725%3B1760295385%26q-header-list%3Dhost%26q-url-param-list%3D%26q-signature%3D63674eda79e88de7db449c4540c704e85b65de84"
dest_path = "refined.tar.gz"
with urllib.request.urlopen(url) as response, open(dest_path, "wb") as out_file:
    total = int(response.info().get("Content-Length", 0))
    with tqdm(total=total, unit='B', unit_scale=True, desc=f"Downloading {dest_path}") as pbar:
        for data in iter(lambda: response.read(1024 * 1024), b""):
            out_file.write(data)
            pbar.update(len(data))

Downloading refined.tar.gz:   0%|                               | 2.10M/691M [00:02<11:15, 1.02MB/s]

In [None]:
https://pdbbind-1301734146.cos.ap-shanghai.myqcloud.com/subscribe/v2020/PDBbind_v2020_refined.tar.gz?sign=q-sign-algorithm%3Dsha1%26q-ak%3DAKIDzoOinb9RTFkyvc3D6j5AxVmmVyAyVllV%26q-sign-time%3D1760291725%3B1760295385%26q-key-time%3D1760291725%3B1760295385%26q-header-list%3Dhost%26q-url-param-list%3D%26q-signature%3D63674eda79e88de7db449c4540c704e85b65de84

In [None]:
https://pdbbind-1301734146.cos.ap-shanghai.myqcloud.com/subscribe/v2020/PDBbind_v2020_plain_text_index.tar.gz?sign=q-sign-algorithm%3Dsha1%26q-ak%3DAKIDzoOinb9RTFkyvc3D6j5AxVmmVyAyVllV%26q-sign-time%3D1760291763%3B1760295423%26q-key-time%3D1760291763%3B1760295423%26q-header-list%3Dhost%26q-url-param-list%3D%26q-signature%3D7306ebcb5870277acf58ffcb991b3210bb7a1f2b