# overview

We start from the raw PDBbind dataset downloaded from http://www.pdbbind.org.cn/download.php

1. Filter out the entries whose ligand files cannot be processed by RDKit.

2. Process each protein, keeping only the chains that have at least one atom within 10 Å of any ligand atom.

3. Use p2rank to segment each protein into blocks.

4. Extract protein and ligand features.

5. Construct the training and test datasets.


In [1]:
tankbind_src_folder_path = "../tankbind/"
import sys
sys.path.insert(0, tankbind_src_folder_path)

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# process the raw PDBbind dataset.

In [3]:
from utils import read_pdbbind_data

In [4]:
# raw PDBbind dataset could be downloaded from http://www.pdbbind.org.cn/download.php
# Root folder of the local PDBbind 2020 copy; the paths in this notebook are built from it.
pre = "./pdbbind/pdbbind2020/"
# Parse the name index: fields are split on two consecutive spaces, text after '#'
# is treated as a comment, and the file has no header row. Only the 'pdb' and
# 'uid' columns are kept below; the single-letter names are placeholders.
df_pdb_id = pd.read_csv(f'{pre}/pdbbind_index/INDEX_general_PL_name.2020', sep="  ", comment='#', header=None, names=['pdb', 'year', 'uid', 'd', 'e','f','g','h','i','j','k','l','m','n','o'], engine='python')

# Always-on switch that truncates the index to its first 100 rows.
# NOTE(review): presumably a quick-run/debug setting — disable it to process the full index.
if True:
    df_pdb_id = df_pdb_id[:100]

df_pdb_id = df_pdb_id[['pdb','uid']]
# read_pdbbind_data comes from utils; presumably it parses the affinity data index — confirm there.
data = read_pdbbind_data(f'{pre}/pdbbind_index/INDEX_general_PL_data.2020')
# Merge on 'pdb' with pandas' default inner join, so only ids present in both tables remain.
data = data.merge(df_pdb_id, on=['pdb'])
print(len(data))


100


# ligand file should be readable by RDKit.

In [5]:
from feature_utils import read_mol

In [6]:
from rdkit import RDLogger

# Turn off RDKit's 'rdApp.*' log channels while the ligand files are parsed.
RDLogger.DisableLog('rdApp.*')

# pdb_list collects the ids that were read without a problem;
# problem_list collects (pdb, problem) pairs for the rest.
pdb_list, problem_list = [], []
for pdb in tqdm(data.pdb):
    sdf_fileName = f"{pre}/pdbbind_files/{pdb}/{pdb}_ligand.sdf"
    mol2_fileName = f"{pre}/pdbbind_files/{pdb}/{pdb}_ligand.mol2"
    mol, problem = read_mol(sdf_fileName, mol2_fileName)
    # A truthy `problem` marks the entry as unusable.
    if not problem:
        pdb_list.append(pdb)
    else:
        problem_list.append((pdb, problem))

100%|██████████| 100/100 [00:01<00:00, 71.10it/s]


In [7]:
# Keep only the rows whose pdb id is in pdb_list, then renumber the index from 0.
data = data.query("pdb in @pdb_list").reset_index(drop=True)

In [8]:
data.shape

(100, 7)

In [9]:
len(problem_list)

0

In [10]:
len(pdb_list)

100

In [11]:
problem_list[:10]

[]

### For ease of RMSD evaluation later, we renumber the ligand atoms so that their indices are consistent with the SMILES.

In [12]:
from feature_utils import write_renumbered_sdf

In [13]:
import os

# Folder that receives one renumbered ligand SDF per PDB id.
toFolder = f"{pre}/renumber_atom_index_same_as_smiles"
# Create the folder from Python rather than shelling out to `mkdir -p`: this is
# safe for paths with spaces or shell metacharacters and raises if creation fails.
os.makedirs(toFolder, exist_ok=True)
for pdb in tqdm(pdb_list):
    sdf_fileName = f"{pre}/pdbbind_files/{pdb}/{pdb}_ligand.sdf"
    mol2_fileName = f"{pre}/pdbbind_files/{pdb}/{pdb}_ligand.mol2"
    toFile = f"{toFolder}/{pdb}.sdf"
    # write_renumbered_sdf comes from feature_utils; it is given the output path
    # followed by the two candidate ligand files.
    write_renumbered_sdf(toFile, sdf_fileName, mol2_fileName)


100%|██████████| 100/100 [00:03<00:00, 29.87it/s]


# process PDBbind proteins, removing extra chains, cutoff 10A

In [14]:
# Output folder for the chain-trimmed protein files (10 Å cutoff, see next cell).
toFolder = f"{pre}/protein_remove_extra_chains_10A/"
# `mkdir -p` also succeeds when the folder already exists; the value displayed
# by this cell is the exit status returned by os.system.
os.system(f"mkdir -p {toFolder}")

0

In [15]:
# Distance cutoff handed to the chain-selection helper with every work item.
cutoff = 10
input_ = []
for pdb in data.pdb.values:
    # One work item per PDB id:
    # (original protein file, renumbered ligand file, cutoff, output protein file).
    input_.append((
        f"{pre}/pdbbind_files/{pdb}/{pdb}_protein.pdb",
        f"{pre}/renumber_atom_index_same_as_smiles/{pdb}.sdf",
        cutoff,
        f"{toFolder}/{pdb}_protein.pdb",
    ))

In [16]:
from feature_utils import select_chain_within_cutoff_to_ligand_v2

In [17]:
import mlcrate as mlc
import os
# Pool created with size 64 (the progress line below reports "64 CPUs").
pool = mlc.SuperPool(64)
# NOTE(review): presumably re-opens the underlying pool in case an earlier cell
# closed it — confirm against the mlcrate/pathos documentation.
pool.pool.restart()
# Apply the chain-selection helper to every work item; the return values are
# discarded, so the helper is used for whatever it writes to disk.
_ = pool.map(select_chain_within_cutoff_to_ligand_v2,input_)
pool.exit()

[mlcrate] 64 CPUs: 100%|██████████| 100/100 [00:08<00:00, 12.43it/s]


In [18]:
# Previously, I found that 2r1w has no chain near the ligand,
# so that entry is excluded from all later steps.
data = data.query("pdb != '2r1w'").reset_index(drop=True)

# p2rank segmentation

In [19]:
# Folder holding the P2Rank dataset list and, later, its predictions.
p2rank_prediction_folder = f"{pre}/p2rank_protein_remove_extra_chains_10A"
# Create the folder from Python rather than shelling out to `mkdir -p`: this is
# safe for paths with spaces or shell metacharacters and raises if creation fails.
os.makedirs(p2rank_prediction_folder, exist_ok=True)
ds = f"{p2rank_prediction_folder}/protein_list.ds"
# One protein path per line.
# NOTE(review): the "../" prefix presumably makes each path relative to the
# folder containing the .ds file — confirm against the P2Rank documentation.
with open(ds, "w") as out:
    for pdb in data.pdb.values:
        out.write(f"../protein_remove_extra_chains_10A/{pdb}_protein.pdb\n")

In [20]:
# takes about 30 minutes.
#p2rank = "bash /packages/p2rank_2.3/prank"
# Machine-specific command for launching the P2Rank script; adjust to the local install.
p2rank = "bash /home/jovyan/TankBind/p2rank_2.3/prank"
# Run `predict` on the dataset list with 16 threads, writing into
# {p2rank_prediction_folder}/p2rank (the folder later cells read the
# *_predictions.csv files from).
cmd = f"{p2rank} predict {ds} -o {p2rank_prediction_folder}/p2rank -threads 16"
# os.system returns the command's exit status, which the notebook displays.
os.system(cmd)

----------------------------------------------------------------------------------------------
 P2Rank 2.3
----------------------------------------------------------------------------------------------

predicting pockets for proteins from dataset [protein_list.ds]
processing [3mwe_protein.pdb] (1/100)
processing [5uah_protein.pdb] (2/100)
processing [5ual_protein.pdb] (3/100)
processing [6nzk_protein.pdb] (5/100)
processing [5swg_protein.pdb] (7/100)
processing [4zh4_protein.pdb] (4/100)
processing [4zh3_protein.pdb] (6/100)
processing [2j9n_protein.pdb] (9/100)
processing [4xwk_protein.pdb] (8/100)
processing [5sxk_protein.pdb] (10/100)
processing [1t3t_protein.pdb] (11/100)
processing [1i6v_protein.pdb] (12/100)
processing [4waf_protein.pdb] (13/100)
processing [4xsx_protein.pdb] (14/100)
processing [1i1e_protein.pdb] (16/100)
processing [4zh2_protein.pdb] (15/100)
processing [4qsh_protein.pdb] (17/100)
processing [4qsk_protein.pdb] (18/100)
processing [5urj_protein.pdb] (19/100)
pr

0

In [21]:
# Save the filtered table; with pandas' defaults the index is written as the first column.
data.to_csv(f"{pre}/data.csv")

In [22]:
# Rebuild pdb_list from the filtered table; from here on it is a NumPy array
# (`.values`) rather than the Python list built earlier.
pdb_list = data.pdb.values
print(len(pdb_list))

100


In [23]:
# Folder for the intermediate files written by the following cells
# (P2Rank table, protein embeddings, compound features).
tankbind_data_path = f"{pre}/tankbind_data"
# The value displayed by this cell is the exit status returned by os.system.
os.system(f"mkdir -p {tankbind_data_path}")

0

In [24]:
name_list = pdb_list
d_list = []

for name in tqdm(name_list):
    # Per-protein pocket predictions written by P2Rank.
    pocket_table = pd.read_csv(
        f"{pre}/p2rank_protein_remove_extra_chains_10A/p2rank/{name}_protein.pdb_predictions.csv"
    )
    # Remove the whitespace surrounding the column names.
    pocket_table.columns = pocket_table.columns.str.strip()
    # Tag every row with the protein it belongs to.
    d_list.append(pocket_table.assign(name=name))

# Stack all per-protein tables into one frame indexed 0..n-1 and store it.
d = pd.concat(d_list, ignore_index=True)
d.to_feather(f"{tankbind_data_path}/p2rank_result.feather")

100%|██████████| 100/100 [00:01<00:00, 75.61it/s]


In [25]:
# Reload the combined P2Rank table from the feather file written in the previous cell.
d = pd.read_feather(f"{tankbind_data_path}/p2rank_result.feather")

In [26]:
# For every protein name, the rows of the combined P2Rank table that belong to
# it, re-indexed from 0 (an empty frame when the protein has no rows).
pockets_dict = {
    name: d[d.name == name].reset_index(drop=True)
    for name in tqdm(name_list)
}

100%|██████████| 100/100 [00:00<00:00, 1601.85it/s]


# protein feature

In [27]:
from feature_utils import get_protein_feature

In [28]:
input_ = []
# Folder that receives one saved protein-feature file per PDB id.
protein_embedding_folder = f"{tankbind_data_path}/gvp_protein_embedding"
# Create the folder from Python rather than shelling out to `mkdir -p`: this is
# safe for paths with spaces or shell metacharacters and raises if creation fails.
os.makedirs(protein_embedding_folder, exist_ok=True)
for pdb in pdb_list:
    proteinFile = f"{pre}/protein_remove_extra_chains_10A/{pdb}_protein.pdb"
    toFile = f"{protein_embedding_folder}/{pdb}.pt"
    # Work item consumed by batch_run: (pdb id, trimmed protein file, output file).
    input_.append((pdb, proteinFile, toFile))

In [29]:
from Bio.PDB import PDBParser
from feature_utils import get_clean_res_list
import torch
torch.set_num_threads(1)

def batch_run(x):
    """Compute the feature of one protein and save it to its own file.

    x is a (pdb, proteinFile, toFile) tuple. The structure in proteinFile is
    parsed with Bio.PDB, its residues are passed through get_clean_res_list,
    and the result of get_protein_feature is saved with torch.save to toFile
    as a one-entry dict keyed by pdb. Nothing is returned.
    """
    pdb, proteinFile, toFile = x
    structure = PDBParser(QUIET=True).get_structure(pdb, proteinFile)
    res_list = get_clean_res_list(
        structure.get_residues(), verbose=False, ensure_ca_exist=True
    )
    torch.save({pdb: get_protein_feature(res_list)}, toFile)

In [30]:
import mlcrate as mlc
import os
# Pool created with size 64 (the progress line below reports "64 CPUs").
pool = mlc.SuperPool(64)
# NOTE(review): presumably re-opens the underlying pool in case an earlier cell
# closed it — confirm against the mlcrate/pathos documentation.
pool.pool.restart()
# batch_run saves its result to disk and returns nothing, so the mapped values are discarded.
_ = pool.map(batch_run,input_)
pool.exit()

[mlcrate] 64 CPUs: 100%|██████████| 100/100 [00:09<00:00, 10.82it/s]


In [31]:
# Merge the one-entry dicts saved by batch_run into a single dict keyed by pdb id.
protein_dict = {}
for pdb in tqdm(pdb_list):
    saved_entry = torch.load(f"{protein_embedding_folder}/{pdb}.pt")
    protein_dict.update(saved_entry)


100%|██████████| 100/100 [00:04<00:00, 20.78it/s]


# Compound Features

In [32]:
from feature_utils import extract_torchdrug_feature_from_mol

# compound_dict maps pdb id -> extracted ligand features;
# skip_pdb_list records the ids whose feature extraction raised.
compound_dict, skip_pdb_list = {}, []
for pdb in tqdm(pdb_list):
    # Read the renumbered ligand; no mol2 file is supplied here.
    mol, _ = read_mol(f"{pre}/renumber_atom_index_same_as_smiles/{pdb}.sdf", None)
    # extract features from sdf.
    try:
        # self-dock set has_LAS_mask to true
        features = extract_torchdrug_feature_from_mol(mol, has_LAS_mask=True)
    except Exception as e:
        # Best effort: report the failure and remember the entry so that it can
        # be dropped from `data` afterwards.
        print(e)
        skip_pdb_list.append(pdb)
        print(pdb)
    else:
        compound_dict[pdb] = features

100%|██████████| 100/100 [00:03<00:00, 31.86it/s]


In [33]:
# Save the pdb id -> compound feature dict for later reuse.
torch.save(compound_dict, f"{tankbind_data_path}/compound_torchdrug_features.pt")

In [34]:
skip_pdb_list

[]

In [35]:
# Remove the entries whose compound feature extraction failed, then renumber the index from 0.
data = data.query("pdb not in @skip_pdb_list").reset_index(drop=True)

# construct dataset.

In [36]:
# we use the time-split defined in EquiBind paper.
# https://github.com/HannesStark/EquiBind/tree/main/data
valid = np.loadtxt(f"{pre}/equibind-data/timesplit_no_lig_overlap_val", dtype=str)
test = np.loadtxt(f"{pre}/equibind-data/timesplit_test", dtype=str)


def assign_group(pdb, valid=valid, test=test):
    """Return the split label ('valid', 'test' or 'train') for one pdb id.

    Membership in `valid` is checked before membership in `test`; an id found
    in neither collection is labelled 'train'. The defaults are the arrays
    loaded above, captured when the function is defined.
    """
    for label, members in (('valid', valid), ('test', test)):
        if pdb in members:
            return label
    return 'train'


data['group'] = data.pdb.map(assign_group)

In [37]:
data.value_counts("group")

group
train    92
valid     7
test      1
dtype: int64

In [38]:
# The 'name' column read below (as both compound and protein name) is simply the pdb id.
data['name'] = data['pdb']

In [39]:
# Build one row per (protein, compound, candidate block):
#   * the native block (no pocket center, use_compound_com=True),
#   * a block centered on the protein ("<pdb>_c"),
#   * one block per P2Rank pocket, up to 10 ("<pdb>_<k>").
info = []
for _, line in tqdm(data.iterrows(), total=data.shape[0]):
    pdb = line['pdb']
    uid = line['uid']
    # The SMILES field is deliberately left empty; `data` is not read for it here.
    smiles = ""
    affinity = line['affinity']
    group = line['group']

    compound_name = line['name']
    protein_name = line['name']

    # At most the first 10 pockets of this protein.
    pocket = pockets_dict[pdb].head(10)
    # Defensive: make sure the column names carry no surrounding whitespace.
    pocket.columns = pocket.columns.str.strip()
    pocket_coms = pocket[['center_x', 'center_y', 'center_z']].values
    # native block.
    info.append([protein_name, compound_name, pdb, smiles, affinity, uid, None, True, False, group])
    # protein center as a block.
    # NOTE(review): element 0 of the protein feature is presumably the per-node
    # coordinate tensor — confirm in get_protein_feature.
    protein_com = protein_dict[protein_name][0].numpy().mean(axis=0).astype(float).reshape(1, 3)
    info.append([protein_name, compound_name, pdb+"_c", smiles, affinity, uid, protein_com, False, False, group])

    # Enumerate positions instead of using the iterrows() index label as an
    # array index; with the 0..n-1 index of pockets_dict entries the generated
    # names are the same, and the loop stays correct for any other index.
    for idx, pocket_com in enumerate(pocket_coms):
        pdb_idx = f"{pdb}_{idx}"
        info.append([protein_name, compound_name, pdb_idx, smiles, affinity, uid, pocket_com.reshape(1, 3), False, False, group])
info = pd.DataFrame(info, columns=['protein_name', 'compound_name', 'pdb', 'smiles', 'affinity', 'uid', 'pocket_com', 
                                   'use_compound_com', 'use_whole_protein',
                                  'group'])



100%|██████████| 100/100 [00:00<00:00, 770.18it/s]


In [40]:
info.shape

(1167, 10)

In [41]:
from data import TankBindDataSet

In [42]:
# Root folder of the full dataset.
toFilePre = f"{pre}/dataset"
# Create the folder from Python rather than shelling out to `mkdir -p`: this is
# safe for paths with spaces or shell metacharacters and raises if creation fails.
os.makedirs(toFilePre, exist_ok=True)
# Build the dataset from the info table and the protein / compound feature dicts.
dataset = TankBindDataSet(toFilePre, data=info, protein_dict=protein_dict, compound_dict=compound_dict)

Processing...
Done!


['pdbbind/pdbbind2020/dataset/processed/data.pt', 'pdbbind/pdbbind2020/dataset/processed/protein.pt', 'pdbbind/pdbbind2020/dataset/processed/compound.pt']


In [43]:
# Re-open the dataset from its folder only, without passing the tables again.
# NOTE(review): presumably this loads the processed files listed in the output — confirm in data.py.
dataset = TankBindDataSet(toFilePre)


['pdbbind/pdbbind2020/dataset/processed/data.pt', 'pdbbind/pdbbind2020/dataset/processed/protein.pt', 'pdbbind/pdbbind2020/dataset/processed/compound.pt']


In [44]:
# For every dataset row, record the first-dimension sizes of 'node_xyz',
# 'coords' and 'y', plus the number of positive entries of y.
t = []
data = dataset.data
pre_pdb = None
for i, line in tqdm(data.iterrows(), total=data.shape[0]):
    # NOTE(review): the row label i is used as the dataset position — this
    # assumes dataset.data is indexed 0..n-1; confirm.
    d = dataset[i]
    pdb = line['compound_name']
    p_length, c_length, y_length = (
        d[key].shape[0] for key in ('node_xyz', 'coords', 'y')
    )
    num_contact = (d.y > 0).sum()
    t.append([i, pdb, p_length, c_length, y_length, num_contact])



100%|██████████| 1167/1167 [00:06<00:00, 176.61it/s]


In [45]:
# data = data.drop(['p_length', 'c_length', 'y_length', 'num_contact'], axis=1)

In [46]:
# Turn the collected rows into a table with named columns.
t = pd.DataFrame(t, columns=['index', 'pdb' ,'p_length', 'c_length', 'y_length', 'num_contact'])
# Each num_contact was stored as the result of `.sum()`; `.item()` converts it to a plain Python scalar.
t['num_contact'] = t['num_contact'].apply(lambda x: x.item())

In [47]:
# Append the size columns side by side (axis=1 aligns rows on the index).
# NOTE(review): assumes `data` and `t` share the same 0..n-1 index — confirm.
data = pd.concat([data, t[['p_length', 'c_length', 'y_length', 'num_contact']]], axis=1)

In [48]:
# num_contact of each protein's native block (the rows with use_compound_com set),
# keyed by protein_name ...
native_num_contact = data.query("use_compound_com").set_index("protein_name")['num_contact'].to_dict()
# ... and copied onto every row of the same protein.
data['native_num_contact'] = data.protein_name.map(native_num_contact)
# data['fract_of_native_contact'] = data['num_contact'] / data['native_num_contact']

In [49]:
# Overwrite processed/data.pt with the table that now carries the length and contact columns.
torch.save(data, f"{toFilePre}/processed/data.pt")

In [50]:
# Read the augmented table back from disk; `info` now refers to it.
info = torch.load(f"{toFilePre}/processed/data.pt")


In [51]:
# Rows belonging to the test split (this rebinds `test`, previously the array
# of test ids), and the distinct proteins they cover.
test = info.query("group == 'test'").reset_index(drop=True)
test_pdb_list = test.protein_name.unique()

In [52]:
# Protein features restricted to the proteins of the test split.
subset_protein_dict = {pdb: protein_dict[pdb] for pdb in tqdm(test_pdb_list)}

100%|██████████| 1/1 [00:00<00:00, 4917.12it/s]


In [53]:
# Compound features restricted to the entries of the test split.
subset_compound_dict = {pdb: compound_dict[pdb] for pdb in tqdm(test_pdb_list)}

100%|██████████| 1/1 [00:00<00:00, 4702.13it/s]


In [54]:

# Root folder of the test-only dataset (rebinds toFilePre).
toFilePre = f"{pre}/test_dataset"
# Create the folder from Python rather than shelling out to `mkdir -p`: this is
# safe for paths with spaces or shell metacharacters and raises if creation fails.
os.makedirs(toFilePre, exist_ok=True)
# Build the dataset from the test rows and the matching feature subsets.
dataset = TankBindDataSet(toFilePre, data=test, protein_dict=subset_protein_dict, compound_dict=subset_compound_dict)

Processing...
Done!


['pdbbind/pdbbind2020/test_dataset/processed/data.pt', 'pdbbind/pdbbind2020/test_dataset/processed/protein.pt', 'pdbbind/pdbbind2020/test_dataset/processed/compound.pt']


In [55]:
def canonical_smiles(smiles):
    """Return the SMILES that RDKit writes for the molecule parsed from *smiles*.

    The input is parsed with Chem.MolFromSmiles and written back with
    Chem.MolToSmiles.

    NOTE(review): MolFromSmiles returns None for input it cannot parse, in
    which case MolToSmiles raises — confirm callers only pass valid SMILES.
    """
    # Imported locally: the notebook cells visible here import only RDLogger
    # from rdkit, so `Chem` would otherwise be undefined when this is called.
    from rdkit import Chem

    mol = Chem.MolFromSmiles(smiles)
    return Chem.MolToSmiles(mol)