# overview

We start from the raw PDBbind dataset downloaded from http://www.pdbbind.org.cn/download.php

1. Filter out the complexes whose ligand files cannot be read by RDKit.

2. Process each protein by keeping only the chains that have at least one atom within 10 Å of any ligand atom.

3. Use p2rank to segment protein into blocks.

4. extract protein and ligand features.

5. construct the training and test dataset.


In [1]:
import numpy as np
# test = info.query("group == 'test'").reset_index(drop=True)
# test_pdb_list = info.query("group == 'test'").protein_name.unique()
# Root folder of the raw PDBbind download. The derived folders used later in this
# notebook (renumbered ligands, trimmed proteins, P2Rank output, tankbind_data,
# datasets) are all built as sub-paths of it.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
pre = "/home/jovyan/frag_protein/fragmentation/pdb_data/all_pdbbind"
# We use the time split defined in the EquiBind paper.
# https://github.com/HannesStark/EquiBind/tree/main/data
# Both files are loaded as arrays of strings; presumably one PDB code per entry — confirm against the files.
valid = np.loadtxt("../equbind/timesplit_no_lig_overlap_val", dtype=str)
test = np.loadtxt("../equbind/timesplit_test", dtype=str)
# Shell command prefix for P2Rank; the `predict` sub-command and its arguments are appended where it is run.
p2rank = "bash /home/jovyan/frag_protein/p2rank_2.3/prank"

In [2]:
# Folder containing the TankBind sources, relative to this notebook.
tankbind_src_folder_path = "../tankbind/"
import sys
# Put that folder first on the import path so its modules can be imported below
# (presumably `utils`, `feature_utils` and `data` live there — confirm).
sys.path.insert(0, tankbind_src_folder_path)

In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# process the raw PDBbind dataset.

In [4]:
from utils import read_pdbbind_data

In [5]:
# raw PDBbind dataset could be downloaded from http://www.pdbbind.org.cn/download.php

# The name index is split on a two-space delimiter, '#' lines are skipped, and there is
# no header row. Only the 'pdb' and 'uid' columns are kept afterwards.
df_pdb_id = pd.read_csv(
    f'{pre}/index/INDEX_general_PL_name.2020',
    sep="  ",
    comment='#',
    header=None,
    names=['pdb', 'year', 'uid', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o'],
    engine='python',
)
df_pdb_id = df_pdb_id[['pdb', 'uid']]
# Attach the uid column to the table parsed from the data index, matching rows on 'pdb'.
data = read_pdbbind_data(f'{pre}/index/INDEX_general_PL_data.2020').merge(df_pdb_id, on=['pdb'])


# ligand file should be readable by RDKit.

In [6]:
from feature_utils import read_mol

In [7]:
from rdkit import RDLogger
# Silence RDKit log output while every ligand file is probed.
RDLogger.DisableLog('rdApp.*')
pdb_list = []
probem_list = []
for pdb in tqdm(data.pdb):
    ligand_paths = (f"{pre}/{pdb}/{pdb}_ligand.sdf", f"{pre}/{pdb}/{pdb}_ligand.mol2")
    mol, problem = read_mol(*ligand_paths)
    if problem:
        # Neither ligand file could be used; remember the complex and move on.
        probem_list.append(pdb)
    elif pdb != "2r1w":
        # 2r1w is excluded by name even when its ligand is readable.
        pdb_list.append(pdb)

100%|██████████| 19443/19443 [01:44<00:00, 186.18it/s]


In [8]:
# Keep only the complexes collected in pdb_list by the ligand-readability loop above.
data = data.query("pdb in @pdb_list").reset_index(drop=True)

In [9]:
data.shape

(19127, 7)

### For ease of RMSD evaluation later, we renumber the ligand atom indices to be consistent with the SMILES atom order

In [10]:
from feature_utils import write_renumbered_sdf
import os

In [11]:
# Write one renumbered ligand file, <pdb>.sdf, per readable complex.
toFolder = f"{pre}/renumber_atom_index_same_as_smiles"
# os.makedirs replaces the shell `mkdir -p`: same effect when the folder exists,
# but a failure raises instead of being returned as an ignored exit status.
os.makedirs(toFolder, exist_ok=True)
for pdb in tqdm(pdb_list):
    sdf_fileName = f"{pre}/{pdb}/{pdb}_ligand.sdf"
    mol2_fileName = f"{pre}/{pdb}/{pdb}_ligand.mol2"
    toFile = f"{toFolder}/{pdb}.sdf"
    write_renumbered_sdf(toFile, sdf_fileName, mol2_fileName)


100%|██████████| 19127/19127 [04:31<00:00, 70.39it/s]


# Process PDBbind proteins: remove extra chains (cutoff 10 Å)

In [12]:
# Output folder for the proteins after extra chains have been removed.
# Note the trailing slash: paths built later as f"{toFolder}/..." contain a double slash.
toFolder = f"{pre}/protein_remove_extra_chains_10A/"
# `mkdir -p` also succeeds when the folder already exists; the exit status is what the cell displays.
os.system(f"mkdir -p {toFolder}")

0

In [14]:
# One (protein file, ligand file, cutoff, output file) tuple per complex, in the
# order expected by the chain-selection worker that is mapped over this list.
cutoff = 10
input_ = [
    (
        f"{pre}/{pdb}/{pdb}_protein.pdb",
        f"{pre}/renumber_atom_index_same_as_smiles/{pdb}.sdf",
        cutoff,
        f"{toFolder}/{pdb}_protein.pdb",
    )
    for pdb in data.pdb.values
]

In [11]:
from feature_utils import select_chain_within_cutoff_to_ligand_v2

In [15]:
import mlcrate as mlc
import os
# Run the chain selection for every complex on 64 worker processes.
pool = mlc.SuperPool(64)
pool.pool.restart()
try:
    _ = pool.map(select_chain_within_cutoff_to_ligand_v2, input_)
finally:
    # Release the workers even if a task raises or the cell is interrupted.
    pool.exit()

[mlcrate] 64 CPUs:  16%|█▋        | 3154/19127 [01:13<05:11, 51.34it/s][03:50:08] bond with order 0 found on line 35. This is not part of the MDL specification.
[mlcrate] 64 CPUs:  30%|███       | 5797/19127 [02:05<04:50, 45.84it/s][03:51:00] bond with order 0 found on line 37. This is not part of the MDL specification.
[mlcrate] 64 CPUs:  39%|███▉      | 7542/19127 [02:38<03:26, 56.10it/s][03:51:34] bond with order 0 found on line 29. This is not part of the MDL specification.
[mlcrate] 64 CPUs:  43%|████▎     | 8158/19127 [02:49<03:16, 55.79it/s][03:51:45] bond with order 0 found on line 31. This is not part of the MDL specification.
[mlcrate] 64 CPUs:  46%|████▌     | 8781/19127 [03:02<03:23, 50.84it/s][03:51:57] bond with order 0 found on line 59. This is not part of the MDL specification.
[03:51:57] bond with order 0 found on line 61. This is not part of the MDL specification.
[mlcrate] 64 CPUs:  47%|████▋     | 8908/19127 [03:04<03:03, 55.57it/s][03:51:59] bond with order 0 found

In [12]:
# Previously, I found that 2r1w has no chain near the ligand.
# The ligand-readability loop already leaves 2r1w out of pdb_list; the same exclusion
# is applied here directly on `data`.
data = data.query("pdb != '2r1w'").reset_index(drop=True)

# p2rank segmentation

In [17]:
# Write the P2Rank dataset file: one protein path per line.
p2rank_prediction_folder = f"{pre}/p2rank_protein_remove_extra_chains_10A"
# os.makedirs replaces the shell `mkdir -p` so that a failure raises instead of being ignored.
os.makedirs(p2rank_prediction_folder, exist_ok=True)
ds = f"{p2rank_prediction_folder}/protein_list.ds"
with open(ds, "w") as out:
    # Paths are written relative ("../…"); presumably P2Rank resolves them against
    # the location of the .ds file — confirm.
    out.writelines(
        f"../protein_remove_extra_chains_10A/{pdb}_protein.pdb\n" for pdb in data.pdb.values
    )

In [16]:
# Takes about 30 minutes.
# Runs P2Rank `predict` on the dataset file `ds` with 16 threads; output goes to the
# `p2rank` sub-folder of p2rank_prediction_folder.
# NOTE(review): os.system only returns the exit status — a failed run does not raise here.
cmd = f"{p2rank} predict {ds} -o {p2rank_prediction_folder}/p2rank -threads 16"
os.system(cmd)


KeyboardInterrupt



In [19]:
data.to_csv(f"{pre}/data.csv")

In [13]:
pdb_list = data.pdb.values

In [22]:
# Collect the per-protein P2Rank prediction tables into one frame and cache it as feather.
tankbind_data_path = f"{pre}/tankbind_data"
# No earlier cell creates this folder, so create it before writing the feather file into it.
os.makedirs(tankbind_data_path, exist_ok=True)
name_list = pdb_list
d_list = []

for name in tqdm(name_list):
    p2rankFile = f"{pre}/p2rank_protein_remove_extra_chains_10A/p2rank/{name}_protein.pdb_predictions.csv"
    pockets = pd.read_csv(p2rankFile)
    # Strip surrounding whitespace from the header names.
    pockets.columns = pockets.columns.str.strip()
    d_list.append(pockets.assign(name=name))
# A single reset_index gives the default index; the original reset it a second time for no effect.
d = pd.concat(d_list).reset_index(drop=True)
d.to_feather(f"{tankbind_data_path}/p2rank_result.feather")

  0%|          | 0/19127 [00:00<?, ?it/s]


NameError: name 'tjnt' is not defined

In [15]:
# Reload the cached P2Rank table so the collection loop does not have to be re-run.
tankbind_data_path = f"{pre}/tankbind_data"
d = pd.read_feather(f"{tankbind_data_path}/p2rank_result.feather")

In [17]:
# Map each protein name to its own P2Rank rows (index reset to 0..k-1).
name_list = pdb_list
# Split the table once with groupby. The original built a boolean mask over the whole
# table for every name, which scales with len(name_list) * len(d).
# sort=False keeps first-appearance order; rows inside a group keep their original order.
_rows_by_name = {
    name: rows.reset_index(drop=True) for name, rows in d.groupby("name", sort=False)
}
# Names without any predicted pocket still get an (empty) frame, as before.
_no_rows = d.iloc[0:0].reset_index(drop=True)
pockets_dict = {}
for name in tqdm(name_list):
    pockets_dict[name] = _rows_by_name[name] if name in _rows_by_name else _no_rows.copy()

100%|██████████| 19127/19127 [02:41<00:00, 118.41it/s]


# protein feature with protein fragmentation

In [18]:
from feature_utils import get_protein_feature_qsar

In [19]:
import os
tankbind_data_path = f"{pre}/tankbind_data"
protein_embedding_folder = f"{tankbind_data_path}/gvp_protein_embedding_frag"
# os.makedirs replaces the shell `mkdir -p`; it also creates tankbind_data_path if missing.
os.makedirs(protein_embedding_folder, exist_ok=True)
# One (pdb, protein file, output file) tuple per complex, unpacked in that order by batch_run.
input_ = [
    (
        pdb,
        f"{pre}/protein_remove_extra_chains_10A/{pdb}_protein.pdb",
        f"{protein_embedding_folder}/{pdb}.pt",
    )
    for pdb in pdb_list
]

In [20]:
from Bio.PDB import PDBParser
from feature_utils import get_clean_res_list
import torch
torch.set_num_threads(1)

def batch_run(x):
    """Compute the features of one protein and save them to disk.

    Args:
        x: a ``(pdb, proteinFile, toFile)`` tuple — the identifier, the path of the
            PDB file to parse, and the path the result is saved to.

    Returns:
        None. A one-entry dict ``{pdb: features}`` is written to ``toFile`` with ``torch.save``.
    """
    pdb, proteinFile, toFile = x
    structure = PDBParser(QUIET=True).get_structure(pdb, proteinFile)
    res_list = get_clean_res_list(structure.get_residues(), verbose=False, ensure_ca_exist=True)
    # Saved as a dict keyed by pdb so the per-protein files can later be merged with dict.update.
    torch.save({pdb: get_protein_feature_qsar(res_list)}, toFile)

In [21]:
import mlcrate as mlc
import os
# Compute the protein features for every complex on 64 worker processes.
pool = mlc.SuperPool(64)
pool.pool.restart()
try:
    _ = pool.map(batch_run, input_)
finally:
    # Release the workers even if a task raises or the cell is interrupted.
    pool.exit()

[mlcrate] 64 CPUs: 100%|██████████| 19127/19127 [07:47<00:00, 40.91it/s]


In [22]:
import torch
# Merge the one-entry dicts written by batch_run into a single {pdb: features} mapping.
protein_dict = {}
for pdb in tqdm(pdb_list):
    per_protein = torch.load(f"{protein_embedding_folder}/{pdb}.pt")
    protein_dict.update(per_protein)


100%|██████████| 19127/19127 [06:17<00:00, 50.65it/s] 


# Compound Features

In [23]:
from feature_utils import extract_torchdrug_feature_from_mol
# Featurise every renumbered ligand; complexes whose extraction raises are collected
# in skip_pdb_list instead of aborting the run.
compound_dict = {}
skip_pdb_list = []
for pdb in tqdm(pdb_list):
    mol, _ = read_mol(f"{pre}/renumber_atom_index_same_as_smiles/{pdb}.sdf", None)
    try:
        # self-dock: has_LAS_mask is set to True
        features = extract_torchdrug_feature_from_mol(mol, has_LAS_mask=True)
    except Exception as e:
        print(e)
        skip_pdb_list.append(pdb)
        print(pdb)
    else:
        compound_dict[pdb] = features

 28%|██▊       | 5276/19127 [01:30<04:06, 56.28it/s]


3kqs


100%|██████████| 19127/19127 [06:04<00:00, 52.47it/s]


In [24]:
torch.save(compound_dict, f"{tankbind_data_path}/compound_torchdrug_features.pt")

In [25]:
skip_pdb_list

['3kqs']

In [26]:
# Drop the complexes whose compound feature extraction raised (collected in skip_pdb_list).
data = data.query("pdb not in @skip_pdb_list").reset_index(drop=True)

# construct dataset.

In [27]:

def assign_group(pdb, valid=valid, test=test):
    """Return the split label of a complex.

    Args:
        pdb: identifier to look up.
        valid: collection of validation identifiers (bound at definition time).
        test: collection of test identifiers (bound at definition time).

    Returns:
        'valid' if ``pdb`` is in ``valid``, otherwise 'test' if it is in ``test``,
        otherwise 'train'.
    """
    # valid is checked before test, so an identifier present in both is labelled 'valid'.
    for label, members in (('valid', valid), ('test', test)):
        if pdb in members:
            return label
    return 'train'

data['group'] = data.pdb.map(assign_group)

In [28]:
data.value_counts("group")

group
train    17795
valid      968
test       363
dtype: int64

In [29]:
data['name'] = data['pdb']

In [30]:
# Build one row per (complex, candidate block): the native block centred on the compound,
# one block at the protein centre, and one block per P2Rank pocket (top 10 rows at most).
info = []
err_pdb_list = []
for i, line in tqdm(data.iterrows(), total=data.shape[0]):
    pdb = line['pdb']
    uid = line['uid']
    # smiles = line['smiles']
    smiles = ""
    affinity = line['affinity']
    group = line['group']

    protein_name = line['name']
    # `compound_name` was never assigned anywhere in this notebook, so this loop raised
    # NameError on a fresh kernel. The saved outputs show compound_name equal to
    # protein_name, and both feature dicts are keyed by the pdb code, so use the same value.
    compound_name = line['name']
    try:
        pocket = pockets_dict[pdb].head(10)
    except KeyError:
        # Only a missing P2Rank entry is tolerated; anything else should surface.
        err_pdb_list.append(pdb)
        continue
    pocket.columns = pocket.columns.str.strip()
    pocket_coms = pocket[['center_x', 'center_y', 'center_z']].values
    # native block.
    info.append([protein_name, compound_name, pdb, smiles, affinity, uid, None, True, False, group])
    # protein center as a block.
    protein_com = protein_dict[protein_name][0].numpy().mean(axis=0).astype(float).reshape(1, 3)
    info.append([protein_name, compound_name, pdb+"_c", smiles, affinity, uid, protein_com, False, False, group])

    # one block per predicted pocket; pockets_dict frames carry a 0..k-1 index, so the
    # positional index used here matches the row label used for the "<pdb>_<idx>" name.
    for idx, pocket_com in enumerate(pocket_coms):
        pdb_idx = f"{pdb}_{idx}"
        info.append([protein_name, compound_name, pdb_idx, smiles, affinity, uid, pocket_com.reshape(1, 3), False, False, group])
info = pd.DataFrame(info, columns=['protein_name', 'compound_name', 'pdb', 'smiles', 'affinity', 'uid', 'pocket_com',
                                   'use_compound_com', 'use_whole_protein',
                                  'group'])
print(len(info))


100%|██████████| 19126/19126 [00:19<00:00, 982.71it/s] 


162036


In [31]:
len(err_pdb_list)

0

In [32]:
len(pockets_dict)

19127

In [33]:
info.shape

(162036, 10)

In [34]:
from data import TankBindDataSet
import os

In [35]:
# Build the full dataset under {pre}/dataset from the info table and both feature dicts.
toFilePre = f"{pre}/dataset"
# os.makedirs replaces the shell `mkdir -p` so that a failure raises instead of being ignored.
os.makedirs(toFilePre, exist_ok=True)
dataset = TankBindDataSet(toFilePre, data=info, protein_dict=protein_dict, compound_dict=compound_dict)

Processing...
Done!


['/home/jovyan/frag_protein/fragmentation/pdb_data/all_pdbbind/dataset/processed/data.pt', '/home/jovyan/frag_protein/fragmentation/pdb_data/all_pdbbind/dataset/processed/protein.pt', '/home/jovyan/frag_protein/fragmentation/pdb_data/all_pdbbind/dataset/processed/compound.pt']


In [36]:
# Re-open the dataset from the same folder without passing data or feature dicts;
# presumably this loads the already processed files listed in the output — confirm in TankBindDataSet.
toFilePre = f"{pre}/dataset"
dataset = TankBindDataSet(toFilePre)


['/home/jovyan/frag_protein/fragmentation/pdb_data/all_pdbbind/dataset/processed/data.pt', '/home/jovyan/frag_protein/fragmentation/pdb_data/all_pdbbind/dataset/processed/protein.pt', '/home/jovyan/frag_protein/fragmentation/pdb_data/all_pdbbind/dataset/processed/compound.pt']


In [37]:
# Read back the processed table of the dataset built under toFilePre. The path is derived
# from toFilePre (as the other cells do) instead of repeating the absolute path.
info = torch.load(f"{toFilePre}/processed/data.pt")
info.shape

(162036, 10)

In [38]:
# Collect per-sample sizes and the contact count for every row of the dataset table.
t = []
data = dataset.data
# NOTE(review): the row label `i` is used as the dataset index — this assumes
# dataset.data has a default 0..n-1 index; confirm.
for i, line in tqdm(data.iterrows(), total=data.shape[0]):
    # Named `sample` so the loop no longer overwrites `d`, the P2Rank table.
    sample = dataset[i]
    t.append([
        i,
        line['compound_name'],
        sample['node_xyz'].shape[0],   # p_length
        sample['coords'].shape[0],     # c_length
        sample['y'].shape[0],          # y_length
        (sample['y'] > 0).sum(),       # num_contact: entries of y above zero
    ])



100%|██████████| 162036/162036 [10:15<00:00, 263.35it/s]


In [45]:
# data = data.drop(['p_length', 'c_length', 'y_length', 'num_contact'], axis=1)

In [39]:
# Turn the collected rows into a frame with named columns.
t = pd.DataFrame(
    t,
    columns=['index', 'pdb', 'p_length', 'c_length', 'y_length', 'num_contact'],
)
# num_contact holds the result of `.sum()` for each sample; unwrap it with .item().
t['num_contact'] = t['num_contact'].map(lambda count: count.item())

In [40]:
data = pd.concat([data, t[['p_length', 'c_length', 'y_length', 'num_contact']]], axis=1)

In [41]:
# Contact count of each protein's use_compound_com row, copied onto every row of that protein.
native_num_contact = (
    data.query("use_compound_com")
    .set_index("protein_name")['num_contact']
    .to_dict()
)
data['native_num_contact'] = data['protein_name'].map(native_num_contact)
# data['fract_of_native_contact'] = data['num_contact'] / data['native_num_contact']

In [42]:
torch.save(data, f"{toFilePre}/processed/data.pt")

In [59]:
toFilePre

'/home/jovyan/frag_protein/fragmentation/pdb_data/all_pdbbind/test_dataset'

In [54]:
import torch
info = torch.load(f"{toFilePre}/processed/data.pt")

In [55]:
info.head()

Unnamed: 0,protein_name,compound_name,pdb,smiles,affinity,uid,pocket_com,use_compound_com,use_whole_protein,group,p_length,c_length,y_length,num_contact,native_num_contact,is_ligand_pocket,base_pdb,cover_contact_ratio
0,6h9v,6h9v,6h9v,,0.66,B5BTR7,,True,False,test,64,12,768,73,73,True,6h9v,1.0
1,6h9v,6h9v,6h9v_c,,0.66,B5BTR7,"[[8.631393432617188, -20.263063430786133, 54.7...",False,False,test,162,12,1944,0,73,False,6h9v,0.0
2,6h9v,6h9v,6h9v_0,,0.66,B5BTR7,"[[10.4502, -35.1582, 63.1358]]",False,False,test,121,12,1452,0,73,False,6h9v,0.0
3,6h9v,6h9v,6h9v_1,,0.66,B5BTR7,"[[-1.7136, -18.0661, 43.1919]]",False,False,test,127,12,1524,73,73,False,6h9v,1.0
4,6h9v,6h9v,6h9v_2,,0.66,B5BTR7,"[[18.2234, -7.3891, 59.1918]]",False,False,test,122,12,1464,0,73,False,6h9v,0.0


In [56]:
info.shape

(2879, 18)

In [57]:
def is_ligand_pocket(pdb):
    """Return True when ``pdb`` is exactly four characters long, otherwise False.

    In this notebook the native rows carry the bare pdb code, while the other rows
    carry a suffix ("_c", "_0", ...), so the length test tells them apart.
    """
    return len(pdb) == 4
    
# Flag the native rows, then express every row's contact count as a fraction of the
# native row's count for the same base pdb code.
info['is_ligand_pocket'] = info['pdb'].apply(is_ligand_pocket)
pdb_to_num_contact = (
    info.query("is_ligand_pocket")
    .set_index("pdb")['num_contact']
    .to_dict()
)
# base_pdb is the part of the row name before the first underscore.
info['base_pdb'] = info['pdb'].apply(lambda name: name.split("_")[0])
# Raises KeyError if a base pdb has no native row.
info['native_num_contact'] = info['base_pdb'].apply(lambda base: pdb_to_num_contact[base])
info['cover_contact_ratio'] = info['num_contact'].div(info['native_num_contact'])
# Base pdb codes that have a native row but no other row.
use_whole_protein_list = set(info['base_pdb'].unique()).difference(
    info.query("not is_ligand_pocket")['base_pdb']
)
# assume we don't know the true ligand binding site.


In [64]:
info['index'] = info.index
info

Unnamed: 0,protein_name,compound_name,pdb,smiles,affinity,uid,pocket_com,use_compound_com,use_whole_protein,group,p_length,c_length,y_length,num_contact,native_num_contact,is_ligand_pocket,base_pdb,cover_contact_ratio,index
0,6h9v,6h9v,6h9v,,0.66,B5BTR7,,True,False,test,64,12,768,73,73,True,6h9v,1.000000,0
1,6h9v,6h9v,6h9v_c,,0.66,B5BTR7,"[[8.631393432617188, -20.263063430786133, 54.7...",False,False,test,162,12,1944,0,73,False,6h9v,0.000000,1
2,6h9v,6h9v,6h9v_0,,0.66,B5BTR7,"[[10.4502, -35.1582, 63.1358]]",False,False,test,121,12,1452,0,73,False,6h9v,0.000000,2
3,6h9v,6h9v,6h9v_1,,0.66,B5BTR7,"[[-1.7136, -18.0661, 43.1919]]",False,False,test,127,12,1524,73,73,False,6h9v,1.000000,3
4,6h9v,6h9v,6h9v_2,,0.66,B5BTR7,"[[18.2234, -7.3891, 59.1918]]",False,False,test,122,12,1464,0,73,False,6h9v,0.000000,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2874,6oxp,6oxp,6oxp_0,,11.00,Q8ULI9,"[[71.2014, 55.9, 16.6762]]",False,False,test,117,40,4680,389,389,False,6oxp,1.000000,2874
2875,6oxq,6oxq,6oxq,,11.30,Q8ULI9,,True,False,test,119,41,4879,394,394,True,6oxq,1.000000,2875
2876,6oxq,6oxq,6oxq_c,,11.30,Q8ULI9,"[[70.9427490234375, 59.3531608581543, 12.63163...",False,False,test,132,41,5412,394,394,False,6oxq,1.000000,2876
2877,6oxq,6oxq,6oxq_0,,11.30,Q8ULI9,"[[71.0965, 56.1167, 16.6608]]",False,False,test,119,41,4879,394,394,False,6oxq,1.000000,2877


In [65]:
info.to_csv(f"{toFilePre}/apr23_testset_pdbbind_gvp_pocket_radius20_info.csv")

In [66]:
a = pd.read_csv(f"{toFilePre}/apr23_testset_pdbbind_gvp_pocket_radius20_info.csv")
a

Unnamed: 0.1,Unnamed: 0,protein_name,compound_name,pdb,smiles,affinity,uid,pocket_com,use_compound_com,use_whole_protein,group,p_length,c_length,y_length,num_contact,native_num_contact,is_ligand_pocket,base_pdb,cover_contact_ratio,index
0,0,6h9v,6h9v,6h9v,,0.66,B5BTR7,,True,False,test,64,12,768,73,73,True,6h9v,1.000000,0
1,1,6h9v,6h9v,6h9v_c,,0.66,B5BTR7,[[ 8.63139343 -20.26306343 54.77074432]],False,False,test,162,12,1944,0,73,False,6h9v,0.000000,1
2,2,6h9v,6h9v,6h9v_0,,0.66,B5BTR7,[[ 10.4502 -35.1582 63.1358]],False,False,test,121,12,1452,0,73,False,6h9v,0.000000,2
3,3,6h9v,6h9v,6h9v_1,,0.66,B5BTR7,[[ -1.7136 -18.0661 43.1919]],False,False,test,127,12,1524,73,73,False,6h9v,1.000000,3
4,4,6h9v,6h9v,6h9v_2,,0.66,B5BTR7,[[18.2234 -7.3891 59.1918]],False,False,test,122,12,1464,0,73,False,6h9v,0.000000,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2874,2874,6oxp,6oxp,6oxp_0,,11.00,Q8ULI9,[[71.2014 55.9 16.6762]],False,False,test,117,40,4680,389,389,False,6oxp,1.000000,2874
2875,2875,6oxq,6oxq,6oxq,,11.30,Q8ULI9,,True,False,test,119,41,4879,394,394,True,6oxq,1.000000,2875
2876,2876,6oxq,6oxq,6oxq_c,,11.30,Q8ULI9,[[70.94274902 59.35316086 12.63163853]],False,False,test,132,41,5412,394,394,False,6oxq,1.000000,2876
2877,2877,6oxq,6oxq,6oxq_0,,11.30,Q8ULI9,[[71.0965 56.1167 16.6608]],False,False,test,119,41,4879,394,394,False,6oxq,1.000000,2877


In [101]:
toFilePre

'/home/jovyan/TankBind/fragmentation/pdb_data/all_pdbbind/test_dataset'

In [49]:
# Rows of the test split, re-indexed from zero, and the distinct proteins they cover.
test = info.loc[info['group'] == 'test'].reset_index(drop=True)
test_pdb_list = test['protein_name'].unique()

In [50]:
# NOTE(review): this cell recomputes the same two variables as the previous cell and can be removed.
test = info.query("group == 'test'").reset_index(drop=True)
test_pdb_list = info.query("group == 'test'").protein_name.unique()

In [51]:
# Protein features restricted to the proteins of the test split.
subset_protein_dict = {pdb: protein_dict[pdb] for pdb in tqdm(test_pdb_list)}

100%|██████████| 363/363 [00:00<00:00, 212555.12it/s]


In [52]:
# Compound features restricted to the complexes of the test split.
subset_compound_dict = {pdb: compound_dict[pdb] for pdb in tqdm(test_pdb_list)}

100%|██████████| 363/363 [00:00<00:00, 121191.78it/s]


In [53]:

# Build the test-only dataset under {pre}/test_dataset from the test rows and the subset dicts.
toFilePre = f"{pre}/test_dataset"
# os.makedirs replaces the shell `mkdir -p` so that a failure raises instead of being ignored.
os.makedirs(toFilePre, exist_ok=True)
dataset = TankBindDataSet(toFilePre, data=test, protein_dict=subset_protein_dict, compound_dict=subset_compound_dict)

Processing...
Done!


['/home/jovyan/frag_protein/fragmentation/pdb_data/all_pdbbind/test_dataset/processed/data.pt', '/home/jovyan/frag_protein/fragmentation/pdb_data/all_pdbbind/test_dataset/processed/protein.pt', '/home/jovyan/frag_protein/fragmentation/pdb_data/all_pdbbind/test_dataset/processed/compound.pt']


In [56]:
def canonical_smiles(smiles):
    """Return the canonical SMILES string RDKit produces for ``smiles``.

    Args:
        smiles: SMILES string to canonicalise.

    Returns:
        The string returned by ``Chem.MolToSmiles`` for the parsed molecule.

    Raises:
        ValueError: if RDKit cannot parse ``smiles`` (``MolFromSmiles`` returned None).
    """
    # Imported locally: no cell of this notebook brings `Chem` into scope.
    from rdkit import Chem

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Fail with a readable message instead of handing None to MolToSmiles.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
    return Chem.MolToSmiles(mol)