# overview

We start from the raw PDBbind dataset downloaded from http://www.pdbbind.org.cn/download.php

1. Filter out the complexes whose ligand files cannot be read by RDKit.

2. Process each protein by preserving only the chains that have at least one atom within 10Å of any ligand atom.

3. Use p2rank to segment protein into blocks.

4. extract protein and ligand features.

5. construct the training and test dataset.


In [1]:
import numpy as np
# test = info.query("group == 'test'").reset_index(drop=True)
# test_pdb_list = info.query("group == 'test'").protein_name.unique()
# Root folder of the downloaded PDBbind data; index files and per-complex
# folders are read from below this path.
pre = "/home/jovyan/data/all_pdbbind"
# we use the time-split defined in EquiBind paper.
# https://github.com/HannesStark/EquiBind/tree/main/data
# Both files are loaded as arrays of strings; assign_group later tests PDB ids
# for membership in them.
valid = np.loadtxt("../equbind/timesplit_no_lig_overlap_val", dtype=str)
test = np.loadtxt("../equbind/timesplit_test", dtype=str)
# Shell command prefix used to launch p2rank.
p2rank = "bash /home/jovyan/TankBind/p2rank_2.3/prank"

In [2]:
tankbind_src_folder_path = "../tankbind/"
import sys
sys.path.insert(0, tankbind_src_folder_path)

In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# process the raw PDBbind dataset.

In [4]:
from utils import read_pdbbind_data

In [5]:
# raw PDBbind dataset could be downloaded from http://www.pdbbind.org.cn/download.php

# The name index is split on two consecutive spaces (a multi-character separator,
# hence engine='python'); lines starting with '#' are ignored and the columns are
# named positionally. Only 'pdb' and 'uid' are kept from it.
df_pdb_id = pd.read_csv(f'{pre}/index/INDEX_general_PL_name.2020', sep="  ", comment='#', header=None, names=['pdb', 'year', 'uid', 'd', 'e','f','g','h','i','j','k','l','m','n','o'], engine='python')
df_pdb_id = df_pdb_id[['pdb','uid']]
# read_pdbbind_data is the project helper imported from utils.
data = read_pdbbind_data(f'{pre}/index/INDEX_general_PL_data.2020')
# Attach the uid of every entry by joining the two tables on the 'pdb' column.
data = data.merge(df_pdb_id, on=['pdb'])


# ligand file should be readable by RDKit.

In [32]:
from feature_utils import read_mol

In [7]:
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')
# Partition the entries: complexes whose ligand read_mol reports as problematic go
# to probem_list; all remaining ones except 2r1w are collected in pdb_list.
pdb_list, probem_list = [], []
for pdb in tqdm(data.pdb):
    sdf_fileName = f"{pre}/{pdb}/{pdb}_ligand.sdf"
    mol2_fileName = f"{pre}/{pdb}/{pdb}_ligand.mol2"
    mol, problem = read_mol(sdf_fileName, mol2_fileName)
    if problem:
        probem_list.append(pdb)
    elif pdb != "2r1w":
        # 2r1w is left out by hand (a later cell notes it has no chain near the ligand).
        pdb_list.append(pdb)

100%|██████████| 19443/19443 [01:38<00:00, 198.10it/s]


In [8]:
data = data.query("pdb in @pdb_list").reset_index(drop=True)

In [9]:
data.shape

(19127, 7)

### for ease of RMSD evaluation later, we renumber the atom index to be consistent with the smiles

In [10]:
from feature_utils import write_renumbered_sdf
import os

In [11]:
# Write one SDF per complex whose atom order follows the SMILES, so that RMSD can
# be evaluated later without re-matching atoms.
toFolder = f"{pre}/renumber_atom_index_same_as_smiles"
# Create the folder from Python instead of shelling out: no quoting issues with
# the path and a failure raises instead of being ignored.
os.makedirs(toFolder, exist_ok=True)
for pdb in tqdm(pdb_list):
    sdf_fileName = f"{pre}/{pdb}/{pdb}_ligand.sdf"
    mol2_fileName = f"{pre}/{pdb}/{pdb}_ligand.mol2"
    toFile = f"{toFolder}/{pdb}.sdf"
    write_renumbered_sdf(toFile, sdf_fileName, mol2_fileName)


100%|██████████| 19127/19127 [04:31<00:00, 70.39it/s]


# process PDBbind proteins, removing extra chains, cutoff 10A

In [12]:
# Output folder for the proteins reduced to the chains near the ligand.
toFolder = f"{pre}/protein_remove_extra_chains_10A/"
# os.system returns the shell's exit status; a displayed 0 means mkdir succeeded.
os.system(f"mkdir -p {toFolder}")

0

In [13]:
# Build one argument tuple per complex for select_chain_within_cutoff_to_ligand_v2:
# (input protein PDB, renumbered ligand SDF, cutoff, output PDB path).
input_ = []
# Cutoff handed to the chain selection; the overview describes it as 10 Å.
cutoff = 10
for pdb in data.pdb.values:
    pdbFile = f"{pre}/{pdb}/{pdb}_protein.pdb"
    ligandFile = f"{pre}/renumber_atom_index_same_as_smiles/{pdb}.sdf"
    # toFolder as set in the preceding cell already ends with "/", so this path
    # contains a double slash.
    toFile = f"{toFolder}/{pdb}_protein.pdb"
    x = (pdbFile, ligandFile, cutoff, toFile)
    input_.append(x)

In [14]:
from feature_utils import select_chain_within_cutoff_to_ligand_v2

In [15]:
import mlcrate as mlc
import os
# Apply the chain selection to every argument tuple in parallel (pool size 64).
pool = mlc.SuperPool(64)
try:
    pool.pool.restart()
    _ = pool.map(select_chain_within_cutoff_to_ligand_v2,input_)
finally:
    # Always shut the pool down, even when one of the tasks raises.
    pool.exit()

[mlcrate] 64 CPUs:  16%|█▋        | 3154/19127 [01:13<05:11, 51.34it/s][03:50:08] bond with order 0 found on line 35. This is not part of the MDL specification.
[mlcrate] 64 CPUs:  30%|███       | 5797/19127 [02:05<04:50, 45.84it/s][03:51:00] bond with order 0 found on line 37. This is not part of the MDL specification.
[mlcrate] 64 CPUs:  39%|███▉      | 7542/19127 [02:38<03:26, 56.10it/s][03:51:34] bond with order 0 found on line 29. This is not part of the MDL specification.
[mlcrate] 64 CPUs:  43%|████▎     | 8158/19127 [02:49<03:16, 55.79it/s][03:51:45] bond with order 0 found on line 31. This is not part of the MDL specification.
[mlcrate] 64 CPUs:  46%|████▌     | 8781/19127 [03:02<03:23, 50.84it/s][03:51:57] bond with order 0 found on line 59. This is not part of the MDL specification.
[03:51:57] bond with order 0 found on line 61. This is not part of the MDL specification.
[mlcrate] 64 CPUs:  47%|████▋     | 8908/19127 [03:04<03:03, 55.57it/s][03:51:59] bond with order 0 found

In [16]:
# previously, I found that 2r1w has no chain near the ligand.
data = data.query("pdb != '2r1w'").reset_index(drop=True)

# p2rank segmentation

In [17]:
# Write the dataset file p2rank consumes: one protein path per line, expressed
# relative to the folder that holds the .ds file.
p2rank_prediction_folder = f"{pre}/p2rank_protein_remove_extra_chains_10A"
# Create the folder from Python so a failure raises instead of being ignored.
os.makedirs(p2rank_prediction_folder, exist_ok=True)
ds = f"{p2rank_prediction_folder}/protein_list.ds"
with open(ds, "w") as out:
    for pdb in data.pdb.values:
        out.write(f"../protein_remove_extra_chains_10A/{pdb}_protein.pdb\n")

In [None]:
# # takes about 30 minutes.
cmd = f"{p2rank} predict {ds} -o {p2rank_prediction_folder}/p2rank -threads 16"
# Check the status instead of discarding it: the later cells read the prediction
# CSVs and would otherwise fail far from the real cause.
exit_status = os.system(cmd)
if exit_status != 0:
    raise RuntimeError(f"p2rank exited with status {exit_status}: {cmd}")

In [19]:
data.to_csv(f"{pre}/data.csv")

In [4]:
# NOTE(review): the table was saved with data.to_csv(f"{pre}/data.csv") while pre was
# "/home/jovyan/data/all_pdbbind", but it is read back from /home/jovyan/data/ here —
# confirm the file was moved or copied. The index written by to_csv comes back as
# an extra unnamed column.
data=pd.read_csv('/home/jovyan/data/data.csv')

In [5]:
pdb_list = data.pdb.values
# NOTE(review): this path ends in tankbind_data/tankbind_data, whereas the other
# cells use .../baseline/tankbind_data — confirm which one is intended.
tankbind_data_path = f"/home/jovyan/torsional/dataset-all/baseline/tankbind_data/tankbind_data"
name_list = pdb_list

In [22]:
tankbind_data_path = f"/home/jovyan/torsional/dataset-all/baseline/tankbind_data"
name_list = pdb_list

# Read each per-protein p2rank prediction table, tag its rows with the protein
# name, and stack everything into one frame that is cached as a feather file.
d_list = []
for name in tqdm(name_list):
    p2rankFile = f"{pre}/p2rank_protein_remove_extra_chains_10A/p2rank/{name}_protein.pdb_predictions.csv"
    pockets = pd.read_csv(p2rankFile)
    # Header names are stripped of surrounding whitespace before use.
    pockets.columns = pockets.columns.str.strip()
    d_list.append(pockets.assign(name=name))
# The concatenated frame gets a fresh 0..n-1 index before it is written.
d = pd.concat(d_list).reset_index(drop=True)
d.to_feather(f"{tankbind_data_path}/p2rank_result.feather")

100%|██████████| 19127/19127 [01:11<00:00, 269.35it/s]


In [6]:
d = pd.read_feather(f"/home/jovyan/torsional/dataset-all/baseline/tankbind_data/p2rank_result.feather")

In [7]:
# Split the combined p2rank table into one frame per protein.
# Grouping once replaces a scan of the whole table for every name. Row order
# inside each group is the order in `d`, the same as the per-name filter gave.
pockets_by_name = {key: group for key, group in d.groupby("name", sort=False)}
# Names without any row still map to an empty frame with the same columns,
# exactly as d[d.name == name] produced.
no_pockets = d.iloc[0:0]
pockets_dict = {}
for name in tqdm(name_list):
    pockets_dict[name] = pockets_by_name.get(name, no_pockets).reset_index(drop=True)

100%|██████████| 19127/19127 [02:26<00:00, 130.14it/s]


# protein feature

In [25]:
from feature_utils import get_protein_feature

In [26]:
# Build one (pdb id, input protein file, output .pt file) tuple per complex for
# the parallel protein feature extraction.
input_ = []
protein_embedding_folder = f"{tankbind_data_path}/gvp_protein_embedding"
# Create the folder from Python so a failure raises instead of being ignored.
os.makedirs(protein_embedding_folder, exist_ok=True)
for pdb in pdb_list:
    proteinFile = f"{pre}/protein_remove_extra_chains_10A/{pdb}_protein.pdb"
    toFile = f"{protein_embedding_folder}/{pdb}.pt"
    x = (pdb, proteinFile, toFile)
    input_.append(x)

In [27]:
from Bio.PDB import PDBParser
from feature_utils import get_clean_res_list
import torch
torch.set_num_threads(1)

def batch_run(x):
    """Extract the features of one protein and save them to disk.

    ``x`` is a ``(pdb, proteinFile, toFile)`` tuple. The structure in
    ``proteinFile`` is parsed, its residues are filtered with
    ``get_clean_res_list`` and passed to ``get_protein_feature``; the result is
    saved to ``toFile`` as a one-entry dict keyed by ``pdb``. Returns None.
    """
    pdb, proteinFile, toFile = x
    structure = PDBParser(QUIET=True).get_structure(pdb, proteinFile)
    residues = get_clean_res_list(structure.get_residues(), verbose=False, ensure_ca_exist=True)
    torch.save({pdb: get_protein_feature(residues)}, toFile)

In [28]:
import mlcrate as mlc
import os
# Run batch_run over every argument tuple in parallel (pool size 64).
pool = mlc.SuperPool(64)
try:
    pool.pool.restart()
    _ = pool.map(batch_run,input_)
finally:
    # Always shut the pool down, even when one of the tasks raises.
    pool.exit()

[mlcrate] 64 CPUs: 100%|██████████| 19127/19127 [04:38<00:00, 68.61it/s]


In [8]:
import torch
tankbind_data_path = f"/home/jovyan/torsional/dataset-all/baseline/tankbind_data"
protein_embedding_folder = f"{tankbind_data_path}/gvp_protein_embedding"
# Merge the per-protein .pt files (each a dict written by batch_run) into a single
# dict for all proteins.
protein_dict = {}
for pdb in tqdm(pdb_list):
    protein_dict.update(torch.load(f"{protein_embedding_folder}/{pdb}.pt"))


100%|██████████| 19127/19127 [06:26<00:00, 49.49it/s]


# Compound Features

In [30]:
from feature_utils import extract_torchdrug_feature_from_mol
# Compound features keyed by pdb id; entries whose extraction fails are recorded
# in skip_pdb_list instead of stopping the loop.
compound_dict = {}
skip_pdb_list = []
for pdb in tqdm(pdb_list):
    # Only the renumbered SDF is supplied; the second (mol2) argument is None.
    mol, _ = read_mol(f"{pre}/renumber_atom_index_same_as_smiles/{pdb}.sdf", None)
    # extract features from sdf.
    try:
        compound_dict[pdb] = extract_torchdrug_feature_from_mol(mol, has_LAS_mask=True)  # self-dock set has_LAS_mask to true
    except Exception as e:
        # Best effort: print the error and the pdb id, remember the entry, keep going.
        print(e)
        skip_pdb_list.append(pdb)
        print(pdb)

 28%|██▊       | 5276/19127 [01:17<02:53, 79.89it/s]


3kqs


100%|██████████| 19127/19127 [05:17<00:00, 60.19it/s]


In [31]:
torch.save(compound_dict, f"{tankbind_data_path}/compound_torchdrug_features.pt")

In [9]:
compound_dict = torch.load( f"{tankbind_data_path}/compound_torchdrug_features.pt")

In [10]:
skip_pdb_list = ['3kqs']

In [11]:
data = data.query("pdb not in @skip_pdb_list").reset_index(drop=True)

# construct dataset.

In [12]:

def assign_group(pdb, valid=valid, test=test):
    """Return the split label of ``pdb``: 'valid', 'test' or 'train'.

    Membership in ``valid`` is checked before ``test``; an id found in neither
    collection is labelled 'train'. The defaults are bound to the ``valid`` and
    ``test`` objects that exist when this function is defined.
    """
    for label, members in (('valid', valid), ('test', test)):
        if pdb in members:
            return label
    return 'train'

data['group'] = data.pdb.map(assign_group)

In [13]:
data.value_counts("group")

group
train    17795
valid      968
test       363
dtype: int64

In [14]:
data['name'] = data['pdb']

In [15]:
# Build one row per (complex, candidate block): the native block located by the
# compound itself, a block centred on the protein, and one block for each of the
# first ten p2rank pockets.
info = []
err_pdb_list=[]
for i, line in tqdm(data.iterrows(), total=data.shape[0]):
    pdb = line['pdb']
    uid = line['uid']
    # smiles = line['smiles']
    smiles = ""
    affinity = line['affinity']
    group = line['group']

    compound_name = line['name']
    protein_name = line['name']
    try:
        pocket = pockets_dict[pdb].head(10)
    except KeyError:
        # Only a missing p2rank entry is expected here; anything else should surface.
        err_pdb_list.append(pdb)
        continue
    pocket.columns = pocket.columns.str.strip()
    pocket_coms = pocket[['center_x', 'center_y', 'center_z']].values
    # native block.
    info.append([protein_name, compound_name, pdb, smiles, affinity, uid, None, True, False, group])
    # protein center as a block.
    # The first element of the protein entry holds the node coordinates; their mean is the centre.
    protein_com = protein_dict[protein_name][0].numpy().mean(axis=0).astype(float).reshape(1, 3)
    info.append([protein_name, compound_name, pdb+"_c", smiles, affinity, uid, protein_com, False, False, group])

    # Enumerate by position so the suffix and the centre always refer to the same
    # row, whatever index labels the pocket table carries.
    for idx, pocket_com in enumerate(pocket_coms):
        pdb_idx = f"{pdb}_{idx}"
        info.append([protein_name, compound_name, pdb_idx, smiles, affinity, uid, pocket_com.reshape(1, 3), False, False, group])
info = pd.DataFrame(info, columns=['protein_name', 'compound_name', 'pdb', 'smiles', 'affinity', 'uid', 'pocket_com', 
                                   'use_compound_com', 'use_whole_protein',
                                  'group'])
print(len(info))


100%|██████████| 19126/19126 [00:18<00:00, 1024.01it/s]


162036


In [16]:
len(err_pdb_list)

0

In [17]:
len(pockets_dict)

19127

In [18]:
info.shape

(162036, 10)

In [4]:
from data import TankBindDataSet_torsion
import os

In [6]:
toFilePre = "/home/jovyan/torsional/dataset-all/torsional/train_dataset"
dataset = TankBindDataSet_torsion(toFilePre)

['/home/jovyan/torsional/dataset-all/torsional/train_dataset/processed/data.pt', '/home/jovyan/torsional/dataset-all/torsional/train_dataset/processed/protein.pt', '/home/jovyan/torsional/dataset-all/torsional/train_dataset/processed/compound.pt']


In [1]:
# NOTE(review): the recorded run of this cell (execution count 1) stopped with
# NameError because `os` had not been imported in that kernel session.
# TankBindDataSet_torsion, info, protein_dict and compound_dict must likewise be
# defined already when this cell runs.
toFilePre = "/home/jovyan/torsional/dataset-all/dataset_torsional"
os.system(f"mkdir -p {toFilePre}")
dataset = TankBindDataSet_torsion(toFilePre, data=info, protein_dict=protein_dict, compound_dict=compound_dict)

NameError: name 'os' is not defined

In [28]:
dataset[0]['compound', 'compound'].edge_index

tensor([[ 0,  1,  1,  2,  1,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  7,  9,
          4, 10, 10, 11, 10, 12],
        [ 1,  0,  2,  1,  3,  1,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7,  9,  7,
         10,  4, 11, 10, 12, 10]])

In [21]:
import torch
data = torch.load('/home/jovyan/torsional/dataset-all/torsional/train_dataset/processed/data.pt')
protein_dict = torch.load('/home/jovyan/torsional/dataset-all/torsional/train_dataset/processed/protein.pt')
compound_dict = torch.load('/home/jovyan/torsional/dataset-all/torsional/train_dataset/processed/compound.pt')

In [6]:
line = data.iloc[0]
protein_name = line['protein_name']

In [7]:
protein_node_xyz, protein_seq, protein_node_s, protein_node_v, protein_edge_index, protein_edge_s, protein_edge_v = protein_dict[protein_name]
name = line['compound_name']
coords, compound_node_features, input_atom_edge_list, input_atom_edge_attr_list, pair_dis_distribution = compound_dict[name]

In [8]:
input_atom_edge_list[:,:2].long().t().contiguous()

tensor([[ 0,  1,  1,  2,  1,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  7,  9,
          4, 10, 10, 11, 10, 12],
        [ 1,  0,  2,  1,  3,  1,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7,  9,  7,
         10,  4, 11, 10, 12, 10]])

In [12]:
from utils import construct_data_from_graph_gvp

In [13]:
data, input_node_list, keepNode = construct_data_from_graph_gvp(protein_node_xyz, protein_seq, protein_node_s, 
                                  protein_node_v, protein_edge_index, protein_edge_s, protein_edge_v,
                                  coords, compound_node_features, input_atom_edge_list, input_atom_edge_attr_list, contactCutoff=8.0, includeDisMap=True,
                          pocket_radius=20, add_noise_to_com=None, use_whole_protein=False, 
                          use_compound_com_as_pocket=True, chosen_pocket_com=None, compoundMode=1)

In [53]:
data.pdb = line['pdb']

In [95]:
def get_transformation_mask(pyg_data):
    """Mark the compound edges whose removal splits the graph, and the nodes each one moves.

    Args:
        pyg_data: heterogeneous graph object providing ``to_homogeneous()`` and a
            ``('compound', 'c2c', 'compound')`` edge store with an ``edge_index``.

    Returns:
        A ``(mask_edges, mask_rotate)`` tuple of boolean numpy arrays.
        ``mask_edges`` has one entry per directed edge slot and is True where a
        node list was recorded for that slot. ``mask_rotate`` has one row per True
        entry of ``mask_edges`` and one column per node of the graph; a row is
        True at the nodes of the component recorded for that edge.

    Raises:
        AssertionError: if the edges are not stored as consecutive
            (u -> v, v -> u) pairs.

    NOTE(review): ``to_networkx`` and ``nx`` must already be in scope; no import
    for them is visible in this notebook — confirm.
    """
    G = to_networkx(pyg_data.to_homogeneous(), to_undirected=False)
    # to_rotate gets exactly two entries per edge pair: one per direction.
    to_rotate = []
    # After the transpose each row is one directed edge (source, target).
    edges = pyg_data['compound', 'c2c', 'compound'].edge_index.T.numpy()
    # Step by two: rows i and i+1 are expected to be the two directions of one bond.
    for i in range(0, edges.shape[0], 2):
        assert edges[i, 0] == edges[i+1, 1]

        # Work on a fresh undirected copy, because remove_edge mutates the graph.
        G2 = G.to_undirected()
        G2.remove_edge(*edges[i])
        if not nx.is_connected(G2):
            # Removing the edge disconnected the graph: take the smallest component.
            l = list(sorted(nx.connected_components(G2), key=len)[0])
            # A single-node component is ignored (falls through to the empty case).
            if len(l) > 1:
                # The node list is attached to the direction whose source node is
                # NOT inside the component; the other direction stays empty.
                if edges[i, 0] in l:
                    to_rotate.append([])
                    to_rotate.append(l)
                else:
                    to_rotate.append(l)
                    to_rotate.append([])
                continue
        # Edge does not split the graph (or only splits off one node): nothing recorded.
        to_rotate.append([])
        to_rotate.append([])

    mask_edges = np.asarray([0 if len(l) == 0 else 1 for l in to_rotate], dtype=bool)
    mask_rotate = np.zeros((np.sum(mask_edges), len(G.nodes())), dtype=bool)
    # idx is the row of mask_rotate that the next True edge fills.
    idx = 0
    # Assumes G has as many edges as to_rotate has entries (one per directed
    # edge in edge_index) — TODO confirm.
    for i in range(len(G.edges())):
        if mask_edges[i]:
            mask_rotate[idx][np.asarray(to_rotate[i], dtype=int)] = True
            idx += 1

    return mask_edges, mask_rotate

In [55]:
from torch_geometric.data import HeteroData
data1 = HeteroData()
name = data.pdb
from feature_utils import extract_torchdrug_feature_from_mol
mol, _ = read_mol(f"/home/jovyan/data/renumber_atom_index_same_as_smiles/{name}.sdf", None)

In [85]:
def get_lig_graph(data_compound, data):
    """Copy the compound sub-graph of ``data`` into ``data_compound``.

    The node attributes ``x`` and ``pos`` of the 'compound' store and the
    ``edge_index`` and ``edge_attr`` of the ('compound', 'c2c', 'compound') store
    are assigned by reference (no cloning). ``data_compound`` is modified in
    place and nothing is returned.
    """
    edge_key = ('compound', 'c2c', 'compound')
    copied_fields = (
        ('compound', ('x', 'pos')),
        (edge_key, ('edge_index', 'edge_attr')),
    )
    for store_key, field_names in copied_fields:
        source_store = data[store_key]
        target_store = data_compound[store_key]
        for field_name in field_names:
            setattr(target_store, field_name, getattr(source_store, field_name))


In [90]:
import torch.nn.functional as F
data1 = HeteroData()
get_lig_graph(data1, data)

In [96]:
data1

HeteroData(
  [1mcompound[0m={
    x=[13, 56],
    pos=[13, 3]
  },
  [1m(compound, c2c, compound)[0m={
    edge_index=[2, 24],
    edge_attr=[24, 19]
  }
)

In [97]:
edge_mask, mask_rotate = get_transformation_mask(data1)

In [100]:
mask_rotate.shape

(6, 13)

In [7]:
# Collect size statistics for every sample by materialising each dataset item
# (the recorded run took about an hour and a half).
# NOTE: this cell rebinds `data` to dataset.data and `d` to the current item.
t = []
data = dataset.data
# pre_pdb is not used anywhere else in this cell.
pre_pdb = None
for i, line in tqdm(data.iterrows(), total=data.shape[0]):
    pdb = line['compound_name']
    d = dataset[i]
    # First dimension of the protein node coordinates, compound coordinates and y.
    p_length = d['node_xyz'].shape[0]
    c_length = d['coords'].shape[0]
    y_length = d['y'].shape[0]
    # Number of entries of y that are greater than zero.
    num_contact = (d.y > 0).sum()
    t.append([i, pdb, p_length, c_length, y_length, num_contact])



100%|██████████| 162036/162036 [1:31:17<00:00, 29.58it/s]  


In [None]:
# data = data.drop(['p_length', 'c_length', 'y_length', 'num_contact'], axis=1)

In [9]:
t = pd.DataFrame(t, columns=['index', 'pdb' ,'p_length', 'c_length', 'y_length', 'num_contact'])
t['num_contact'] = t['num_contact'].apply(lambda x: x.item())

In [11]:
data = pd.concat([data, t[['p_length', 'c_length', 'y_length', 'num_contact']]], axis=1)

In [12]:
# Look up, for every protein, the num_contact of its row that has use_compound_com
# set (the native block), then broadcast that value to all rows of the same protein.
native_num_contact = data.query("use_compound_com").set_index("protein_name")['num_contact'].to_dict()
data['native_num_contact'] = data.protein_name.map(native_num_contact)
# data['fract_of_native_contact'] = data['num_contact'] / data['native_num_contact']

In [14]:
import torch

In [15]:
torch.save(data, f"{toFilePre}/processed/data.pt")

In [16]:
toFilePre

'/home/jovyan/torsional/dataset-all/torsional/train_dataset'

In [17]:
import torch
info = torch.load(f"{toFilePre}/processed/data.pt")
info['group'].unique()

array(['train', 'valid', 'test'], dtype=object)

In [18]:
test = info.query("group == 'test'").reset_index(drop=True)
test_pdb_list = info.query("group == 'test'").protein_name.unique()

In [19]:
test = info.query("group == 'test'").reset_index(drop=True)
test_pdb_list = info.query("group == 'test'").protein_name.unique()

In [22]:
subset_protein_dict = {}
for pdb in tqdm(test_pdb_list):
    subset_protein_dict[pdb] = protein_dict[pdb]

100%|██████████| 363/363 [00:00<00:00, 447540.37it/s]


In [23]:
subset_compound_dict = {}
for pdb in tqdm(test_pdb_list):
    subset_compound_dict[pdb] = compound_dict[pdb]

100%|██████████| 363/363 [00:00<00:00, 459701.80it/s]


In [24]:
pre = '/home/jovyan/torsional/dataset-all/torsional'

In [26]:
# Build the test split dataset from the test rows and the matching feature subsets.
toFilePre = f"{pre}/test_dataset"
# Create the folder from Python so a failure raises instead of being ignored.
os.makedirs(toFilePre, exist_ok=True)
dataset = TankBindDataSet_torsion(toFilePre, data=test, protein_dict=subset_protein_dict, compound_dict=subset_compound_dict)

Processing...
Done!


['/home/jovyan/torsional/dataset-all/torsional/test_dataset/processed/data.pt', '/home/jovyan/torsional/dataset-all/torsional/test_dataset/processed/protein.pt', '/home/jovyan/torsional/dataset-all/torsional/test_dataset/processed/compound.pt']


In [27]:
valid = info.query("group == 'valid'").reset_index(drop=True)
valid_pdb_list = info.query("group == 'valid'").protein_name.unique()

In [28]:
valid = info.query("group == 'valid'").reset_index(drop=True)
valid_pdb_list = info.query("group == 'valid'").protein_name.unique()

In [29]:
subset_protein_dict = {}
for pdb in tqdm(valid_pdb_list):
    subset_protein_dict[pdb] = protein_dict[pdb]

100%|██████████| 968/968 [00:00<00:00, 193494.08it/s]


In [30]:
subset_compound_dict = {}
for pdb in tqdm(valid_pdb_list):
    subset_compound_dict[pdb] = compound_dict[pdb]

100%|██████████| 968/968 [00:00<00:00, 193098.37it/s]


In [31]:
# Build the validation split dataset from the valid rows and the matching feature subsets.
toFilePre = f"{pre}/valid_dataset"
# Create the folder from Python so a failure raises instead of being ignored.
os.makedirs(toFilePre, exist_ok=True)
dataset = TankBindDataSet_torsion(toFilePre, data=valid, protein_dict=subset_protein_dict, compound_dict=subset_compound_dict)

Processing...
Done!


['/home/jovyan/torsional/dataset-all/torsional/valid_dataset/processed/data.pt', '/home/jovyan/torsional/dataset-all/torsional/valid_dataset/processed/protein.pt', '/home/jovyan/torsional/dataset-all/torsional/valid_dataset/processed/compound.pt']


In [56]:
def canonical_smiles(smiles):
    """Return the SMILES string RDKit writes for the molecule parsed from ``smiles``.

    The input is parsed with ``Chem.MolFromSmiles`` and written back with
    ``Chem.MolToSmiles``. No check is made on the parse result, so an input RDKit
    cannot parse surfaces as an error from ``MolToSmiles``.
    """
    # Imported here because the notebook never imports rdkit.Chem at top level,
    # which made this function fail with NameError on a fresh kernel.
    from rdkit import Chem
    return Chem.MolToSmiles(Chem.MolFromSmiles(smiles))

In [11]:
dataset = TankBindDataSet('torsional/dataset-all/baseline/test_dataset')

['torsional/dataset-all/baseline/test_dataset/processed/data.pt', 'torsional/dataset-all/baseline/test_dataset/processed/protein.pt', 'torsional/dataset-all/baseline/test_dataset/processed/compound.pt']


In [16]:
valid = TankBindDataSet('/home/jovyan/torsional/dataset-all/baseline/valid_dataset')

['/home/jovyan/torsional/dataset-all/baseline/valid_dataset/processed/data.pt', '/home/jovyan/torsional/dataset-all/baseline/valid_dataset/processed/protein.pt', '/home/jovyan/torsional/dataset-all/baseline/valid_dataset/processed/compound.pt']


In [25]:
valid[0].to_homogeneous()

AttributeError: 'numpy.str_' object has no attribute 'to_homogeneous'

In [4]:
from data import TankBindDataSet_torsion
import os
import torch

In [5]:
# Rebuild the test dataset from the tables saved under test_dataset_/processed.
toFilePre = f"/home/jovyan/torsional/dataset-all/torsional/test_dataset"
# Create the folder from Python so a failure raises instead of being ignored.
os.makedirs(toFilePre, exist_ok=True)
test = torch.load('/home/jovyan/torsional/dataset-all/torsional/test_dataset_/processed/data.pt')
subset_protein_dict = torch.load('/home/jovyan/torsional/dataset-all/torsional/test_dataset_/processed/protein.pt')
subset_compound_dict = torch.load('/home/jovyan/torsional/dataset-all/torsional/test_dataset_/processed/compound.pt')
dataset = TankBindDataSet_torsion(toFilePre, data=test, protein_dict=subset_protein_dict, compound_dict=subset_compound_dict)

Processing...
Done!


['/home/jovyan/torsional/dataset-all/torsional/test_dataset/processed/data.pt', '/home/jovyan/torsional/dataset-all/torsional/test_dataset/processed/protein.pt', '/home/jovyan/torsional/dataset-all/torsional/test_dataset/processed/compound.pt']


In [7]:
# Rebuild the validation dataset from the tables saved under valid_dataset_/processed.
toFilePre = f"/home/jovyan/torsional/dataset-all/torsional/valid_dataset"
# Create the folder from Python so a failure raises instead of being ignored.
os.makedirs(toFilePre, exist_ok=True)
valid = torch.load('/home/jovyan/torsional/dataset-all/torsional/valid_dataset_/processed/data.pt')
subset_protein_dict = torch.load('/home/jovyan/torsional/dataset-all/torsional/valid_dataset_/processed/protein.pt')
subset_compound_dict = torch.load('/home/jovyan/torsional/dataset-all/torsional/valid_dataset_/processed/compound.pt')
dataset = TankBindDataSet_torsion(toFilePre, data=valid, protein_dict=subset_protein_dict, compound_dict=subset_compound_dict)

Processing...
Done!


['/home/jovyan/torsional/dataset-all/torsional/valid_dataset/processed/data.pt', '/home/jovyan/torsional/dataset-all/torsional/valid_dataset/processed/protein.pt', '/home/jovyan/torsional/dataset-all/torsional/valid_dataset/processed/compound.pt']


In [9]:
# Rebuild the training dataset from the tables saved under train_dataset_/processed.
toFilePre = f"/home/jovyan/torsional/dataset-all/torsional/train_dataset"
# Create the folder from Python so a failure raises instead of being ignored.
os.makedirs(toFilePre, exist_ok=True)
train = torch.load('/home/jovyan/torsional/dataset-all/torsional/train_dataset_/processed/data.pt')
subset_protein_dict = torch.load('/home/jovyan/torsional/dataset-all/torsional/train_dataset_/processed/protein.pt')
subset_compound_dict = torch.load('/home/jovyan/torsional/dataset-all/torsional/train_dataset_/processed/compound.pt')
dataset = TankBindDataSet_torsion(toFilePre, data=train, protein_dict=subset_protein_dict, compound_dict=subset_compound_dict)

Processing...
Done!


['/home/jovyan/torsional/dataset-all/torsional/train_dataset/processed/data.pt', '/home/jovyan/torsional/dataset-all/torsional/train_dataset/processed/protein.pt', '/home/jovyan/torsional/dataset-all/torsional/train_dataset/processed/compound.pt']


In [9]:
data = torch.load('/home/jovyan/torsional/dataset-all/torsional/train_dataset/processed/data.pt')

In [10]:
data

Unnamed: 0,protein_name,compound_name,pdb,smiles,affinity,uid,pocket_com,use_compound_com,use_whole_protein,group,p_length,c_length,y_length,num_contact,native_num_contact,candicate_conf_pos
0,3zzf,3zzf,3zzf,,0.40,Q01217,,True,False,train,144,13,1872,144,144,"[[[-3.539917766266281, 1.1310054302447874, -0...."
1,3zzf,3zzf,3zzf_c,,0.40,Q01217,"[[5.51331901550293, 36.50146484375, 14.4291219...",False,False,train,188,13,2444,136,144,"[[[-3.539917766266281, 1.1310054302447874, -0...."
2,3zzf,3zzf,3zzf_0,,0.40,Q01217,"[[9.2232, 36.6453, 4.2458]]",False,False,train,148,13,1924,144,144,"[[[-3.539917766266281, 1.1310054302447874, -0...."
3,3zzf,3zzf,3zzf_1,,0.40,Q01217,"[[-3.9652, 36.9019, 2.8611]]",False,False,train,121,13,1573,138,144,"[[[-3.539917766266281, 1.1310054302447874, -0...."
4,3zzf,3zzf,3zzf_2,,0.40,Q01217,"[[16.5628, 39.1406, 26.3637]]",False,False,train,116,13,1508,12,144,"[[[-3.539917766266281, 1.1310054302447874, -0...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161889,2avi,2avi,2avi_5,,15.22,P02701,"[[-24.8819, 33.8811, 24.9982]]",False,False,train,91,16,1456,39,157,"[[[5.241567853335355, 0.32110025819596066, -0...."
161890,2avi,2avi,2avi_6,,15.22,P02701,"[[5.0382, 35.5432, 16.4793]]",False,False,train,121,16,1936,137,157,"[[[5.241567853335355, 0.32110025819596066, -0...."
161891,2avi,2avi,2avi_7,,15.22,P02701,"[[-4.7665, 15.8424, 22.5071]]",False,False,train,86,16,1376,144,157,"[[[5.241567853335355, 0.32110025819596066, -0...."
161892,2avi,2avi,2avi_8,,15.22,P02701,"[[4.7665, 64.5276, 22.5071]]",False,False,train,83,16,1328,0,157,"[[[5.241567853335355, 0.32110025819596066, -0...."
