# `重要`
在克隆本 git 项目后，如果未按照现有方式挂载 `dataspace` 数据卷，则需要按情况将每一个代码块的 `processed=True` 修改为 `processed=False`，然后运行，以将处理后的数据保存至某个文件夹（原为`dataspace/Prototype`），并在`tankbind_prototype/datasets` 下生成训练、测试用数据集。
若成功挂载 `dataspace`，请跳到本脚本的 「<font color='gold'>「从这里开始」</font>数据集构建」，并以该处开始构建数据集。

# overview

### Preparation | 路径准备


In [2]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import os
import torch
# Root folder of the PDBbind2020 entries; later cells read
# "<pdbbind_path>/<pdb>/<pdb>_ligand.sdf|.mol2" and "<pdb>_protein.pdb" from it.
pdbbind_path = "/home/jovyan/dataspace/PDBbind2020/all_pdbbind"
# Root folder of the processed artefacts.
# NOTE(review): the visible cells hard-code this path instead of using the variable.
proto_path = "/home/jovyan/dataspace/Prototype"
# Shell command that launches p2rank (not invoked by any visible cell).
p2rank = "bash /home/jovyan/TankBind/p2rank_2.3/prank"
# Folder holding the tankbind_prototype sources, relative to this notebook.
tankbind_src_folder_path = "../tankbind_prototype/"
import sys
# Searched first on import; presumably this is where `feature_utils` and
# `data_prototype` live - confirm against the repository layout.
sys.path.insert(0, tankbind_src_folder_path)

## 数据整理

#### 读取表格和 `pdb` 文件。

In [3]:
processed = True
if not processed:
    # Load the raw session table and tag every row with a synthetic
    # identifier of the form "ID_<row position>".
    session_table = pd.read_csv("/home/jovyan/dataspace/Liyou/jingsen.csv")
    session_table['sample_id'] = [f"ID_{row}" for row in range(len(session_table))]

    pdb_in_session = session_table.pdb_id.unique()
    # A set gives O(1) membership tests against the PDBbind directory listing.
    pdb_in_pdbbind = set(os.listdir(pdbbind_path))
    # Print every PDB id of the session table that has no PDBbind folder.
    for pdb_id in tqdm(pdb_in_session):
        if pdb_id.lower() not in pdb_in_pdbbind:
            print(pdb_id, end=" ")
    # All PDBs in session table could be found in pdbbind2020
    # Fixed: the original line referenced the undefined name `pdb_in_sessio`,
    # which raised NameError as soon as this branch was executed.
    pdb_in_session = list(pdb_in_session)

#### 遍历 PDBBind2020 数据库的 ligand 文件，为读取最佳口袋坐标做准备。

In [5]:
processed = True
if not processed:
    from feature_utils import read_mol
    from rdkit import RDLogger
    # Silence RDKit's logging while the ligand files are parsed.
    RDLogger.DisableLog('rdApp.*')

    # Split the session PDB ids by whether `read_mol` reports a problem
    # for the ligand file pair of that entry.
    pdb_list, problem_list = [], []
    for entry in tqdm(pdb_in_session):
        pdb = entry.lower()
        ligand_prefix = f"{pdbbind_path}/{pdb}/{pdb}_ligand"
        mol, problem = read_mol(f"{ligand_prefix}.sdf", f"{ligand_prefix}.mol2")
        target = problem_list if problem else pdb_list
        target.append(pdb)
    print(problem_list)

`problem_list = ['2foy', '2fou', '2fov', '3zp9', '1h07', '3dwb', '1a7x', '1qpf', '4qxs', '2ovz', '1hv5', '1dmt', '3fuc', '4g8l', '1tlp', '4dt6', '3bwf', '3ntp', '2e94', '4lkm', '3mi2', '3gep', '3ggj', '1bzy', '4bxk', '2oc2', '3kck', '3djx', '2pll', '5aqk', '5ipc', '5kly', '5hlm', '4l4l', '4jbs', '5kyk', '3bbb', '4e67', '3fvh', '4lkl', '4r07', '4loi', '3hik', '4e9d', '4e9c', '5km1', '5kma', '4k10']`

针对这些蛋白，将读取文件坐标找到最佳口袋。

#### 准备表格
最终结果已导出为 `multiton_table.csv`，故前面的步骤都跳过。

In [7]:
processed = True
if not processed:
    from feature_utils import generate_sdf_from_smiles_using_rdkit

    # A "session" is identified by assay id + uniprot id.
    session_table['session_au'] = session_table['assay_id'].astype(str) + "_" + session_table['uniprot_id']
    # Number of distinct values per session for each of the listed columns.
    session_count = session_table[['session_au', 'kekule_smiles', 'pdb_id']].groupby('session_au').nunique()
    session_count.reset_index(inplace=True)

    # Sessions with exactly one distinct SMILES are "singleton", the rest "multiton".
    singleton_session = list(session_count[session_count.kekule_smiles == 1].session_au)
    multiton_session = list(session_count[session_count.kekule_smiles != 1].session_au)

    ms_dict = {session: "singleton" for session in singleton_session}
    ms_dict.update({session: "multiton" for session in multiton_session})

    session_table['session_au_usability'] = session_table.session_au.apply(lambda x: ms_dict[x])

    # `.copy()` detaches the filtered rows from `session_table`: a later cell
    # adds a column to `multiton_table`, which on a bare slice triggers
    # pandas' SettingWithCopyWarning and may not stick.
    multiton_table = session_table[session_table.session_au_usability == "multiton"].copy()

    # Distinct SMILES that need a generated ligand file.
    kekule_smiles = list(multiton_table.kekule_smiles.unique())

生成 `smiles_name` 词典，以免直接用 `kekule_smiles` 命名文件导致的问题。

读取 `smiles_name` 词典。

In [8]:
processed = True
if processed:
    # Re-use the mapping written by an earlier run.
    smiles_dict = torch.load("/home/jovyan/dataspace/Prototype/dicts/FULL_smiles_dict.pt")
else:
    import torch
    # Map each SMILES string to the file-system friendly name "smiles_<position>".
    smiles_dict = {smiles: f"smiles_{i}" for i, smiles in enumerate(kekule_smiles)}
    torch.save(smiles_dict, "/home/jovyan/dataspace/Prototype/dicts/FULL_smiles_dict.pt")
    


根据 `kekule_smiles` 生成小分子文件。只需要执行一次。

In [9]:
processed = True
if not processed:
    # Write one RDKit-generated SDF per SMILES; collect the ones that fail
    # as (smiles, smiles_name) pairs.
    error_smiles = []
    for smiles in tqdm(kekule_smiles):
        smiles_name = smiles_dict[smiles]
        try:
            generate_sdf_from_smiles_using_rdkit(smiles=smiles, rdkitMolFile=f"/home/jovyan/dataspace/Prototype/rdkit_generated_sdfs/{smiles_name}.sdf", shift_dis=0)
        except Exception:
            # `Exception` instead of a bare `except:` so that Ctrl-C
            # (KeyboardInterrupt) still stops this long-running loop.
            error_smiles.append((smiles, smiles_name))

清理错误的样本，并保存最后的 `multiton_table`。过程只需要执行一次。
```
error_smiles = ["smiles_12096",  "smiles_12097", "smiles_12098", "smiles_37527"]
```

In [12]:
processed = True
if not processed:
    sdf_dir = "/home/jovyan/dataspace/Prototype/rdkit_generated_sdfs"

    # Names that already have an SDF file; a set keeps the membership tests
    # below O(1) instead of scanning the whole listing for every SMILES.
    smiles_files = {_.replace(".sdf", "") for _ in os.listdir(sdf_dir)}

    # (smiles, smiles_name) pairs whose SDF file is missing.
    error_smiles = [
        (smiles, smiles_dict[smiles])
        for smiles in tqdm(kekule_smiles)
        if smiles_dict[smiles] not in smiles_files
    ]

    # Second attempt for the missing ones; whatever still fails is a real error.
    real_error_smiles = []
    print("LENGTH", len(error_smiles))
    for smiles, smiles_name in tqdm(error_smiles):
        try:
            generate_sdf_from_smiles_using_rdkit(smiles=smiles, rdkitMolFile=f"{sdf_dir}/{smiles_name}.sdf", shift_dis=0)
        except Exception:
            # Not a bare `except:` so KeyboardInterrupt still aborts the loop.
            print(smiles_name, end=" ")
            real_error_smiles.append((smiles, smiles_name))
    print("\nLENGTH", len(real_error_smiles))

    # Re-list the folder and print the names that are still missing.
    all_smiles_file = {_.replace(".sdf", "") for _ in os.listdir(sdf_dir)}
    for smiles in tqdm(kekule_smiles):
        if smiles_dict[smiles] not in all_smiles_file:
            print(smiles_dict[smiles])

    # `assign` returns a new frame, so the column is not written into a slice
    # of `session_table` (avoids SettingWithCopyWarning).
    multiton_table = multiton_table.assign(
        kekule_smiles_name=multiton_table.kekule_smiles.apply(lambda x: smiles_dict[x])
    )

    # Names recorded from the original run; rows using them are dropped.
    error_smiles = ["smiles_12096",  "smiles_12097", "smiles_12098", "smiles_37527"]

    multiton_table = multiton_table[~multiton_table.kekule_smiles_name.isin(error_smiles)]
    multiton_table.to_csv("/home/jovyan/dataspace/Prototype/tables/FULLRAW_multiton_table_with_smiles_name.csv")

  multiton_table = pd.read_csv("/home/jovyan/dataspace/Prototype/tables/multiton_table_with_smiles_name.csv", index_col=0)


#### 读取 `smiles_dict.pt`，读取 `multiton_table.csv`。

In [6]:
if False:
    # Disabled reload of the artefacts produced by the cells above.
    # Path fixed to the `dicts/` sub-folder, which is where the smiles_dict
    # cell saves (and loads) FULL_smiles_dict.pt.
    smiles_dict = torch.load("/home/jovyan/dataspace/Prototype/dicts/FULL_smiles_dict.pt")
    multiton_table = pd.read_csv("/home/jovyan/dataspace/Prototype/tables/FULLRAW_multiton_table_with_smiles_name.csv", index_col=0)

  multiton_table = pd.read_csv("/home/jovyan/dataspace/Prototype/tables/multiton_table_with_smiles_name.csv", index_col=0)


### 蛋白质：口袋切分与特征构建

#### 使用 `p2rank` 生成口袋「略」。

In [None]:
# Template for writing the p2rank input list ("protein_list.ds").
# NOTE(review): this cell is deliberately disabled (`if False`) and unfinished:
# the loop iterable and the folder name are still TODO placeholders, so the
# body is not valid Python as written and must be completed before enabling.
processed = True
if False:
    p2rank_prediction_folder = "/home/jovyan/dataspace/Prototype/p2rank_results"
    # Create the output folder if it does not exist yet.
    os.system(f"mkdir -p {p2rank_prediction_folder}")
    ds = f"{p2rank_prediction_folder}/protein_list.ds"
    # One relative path to a "<pdb>_protein.pdb" file per line.
    with open(ds, "w") as out:
        for pdb in ##TODO: protein_names
            out.write(f"../{TODO: foldernames}/{pdb}_protein.pdb\n")

#### 检查已有的 `p2rank` 结果。

根据 `multiton_table.csv` 得到当前的 `pdb_list`。

检查是否所有的 `pdb` 口袋都能在 `p2rank` 输出文件夹中找到。

In [8]:
processed = True
if not processed:
    p2rank_prediction_folder = "/home/jovyan/dataspace/Prototype/p2rank_results"
    pdb_list = [pdb_id.lower() for pdb_id in multiton_table.pdb_id.unique()]

    # Collect (and print) every PDB whose prediction CSV is absent from the
    # p2rank output folder.
    available_files = os.listdir(p2rank_prediction_folder)
    pdb_without_p2rank_result = []
    for pdb in tqdm(pdb_list):
        if f"{pdb.lower()}_protein.pdb_predictions.csv" not in available_files:
            pdb_without_p2rank_result.append(pdb)
            print(pdb, end=", ")
    if not pdb_without_p2rank_result:
        print("P2RANK YES.")

  0%|          | 0/9092 [00:00<?, ?it/s]

P2RANK YES.


####  准备 `protein_features`。

In [24]:
processed = True
if not processed:
    tankbind_data_path = "/home/jovyan/dataspace/Prototype/tankbind_data"
    os.system(f"mkdir -p {tankbind_data_path}")
    pdb_list = [pdb_id.lower() for pdb_id in multiton_table.pdb_id.unique()]

    # Read every p2rank prediction table, normalise its column names and tag
    # the rows with the PDB they belong to.
    d_list = []
    for name in tqdm(pdb_list):
        predictions = pd.read_csv(f"{p2rank_prediction_folder}/{name}_protein.pdb_predictions.csv")
        predictions.columns = predictions.columns.str.strip()
        d_list.append(predictions.assign(name=name))

    # Persist the combined table, then read it back as the working copy.
    feather_file = f"{tankbind_data_path}/p2rank_result.feather"
    pd.concat(d_list).reset_index(drop=True).to_feather(feather_file)
    d = pd.read_feather(feather_file)

    # One pocket table per PDB, re-indexed from 0.
    pockets_dict = {
        name: d[d.name == name].reset_index(drop=True)
        for name in tqdm(pdb_list)
    }

    torch.save(pockets_dict, "/home/jovyan/dataspace/Prototype/dicts/FULL_pockets_dict.pt")

  0%|          | 0/9092 [00:00<?, ?it/s]

In [75]:
processed = True
if not processed:
    import mlcrate as mlc
    import torch
    from Bio.PDB import PDBParser
    from feature_utils import get_clean_res_list
    from feature_utils import get_protein_feature

    protein_embedding_folder = f"{tankbind_data_path}/gvp_protein_embedding"
    os.system(f"mkdir -p {protein_embedding_folder}")

    # One work item per protein: (pdb id, input PDB file, output .pt file).
    input_ = [
        (pdb, f"{pdbbind_path}/{pdb}/{pdb}_protein.pdb", f"{protein_embedding_folder}/{pdb}.pt")
        for pdb in tqdm(pdb_list)
    ]

    # Presumably limits each worker to one torch thread so the 64 pool
    # workers do not oversubscribe the CPU - confirm.
    torch.set_num_threads(1)

    def batch_run(x):
        """Compute the features of one protein and save them to its own file.

        x is a (pdb, proteinFile, toFile) tuple. The structure is parsed from
        proteinFile, reduced to a cleaned residue list and converted with
        `get_protein_feature`; the single-entry dict {pdb: feature} is written
        to toFile with `torch.save`. Returns None.
        """
        protein_dict = {}
        pdb, proteinFile, toFile = x
        parser = PDBParser(QUIET=True)
        s = parser.get_structure(pdb, proteinFile)
        res_list = get_clean_res_list(s.get_residues(), verbose=False, ensure_ca_exist=True)
        protein_dict[pdb] = get_protein_feature(res_list)
        torch.save(protein_dict, toFile)

    pool = mlc.SuperPool(64)
    pool.pool.restart()
    _ = pool.map(batch_run, input_)
    pool.exit()

    # Merge the per-protein files into one dict.
    # Fixed: the original iterated over `name_list`, which is never defined;
    # the files were written for the entries of `pdb_list`.
    protein_dict = {}
    for pdb in tqdm(pdb_list):
        protein_dict.update(torch.load(f"{protein_embedding_folder}/{pdb}.pt"))

    torch.save(protein_dict, "/home/jovyan/dataspace/Prototype/dicts/FULL_protein_dict.pt")




#### 「折叠」读取已保存的 `protein_dict` 和 `pockets_dict`

In [10]:
processed = True
if not processed:
    # Restore the dictionaries saved by the feature-building cells.
    tankbind_data_path = "/home/jovyan/dataspace/Prototype/tankbind_data"
    protein_dict, pockets_dict = (
        torch.load("/home/jovyan/dataspace/Prototype/dicts/FULL_protein_dict.pt"),
        torch.load("/home/jovyan/dataspace/Prototype/dicts/FULL_pockets_dict.pt"),
    )

### 「折叠」小分子：特征构建

In [11]:
processed = True
if not processed:
    from feature_utils import extract_torchdrug_feature_from_mol
    from feature_utils import read_mol

    compound_dict, skip_smiles_list = {}, []
    for smiles_name in tqdm(multiton_table.kekule_smiles_name.unique()):
        sdf_file = f"/home/jovyan/dataspace/Prototype/rdkit_generated_sdfs/{smiles_name}.sdf"
        mol, _ = read_mol(sdf_file, None)
        # Featurise the molecule read from the SDF; names that fail are
        # reported and remembered so their rows can be dropped later.
        try:
            features = extract_torchdrug_feature_from_mol(mol, has_LAS_mask=True)  # self-dock set has_LAS_mask to true
        except Exception as e:
            print(smiles_name, e)
            skip_smiles_list.append(smiles_name)
        else:
            compound_dict[smiles_name] = features

  0%|          | 0/113099 [00:00<?, ?it/s]



smiles_5544 max(): Expected reduction dim to be specified for input.numel() == 0. Specify the reduction dim with the 'dim' argument.




smiles_39785 max(): Expected reduction dim to be specified for input.numel() == 0. Specify the reduction dim with the 'dim' argument.
smiles_39786 max(): Expected reduction dim to be specified for input.numel() == 0. Specify the reduction dim with the 'dim' argument.
smiles_39787 max(): Expected reduction dim to be specified for input.numel() == 0. Specify the reduction dim with the 'dim' argument.
smiles_39788 max(): Expected reduction dim to be specified for input.numel() == 0. Specify the reduction dim with the 'dim' argument.
smiles_40760 max(): Expected reduction dim to be specified for input.numel() == 0. Specify the reduction dim with the 'dim' argument.
smiles_40762 max(): Expected reduction dim to be specified for input.numel() == 0. Specify the reduction dim with the 'dim' argument.
smiles_40763 max(): Expected reduction dim to be specified for input.numel() == 0. Specify the reduction dim with the 'dim' argument.
smiles_40764 max(): Expected reduction dim to be specified for

```
skip_smiles_list = ['smiles_5544', 'smiles_39785', 'smiles_39786', 'smiles_39787', 'smiles_39788', 'smiles_40760', 'smiles_40762', 'smiles_40763', 'smiles_40764', 'smiles_40766', 'smiles_43789']
```

In [12]:
processed = True
if not processed:
    # Persist the per-compound feature dictionary.
    compound_features_file = "/home/jovyan/dataspace/Prototype/dicts/FULL_compound_torchdrug_features.pt"
    torch.save(compound_dict, compound_features_file)

In [14]:
processed = True
if not processed:
    # Remove the rows whose compound could not be featurised, then save.
    is_skipped = multiton_table.kekule_smiles_name.isin(skip_smiles_list)
    multiton_table = multiton_table[~is_skipped]
    multiton_table.to_csv("/home/jovyan/dataspace/Prototype/tables/FULLRAW_multiton_table_v2.csv")

In [15]:
processed = True
if not processed:
    table_v2_file = "/home/jovyan/dataspace/Prototype/tables/FULLRAW_multiton_table_v2.csv"
    multiton_table = pd.read_csv(table_v2_file, index_col=0)
    # Per-compound session key: "<session_au>_<kekule_smiles>".
    multiton_table['session_aus'] = multiton_table['session_au'] + "_" + multiton_table['kekule_smiles']
    data = multiton_table.copy()

  multiton_table = pd.read_csv("/home/jovyan/dataspace/Prototype/tables/multiton_table_v2.csv", index_col=0)


### 表格处理

#### 导入分组文件以进行 `train-ood-iid-val` 分组

In [66]:
processed = True
if not processed:
    drugood_split = pd.read_csv("/home/jovyan/dataspace/Prototype/tables/non_drugood_split2.csv").fillna("-")
    # Rows that had no split tag (now "-") are assigned to "train".
    drugood_split["split_tag"] = drugood_split["split_tag"].apply(lambda tag: "train" if tag == "-" else tag)
    # Same key layout as `session_aus` in the multiton table.
    drugood_split["session_aus"] = drugood_split['assay_id'].astype(str) + "_" + drugood_split['uniprot_id'] + "_" + drugood_split['kekule_smiles']
    drugood_split = drugood_split[['session_aus', 'split_tag']]

    # Attach the split tag to every multiton row and save the result.
    data = pd.merge(left=multiton_table, right=drugood_split, how="left", on="session_aus")
    data.to_csv("/home/jovyan/dataspace/Prototype/tables/FULLRAW_info.csv")

#### 读取 `raw_info.csv` 为 `data`

In [94]:
processed = True
if not processed:
    # Reload the merged table written by the split-tag cell.
    raw_info_file = "/home/jovyan/dataspace/Prototype/tables/FULLRAW_info.csv"
    data = pd.read_csv(raw_info_file, index_col=0)

#### 处理`data`生成 `info` 并保存

In [128]:
processed = True
if not processed:
    # Columns copied from `data` in front of / behind the pocket columns.
    leading_fields = ('sample_id', 'pdb_id', 'uniprot_id', 'assay_id',
                      'kekule_smiles_name', 'kekule_smiles', 'session_au', 'session_aus')
    trailing_fields = ('value', 'docking_score', 'mmgbsa_binding_energy', 'qed', 'split_tag')

    info = []
    err_pdb_list = []
    for i, line in tqdm(data.iterrows(), total=data.shape[0]):
        protein_name = line['pdb_id'].lower()
        try:
            # At most the first 20 pocket rows of the protein are used.
            pocket = pockets_dict[protein_name].head(20)
        except KeyError:
            # Fixed: the original appended `pdb`, a stale variable left over
            # from an earlier cell, rather than the protein of this row.
            err_pdb_list.append(protein_name)
            continue
        pocket.columns = pocket.columns.str.strip()
        pocket_coms = pocket[['center_x', 'center_y', 'center_z']].values

        leading = [line[field] for field in leading_fields]
        trailing = [line[field] for field in trailing_fields]

        # protein center as a block.
        protein_com = protein_dict[protein_name][0].numpy().mean(axis=0).astype(float).reshape(1, 3)
        info.append(leading + [protein_com, protein_name + "_c"] + trailing)

        # One additional row per pocket, named "<protein>_<pocket position>".
        for idx, pocket_com in enumerate(pocket_coms):
            info.append(leading + [pocket_com.reshape(1, 3), f"{protein_name}_{idx}"] + trailing)

    info = pd.DataFrame(
        info,
        columns=['sample_id', 'pdb_id', 'uniprot_id', 'assay_id', 'smiles_name', 'kekule_smiles', 'session_au', 'session_aus', 'pocket_com', 'pocket_name',
                 'value', 'docking_score', 'mmgbsa_binding_energy', 'qed', 'split_tag'])
    print(len(info))

    # The recorded run skipped no protein (the list was empty); report it if
    # that ever changes instead of dropping rows silently.
    if err_pdb_list:
        print("proteins without pockets:", len(err_pdb_list))

    info.to_csv("/home/jovyan/dataspace/Prototype/tables/FULLRAW_info_all_pockets.csv")

  0%|          | 0/695316 [00:00<?, ?it/s]

5886176


#### 读取 `info_all_pockets` 为 `info`

In [74]:
processed = True
if not processed:
    # Reload the all-pockets table written by the previous cell.
    all_pockets_file = "/home/jovyan/dataspace/Prototype/tables/FULLRAW_info_all_pockets.csv"
    info = pd.read_csv(all_pockets_file, index_col=0)

0

#### 使用此前生成的 `ligand_coor_dict` 来指定口袋，生成 `pdb_id_to_pocket_name_dict`

In [101]:
processed = True
if not processed:
    ligand_coor_dict = torch.load("/home/jovyan/dataspace/Prototype/dicts/ARC_ligand_coor_dict.pt")

    # For every PDB, pick the pocket name whose centre is closest (squared
    # Euclidean distance) to the first entry stored for that PDB.
    pdb_id_to_pocket_name_dict = {}
    error_pdb_id_list = []
    for PDB_id in tqdm(ligand_coor_dict, total=len(ligand_coor_dict)):
        try:
            pdb_id = PDB_id.lower()
            ligand_center = ligand_coor_dict[PDB_id][0]
            p2rankFile = f"/home/jovyan/dataspace/Prototype/p2rank_results/{pdb_id}_protein.pdb_predictions.csv"
            pocket = pd.read_csv(p2rankFile)
            pocket.columns = pocket.columns.str.strip()
            pocket_coms = pocket[['center_x', 'center_y', 'center_z']].values
            # Candidate 0 is the mean of all pocket centres, named "<pdb>_c".
            # NOTE(review): the info table builds its "<pdb>_c" row from the
            # protein coordinates in protein_dict instead - confirm that the
            # two definitions are meant to differ.
            pocket_coms_with_center = np.concatenate([pocket_coms.mean(axis=0).reshape(1, 3), pocket_coms])
            pocket_names = [pdb_id + "_c"] + [pdb_id + "_" + str(i) for i in range(len(pocket_coms))]
            right_ith = ((pocket_coms_with_center - ligand_center) ** 2).sum(axis=1).argmin()
            right_name = pocket_names[right_ith]
            pdb_id_to_pocket_name_dict[PDB_id] = right_name
        except Exception:
            # `Exception` rather than a bare `except:` so that Ctrl-C still
            # interrupts the loop; failed entries are marked "ERROR".
            print(PDB_id, end=" ")
            error_pdb_id_list.append(PDB_id)
            pdb_id_to_pocket_name_dict[PDB_id] = "ERROR"

    torch.save(pdb_id_to_pocket_name_dict, "/home/jovyan/dataspace/Prototype/dicts/FULL_pdb_id_to_pocket_name_dict.pt")
    torch.save(error_pdb_id_list, "/home/jovyan/dataspace/Prototype/dicts/FULL_pdb_id_to_pocket_name_error_list.pt")


#### 读取`pdb_id_to_pocket_name_dict`，并结合 `info_all_pockets` 构造出仅保留单个 `pocket` 的 `info_core_pocket`。
错误的蛋白因为已标注为 `ERROR`，会因为没有名为 `ERROR` 的口袋而被略去。

In [129]:
processed = True
if not processed:
    # Path fixed to the file the pocket-selection cell actually writes
    # (dicts/FULL_pdb_id_to_pocket_name_dict.pt); the original pointed at a
    # differently named file outside `dicts/`.
    pdb_id_to_pocket_name_dict = torch.load("/home/jovyan/dataspace/Prototype/dicts/FULL_pdb_id_to_pocket_name_dict.pt")
    # True on the row whose pocket name is the one selected for its PDB.
    info['p2rank_core_pocket'] = info.apply(lambda x: pdb_id_to_pocket_name_dict[x.pdb_id] == x.pocket_name, axis=1)

    info.to_csv("/home/jovyan/dataspace/Prototype/tables/FULLRAW_info_all_pockets_noted.csv")

    # Keep only the selected pocket of every sample.
    info = info[info.p2rank_core_pocket]
    info.to_csv("/home/jovyan/dataspace/Prototype/tables/FULLRAW_info_core_pocket.csv")

#### 读取 `info_core_pocket`，该表格被用于构建最终数据集。

In [None]:
processed = True
if not processed:
    # File name fixed to match what the core-pocket cell writes
    # (FULLRAW_info_core_pocket.csv); the original read "info_core_pocket.csv".
    info = pd.read_csv("/home/jovyan/dataspace/Prototype/tables/FULLRAW_info_core_pocket.csv", index_col=0)

针对 `val / test` 组的数据，额外提供一个根据 docking_score 预先指定口袋的 info_val_test_best_docking_score

这一步在后续构建两个数据集时也进行了，所以可以不管这一格

In [134]:
processed = True
if not processed:
    from pandas import DataFrame
    # Everything that is not tagged "train".
    non_train = info[info.split_tag != "train"]
    # Within each session_aus group keep the top row after sorting by
    # docking_score in ascending order.
    by_score = non_train.sort_values('docking_score')
    best_per_session = by_score.groupby('session_aus').apply(DataFrame.head, n=1)
    info_val_test = best_per_session.reset_index(drop=True)
    info_val_test.to_csv("/home/jovyan/dataspace/Prototype/tables/info_extra_for_val_and_test.csv")

针对 info 和 info_val_test 进行额外的 session_au_usability_check

In [156]:
processed = True
if not processed:
    # Distinct values per session_au for the listed columns.
    info_count = (
        info[['session_au', 'kekule_smiles', 'pdb_id']]
        .groupby('session_au')
        .nunique()
        .reset_index()
    )
    has_single_smiles = info_count.kekule_smiles == 1
    info_singleton = list(info_count[has_single_smiles].session_au)
    info_multiton = list(info_count[~has_single_smiles].session_au)

    # Label every session and keep only the "multiton" rows.
    ms_dict = dict.fromkeys(info_singleton, "singleton")
    ms_dict.update(dict.fromkeys(info_multiton, "multiton"))
    info['session_au_usability'] = info.session_au.apply(lambda session: ms_dict[session])
    info = info[info.session_au_usability == "multiton"]

    info.to_csv("/home/jovyan/dataspace/Prototype/tables/FULL_info_core_pocket_multiton.csv")
    info.to_pickle("/home/jovyan/dataspace/Prototype/tables/FULL_info_core_pocket_multiton.pkl")

In [158]:
processed = True
if not processed:
    # Distinct values per session_au for the listed columns.
    info_count = (
        info_val_test[['session_au', 'kekule_smiles', 'pdb_id']]
        .groupby('session_au')
        .nunique()
        .reset_index()
    )
    has_single_smiles = info_count.kekule_smiles == 1
    info_singleton = list(info_count[has_single_smiles].session_au)
    info_multiton = list(info_count[~has_single_smiles].session_au)

    # Label every session and keep only the "multiton" rows.
    ms_dict = dict.fromkeys(info_singleton, "singleton")
    ms_dict.update(dict.fromkeys(info_multiton, "multiton"))
    info_val_test['session_au_usability'] = info_val_test.session_au.apply(lambda session: ms_dict[session])
    info_val_test = info_val_test[info_val_test.session_au_usability == "multiton"]

    info_val_test.to_csv("/home/jovyan/dataspace/Prototype/tables/info_extra_for_val_and_test_multiton.csv")

# <font color='gold'>「从这里开始」</font>数据集构建

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import os
import torch
# Root folder of the PDBbind2020 entries.
pdbbind_path = "/home/jovyan/dataspace/PDBbind2020/all_pdbbind"
# Root folder of the processed artefacts.
# NOTE(review): the cells below hard-code this path instead of using the variable.
proto_path = "/home/jovyan/dataspace/Prototype"
# Shell command that launches p2rank (not invoked by the cells below).
p2rank = "bash /home/jovyan/TankBind/p2rank_2.3/prank"
# Folder holding the tankbind_prototype sources, relative to this notebook.
tankbind_src_folder_path = "../tankbind_prototype/"
import sys
# Searched first on import; presumably this is where `data_prototype` lives -
# confirm against the repository layout.
sys.path.insert(0, tankbind_src_folder_path)

In [3]:
# Load the precomputed protein and compound feature dictionaries.
tankbind_data_path = "/home/jovyan/dataspace/Prototype/tankbind_data"
protein_dict, compound_dict = (
    torch.load("/home/jovyan/dataspace/Prototype/dicts/FULL_protein_dict.pt"),
    torch.load("/home/jovyan/dataspace/Prototype/dicts/FULL_compound_torchdrug_features.pt"),
)

读取 `info` 

In [8]:
info_pickle = "/home/jovyan/dataspace/Prototype/tables/FULL_info_core_pocket_multiton.pkl"
info = pd.read_pickle(info_pickle)
# Session type: same assay + same pdb id, or same assay + same uniprot id.
# session_ap  = "<assay_id>_<pdb_id>"
# session_aps = "<session_ap>_<kekule_smiles>"
assay_as_text = info['assay_id'].astype(str)
info["session_ap"] = assay_as_text + "_" + info['pdb_id']
info["session_aps"] = info["session_ap"] + "_" + info["kekule_smiles"]

In [5]:
from data_prototype import TankBindDataSet_prototype

# Start from an empty dataset folder, then build the dataset from the info
# table and the two feature dictionaries.
toFileFull = "/home/jovyan/main_tankbind/dataset_prototype/full/dataset"
for shell_command in (f"rm -rf {toFileFull}", f"mkdir -p {toFileFull}"):
    os.system(shell_command)
dataset = TankBindDataSet_prototype(
    toFileFull,
    data=info,
    protein_dict=protein_dict,
    compound_dict=compound_dict,
)

Processing...
Done!


['/home/jovyan/main_tankbind/dataset_prototype/full/dataset/processed/data.pt', '/home/jovyan/main_tankbind/dataset_prototype/full/dataset/processed/protein.pt', '/home/jovyan/main_tankbind/dataset_prototype/full/dataset/processed/compound.pt']


In [None]:
# Record, for every sample of the dataset, the first-dimension sizes of its
# 'node_xyz', 'coords' and 'y' entries, save them in several formats, and
# append them as columns to the dataset's table.
data = dataset.data
length_columns = ['p_length', 'c_length', 'y_length']

t = []
t_dict = {}
for i, line in tqdm(data.iterrows(), total=data.shape[0]):
    sample = dataset[i]
    sample_id = line['sample_id']
    lengths = [sample[key].shape[0] for key in ('node_xyz', 'coords', 'y')]
    t.append([i, sample_id] + lengths)
    t_dict[sample_id] = [i, sample_id] + lengths

torch.save(t, "/home/jovyan/dataspace/Prototype/tables/FULL_supplementary.pt")
torch.save(t_dict, "/home/jovyan/dataspace/Prototype/dicts/FULL_supplementary_dict.pt")

t = pd.DataFrame(t, columns=['index', 'sample_id'] + length_columns)
t.to_csv("/home/jovyan/dataspace/Prototype/tables/FULL_supplementary.csv")

data = pd.concat([data, t[length_columns]], axis=1)
torch.save(data, f"{toFileFull}/processed/data.pt")

  0%|          | 0/691468 [00:00<?, ?it/s]

In [7]:
# Alternative to recomputing the lengths: reload the saved table and append
# its length columns to the dataset's table.
data = dataset.data
# File name fixed to the one the length-computation cell writes
# (FULL_supplementary.csv); the original read "FULL_t.csv".
t = pd.read_csv("/home/jovyan/dataspace/Prototype/tables/FULL_supplementary.csv", index_col=0)
data = pd.concat([data, t[['p_length', 'c_length', 'y_length']]], axis=1)
torch.save(data, f"{toFileFull}/processed/data.pt")

### 验证集和测试集

In [4]:
def _parse_pocket_com(text):
    """Convert the CSV text of a pocket centre ("[[x y z]]") to a float array."""
    bare = text.replace("[[", "").replace("]]", "")
    return np.array([float(token) for token in bare.split()])


info = pd.read_csv("/home/jovyan/dataspace/Prototype/tables/FULL_info_core_pocket_multiton.csv", index_col=0)
# Session type: same assay + same pdb id, or same assay + same uniprot id.
info["session_ap"] = info['assay_id'].astype(str) + "_" + info['pdb_id']
info["session_aps"] = info["session_ap"] + "_" + info["kekule_smiles"]
info.reset_index(inplace=True, drop=True)
# The CSV stores pocket_com as text; turn it back into numbers.
info["pocket_com"] = info["pocket_com"].apply(_parse_pocket_com)
# Validation / test rows only.
is_eval_row = info.split_tag.isin(["test", 'iid_val', 'ood_val'])
info2 = info[is_eval_row].reset_index(drop=True)

同 `aus` 样本，不同 `pdb` 中抽取 `docking_score` 最低的一条。

In [6]:
# For every session_aus keep the single row with the lowest docking_score
# (rows with a missing score sort last).
# The original used groupby(...).first(), which returns the first NON-NULL
# value of each column separately: when the best row had a missing value in
# some column, that cell was silently filled from a different row (possibly a
# different PDB). drop_duplicates keeps the best row intact.
info3 = (
    info2.sort_values(by='docking_score', ascending=True)
    .dropna(subset=['session_aus'])  # groupby also ignored rows without a key
    .drop_duplicates(subset='session_aus', keep='first')
    .sort_values(by='session_aus')  # same row order as the grouped result
    .reset_index(drop=True)
)
# Keep the column layout of the grouped result: key column first.
info3 = info3[['session_aus'] + [column for column in info3.columns if column != 'session_aus']]
info3.to_pickle("/home/jovyan/dataspace/Prototype/tables/EXTRA_info_sole_pdb.pkl")

In [9]:
# Restrict the feature dictionaries to the proteins / compounds that occur in
# the validation and test rows.
small_pdbs = [pdb_id.lower() for pdb_id in info2.pdb_id.unique()]
small_smiles = list(info2["smiles_name"].unique())

protein_dict2 = {}
for pdb_key in small_pdbs:
    protein_dict2[pdb_key] = protein_dict[pdb_key]

compound_dict2 = {}
for smiles_key in small_smiles:
    compound_dict2[smiles_key] = compound_dict[smiles_key]

In [10]:
from data_prototype import TankBindDataSet_prototype

# Start from an empty folder, then build the reduced validation/test dataset.
toFileExtra = "/home/jovyan/main_tankbind/dataset_prototype/extra_val_test_reduced_0130"
for shell_command in (f"rm -rf {toFileExtra}", f"mkdir -p {toFileExtra}"):
    os.system(shell_command)

dataset = TankBindDataSet_prototype(
    toFileExtra,
    data=info3,
    protein_dict=protein_dict2,
    compound_dict=compound_dict2,
)

Processing...
Done!


['/home/jovyan/main_tankbind/dataset_prototype/extra_val_test_reduced_0130/processed/data.pt', '/home/jovyan/main_tankbind/dataset_prototype/extra_val_test_reduced_0130/processed/protein.pt', '/home/jovyan/main_tankbind/dataset_prototype/extra_val_test_reduced_0130/processed/compound.pt']


In [11]:
# Record, for every sample of the dataset, the first-dimension sizes of its
# 'node_xyz', 'coords' and 'y' entries, save them in several formats, and
# append them as columns to the dataset's table.
data = dataset.data
length_columns = ['p_length', 'c_length', 'y_length']

t = []
t_dict = {}
for i, line in tqdm(data.iterrows(), total=data.shape[0]):
    sample = dataset[i]
    sample_id = line['sample_id']
    lengths = [sample[key].shape[0] for key in ('node_xyz', 'coords', 'y')]
    t.append([i, sample_id] + lengths)
    t_dict[sample_id] = [i, sample_id] + lengths

torch.save(t, "/home/jovyan/dataspace/Prototype/tables/EXTRA_supplementary.pt")
torch.save(t_dict, "/home/jovyan/dataspace/Prototype/dicts/EXTRA_supplementary_dict.pt")

t = pd.DataFrame(t, columns=['index', 'sample_id'] + length_columns)
t.to_csv("/home/jovyan/dataspace/Prototype/tables/EXTRA_supplementary.csv")

data = pd.concat([data, t[length_columns]], axis=1)
torch.save(data, f"{toFileExtra}/processed/data.pt")

  0%|          | 0/32631 [00:00<?, ?it/s]