# README

#### 放置数据
- 将 本笔记本 和 p2rank_2.3 文件夹置于 TankBind/example 下（p2rank_2.3的路径可在下一格的 Input configuration 中修改）

- 在 TankBind/example 下建立文件夹 inputs 文件夹 （路径可修改）

- 在 input 蛋白质对应的 .pdb 文件

    - .pdb 文件：名字格式为：`f"{pdb_name}".pdb`，此`pdb_name`参数在下文填充。例如："7kac.pdb"

#### 小分子构象文件
生成的小分子构象文件中，若文件名
- 包含 `_AfGv_` 字段，说明手动指定的口袋和由 TankBind 依据最高 Affinity 选择的口袋为同一口袋

否则，将会分别生成：
- 包含中间字段 `_Af_` 的构象：由 TankBind 选择的口袋
- 包含中间字段 `_Gv_` 的构象：由`given_pocket`参数指定的口袋

# Input configuration
需在此指定 .csv 文件的名称(和此前的 DownloadData 文件相配合）。
所需的 .pdb 文件仍需要手动上传到下述 inputs 文件夹中。

`conf_by_chosen` 参数：
- 若为 `True`，则 TankBind 会依据预测出的最高 Affinity 选择对应口袋并生成构象。
- 当 `given_pocket = 0`时，此参数会被修正为 `True`.

`distinguish_by_timestamp` 参数：若为 `True`，则 输出文件夹名字会额外包含一个时间戳后缀。




In [143]:
pdb_name = "6scm" # f'{pdb_name}.pdb' file in the inputs folder will be used.
target_name = "SOS1"  # "target" in the .csv file
csv_name = "all_middle.test.csv"

given_pocket = 0
#given_pocket = "pocket_8" # or int 0

input_path = "./inputs" # for .pdb files 
base_pre = f"./PREDICTION" # for results

conf_by_chosen = True
# if false and "given_pocket" is 0, only the conformations relating to the given_pocket will be generated.

distinguish_by_timestamp = True 
# if true, the output folder name will be suffiexed with a timestamp 


p2rank = "bash ./p2rank_2.3/prank" # p2rank file path

<font color="blue" size=5>需在此指定 label_data 的路径</font>

In [144]:
path_label_data = "../../sar-3d-pharam/data/label_data/"
path_data_folder = "SOS1 HPK1" # or "" if the .csv file is placed in label_data folder


    # label_data*
    # - HPK1 SOS1*
    #     - all_middle.test.csv
    #     - all_middle.train.csv
    #     - ...
    # - PRTM5*
    #     - all_middle.test.csv
    #     - all_middle.train.csv
    #     - ...
    # - all_middle.test.csv
    # - all_middle.train.csv
    # - ...

path_data = path_label_data + path_data_folder
import os
os.listdir(path_data)
    # You should see something like this:
    #   ['all_middle.test.csv',
    #    'all_middle.train.csv',
    #    ...
    # The SMILES in the 'all_middle.text.csv' file will be used ：

['.ipynb_checkpoints',
 'all_middle.test.csv',
 'all_middle.train.csv',
 'all_ndcg.test.csv',
 'all_ndcg.train.csv',
 'all_pair.test.csv',
 'all_pair.train.csv',
 'all_top.test.csv',
 'all_top.train.csv']

# Start

In [145]:
cp_name = target_name.replace("_","")
if (given_pocket == 0) and (conf_by_chosen == False):
    conf_by_chosen = True

tankbind_src_folder_path = "../tankbind/"
import sys
sys.path.insert(0, tankbind_src_folder_path)
from Bio.PDB.PDBList import PDBList   # pip install biopython if import failure
import os
import numpy as np
import pandas as pd
import time

In [146]:
if distinguish_by_timestamp:
    timetag = time.strftime("%m%d%H%M")
    pre = f"{base_pre}/{cp_name}-{pdb_name}-{timetag}"
else:
    pre = f"{base_pre}/{cp_name}-{pdb_name}"
pdir = f"{pre}/PDBs/"
os.system(f"mkdir -p {base_pre}")
os.system(f"rm -rf {pre}/sdfs")
os.system(f"mkdir -p {pre}/sdfs")
os.system(f"rm -rf {pre}/PDBs")
os.system(f"mkdir -p {pre}/PDBs")
os.system(f"rm -rf {pre}/p2rank")
os.system(f"mkdir -p {pre}/p2rank")


0

In [147]:
pdb = f'{pdb_name}'
os.system(f"mkdir -p {pdir}")
pdbl = PDBList()
native_pdb = f"./inputs/{pdb_name}.pdb"

from Bio.PDB import PDBParser
parser = PDBParser(QUIET=True)
s = parser.get_structure(pdb, native_pdb)



In [148]:
proteinFile = f"./inputs/{pdb_name}.pdb"

In [149]:
import rdkit.Chem as Chem
from feature_utils import generate_sdf_from_smiles_using_rdkit

In [150]:
input_dataframe = pd.read_csv(f'{path_data}/{csv_name}')
input_dataframe = input_dataframe[input_dataframe["target"]==target_name][["id_in_patent","smiles","target"]]
input_dataframe

In [151]:
smilesdict = dict(zip(input_dataframe['id_in_patent'], input_dataframe['smiles']))
for _key in smilesdict.keys():
    rdkitMolFile = f"{pre}/sdfs/{pdb}_{_key}_mol_from_rdkit.sdf"
    shift_dis = 0   # for visual only, could be any number, shift the ligand away from the protein.
    generate_sdf_from_smiles_using_rdkit(smilesdict[_key], rdkitMolFile, shift_dis=shift_dis)

# get protein feature

In [152]:
from feature_utils import get_protein_feature

In [153]:
parser = PDBParser(QUIET=True)
s = parser.get_structure("x", proteinFile)
res_list = list(s.get_residues())

In [154]:
protein_dict = {}
protein_dict[pdb] = get_protein_feature(res_list)


# get compound feature

In [155]:
from feature_utils import extract_torchdrug_feature_from_mol

In [156]:
compound_dict = {}
for ligandName in smilesdict.keys():
    rdkitMolFile = f"{pre}/sdfs/{pdb}_{ligandName}_mol_from_rdkit.sdf"
    mol = Chem.MolFromMolFile(rdkitMolFile)
    compound_dict[pdb+f"_{ligandName}"+"_rdkit"] = extract_torchdrug_feature_from_mol(mol, has_LAS_mask=True)

# p2rank

In [157]:
pdb_list = [pdb]
ds = f"{pre}/protein_list.ds"
with open(ds, "w") as out:
    for pdb in pdb_list:
        out.write(f"../../{input_path}/{pdb_name}.pdb\n")


In [158]:
cmd = f"{p2rank} predict {ds} -o {pre}/p2rank -threads 1"
os.system(cmd)

----------------------------------------------------------------------------------------------
 P2Rank 2.3
----------------------------------------------------------------------------------------------

predicting pockets for proteins from dataset [protein_list.ds]
processing [6scm.pdb] (1/1)
predicting pockets finished in 0 hours 0 minutes 10.759 seconds
results saved to directory [/home/jovyan/TankBind/examples/PREDICTION/SOS1-6scm-07250349/p2rank]

----------------------------------------------------------------------------------------------
 finished successfully in 0 hours 0 minutes 11.580 seconds
----------------------------------------------------------------------------------------------


0

In [159]:
info = []
for pdb in pdb_list:
    for compound_name in list(compound_dict.keys()):
        # use protein center as the block center.
        com = ",".join([str(a.round(3)) for a in protein_dict[pdb][0].mean(axis=0).numpy()])
        info.append([pdb, compound_name, "protein_center", com])

        p2rankFile = f"{pre}/p2rank/{pdb}.pdb_predictions.csv"
        pocket = pd.read_csv(p2rankFile)
        pocket.columns = pocket.columns.str.strip()
        pocket_coms = pocket[['center_x', 'center_y', 'center_z']].values
        for ith_pocket, com in enumerate(pocket_coms):
            com = ",".join([str(a.round(3)) for a in com])
            info.append([pdb, compound_name, f"pocket_{ith_pocket+1}", com])
info = pd.DataFrame(info, columns=['protein_name', 'compound_name', 'pocket_name', 'pocket_com'])
info

Unnamed: 0,protein_name,compound_name,pocket_name,pocket_com
0,6scm,6scm_0_rdkit,protein_center,"8.736,-15.087,-22.089"
1,6scm,6scm_0_rdkit,pocket_1,"8.709,-24.923,-34.936"
2,6scm,6scm_0_rdkit,pocket_2,"14.982,-43.708,-30.367"
3,6scm,6scm_0_rdkit,pocket_3,"4.658,-7.672,-15.484"
4,6scm,6scm_0_rdkit,pocket_4,"1.534,-35.549,-43.763"
...,...,...,...,...
1185,6scm,6scm_118_rdkit,pocket_5,"7.982,19.069,-15.421"
1186,6scm,6scm_118_rdkit,pocket_6,"-1.262,-16.002,-29.16"
1187,6scm,6scm_118_rdkit,pocket_7,"3.006,3.753,-22.598"
1188,6scm,6scm_118_rdkit,pocket_8,"15.256,-6.178,-31.047"


In [161]:
p2rankFile = f"{pre}/p2rank/{pdb_name}.pdb_predictions.csv"
pocket = pd.read_csv(p2rankFile)

# construct dataset

In [162]:
import torch
torch.set_num_threads(1)

In [163]:
from data import TankBind_prediction

In [164]:
dataset_path = f"{pre}/{pdb}_dataset/"
os.system(f"rm -r {dataset_path}")
os.system(f"mkdir -p {dataset_path}")
dataset = TankBind_prediction(dataset_path, data=info, protein_dict=protein_dict, compound_dict=compound_dict)

rm: cannot remove './PREDICTION/SOS1-6scm-07250349/6scm_dataset/': No such file or directory
Processing...


['PREDICTION/SOS1-6scm-07250349/6scm_dataset/processed/data.pt', 'PREDICTION/SOS1-6scm-07250349/6scm_dataset/processed/protein.pt', 'PREDICTION/SOS1-6scm-07250349/6scm_dataset/processed/compound.pt']


Done!


In [165]:
dataset_path = f"{pre}/{pdb}_dataset/"
dataset = TankBind_prediction(dataset_path)

['PREDICTION/SOS1-6scm-07250349/6scm_dataset/processed/data.pt', 'PREDICTION/SOS1-6scm-07250349/6scm_dataset/processed/protein.pt', 'PREDICTION/SOS1-6scm-07250349/6scm_dataset/processed/compound.pt']


In [166]:
import logging
from torch_geometric.loader import DataLoader
from tqdm import tqdm    # pip install tqdm if fails.
from model import get_model
# from utils import *

In [167]:
batch_size = 5
device = 'cuda' if torch.cuda.is_available() else 'cpu'
logging.basicConfig(level=logging.INFO)
model = get_model(0, logging, device)
# re-dock model
# modelFile = "../saved_models/re_dock.pt"
# self-dock model
modelFile = "../saved_models/self_dock.pt"

model.load_state_dict(torch.load(modelFile, map_location=device))
_ = model.eval()

data_loader = DataLoader(dataset, batch_size=batch_size, follow_batch=['x', 'y', 'compound_pair'], shuffle=False, num_workers=8)
affinity_pred_list = []
y_pred_list = []
for data in tqdm(data_loader):
    data = data.to(device)
    y_pred, affinity_pred = model(data)
    affinity_pred_list.append(affinity_pred.detach().cpu())
    for i in range(data.y_batch.max() + 1):
        y_pred_list.append((y_pred[data['y_batch'] == i]).detach().cpu())

affinity_pred_list = torch.cat(affinity_pred_list)

03:50:45   5 stack, readout2, pred dis map add self attention and GVP embed, compound model GIN
Parameter containing:
tensor([1.], requires_grad=True)


100%|██████████| 238/238 [00:14<00:00, 16.60it/s]


In [168]:
info = dataset.data
info['affinity'] = affinity_pred_list

In [169]:
info.to_csv(f"{pre}/result_info_all.csv")

In [170]:
chosen = info.loc[info.groupby(['protein_name', 'compound_name'],sort=False)['affinity'].agg('idxmax')].reset_index()
chosen.to_csv(f"{pre}/result_chosen_by_affinity.csv")

In [171]:
if given_pocket:
    info_right_pocket = info[info["pocket_name"]==given_pocket].reset_index()
    info_right_pocket.to_csv(f"{pre}/result_given_pocket.csv")

# from predicted interaction distance map to sdf

In [172]:
import matplotlib.pyplot as plt
%matplotlib inline

In [173]:
from generation_utils import get_LAS_distance_constraint_mask, get_info_pred_distance, write_with_new_coords
device = 'cpu'
if conf_by_chosen:
    for i, line in chosen.iterrows():
        idx = line['index']
        pocket_name = line['pocket_name']
        compound_name = line['compound_name']
        ligandName = compound_name.split("_")[1]
        coords = dataset[idx].coords.to(device)
        protein_nodes_xyz = dataset[idx].node_xyz.to(device)
        n_compound = coords.shape[0]
        n_protein = protein_nodes_xyz.shape[0]
        y_pred = y_pred_list[idx].reshape(n_protein, n_compound).to(device)
        y = dataset[idx].dis_map.reshape(n_protein, n_compound).to(device)
        compound_pair_dis_constraint = torch.cdist(coords, coords)
        rdkitMolFile = f"{pre}/sdfs/{pdb}_{ligandName}_mol_from_rdkit.sdf"
        mol = Chem.MolFromMolFile(rdkitMolFile)
        LAS_distance_constraint_mask = get_LAS_distance_constraint_mask(mol).bool()
        info2 = get_info_pred_distance(coords, y_pred, protein_nodes_xyz, compound_pair_dis_constraint, 
                                      LAS_distance_constraint_mask=LAS_distance_constraint_mask,
                                      n_repeat=1, show_progress=False)

        result_folder = f'{pre}/{pdb}_result/'
        os.system(f'mkdir -p {result_folder}')
        # toFile = f'{result_folder}/{ligandName}_{pocket_name}_tankbind.sdf'
        if pocket_name == given_pocket:
            toFile = f'{result_folder}/{ligandName}_tankbind_AfGv_{pocket_name}.sdf'
        else:
            toFile = f'{result_folder}/{ligandName}_tankbind_Af_{pocket_name}.sdf'
        # print(toFile)
        new_coords = info2.sort_values("loss")['coords'].iloc[0].astype(np.double)
        write_with_new_coords(mol, new_coords, toFile)

if given_pocket:
    for i, line in info_right_pocket.iterrows():
        idx = line['index']
        pocket_name = line['pocket_name']
        if (pocket_name == given_pocket) and conf_by_chosen:
            pass
        else:
            compound_name = line['compound_name']
            ligandName = compound_name.split("_")[1]
            coords = dataset[idx].coords.to(device)
            protein_nodes_xyz = dataset[idx].node_xyz.to(device)
            n_compound = coords.shape[0]
            n_protein = protein_nodes_xyz.shape[0]
            y_pred = y_pred_list[idx].reshape(n_protein, n_compound).to(device)
            y = dataset[idx].dis_map.reshape(n_protein, n_compound).to(device)
            compound_pair_dis_constraint = torch.cdist(coords, coords)
            rdkitMolFile = f"{pre}/sdfs/{pdb}_{ligandName}_mol_from_rdkit.sdf"
            mol = Chem.MolFromMolFile(rdkitMolFile)
            LAS_distance_constraint_mask = get_LAS_distance_constraint_mask(mol).bool()
            info2 = get_info_pred_distance(coords, y_pred, protein_nodes_xyz, compound_pair_dis_constraint, 
                                          LAS_distance_constraint_mask=LAS_distance_constraint_mask,
                                          n_repeat=1, show_progress=False)

            result_folder = f'{pre}/{pdb}_result/'
            os.system(f'mkdir -p {result_folder}')
            # toFile = f'{result_folder}/{ligandName}_{pocket_name}_tankbind.sdf'
            toFile = f'{result_folder}/{ligandName}_tankbind_Gv_{pocket_name}.sdf'
            # print(toFile)
            new_coords = info2.sort_values("loss")['coords'].iloc[0].astype(np.double)
            write_with_new_coords(mol, new_coords, toFile)