# sx.nciyes.process.ipynb

Notebook for processing the input data.

- 原始数据
    - 【NCI】merged_0304.csv，NCI 总表，凡 <font color="blue">667887</font> 条 NCI 数据。
    - 【PDB & AFF】凡 <font color="blue">19443</font> 组 PDB 数据。
        - 【refined】凡 5316 个 PDB 文件夹。
        - 【refined aff】凡 5316 条活性数据，PDB_code 与前者一一对应。
        - 【general】凡 14127 个 PDB 文件夹。
        - 【general aff】凡 14127 条活性数据，PDB_code 与前者一一对应。
- 初步清洗
    - 【NCI】从 NCI 去掉 water：去掉 <font color="red">66528</font>；余 <font color="green">601359</font>；
        - 其中满足 CP_type 为 (3::-1) 的一共只有 <font color="blue">184175</font> 条。
    - 【NCI】从 NCI 去掉 pdbbind 中不包含的：去掉 <font color="red">86</font>；余 <font color="green">601273</font>；
    - 【pdb】从 pdbbind 去掉 NCI 数据中不包含的：
        - 【refined】 去掉 <font color="red">1738</font>；余 <font color="green">3578</font>；
        - 【general】 去掉 <font color="red">5113</font>；余 <font color="green">9014</font>；
- 配体名字一致性
    - 【初始】共有 <font color="blue">12592</font> 组 PDB， <font color="blue">601273</font> 条 NCI 数据。
    - 【LigName incoherent】针对同一蛋白，要求 pdbbind 中 mol2 文件的 配体名、 pdbbind index 活性文件中的 配体名、 NCI 数据中的 配体名 一致。
        - 【Coherent】共计 <font color="green">10824</font> 组 PDB 数据满足要求。涉及 <font color="green">561893</font> 条 NCI 数据。
        - 【Incoherent】共计 <font color="red">1768</font> 组 PDB 不满足要求。去掉了 <font color="red">39380</font> 条 NCI 数据。
            - ！这部分数据中有一部分经处理后或可以使用。
- 电子密度拓扑形态
    - 【初始】共有 <font color="blue">10824</font> 组 PDB， <font color="blue">561893</font> 条 NCI 数据。
    - 【NCI e topo】要求 NCI 数据中 CP_Type 为 (3::-1)。
        - 余 <font color="green">171777</font> 条 NCI；去掉了 <font color="red">390116</font> 条数据。
    - 【PDB e topo】去掉 不包含 (3::-1) NCI 的 PDB。
        - 余 <font color="green">10810</font> 组 PDB；去掉了 <font color="red">14</font> 组。
- 共计 <font color="green">171777</font> 条 NCI 数据 和 <font color="green">10810</font> 组 PDB。

In [1]:
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
# ANSI escape sequences for console messages: bold red, bold blue, and reset.
r, b, suf = "\033[1;31m", "\033[1;34m", "\033[0m"

In [3]:
# ---- Output locations (everything is written below save_path) ----
save_path = "./Inputs/"
nci_save_fpath = f"{save_path}allpdb.nci.csv"
nci_coherent_save_fpath = f"{save_path}allpdb.nci__coherent.csv"
nci_coherent_topo_save_fpath = f"{save_path}allpdb.nci__coherent.topo.csv"
pdb_df_save_fpath = f"{save_path}allpdb.pdbs.csv"
pdb_coherent_topo_df_save_fpath = f"{save_path}allpdb.pdbs__coherent.topo.csv"
ligand_df_save_fpath = f"{save_path}allpdb.ligands.3name.csv"
ligand_coherent_df_save_fpath = f"{save_path}allpdb.ligands__3name.coherent.csv"
ligand_coherent_topo_df_save_fpath = f"{save_path}allpdb.ligands__3name.coherent.topo.csv"
logs_failed_files_fpath = f"{save_path}dropped_failed_files.txt"

# ---- Input locations ----
nci_fpath = "../../data/ExptNCI/merged_0304.csv"
refined_path = "../../data/pdbbind2020/refined-set/"  # one folder per PDB code
general_path = "../../data/pdbbind2020/v2020-other-PL/"
refined_affinity_fpath = "../../data/pdbbind2020/index/INDEX_refined_data_preprocessed.2020"
general_affinity_fpath = "../../data/pdbbind2020/index/INDEX_general_PL_data_preprocessed.2020"

# CSV versions of the two index files, written next to the originals.
refined_aff_df_fpath = f"{refined_affinity_fpath}.csv"
general_aff_df_fpath = f"{general_affinity_fpath}.csv"

# Currently unused locations, kept for reference:
#hbligand_path = "../../data/HBLigand/"
#pdbs_save_path = "./Inputs/Dev/RenamedPDBBind/"
#ligands_save_path = "./Inputs/Dev/RenamedPDBBind/"

##### Manual modification record

PL other：
- Rename the columns to: PDB_code, resolution, release_year, -logKd/Ki, Kd/Ki, reference, ligand_name
- Rename ligand name "FMN hq", "FMN sq", "FMN ox", ... to "FMN-hq", "FMN-sq", "FMN-ox"
- Remove "(", ")", "//"
- Line 5512 (p 5507): from "5ot3  2.04  2018   5.23  Kd=5.86uM     5ot3.pdf 9LQ,18-mer" to "9LQ/18-mer"
- Save modified file to "INDEX_general_PL_data_preprocessed.2020"
    

refined:
- Rename the columns to: PDB_code, resolution, release_year, -logKd/Ki, Kd/Ki, reference, ligand_name
- Remove "(", ")", "//"
- Save modified file to "INDEX_refined_data_preprocessed.2020"

### util: affinity_data to .csv

In [4]:
# Generation: convert each whitespace-delimited affinity index file into a CSV
# table saved next to it as "<index file>.csv". The first non-blank line of the
# index file supplies the column names.
for _dpath, _folder in zip([refined_affinity_fpath, general_affinity_fpath], [refined_path, general_path]):
    with open(_dpath, "r") as f:
        # Blank lines carry no record; splitting them would add empty rows.
        rows = [line.split() for line in f if line.strip()]
    if not rows:
        raise ValueError(f"{_dpath} is empty; cannot build the affinity table")
    columns, records = rows[0], rows[1:]
    df = pd.DataFrame(records, columns=columns)
    # to_csv overwrites an existing file, so no prior removal is needed.
    df.to_csv(f"{_dpath}.csv")

In [5]:
# Check affinity: report every PDB folder that has no row in the affinity table
# of its own set. Both sets are checked explicitly instead of relying on the
# loop variables left over from the generation cell (which only covered the
# last set).
for _dpath, _folder in zip([refined_affinity_fpath, general_affinity_fpath], [refined_path, general_path]):
    _aff = pd.read_csv(f"{_dpath}.csv", index_col=0)
    _aff_pdbs = set(_aff.PDB_code.unique())
    for _f in sorted(os.listdir(_folder)):
        if _f not in _aff_pdbs:
            print(_f,"has no affinity data")

In [6]:
# Collect the entry names found in each PDBbind folder (one entry per PDB code).
refined_pdbs = {_entry for _entry in os.listdir(refined_path)}
general_pdbs = {_entry for _entry in os.listdir(general_path)}
all_pdbs = refined_pdbs.union(general_pdbs)

# Sizes: refined, general, their sum, and the size of the union.
(
    len(refined_pdbs),
    len(general_pdbs),
    len(refined_pdbs) + len(general_pdbs),
    len(all_pdbs),
)

(5316, 14127, 19443, 19443)

We have 5316 pdbs in refined set and 14127 in general set. The two sets are disjoint.

### NCI data processing

In [7]:
# Load the merged NCI table from disk.
nci_data = pd.read_csv(filepath_or_buffer=nci_fpath)

In [8]:
# Remove water
nci_data = nci_data[nci_data["Group"] != "water"]

# Remove NCIs whose PDB code has no PDBbind folder; keep a record of what was dropped.
_in_pdbbind = nci_data.PDB_Code.isin(all_pdbs)
drop_1 = nci_data[~_in_pdbbind]
drop_1.to_csv(f"{save_path}dropped_nci - without pdbbind files.csv")
nci_data = nci_data[_in_pdbbind].reset_index(drop=True)

# Split each PDBbind set into codes with and without NCI records.
nci_pdbs = set(nci_data.PDB_Code.unique())
refined_without_nci = refined_pdbs - nci_pdbs
general_without_nci = general_pdbs - nci_pdbs
refined_nci_pdbs = refined_pdbs - refined_without_nci
general_nci_pdbs = general_pdbs - general_without_nci
all_nci_pdbs = refined_nci_pdbs|general_nci_pdbs

# Save files. Mode "w" truncates any previous version, so no shell "rm" is
# needed; codes are sorted so the files are identical from run to run.
_pdb_list_files = {
    "dropped_refined_pdb - without nci.txt": [_code + "\n" for _code in sorted(refined_without_nci)],
    "dropped_general_pdb - without nci.txt": [_code + "\n" for _code in sorted(general_without_nci)],
    "keeped_refined_pdb.txt": [_code + "\n" for _code in sorted(refined_nci_pdbs)],
    "keeped_general_pdb.txt": [_code + "\n" for _code in sorted(general_nci_pdbs)],
    "keeped_all_pdb.txt": [_code + "\n" for _code in sorted(all_nci_pdbs)],
    "keeped_all_pdb_with_source.txt": (
        [_code + ",refined\n" for _code in sorted(refined_nci_pdbs)]
        + [_code + ",general\n" for _code in sorted(general_nci_pdbs)]
    ),
}
for _fname, _lines in _pdb_list_files.items():
    with open(f"{save_path}{_fname}", "w") as f:
        f.writelines(_lines)

In [9]:
# Per set: codes without NCI, codes with NCI, and their sum; then the overall
# number of codes with NCI.
(
    len(refined_without_nci),
    len(refined_nci_pdbs),
    len(refined_without_nci) + len(refined_nci_pdbs),
    len(general_without_nci),
    len(general_nci_pdbs),
    len(general_without_nci) + len(general_nci_pdbs),
    len(all_nci_pdbs),
)

(1738, 3578, 5316, 5113, 9014, 14127, 12592)

In [10]:
# Build a residue identifier of the form "<ChainID>_<ResID>".
_res_id_text = nci_data["ResID"].astype(str)
_chain_prefix = nci_data["ChainID"] + "_"
nci_data["ResFullID"] = _chain_prefix + _res_id_text

# Write the cleaned NCI table to disk and report where it went.
nci_data.to_csv(nci_save_fpath)
print(f"Saved {r}nci_data{suf} to {b}{nci_save_fpath}{suf}")

Saved [1;31mnci_data[0m to [1;34m./Inputs/allpdb.nci.csv[0m


In [35]:
# Show the rows whose CP_type equals "(3::-1)".
nci_data.loc[nci_data["CP_type"] == "(3::-1)"]

Unnamed: 0,NCI_ID,Complex_ID,PDB_Code,L_type,R_type,NCI_atom_pair,NCI_intuitive,NCI_ODDT,Resolution,CP_type,...,Group,lr_id,rl_type,rec_atom_type,new,in_use,x,y,z,ResFullID
0,11gs-10::195,11gs::GSH,11gs,N.3,O.2,N.3::O.2,Hydrophilic (N/O)...Hydrophilic (N/O),hbonds,2.30,(3::-1),...,protein,10::195,O.2::N.3,O_pi_acc,1,1,15.701,11.344,23.674,A_64
1,11gs-10::351,11gs::GSH,11gs,N.3,O.co2,N.3::O.co2,Hydrophilic (N/O)...Hydrophilic (N/O),salt_bridges,2.30,(3::-1),...,protein,10::351,O.co2::N.3,O_pi_acc_neg,1,1,14.106,11.941,22.682,B_98
2,11gs-12::175,11gs::GSH,11gs,N.am,O.2,N.am::O.2,Hydrophilic (N/O)...Hydrophilic (N/O),hbonds,2.30,(3::-1),...,protein,12::175,O.2::N.am,O_pi_acc,1,1,12.741,7.612,27.502,A_52
3,11gs-13::172,11gs::GSH,11gs,O.2,N.am,N.am::O.2,Hydrophilic (N/O)...Hydrophilic (N/O),hbonds,2.30,(3::-1),...,protein,13::172,N.am::O.2,N_pi_don,1,1,11.478,8.508,30.479,A_52
4,11gs-13::21,11gs::GSH,11gs,O.2,C.ar,C.ar::O.2,Aromatic…Hydrophilic (N/O),not_rule,2.30,(3::-1),...,protein,13::21,C.ar::O.2,C_pi,1,1,9.981,6.418,30.904,A_8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184170,6m8y-3::30,6m8y::A8X,6m8y,O.3,C.3,C.3::O.3,Aliphatic C…Hydrophilic (N/O),not_rule,1.10,(3::-1),...,protein,3::30,C.3::O.3,C_ali_apol,-1,0,22.341,28.958,29.703,A_51
184171,6m9t-1::197,6m9t::GOL,6m9t,C.3,C.3,C.3::C.3,Aliphatic C…Aliphatic C,not_rule,2.50,(3::-1),...,protein,1::197,C.3::C.3,C_ali_apol,-1,0,127.340,-12.712,152.181,A_335
184172,6ma1-0::112,6ma1::J9P,6ma1,Cl,C.3,C.3::Cl,Sigma hole involved,not_rule,2.75,(3::-1),...,protein,0::112,C.3::Cl,C_ali_apol,-1,0,-28.335,-4.199,2.895,A_898
184173,6q6y-12::119,6q6y::JA4,6q6y,C.ar,O.2,C.ar::O.2,Aromatic…Hydrophilic (N/O),not_rule,2.03,(3::-1),...,protein,12::119,O.2::C.ar,O_pi_acc,-1,0,-10.209,-36.413,23.847,A_897


In [11]:
# Reload the saved NCI table, using its first column as the index.
nci_data = pd.read_csv(filepath_or_buffer=nci_save_fpath, index_col=0)
nci_data

Unnamed: 0,NCI_ID,Complex_ID,PDB_Code,L_type,R_type,NCI_atom_pair,NCI_intuitive,NCI_ODDT,Resolution,CP_type,...,Group,lr_id,rl_type,rec_atom_type,new,in_use,x,y,z,ResFullID
0,11gs-10::195,11gs::GSH,11gs,N.3,O.2,N.3::O.2,Hydrophilic (N/O)...Hydrophilic (N/O),hbonds,2.30,(3::-1),...,protein,10::195,O.2::N.3,O_pi_acc,1,1,15.701,11.344,23.674,A_64
1,11gs-10::351,11gs::GSH,11gs,N.3,O.co2,N.3::O.co2,Hydrophilic (N/O)...Hydrophilic (N/O),salt_bridges,2.30,(3::-1),...,protein,10::351,O.co2::N.3,O_pi_acc_neg,1,1,14.106,11.941,22.682,B_98
2,11gs-12::175,11gs::GSH,11gs,N.am,O.2,N.am::O.2,Hydrophilic (N/O)...Hydrophilic (N/O),hbonds,2.30,(3::-1),...,protein,12::175,O.2::N.am,O_pi_acc,1,1,12.741,7.612,27.502,A_52
3,11gs-13::172,11gs::GSH,11gs,O.2,N.am,N.am::O.2,Hydrophilic (N/O)...Hydrophilic (N/O),hbonds,2.30,(3::-1),...,protein,13::172,N.am::O.2,N_pi_don,1,1,11.478,8.508,30.479,A_52
4,11gs-13::21,11gs::GSH,11gs,O.2,C.ar,C.ar::O.2,Aromatic…Hydrophilic (N/O),not_rule,2.30,(3::-1),...,protein,13::21,C.ar::O.2,C_pi,1,1,9.981,6.418,30.904,A_8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
601268,8a3h-8::110,8a3h::IDC,8a3h,O.3,O.co2,O.3::O.co2,Hydrophilic (N/O)...Hydrophilic (N/O),hbonds,0.97,,...,protein,8::110,O.co2::O.3,O_pi_acc_neg,1,0,,,,A_228
601269,8a3h-8::50,8a3h::IDC,8a3h,O.3,C.ar,C.ar::O.3,Aromatic…Hydrophilic (N/O),not_rule,0.97,(3::+1),...,protein,8::50,C.ar::O.3,C_pi_don,1,1,58.847,42.074,33.363,A_101
601270,8a3h-8::67,8a3h::IDC,8a3h,O.3,O.2,O.2::O.3,Hydrophilic (N/O)...Hydrophilic (N/O),not_rule,0.97,(3::+1),...,protein,8::67,O.2::O.3,O_pi_acc,1,1,60.832,40.473,32.790,A_138
601271,8a3h-8::73,8a3h::IDC,8a3h,O.3,C.3,C.3::O.3,Aliphatic C…Hydrophilic (N/O),not_rule,0.97,(3::+1),...,protein,8::73,C.3::O.3,C_pi,1,1,60.407,39.454,34.079,A_139


In [13]:
# Count the non-null entries of the remaining columns for every
# (PDB_Code, LigName) pair, to see how many ligand names each PDB code has.
_name_check_columns = ["PDB_Code", "LigName", "LigAtomName", "ResFullID", "CP_type", "Group"]
nci_ligand_name_data = (
    nci_data[_name_check_columns]
    .groupby(["PDB_Code", "LigName"])
    .count()
    .reset_index()
)
nci_ligand_name_data

Unnamed: 0,PDB_Code,LigName,LigAtomName,ResFullID,CP_type,Group
0,11gs,GSH,33,33,30,33
1,13gs,SAS,103,103,26,103
2,184l,I4B,21,21,21,21
3,185l,IND,18,18,18,18
4,186l,N4B,17,17,17,17
...,...,...,...,...,...,...
12587,6q3q,GOL,13,13,12,13
12588,6q6y,HKQ,29,29,27,29
12589,6q73,HKK,26,26,24,26
12590,6q74,HKN,34,34,33,34


In [14]:
# Number of (PDB_Code, LigName) rows versus number of distinct PDB codes.
_grouped_pdb_codes = nci_ligand_name_data["PDB_Code"]
(_grouped_pdb_codes.shape[0], _grouped_pdb_codes.unique().shape[0])

(12592, 12592)

Since we have 12592 unique pdb_codes and 12592 rows, each pdb is related to a unique ligname in nci.

### Prepare input

In [15]:
def _read_mol2_first_atom(mol2_fpath):
    """Return the whitespace-split fields of the first atom record of a .mol2 file.

    The record is the line that follows the "@<TRIPOS>ATOM" marker. An empty
    list is returned when the marker is missing or nothing follows it, so a
    malformed file can be logged instead of hanging the loop at end of file.
    """
    with open(mol2_fpath, "r") as f:
        for line in f:
            if line.rstrip() == "@<TRIPOS>ATOM":
                return next(f, "").split()
    return []


pdb_df_list = []
ligand_df_list = []
logs_failed_files = []

# Ligand names per PDB code, computed once instead of filtering the whole NCI
# table for every PDB.
_nci_lignames_by_pdb = nci_data.groupby("PDB_Code")["LigName"].unique()

# Process the refined set first, then the general set.
for (_name, _pdb_list, _pdb_path, _aff_fpath) in zip(
    ["refined","general"],[refined_nci_pdbs, general_nci_pdbs], [refined_path, general_path], [refined_aff_df_fpath, general_aff_df_fpath]
    ):
    print(f"Processing {b}.pdb{suf} and {b}.mol2{suf} files for {r}{_name}{suf}:")
    aff_df = pd.read_csv(_aff_fpath, index_col=0)
    for _pdb in tqdm(_pdb_list, total=len(_pdb_list)):
        _mol2_fpath = f"{_pdb_path}{_pdb}/{_pdb}_ligand.mol2"

        # Ligand name as written in the .mol2 file: second-to-last field of
        # the first atom record, which must carry atom id "1".
        text = _read_mol2_first_atom(_mol2_fpath)
        if len(text) < 2 or text[0] != "1":
            # No ligand name is known at this point, so the log must not reference one.
            logs_failed_files.append(f"{_name} - {_pdb} - <unknown> : Structure Error : .mol2 file in {_mol2_fpath}\n")
            continue
        ligand_name = text[-2]

        # Exactly one affinity row is expected per PDB code.
        aff_line = aff_df[(aff_df["PDB_code"]== _pdb)]
        if len(aff_line) != 1:
            logs_failed_files.append(f"{_name} - {_pdb} - {ligand_name} : Affinity error : .mol2 file in {_mol2_fpath}\n")
            continue
        aff = aff_line["-logKd/Ki"].values[0]
        # Values read through pandas are numpy scalars; numpy integers are not
        # Python ints, so they are accepted explicitly.
        if not isinstance(aff, (int, float, np.integer, np.floating)):
            logs_failed_files.append(f"{_name} - {_pdb} - {ligand_name} : Affinity value Error : .mol2 file in {_mol2_fpath}\n")
            continue

        # Ligand name according to the affinity index file.
        ligand_name2 = aff_line['ligand_name'].values[0]
        # Ligand name(s) according to the NCI table, space-separated if several.
        _nci_names = _nci_lignames_by_pdb.get(_pdb)
        ligand_name3 = "" if _nci_names is None else " ".join(str(_n) for _n in _nci_names)

        pdb_df_list.append([_pdb, f"{_pdb_path}{_pdb}/{_pdb}_protein.pdb", _name])
        ligand_df_list.append([_pdb, ligand_name, ligand_name2, ligand_name3, _mol2_fpath, aff, _name])

pdb_df = pd.DataFrame(pdb_df_list, columns=["pdb_code", "pdb_fpath", "datagroup"])
pdb_df.to_csv(pdb_df_save_fpath)
print(f"Saved {r}pdb_df{suf} to {b}{pdb_df_save_fpath}{suf}")
with open(logs_failed_files_fpath, "w") as f:
    f.writelines(logs_failed_files)
ligand_df = pd.DataFrame(ligand_df_list, columns=["pdb_code", "ligand_name", "ligand_name_in_aff", "ligand_name_in_nci", "ligand_fpath", "affinity", "datagroup"])
ligand_df.to_csv(ligand_df_save_fpath)
print(f"Saved {r}ligand_df{suf} to {b}{ligand_df_save_fpath}{suf}")

Processing [1;34m.pdb[0m and [1;34m.mol2[0m files for [1;31mrefined[0m:


  0%|          | 0/3578 [00:00<?, ?it/s]

Processing [1;34m.pdb[0m and [1;34m.mol2[0m files for [1;31mgeneral[0m:


  0%|          | 0/9014 [00:00<?, ?it/s]

Saved [1;31mpdb_df[0m to [1;34m./Inputs/allpdb.pdbs.csv[0m
Saved [1;31mligand_df[0m to [1;34m./Inputs/allpdb.ligands.3name.csv[0m


In [16]:
# Reload both tables from disk. No index column is specified, so the saved
# index comes back as an "Unnamed: 0" column.
pdb_df, ligand_df = (
    pd.read_csv(_table_fpath) for _table_fpath in (pdb_df_save_fpath, ligand_df_save_fpath)
)

In [17]:
# Keep the ligands whose .mol2 name equals both the affinity-index name and
# the NCI name.
_mol2_name = ligand_df["ligand_name"]
_matches_aff = _mol2_name == ligand_df["ligand_name_in_aff"]
_matches_nci = _mol2_name == ligand_df["ligand_name_in_nci"]
ligand_coherent_df = ligand_df[_matches_aff & _matches_nci]

ligand_coherent_df.to_csv(ligand_coherent_df_save_fpath)
print(f"Saved {r}ligand_coherent_df{suf} to {b}{ligand_coherent_df_save_fpath}{suf}")

Saved [1;31mligand_coherent_df[0m to [1;34m./Inputs/allpdb.ligands__3name.coherent.csv[0m


In [18]:
# Display the ligands whose three name sources agree.
ligand_coherent_df

Unnamed: 0.1,Unnamed: 0,pdb_code,ligand_name,ligand_name_in_aff,ligand_name_in_nci,ligand_fpath,affinity,datagroup
0,0,2jh6,894,894,894,../../data/pdbbind2020/refined-set/2jh6/2jh6_l...,7.77,refined
1,1,3lk8,Z79,Z79,Z79,../../data/pdbbind2020/refined-set/3lk8/3lk8_l...,7.71,refined
2,2,4qjw,WWO,WWO,WWO,../../data/pdbbind2020/refined-set/4qjw/4qjw_l...,7.80,refined
3,3,5o9o,9PB,9PB,9PB,../../data/pdbbind2020/refined-set/5o9o/5o9o_l...,4.89,refined
4,4,4wov,3SM,3SM,3SM,../../data/pdbbind2020/refined-set/4wov/4wov_l...,7.14,refined
...,...,...,...,...,...,...,...,...
12585,12585,4j0v,1H7,1H7,1H7,../../data/pdbbind2020/v2020-other-PL/4j0v/4j0...,7.55,general
12586,12586,4tyl,39O,39O,39O,../../data/pdbbind2020/v2020-other-PL/4tyl/4ty...,3.22,general
12587,12587,2ew6,Y13,Y13,Y13,../../data/pdbbind2020/v2020-other-PL/2ew6/2ew...,4.97,general
12589,12589,3hwx,TPP,TPP,TPP,../../data/pdbbind2020/v2020-other-PL/3hwx/3hw...,7.86,general


In [19]:
# Collect the ligands whose .mol2 name differs from the affinity-index name
# or from the NCI name, and save them as a record of what gets dropped.
_mol2_name = ligand_df["ligand_name"]
_differs_from_aff = _mol2_name != ligand_df["ligand_name_in_aff"]
_differs_from_nci = _mol2_name != ligand_df["ligand_name_in_nci"]
ligand_incoherent_df = ligand_df[_differs_from_aff | _differs_from_nci]

_incoherent_save_fpath = f"{save_path}dropped_2 - ligand_df - ligand_name incoherent.csv"
ligand_incoherent_df.to_csv(_incoherent_save_fpath)
print(f"Saved {r}ligand_incoherent_df{suf} to {b}{_incoherent_save_fpath}{suf}")

Saved [1;31mligand_incoherent_df[0m to [1;34m./Inputs/dropped_2 - ligand_df - ligand_name incoherent.csv[0m


In [20]:
# Display the ligands with at least one mismatching name.
ligand_incoherent_df

Unnamed: 0.1,Unnamed: 0,pdb_code,ligand_name,ligand_name_in_aff,ligand_name_in_nci,ligand_fpath,affinity,datagroup
27,27,4duh,RLI,RLI,UNK,../../data/pdbbind2020/refined-set/4duh/4duh_l...,5.18,refined
38,38,1y3n,BEM,2-mer,MAV,../../data/pdbbind2020/refined-set/1y3n/1y3n_l...,5.55,refined
50,50,5jop,NON,4-mer,NAG,../../data/pdbbind2020/refined-set/5jop/5jop_l...,9.41,refined
63,63,5ovp,ACE,7-mer,ACE,../../data/pdbbind2020/refined-set/5ovp/5ovp_l...,5.04,refined
81,81,4c1t,AHR,4-mer,1PE,../../data/pdbbind2020/refined-set/4c1t/4c1t_l...,7.15,refined
...,...,...,...,...,...,...,...,...
12567,12567,3zju,NON,AN3016,DJF,../../data/pdbbind2020/v2020-other-PL/3zju/3zj...,5.68,general
12570,12570,3luo,SIN,6-mer,NIT,../../data/pdbbind2020/v2020-other-PL/3luo/3lu...,6.95,general
12572,12572,5n7g,LEU,7-mer,GOL,../../data/pdbbind2020/v2020-other-PL/5n7g/5n7...,4.58,general
12588,12588,4rhu,3QE,45T-3QE,3QE,../../data/pdbbind2020/v2020-other-PL/4rhu/4rh...,6.16,general


In [21]:
# Keep only the NCI rows of PDBs whose ligand names were verified coherent.
# Selecting by membership in the coherent table (rather than excluding the
# incoherent one) also drops PDBs that never reached ligand_df, so the NCI
# table stays aligned with the ligand table.
_coherent_pdb_codes = ligand_coherent_df.pdb_code.unique()
nci_data_coherent = nci_data[nci_data.PDB_Code.isin(_coherent_pdb_codes)].reset_index(drop=True)
nci_data_coherent.to_csv(nci_coherent_save_fpath)
print(f"Saved {r}nci_data_coherent{suf} to {b}{nci_coherent_save_fpath}{suf}")

Saved [1;31mnci_data_coherent[0m to [1;34m./Inputs/allpdb.nci__coherent.csv[0m


In [34]:
# Display the NCI rows kept after the ligand-name coherence filter.
nci_data_coherent

Unnamed: 0,NCI_ID,Complex_ID,PDB_Code,L_type,R_type,NCI_atom_pair,NCI_intuitive,NCI_ODDT,Resolution,CP_type,...,Group,lr_id,rl_type,rec_atom_type,new,in_use,x,y,z,ResFullID
0,13gs-12::92,13gs::GSH,13gs,O.co2,S.3,O.co2::S.3,Sigma hole involved,hbonds,1.90,(3::-1),...,other,12::92,S.3::O.co2,other,1,1,10.824,4.413,25.228,A_210
1,13gs-14::44,13gs::SAS,13gs,O.co2,N.pl3,N.pl3::O.co2,Hydrophilic (N/O)...Hydrophilic (N/O),salt_bridges,1.90,(3::-1),...,protein,14::44,N.pl3::O.co2,C_pi_don,1,1,12.152,6.121,23.221,A_13
2,13gs-14::62,13gs::SAS,13gs,O.co2,C.3,C.3::O.co2,Aliphatic C…Hydrophilic (N/O),not_rule,1.90,(3::-1),...,protein,14::62,C.3::O.co2,C_ali_apol,1,1,11.166,4.982,21.931,A_104
3,13gs-16::83,13gs::SAS,13gs,C.ar,C.3,C.3::C.ar,Aromatic…Aliphatic C,not_rule,1.90,(3::-1),...,protein,16::83,C.3::C.ar,C_ali_don,1,1,7.181,-0.427,28.955,A_205
4,13gs-2::29,13gs::SAS,13gs,O.2,C.3,C.3::O.2,Aliphatic C…Hydrophilic (N/O),not_rule,1.90,(3::-1),...,protein,2::29,C.3::O.2,C_ali_don,1,1,8.392,-1.424,33.685,A_9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
561888,8a3h-8::110,8a3h::IDC,8a3h,O.3,O.co2,O.3::O.co2,Hydrophilic (N/O)...Hydrophilic (N/O),hbonds,0.97,,...,protein,8::110,O.co2::O.3,O_pi_acc_neg,1,0,,,,A_228
561889,8a3h-8::50,8a3h::IDC,8a3h,O.3,C.ar,C.ar::O.3,Aromatic…Hydrophilic (N/O),not_rule,0.97,(3::+1),...,protein,8::50,C.ar::O.3,C_pi_don,1,1,58.847,42.074,33.363,A_101
561890,8a3h-8::67,8a3h::IDC,8a3h,O.3,O.2,O.2::O.3,Hydrophilic (N/O)...Hydrophilic (N/O),not_rule,0.97,(3::+1),...,protein,8::67,O.2::O.3,O_pi_acc,1,1,60.832,40.473,32.790,A_138
561891,8a3h-8::73,8a3h::IDC,8a3h,O.3,C.3,C.3::O.3,Aliphatic C…Hydrophilic (N/O),not_rule,0.97,(3::+1),...,protein,8::73,C.3::O.3,C_pi,1,1,60.407,39.454,34.079,A_139


In [22]:
# Keep the coherent NCI rows whose CP_type equals "(3::-1)".
# The variable keeps its original spelling because later cells refer to it.
_has_wanted_cp_type = nci_data_coherent["CP_type"].eq("(3::-1)")
nci_data_coherant_topo = nci_data_coherent.loc[_has_wanted_cp_type]

nci_data_coherant_topo.to_csv(nci_coherent_topo_save_fpath)
print(f"Saved {r}nci_data_coherant_topo{suf} to {b}{nci_coherent_topo_save_fpath}{suf}")

Saved [1;31mnci_data_coherant_topo[0m to [1;34m./Inputs/allpdb.nci__coherent.topo.csv[0m


In [23]:
# Display the coherent NCI rows whose CP_type is "(3::-1)".
nci_data_coherant_topo

Unnamed: 0,NCI_ID,Complex_ID,PDB_Code,L_type,R_type,NCI_atom_pair,NCI_intuitive,NCI_ODDT,Resolution,CP_type,...,Group,lr_id,rl_type,rec_atom_type,new,in_use,x,y,z,ResFullID
0,13gs-12::92,13gs::GSH,13gs,O.co2,S.3,O.co2::S.3,Sigma hole involved,hbonds,1.90,(3::-1),...,other,12::92,S.3::O.co2,other,1,1,10.824,4.413,25.228,A_210
1,13gs-14::44,13gs::SAS,13gs,O.co2,N.pl3,N.pl3::O.co2,Hydrophilic (N/O)...Hydrophilic (N/O),salt_bridges,1.90,(3::-1),...,protein,14::44,N.pl3::O.co2,C_pi_don,1,1,12.152,6.121,23.221,A_13
2,13gs-14::62,13gs::SAS,13gs,O.co2,C.3,C.3::O.co2,Aliphatic C…Hydrophilic (N/O),not_rule,1.90,(3::-1),...,protein,14::62,C.3::O.co2,C_ali_apol,1,1,11.166,4.982,21.931,A_104
3,13gs-16::83,13gs::SAS,13gs,C.ar,C.3,C.3::C.ar,Aromatic…Aliphatic C,not_rule,1.90,(3::-1),...,protein,16::83,C.3::C.ar,C_ali_don,1,1,7.181,-0.427,28.955,A_205
4,13gs-2::29,13gs::SAS,13gs,O.2,C.3,C.3::O.2,Aliphatic C…Hydrophilic (N/O),not_rule,1.90,(3::-1),...,protein,2::29,C.3::O.2,C_ali_don,1,1,8.392,-1.424,33.685,A_9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171772,6iiu-8::70,6iiu::A8X,6iiu,C.ar,C.3,C.3::C.ar,Aromatic…Aliphatic C,not_rule,2.50,(3::-1),...,protein,8::70,C.3::C.ar,C_ali_apol,-1,0,27.477,166.288,147.549,A_85
171773,6m9t-1::197,6m9t::GOL,6m9t,C.3,C.3,C.3::C.3,Aliphatic C…Aliphatic C,not_rule,2.50,(3::-1),...,protein,1::197,C.3::C.3,C_ali_apol,-1,0,127.340,-12.712,152.181,A_335
171774,6ma1-0::112,6ma1::J9P,6ma1,Cl,C.3,C.3::Cl,Sigma hole involved,not_rule,2.75,(3::-1),...,protein,0::112,C.3::Cl,C_ali_apol,-1,0,-28.335,-4.199,2.895,A_898
171775,6q6y-12::119,6q6y::JA4,6q6y,C.ar,O.2,C.ar::O.2,Aromatic…Hydrophilic (N/O),not_rule,2.03,(3::-1),...,protein,12::119,O.2::C.ar,O_pi_acc,-1,0,-10.209,-36.413,23.847,A_897


In [24]:
# PDB codes present in the coherent NCI table but absent after the CP_type filter.
_set_coh = set(nci_data_coherent.PDB_Code.unique())
_set_coh_topo = set(nci_data_coherant_topo.PDB_Code.unique())
_dropped_topo = _set_coh.difference(_set_coh_topo)
_dropped_topo

{'1xnz',
 '2evc',
 '2evm',
 '2qx0',
 '2yhy',
 '3ekw',
 '3ipx',
 '3v1r',
 '4c1h',
 '4llp',
 '5l4f',
 '5ltn',
 '5wei',
 '6gnp'}

In [32]:
# Drop the ligands of PDBs that lost all their NCI rows in the CP_type filter,
# and discard the "Unnamed: 0" column.
_survives_topo = ~ligand_coherent_df["pdb_code"].isin(_dropped_topo)
ligand_coherent_topo_df = ligand_coherent_df[_survives_topo].drop(columns="Unnamed: 0")

ligand_coherent_topo_df.to_csv(ligand_coherent_topo_df_save_fpath)
print(f"Saved {r}ligand_coherent_topo_df{suf} to {b}{ligand_coherent_topo_df_save_fpath}{suf}")

Saved [1;31mligand_coherent_topo_df[0m to [1;34m./Inputs/allpdb.ligands__3name.coherent.topo.csv[0m


In [33]:
# Display the final ligand table.
ligand_coherent_topo_df

Unnamed: 0,pdb_code,ligand_name,ligand_name_in_aff,ligand_name_in_nci,ligand_fpath,affinity,datagroup
0,2jh6,894,894,894,../../data/pdbbind2020/refined-set/2jh6/2jh6_l...,7.77,refined
1,3lk8,Z79,Z79,Z79,../../data/pdbbind2020/refined-set/3lk8/3lk8_l...,7.71,refined
2,4qjw,WWO,WWO,WWO,../../data/pdbbind2020/refined-set/4qjw/4qjw_l...,7.80,refined
3,5o9o,9PB,9PB,9PB,../../data/pdbbind2020/refined-set/5o9o/5o9o_l...,4.89,refined
4,4wov,3SM,3SM,3SM,../../data/pdbbind2020/refined-set/4wov/4wov_l...,7.14,refined
...,...,...,...,...,...,...,...
12585,4j0v,1H7,1H7,1H7,../../data/pdbbind2020/v2020-other-PL/4j0v/4j0...,7.55,general
12586,4tyl,39O,39O,39O,../../data/pdbbind2020/v2020-other-PL/4tyl/4ty...,3.22,general
12587,2ew6,Y13,Y13,Y13,../../data/pdbbind2020/v2020-other-PL/2ew6/2ew...,4.97,general
12589,3hwx,TPP,TPP,TPP,../../data/pdbbind2020/v2020-other-PL/3hwx/3hw...,7.86,general


In [30]:
# Keep the PDB entries that still have a ligand in the final ligand table,
# and discard the "Unnamed: 0" column.
_final_pdb_codes = ligand_coherent_topo_df.pdb_code.unique()
_in_final_set = pdb_df.pdb_code.isin(_final_pdb_codes)
pdb_coherent_topo_df = pdb_df[_in_final_set].drop(columns="Unnamed: 0")

pdb_coherent_topo_df.to_csv(pdb_coherent_topo_df_save_fpath)
print(f"Saved {r}pdb_coherent_topo_df{suf} to {b}{pdb_coherent_topo_df_save_fpath}{suf}")

Saved [1;31mpdb_coherent_topo_df[0m to [1;34m./Inputs/allpdb.pdbs__coherent.topo.csv[0m


In [31]:
# Display the final PDB table.
pdb_coherent_topo_df

Unnamed: 0,pdb_code,pdb_fpath,datagroup
0,2jh6,../../data/pdbbind2020/refined-set/2jh6/2jh6_p...,refined
1,3lk8,../../data/pdbbind2020/refined-set/3lk8/3lk8_p...,refined
2,4qjw,../../data/pdbbind2020/refined-set/4qjw/4qjw_p...,refined
3,5o9o,../../data/pdbbind2020/refined-set/5o9o/5o9o_p...,refined
4,4wov,../../data/pdbbind2020/refined-set/4wov/4wov_p...,refined
...,...,...,...
12585,4j0v,../../data/pdbbind2020/v2020-other-PL/4j0v/4j0...,general
12586,4tyl,../../data/pdbbind2020/v2020-other-PL/4tyl/4ty...,general
12587,2ew6,../../data/pdbbind2020/v2020-other-PL/2ew6/2ew...,general
12589,3hwx,../../data/pdbbind2020/v2020-other-PL/3hwx/3hw...,general
