# sx.nciyes.process.ipynb

Notebook for processing input datas.

- 原始数据
    - merge_0304.csv，NCI 总表，凡 667887 条 NCI 数据。
    - refined_set, 凡 5316 个 PDB 文件夹。
    - refined_data, 凡 5315 条活性数据，PDB_code 与前者一一对应。
    - PL_other, 凡 14127 个 PDB 文件夹。
    - PL_data, 凡 14127 条活性数据，PDB_code 与前者一一对应。

- 清洗
    - NCI 去掉 water：去掉 66528 条；剩余 601359 条数据；
    - NCI 去掉 pdbbind 中不包含的：去掉 86条；剩余 601273 条数据；
    - pdbbind 去掉 NCI 数据中不包含的：
        - refined 去掉 1738 组 PDB; 剩余 3578 组 PDB；
        - general 去掉 5113 组 PDB; 剩余 9014 组 PDB；
    - 进入模型之前共有： 601359 条 NCI 数据，12592 组 PDB；

In [114]:
import os
import pandas as pd
import numpy as np

In [115]:
# path
save_path = "./Inputs/"
nci_fpath = "../../data/ExptNCI/merged_0304.csv"
nci_save_fpath = "./Inputs/allpdb.nci.csv"
refined_path = "../../data/pdbbind2020/refined-set/" # for .pdb files
general_path = "../../data/pdbbind2020/v2020-other-PL/"
#hbligand_path = "../../data/HBLigand/" # for .
#pdbs_save_path = "./Inputs/Dev/RenamedPDBBind/"
pdb_list_save_fpath = "./Inputs/allpdb.pdbs.csv"
#ligands_save_path = "./Inputs/Dev/RenamedPDBBind/"
ligand_list_save_fpath = "./Inputs/allpdb.ligands.csv"
refined_affinity_fpath = "../../data/pdbbind2020/index/INDEX_refined_data_preprocessed.2020"
general_affinity_fpath = "../../data/pdbbind2020/index/INDEX_general_PL_data_preprocessed.2020"
logs_failed_files_fpath = "./Inputs/logs_failed_files.txt"

refined_aff_df_fpath = refined_affinity_fpath+".csv"
general_aff_df_fpath = general_affinity_fpath +".csv"

##### Manually modification record

PL other：
- Rename the columns : PDB, resolution, release_year, -logKd/Ki,Kd/Ki, reference, ligand_name
- Rename ligand name "FMN hq", "FMN sq", "FMN ox", ... to "FMN-hq", "FMN-sq", "FMN-ox"
- Remove "(", ")", "//"
- Line 5512 (p 5507): from "5ot3  2.04  2018   5.23  Kd=5.86uM     5ot3.pdf 9LQ,18-mer" to "9LQ/18-mer"
- Save modified file to "INDEX_general_PL_data_preprocessed.2020"
    

refined:
- Rename the columns : PDB, resolution, release_year, -logKd/Ki,Kd/Ki, reference, ligand_name
- Remove "(", ")", "//"
- Save modified file to "INDEX_refined_data_preprocessed.2020"

### util: affinity_data to .csv

In [147]:
_dpath = refined_affinity_fpath
_folder = refined_path
_dpath = general_affinity_fpath
_folder = general_path
_dpath

'../../data/pdbbind2020/index/INDEX_general_PL_data_preprocessed.2020'

In [117]:
# Generation
for _dpath,_folder in zip([refined_affinity_fpath, general_affinity_fpath], [refined_path, general_path]):
    if True:
        os.system(f"rm {_dpath}.csv")
        ls = []
        with open(_dpath, 'r') as f:
            line = f.readline()
            while line != "" or []:
                items = line.split()
                ls.append(items)
                line = f.readline()
        columns = ls[0]
        del ls[0]
        df = pd.DataFrame(ls, columns=columns)
        df.to_csv(f"{_dpath}.csv")

In [118]:
# Check affinity
_aff = pd.read_csv(f"{_dpath}.csv", index_col=0)
_aff_pdbs = set(_aff.PDB_code.unique())
_folder_pdbs = np.array(os.listdir(_folder))
_folder_pdbs.sort()
for _f in _folder_pdbs:
    if _f not in _aff_pdbs:
        print(_f,"has no affinity data")

In [136]:
# Get PDB list
refined_pdbs = set(os.listdir(refined_path))
general_pdbs = set(os.listdir(general_path))
all_pdbs = refined_pdbs|general_pdbs
len(refined_pdbs), len(general_pdbs), len(refined_pdbs) + len(general_pdbs), len(all_pdbs)

(5316, 14127, 19443, 19443)

### NCI data processing

In [152]:
nci_data = pd.read_csv(nci_fpath)

In [153]:
# Remove water
nci_data = nci_data.drop(nci_data[nci_data['Group']=='water'].index)

# Remove ncis without pdbbind files
drop_1 = nci_data[~nci_data.PDB_Code.isin(all_pdbs)]
drop_1.to_csv(f"{save_path}dropped_nci - without pdbbind files.csv")
nci_data = nci_data[nci_data.PDB_Code.isin(all_pdbs)].reset_index()
del nci_data["index"]

# Check ncis
nci_pdbs = set(nci_data.PDB_Code.unique())
refined_without_nci = refined_pdbs - nci_pdbs
general_without_nci = general_pdbs - nci_pdbs
refined_nci_pdbs = refined_pdbs - refined_without_nci
general_nci_pdbs = general_pdbs - general_without_nci
all_nci_pdbs = refined_nci_pdbs|general_nci_pdbs

# Save files
os.system(f"rm {save_path}\"dropped_refined_pdb - without nci.txt\"")
os.system(f"rm {save_path}\"dropped_general_pdb - without nci.txt\"")
os.system(f"rm {save_path}\"keeped_refined_pdb.txt\"")
os.system(f"rm {save_path}\"keeped_general_pdb.txt\"")
os.system(f"rm {save_path}\"keeped_all_pdb.txt\"")
os.system(f"rm {save_path}\"keeped_all_pdb_with_source.txt\"")
with open(f"{save_path}dropped_refined_pdb - without nci.txt", "a") as f:
    f.writelines([ _line + "\n" for _line in refined_without_nci])
with open(f"{save_path}dropped_general_pdb - without nci.txt", "a") as f:
    f.writelines([ _line + "\n" for _line in general_without_nci])
with open(f"{save_path}keeped_refined_pdb.txt", "a") as f:
    f.writelines([ _line + "\n" for _line in refined_nci_pdbs])
with open(f"{save_path}keeped_general_pdb.txt", "a") as f:
    f.writelines([ _line + "\n" for _line in general_nci_pdbs])
with open(f"{save_path}keeped_all_pdb.txt", "a") as f:
    f.writelines([ _line + "\n" for _line in all_nci_pdbs])
with open(f"{save_path}keeped_all_pdb_with_source.txt", "a") as f:
    f.writelines([ _line + ",refined\n" for _line in refined_nci_pdbs] + [ _line + ",general\n" for _line in general_nci_pdbs])

In [141]:
len(refined_without_nci), len(refined_nci_pdbs), len(refined_without_nci)+len(refined_nci_pdbs),len(general_without_nci), len(general_nci_pdbs), len(general_without_nci)+len(general_nci_pdbs),len(all_nci_pdbs)

(1738, 3578, 5316, 5113, 9014, 14127, 12592)

### Prepare input

In [110]:
len(pdb_without_nci["refined"]), len(pdb_without_nci["general"])

1738
5113


In [None]:
# Add ResFullID
nci_data["ResFullID"] = nci_data["ChainID"]+"_"+nci_data["ResID"].astype(str)

# Save File
nci_data.to_csv(nci_save_fpath)

In [92]:
drop_list = ["4ocq", "4qev", "4qew","4rux","4u0f","4uj1","4xt2","4xtw","4xtx","4uj2","4z84","5d21","5dhu","5f8y","5hrv","5hrw","5hrx"]







In [93]:
df_clear = df_clear.drop(df_clear[df_clear['pdb_code'].isin(drop_list)].index)

In [94]:
df_clear = df_clear.reset_index()
del df_clear['index']

In [95]:
df_clear.to_csv("./Inputs/Dev/Savedfiles/info_clean.csv")

In [7]:
class MolStructureError(Exception):
    def __init__(self, errorinfo):
        super().__init__()
        self.errorinfo = errorinfo
    def __str__(self):
        return self.errorinfo

In [102]:
refined_aff_df = pd.read_csv(refined_aff_df_fpath, index_col=0)
refined_aff_df
other_aff_df = pd.read_csv(other_aff_df_fpath, index_col = 0)
other_aff_df

Unnamed: 0,PDB_code,resolution,release_year,-logKd/Ki,Kd/Ki,reference,ligand_name
0,3zzf,2.20,2012,0.40,Ki=400mM,3zzf.pdf,NLG
1,3gww,2.46,2009,0.45,IC50=355mM,3gwu.pdf,SFX
2,1w8l,1.80,2004,0.49,Ki=320mM,1w8l.pdf,1P3
3,3fqa,2.35,2009,0.49,IC50=320mM,3fq7.pdf,GAB&PMP
4,1zsb,2.00,1996,0.60,Kd=250mM,1zsb.pdf,AZM
...,...,...,...,...,...,...,...
19437,7cpa,2.00,1994,13.96,Ki=11fM,7cpa.pdf,FVF
19438,2xuf,2.55,2010,14.39,Kd=4.1fM,2xud.pdf,TZ4
19439,1avd,2.70,1994,15.00,Kd=1fM,1avd.pdf,BTN
19440,2xui,2.60,2010,15.00,Kd=1.0fM,2xud.pdf,TZ5


In [9]:
from tqdm import tqdm
pdb_df_list = []
ligand_df_list = []

print("Processing .pdb and .mol2 files:")
logs_failed_files = []
for (i,_pdb) in enumerate(tqdm(target_pdb_list, total=len(target_pdb_list))):
    with open(f"{pdbbind_path}{_pdb}/{_pdb}_ligand.mol2", "r") as f:
        text = ""
        while (text != "@<TRIPOS>ATOM\n"):
            text = f.readline()
        text = f.readline().split()
        
        if (text[0] == "1") and (text[-2] != "UNK"):
            ligand_name = text[-2]
        else:
            logs_failed_files.append(f"Protein {_pdb} with ligand {ligand_name} processing failed: mol2 file structure error.\n")
        
        affinity = affinity_df[(affinity_df["pdb_code"]== _pdb)&(affinity_df["ligand_name"] == ligand_name)]
        if len(affinity) == 1:
            affinity = affinity["neg_log_kd_on_ki"].values[0]
        else:
            logs_failed_files.append(f"Protein {_pdb} with ligand {ligand_name} processing failed: affinity data not available.\n")
            continue
    
    pdb_df_list.append([_pdb, f"{pdbbind_path}{_pdb}/{_pdb}_protein.pdb"])
    ligand_df_list.append([_pdb, ligand_name, f"{pdbbind_path}{_pdb}/{_pdb}_ligand.mol2", affinity])
    
pdb_df = pd.DataFrame(pdb_df_list, columns=["pdb_code", "pdb_fpath"])
pdb_df.to_csv(pdb_list_save_fpath)

logs_failed_files = [f"{len(logs_failed_files)} among {len(target_pdb_list)} pairs failed.\n"] + logs_failed_files
with open(logs_failed_files_fpath, "w") as f:
    f.writelines(logs_failed_files)
print(f"Processing finished. {len(logs_failed_files)-1} among {len(target_pdb_list)} pairs failed.")
print(f"Information of failed file pairs saved to {logs_failed_files_fpath}.")
print(f"Information of protein list saved to {pdb_list_save_fpath}.")
ligand_df = pd.DataFrame(ligand_df_list, columns=["pdb_code", "ligand_name", "ligand_fpath", "affinity"])
ligand_df.to_csv(ligand_list_save_fpath)
print(f"Information of ligand list saved to {ligand_list_save_fpath}.")


Processing .pdb and .mol2 files:


100%|██████████| 2/2 [00:00<00:00, 140.33it/s]

Processing finished. 0 among 2 pairs failed.
Information of failed file pairs saved to ./Inputs/Dev/logs_failed_files.txt.
Information of protein list saved to ./Inputs/Dev/small.dev.pdbs.csv.
Information of ligand list saved to ./Inputs/Dev/small.dev.ligands.csv.





In [106]:
ls

PDBbind_v2020_NL.tar.gz    PDBbind_v2020_other_PL.tar.gz          [0m[01;34mindex[0m/
PDBbind_v2020_PN.tar.gz    PDBbind_v2020_plain_text_index.tar.gz  [01;34mreadme[0m/
PDBbind_v2020_PP.tar.gz    PDBbind_v2020_refined.tar.gz           [01;34mrefined-set[0m/
PDBbind_v2020_mol2.tar.gz  PDBbind_v2020_sdf.tar.gz               [01;34msdf[0m/


In [None]:
!tar -zxvf PDBbind_v2020_other_PL.tar.gz 

v2020-other-PL/
v2020-other-PL/9icd/
v2020-other-PL/9icd/9icd_protein.pdb
v2020-other-PL/9icd/9icd_ligand.mol2
v2020-other-PL/9icd/9icd_ligand.sdf
v2020-other-PL/9icd/9icd_pocket.pdb
v2020-other-PL/4kqo/
v2020-other-PL/4kqo/4kqo_ligand.sdf
v2020-other-PL/4kqo/4kqo_protein.pdb
v2020-other-PL/4kqo/4kqo_pocket.pdb
v2020-other-PL/4kqo/4kqo_ligand.mol2
v2020-other-PL/5qj2/
v2020-other-PL/5qj2/5qj2_pocket.pdb
v2020-other-PL/5qj2/5qj2_ligand.mol2
v2020-other-PL/5qj2/5qj2_ligand.sdf
v2020-other-PL/5qj2/5qj2_protein.pdb
v2020-other-PL/4a1w/
v2020-other-PL/4a1w/4a1w_ligand.sdf
v2020-other-PL/4a1w/4a1w_pocket.pdb
v2020-other-PL/4a1w/4a1w_protein.pdb
v2020-other-PL/4a1w/4a1w_ligand.mol2
v2020-other-PL/5k0s/
v2020-other-PL/5k0s/5k0s_ligand.mol2
v2020-other-PL/5k0s/5k0s_protein.pdb
v2020-other-PL/5k0s/5k0s_ligand.sdf
v2020-other-PL/5k0s/5k0s_pocket.pdb
v2020-other-PL/6f7t/
v2020-other-PL/6f7t/6f7t_protein.pdb
v2020-other-PL/6f7t/6f7t_pocket.pdb
v2020-other-PL/6f7t/6f7t_ligand.mol2
v2020-other-PL/6f7