# sx.nciyes.process.<font color="red">new</font>.ipynb

Notebook for processing input data.
New nci data will be used.
Data in <font color="red">data/HBPDB/Ligand</font> and <font color="red">data/pdbbind2020/</font> are used.

In [1]:
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
# configs
# Version tag; it is embedded in the output directory and in every output file name.
version = "v8.30"


# You needn't modify the settings below under normal conditions.
save_path = f"./Inputs/22ETE/{version}/"
# Create the output directory (and any missing parents) without spawning a
# shell; an already existing directory is not an error.
os.makedirs(save_path, exist_ok=True)
nci_path = "../../dataspace/pdb_bind_2020/"
nci_data_fpath = "../../dataspace/exptnci_220825.csv"
refined_path = "../../data/pdbbind2020/refined-set/" # for .pdb files
general_path = "../../data/pdbbind2020/v2020-other-PL/"



# Manually preprocessed index files and the CSV files generated from them.
refined_affinity_fpath = "../../data/pdbbind2020/index/INDEX_refined_data_preprocessed.2020"
general_affinity_fpath = "../../data/pdbbind2020/index/INDEX_general_PL_data_preprocessed.2020"
refined_aff_df_fpath = refined_affinity_fpath + ".csv"
general_aff_df_fpath = general_affinity_fpath + ".csv"


# ANSI escape sequences used to colour printed messages:
# R = bold red, B = bold blue, S = reset.
R = "\033[1;31m"
B = "\033[1;34m"
S = "\033[0m"

### 1. Manual modification record

PL other：
- Rename the columns : PDB, resolution, release_year, -logKd/Ki,Kd/Ki, reference, ligand_name
- Rename ligand name "FMN hq", "FMN sq", "FMN ox", ... to "FMN-hq", "FMN-sq", "FMN-ox"
- Remove "(", ")", "//"
- Line 5512 (p 5507): from "5ot3  2.04  2018   5.23  Kd=5.86uM     5ot3.pdf 9LQ,18-mer" to "9LQ/18-mer"
- Save modified file to "INDEX_general_PL_data_preprocessed.2020"
    

refined:
- Rename the columns : PDB, resolution, release_year, -logKd/Ki,Kd/Ki, reference, ligand_name
- Remove "(", ")", "//"
- Save modified file to "INDEX_refined_data_preprocessed.2020"

### 2. Affinity files generation and check

In [3]:
# Generation
# Convert each whitespace-separated, manually preprocessed index file into
# "<index file>.csv". The first line of an index file holds the column names.
# NOTE: `_dpath` and `_folder` stay bound after the loop (to the general-set
# values); the "Check affinity" cell below reads them.
for _dpath, _folder in zip(
    [refined_affinity_fpath, general_affinity_fpath],
    [refined_path, general_path],
):
    ls = []
    with open(_dpath, "r") as f:
        for line in f:
            items = line.split()
            # Skip blank lines so they do not turn into all-NaN rows.
            if items:
                ls.append(items)
    columns = ls[0]
    del ls[0]
    df = pd.DataFrame(ls, columns=columns)
    # to_csv overwrites an existing file, so no prior `rm` is needed.
    df.to_csv(f"{_dpath}.csv")

In [4]:
# Check affinity
# For BOTH data sets, report every entry of the data folder that has no row in
# the matching affinity table. Iterating explicitly avoids depending on loop
# variables left over from the generation cell, which only covered the last
# (general) set.
for _dpath, _folder in zip(
    [refined_affinity_fpath, general_affinity_fpath],
    [refined_path, general_path],
):
    _aff = pd.read_csv(f"{_dpath}.csv", index_col=0)
    _aff_pdbs = set(_aff.PDB_code.unique())
    _folder_pdbs = sorted(os.listdir(_folder))
    for _f in _folder_pdbs:
        if _f not in _aff_pdbs:
            print(_f, "has no affinity data")

In [5]:
# Get PDB list
# Every directory entry name is treated as one PDB code.
refined_pdbs, general_pdbs, nci_pdbs = (
    set(os.listdir(_dir)) for _dir in (refined_path, general_path, nci_path)
)
all_pdbs = refined_pdbs | general_pdbs
# Displayed: (#refined, #general, their sum, size of the union, #nci).
# Sum == union size means the refined and general sets do not overlap.
(
    len(refined_pdbs),
    len(general_pdbs),
    len(refined_pdbs) + len(general_pdbs),
    len(all_pdbs),
    len(nci_pdbs),
)

(5316, 14127, 19443, 19443, 15189)

We have 5316 pdbs in refined set and 14127 in general set. The two sets are disjoint.

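### 3. NCI & PDB data initialization
- Remove NCIs in the water group.
- Remove NCIs with insufficient resolution (only Resolution <= 2.5 is kept).
- Remove NCIs with the wrong CP type; only (3,-1) is kept.
- Remove NCIs whose EDs (ED_2A and ED_3A) are both negative.
- Remove NCIs with an empty LigAtomName.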

#### Step 1

In [6]:

# Load the raw NCI table and record, before any filtering, how many rows
# match each removal criterion.
_logs = []
raw_nci_data = pd.read_csv(nci_data_fpath)

_is_water = raw_nci_data["Group"] == "water"
_both_ed_negative = (raw_nci_data["ED_2A"] < 0) & (raw_nci_data["ED_3A"] < 0)
_low_resolution = raw_nci_data["Resolution"] > 2.5
_wrong_cp_type = raw_nci_data["CP_type"] != "(3::-1)"

# Summing a boolean mask counts its True entries.
_a = int(_is_water.sum())
_b = int(_both_ed_negative.sum())
_c = int(_low_resolution.sum())
_d = int(_wrong_cp_type.sum())

_logs.extend(
    [
        f"Raw water: {_a}.\n",
        f"Raw both-negative EDs: {_b}.\n",
        f"Raw insufficient Resolution: {_c}.\n",
        f"Raw wrong CP_type: {_d}.\n",
    ]
)

In [29]:
# Step 1: apply the filters one after another; after each filter, log how many
# rows were removed and how many remain.
_num = len(raw_nci_data)
nci_data = raw_nci_data[raw_nci_data["Group"]!="water"]
_logs.append(f"Remove water: {_num-len(nci_data)} removed, {len(nci_data)} remain.        -- (Group != water) kept.\n")

_num = len(nci_data)
nci_data = nci_data[(nci_data["ED_2A"]>=0)|(nci_data["ED_3A"]>=0)]
_logs.append(f"Remove both-negative EDs: {_num-len(nci_data)} removed, {len(nci_data)} remain.        -- (ED_2A>=0 or ED_3A>=0) kept.\n")

_num = len(nci_data)
nci_data = nci_data[nci_data["Resolution"]<=2.5]
_logs.append(f"Remove insufficent Resolution: {_num-len(nci_data)} removed, {len(nci_data)} remain.         -- (Resolution<=2.5) kept.\n")

_num = len(nci_data)
# Filter with a mask rather than `fillna(0, inplace=True)` on a column of a
# filtered frame: that is chained assignment, which warns today and has no
# effect under pandas Copy-on-Write (the NaN rows would then be kept).
nci_data = nci_data[nci_data["LigAtomName"].notna()]
_logs.append(f"Remove empty LigAtomName: {_num-len(nci_data)} removed, {len(nci_data)} remain.         -- (LigAtomName is not NaN) kept.\n")

_num = len(nci_data)
nci_data = nci_data[nci_data.CP_type == "(3::-1)"]
_logs.append(f"Remove wrong CP_type: {_num-len(nci_data)} removed, {len(nci_data)} remain.         -- (CP_type == \"(3::-1)\") kept.\n")

# Add ResFullID ("<ChainID>_<ResID>_<ResName>").
# Take an explicit copy first so the new column is written to an independent
# frame and not to a slice of raw_nci_data.
nci_data = nci_data.copy()
nci_data["ResFullID"] = nci_data["ChainID"]+"_"+nci_data["ResID"].astype(str)+"_"+nci_data["ResName"].astype(str)
_logs.append("Column ResFullID added.\n")



# The log file name carries the total number of removed rows; the CSV name
# carries the number of rows kept.
with open(f"{save_path}Removed.{version}.step_1_NCIs_{len(raw_nci_data)-len(nci_data)} - Initialization.txt", "w") as f:
    f.writelines(_logs)
nci_data.to_csv(f"{save_path}Intermediate.{version}.step_1_NCIs_{len(nci_data)} - Initialized.csv")

nci_data

Unnamed: 0,NCI_ID,complex_composed_id,PDB_Code,L_type,R_type,NCI_atom_pair,NCI_intuitive,NCI_ODDT,Resolution,CP_type,...,lr_id,rl_type,rec_atom_type,type,cpx,cpy,cpz,iso_value,data_version,ResFullID
40,3m35-34::13-1,3m35::M35,3m35,C.ar,O.2,C.ar::O.2,Aromatic...Hydrophilic,not_rule,2.20,(3::-1),...,34::13,O.2::C.ar,O_pi_acc,1,16.437,80.444,22.234,0.025,1.661407e+09,A_97_ASN
41,3m35-6::78-1,3m35::M35,3m35,C.ar,O.3,C.ar::O.3,Aromatic...Hydrophilic,not_rule,2.20,(3::-1),...,6::78,O.3::C.ar,O_mix,1,15.556,67.259,22.667,0.025,1.661407e+09,A_195_SER
43,3m35-8::49-1,3m35::M35,3m35,N.3,C.2,C.2::N.3,Aliphatic C...Hydrophilic,not_rule,2.20,(3::-1),...,8::49,C.2::N.3,C_pi_neg,1,21.133,66.667,28.153,0.025,1.661407e+09,A_189_ASP
46,3m35-30::106-1,3m35::M35,3m35,C.3,C.2,C.2::C.3,Aliphatic C...Aliphatic C,not_rule,2.20,(3::-1),...,30::106,C.2::C.3,C_pi,2,18.492,69.333,27.143,0.025,1.661407e+09,A_215_TRP
49,3m35-9::106-1,3m35::M35,3m35,O.2,C.2,C.2::O.2,Aliphatic C...Hydrophilic,not_rule,2.20,(3::-1),...,9::106,C.2::O.2,C_pi,1,18.198,71.556,24.255,0.025,1.661407e+09,A_215_TRP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
819910,5xhr-14::53-1,5xhr::PXH,5xhr,O.3,N.am,N.am::O.3,Hydrophilic...Hydrophilic,hbonds,1.80,(3::-1),...,14::53,N.am::O.3,N_pi,1,27.040,89.550,95.781,0.025,1.661352e+09,A_152_ASN
819912,5xhr-18::66-1,5xhr::PXH,5xhr,S.2,C.ar,C.ar::S.2,Aromatic...Hydrophilic,pi_stacking,1.80,(3::-1),...,18::66,C.ar::S.2,C_pi,2,24.067,89.405,90.515,0.025,1.661352e+09,A_221_TYR
819949,5d7x-18::52-1,5d7x::XZ8,5d7x,C.ar,C.3,C.3::C.ar,Aromatic...Aliphatic C,not_rule,1.35,(3::-1),...,18::52,C.3::C.ar,C_ali_apol,2,4.918,-12.392,5.742,0.025,1.661349e+09,A_657_VAL
819953,5d7x-13::114-1,5d7x::XZ8,5d7x,O.2,N.am,N.am::O.2,Hydrophilic...Hydrophilic,hbonds,1.35,(3::-1),...,13::114,N.am::O.2,N_pi,1,7.005,-11.875,0.736,0.025,1.661349e+09,A_708_ASN


#### Step 2-3

In [30]:
# Remove ncis without pdbbind files
# Membership masks are computed once on the step-1 table and reused below.
_in_pdbbind = nci_data.PDB_Code.isin(all_pdbs)
_has_ligand_dir = nci_data.PDB_Code.isin(nci_pdbs)

drop_1 = nci_data[~_in_pdbbind]
drop_1.to_csv(f"{save_path}Removed.{version}.step_2_NCIs_{len(drop_1)} - Dropped without PDBBind files.csv")

drop_2 = nci_data[~_has_ligand_dir]
drop_2.to_csv(f"{save_path}Removed.{version}.step_3_NCIs_{len(drop_2)} - Dropped without ligand files.csv")

# Keep only NCIs whose PDB code is present in both places.
nci_data = nci_data[_in_pdbbind & _has_ligand_dir]


# Check ncis
# From here on `nci_pdbs` means "PDB codes that still have NCI rows", no
# longer "entries of the nci_path directory".
nci_pdbs = set(nci_data.PDB_Code.unique())
all_pdbs = refined_pdbs.union(general_pdbs)
# Passing nci_pdbs again as an argument is redundant; it is kept so the
# resulting set is built exactly as before.
all_nci_pdbs = nci_pdbs.intersection(all_pdbs, nci_pdbs)
refined_nci_pdbs = refined_pdbs.intersection(all_nci_pdbs)
general_nci_pdbs = general_pdbs.intersection(all_nci_pdbs)


# PDB entries of each set for which no NCI row is left.
refined_without_nci_pdbs = refined_pdbs - refined_nci_pdbs
general_without_nci_pdbs = general_pdbs - general_nci_pdbs

# Renumber the rows, preserving the previous index as "original_index".
nci_data = (
    nci_data[nci_data.PDB_Code.isin(all_nci_pdbs)]
    .reset_index()
    .rename(columns={"index": "original_index"})
)

_rows = [[_code, "refined"] for _code in refined_without_nci_pdbs]
_rows += [[_code, "general"] for _code in general_without_nci_pdbs]
drop_3 = pd.DataFrame(_rows, columns=["pdb_code", "pdb_group"])
drop_3.to_csv(f"{save_path}Removed.{version}.step_4_PDBs_{len(drop_3)} - Dropped without NCI.csv")
nci_data.to_csv(f"{save_path}Intermediate.{version}.step_4_NCIs_{len(nci_data)} - With Coherent Files.csv")

In [31]:
# Check ligand name uniqueness for each pdb_code
# One output row per (PDB_Code, LigName) pair; the remaining columns hold the
# non-null counts of that pair.
_name_check_cols = ["PDB_Code", "LigName", "LigAtomName", "ResFullID", "CP_type", "Group"]
nci_ligand_name_data = (
    nci_data[_name_check_cols]
    .groupby(["PDB_Code", "LigName"])
    .count()
    .reset_index()
)
nci_ligand_name_data

Unnamed: 0,PDB_Code,LigName,LigAtomName,ResFullID,CP_type,Group
0,10gs,VWW,11,11,11,11
1,13gs,SAS,2,2,2,2
2,184l,I4B,2,2,2,2
3,185l,IND,1,1,1,1
4,186l,N4B,1,1,1,1
...,...,...,...,...,...,...
12326,6uvy,QJP,7,7,7,7
12327,6uwp,QKA,3,3,3,3
12328,6uwv,QK7,5,5,5,5
12329,6v1c,NDG,7,7,7,7


In [32]:
# Displayed: (number of (PDB_Code, LigName) rows, number of distinct PDB codes).
# Equal values mean every PDB code appears with exactly one ligand name.
_pdb_code_col = nci_ligand_name_data["PDB_Code"]
len(_pdb_code_col), len(_pdb_code_col.unique())

(12331, 12331)

Since we have 12331 unique pdb_codes and 12331 rows, each pdb is related to a unique ligname in nci.

### 4. Ligand name coherence.

In [33]:
# For every PDB entry that still has NCI data, collect three versions of the
# ligand name (from the ligand .pdb file, from the affinity table and from the
# NCI table) together with the affinity and release year. Entries that cannot
# be processed are recorded in drop_4 with a reason.
from Bio.PDB import PDBParser

parser = PDBParser()
pdb_df_list = []
ligand_df_list = []
drop_4 = []

# Unique LigName values per PDB code, computed once instead of scanning the
# whole NCI table for every entry inside the loop.
_lig_names_by_pdb = nci_data.groupby("PDB_Code")["LigName"].unique().to_dict()

# The same procedure is applied to the refined set and then the general set.
for (_name, _pdb_list, _pdb_path, _aff_fpath) in zip(
    ["refined", "general"],
    [refined_nci_pdbs, general_nci_pdbs],
    [refined_path, general_path],
    [refined_aff_df_fpath, general_aff_df_fpath],
):
    print(f"Processing {B}.pdb{S} files for {R}{_name}{S}:")
    aff_df = pd.read_csv(_aff_fpath, index_col=0)
    for _pdb in tqdm(_pdb_list, total=len(_pdb_list)):
        _ligand_fpath = f"{nci_path}{_pdb}/{_pdb}_ligand.pdb"
        model = parser.get_structure("pdb", _ligand_fpath)[0]

        # First field of each residue id, collected over all chains of the model.
        ligand_name_set = {
            _residue.get_full_id()[3][0]
            for _chain in model
            for _residue in _chain
        }
        if len(ligand_name_set) != 1:
            drop_4.append([_pdb, _ligand_fpath, f"More than one ligand name : {str(ligand_name_set)}"])
            continue
        ligand_name = next(iter(ligand_name_set)).replace("H_", "")

        aff_line = aff_df[(aff_df["PDB_code"] == _pdb)]
        if len(aff_line) != 1:
            # Report the row count: no affinity value has been read for this
            # entry at this point, so there is nothing else meaningful to show.
            drop_4.append([_pdb, _ligand_fpath, f"Error with affinity data : {len(aff_line)} matching rows in the affinity table"])
            continue

        aff = aff_line["-logKd/Ki"].values[0]
        release_year = aff_line["release_year"].values[0]
        # `.values[0]` yields numpy scalars; numpy integers are not instances
        # of the builtin int, so they are accepted explicitly.
        if not isinstance(aff, (int, float, np.integer, np.floating)):
            drop_4.append([_pdb, _ligand_fpath, f"Error with affinity data : {aff}"])
            continue

        ligand_name2 = aff_line["ligand_name"].values[0]
        # String form of the array of unique NCI ligand names with brackets and
        # quotes stripped; a single name therefore yields just that name.
        _nci_names = _lig_names_by_pdb.get(_pdb, np.array([], dtype=object))
        ligand_name3 = str(_nci_names).replace("[", "").replace("]", "").replace("'", "")

        pdb_df_list.append([_pdb, f"{_pdb_path}{_pdb}/{_pdb}_protein.pdb", _name, release_year])
        ligand_df_list.append([_pdb, ligand_name, ligand_name2, ligand_name3, _ligand_fpath, aff, _name, release_year])


Processing [1;34m.pdb[0m files for [1;31mrefined[0m:


  0%|          | 0/4449 [00:00<?, ?it/s]



Processing [1;34m.pdb[0m files for [1;31mgeneral[0m:


  0%|          | 0/7882 [00:00<?, ?it/s]



In [34]:
# Step 5: turn the collected lists into tables and save them. Each file name
# carries the row count of the table it contains.
pdb_df = pd.DataFrame(pdb_df_list, columns=["pdb_code", "pdb_fpath", "datagroup", "release_year"])
_s = f"{save_path}Intermediate.{version}.step_5_PDBs_{len(pdb_df)} - With LigName.csv"
pdb_df.to_csv(_s)
print(f"Saved {R}pdb_df{S} to {B}{_s}{S}")

ligand_df = pd.DataFrame(ligand_df_list, columns=["pdb_code", "ligand_name_ncifile", "ligand_name_aff", "ligand_name_nci", "ligand_fpath", "affinity", "datagroup", "release_year"])
# Use the ligand table's own length in its file name (previously pdb_df's).
_s = f"{save_path}Intermediate.{version}.step_5_Ligands_{len(ligand_df)} - With LigName.csv"
ligand_df.to_csv(_s)
print(f"Saved {R}ligand_df{S} to {B}{_s}{S}")

# From here on drop_4 is a DataFrame instead of a list of rows.
drop_4 = pd.DataFrame(drop_4, columns=["pdb_code", "ligand_fpath", "error_info"])
_s = f"{save_path}Removed.{version}.step_5_PDBs_{len(drop_4)} - Dropped with LigName Generation Error.csv"
drop_4.to_csv(_s)
print(f"Saved {R}drop_4{S} to {B}{_s}{S}")

Saved [1;31mpdb_df[0m to [1;34m./Inputs/22ETE/v8.30/Intermediate.v8.30.step_5_PDBs_12331 - With LigName.csv[0m
Saved [1;31mligand_df[0m to [1;34m./Inputs/22ETE/v8.30/Intermediate.v8.30.step_5_Ligands_12331 - With LigName.csv[0m
Saved [1;31mdrop_4[0m to [1;34m./Inputs/22ETE/v8.30/Removed.v8.30.step_5_PDBs_0 - Dropped with LigName Generation Error.csv[0m


In [35]:
# Step 6: keep only entries whose ligand name is the same in the ligand file,
# the affinity table and the NCI table; write the final Data.* files, the
# Removed.* files and a short summary.
_names_agree = (
    (ligand_df["ligand_name_ncifile"] == ligand_df["ligand_name_aff"])
    & (ligand_df["ligand_name_ncifile"] == ligand_df["ligand_name_nci"])
)

ligand_coherent_df = ligand_df[_names_agree].reset_index(drop=True)
_s = f"{save_path}Data.{version}.Ligands.csv"
ligand_coherent_df.to_csv(_s)
print(f"Saved {R}ligand_coherent_df{S} to {B}{_s}{S}")

# Complement of the coherent rows: at least one of the two comparisons fails.
drop_5 = ligand_df[~_names_agree]
_s = f"{save_path}Removed.{version}.step_6_Ligands_{len(drop_5)} - Dropped with incoherent LigName.csv"
drop_5.to_csv(_s)
print(f"Saved {R}drop_5: ligand with incoherent name{S} to {B}{_s}{S}")

# PDB codes of the dropped ligands; used to split the PDB and NCI tables.
_incoherent_pdbs = drop_5.pdb_code.unique()
_pdb_is_dropped = pdb_df.pdb_code.isin(_incoherent_pdbs)
_nci_is_dropped = nci_data.PDB_Code.isin(_incoherent_pdbs)

pdb_coherent_df = pdb_df[~_pdb_is_dropped].reset_index(drop=True)
_s = f"{save_path}Data.{version}.PDBs.csv"
pdb_coherent_df.to_csv(_s)
print(f"Saved {R}pdb_coherent_df{S} to {B}{_s}{S}")

drop_6 = pdb_df[_pdb_is_dropped]
_s = f"{save_path}Removed.{version}.step_6_PDBs_{len(drop_6)} - Dropped with incoherent LigName.csv"
drop_6.to_csv(_s)
print(f"Saved {R}drop_6: pdb with incoherent name{S} to {B}{_s}{S}")

nci_data_coherent = nci_data[~_nci_is_dropped].reset_index(drop=True)
_s = f"{save_path}Data.{version}.NCIs.csv"
nci_data_coherent.to_csv(_s)
print(f"Saved {R}nci_data_coherent{S} to {B}{_s}{S}")

drop_7 = nci_data[_nci_is_dropped]
_s = f"{save_path}Removed.{version}.step_6_NCIs_{len(drop_7)} - Dropped with incoherent LigName.csv"
drop_7.to_csv(_s)
print(f"Saved {R}drop_7: NCI with incoherent name{S} to {B}{_s}{S}")

# The ligand count is taken from the ligand table itself (previously the PDB
# table's length was reported for both).
with open(f"{save_path}Datainfo.txt", "w") as f:
    f.write(f"Data version: {version}\n")
    f.write(f"PDBs: {len(pdb_coherent_df)}, Ligands: {len(ligand_coherent_df)}, NCIs: {len(nci_data_coherent)}")

print(f"\nProcessed files: ")
_s = f"{save_path}Data.{version}.Ligands.csv"
print(f"{R}Ligand{B}:    {B}{_s}{S}   num: {R}{len(ligand_coherent_df)}{S}")
_s = f"{save_path}Data.{version}.PDBs.csv"
print(f"{R}Proteins{B}:  {B}{_s}{S}      num: {R}{len(pdb_coherent_df)}{S}")
_s = f"{save_path}Data.{version}.NCIs.csv"
print(f"{R}NCI Table{B}: {B}{_s}{S}      num: {R}{len(nci_data_coherent)}{S}")

Saved [1;31mligand_coherent_df[0m to [1;34m./Inputs/22ETE/v8.30/Data.v8.30.Ligands.csv[0m
Saved [1;31mdrop_5: ligand with incoherent name[0m to [1;34m./Inputs/22ETE/v8.30/Removed.v8.30.step_6_Ligands_285 - Dropped with incoherent LigName.csv[0m
Saved [1;31mpdb_coherent_df[0m to [1;34m./Inputs/22ETE/v8.30/Data.v8.30.PDBs.csv[0m
Saved [1;31mdrop_6: pdb with incoherent name[0m to [1;34m./Inputs/22ETE/v8.30/Removed.v8.30.step_6_PDBs_285 - Dropped with incoherent LigName.csv[0m
Saved [1;31mnci_data_coherent[0m to [1;34m./Inputs/22ETE/v8.30/Data.v8.30.NCIs.csv[0m
Saved [1;31mdrop_7: NCI with incoherent name[0m to [1;34m./Inputs/22ETE/v8.30/Removed.v8.30.step_6_NCIs_1342 - Dropped with incoherent LigName.csv[0m

Processed files: 
[1;31mLigand[1;34m:    [1;34m./Inputs/22ETE/v8.30/Data.v8.30.Ligands.csv[0m   num: [1;31m12046[0m
[1;31mProteins[1;34m:  [1;34m./Inputs/22ETE/v8.30/Data.v8.30.PDBs.csv[0m      num: [1;31m12046[0m
[1;31mNCI Table[1;34m: [1;34m./I