# sx.nciyes.process.<font color="red">new</font>.ipynb

Notebook for processing the input data.
New nci data will be used.
Data in <font color="red">data/HBPDB/Ligand</font> and <font color="red">data/pdbbind2020/</font> are used.

In [176]:
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [177]:
# configs
version = "v9.1"
# Processing modes handled by the cells below; each cell checks cfg_mode.
mode_ls = ["tankbind", "nciyes", "frag"]
cfg_mode = "tankbind"
# Fail early on a typo: an unknown mode would otherwise skip every cell silently.
if cfg_mode not in mode_ls:
    raise ValueError(f"Unknown cfg_mode {cfg_mode!r}; expected one of {mode_ls}")

# You needn't modify the settings below under normal conditions.
# All output files are written under this directory.
save_path = f"./Inputs/{cfg_mode}/{version}/"
# Portable equivalent of `mkdir -p` (no shell involved, raises on real failures).
os.makedirs(save_path, exist_ok=True)

# Directory with one sub-folder per PDB code holding <pdb>_ligand.pdb files.
nci_path = "../../dataspace/pdb_bind_2020/"
# CSV table with the NCI records.
nci_data_fpath = "../../dataspace/exptnci_220825.csv"

refined_path = "../../data/pdbbind2020/refined-set/" # for .pdb files
general_path = "../../data/pdbbind2020/v2020-other-PL/"



# Manually preprocessed, whitespace-separated index files and the CSV files
# generated from them.
refined_affinity_fpath = "../../data/pdbbind2020/index/INDEX_refined_data_preprocessed.2020"
general_affinity_fpath = "../../data/pdbbind2020/index/INDEX_general_PL_data_preprocessed.2020"
refined_aff_df_fpath = refined_affinity_fpath + ".csv"
general_aff_df_fpath = general_affinity_fpath + ".csv"


# ANSI escape sequences used to colour console messages.
R = "\033[1;31m"  # bold red
B = "\033[1;34m"  # bold blue
S = "\033[0m"     # reset

### 1. Manual modification record

PL other:
- Rename the columns to: PDB_code, resolution, release_year, -logKd/Ki, Kd/Ki, reference, ligand_name
- Rename ligand name "FMN hq", "FMN sq", "FMN ox", ... to "FMN-hq", "FMN-sq", "FMN-ox"
- Remove "(", ")", "//"
- Line 5512 (p 5507): from "5ot3  2.04  2018   5.23  Kd=5.86uM     5ot3.pdf 9LQ,18-mer" to "9LQ/18-mer"
- Save modified file to "INDEX_general_PL_data_preprocessed.2020"
    

refined:
- Rename the columns to: PDB_code, resolution, release_year, -logKd/Ki, Kd/Ki, reference, ligand_name
- Remove "(", ")", "//"
- Save modified file to "INDEX_refined_data_preprocessed.2020"

### 2. Affinity files generation and check

In [178]:
if True:
    # For each index file: convert the whitespace-separated text into a CSV,
    # then report every entry of the matching data folder that has no row in it.
    # The check runs inside the loop so that both the refined and the general
    # set are verified, not only the last one.
    for _dpath, _folder in zip([refined_affinity_fpath, general_affinity_fpath], [refined_path, general_path]):
        # Generation
        with open(_dpath, "r") as f:
            ls = [line.split() for line in f]
        # The first line of the index file holds the column names.
        columns = ls[0]
        del ls[0]
        df = pd.DataFrame(ls, columns=columns)
        # to_csv overwrites an existing file, so no prior removal is needed.
        df.to_csv(f"{_dpath}.csv")

        # Check affinity
        _aff = pd.read_csv(f"{_dpath}.csv", index_col=0)
        _aff_pdbs = set(_aff.PDB_code.unique())
        _folder_pdbs = sorted(os.listdir(_folder))
        for _f in _folder_pdbs:
            if _f not in _aff_pdbs:
                print(_f, "has no affinity data")

In [179]:
# Collect the entry names found in each data directory.
refined_pdbs = set(os.listdir(refined_path))
general_pdbs = set(os.listdir(general_path))
all_pdbs = refined_pdbs.union(general_pdbs)
nci_pdbs = set(os.listdir(nci_path))
# Echo the sizes: refined, general, their sum, their union, and the NCI folder.
(
    len(refined_pdbs),
    len(general_pdbs),
    len(refined_pdbs) + len(general_pdbs),
    len(all_pdbs),
    len(nci_pdbs),
)

(5316, 14127, 19443, 19443, 15189)

We have 5316 pdbs in refined set and 14127 in general set. The two sets are disjoint.

### 2.5 Frag and tankbind data

In [180]:
if cfg_mode == "frag" or cfg_mode == "tankbind":
    # Build the protein and ligand tables for the refined and general sets:
    #   1. read the ligand name from each <pdb>_ligand.mol2 file,
    #   2. attach affinity / release year / ligand name from the affinity CSV,
    #   3. keep only entries whose two ligand names agree,
    # and write the kept and the dropped rows to CSV files under save_path.
    from Bio.PDB import PDBParser
    # NOTE(review): parser is not used in this cell; kept so the notebook-level
    # name stays defined.
    parser = PDBParser()
    pdb_df_list = []
    ligand_df_list = []
    drop_4 = []

    for (_name, _pdb_list, _pdb_path, _aff_fpath) in zip(
        ["refined","general"],[refined_pdbs, general_pdbs], [refined_path, general_path], [refined_aff_df_fpath, general_aff_df_fpath]
        ):
        print(f"Processing {B}.pdb{S} files for {R}{_name}{S}:")
        aff_df = pd.read_csv(_aff_fpath, index_col=0)
        for _pdb in tqdm(_pdb_list, total=len(_pdb_list)):
            _ligand_sdf = f"{_pdb_path}{_pdb}/{_pdb}_ligand.sdf"

            with open(f"{_pdb_path}{_pdb}/{_pdb}_ligand.mol2") as f:
                lines = f.readlines()
            # The ligand name is the second-to-last field of the line that
            # follows the "@<TRIPOS>ATOM" marker. Default to "ERROR" up front so
            # that an empty file cannot reuse the previous entry's name.
            ligand_name = "ERROR"
            for _idx, _text in enumerate(lines[:-1]):
                if "@<TRIPOS>ATOM" in _text:
                    ligand_name = lines[_idx + 1].split()[-2]
                    break

            aff_line = aff_df[(aff_df["PDB_code"]== _pdb)]

            # Exactly one affinity row is required. Report the row count here:
            # no affinity value has been read yet for this entry.
            if len(aff_line) != 1:
                drop_4.append([_pdb, _ligand_sdf, f"Error with affinity data : expected exactly one affinity row, found {len(aff_line)}"])
                continue

            aff = aff_line["-logKd/Ki"].values[0]
            release_year = aff_line["release_year"].values[0]
            # numpy integer scalars are not instances of Python int, so they are
            # accepted explicitly alongside the builtin numeric types.
            if not isinstance(aff, (int, float, np.integer, np.floating)):
                drop_4.append([_pdb, _ligand_sdf, f"Error with affinity data : {aff}"])
                continue

            ligand_name2 = aff_line['ligand_name'].values[0]
            # NOTE(review): the protein path ends in "_protein.sdf" while the
            # nciyes cell uses "_protein.pdb" - confirm this suffix is intended.
            pdb_df_list.append([_pdb, f"{_pdb_path}{_pdb}/{_pdb}_protein.sdf", _name, release_year])
            ligand_df_list.append([_pdb, ligand_name, ligand_name2, _ligand_sdf, aff, _name, release_year])

    pdb_df = pd.DataFrame(pdb_df_list, columns=["pdb_code", "pdb_fpath", "datagroup", "release_year"])
    _s = f"{save_path}Intermediate.{version}.step_1_PDBs_{len(pdb_df)} - With LigName.csv"
    pdb_df.to_csv(_s)
    print(f"Saved {R}pdb_df{S} to {B}{_s}{S}")

    ligand_df = pd.DataFrame(ligand_df_list, columns=["pdb_code", "ligand_name_file", "ligand_name_aff", "ligand_fpath", "affinity", "datagroup", "release_year"])
    _s = f"{save_path}Intermediate.{version}.step_1_Ligands_{len(ligand_df)} - With LigName.csv"
    ligand_df.to_csv(_s)
    print(f"Saved {R}ligand_df{S} to {B}{_s}{S}")

    drop_4 = pd.DataFrame(drop_4, columns=["pdb_code", "ligand_fpath", "error_info"])
    _s = f"{save_path}Removed.{version}.step_1_PDBs_{len(drop_4)} - Dropped with LigName Generation Error.csv"
    drop_4.to_csv(_s)
    print(f"Saved {R}drop_1{S} to {B}{_s}{S}")

    # Ligands whose name in the mol2 file matches the name in the affinity table.
    _names_match = ligand_df["ligand_name_file"] == ligand_df["ligand_name_aff"]
    ligand_coherent_df = ligand_df[_names_match].reset_index(drop=True)
    _s = f"{save_path}Data.{version}.Ligands.csv"
    ligand_coherent_df.to_csv(_s)
    print(f"Saved {R}ligand_coherent_df{S} to {B}{_s}{S}")

    drop_5 = ligand_df[(ligand_df["ligand_name_file"]!=ligand_df["ligand_name_aff"])]
    _s = f"{save_path}Removed.{version}.step_2_Ligands_{len(drop_5)} - Dropped with incoherent LigName.csv"
    drop_5.to_csv(_s)
    print(f"Saved {R}drop_2: ligand with incoherent name{S} to {B}{_s}{S}")

    # Split the protein table by whether its ligand was dropped above.
    _incoherent = pdb_df.pdb_code.isin(drop_5.pdb_code.unique())
    pdb_coherent_df = pdb_df[~_incoherent].reset_index(drop=True)
    _s = f"{save_path}Data.{version}.PDBs.csv"
    pdb_coherent_df.to_csv(_s)
    print(f"Saved {R}pdb_coherent_df{S} to {B}{_s}{S}")
    drop_6 = pdb_df[_incoherent]
    _s = f"{save_path}Removed.{version}.step_2_PDBs_{len(drop_6)} - Dropped with incoherent LigName.csv"
    drop_6.to_csv(_s)
    print(f"Saved {R}drop_3: pdb with incoherent name{S} to {B}{_s}{S}")

    print(f"\nProcessed files: ")
    _s = f"{save_path}Data.{version}.Ligands.csv"
    # Report the ligand count from the ligand table itself.
    print(f"{R}Ligand{B}:    {B}{_s}{S}   num: {R}{len(ligand_coherent_df)}{S}")
    _s = f"{save_path}Data.{version}.PDBs.csv"
    print(f"{R}Proteins{B}:  {B}{_s}{S}      num: {R}{len(pdb_coherent_df)}{S}")

Processing [1;34m.pdb[0m files for [1;31mrefined[0m:


  0%|          | 0/5316 [00:00<?, ?it/s]

Processing [1;34m.pdb[0m files for [1;31mgeneral[0m:


  0%|          | 0/14127 [00:00<?, ?it/s]

Saved [1;31mpdb_df[0m to [1;34m./Inputs/tankbind/v9.1/Intermediate.v9.1.step_1_PDBs_19443 - With LigName.csv[0m
Saved [1;31mligand_df[0m to [1;34m./Inputs/tankbind/v9.1/Intermediate.v9.1.step_1_Ligands_19443 - With LigName.csv[0m
Saved [1;31mdrop_1[0m to [1;34m./Inputs/tankbind/v9.1/Removed.v9.1.step_1_PDBs_0 - Dropped with LigName Generation Error.csv[0m
Saved [1;31mligand_coherent_df[0m to [1;34m./Inputs/tankbind/v9.1/Data.v9.1.Ligands.csv[0m
Saved [1;31mdrop_2: ligand with incoherent name[0m to [1;34m./Inputs/tankbind/v9.1/Removed.v9.1.step_2_Ligands_3129 - Dropped with incoherent LigName.csv[0m
Saved [1;31mpdb_coherent_df[0m to [1;34m./Inputs/tankbind/v9.1/Data.v9.1.PDBs.csv[0m
Saved [1;31mdrop_3: pdb with incoherent name[0m to [1;34m./Inputs/tankbind/v9.1/Removed.v9.1.step_2_PDBs_3129 - Dropped with incoherent LigName.csv[0m

Processed files: 
[1;31mLigand[1;34m:    [1;34m./Inputs/tankbind/v9.1/Data.v9.1.Ligands.csv[0m   num: [1;31m16314[0m
[1;31

### 3. NCI & PDB data initialization
- Remove water group.
- Remove entries with insufficient resolution (only Resolution <= 2.5 is kept).
- Remove wrong CP type, only (3,-1) will be kept.
- Remove NCI with both negative EDs.
- Remove empty ligatomname

#### Step 1

In [168]:
if cfg_mode == "nciyes":
    _logs = []
    raw_nci_data = pd.read_csv(nci_data_fpath)

    # Count, independently of one another, how many raw rows match each
    # removal criterion.
    _water_mask = raw_nci_data["Group"] == "water"
    _both_negative_mask = (raw_nci_data["ED_2A"] < 0) & (raw_nci_data["ED_3A"] < 0)
    _low_resolution_mask = raw_nci_data["Resolution"] > 2.5
    _wrong_cp_mask = raw_nci_data["CP_type"] != "(3::-1)"

    _a = int(_water_mask.sum())
    _b = int(_both_negative_mask.sum())
    _c = int(_low_resolution_mask.sum())
    _d = int(_wrong_cp_mask.sum())

    _logs.extend([
        f"Raw water: {_a}.\n",
        f"Raw both-negative EDs: {_b}.\n",
        f"Raw insufficient Resolution: {_c}.\n",
        f"Raw wrong CP_type: {_d}.\n",
    ])

In [169]:
if cfg_mode == "nciyes":
    # Apply the removal criteria one after another, logging for each step how
    # many rows were removed and how many remain, then write the log and the
    # filtered table under save_path.
    _num = len(raw_nci_data)
    nci_data = raw_nci_data[raw_nci_data["Group"]!="water"]
    _logs.append(f"Remove water: {_num-len(nci_data)} removed, {len(nci_data)} remain.        -- (Group != water) kept.\n")

    _num = len(nci_data)
    nci_data = nci_data[(nci_data["ED_2A"]>=0)|(nci_data["ED_3A"]>=0)]
    _logs.append(f"Remove both-negative EDs: {_num-len(nci_data)} removed, {len(nci_data)} remain.        -- (ED_2A>=0 or ED_3A>=0) kept.\n")

    _num = len(nci_data)
    nci_data=nci_data[nci_data["Resolution"]<=2.5]
    _logs.append(f"Remove insufficent Resolution: {_num-len(nci_data)} removed, {len(nci_data)} remain.         -- (Resolution<=2.5) kept.\n")

    _num = len(nci_data)
    # Filter on missing values directly. The previous chained
    # `nci_data["LigAtomName"].fillna(0, inplace=True)` acted on a temporary
    # object and leaves nci_data untouched under pandas copy-on-write, so the
    # NaN rows were not removed there.
    nci_data = nci_data[nci_data["LigAtomName"].notna()]
    _logs.append(f"Remove empty LigAtomName: {_num-len(nci_data)} removed, {len(nci_data)} remain.         -- (LigAtomName is not NaN) kept.\n")      

    _num = len(nci_data)
    # Take an explicit copy so the column added below is written to an
    # independent frame rather than to a slice of raw_nci_data.
    nci_data = nci_data[nci_data.CP_type == "(3::-1)"].copy()
    _logs.append(f"Remove wrong CP_type: {_num-len(nci_data)} removed, {len(nci_data)} remain.         -- (CP_type == \"(3::-1)\") kept.\n")

    # Add ResFullID
    nci_data["ResFullID"] = nci_data["ChainID"]+"_"+nci_data["ResID"].astype(str)+"_"+nci_data["ResName"].astype(str)
    _logs.append("Column ResFullID added.\n")



    with open(f"{save_path}Removed.{version}.step_1_NCIs_{len(raw_nci_data)-len(nci_data)} - Initialization.txt", "w") as f:
        f.writelines(_logs)
    nci_data.to_csv(f"{save_path}Intermediate.{version}.step_1_NCIs_{len(nci_data)} - Initialized.csv")


#### Step 2-3

In [170]:
if cfg_mode == "nciyes":
    # Rows whose PDB code is present in the PDBbind folders / in the NCI folder.
    _in_pdbbind = nci_data.PDB_Code.isin(all_pdbs)
    _in_nci_folder = nci_data.PDB_Code.isin(nci_pdbs)

    # Remove ncis without pdbbind files
    drop_1 = nci_data[~_in_pdbbind]
    drop_1.to_csv(f"{save_path}Removed.{version}.step_2_NCIs_{len(drop_1)} - Dropped without PDBBind files.csv")

    # Remove ncis without ligand files
    drop_2 = nci_data[~_in_nci_folder]
    drop_2.to_csv(f"{save_path}Removed.{version}.step_3_NCIs_{len(drop_2)} - Dropped without ligand files.csv")

    nci_data = nci_data[_in_pdbbind & _in_nci_folder]

    # Check ncis: from here on nci_pdbs holds the codes that still have NCI rows.
    nci_pdbs = set(nci_data.PDB_Code.unique())
    all_pdbs = refined_pdbs | general_pdbs
    all_nci_pdbs = nci_pdbs & all_pdbs
    refined_nci_pdbs = refined_pdbs & all_nci_pdbs
    general_nci_pdbs = general_pdbs & all_nci_pdbs

    # PDBbind entries of each set that have no NCI rows left.
    refined_without_nci_pdbs = refined_pdbs - refined_nci_pdbs
    general_without_nci_pdbs = general_pdbs - general_nci_pdbs

    nci_data = nci_data[nci_data.PDB_Code.isin(all_nci_pdbs)]
    # Keep the previous row labels in a column named "original_index".
    nci_data = nci_data.reset_index().rename(columns={"index":"original_index"})

    _dropped_rows = [[_code, "refined"] for _code in refined_without_nci_pdbs]
    _dropped_rows += [[_code, "general"] for _code in general_without_nci_pdbs]
    drop_3 = pd.DataFrame(_dropped_rows, columns=["pdb_code", "pdb_group"])
    drop_3.to_csv(f"{save_path}Removed.{version}.step_4_PDBs_{len(drop_3)} - Dropped without NCI.csv")
    nci_data.to_csv(f"{save_path}Intermediate.{version}.step_4_NCIs_{len(nci_data)} - With Coherent Files.csv")

In [171]:
if cfg_mode == "nciyes":
    # One row per (PDB_Code, LigName) pair, with non-null counts of the other
    # selected columns; used to check ligand name uniqueness per pdb code.
    _selected_columns = ["PDB_Code", "LigName", "LigAtomName", "ResFullID", "CP_type", "Group"]
    nci_ligand_name_data = (
        nci_data[_selected_columns]
        .groupby(["PDB_Code", "LigName"])
        .count()
        .reset_index()
    )

In [172]:
if cfg_mode == "nciyes":
    # Row count vs. number of distinct PDB codes: equal values mean every pdb
    # code maps to a single ligand name. An expression nested inside `if` is not
    # echoed by the notebook, so the pair is printed explicitly.
    print((len(nci_ligand_name_data['PDB_Code']), len(nci_ligand_name_data['PDB_Code'].unique())))

Since we have 10227 unique pdb_codes and 10227 rows, each pdb is related to a unique ligname in nci.

### 4. Ligand name coherence.

In [173]:
if cfg_mode == "nciyes":
    # For every PDB code that still has NCI rows, collect three ligand names
    # (from the ligand .pdb file, from the affinity table, from the NCI table)
    # together with affinity and release year. Entries that cannot be resolved
    # are recorded in drop_4 with the reason.
    from Bio.PDB import PDBParser
    parser = PDBParser()
    pdb_df_list = []
    ligand_df_list = []
    drop_4 = []

    for (_name, _pdb_list, _pdb_path, _aff_fpath) in zip(
        ["refined","general"],[refined_nci_pdbs, general_nci_pdbs], [refined_path, general_path], [refined_aff_df_fpath, general_aff_df_fpath]
        ):
        print(f"Processing {B}.pdb{S} files for {R}{_name}{S}:")
        aff_df = pd.read_csv(_aff_fpath, index_col=0)
        for _pdb in tqdm(_pdb_list, total=len(_pdb_list)):
            _ligand_pdb = f"{nci_path}{_pdb}/{_pdb}_ligand.pdb"

            # Gather the first field of each residue id in the first model.
            model = parser.get_structure("pdb", _ligand_pdb)[0]
            ligand_name_set = set()
            for _chain in model:
                for _residue in _chain:
                    ligand_name_set.add(_residue.get_full_id()[3][0])
            if len(ligand_name_set) != 1:
                drop_4.append([_pdb, _ligand_pdb, f"More than one ligand name : {str(ligand_name_set)}"])
                continue
            ligand_name = list(ligand_name_set)[0].replace("H_","")

            aff_line = aff_df[(aff_df["PDB_code"]== _pdb)]

            # Exactly one affinity row is required. Report the row count here:
            # no affinity value has been read yet for this entry.
            if len(aff_line) != 1:
                drop_4.append([_pdb, _ligand_pdb, f"Error with affinity data : expected exactly one affinity row, found {len(aff_line)}"])
                continue

            aff = aff_line["-logKd/Ki"].values[0]
            release_year = aff_line["release_year"].values[0]
            # numpy integer scalars are not instances of Python int, so they are
            # accepted explicitly alongside the builtin numeric types.
            if not isinstance(aff, (int, float, np.integer, np.floating)):
                drop_4.append([_pdb, _ligand_pdb, f"Error with affinity data : {aff}"])
                continue

            ligand_name2 = aff_line['ligand_name'].values[0]
            # Distinct NCI ligand names for this entry, space-separated. Joining
            # the values avoids depending on how the array type prints itself.
            _nci_names = nci_data[(nci_data["PDB_Code"]==_pdb)].LigName.unique()
            ligand_name3 = " ".join(str(_n) for _n in _nci_names)
            pdb_df_list.append([_pdb, f"{_pdb_path}{_pdb}/{_pdb}_protein.pdb", _name, release_year])
            ligand_df_list.append([_pdb, ligand_name, ligand_name2, ligand_name3, _ligand_pdb, aff, _name, release_year])


In [174]:
if cfg_mode == "nciyes":
    def _save_table(_table, _path, _label):
        """Write _table to _path as CSV, announce it, and return the path."""
        _table.to_csv(_path)
        print(f"Saved {R}{_label}{S} to {B}{_path}{S}")
        return _path

    # Proteins collected in the previous cell.
    pdb_df = pd.DataFrame(
        pdb_df_list,
        columns=["pdb_code", "pdb_fpath", "datagroup", "release_year"],
    )
    _s = _save_table(
        pdb_df,
        f"{save_path}Intermediate.{version}.step_5_PDBs_{len(pdb_df)} - With LigName.csv",
        "pdb_df",
    )

    # Ligands collected in the previous cell (file name carries the pdb_df row count).
    ligand_df = pd.DataFrame(
        ligand_df_list,
        columns=["pdb_code", "ligand_name_ncifile", "ligand_name_aff", "ligand_name_nci", "ligand_fpath", "affinity", "datagroup", "release_year"],
    )
    _s = _save_table(
        ligand_df,
        f"{save_path}Intermediate.{version}.step_5_Ligands_{len(pdb_df)} - With LigName.csv",
        "ligand_df",
    )

    # Entries rejected while collecting ligand names.
    drop_4 = pd.DataFrame(drop_4, columns=["pdb_code", "ligand_fpath", "error_info"])
    _s = _save_table(
        drop_4,
        f"{save_path}Removed.{version}.step_5_PDBs_{len(drop_4)} - Dropped with LigName Generation Error.csv",
        "drop_4",
    )

In [175]:
if cfg_mode == "nciyes":
    # Keep only entries whose three ligand names agree, split the ligand,
    # protein and NCI tables into kept / dropped parts, write all of them under
    # save_path and summarise the final counts.
    _names_agree = (ligand_df["ligand_name_ncifile"]==ligand_df["ligand_name_aff"])&(ligand_df["ligand_name_ncifile"]==ligand_df["ligand_name_nci"])
    ligand_coherent_df = ligand_df[_names_agree].reset_index(drop=True)
    _s = f"{save_path}Data.{version}.Ligands.csv"
    ligand_coherent_df.to_csv(_s)
    print(f"Saved {R}ligand_coherent_df{S} to {B}{_s}{S}")

    drop_5 = ligand_df[(ligand_df["ligand_name_ncifile"]!=ligand_df["ligand_name_aff"])|(ligand_df["ligand_name_ncifile"]!=ligand_df["ligand_name_nci"])]
    _s = f"{save_path}Removed.{version}.step_6_Ligands_{len(drop_5)} - Dropped with incoherent LigName.csv"
    drop_5.to_csv(_s)
    print(f"Saved {R}drop_5: ligand with incoherent name{S} to {B}{_s}{S}")

    # PDB codes of the ligands dropped above drive the protein and NCI splits.
    _dropped_codes = drop_5.pdb_code.unique()

    pdb_coherent_df = pdb_df[~pdb_df.pdb_code.isin(_dropped_codes)].reset_index(drop=True)
    _s = f"{save_path}Data.{version}.PDBs.csv"
    pdb_coherent_df.to_csv(_s)
    print(f"Saved {R}pdb_coherent_df{S} to {B}{_s}{S}")



    drop_6 = pdb_df[pdb_df.pdb_code.isin(_dropped_codes)]
    _s = f"{save_path}Removed.{version}.step_6_PDBs_{len(drop_6)} - Dropped with incoherent LigName.csv"
    drop_6.to_csv(_s)
    print(f"Saved {R}drop_6: pdb with incoherent name{S} to {B}{_s}{S}")

    nci_data_coherent = nci_data[~nci_data.PDB_Code.isin(_dropped_codes)].reset_index(drop=True)
    _s = f"{save_path}Data.{version}.NCIs.csv"
    nci_data_coherent.to_csv(_s)
    print(f"Saved {R}nci_data_coherent{S} to {B}{_s}{S}")

    drop_7 = nci_data[nci_data.PDB_Code.isin(_dropped_codes)]
    _s = f"{save_path}Removed.{version}.step_6_NCIs_{len(drop_7)} - Dropped with incoherent LigName.csv"
    drop_7.to_csv(_s)
    print(f"Saved {R}drop_7: NCI with incoherent name{S} to {B}{_s}{S}")

    # Ligand counts are taken from the ligand table itself.
    with open(f"{save_path}Datainfo.txt", "w") as f:
        f.write(f"Data version: {version}\n")
        f.write(f"PDBs: {len(pdb_coherent_df)}, Ligands: {len(ligand_coherent_df)}, NCIs: {len(nci_data_coherent)}")

    print(f"\nProcessed files: ")
    _s = f"{save_path}Data.{version}.Ligands.csv"
    print(f"{R}Ligand{B}:    {B}{_s}{S}   num: {R}{len(ligand_coherent_df)}{S}")
    _s = f"{save_path}Data.{version}.PDBs.csv"
    print(f"{R}Proteins{B}:  {B}{_s}{S}      num: {R}{len(pdb_coherent_df)}{S}")
    _s = f"{save_path}Data.{version}.NCIs.csv"
    print(f"{R}NCI Table{B}: {B}{_s}{S}      num: {R}{len(nci_data_coherent)}{S}")