In [1]:
# Root of the PDBBind files; the cells below read {pdb}/{pdb}_ligand.sdf and {pdb}/{pdb}_ligand.mol2 from it.
# NOTE(review): absolute, machine-specific path.
pdbbind_path = "/home/jovyan/data/pdbbind2020/pdbbind_files"
# Root of the NCI copy; the cells below read {pdb}/{pdb}_ligand.sdf and {pdb}/{pdb}_ligand.pdb from it.
ncipdbbind_path = "/home/jovyan/dataspace/pdb_bind_2020"
# import tankbind util
tankbind_src_folder_path = "../tankbind/"
import sys
# Put the TankBind source folder first on the import path so feature_utils can be imported.
sys.path.insert(0, tankbind_src_folder_path)
from feature_utils import write_renumbered_sdf
from io import StringIO
# Signature reminder: write_renumbered_sdf(toFile, sdf_fileName, mol2_fileName)

In [2]:
def _sanitize_to_smiles(mol):
    """Sanitise ``mol``, strip its hydrogens and build its SMILES.

    Returns:
        ``(mol, problem, sm)``. On success ``problem`` is False and ``sm`` is
        the SMILES string. If any step raises, ``problem`` is True, ``sm`` is
        the exception message and ``mol`` is whatever had been produced up to
        the failing step.
    """
    try:
        Chem.SanitizeMol(mol)
        mol = Chem.RemoveHs(mol)
        return mol, False, Chem.MolToSmiles(mol)
    except Exception as e:
        return mol, True, str(e)


def read_mol_for_mapping(sdf_fileName, mol2_fileName, verbose=False, mol_for_nci=False):
    """Read a ligand with RDKit and return it together with its SMILES.

    The .sdf file is tried first. If it cannot be sanitised / converted and
    ``mol_for_nci`` is False, the .mol2 file is tried as a fallback.

    Args:
        sdf_fileName: path of the ligand .sdf file.
        mol2_fileName: path of the ligand .mol2 file; only read when the .sdf
            attempt failed and ``mol_for_nci`` is False.
        verbose: when True, print whatever was written to stderr while reading.
        mol_for_nci: when True there is no .mol2 file, so the fallback is skipped.

    Returns:
        ``(mol, problem, sm)``: the hydrogen-stripped molecule, a flag that is
        True when no file could be processed, and the SMILES string (or the
        exception message when ``problem`` is True).

    Any exception raised by the file readers themselves propagates to the
    caller; ``sys.stderr`` is restored in that case too.
    """
    Chem.WrapLogs()
    saved_stderr = sys.stderr
    sio = sys.stderr = StringIO()
    try:
        mol, problem, sm = _sanitize_to_smiles(
            Chem.MolFromMolFile(sdf_fileName, sanitize=False)
        )
        if problem and not mol_for_nci:  # when mol_for_nci is True, there's no mol2_file, skipped.
            mol, problem, sm = _sanitize_to_smiles(
                Chem.MolFromMol2File(mol2_fileName, sanitize=False)
            )
        if verbose:
            print(sio.getvalue())
    finally:
        # Always undo the redirection; otherwise one failing read would leave
        # sys.stderr pointing at the StringIO for the rest of the session.
        sys.stderr = saved_stderr
    return mol, problem, sm


def write_renumbered_sdf_for_mapping(toFile, sdf_fileName, mol2_fileName):
    """Write the ligand to ``toFile`` with its atoms renumbered in SMILES output order.

    Args:
        toFile: path of the .sdf file to write.
        sdf_fileName: path of the ligand .sdf file to read.
        mol2_fileName: path of the ligand .mol2 file used as fallback.

    Returns:
        The ``problem`` flag reported by ``read_mol_for_mapping``; it is always
        False on a normal return because a failed read raises instead.

    Raises:
        ValueError: if no molecule could be built from either input file.
    """
    # read_mol_for_mapping returns (mol, problem, smiles); the SMILES is not needed here.
    mol, problem, _ = read_mol_for_mapping(sdf_fileName, mol2_fileName)
    if problem:
        # Without a successful SMILES conversion there is no atom output order to renumber by.
        raise ValueError(
            f"could not build a molecule from {sdf_fileName} or {mol2_fileName}"
        )
    # reorder the mol atom number as in smiles.
    m_order = list(mol.GetPropsAsDict(includePrivate=True, includeComputed=True)['_smilesAtomOutputOrder']) # Canonical Order
    mol = Chem.RenumberAtoms(mol, m_order)
    w = Chem.SDWriter(toFile)
    try:
        w.write(mol)
    finally:
        # Close the writer even if writing fails so the file handle is released.
        w.close()
    return problem

In [3]:
# Debug output for the most recently loaded entry.
# NOTE(review): BMol_Order, WMol_Order, BSM and pdb are only assigned by the
# matching-loop cells further down, so this cell raises NameError on a fresh kernel.
print(BMol_Order)
print(WMol_Order)
print(BSM)
print(pdb)

NameError: name 'BMol_Order' is not defined

In [None]:
# Look up the result stored for entry 8a3h.
# NOTE(review): ligand_name_dict is created in a later cell, so this fails on a fresh kernel.
ligand_name_dict["8a3h"]

NameError: name 'ligand_name_dict' is not defined

In [4]:
import os  # os.listdir is used by the loop below
from selectors import EpollSelector  # NOTE(review): not referenced anywhere in this notebook
from rdkit import Chem
from collections import defaultdict
from Bio.PDB import PDBParser
from tqdm.notebook import tqdm
import numpy as np

# entry -> {index in canonical order: (atom name, NCI atom index)} on success,
# or an "Error_*" label (str) when the two ligand files could not be reconciled.
ligand_name_dict = defaultdict(dict)
parser = PDBParser()

#for pdb in tqdm(['10gs']):
for pdb in tqdm(os.listdir(ncipdbbind_path)):
    # B for pdbbind and W for Wenzhi's NCI database
    B_sdf = f"{pdbbind_path}/{pdb}/{pdb}_ligand.sdf"
    B_mol2 = f"{pdbbind_path}/{pdb}/{pdb}_ligand.mol2"  # In cases where .sdf not work

    W_sdf = f"{ncipdbbind_path}/{pdb}/{pdb}_ligand.sdf"
    W_pdb = f"{ncipdbbind_path}/{pdb}/{pdb}_ligand.pdb"  # Atom names stored

    # Mol, problem_flag, and SMILES
    BMol_sdf_re, BM_problem, BSM = read_mol_for_mapping(B_sdf, B_mol2, mol_for_nci=False)
    WMol_sdf_re, WM_problem, WSM = read_mol_for_mapping(W_sdf, None, mol_for_nci=True)

    if BM_problem: # PDBBind Mol Generation Error
        ligand_name_dict[pdb] = "Error_BM"
        continue
    if WM_problem: # WZ's file Mol Generation Error
        ligand_name_dict[pdb] = "Error_WM"
        continue

    # Both molecules were built: they must have the same number of atoms.
    BMol_NumAtoms = BMol_sdf_re.GetNumAtoms()
    WMol_NumAtoms = WMol_sdf_re.GetNumAtoms()
    if BMol_NumAtoms != WMol_NumAtoms:
        ligand_name_dict[pdb] = "Error_AtomNum"
        continue

    # Atom order in which the SMILES of each molecule was written.
    BMol_Order = list(BMol_sdf_re.GetPropsAsDict(includePrivate=True, includeComputed=True)['_smilesAtomOutputOrder']) # Get Order
    WMol_Order = list(WMol_sdf_re.GetPropsAsDict(includePrivate=True, includeComputed=True)['_smilesAtomOutputOrder']) # Get Order

    # Atom names come from the first residue of the first chain of the first model of the NCI .pdb file.
    structure = parser.get_structure("pdb", W_pdb)[0] # Get Atom Names
    WMol_pdb_atoms = list(list(list(structure)[0])[0].get_atoms())

    # Read AtomicNum and AtomName from Wenzhi's mol by canonical order.
    # NOTE(review): the .pdb atoms are indexed with atom indices of the hydrogen-stripped
    # .sdf molecule; this assumes both files list the atoms in the same order - TODO confirm.
    atomicNums = [WMol_sdf_re.GetAtomWithIdx(order).GetAtomicNum() for order in WMol_Order]
    atomNames = [WMol_pdb_atoms[order].get_name() for order in WMol_Order]

    # Since we will use renumbered mol constructed from PDBBind files, we will construct the ligname dict from them, too.
    for i, order in enumerate(BMol_Order):
        if BMol_sdf_re.GetAtomWithIdx(order).GetAtomicNum() != atomicNums[i]:
            # The two canonical orders disagree on the element at position i.
            ligand_name_dict[pdb] = "Error_AtomicNum"
            break
        ligand_name_dict[pdb][i] = (atomNames[i], WMol_Order[i])

    if ligand_name_dict[pdb] == "Error_AtomicNum":
        continue

    # In the end, check whether the ligand centers are the same.
    BMol_Coors = BMol_sdf_re.GetConformer().GetPositions().mean(axis=0)
    WMol_Coors = WMol_sdf_re.GetConformer().GetPositions().mean(axis=0)
    if np.mean((BMol_Coors-WMol_Coors)**2) > 2:
        #print(pdb)
        ligand_name_dict[pdb] = "Error_LigCenterCoor"

  0%|          | 0/15189 [00:00<?, ?it/s]



In [5]:
# Result for the entry the loop variable `pdb` currently points at:
# index in canonical order -> (atom name, NCI atom index), or an "Error_*" label.
ligand_name_dict[pdb]

{0: ('O6', 10),
 1: ('C6', 5),
 2: ('C5', 4),
 3: ('O5', 9),
 4: ('C1', 0),
 5: ('O1', 19),
 6: ('C4B', 14),
 7: ('C3B', 13),
 8: ('O3B', 18),
 9: ('C2B', 12),
 10: ('O2B', 17),
 11: ('C1B', 11),
 12: ('N2B', 22),
 13: ('C7B', 23),
 14: ('C8B', 24),
 15: ('N1B', 20),
 16: ('C5B', 15),
 17: ('C6B', 16),
 18: ('O6B', 21),
 19: ('C2', 1),
 20: ('O2', 6),
 21: ('C3', 2),
 22: ('O3', 7),
 23: ('C4', 3),
 24: ('O4', 8)}

In [6]:
# Tally how many entries ended up with each "Error_*" label (successful entries hold a dict and are skipped).
label_tally = defaultdict(int)
for _key, _value in ligand_name_dict.items():
    if isinstance(_value, str):
        label_tally[_value] += 1

a = label_tally["Error_BM"]
b = label_tally["Error_WM"]
c = label_tally["Error_AtomicNum"]
d = label_tally["Error_AtomNum"]
e = label_tally["Error_LigCenterCoor"]

print(
    "PDBBind Smiles Error", a,
    "\nWenzhi's Smiles Error", b,
    "\nAtomic Number Non-Coherant", c,
    "\nAtom Number Non-Coherant", d,
    "\nLigand Centre Non-Coherant", e,
)
        

PDBBind Smiles Error 238 
Wenzhi's Smiles Error 124 
Atomic Number Non-Coherant 937 
Atom Number Non-Coherant 1004 
Ligand Centre Non-Coherant 167


In [7]:
# Some entries labelled as errors can still be used: gather every entry
# whose label is "Error_AtomicNum" so the next cell can retry them.
xx_list = [
    entry for entry in ligand_name_dict
    if ligand_name_dict[entry] == "Error_AtomicNum"
]

In [8]:
# entry -> "Error_*" label on failure. Successful entries end up with an empty dict,
# because reading a missing key of a defaultdict (the isinstance check below) inserts it.
newnew_dict = defaultdict(dict)
# entry -> {index in PDBBind canonical order: atom index in the NCI file's original order}
mapping_dict = defaultdict(dict)
# entry -> NCI atom indices matched so far (kept even when the entry fails part-way)
mapping_set_dict = defaultdict(list)
# entries whose atoms were all matched one-to-one
mayok = []

def caldist(a, b):
    """Return the mean of the squared element-wise differences between ``a`` and ``b``."""
    return ((a-b)**2).mean()

for pdb in tqdm(xx_list):

    B_sdf = f"{pdbbind_path}/{pdb}/{pdb}_ligand.sdf"
    B_mol2 = f"{pdbbind_path}/{pdb}/{pdb}_ligand.mol2"

    W_sdf = f"{ncipdbbind_path}/{pdb}/{pdb}_ligand.sdf"
    W_pdb = f"{ncipdbbind_path}/{pdb}/{pdb}_ligand.pdb"

    BMol_sdf, BM_problem, BSM = read_mol_for_mapping(B_sdf, B_mol2, mol_for_nci=False)
    WMol_sdf, WM_problem, WSM = read_mol_for_mapping(W_sdf, None, mol_for_nci=True)

    BMol_Order = list(BMol_sdf.GetPropsAsDict(includePrivate=True, includeComputed=True)['_smilesAtomOutputOrder'])
    WMol_Order = list(WMol_sdf.GetPropsAsDict(includePrivate=True, includeComputed=True)['_smilesAtomOutputOrder'])

    # Length non-coherent
    if len(BMol_Order) != len(WMol_Order):
        newnew_dict[pdb] = "Error_AtomNum"
        continue

    # Atom coordinates of both ligands, one row per atom in each file's own atom order.
    bcoors = BMol_sdf.GetConformer().GetPositions()
    wcoors = WMol_sdf.GetConformer().GetPositions()

    for i, order in enumerate(BMol_Order): # Canonical Order.
        _bcoor = bcoors[order] # the i-th atom in canonical order is the order-th atom in BMol
        minimum = 10000
        cur_j = -1
        # Get the closest atom in Wenzhi's mol for each atom in PDBBind (canonical).
        # We don't use WMol_Order! Because the l_id column in NCI DataFrame is constructed from the original atom order.
        for j, _wcoor in enumerate(wcoors):
            dist = caldist(_bcoor, _wcoor)  # computed once per candidate
            if dist < minimum:
                minimum = dist
                cur_j = j

        # No NCI atom close enough (threshold is on the mean squared coordinate difference).
        # The label is "Error_AtomicNum" here as well, not a separate distance label.
        if minimum > 0.5:
            newnew_dict[pdb] = "Error_AtomicNum"
            break

        # Closest atom found, but it is a different element.
        if BMol_sdf.GetAtomWithIdx(order).GetAtomicNum() != WMol_sdf.GetAtomWithIdx(cur_j).GetAtomicNum():
            newnew_dict[pdb] = "Error_AtomicNum"
            break

        mapping_dict[pdb][i] = cur_j # order: PDBBind canonical; cur_j: NCI original
        mapping_set_dict[pdb].append(cur_j)

    # The loop stopped early: discard the partial mapping of this entry.
    if isinstance(newnew_dict[pdb], str):
        mapping_dict.pop(pdb, None)
        continue

    # if several atoms are mapped to the same atom, or not every atom was mapped, error!
    matched = mapping_set_dict[pdb]
    if len(matched) != len(set(matched)) or len(matched) != BMol_sdf.GetNumAtoms():
        newnew_dict[pdb] = "Error_AtomCoor"
        mapping_dict.pop(pdb, None)
        continue

    mayok.append(pdb)

  0%|          | 0/937 [00:00<?, ?it/s]

In [9]:
# Number of atoms of entry 4u5l that were matched before its matching loop stopped.
len(mapping_set_dict['4u5l'])

1

In [10]:
# Counts every retried entry, not only the failures: reading newnew_dict[pdb] in the
# rescue loop inserts an empty dict for entries that never received an error label.
len(newnew_dict)

937

In [11]:
# Number of retried entries whose atoms were all matched one-to-one by coordinates.
len(mayok)

728

In [25]:
# Persist the list of rescued entry ids.
# NOTE(review): torch is imported in a later cell, so this cell fails on Restart & Run All;
# the target is an absolute, machine-specific path.
torch.save(mayok, "/home/jovyan/dataspace/NFT/main/sdf_from_NCI_list.pt")

In [28]:
# Sanity check: print a marker for every key of LigDict_id that is missing from mayok.
for i in LigDict_id:
    if i not in mayok:
        print("asgd")

In [12]:
# Number of entries that kept a coordinate-based mapping (failed entries are removed from mapping_dict).
len(mapping_dict)

728

In [13]:
# SMILES output atom orders of whichever entry was loaded most recently (PDBBind first, NCI second).
print(BMol_Order)
print(WMol_Order)

[6, 5, 7, 4, 3, 2, 13, 14, 15, 16, 17, 19, 18, 20, 24, 23, 22, 21, 0, 1, 12, 8, 9, 10, 11]
[23, 21, 24, 22, 15, 12, 13, 14, 11, 9, 10, 3, 1, 0, 2, 4, 5, 6, 7, 8, 16, 17, 18, 19, 20]


In [14]:
# Atomic numbers of both ligands, each listed in its molecule's own atom order.
aaa = [atom.GetAtomicNum() for atom in BMol_sdf.GetAtoms()]
bbb = [atom.GetAtomicNum() for atom in WMol_sdf.GetAtoms()]
print(aaa, bbb, sep="\n")

[6, 8, 6, 6, 16, 6, 6, 6, 6, 6, 8, 8, 7, 7, 6, 8, 6, 6, 8, 8, 6, 6, 16, 6, 6]
[8, 6, 8, 6, 6, 6, 6, 16, 6, 6, 8, 7, 6, 6, 8, 6, 7, 6, 6, 8, 8, 6, 16, 6, 6]


In [15]:
# Example mapping for entry 6un3: index in PDBBind canonical order -> NCI original atom index.
# mapping_dict is a defaultdict, so looking up an unknown entry would insert an empty dict.
ccc = mapping_dict['6un3']
print(ccc)

{0: 24, 1: 21, 2: 23, 3: 22, 4: 15, 5: 12, 6: 11, 7: 9, 8: 10, 9: 3, 10: 1, 11: 0, 12: 2, 13: 4, 14: 5, 15: 6, 16: 7, 17: 8, 18: 13, 19: 14, 20: 16, 21: 17, 22: 18, 23: 20, 24: 19}


In [16]:
# SMILES of the PDBBind ligand that was loaded most recently.
print(BSM)

CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](C(=O)O)c3ccsc3)C(=O)N2[C@H]1C(=O)O


In [17]:
# Entries mapped in the first pass: drop everything whose result is an error label (a str).
LigDict_name_id = {
    _key: _value
    for _key, _value in ligand_name_dict.items()
    if not isinstance(_value, str)
}

# Plain-dict copy of the coordinate-based mappings.
LigDict_id = dict(mapping_dict)

In [18]:
import torch

In [19]:
# Create the output folder (and missing parents) for the saved dictionaries;
# does nothing when the folder already exists.
os.makedirs("../../dataspace/NFT/main/ligand_name_dicts", exist_ok=True)

0

In [20]:
# Save both mappings: the coordinate-based one (ligdict_id) and the
# atom-name one from the first pass (ligdict_nameid).
torch.save(LigDict_id, "../../dataspace/NFT/main/ligand_name_dicts/ligdict_id.pt")
torch.save(LigDict_name_id, "../../dataspace/NFT/main/ligand_name_dicts/ligdict_nameid.pt")

In [21]:
# Reload a single entry (4u5l) from both sources for manual inspection.
pdb = "4u5l"

B_sdf = f"{pdbbind_path}/{pdb}/{pdb}_ligand.sdf"
B_mol2 = f"{pdbbind_path}/{pdb}/{pdb}_ligand.mol2"

W_sdf = f"{ncipdbbind_path}/{pdb}/{pdb}_ligand.sdf"
W_pdb = f"{ncipdbbind_path}/{pdb}/{pdb}_ligand.pdb"

BMol_sdf, BM_problem, BSM = read_mol_for_mapping(B_sdf, B_mol2, mol_for_nci=False)
WMol_sdf, WM_problem, WSM = read_mol_for_mapping(W_sdf, None, mol_for_nci=True)

# Atom order in which the SMILES of each molecule was written.
BMol_Order = list(BMol_sdf.GetPropsAsDict(includePrivate=True, includeComputed=True)['_smilesAtomOutputOrder'])
WMol_Order = list(WMol_sdf.GetPropsAsDict(includePrivate=True, includeComputed=True)['_smilesAtomOutputOrder'])

NameError: name 'LigDict_nameid' is not defined

In [None]:
# SMILES built from the PDBBind ligand file of the entry loaded most recently.
BSM

'NC(=O)[C@H](CCCC[NH3+])[NH2+]Cc1ccc(-c2cccnc2)cc1'

In [None]:
# SMILES built from the NCI ligand file of the entry loaded most recently.
WSM

'NCCCC[C@H](NCc1ccc(-c2cccnc2)cc1)C(N)=O'

In [None]:
# Atom count of the PDBBind molecule (hydrogens were removed when it was read).
BMol_sdf.GetNumAtoms()

23

In [None]:
# Atom count of the NCI molecule (hydrogens were removed when it was read).
WMol_sdf.GetNumAtoms()

23

In [None]:
# TODO: unfinished cell - the assignment has no right-hand side and is a SyntaxError if run.
nci_df = 

In [None]:
import os

# Count the entries in the nci_protein_ligand_matrix folder.
# NOTE(review): absolute, machine-specific path.
len(os.listdir("/home/jovyan/dataspace/NFT/main/nci_protein_ligand_matrix"))

1406