In [12]:
import os
import numpy as np
from rdkit import Chem
from rdkit.Chem.rdchem import BondType
from rdkit.Chem.rdmolfiles import MolFromMol2File
from rdkit.Chem.rdmolops import GetAdjacencyMatrix, RemoveHs

In [2]:
# Define the atom types we are interested in for the protein
atomDict = {'C': np.int8(1), 'O': np.int8(2), 'N': np.int8(3), 'S': np.int8(4), 'P': np.int8(5), 'H': np.int8(6), 'X': np.int8(7)}
rAtomDict = {v: k for (k,v) in atomDict.items()}
rAtomDict[np.int8(0)] = None

# Define atom types and bond orders we are interested in for the ligand
# We encode aromatics as a separate bond type (1.5: np.int8(4))
bondType = {0.0: np.int8(0), 1.0: np.int8(1), 2.0: np.int8(2), 3.0: np.int8(3), 1.5: np.int8(4)}
bondMap = np.vectorize(lambda x: bondType[x])
# The None entry in ligAtom is necessary to ensure correct one-hot encoding
ligAtom = {None: 0, 'C': 1, 'N': 2, 'O': 3, 'F': 4, 'S': 5, 'X': 6}
ligAtomMap = np.vectorize(lambda x: ligAtom[x.GetSymbol()] if x.GetSymbol() in ligAtom else 6)

In [21]:
pl_dir = 'data/pdbbind/tiny-set'
atom_set = set([])
for  PDBID in os.listdir(pl_dir):
    # Load the protein by parsing the .pdb file
    with open(f'data/pdbbind/tiny-set/{PDBID}/{PDBID}_protein.pdb') as f:
        data = [(line[-1], [float(i) for i in line.split()[6:9]]) \
                        for line in f.read().split('\n') if line[:4]=='ATOM' and line[-1]!='H']
        print(PDBID, set([i[0] for i in data]))
        # for i in data:
        #     atom_set.add(i[0])

1a1e {'O', 'S', 'C', 'N'}
1a4r {'O', 'S', 'C', 'N'}
1ai4 {'O', 'S', 'C', 'N'}
1ajp {'O', 'S', 'C', 'N'}
1aid {'O', 'S', 'C', 'N'}
1ajx {'O', 'S', 'C', 'N'}
1ajq {'O', 'S', 'C', 'N'}
1ajv {'O', 'S', 'C', 'N'}
1ai5 {'O', 'S', 'C', 'N'}
1add {'O', 'S', 'C', 'N'}
1amw {'O', 'S', 'C', 'N'}
1amk {'O', 'S', 'C', 'N'}
1adl {'O', 'S', 'C', 'N'}
1a99 {'O', 'S', 'C', 'N'}
1a30 {'O', 'S', 'C', 'N'}
1a4w {'O', 'S', 'C', 'N'}
1atl {'O', 'S', 'C', 'N'}
1a4k {'4', '0', '5', '8', '7', '1', '6', '3', '2', '9'}
1a28 {'O', 'S', 'C', 'N'}
1ai7 {'O', 'S', 'C', 'N'}
1afk {'O', 'S', 'C', 'N'}
1afl {'O', 'S', 'C', 'N'}
1ajn {'O', 'S', 'C', 'N'}
1alw {'O', 'S', 'C', 'N'}
1aj7 {'O', 'S', 'C', 'N'}
1aaq {'O', 'S', 'C', 'N'}
1ado {'O', 'S', 'C', 'N'}
1apv {'O', 'S', 'C', 'N'}
1a9m {'O', 'S', 'C', 'N'}
1a94 {'O', 'S', 'C', 'N'}
1a9q {'O', 'S', 'C', 'N'}
1a69 {'O', 'S', 'C', 'N'}


In [18]:
atom_set

{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'C', 'N', 'O', 'S'}

In [11]:
at

{'C', 'N', 'O', 'S'}