In [1]:
import numpy as np
from rdkit import Chem
from rdkit.Chem.rdchem import BondType
from rdkit.Chem.rdmolfiles import MolFromMol2File
from rdkit.Chem.rdmolops import GetAdjacencyMatrix, RemoveHs

In [3]:
# Integer codes for the protein atom types we are interested in.
# Symbols are numbered 1..7 in the order C, O, N, S, P, H, X and stored as np.int8.
# ('X' is presumably the catch-all for any other element - confirm against the caller.)
atomDict = {symbol: np.int8(code) for code, symbol in enumerate('CONSPHX', start=1)}
# Reverse lookup (code -> symbol); code 0 is registered last and maps to None.
rAtomDict = dict(zip(atomDict.values(), atomDict.keys()))
rAtomDict[np.int8(0)] = None

# Define atom types and bond orders we are interested in for the ligand
# We encode aromatics as a separate bond type (1.5: np.int8(4))
bondType = {0.0: np.int8(0), 1.0: np.int8(1), 2.0: np.int8(2), 3.0: np.int8(3), 1.5: np.int8(4)}
# Element-wise bond-order -> code lookup; a bond order missing from bondType raises KeyError.
# otypes is given explicitly so that size-0 inputs return an empty int8 array
# instead of raising ValueError (np.vectorize cannot infer the dtype without an element).
bondMap = np.vectorize(lambda x: bondType[x], otypes=[np.int8])
# The None entry in ligAtom is necessary to ensure correct one-hot encoding
ligAtom = {None: 0, 'C': 1, 'N': 2, 'O': 3, 'F': 4, 'S': 5, 'X': 6}
# Element-wise atom -> code lookup for objects exposing GetSymbol(); any symbol that is
# not a key of ligAtom falls back to the 'X' code. GetSymbol() is called once per atom.
# otypes=[int] matches the dtype numpy infers from the Python ints stored in ligAtom,
# and lets an empty atom list map to an empty array.
ligAtomMap = np.vectorize(lambda x: ligAtom.get(x.GetSymbol(), ligAtom['X']), otypes=[int])

In [114]:
PDBID = '1a1e'
# Load the protein by parsing the .pdb file.
# Result: `data` is a list of (element, [x, y, z]) tuples, one per non-hydrogen ATOM record.
# PDB is a fixed-column format, so fields are sliced by column rather than split on
# whitespace: adjacent fields may touch (e.g. chain ID + 4-digit residue number, or
# coordinates <= -100.000), which shifts whitespace-split indices and yields wrong values.
data = []
with open(f'data/pdbbind/refined-set/{PDBID}/{PDBID}_protein.pdb') as f:
    for line in f:
        if line[:4] != 'ATOM':
            continue
        # Element symbol sits in columns 77-78 (right-justified). Records that omit
        # those columns fall back to the last non-blank character of the line.
        element = line[76:78].strip() or line.rstrip()[-1:]
        if element == 'H':
            continue  # skip hydrogens
        # Orthogonal coordinates: x = cols 31-38, y = cols 39-46, z = cols 47-54.
        coords = [float(line[30:38]), float(line[38:46]), float(line[46:54])]
        data.append((element, coords))

In [66]:
# NOTE(review): `atoms` is not defined in any cell visible here, so on a fresh kernel
# this raises NameError unless another cell creates it first - TODO confirm where it
# is defined. This looks like a leftover scratch cell: it adds the string 'fd' to `atoms`.
atoms.add('fd')

In [67]:
# Show the current contents of `atoms` as the cell output.
atoms

{'fd'}