In [71]:
import numpy as np
import csv
import glob
import htmd.ui as ht
import htmd.molecule.voxeldescriptors as vd
import htmd.molecule.vmdparser as vp
from tqdm import *
import pickle
import bcolz as bc

In [2]:
import warnings
# Silence every warning for the rest of the kernel session.
# NOTE(review): this blanket filter hides all warning categories, including
# deprecation and numerical ones; consider narrowing it to the specific
# warnings that are known to be noise.
warnings.filterwarnings('ignore')

In [3]:
# Read the data file and get all the pdb ids
def read_score(path='pdbbind_refined_set.csv'):
    """Load PDB ids and their scores from a CSV index file.

    The first row is treated as a header and skipped. For every remaining
    non-empty row, column 1 is taken as the PDB id and column 5 is parsed
    as a float score.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to 'pdbbind_refined_set.csv'
        relative to the working directory, which is the value that used to
        be hard-coded here.

    Returns
    -------
    pdb_ids : list of str
        Ids in file order, one entry per data row.
    scores : dict
        Maps each id to its float score. If an id occurs more than once the
        last row wins.

    Raises
    ------
    IndexError
        If a non-empty row has fewer than six columns.
    ValueError
        If column 5 cannot be parsed as a float.
    """
    pdb_ids = []
    scores = {}
    # newline='' is what the csv module asks for so that it can handle line
    # endings (and quoted embedded newlines) itself.
    with open(path, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip the header
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing
                # newline at end of file); there is nothing to index there.
                continue
            pdb_ids.append(row[1])
            scores[row[1]] = float(row[5])

    return pdb_ids, scores

In [60]:
def _findDonors(mol, bonds):
    donors = np.zeros(mol.numAtoms, dtype=bool)
    hydrogens = np.where((mol.element == 'HD') | (mol.element == 'HS'))[0]
    for h in hydrogens:
        partners = bonds[bonds[:, 0] == h, 1]
        partners = np.hstack((partners, bonds[bonds[:, 1] == h, 0]))
        for p in partners:
            if mol.name[p][0] == 'N' or mol.name[p][0] == 'O':
                donors[p] = True
    return donors

In [5]:
def get_channels(mol):
    """Build the per-atom boolean property matrix for ``mol``.

    Each column is a mask over atoms, derived from the upper-cased entries
    of ``mol.element``, the sign of ``mol.charge`` and ``_findDonors``.
    Column order: hydrophobic, aromatic, hbond_acceptor, hbond_donor,
    positive_ionizable, negative_ionizable, metal, occupancies.

    Returns
    -------
    ndarray of bool, shape (n_atoms, 8)
    """
    atom_types = np.array([symbol.upper() for symbol in mol.element])

    def _is_any(*labels):
        """Mask of atoms whose upper-cased element equals one of ``labels``."""
        mask = atom_types == labels[0]
        for label in labels[1:]:
            mask = mask | (atom_types == label)
        return mask

    # One entry per output column, in column order.
    columns = (
        _is_any('C', 'A'),                                   # hydrophobic
        atom_types == 'A',                                   # aromatic
        _is_any('NA', 'NS', 'OA', 'OS', 'SA'),               # hbond_acceptor
        _findDonors(mol, mol._getBonds()),                   # hbond_donor
        mol.charge > 0,                                      # positive_ionizable
        mol.charge < 0,                                      # negative_ionizable
        _is_any('MG', 'ZN', 'MN', 'CA', 'FE'),               # metal
        # occupancies: everything except the hydrogen types
        (atom_types != 'H') & (atom_types != 'HS') & (atom_types != 'HD'),
    )

    channels = np.zeros((len(atom_types), len(columns)), dtype=bool)
    for col, values in enumerate(columns):
        channels[:, col] = values
    return channels

In [6]:
def get_all_channels(mol):
    """Build the 16-column protein/ligand property matrix for ``mol``.

    The eight per-atom properties computed by ``get_channels`` are emitted
    twice: first restricted to atoms in the 'protein' selection (p_*), then
    restricted to atoms in the 'not protein' selection (l_*).

    Column order:
        p_hydrophobic, p_aromatic, p_hbond_acceptor, p_hbond_donor,
        p_positive_ionizable, p_negative_ionizable, p_metal, p_occupancies,
        l_hydrophobic, l_aromatic, l_hbond_acceptor, l_hbond_donor,
        l_positive_ionizable, l_negative_ionizable, l_metal, l_occupancies

    Parameters
    ----------
    mol : object
        Molecule accepted by ``get_channels`` that also provides
        ``atomselect(selection)`` returning one boolean per atom.

    Returns
    -------
    ndarray of bool, shape (n_atoms, 16)
    """
    # The property definitions are identical for both halves, so compute them
    # once (this also runs the donor search a single time instead of twice).
    base = get_channels(mol)

    # Evaluate each selection once rather than once per property.
    protein = np.asarray(mol.atomselect('protein'), dtype=bool)
    ligand = np.asarray(mol.atomselect('not protein'), dtype=bool)

    # Broadcasting the (n_atoms, 1) masks against the (n_atoms, 8) property
    # matrix is the vectorised form of "property and selection" per atom.
    return np.hstack((base & protein[:, None], base & ligand[:, None]))

In [61]:
def _reshape(x):
    new_shape = [50, 50, 50, 16]
    dim_diff = np.array([i-j for i, j in zip(new_shape, x.shape)])
    pad_dim = np.round(dim_diff / 2).astype(int)
    x = np.pad(x, [(pad_dim[0], dim_diff[0]-pad_dim[0]),
                   (pad_dim[1], dim_diff[1]-pad_dim[1]),
                   (pad_dim[2], dim_diff[2]-pad_dim[2]),
                   (0, 0)],
               'constant')
    return x #[13:37, 13:37, 13:37, :]

In [8]:
# Read the PDB ids and the experimental value recorded for each of them
pdb_ids, scores = read_score()
# Display both sizes: the list has one entry per CSV row while the dict has one
# per distinct id, so the two numbers differ only if an id is repeated.
len(pdb_ids), len(scores)

(4154, 4154)

In [75]:
# Voxelise up to `limit` pockets into data_x and store their scores in data_y.
limit = 2000
# NOTE: with the default float64 dtype this buffer takes
# limit * 50**3 * 16 * 8 bytes, i.e. about 32 GB for limit = 2000.
data_x = np.zeros([limit, 50, 50, 50, 16])
data_y = np.zeros(limit)
# Get the pocket files
data_dir = "../../pdbbind_data/"
# Sorted so that the complexes filling the first `limit` slots do not depend on
# directory listing order. The pattern contains no "**", so a plain
# (non-recursive) glob matches exactly the same files.
files = sorted(glob.glob(data_dir + "*/*/*_pocket.pdb"))
count = 0
failed = []  # (file, exception) for every pocket that could not be voxelised

pbar = tqdm_notebook(total = limit)
try:
    for file in files:
        mol = ht.Molecule(file)
        _id = mol.viewname[:4]
        # `scores` is keyed by the same ids that `pdb_ids` lists; the dict
        # lookup avoids scanning the whole list for every file.
        if _id not in scores:
            continue
        c = np.mean(mol.coords, axis=0)
        mol.moveBy(-c)
        # Renumber residues
        mol.renumberResidues()
        try:
            # Get the voxeldescriptors
            f, centers, natoms = vd.getVoxelDescriptors(mol, channels=get_all_channels(mol))#, method='CUDA')
            f = f.reshape(natoms[0], natoms[1], natoms[2], -1)
            data_x[count] = _reshape(f)
            data_y[count] = scores[_id]
        except Exception as err:
            # Still best effort (one bad pocket must not abort the run), but
            # keep the reason instead of discarding it, and let
            # KeyboardInterrupt / SystemExit propagate.
            failed.append((file, err))
            continue

        count = count + 1
        pbar.update()
        if count == limit:
            break
finally:
    pbar.close()

if count < limit:
    # Fewer pockets succeeded than slots were allocated: drop the untouched
    # trailing rows so all-zero grids with a score of 0 are not mistaken for
    # real samples.
    data_x = data_x[:count]
    data_y = data_y[:count]

# Number of samples stored and number of pockets skipped because of an error
count, len(failed)

In [63]:
# Display the shapes of the feature buffer (samples, three grid axes, channels)
# and of the score vector (samples,)
data_x.shape, data_y.shape

((2000, 50, 50, 50, 16), (2000,))

In [64]:
# Collapse the sample and grid axes so that every row is one voxel and every
# column is one of the 16 channels; used below for per-channel statistics.
feat = data_x.reshape((-1, 16))
feat.shape

(250000000, 16)

In [65]:
# Per-channel maximum over every voxel of every sample.
# NOTE(review): in the output recorded below, 11 of the 16 channels have a
# maximum of 0, i.e. they are empty for every sample. Presumably the pocket
# files carry plain element symbols rather than the typed labels
# (A, NA, OA, HD, ...) that get_all_channels tests for - confirm before
# training on this data.
np.amax(feat, axis=0)

array([1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1.])

In [66]:
# Per-channel minimum over every voxel of every sample
np.amin(feat, axis=0)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [67]:
# Per-channel variance over every voxel of every sample; a variance of exactly
# 0 means the channel holds the same value everywhere.
np.var(feat, axis=0)

array([2.61169047e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.23940693e-02,
       0.00000000e+00, 0.00000000e+00, 8.21072448e-07, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 2.40449957e-05, 2.14429839e-03])

In [68]:
# Variance of the target scores across the collected samples
np.var(data_y)

4.053929466889376

In [69]:
# Serialise the feature grids and the scores, one pickle file each, using the
# newest pickle protocol available.
for target, payload in (('data_x_2000.pickle', data_x),
                        ('data_y_2000.pickle', data_y)):
    with open(target, 'wb') as f:
        pickle.dump(payload, f, pickle.HIGHEST_PROTOCOL)

In [73]:
# Create a bcolz carray from the feature grids.
# Passing rootdir presumably makes the carray disk-backed, stored under the
# 'bc_x_2000' directory - confirm against the bcolz documentation.
bc_x = bc.carray(data_x, rootdir='bc_x_2000')

In [74]:
# Write any data the carray still holds in memory out to disk
bc_x.flush()