In [22]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from IPython.core.display import display
from copy import deepcopy
from rdkit.Chem import AllChem

## Reading protein data

In [2]:
data = pd.read_csv("pdbbind_core_df.csv")

In [3]:
data.head(4)

Unnamed: 0,pdb_id,smiles,complex_id,protein_pdb,ligand_pdb,ligand_mol2,label
0,2d3u,CC1CCCCC1S(O)(O)NC1CC(C2CCC(CN)CC2)SC1C(O)O,2d3uCC1CCCCC1S(O)(O)NC1CC(C2CCC(CN)CC2)SC1C(O)O,"['HEADER 2D3U PROTEIN\n', 'COMPND 2D3U P...","['COMPND 2d3u ligand \n', 'AUTHOR GENERA...","['### \n', '### Created by X-TOOL on Thu Aug 2...",6.92
1,3cyx,CC(C)(C)NC(O)C1CC2CCCCC2C[NH+]1CC(O)C(CC1CCCCC...,3cyxCC(C)(C)NC(O)C1CC2CCCCC2C[NH+]1CC(O)C(CC1C...,"['HEADER 3CYX PROTEIN\n', 'COMPND 3CYX P...","['COMPND 3cyx ligand \n', 'AUTHOR GENERA...","['### \n', '### Created by X-TOOL on Thu Aug 2...",8.0
2,3uo4,OC(O)C1CCC(NC2NCCC(NC3CCCCC3C3CCCCC3)N2)CC1,3uo4OC(O)C1CCC(NC2NCCC(NC3CCCCC3C3CCCCC3)N2)CC1,"['HEADER 3UO4 PROTEIN\n', 'COMPND 3UO4 P...","['COMPND 3uo4 ligand \n', 'AUTHOR GENERA...","['### \n', '### Created by X-TOOL on Fri Aug 2...",6.52
3,1p1q,CC1ONC(O)C1CC([NH3+])C(O)O,1p1qCC1ONC(O)C1CC([NH3+])C(O)O,"['HEADER 1P1Q PROTEIN\n', 'COMPND 1P1Q P...","['COMPND 1p1q ligand \n', 'AUTHOR GENERA...","['### \n', '### Created by X-TOOL on Thu Aug 2...",4.89


## Getting unique atoms and number of unique atoms in the dataframe

In [4]:
def GetAtomData(smiles_data):
    atoms = set()
    for mol in smiles_data:
        mol_atoms = Chem.MolFromSmiles(mol).GetAtoms()
        for atom in mol_atoms:
            atoms.add(atom.GetSymbol())
    return list(atoms), len(atoms)

In [5]:
smiles = list(data['smiles'].values)
smiles[:2]

['CC1CCCCC1S(O)(O)NC1CC(C2CCC(CN)CC2)SC1C(O)O',
 'CC(C)(C)NC(O)C1CC2CCCCC2C[NH+]1CC(O)C(CC1CCCCC1)NC(O)C(CC(N)O)NC(O)C1CCC2CCCCC2N1']

In [6]:
atoms, vector_size = GetAtomData(smiles)

In [7]:
print "Atoms: ", atoms
print "Vector size: ", vector_size

Atoms:  ['C', 'Cl', 'I', 'F', 'O', 'N', 'P', 'S', 'Br']
Vector size:  9


## Atoms are assigned to a unique integer number

* _Assigned integer values of atoms will be used to perform one-hot-encoding._

In [8]:
def atomDicts(atoms):
    atom2label = {}
    label2atom = {}
    for i, atom in enumerate(atoms):
        atom2label[atom] = i
        label2atom[i] = atom    
    return atom2label, label2atom

In [9]:
atom2label, label2atom = atomDicts(atoms)
print atom2label
print label2atom

{'C': 0, 'F': 3, 'I': 2, 'Cl': 1, 'O': 4, 'N': 5, 'P': 6, 'S': 7, 'Br': 8}
{0: 'C', 1: 'Cl', 2: 'I', 3: 'F', 4: 'O', 5: 'N', 6: 'P', 7: 'S', 8: 'Br'}


## Creating (target, context) pairs

In [10]:
def targetContextPairs(mols):
    target_context_pairs = []
    for mol in mols:
        mol = Chem.MolFromSmiles(mol)
        for atom_idx in range(mol.GetNumAtoms()):
            target = mol.GetAtomWithIdx(atom_idx).GetSymbol() # taking center atom as "target"
            atom = mol.GetAtomWithIdx(atom_idx)
            neighbors = atom.GetNeighbors()
            for neighbor in neighbors:
                context = neighbor.GetSymbol() # taking neighbor atoms as context
                pair = (target, context)
                target_context_pairs.append(pair)

                #bond_type = mol.GetBondBetweenAtoms(atom_idx, neighbor.GetIdx()).GetBondType()
    return target_context_pairs       

In [11]:
target_context_pairs = targetContextPairs(smiles)

In [12]:
target_context_pairs[:20]

[('C', 'C'),
 ('C', 'C'),
 ('C', 'C'),
 ('C', 'C'),
 ('C', 'C'),
 ('C', 'C'),
 ('C', 'C'),
 ('C', 'C'),
 ('C', 'C'),
 ('C', 'C'),
 ('C', 'C'),
 ('C', 'C'),
 ('C', 'C'),
 ('C', 'S'),
 ('C', 'C'),
 ('S', 'C'),
 ('S', 'O'),
 ('S', 'O'),
 ('S', 'N'),
 ('O', 'S')]

## One-hot-encoding

The following function will prepare input (x) and output (Y) data. We will use these arrays to train our machine learning model.
* x will contain target atoms' one-hot-encoded represantation.
* Y will contain one-hot-encoded represantation of context atoms.

In [13]:
def createInputOutputData(target_context_pairs, vector_size, atom2label):
    x,Y = [],[] # input and output vectors
    for pair in target_context_pairs:
        # initialize x_ and y_ vectors for x and y instances
        x_ = [0]*vector_size # instance x
        y_ = [0]*vector_size # instance y
        
        x_label, y_label = atom2label[pair[0]], atom2label[pair[1]]
        # put 1 to the index where the atom's label equals to
        x_[x_label] = 1
        y_[y_label] = 1
        
        x.append(x_)
        Y.append(y_)
    return x, Y

In [31]:
x,Y = createInputOutputData(target_context_pairs, vector_size, atom2label)

In [32]:
train_x, train_Y = np.array(x), np.array(Y)

In [33]:
print train_x.shape
print train_Y.shape

(10286, 9)
(10286, 9)


## Using a MLP with 2 hidden layers to Vectorize Atoms

In [21]:
import tensorflow as tf

In [59]:
# Parameters
learning_rate = 0.001
training_epochs = 15
batch_size = 50
display_step = 1

# Network Parameters
n_hidden_1 = 81 # 1st layer number of neurons
n_hidden_2 = 27 # 2nd layer number of neurons
n_input = 9 # An atom is represented with 9 digits
n_classes = 9 # An atom is represented with 9 digits

In [60]:
# tf Graph input
X = tf.placeholder("float", [None, n_input])
Y = tf.placeholder("float", [None, n_classes])

# Store layers weight & bias
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

In [61]:
# Create model
def multilayer_perceptron(x):
    # Hidden fully connected layer with 256 neurons
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    # Hidden fully connected layer with 256 neurons
    layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
    # Output fully connected layer with a neuron for each class
    out_layer = tf.matmul(layer_2, weights['out']) + biases['out']
    return out_layer

In [62]:
# Construct model
logits = multilayer_perceptron(X)

# Define loss and optimizer
loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
    logits=logits, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)

In [63]:
# Initializing the variables
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    batch_start = 0

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        batch_end = batch_start + batch_size
        total_batch = int(train_x.shape[0]/batch_size)
        # Loop over all batches
        for i in range(total_batch):
            batch_x, batch_y = np.array(train_x[batch_start:batch_end]), np.array(train_Y[batch_start:batch_end])
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([train_op, loss_op], feed_dict={X: batch_x,
                                                            Y: batch_y})
            # Compute average loss
            avg_cost += c / total_batch
        batch_start = batch_end
        # Display logs per epoch step
        if epoch % display_step == 0:
            print "Epoch:", '%04d' % (epoch+1), "cost={:.9f}".format(avg_cost)
    print("Optimization Finished!")

    # Test model
    pred = tf.nn.softmax(logits)  # Apply softmax to logits
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(Y, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Training set Accuracy:", accuracy.eval({X: train_x, Y: train_Y}))

Epoch: 0001 cost=5.502621198
Epoch: 0002 cost=1.151240101
Epoch: 0003 cost=0.547088783
Epoch: 0004 cost=0.575510609
Epoch: 0005 cost=0.463730979
Epoch: 0006 cost=0.655795952
Epoch: 0007 cost=0.576615155
Epoch: 0008 cost=0.691499697
Epoch: 0009 cost=1.322073454
Epoch: 0010 cost=0.683685144
Epoch: 0011 cost=1.786308809
Epoch: 0012 cost=0.855141848
Epoch: 0013 cost=0.531419835
Epoch: 0014 cost=0.563166715
Epoch: 0015 cost=0.515693263
Optimization Finished!
('Training set Accuracy:', 0.7703675)


# Vectors of atoms

First hidden layer's weights will give us atoms' vectors. Since we created the first hiddenlayer with 81 neurons, atoms' vector size will be equal to 81.

In [64]:
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
v = sess.run(weights['h1']) 

In [65]:
v.shape

(9, 81)

In [67]:
for i in range(9):
    print label2atom[i], " : ",
    print v[i]

C  :  [-0.30018643 -0.17919602  1.8915352  -0.02534358 -0.30059046 -1.0837185
 -2.3945599  -0.29558158 -0.39843354  0.08644951 -1.3563019   0.7131067
 -0.56933266  0.8171031  -0.5803338   0.22451052  0.44380444 -0.46835646
  0.97925353  0.40043256 -0.9409464  -0.36971062  0.06300923  0.55001384
 -0.5860386  -0.74259055  0.9715004   0.40199783  0.9863616   0.43245214
 -2.3986623   0.17079322 -0.0055041  -0.9013104   1.217958   -1.4547238
  1.7389457  -0.9314999  -0.14577913 -0.5361576  -0.7907156   0.07985448
  1.1786083  -0.2847233   0.7530563   0.8074845   0.04999667  0.6912234
 -0.9677285   2.7346764  -2.9489512  -1.0514557   0.56491184  0.24221055
  0.7621061  -0.6038643  -1.0055978   0.08590884 -0.90245456  1.6845027
 -0.01998931 -0.30453536  1.2562642   1.2689692   1.844025    0.85132533
 -0.04528829 -2.2920086  -0.6998998   0.0959399  -0.4436187   0.43356785
  1.3437775  -0.7333994   0.12874192  0.8986841  -0.37116918  2.0463784
 -2.8087692  -1.7718322  -1.3715562 ]
Cl  :  [-0.29

# Visualization

In [68]:
from sklearn.decomposition import PCA

In [69]:
pca = PCA(n_components=3)
v_3d = pca.fit_transform(v)

In [70]:
v_3d

array([[ 0.441528  ,  8.556913  ,  2.001026  ],
       [-3.331392  , -2.8735244 ,  8.277223  ],
       [ 8.087277  , -2.7843344 , -0.21877557],
       [-0.18156533,  3.791488  , -0.7471194 ],
       [-5.849486  , -2.9271717 , -1.3686564 ],
       [-1.3968394 , -0.62842566, -1.8197258 ],
       [-2.1790166 , -1.4029634 , -5.2868557 ],
       [-0.4007664 ,  0.55093586, -1.9734367 ],
       [ 4.810259  , -2.2829173 ,  1.1363195 ]], dtype=float32)

In [75]:
%matplotlib inline
import matplotlib.pyplot as plt

ImportError: cannot import name cbook

In [72]:
import matplotlib.pyplot as plt


x1 = np.linspace(0.0, 5.0)
x2 = np.linspace(0.0, 2.0)

y1 = np.cos(2 * np.pi * x1) * np.exp(-x1)
y2 = np.cos(2 * np.pi * x2)

plt.subplot(2, 1, 1)
plt.plot(x1, y1, 'o-')
plt.title('A tale of 2 subplots')
plt.ylabel('Damped oscillation')

plt.subplot(2, 1, 2)
plt.plot(x2, y2, '.-')
plt.xlabel('time (s)')
plt.ylabel('Undamped')

plt.show()

ImportError: cannot import name cbook

## Protein Classification

In [52]:
# kendi oluşturduğumuz atom vektörleriyle proteinler represent edilecek
# proteinler sınıflandırılacak, accuracy ölçülecek
# toxicity, solubility dataları kullanılabilir

## Vectorize Substructures

In [51]:
# Atomlara yaptığımız gibi substructure'ları vektörleştirebiliriz