In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from IPython.core.display import display
from copy import deepcopy
from rdkit.Chem import AllChem

## Reading protein data

In [2]:
data = pd.read_csv("pdbbind_core_df.csv")

In [3]:
data.head(4)

Unnamed: 0,pdb_id,smiles,complex_id,protein_pdb,ligand_pdb,ligand_mol2,label
0,2d3u,CC1CCCCC1S(O)(O)NC1CC(C2CCC(CN)CC2)SC1C(O)O,2d3uCC1CCCCC1S(O)(O)NC1CC(C2CCC(CN)CC2)SC1C(O)O,"['HEADER 2D3U PROTEIN\n', 'COMPND 2D3U P...","['COMPND 2d3u ligand \n', 'AUTHOR GENERA...","['### \n', '### Created by X-TOOL on Thu Aug 2...",6.92
1,3cyx,CC(C)(C)NC(O)C1CC2CCCCC2C[NH+]1CC(O)C(CC1CCCCC...,3cyxCC(C)(C)NC(O)C1CC2CCCCC2C[NH+]1CC(O)C(CC1C...,"['HEADER 3CYX PROTEIN\n', 'COMPND 3CYX P...","['COMPND 3cyx ligand \n', 'AUTHOR GENERA...","['### \n', '### Created by X-TOOL on Thu Aug 2...",8.0
2,3uo4,OC(O)C1CCC(NC2NCCC(NC3CCCCC3C3CCCCC3)N2)CC1,3uo4OC(O)C1CCC(NC2NCCC(NC3CCCCC3C3CCCCC3)N2)CC1,"['HEADER 3UO4 PROTEIN\n', 'COMPND 3UO4 P...","['COMPND 3uo4 ligand \n', 'AUTHOR GENERA...","['### \n', '### Created by X-TOOL on Fri Aug 2...",6.52
3,1p1q,CC1ONC(O)C1CC([NH3+])C(O)O,1p1qCC1ONC(O)C1CC([NH3+])C(O)O,"['HEADER 1P1Q PROTEIN\n', 'COMPND 1P1Q P...","['COMPND 1p1q ligand \n', 'AUTHOR GENERA...","['### \n', '### Created by X-TOOL on Thu Aug 2...",4.89


## Getting unique atoms and number of unique atoms in the dataframe

In [4]:
def GetAtomData(smiles_data):
    atoms = set()
    for mol in smiles_data:
        mol_atoms = Chem.MolFromSmiles(mol).GetAtoms()
        for atom in mol_atoms:
            atoms.add(atom.GetSymbol())
    return list(atoms), len(atoms)

In [20]:
smiles = list(data['smiles'].values)
smiles[:2]

['CC1CCCCC1S(O)(O)NC1CC(C2CCC(CN)CC2)SC1C(O)O',
 'CC(C)(C)NC(O)C1CC2CCCCC2C[NH+]1CC(O)C(CC1CCCCC1)NC(O)C(CC(N)O)NC(O)C1CCC2CCCCC2N1']

In [6]:
atoms, vector_size = GetAtomData(smiles)

In [7]:
print "Atoms: ", atoms
print "Vector size: ", vector_size

Atoms:  ['C', 'Cl', 'I', 'F', 'O', 'N', 'P', 'S', 'Br']
Vector size:  9


## Atoms are assigned to a unique integer number

* _Assigned integer values of atoms will be used to perform one-hot-encoding._

In [36]:
def atomDicts(atoms):
    atom2label = {}
    label2atom = {}
    for i, atom in enumerate(atoms):
        atom2label[atom] = i
        label2atom[i] = atom    
    return atom2label, label2atom

In [37]:
atom2label, label2atom = atomDicts(atoms)
print atom2label
print label2atom

{'C': 0, 'F': 3, 'I': 2, 'Cl': 1, 'O': 4, 'N': 5, 'P': 6, 'S': 7, 'Br': 8}
{0: 'C', 1: 'Cl', 2: 'I', 3: 'F', 4: 'O', 5: 'N', 6: 'P', 7: 'S', 8: 'Br'}


## Creating (target, context) pairs

In [33]:
def targetContextPairs(mols):
    target_context_pairs = []
    for mol in mols:
        mol = Chem.MolFromSmiles(mol)
        for atom_idx in range(mol.GetNumAtoms()):
            target = mol.GetAtomWithIdx(atom_idx).GetSymbol() # taking center atom as "target"
            atom = mol.GetAtomWithIdx(atom_idx)
            neighbors = atom.GetNeighbors()
            for neighbor in neighbors:
                context = neighbor.GetSymbol() # taking neighbor atoms as context
                pair = (target, context)
                target_context_pairs.append(pair)

                #bond_type = mol.GetBondBetweenAtoms(atom_idx, neighbor.GetIdx()).GetBondType()
    return target_context_pairs       

In [34]:
target_context_pairs = targetContextPairs(smiles)

In [35]:
target_context_pairs[:20]

[('C', 'C'),
 ('C', 'C'),
 ('C', 'C'),
 ('C', 'C'),
 ('C', 'C'),
 ('C', 'C'),
 ('C', 'C'),
 ('C', 'C'),
 ('C', 'C'),
 ('C', 'C'),
 ('C', 'C'),
 ('C', 'C'),
 ('C', 'C'),
 ('C', 'S'),
 ('C', 'C'),
 ('S', 'C'),
 ('S', 'O'),
 ('S', 'O'),
 ('S', 'N'),
 ('O', 'S')]

## One-hot-encoding

The following function will prepare input (x) and output (Y) data. We will use these arrays to train our machine learning model.
* x will contain target atoms' one-hot-encoded represantations.
* Y will contain one-hot-encoded represantations of context atoms.

In [44]:
def createInputOutputData(target_context_pairs, vector_size, atom2label):
    x,Y = [],[] # input and output vectors
    for pair in target_context_pairs:
        # initialize x_ and y_ vectors for x and y instances
        x_ = [0]*vector_size # instance x
        y_ = [0]*vector_size # instance y
        
        x_label, y_label = atom2label[pair[0]], atom2label[pair[1]]
        # put 1 to the index where the atom's label equals to
        x_[x_label] = 1
        y_[y_label] = 1
        
        x.append(x_)
        Y.append(y_)
    return x, Y

In [45]:
x,Y = createInputOutputData(target_context_pairs, vector_size, atom2label)

In [47]:
x[:20]

[[1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 1, 0, 0, 0, 0]]

In [49]:
Y[:20]

[[1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1, 0]]

## Using a Deep Learning Model to Vectorize Atoms

In [53]:
import keras

Using TensorFlow backend.


ImportError: No module named tensorflow

In [50]:
# keras yada tensorflow farketmez
# iki hidden layer'ı olan bi MLP kur
# epoch sayısı da fazla olsun
# ilk hidden layer'ın input weight'leri atomların vektörleştirilmiş hali olacak
# atomları 2 boyutlu düzlemde görselleştir

## Protein Classification

In [52]:
# kendi oluşturduğumuz atom vektörleriyle proteinler represent edilecek
# proteinler sınıflandırılacak, accuracy ölçülecek
# toxicity, solubility dataları kullanılabilir

## Vectorize Substructures

In [51]:
# Atomlara yaptığımız gibi substructure'ları vektörleştirebiliriz