<a href="https://colab.research.google.com/github/hojungnam/LAIDD_DTI/blob/main/LAIDD_AI_DTI_by_hjnam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## RDKit을 이용한 compound descriptor 생성 예제

In [None]:
!pip install rdkit-pypi # install rdkit

In [None]:
from __future__ import absolute_import
import rdkit
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors # Module containing functions to compute molecular descriptors
from rdkit.Chem import Descriptors
import rdkit.rdBase
from rdkit.Chem.MACCSkeys import GenMACCSKeys
from rdkit.Chem import AllChem
from rdkit.Chem import Draw

In [None]:
# Read a single molecule from its SMILES string
m = Chem.MolFromSmiles("CN1C=NC2=C1C(=O)N(C(=O)N2C)C") # caffeine

from rdkit.Chem.Draw import IPythonConsole # needed so molecules are shown in the notebook
from rdkit.Chem.Draw.MolDrawing import MolDrawing, DrawingOptions # only needed if modifying drawing defaults
%matplotlib inline
m  # last expression of the cell: displays the molecule

In [None]:
rdMolDescriptors.CalcExactMolWt(m) # returns the exact molecular weight of m (caffeine)

In [None]:
Descriptors.MolLogP(m)  # estimated logP of m (RDKit documents this as the Wildman-Crippen value)

In [None]:
rdMolDescriptors.CalcMolFormula(m) # returns the molecular formula of m (caffeine)

In [None]:
rdMolDescriptors.CalcNumHBA(m) # returns the number of hydrogen-bond acceptors in m (caffeine)

In [None]:
rdMolDescriptors.CalcNumHBD(m) # returns the number of hydrogen-bond donors in m (caffeine)

In [None]:
# Encode the molecule as a MACCS keys fingerprint
maccs_fp = GenMACCSKeys(m)
maccs_fp.ToBitString()  # show the fingerprint as a string of '0'/'1' characters

In [None]:
# Encode the molecule as a Morgan fingerprint bit vector (radius argument 2, nBits=1024)
morgan_fp = AllChem.GetMorganFingerprintAsBitVect(m,2, nBits=1024)
morgan_fp.ToBitString()  # show the fingerprint as a string of '0'/'1' characters

## Protein embedding


### Example - one-hot encoding

In [None]:
from tensorflow.keras.utils import to_categorical
from keras.models import Model
from keras.layers import Embedding
from keras.preprocessing import sequence
import pandas as pd
import numpy as np

In [None]:
# One-letter residue codes; their position in this list fixes their integer id.
seq_rdic = list("AILVFWYNCQMSTDERHKGPOUXBZ")
# Residue -> id lookup, numbered from 1 so that 0 stays free
# (encodeSeq returns [0] for a missing sequence).
seq_dic = {residue: idx for idx, residue in enumerate(seq_rdic, start=1)}
def encodeSeq(seq, seq_dic):
    """Translate an amino-acid sequence into a list of integer ids.

    Args:
        seq: Sequence of one-letter residue codes, or a null value
            (as judged by ``pd.isnull``).
        seq_dic: Mapping from residue letter to integer id.

    Returns:
        ``[0]`` when ``seq`` is null; otherwise one id per residue, in order.

    Raises:
        KeyError: If a residue is not a key of ``seq_dic``.
    """
    if pd.isnull(seq):
        return [0]
    encoded = []
    for residue in seq:
        encoded.append(seq_dic[residue])
    return encoded

# Encode one example sequence, then pad it to a fixed length of 2500 ids.
example_seq = "MPACCSCSDVFQYETNKVTRIQSMNYGTIKWFFHVIIFSYVCFALVDDKK"
protein = encodeSeq(example_seq, seq_dic)
# pad_sequences expects a batch, hence the extra list level; the result has one row.
protein = sequence.pad_sequences(np.array([protein]), maxlen=2500)

In [None]:
# One-hot encode the padded id sequence.
# num_classes is pinned to the vocabulary size (+1 for id 0, which fills the padded positions).
# Without it, to_categorical sizes the last axis from the largest id that happens to occur
# in this particular sequence, so different proteins would get one-hot matrices of different widths.
one_hot = to_categorical(protein, num_classes=len(seq_dic) + 1)
one_hot_df = pd.DataFrame(one_hot[0,:,:])  # drop the batch axis: rows = positions, columns = ids
one_hot_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Example - Keras integer encoding (adapted from DeepConv-DTI)

In [None]:
from keras.models import Model
from keras.layers import Input, Embedding
from keras.preprocessing import sequence
import pandas as pd
import numpy as np

In [None]:
# One-letter residue codes; their position in this list fixes their integer id.
seq_rdic = list("AILVFWYNCQMSTDERHKGPOUXBZ")
# Residue -> id lookup, numbered from 1 so that 0 stays free
# (encodeSeq returns [0] for a missing sequence).
seq_dic = {residue: idx for idx, residue in enumerate(seq_rdic, start=1)}
def encodeSeq(seq, seq_dic):
    """Translate an amino-acid sequence into a list of integer ids.

    Args:
        seq: Sequence of one-letter residue codes, or a null value
            (as judged by ``pd.isnull``).
        seq_dic: Mapping from residue letter to integer id.

    Returns:
        ``[0]`` when ``seq`` is null; otherwise one id per residue, in order.

    Raises:
        KeyError: If a residue is not a key of ``seq_dic``.
    """
    if pd.isnull(seq):
        return [0]
    encoded = []
    for residue in seq:
        encoded.append(seq_dic[residue])
    return encoded

# Encode one example sequence, then pad it to a fixed length of 2500 ids.
example_seq = "MPACCSCSDVFQYETNKVTRIQSMNYGTIKWFFHVIIFSYVCFALVDDKK"
protein = encodeSeq(example_seq, seq_dic)
# pad_sequences expects a batch, hence the extra list level; the result has one row.
protein = sequence.pad_sequences(np.array([protein]), maxlen=2500)

In [None]:
protein_df = pd.DataFrame(protein)  # one row per sequence, one column per padded position
protein_df  # the saved output shows the zeros first and the residue ids in the last columns

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,2460,2461,2462,2463,2464,2465,2466,2467,2468,2469,2470,2471,2472,2473,2474,2475,2476,2477,2478,2479,2480,2481,2482,2483,2484,2485,2486,2487,2488,2489,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,5,10,7,15,13,8,18,4,13,16,2,10,12,11,8,7,19,13,2,18,6,5,5,17,4,2,2,5,12,7,4,9,5,1,3,4,14,14,18,18


In [None]:
# Integer ids in -> dense vectors out. The model is never trained in this section,
# so predict() only looks up the Embedding layer's initial weights for each position.
input_p = Input(shape=(2500,))  # one id per padded sequence position
model_p = Embedding(input_dim=26, output_dim=20)(input_p)  # Keras embedding layer: 26 possible ids -> 20-d vectors

model_embedding = Model(inputs=[input_p], outputs=model_p)
protein_embedding = model_embedding.predict([protein])

In [None]:
protein_embedding  # display the embedding array returned by predict()

array([[[ 0.02796635, -0.02285252, -0.01567079, ..., -0.01765518,
         -0.01329537,  0.0166272 ],
        [ 0.02796635, -0.02285252, -0.01567079, ..., -0.01765518,
         -0.01329537,  0.0166272 ],
        [ 0.02796635, -0.02285252, -0.01567079, ..., -0.01765518,
         -0.01329537,  0.0166272 ],
        ...,
        [ 0.00628328, -0.02531121,  0.02676379, ..., -0.04914433,
          0.01696781, -0.04917204],
        [ 0.01488573, -0.01522392, -0.03500531, ...,  0.04755663,
         -0.0210974 , -0.02338868],
        [ 0.01488573, -0.01522392, -0.03500531, ...,  0.04755663,
         -0.0210974 , -0.02338868]]], dtype=float32)

In [None]:
# Inspect the result: its shape is printed, its type is the cell's displayed value.
print(protein_embedding.shape)
type(protein_embedding)

(1, 2500, 20)


numpy.ndarray

In [None]:
# Drop the leading batch axis so each row is one sequence position and each column one embedding dimension.
embedding_df = pd.DataFrame(protein_embedding[0])

In [None]:
embedding_df  # display the per-position embedding table

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.027966,-0.022853,-0.015671,-0.016536,0.030555,-0.016744,-0.012349,0.034835,-0.004321,-0.044527,0.022911,0.038945,0.022235,-0.015541,-0.017239,0.028315,-0.009742,-0.017655,-0.013295,0.016627
1,0.027966,-0.022853,-0.015671,-0.016536,0.030555,-0.016744,-0.012349,0.034835,-0.004321,-0.044527,0.022911,0.038945,0.022235,-0.015541,-0.017239,0.028315,-0.009742,-0.017655,-0.013295,0.016627
2,0.027966,-0.022853,-0.015671,-0.016536,0.030555,-0.016744,-0.012349,0.034835,-0.004321,-0.044527,0.022911,0.038945,0.022235,-0.015541,-0.017239,0.028315,-0.009742,-0.017655,-0.013295,0.016627
3,0.027966,-0.022853,-0.015671,-0.016536,0.030555,-0.016744,-0.012349,0.034835,-0.004321,-0.044527,0.022911,0.038945,0.022235,-0.015541,-0.017239,0.028315,-0.009742,-0.017655,-0.013295,0.016627
4,0.027966,-0.022853,-0.015671,-0.016536,0.030555,-0.016744,-0.012349,0.034835,-0.004321,-0.044527,0.022911,0.038945,0.022235,-0.015541,-0.017239,0.028315,-0.009742,-0.017655,-0.013295,0.016627
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,-0.020901,-0.030004,-0.002188,-0.020266,-0.034022,-0.033888,-0.024090,0.005681,-0.036582,0.047582,0.048017,-0.013654,0.004454,-0.014047,0.034650,-0.041141,0.048812,-0.001794,0.014851,-0.039003
2496,0.006283,-0.025311,0.026764,0.048657,-0.000552,-0.016682,-0.045210,0.039489,0.045329,-0.013962,-0.013657,-0.034154,0.020713,-0.045432,0.023333,-0.033562,-0.008607,-0.049144,0.016968,-0.049172
2497,0.006283,-0.025311,0.026764,0.048657,-0.000552,-0.016682,-0.045210,0.039489,0.045329,-0.013962,-0.013657,-0.034154,0.020713,-0.045432,0.023333,-0.033562,-0.008607,-0.049144,0.016968,-0.049172
2498,0.014886,-0.015224,-0.035005,-0.009012,-0.031761,-0.035822,0.000281,-0.034790,0.043411,-0.040869,0.029212,0.045371,-0.024615,0.003762,-0.007901,-0.003016,0.004582,0.047557,-0.021097,-0.023389


In [None]:

aaa = protein_embedding[0,1,:]  # embedding vector at position 1 of the first (only) sequence
print (aaa)
# NOTE(review): np.sqrt is element-wise, and this vector has negative entries, so the result
# contains NaN and NumPy emits a RuntimeWarning (both are visible in the saved output).
# If the vector's length was intended, np.linalg.norm(aaa) may be what was meant — confirm intent.
np.sqrt(aaa)

[ 0.02796635 -0.02285252 -0.01567079 -0.0165356   0.03055452 -0.01674402
 -0.01234943  0.03483542 -0.00432129 -0.04452665  0.02291074  0.03894487
  0.0222354  -0.01554086 -0.01723867  0.02831483 -0.00974164 -0.01765518
 -0.01329537  0.0166272 ]


  after removing the cwd from sys.path.


array([0.16723141,        nan,        nan,        nan, 0.17479852,
              nan,        nan, 0.1866425 ,        nan,        nan,
       0.15136294, 0.19734454, 0.14911538,        nan,        nan,
       0.16827011,        nan,        nan,        nan, 0.12894651],
      dtype=float32)