<a href="https://colab.research.google.com/github/hojungnam/LAIDD_DTI/blob/main/LAIDD_AI_DTI_by_hjnam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## RDKit을 이용한 compound descriptor 생성 예제

In [None]:
# Install the `rdkit-pypi` package so the `rdkit` imports in the next cell resolve.
!pip install rdkit-pypi # install rdkit

In [None]:
from __future__ import absolute_import
import rdkit
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors # Module containing functions to compute molecular descriptors
from rdkit.Chem import Descriptors
import rdkit.rdBase
from rdkit.Chem.MACCSkeys import GenMACCSKeys
from rdkit.Chem import AllChem
from rdkit.Chem import Draw

In [None]:
# Reading single molecules
# Build an RDKit molecule object from a SMILES string.
m = Chem.MolFromSmiles("CN1C=NC2=C1C(=O)N(C(=O)N2C)C") # caffeine

from rdkit.Chem.Draw import IPythonConsole #Needed to show molecules
from rdkit.Chem.Draw.MolDrawing import MolDrawing, DrawingOptions #Only needed if modifying defaults
%matplotlib inline
# A bare last expression makes the notebook display the molecule.
m

In [None]:
rdMolDescriptors.CalcExactMolWt(m) # returns the molecule's exact molecular weight

In [None]:
Descriptors.MolLogP(m) # returns the molecule's calculated logP (Wildman-Crippen estimate, per the RDKit docs)

In [None]:
rdMolDescriptors.CalcMolFormula(m) # returns the molecule's formula

In [None]:
rdMolDescriptors.CalcNumHBA(m) # returns the number of hydrogen-bond acceptors (HBA) in the molecule

In [None]:
rdMolDescriptors.CalcNumHBD(m) # returns the number of hydrogen-bond donors (HBD) in the molecule

In [None]:
# Convert the molecule into a MACCS-keys fingerprint
maccs_fp = GenMACCSKeys(m)
# Display the fingerprint as a bit string
maccs_fp.ToBitString()

In [None]:
# Convert the molecule into a Morgan fingerprint bit vector
# (second argument is the radius, here 2; nBits sets the vector length, here 1024).
# NOTE(review): newer RDKit releases appear to deprecate this function in favour of
# rdFingerprintGenerator - confirm against the RDKit version actually installed.
morgan_fp = AllChem.GetMorganFingerprintAsBitVect(m,2, nBits=1024)
# Display the fingerprint as a bit string
morgan_fp.ToBitString()

## PyBioMed을 이용한 protein descriptor

In [None]:
# NOTE(review): same install command as the first code cell of this notebook;
# presumably repeated so this section can be run on its own - confirm it is still needed.
!pip install rdkit-pypi # install rdkit

In [None]:
# NOTE(review): the original comment said "install pybel", but the package installed
# here is `pybel_tools` - confirm this is the dependency PyBioMed actually needs.
!pip install pybel_tools # install the pybel_tools package

In [None]:
# Fetch the PyBioMed sources from GitHub and install them from the checkout.
# NOTE: this cell is not re-runnable as-is - a second run tries to clone again
# (into the directory %cd already moved to).
!git clone https://github.com/gadsbyfly/PyBioMed.git
# %cd changes the notebook's working directory, and that change persists for all later cells.
%cd PyBioMed
!python setup.py install

### Using PyBioMed - AA composition

In [None]:
import PyBioMed
from PyBioMed.PyProtein import AAComposition

In [None]:
# Amino-acid composition of a short example sequence.
protein = "AAAAAAAAVGE"
AAC = AAComposition.CalculateAAComposition(protein)
print(AAC)

In [None]:
# Keep only the descriptor values. list(...) materialises them as a concrete,
# indexable vector; under Python 3 a bare .values() is only a live view of the dict.
AAC_D = list(AAC.values())

In [None]:
# Show the composition values.
print(AAC_D)

In [None]:
# Number of entries in the amino-acid composition result
len(AAC)

In [None]:
# Dipeptide composition of the same example sequence.
AAD = AAComposition.CalculateAADipeptideComposition(protein)
print(AAD)

In [None]:
# Number of entries in the dipeptide composition result
len(AAD)

### Using PyBioMed - CTD descriptor

In [None]:
from PyBioMed.PyProtein import CTD

# CTD descriptors of a short example sequence.
protein = "AAAAAAAAVGE"
protein_descriptor = CTD.CalculateCTD(protein)
print(protein_descriptor)

In [None]:
# Number of CTD descriptor entries.
print(len(protein_descriptor))

In [None]:
# Display the CTD result using the notebook's rich output instead of print()
protein_descriptor

In [None]:
# Keep only the descriptor values. list(...) materialises them as a concrete,
# indexable vector; under Python 3 a bare .values() is only a live view of the dict.
protein_descriptor_v = list(protein_descriptor.values())
print(protein_descriptor_v)

## Protein embedding 예제


### Example - one-hot encoding

In [1]:
from tensorflow.keras.utils import to_categorical
from keras.models import Model
from keras.layers import Embedding
from keras.preprocessing import sequence
import pandas as pd
import numpy as np

In [2]:
# Amino-acid alphabet: the 20 standard residues followed by O, U, X, B and Z.
seq_rdic = list("AILVFWYNCQMSTDERHKGPOUXBZ")
# Give every symbol a 1-based integer code, in alphabet order (code 0 is not assigned here).
seq_dic = dict(zip(seq_rdic, range(1, len(seq_rdic) + 1)))
def encodeSeq(seq, seq_dic):  # change AA to number
    """Convert an amino-acid sequence into a list of integer codes.

    Args:
        seq: Sequence string (one character per residue), or a missing
            value (anything ``pd.isnull`` reports as null, e.g. None/NaN).
        seq_dic: Mapping from residue symbol to integer code.

    Returns:
        ``[0]`` for a missing sequence, otherwise one code per character
        of ``seq`` (an empty string gives ``[]``).

    Raises:
        KeyError: If a character is found in ``seq_dic`` neither as written
            nor upper-cased.
    """
    if pd.isnull(seq):
        return [0]
    # Exact match first so custom dictionaries behave exactly as before;
    # otherwise retry upper-cased so lower-case sequences ("mpa") are accepted.
    return [seq_dic[aa] if aa in seq_dic else seq_dic[aa.upper()] for aa in seq]
# Show the symbol -> code mapping.
print(seq_dic)

{'A': 1, 'I': 2, 'L': 3, 'V': 4, 'F': 5, 'W': 6, 'Y': 7, 'N': 8, 'C': 9, 'Q': 10, 'M': 11, 'S': 12, 'T': 13, 'D': 14, 'E': 15, 'R': 16, 'H': 17, 'K': 18, 'G': 19, 'P': 20, 'O': 21, 'U': 22, 'X': 23, 'B': 24, 'Z': 25}


In [3]:
# Integer-encode an example protein sequence, then show the codes and their count.
protein = encodeSeq("MPACCSCSDVFQYETNKVTRIQSMNYGTIKWFFHVIIFSYVCFALVDDKKZ", seq_dic)
print(protein, len(protein), sep="\n")

[11, 20, 1, 9, 9, 12, 9, 12, 14, 4, 5, 10, 7, 15, 13, 8, 18, 4, 13, 16, 2, 10, 12, 11, 8, 7, 19, 13, 2, 18, 6, 5, 5, 17, 4, 2, 2, 5, 12, 7, 4, 9, 5, 1, 3, 4, 14, 14, 18, 18, 25]
51


In [4]:
# Pad the encoded sequence to a fixed length of 100 (a batch of one sequence).
# np.atleast_2d replaces np.array([protein]) so the cell is safe to re-run:
# a flat list of codes becomes shape (1, L) exactly as before, while an already
# padded (1, 100) array is passed through unchanged instead of being wrapped
# into a 3-D array.
protein = sequence.pad_sequences(np.atleast_2d(protein), maxlen=100)
print(protein)

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0 11 20  1  9  9 12  9 12 14  4  5 10  7 15 13  8 18  4 13 16  2 10 12
  11  8  7 19 13  2 18  6  5  5 17  4  2  2  5 12  7  4  9  5  1  3  4 14
  14 18 18 25]]


In [5]:
# One-hot encode the padded integer sequence.
# num_classes is pinned to the alphabet size + 1 (code 0 is the padding value) so the
# width no longer depends on the largest code that happens to occur in the sequence.
# For this example (which contains 'Z' = 25) the result is the same 26 columns as before.
one_hot = to_categorical(protein, num_classes=len(seq_dic) + 1)
# Drop the batch axis: rows are sequence positions, columns are classes.
one_hot_df = pd.DataFrame(one_hot[0,:,:])
one_hot_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Example - Keras integer encoding / adapted from DeepConv-DTI

In [6]:
from keras.models import Model
from keras.layers import Input, Embedding
from keras.preprocessing import sequence
import pandas as pd
import numpy as np

In [7]:
# Amino-acid alphabet: the 20 standard residues followed by O, U, X, B and Z.
seq_rdic = list("AILVFWYNCQMSTDERHKGPOUXBZ")
# 1-based integer code per symbol, in alphabet order (code 0 is not assigned here).
seq_dic = {symbol: code for code, symbol in enumerate(seq_rdic, start=1)}
def encodeSeq(seq, seq_dic):  # change AA to number
    """Convert an amino-acid sequence into a list of integer codes.

    Args:
        seq: Sequence string (one character per residue), or a missing
            value (anything ``pd.isnull`` reports as null, e.g. None/NaN).
        seq_dic: Mapping from residue symbol to integer code.

    Returns:
        ``[0]`` for a missing sequence, otherwise one code per character
        of ``seq`` (an empty string gives ``[]``).

    Raises:
        KeyError: If a character is found in ``seq_dic`` neither as written
            nor upper-cased.
    """
    if pd.isnull(seq):
        return [0]
    # Exact match first so custom dictionaries behave exactly as before;
    # otherwise retry upper-cased so lower-case sequences ("mpa") are accepted.
    return [seq_dic[aa] if aa in seq_dic else seq_dic[aa.upper()] for aa in seq]

# Integer-encode the example sequence and pad it to length 100 in a single step
# (a batch containing one sequence).
protein = sequence.pad_sequences(
    np.array([encodeSeq("MPACCSCSDVFQYETNKVTRIQSMNYGTIKWFFHVIIFSYVCFALVDDKKZ", seq_dic)]),
    maxlen=100,
)

In [8]:
# Wrap the padded integer array in a DataFrame so the notebook renders it as a table
protein_df = pd.DataFrame(protein)
protein_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,10,7,15,13,8,18,4,13,16,2,10,12,11,8,7,19,13,2,18,6,5,5,17,4,2,2,5,12,7,4,9,5,1,3,4,14,14,18,18,25


In [9]:
# Model input: one integer-encoded sequence of length 100.
input_p = Input(shape=(100,))
# Keras embedding layer: 26 possible integer codes (0-25), each mapped to a 20-dimensional vector.
model_p = Embedding(input_dim=26, output_dim=20)(input_p)

# No training happens in this cell, so predict() returns the layer's initial weights
# looked up for each position of the sequence.
model_embedding = Model(inputs=[input_p], outputs=model_p)
protein_embedding = model_embedding.predict([protein])

In [10]:
# Take the first (only) item of the batch: one row per sequence position,
# one column per embedding dimension.
embedding_df = pd.DataFrame(protein_embedding[0])

In [11]:
# Display the embedding table using the notebook's rich output
embedding_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-0.034496,-0.021885,0.008522,-0.023791,-0.030547,0.041169,-0.014744,0.042474,-0.014674,-0.009554,-0.015578,-0.014704,-0.036156,0.048641,-0.009215,0.038844,-0.036094,0.024915,0.045346,-0.030843
1,-0.034496,-0.021885,0.008522,-0.023791,-0.030547,0.041169,-0.014744,0.042474,-0.014674,-0.009554,-0.015578,-0.014704,-0.036156,0.048641,-0.009215,0.038844,-0.036094,0.024915,0.045346,-0.030843
2,-0.034496,-0.021885,0.008522,-0.023791,-0.030547,0.041169,-0.014744,0.042474,-0.014674,-0.009554,-0.015578,-0.014704,-0.036156,0.048641,-0.009215,0.038844,-0.036094,0.024915,0.045346,-0.030843
3,-0.034496,-0.021885,0.008522,-0.023791,-0.030547,0.041169,-0.014744,0.042474,-0.014674,-0.009554,-0.015578,-0.014704,-0.036156,0.048641,-0.009215,0.038844,-0.036094,0.024915,0.045346,-0.030843
4,-0.034496,-0.021885,0.008522,-0.023791,-0.030547,0.041169,-0.014744,0.042474,-0.014674,-0.009554,-0.015578,-0.014704,-0.036156,0.048641,-0.009215,0.038844,-0.036094,0.024915,0.045346,-0.030843
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.044069,-0.042337,0.042486,0.031406,-0.010583,-0.038985,-0.011479,-0.045384,0.021845,-0.027157,0.034716,-0.044587,-0.042444,-0.024022,0.021321,0.046482,-0.040060,-0.008731,0.019504,0.042504
96,0.044069,-0.042337,0.042486,0.031406,-0.010583,-0.038985,-0.011479,-0.045384,0.021845,-0.027157,0.034716,-0.044587,-0.042444,-0.024022,0.021321,0.046482,-0.040060,-0.008731,0.019504,0.042504
97,-0.031093,-0.002615,0.045176,0.002753,0.018813,0.047322,-0.039241,0.015643,0.012318,-0.018939,0.049070,-0.049901,-0.003296,0.006624,0.008762,-0.008751,-0.002175,-0.016037,-0.028953,-0.028866
98,-0.031093,-0.002615,0.045176,0.002753,0.018813,0.047322,-0.039241,0.015643,0.012318,-0.018939,0.049070,-0.049901,-0.003296,0.006624,0.008762,-0.008751,-0.002175,-0.016037,-0.028953,-0.028866
