<a href="https://colab.research.google.com/github/hojungnam/LAIDD_DTI/blob/main/LAIDD_AI_DTI_by_hjnam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## RDKit을 이용한 compound descriptor 생성 예제

In [None]:
!pip install rdkit-pypi # install rdkit

In [None]:
from __future__ import absolute_import
import rdkit
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors # Module containing functions to compute molecular descriptors
from rdkit.Chem import Descriptors
import rdkit.rdBase
from rdkit.Chem.MACCSkeys import GenMACCSKeys
from rdkit.Chem import AllChem
from rdkit.Chem import Draw

In [None]:
# Parse a single molecule from its SMILES string
m = Chem.MolFromSmiles("CN1C=NC2=C1C(=O)N(C(=O)N2C)C") # caffeine

from rdkit.Chem.Draw import IPythonConsole # needed to show molecules in the notebook
from rdkit.Chem.Draw.MolDrawing import MolDrawing, DrawingOptions # only needed if modifying drawing defaults
%matplotlib inline
m # last expression of the cell, so the molecule is displayed as the cell output

In [None]:
rdMolDescriptors.CalcExactMolWt(m) # exact molecular weight of the molecule; shown as the cell output

In [None]:
Descriptors.MolLogP(m) # logP descriptor of the molecule (presumably the Crippen estimate — confirm in the RDKit docs)

In [None]:
rdMolDescriptors.CalcMolFormula(m) # molecular formula of the molecule; shown as the cell output

In [None]:
rdMolDescriptors.CalcNumHBA(m) # number of hydrogen-bond acceptors in the molecule

In [None]:
rdMolDescriptors.CalcNumHBD(m) # number of hydrogen-bond donors in the molecule

In [None]:
# Convert the molecule into its MACCS keys fingerprint and show it as a bit string
maccs_fp = GenMACCSKeys(m)
maccs_fp.ToBitString()

In [None]:
# Convert the molecule into a Morgan fingerprint bit vector (second argument 2, nBits=1024)
# and show it as a bit string.
# NOTE(review): recent RDKit releases reportedly deprecate GetMorganFingerprintAsBitVect in
# favour of rdFingerprintGenerator — confirm against the installed version.
morgan_fp = AllChem.GetMorganFingerprintAsBitVect(m,2, nBits=1024)
morgan_fp.ToBitString()

## PyBioMed을 이용한 protein descriptor

In [None]:
!pip install rdkit-pypi # install rdkit

In [None]:
!pip install pybel_tools # install pybel

In [None]:
# Fetch the PyBioMed sources, change into the checkout and install from it.
# NOTE(review): not idempotent — `%cd` persists for the rest of the session, so a second run
# starts from inside the checkout and the clone/cd/install steps no longer act on the same paths.
# NOTE(review): later `import PyBioMed` cells may depend on this working directory — confirm
# before changing or removing the `%cd`.
!git clone https://github.com/gadsbyfly/PyBioMed.git
%cd PyBioMed
!python setup.py install

### Using PyBioMed - AA composition

In [None]:
import PyBioMed
from PyBioMed.PyProtein import AAComposition

In [None]:
# Amino-acid composition of a short toy sequence
protein = "AAAAAAAAVGE"
AAC = AAComposition.CalculateAAComposition(protein)
print(AAC)

In [None]:
# Keep only the descriptor values. Assuming AAC is a dict, `.values()` is a view object in
# Python 3; `list(...)` materialises it into a plain, indexable list.
AAC_D = list(AAC.values())

In [None]:
# Show the descriptor values without their keys
print(AAC_D)

In [None]:
len(AAC) # number of entries in the amino-acid composition result

In [None]:
# Dipeptide composition of the `protein` sequence defined in an earlier cell
AAD = AAComposition.CalculateAADipeptideComposition(protein)
print(AAD)

In [None]:
len(AAD) # number of entries in the dipeptide composition result

### Using PyBioMed - CTD descriptor

In [None]:
from PyBioMed.PyProtein import CTD

# CTD descriptor of a short toy sequence
protein = "AAAAAAAAVGE"
protein_descriptor = CTD.CalculateCTD(protein)
print(protein_descriptor)

In [None]:
# Number of entries in the CTD result
print(len(protein_descriptor))

In [None]:
protein_descriptor # bare expression: the notebook renders the CTD result as the cell output

In [None]:
# Keep only the descriptor values. Assuming protein_descriptor is a dict, `.values()` is a
# view object in Python 3; `list(...)` materialises it into a plain, indexable list.
protein_descriptor_v = list(protein_descriptor.values())
print(protein_descriptor_v)

## Protein embedding 예제


### Example - one-hot encoding

In [None]:
from tensorflow.keras.utils import to_categorical
from keras.models import Model
from keras.layers import Embedding
from keras.preprocessing import sequence
import pandas as pd
import numpy as np

In [None]:
# Amino-acid alphabet. A residue's 1-based position in this list is its integer code,
# so code 0 is never assigned to a residue here.
seq_rdic = [
    'A', 'I', 'L', 'V', 'F', 'W', 'Y', 'N', 'C', 'Q', 'M', 'S', 'T',
    'D', 'E', 'R', 'H', 'K', 'G', 'P', 'O', 'U', 'X', 'B', 'Z',
]
seq_dic = dict(zip(seq_rdic, range(1, len(seq_rdic) + 1)))
def encodeSeq(seq, seq_dic):
    """Translate an amino-acid sequence into a list of integer codes.

    Args:
        seq: Sequence to encode, iterated residue by residue. A missing value
            (anything `pd.isnull` reports as null) is accepted.
        seq_dic: Mapping from residue to integer code.

    Returns:
        `[0]` when `seq` is missing, otherwise one code per residue, in order.

    Raises:
        KeyError: If a residue is not a key of `seq_dic`.
    """
    # Missing sequence: return a single 0 instead of iterating.
    if pd.isnull(seq):
        return [0]
    codes = []
    for residue in seq:
        codes.append(seq_dic[residue])
    return codes
# Show the residue-to-code lookup table
print(seq_dic)

In [None]:
# Integer-encode an example sequence, then show the codes and how many there are
protein = encodeSeq("MPACCSCSDVFQYETNKVTRIQSMNYGTIKWFFHVIIFSYVCFALVDDKKZ", seq_dic)
print(protein)
print(len(protein))

In [None]:
# Pad the encoded sequence to a fixed length of 100, as a batch containing one row.
# `reshape(1, -1)` turns both the raw code list and an already padded (1, 100) array into a
# single-row 2-D array, so re-running this cell does not wrap the data in an extra dimension.
protein = sequence.pad_sequences(np.asarray(protein).reshape(1, -1), maxlen=100)
print(protein)

In [None]:
# One-hot encode every position. `num_classes` is pinned to the number of residue codes plus
# index 0, so the width no longer depends on the largest code that happens to occur in this
# particular sequence.
one_hot = to_categorical(protein, num_classes=len(seq_dic) + 1)
# Select the first (only) sequence and tabulate it: one row per position, one column per class.
one_hot_df = pd.DataFrame(one_hot[0, :, :])
one_hot_df

### Example - Keras integer encoding / adapted from DeepConv-DTI

In [None]:
from keras.models import Model
from keras.layers import Input, Embedding
from keras.preprocessing import sequence
import pandas as pd
import numpy as np

In [None]:
# Amino-acid alphabet. A residue's 1-based position in this list is its integer code,
# so code 0 is never assigned to a residue here.
seq_rdic = [
    'A', 'I', 'L', 'V', 'F', 'W', 'Y', 'N', 'C', 'Q', 'M', 'S', 'T',
    'D', 'E', 'R', 'H', 'K', 'G', 'P', 'O', 'U', 'X', 'B', 'Z',
]
seq_dic = dict(zip(seq_rdic, range(1, len(seq_rdic) + 1)))
def encodeSeq(seq, seq_dic):
    """Translate an amino-acid sequence into a list of integer codes.

    Args:
        seq: Sequence to encode, iterated residue by residue. A missing value
            (anything `pd.isnull` reports as null) is accepted.
        seq_dic: Mapping from residue to integer code.

    Returns:
        `[0]` when `seq` is missing, otherwise one code per residue, in order.

    Raises:
        KeyError: If a residue is not a key of `seq_dic`.
    """
    # Missing sequence: return a single 0 instead of iterating.
    if pd.isnull(seq):
        return [0]
    codes = []
    for residue in seq:
        codes.append(seq_dic[residue])
    return codes

# Integer-encode the example sequence ...
protein = encodeSeq("MPACCSCSDVFQYETNKVTRIQSMNYGTIKWFFHVIIFSYVCFALVDDKKZ", seq_dic)
# ... then pad it to a fixed length of 100 as a batch containing one row
protein = sequence.pad_sequences(np.array([protein]), maxlen=100)

In [None]:
# Tabulate the padded integer codes so the notebook renders them as a table
protein_df = pd.DataFrame(protein)
protein_df

In [None]:
# Pass the integer codes through a Keras Embedding layer that outputs 20 values per position.
# The sizes are derived from the data instead of being hard-coded: the input length is the
# padded width of `protein`, and the vocabulary is every residue code plus index 0.
# NOTE(review): no seed or training step is visible here, so the embedding values presumably
# come from the layer's initial weights and differ between runs — confirm.
input_p = Input(shape=(protein.shape[1],))
model_p = Embedding(len(seq_dic) + 1, 20)(input_p) # Keras embedding layer

model_embedding = Model(inputs=[input_p], outputs=model_p)
protein_embedding = model_embedding.predict([protein])

In [None]:
# Select index 0 of the leading axis and tabulate the remaining two axes as a DataFrame
embedding_df = pd.DataFrame(protein_embedding[0,:,:])

In [None]:
embedding_df # bare expression: the notebook renders the DataFrame as the cell output