## 蛋白质编码

读取AMPs和notAMPs序列，转化为2个通道的特征向量表示。其中通道1的数据来自hmmer的profile；通道2的数据来自氨基酸Onehot编码;通道3来自AA的物化性质

In [20]:
import numpy as np
from Bio import SeqIO
import json
import re
M1 = 800#879
M2 = 800#2405
M = M1+M2

In [15]:
# 对矩阵进行归一化
def maxminnorm(array):
    maxcols=array.max(axis=0)
    mincols=array.min(axis=0)
    data_shape = array.shape
    data_rows = data_shape[0]
    data_cols = data_shape[1]
    t=np.empty((data_rows,data_cols))
    for i in range(data_cols):
        if maxcols[i] > mincols[i]:
            t[:,i]=(array[:,i]-mincols[i])/(maxcols[i]-mincols[i])
    return t

# 加载来自hmmer profil的数据
# r=0 表示从头取numAA个氨基酸的profil，尾不足补0; r=-1 表示从尾往前取numAA个氨基酸的profil，头不足补0
def load_hmm_prof(numAA=50, r=0):
    files = ['e:/repoes/ampnet/data/benchmark/AMPs_50_hmm_profil.json',
         'e:/repoes/ampnet/data/benchmark/notAMPs_50_hmm_profil.json']
    fastafiles=[r'E:\Repoes\AMPnet\data\benchmark\AMPs_50.fasta',r'E:\Repoes\AMPnet\data\benchmark\notAMPs_50.fasta']
    #files = ['e:/repoes/ampnet/data/benchmark/wpAMPs_hmm_profil.json',
    #    'e:/repoes/ampnet/data/benchmark/wpnotAMPs_hmm_profil.json']
    #fastafiles=[r'E:\Repoes\AMPnet\data\benchmark\wpAMPs.fasta',r'E:\Repoes\AMPnet\data\benchmark\wpnotAMPs.fasta']
    N = numAA * 20
    X = np.ndarray((M,N))
    y = np.ones(M)
    y[M1:] = 0
    k = 0
    for ii in [0,1]:
        fr = open(files[ii],'r')
        p = json.load(fr)
        
        for seq_record in SeqIO.parse(fastafiles[ii],'fasta'):
            key = str(seq_record.id)
            ary = p[key]
            
            tm = np.array(ary).reshape([-1,20])
               
            c = len(ary)
            if r == 0:
                if c < N:
                    tm = maxminnorm(tm)# 归一化
                    X[k][:c] = tm.reshape(c)
                    X[k][c:] = 0
                elif c == N:
                    tm = maxminnorm(tm)# 归一化
                    X[k] = tm.reshape(c)
                else:
                    t = tm[:numAA,:]
                    t = maxminnorm(t)# 归一化
                    X[k] = t.reshape(N)
            else:
                if c < N:
                    tm = maxminnorm(tm)
                    X[k][-c:] = tm.reshape(c)
                    X[k][:-c] = 0
                elif c==N:
                    tm = maxminnorm(tm)
                    X[k] = tm.reshape(c)
                else:
                    t = tm[-numAA:,:]
                    t = maxminnorm(t)
                    X[k] = t.reshape(N)
            
            k += 1
        fr.close()
        
    return X, y

In [16]:
def AAOneHot():
    files=[r'E:\Repoes\AMPnet\data\benchmark\AMPs_50.fasta',r'E:\Repoes\AMPnet\data\benchmark\notAMPs_50.fasta']
    #files=[r'E:\Repoes\AMPnet\data\benchmark\wpAMPs.fasta',r'E:\Repoes\AMPnet\data\benchmark\wpnotAMPs.fasta']
    text='PQRYWTMNVELHSFCIKADG'
    N = 1000
    X = np.ndarray((M,N))
    y = np.ones(M)
    y[M1:] = 0
    k = 0
    for file in files:
        for seq_record in SeqIO.parse(file,'fasta'):
            seq = str(seq_record.seq)
            seq = re.sub('[XZUB]',"",seq)
            #print("\r{}".format(seq),end="")
            c = len(seq)
            m = np.zeros((len(seq),20))
            for i in range(c):
                j = text.index(seq[i])
                m[i][j] = 1
           
            m = m.reshape((1,-1))
            #print("in {},{}:{}".format(file,seq_record.id,m.shape))
            # 只截取蛋白质序列前50个aa，不足的补0
            c = c*20
            if c < N:
                X[k][:c] = m[0]
                X[k][c:] = 0
            elif c == N:
                X[k] = m[0]
            else:
                X[k] = m[0][:N] 
            k += 1
    return X,y

In [17]:
# 两联体编码
def dAAOneHot():
    files=[r'E:\Repoes\AMPnet\data\benchmark\AMPs_50.fasta',r'E:\Repoes\AMPnet\data\benchmark\notAMPs_50.fasta']
    #files=[r'E:\Repoes\AMPnet\data\benchmark\wpAMPs.fasta',r'E:\Repoes\AMPnet\data\benchmark\wpnotAMPs.fasta']
    text='PQRYWTMNVELHSFCIKADG'
    daa=[x+y for x in text for y in text]
    N = 400
    X = np.ndarray([M,400])
    y = np.ones(M)
    y[M1:] = 0
    k = 0
    for file in files:
        for seq_record in SeqIO.parse(file,'fasta'):
            seq = str(seq_record.seq)
            seq = re.sub('[XZUB]',"",seq)
            t = np.ndarray(400)
            for j in range(400):
                t[j] = seq.count(daa[j])
            t = t.reshape([-1,20])
            t = maxminnorm(t)
            X[k] = t.reshape(N)
            k += 1
    return X,y

In [5]:
def AAPhyChemOneHot():
    #files=[r'E:\Repoes\AMPnet\data\benchmark\AMPs_50.fasta',
     #      r'E:\Repoes\AMPnet\data\benchmark\notAMPs_50.fasta']
    files=[r'E:\Repoes\AMPnet\data\benchmark\wpAMPs.fasta',r'E:\Repoes\AMPnet\data\benchmark\wpnotAMPs.fasta']
    text='PQRYWTMNVELHSFCIKADG'
    N = 1000
    X = np.ndarray((M,N))
    y = np.ones(M)
    y[M1:] = 0
    k = 0
    phychemDict={}
    phychemDict["alcohol"]=("S","T")# 有乙醇基
    phychemDict["aliphatic"]=("I","L","V")# 脂肪族
    phychemDict["aromatic"]=("F","H","W","Y")# 芳香族
    phychemDict["charged"]=("D","E","H","K","R")# 带电性
    phychemDict["positive"]=("K","H","R")# 带正电
    phychemDict["negative"]=("D","E")# 带负电
    phychemDict["polar"]=("A","L","I","P","F","W","M")# 非极性
    phychemDict["small"]=("A","C","D","G","N","P","S","T","V")# 小分子
    phychemDict["turnlike"]=("A","C","D","E","G","H","K","N","Q","R","S","T")
    phychemDict["hydrophobic"]=("A","F","I","L","M","P","V","W","Y")# 疏水
    phychemDict["asa"]=("A","N","D","C","P","S","T","G","V")# 可溶解表面积低于平均值
    phychemDict["pr"]=("F","Y","W")# 在紫外区有光吸收能力
    
    keys = phychemDict.keys()
    print("keys:",keys)
    for file in files:
        for seq_record in SeqIO.parse(file,'fasta'):
            seq = str(seq_record.seq)
            seq = re.sub('[XZUB]',"",seq)
            c = len(seq)
            m = np.zeros((len(seq),20))
            for i in range(c):
                j = 0
                for key in keys:
                    val = phychemDict[key]
                    if seq[i] in val:
                        m[i][j] = 1
                    j += 1
                
            m = m.reshape((1,-1))
            #print("in {},{}:{}".format(file,seq_record.id,m.shape))
            # 只截取蛋白质序列前50个aa，不足的补0
            c = c*20
            if c < N:
                X[k][:c] = m[0]
                X[k][c:] = 0
            elif c == N:
                X[k] = m[0]
            else:
                X[k] = m[0][:N] 
            k += 1
    return X,y

定义函数，构建2通道的数据集。通道一是hmmer profile；通道二是AA的ONEHOTE编码

In [6]:
def getTwoChannelsArray():
    X1,y = load_hmm_prof()
    X2,y = AAOneHot()
    X=np.ndarray([M,50,20,2])
    X11 = X1.reshape([M,50,20])
    X21 = X2.reshape([M,50,20])
    X[:,:,:,0]=X11
    X[:,:,:,1]=X21
    return X,y

In [7]:
def getThreeChannelsArray():
    X1,y = load_hmm_prof()
    X2,y = AAOneHot()
    X3,y = AAPhyChemOneHot()
    X=np.ndarray([M,50,20,3])
    X11 = X1.reshape([M,50,20])
    X21 = X2.reshape([M,50,20])
    X31 = X3.reshape([M,50,20])
    X[:,:,:,0]=X11
    X[:,:,:,1]=X21
    X[:,:,:,2]=X31
    return X,y

## 构建卷积网络进行训练

In [8]:
import tflearn
import tensorflow as tf
from tflearn.data_utils import shuffle, to_categorical
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.normalization import local_response_normalization
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.estimator import regression
from tflearn.data_preprocessing import ImagePreprocessing
from tflearn.data_augmentation import ImageAugmentation
from sklearn.model_selection import LeaveOneOut, KFold
from sklearn.metrics import accuracy_score, auc, roc_curve, matthews_corrcoef

curses is not supported on this machine (please install/reinstall curses for an optimal experience)


构建网络结构

In [30]:
def create_alexnet(num_classes,channels=1):
    # Building 'AlexNet'
    network = input_data(shape=[None, 50, 20, channels])
    network = conv_2d(network, 96, 11, strides=4, activation='relu')
    network = max_pool_2d(network, 3, strides=2)#[-1,25,10,96]
    network = local_response_normalization(network)
    network = conv_2d(network, 256, 5, activation='relu')
    network = max_pool_2d(network, 3, strides=2)#[-1,13,5,256]
    network = local_response_normalization(network)
    network = conv_2d(network, 384, 3, activation='relu')
    network = conv_2d(network, 384, 3, activation='relu')
    network = conv_2d(network, 256, 3, activation='relu')
    network = max_pool_2d(network, 3, strides=2)#[-1,7,3,256]
    network = local_response_normalization(network)
    network = fully_connected(network, 4096, activation='tanh')
    network = dropout(network, 0.5)
    network = fully_connected(network, 4096, activation='tanh')
    network = dropout(network, 0.5)
    network = fully_connected(network, num_classes, activation='softmax')
    network = regression(network, optimizer='momentum',
                         loss='categorical_crossentropy',
                         learning_rate=0.001)
    return network

In [10]:
def create_cifarnet(num_classes,channels=1):
    # Real-time data preprocessing
    img_prep = ImagePreprocessing()
    img_prep.add_featurewise_zero_center()
    img_prep.add_featurewise_stdnorm()
    
    # Real-time data augmentation
    img_aug = ImageAugmentation()
    img_aug.add_random_flip_leftright()
    img_aug.add_random_rotation(max_angle=25.)
    
    # Convolutional network building
    network = input_data(shape=[None, 20, 20, channels],
                         data_preprocessing=img_prep,
                         data_augmentation=img_aug)
    network = conv_2d(network, 32, 3, activation='relu')
    network = max_pool_2d(network, 2)
    network = dropout(network, 0.75)
    network = conv_2d(network, 64, 3, activation='relu')
    network = conv_2d(network, 64, 3, activation='relu')
    network = max_pool_2d(network, 2)
    network = dropout(network, 0.5)
    network = fully_connected(network, 512, activation='tanh')
    network = dropout(network, 0.8)
    network = fully_connected(network, 512, activation='tanh')
    network = dropout(network, 0.8)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam',
                         loss='categorical_crossentropy',
                         learning_rate=0.001)
    return network
    

In [34]:
# Building 'VGG Network'
def create_vggnet(num_classes,channels=1):
    network = input_data(shape=[None, 20, 20, channels])

    network = conv_2d(network, 64, 3, activation='relu')
    network = conv_2d(network, 64, 3, activation='relu')
    network = max_pool_2d(network, 2, strides=2)#[-1,25,10,64]

    network = conv_2d(network, 128, 3, activation='relu')
    network = conv_2d(network, 128, 3, activation='relu')
    network = max_pool_2d(network, 2, strides=2)#[-1,13,5,128]

    network = conv_2d(network, 256, 3, activation='relu')
    network = conv_2d(network, 256, 3, activation='relu')
    network = conv_2d(network, 256, 3, activation='relu')
    network = max_pool_2d(network, 2, strides=2)#[-1,7,3,256]
    
    network = conv_2d(network, 512, 3, activation='relu')
    network = conv_2d(network, 512, 3, activation='relu')
    network = conv_2d(network, 512, 3, activation='relu')
    network = max_pool_2d(network, 2, strides=2)

    network = conv_2d(network, 512, 3, activation='relu')
    network = conv_2d(network, 512, 3, activation='relu')
    network = conv_2d(network, 512, 3, activation='relu')
    network = max_pool_2d(network, 2, strides=2)
    
    network = fully_connected(network, 2048, activation='relu')
    network = dropout(network, 0.5)
    network = fully_connected(network, 4096, activation='relu')
    network = dropout(network, 0.5)
    network = fully_connected(network, num_classes, activation='softmax')

    network = regression(network, optimizer='rmsprop',
                         loss='softmax_categorical_crossentropy',
                         learning_rate=0.0001)
    return network

定义交叉验证函数

In [11]:
def jackknife_test(X, y, num_classes=2,channels=1,n_epoch=100):
    y_pred = np.zeros(M)
    loo = LeaveOneOut()
    for train_index, test_index in loo.split(X):
        print("\r In predicting {}".format(test_index))
        X_train, X_test = X[train_index], [X[test_index]]
        y_train, y_test = y[train_index], [y[test_index]]
        tf.reset_default_graph()
        net = create_alexnet(num_classes,channels)
        model = tflearn.DNN(net, tensorboard_verbose=0)
        model.fit(X_train, y_train, n_epoch, shuffle=True, 
              validation_set=(X_test,y_test),
              show_metric=True, batch_size=32, 
              run_id='AMP_cnn')
        y_pred[test_index] = model.predict(X_test)
        
    return y_pred


def cross_validate(X,y,n_splits=3,num_classes=2,channels=1,n_epoch=100):
    pred_prob = np.zeros([M,2])
    kf = KFold(n_splits)
    for train_index, test_index in kf.split(X):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        tf.reset_default_graph()
        net = create_alexnet(num_classes,channels)
        model = tflearn.DNN(net, tensorboard_verbose=0)
        model.fit(X_train, y_train, n_epoch, shuffle=True, 
              validation_set=(X_test,y_test),
              show_metric=True, batch_size=32)
        pred_prob[test_index] = model.predict(X_test)
        
    return pred_prob

In [12]:
# 计算准确率acc和可接受曲线下面积AUC
from sklearn.metrics import matthews_corrcoef
def metric(y, predprob):
    d1 = len(y)
    y_pred = np.zeros((d1,2))
    for i in range(d1):
        if predprob[i][0] > predprob[i][1]:
            y_pred[i][0] = 1
        else:
            y_pred[i][1] = 1
            
    accuracy=accuracy_score(y[:,0],y_pred[:,0])
    print("accuracy={}".format(accuracy))
    fpr,tpr,thresholds=roc_curve(y[:,0],predprob[:,0],pos_label=1)
    print("AUC={}".format(auc(fpr,tpr)))
    mcc = matthews_corrcoef(y[:,0],y_pred[:,0])
    print("mcc={}".format(mcc))

### 仅hmmer profil一个通道数据的训练

In [None]:
X1,y = load_hmm_prof()
X1 = X1.reshape([-1,50,20,1])
y = np.zeros((M,2))
for i in range(M1):
    y[i][0] = 1
for i in range(M1,M):
    y[i][1] = 1
# 交叉验证
X1,y = shuffle(X1,y)
predprob = cross_validate(X1,y,n_splits=5,num_classes=2,channels=1,n_epoch=50)
metric(y, predprob)# print: 0.795, 0.868684

### 两个通道数据的训练

In [None]:
# 获取数据
X,y = getTwoChannelsArray()
yy = np.zeros((M,2))
for i in range(M1):
    yy[i][0] = 1
for i in range(M1,M):
    yy[i][1] = 1
# 交叉验证
X,y = shuffle(X,yy)
predprob = cross_validate(X,y,n_splits=5,num_classes=2,channels=2,n_epoch=50)
metric(y, predprob) # print: 0.81375, 0.883389


<font size=5> <b>预测结果</b></font><br>
5-fold, cifanet,30 epochs, 879amps+2405 not amps, acc=0.8562, auc=0.9045, mcc=0.6470<br>


### 三个通道数据训练

In [None]:
X,y = getThreeChannelsArray()
yy = np.zeros((M,2))
for i in range(M1):
    yy[i][0] = 1
for i in range(M1,M):
    yy[i][1] = 1
# 交叉验证
X,y = shuffle(X,yy)
predprob = cross_validate(X,y,n_splits=5,num_classes=2,channels=3,n_epoch=30)
metric(y, predprob)# accuracy=0.820625 AUC=0.9052437499999999

In [None]:
pred_prob = np.zeros(1600)

In [None]:
X_train,X_test=X[:1280],X[1280:]
y_train,y_test=y[:1280],y[1280:]
tf.reset_default_graph()
net = create_vggnet(2,3)
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(X_train, y_train, n_epoch=30, shuffle=True, 
      show_metric=True, batch_size=32)
predval = model.predict(X_test)
metric(y_test, predval)

In [None]:
predval = model.predict(X_test)
metric(y_test, predval)

<font size=4><b>预测结果</b></font><br>


### 两联体-前20hmmrof-后20hmmprof,三个通道数据

#### 王普的数据

In [13]:
# 构建数据集
X1,y1 = load_hmm_prof(20,0)
X2,y2 = load_hmm_prof(20,-1)
X3,y3 = dAAOneHot()
X=np.ndarray([M,20,20,3])
X11 = X1.reshape([M,20,20])
X21 = X2.reshape([M,20,20])
X31 = X3.reshape([M,20,20])
X[:,:,:,0]=X11
X[:,:,:,1]=X21
X[:,:,:,2]=X31
yy = np.zeros((M,2))
for i in range(M1):
    yy[i][0] = 1
for i in range(M1,M):
    yy[i][1] = 1

In [14]:
X,y = shuffle(X,yy)
predprob = cross_validate(X,y,n_splits=5,num_classes=2,channels=3,n_epoch=40)
metric(y, predprob)# 王普的数据，用alexnet, acc=0.90956 auc=0.957706  mcc=0.765609

Training Step: 3319  | total loss: [1m[32m0.00074[0m[0m | time: 55.461s
| Momentum | epoch: 040 | loss: 0.00074 - acc: 1.0000 -- iter: 2624/2628
Training Step: 3320  | total loss: [1m[32m0.00075[0m[0m | time: 58.234s
| Momentum | epoch: 040 | loss: 0.00075 - acc: 1.0000 | val_loss: 0.58507 - val_acc: 0.8994 -- iter: 2628/2628
--
accuracy=0.9095615103532277
AUC=0.9577068536112904
mcc=0.7656093650437452


#### 自己的数据

In [33]:
# 构建数据集
X1,y1 = load_hmm_prof(20,0)
X2,y2 = load_hmm_prof(20,-1)
X3,y3 = dAAOneHot()
X=np.ndarray([M,20,20,3])
X11 = X1.reshape([M,20,20])
X21 = X2.reshape([M,20,20])
X31 = X3.reshape([M,20,20])
X[:,:,:,0]=X11
X[:,:,:,1]=X21
X[:,:,:,2]=X31
yy = np.zeros((M,2))
for i in range(M1):
    yy[i][0] = 1
for i in range(M1,M):
    yy[i][1] = 1

In [37]:
X3

array([[ 0.        ,  3.20353   ,  0.        , ...,  2.89169   ,
         4.03077   ,  2.41478   ],
       [ 0.67902348,  0.69785338,  0.        , ...,  0.5       ,
         0.86159737,  0.        ],
       [ 0.        ,  3.20353   ,  0.        , ...,  0.        ,
         4.03077   ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
         0.        ,  1.        ]])

In [31]:
X,y = shuffle(X,yy)
predprob = cross_validate(X,y,n_splits=5,num_classes=2,channels=2,n_epoch=30)
metric(y, predprob)

Training Step: 526  | total loss: [1m[32m0.68961[0m[0m | time: 5.300s
[2K| Momentum | epoch: 014 | loss: 0.68961 - acc: 0.5348 -- iter: 0192/1280


KeyboardInterrupt: 

## 其它，包括测试代码段

In [35]:
X,y = shuffle(X,yy)
X_train,X_test=X[:-320],X[-320:]
y_train,y_test=y[:-320],y[-320:]
tf.reset_default_graph()
net = create_vggnet(2,3)
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(X_train, y_train, 30, shuffle=True, 
      validation_set=(X_test,y_test),
      show_metric=True, batch_size=32)
pred_prob= model.predict(X_test)
metric(y_test, pred_prob)

Training Step: 1199  | total loss: [1m[32m0.38844[0m[0m | time: 92.498s
| RMSProp | epoch: 030 | loss: 0.38844 - acc: 0.9252 -- iter: 1248/1280
Training Step: 1200  | total loss: [1m[32m0.39339[0m[0m | time: 98.146s
| RMSProp | epoch: 030 | loss: 0.39339 - acc: 0.9202 | val_loss: 0.54341 - val_acc: 0.7688 -- iter: 1280/1280
--
accuracy=0.76875
AUC=0.8533694415008438
mcc=0.5448016013187331


In [None]:
X,y = shuffle(X,yy)


In [None]:
X.shape

In [None]:
X2,y=load_hmm_prof(20,-1)

In [None]:
fastafiles=[r'E:\Repoes\AMPnet\data\benchmark\wpAMPs.fasta',r'E:\Repoes\AMPnet\data\benchmark\wpnotAMPs.fasta']

In [None]:
for seq_record in SeqIO.parse(fastafiles[1],'fasta'):
    print(seq_record.id)