## 蛋白质编码

读取AMPs和notAMPs序列，转化为2个或3个通道的特征向量表示。其中通道1的数据来自hmmer的profile；通道2的数据来自氨基酸的One-hot编码；通道3的数据来自氨基酸（AA）的物化性质。

In [1]:
import numpy as np
from Bio import SeqIO
import json

In [2]:
# Min-max normalization of a matrix, column by column
def maxminnorm(array):
    """Rescale every column of a 2-D array into the range [0, 1].

    Each column is normalized independently as
    ``(x - column_min) / (column_max - column_min)``.

    Args:
        array: 2-D ``numpy.ndarray`` of shape ``(rows, cols)``.

    Returns:
        A new float64 array with the same shape as ``array``. A column whose
        values are all equal (``max == min``) is returned as all zeros rather
        than the NaN values a 0/0 division would produce.
    """
    mincols = array.min(axis=0)
    span = array.max(axis=0) - mincols
    data_rows, data_cols = array.shape
    t = np.zeros((data_rows, data_cols))
    # ``where`` skips the division for constant columns, so those entries
    # keep the 0.0 they were initialised with.
    np.divide(array - mincols, span, out=t, where=span != 0)
    return t

In [3]:
# Load the feature data derived from the HMMER profiles
def load_hmm_prof():
    """Load the HMMER-profile features for the AMP / notAMP benchmark.

    Reads the two hard-coded JSON files. Every value in each JSON object is
    treated as a flat list of numbers, reshaped to ``(-1, 20)``, truncated to
    the first 50 rows, min-max normalized per column with ``maxminnorm`` and
    written, flattened, into one row of ``X``. Shorter entries are
    zero-padded on the right.

    Returns:
        Tuple ``(X, y)``: ``X`` has shape ``(1600, 1000)``; ``y`` has shape
        ``(1600,)`` with the first 800 labels set to 1 and the rest to 0.

    NOTE(review): the 1600/800 sizes assume each file holds exactly 800
    entries and that the AMP file comes first -- this is not checked here.
    """
    files = ['e:/repoes/ampnet/data/benchmark/AMPs_50_hmm_profil.json',
         'e:/repoes/ampnet/data/benchmark/notAMPs_50_hmm_profil.json']
    N = 1000
    max_rows = N // 20
    # Zero-initialised, so right-padding is implicit and a row that is never
    # filled holds zeros instead of uninitialised memory.
    X = np.zeros((1600, N))
    y = np.ones(1600)
    y[800:] = 0
    k = 0
    for f in files:
        # The context manager closes the file even if json.load raises.
        with open(f, 'r') as fr:
            p = json.load(fr)
        for ary in p.values():
            tm = np.array(ary).reshape([-1, 20])
            # Truncate first, then normalize: an over-long profile is
            # normalized on its kept rows only; shorter ones are unaffected.
            tm = maxminnorm(tm[:max_rows, :])
            X[k, :tm.size] = tm.reshape(-1)
            k += 1

    return X, y

In [4]:
import re
def AAOneHot():
    """One-hot encode the benchmark sequences over the 20-letter alphabet.

    Parses the two hard-coded FASTA files, removes the characters X, Z, U
    and B from every sequence, and encodes each remaining residue as a
    20-wide one-hot vector whose hot position is the residue's index in
    ``text``. Only the first 50 residues are kept; shorter sequences are
    zero-padded on the right.

    Returns:
        Tuple ``(X, y)``: ``X`` has shape ``(1600, 1000)``; ``y`` has shape
        ``(1600,)`` with the first 800 labels set to 1 and the rest to 0.

    Raises:
        ValueError: if one of the kept residues is not a character of
            ``text``.

    NOTE(review): the 1600/800 sizes assume 800 records per file with the
    AMP file first -- this is not checked here.
    """
    files=[r'E:\Repoes\AMPnet\data\benchmark\AMPs_50.fasta',r'E:\Repoes\AMPnet\data\benchmark\notAMPs_50.fasta']
    text='PQRYWTMNVELHSFCIKADG'
    N = 1000
    max_len = N // len(text)
    # Zero-initialised so padding is implicit and unfilled rows are not
    # left holding uninitialised memory.
    X = np.zeros((1600, N))
    y = np.ones(1600)
    y[800:] = 0
    k = 0
    for file in files:
        for seq_record in SeqIO.parse(file,'fasta'):
            seq = re.sub('[XZUB]', "", str(seq_record.seq))
            # Only the first 50 residues end up in the feature row, so
            # anything beyond them is dropped before encoding.
            seq = seq[:max_len]
            m = np.zeros((len(seq), len(text)))
            for i, aa in enumerate(seq):
                m[i][text.index(aa)] = 1
            flat = m.reshape(-1)
            X[k, :flat.size] = flat
            k += 1
    return X,y

In [5]:
def AAPhyChemOneHot():
    """Encode the benchmark sequences by physico-chemical group membership.

    Parses the two hard-coded FASTA files, removes the characters X, Z, U
    and B from every sequence, and turns each remaining residue into a
    20-wide 0/1 vector: column ``j`` is 1 when the residue belongs to the
    ``j``-th group of ``phychemDict`` (in insertion order). Only 12 groups
    are defined, so columns 12-19 are always 0. Only the first 50 residues
    are kept; shorter sequences are zero-padded on the right.

    Side effect: prints the group names once.

    Returns:
        Tuple ``(X, y)``: ``X`` has shape ``(1600, 1000)``; ``y`` has shape
        ``(1600,)`` with the first 800 labels set to 1 and the rest to 0.

    NOTE(review): the 1600/800 sizes assume 800 records per file with the
    AMP file first -- this is not checked here.
    """
    files=[r'E:\Repoes\AMPnet\data\benchmark\AMPs_50.fasta',
           r'E:\Repoes\AMPnet\data\benchmark\notAMPs_50.fasta']
    N = 1000
    width = 20  # feature columns reserved per residue
    # Zero-initialised so padding is implicit and unfilled rows are not
    # left holding uninitialised memory.
    X = np.zeros((1600, N))
    y = np.ones(1600)
    y[800:] = 0
    k = 0
    phychemDict={}
    phychemDict["alcohol"]=("S","T")# has an alcohol (hydroxyl) group
    phychemDict["aliphatic"]=("I","L","V")# aliphatic
    phychemDict["aromatic"]=("F","H","W","Y")# aromatic
    phychemDict["charged"]=("D","E","H","K","R")# charged
    phychemDict["positive"]=("K","H","R")# positively charged
    phychemDict["negative"]=("D","E")# negatively charged
    # NOTE(review): the key says "polar" but the original comment labels this
    # set as NON-polar; the key is left unchanged so the printed output is stable.
    phychemDict["polar"]=("A","L","I","P","F","W","M")# non-polar
    phychemDict["small"]=("A","C","D","G","N","P","S","T","V")# small
    phychemDict["turnlike"]=("A","C","D","E","G","H","K","N","Q","R","S","T")
    phychemDict["hydrophobic"]=("A","F","I","L","M","P","V","W","Y")# hydrophobic
    phychemDict["asa"]=("A","N","D","C","P","S","T","G","V")# accessible surface area below average
    phychemDict["pr"]=("F","Y","W")# absorbs light in the UV region

    keys = phychemDict.keys()
    print("keys:",keys)
    # Column j of the encoding corresponds to the j-th group in dict order.
    groups = [phychemDict[key] for key in keys]
    for file in files:
        for seq_record in SeqIO.parse(file,'fasta'):
            seq = re.sub('[XZUB]', "", str(seq_record.seq))
            # Only the first 50 residues fit into the feature row.
            seq = seq[:N // width]
            m = np.zeros((len(seq), width))
            for i, aa in enumerate(seq):
                for j, members in enumerate(groups):
                    if aa in members:
                        m[i][j] = 1

            flat = m.reshape(-1)
            X[k, :flat.size] = flat
            k += 1
    return X,y

定义函数，构建2通道的数据集。通道一是hmmer profile；通道二是AA的One-hot编码。

In [6]:
def getTwoChannelsArray():
    """Assemble the two-channel dataset.

    Channel 0 holds the features from ``load_hmm_prof`` and channel 1 the
    features from ``AAOneHot``, each reshaped to ``(1600, 50, 20)``.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape ``(1600, 50, 20, 2)`` and
        ``y`` is the label vector returned by ``AAOneHot``.
    """
    hmm_flat, _ = load_hmm_prof()
    onehot_flat, y = AAOneHot()
    sources = (hmm_flat, onehot_flat)
    X = np.empty((1600, 50, 20, len(sources)))
    for channel, flat in enumerate(sources):
        X[..., channel] = flat.reshape((1600, 50, 20))
    return X, y

In [7]:
def getThreeChannelsArray():
    """Assemble the three-channel dataset.

    Channel 0 holds the features from ``load_hmm_prof``, channel 1 those
    from ``AAOneHot`` and channel 2 those from ``AAPhyChemOneHot``, each
    reshaped to ``(1600, 50, 20)``.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape ``(1600, 50, 20, 3)`` and
        ``y`` is the label vector returned by ``AAPhyChemOneHot``.
    """
    hmm_flat, _ = load_hmm_prof()
    onehot_flat, _ = AAOneHot()
    phychem_flat, y = AAPhyChemOneHot()
    sources = (hmm_flat, onehot_flat, phychem_flat)
    X = np.empty((1600, 50, 20, len(sources)))
    for channel, flat in enumerate(sources):
        X[..., channel] = flat.reshape((1600, 50, 20))
    return X, y

## 构建卷积网络进行训练

In [8]:
import tflearn
import tensorflow as tf
from tflearn.data_utils import shuffle, to_categorical
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.normalization import local_response_normalization
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.estimator import regression
from tflearn.data_preprocessing import ImagePreprocessing
from tflearn.data_augmentation import ImageAugmentation
from sklearn.model_selection import LeaveOneOut, KFold
from sklearn.metrics import accuracy_score, auc, roc_curve

curses is not supported on this machine (please install/reinstall curses for an optimal experience)


构建网络结构

In [9]:
def create_alexnet(num_classes,channels=1):
    """Build an AlexNet-style tflearn graph for 50x20 inputs.

    Args:
        num_classes: number of units in the final softmax layer.
        channels: size of the last input dimension.

    Returns:
        The tflearn ``regression`` layer (momentum optimizer, categorical
        cross-entropy loss, learning rate 0.001) that ends the network.
    """
    def pool_then_norm(layer):
        # 3x3 max-pool with stride 2, followed by local response normalization.
        return local_response_normalization(max_pool_2d(layer, 3, strides=2))

    net = input_data(shape=[None, 50, 20, channels])
    net = pool_then_norm(conv_2d(net, 96, 11, strides=4, activation='relu'))
    net = pool_then_norm(conv_2d(net, 256, 5, activation='relu'))
    for n_filters in (384, 384, 256):
        net = conv_2d(net, n_filters, 3, activation='relu')
    net = pool_then_norm(net)
    # Two identical fully connected + dropout stages.
    for _ in range(2):
        net = fully_connected(net, 4096, activation='tanh')
        net = dropout(net, 0.5)
    net = fully_connected(net, num_classes, activation='softmax')
    return regression(net, optimizer='momentum',
                      loss='categorical_crossentropy',
                      learning_rate=0.001)

In [10]:
def create_cifarnet(num_classes,channels=1):
    """Build a CIFAR-10-style tflearn graph for 50x20 inputs.

    Args:
        num_classes: number of units in the final softmax layer.
        channels: size of the last input dimension.

    Returns:
        The tflearn ``regression`` layer (adam optimizer, categorical
        cross-entropy loss, learning rate 0.001) that ends the network.
    """
    # Real-time data preprocessing
    img_prep = ImagePreprocessing()
    img_prep.add_featurewise_zero_center()
    img_prep.add_featurewise_stdnorm()

    # Real-time data augmentation
    # NOTE(review): random flips/rotations come from the image template this
    # network was adapted from; confirm they are meaningful for the
    # position-by-feature matrices fed in here.
    img_aug = ImageAugmentation()
    img_aug.add_random_flip_leftright()
    img_aug.add_random_rotation(max_angle=25.)

    # Convolutional network building
    network = input_data(shape=[None, 50, 20, channels],
                         data_preprocessing=img_prep,
                         data_augmentation=img_aug)
    network = conv_2d(network, 32, 3, activation='relu')
    network = max_pool_2d(network, 2)
    network = dropout(network, 0.75)
    network = conv_2d(network, 64, 3, activation='relu')
    network = conv_2d(network, 64, 3, activation='relu')
    network = max_pool_2d(network, 2)
    network = dropout(network, 0.5)
    network = fully_connected(network, 512, activation='tanh')
    network = dropout(network, 0.8)
    network = fully_connected(network, 512, activation='tanh')
    network = dropout(network, 0.8)
    # Output width follows the num_classes argument (it used to be fixed at
    # 2, silently ignoring the parameter).
    network = fully_connected(network, num_classes, activation='softmax')
    network = regression(network, optimizer='adam',
                         loss='categorical_crossentropy',
                         learning_rate=0.001)
    return network
    

In [15]:
# Build a VGG-style network
def create_vggnet(num_classes,channels=1):
    """Build a VGG-style tflearn graph for 50x20 inputs.

    Five stages of 3x3 ReLU convolutions, each closed by a 2x2 max-pool with
    stride 2, are followed by two 4096-unit ReLU layers with dropout and a
    softmax output layer.

    Args:
        num_classes: number of units in the final softmax layer.
        channels: size of the last input dimension.

    Returns:
        The tflearn ``regression`` layer (rmsprop optimizer, categorical
        cross-entropy loss, learning rate 0.0001) that ends the network.
    """
    # (filters, number of stacked convolutions) for each stage
    stages = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))

    net = input_data(shape=[None, 50, 20, channels])
    for n_filters, n_convs in stages:
        for _ in range(n_convs):
            net = conv_2d(net, n_filters, 3, activation='relu')
        net = max_pool_2d(net, 2, strides=2)

    for _ in range(2):
        net = fully_connected(net, 4096, activation='relu')
        net = dropout(net, 0.5)
    net = fully_connected(net, num_classes, activation='softmax')

    return regression(net, optimizer='rmsprop',
                      loss='categorical_crossentropy',
                      learning_rate=0.0001)

定义交叉验证函数

In [16]:
def jackknife_test(X, y, num_classes=2,channels=1,epoch=100):
    """Leave-one-out (jackknife) evaluation with a freshly built AlexNet.

    For every sample, a new network from ``create_alexnet`` is trained on
    all other samples and then asked to predict the held-out one.

    Args:
        X: feature array indexed by sample along the first axis.
        y: label array aligned with ``X``.
        num_classes: number of output classes of the network.
        channels: size of the last input dimension of the network.
        epoch: number of training epochs per fold.

    Returns:
        Array of shape ``(len(X), num_classes)``; row ``i`` holds the values
        ``model.predict`` returned for sample ``i`` while it was held out.
    """
    # One row of class scores per sample, sized from the data.
    y_pred = np.zeros((len(X), num_classes))
    loo = LeaveOneOut()
    for train_index, test_index in loo.split(X):
        print("\r In predicting {}".format(test_index))
        # test_index is an index array of length 1, so fancy indexing already
        # yields a batch of one sample; no extra list wrapping is needed.
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Start every fold from an empty graph so weights are not shared.
        tf.reset_default_graph()
        net = create_alexnet(num_classes,channels)
        model = tflearn.DNN(net, tensorboard_verbose=0)
        model.fit(X_train, y_train, n_epoch=epoch, shuffle=True,
              validation_set=(X_test,y_test),
              show_metric=True, batch_size=32,
              run_id='AMP_cnn')
        y_pred[test_index] = model.predict(X_test)

    return y_pred


def cross_validate(X,y,n_splits=3,num_classes=2,channels=1,epoch=100):
    """K-fold cross-validation with a freshly built VGG-style network.

    For every fold, a new network from ``create_vggnet`` is trained on the
    training part and used to predict the held-out part. ``KFold`` is used
    without shuffling, so callers should shuffle ``X`` and ``y`` beforehand
    if the samples are ordered by class.

    Args:
        X: feature array indexed by sample along the first axis.
        y: label array aligned with ``X``.
        n_splits: number of folds.
        num_classes: number of output classes of the network.
        channels: size of the last input dimension of the network.
        epoch: number of training epochs per fold.

    Returns:
        Array of shape ``(len(X), num_classes)``; row ``i`` holds the values
        ``model.predict`` returned for sample ``i`` in the fold where it was
        held out.
    """
    # One row of class scores per sample, sized from the data.
    pred_prob = np.zeros([len(X), num_classes])
    kf = KFold(n_splits)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Start every fold from an empty graph so weights are not shared.
        tf.reset_default_graph()
        net = create_vggnet(num_classes,channels)
        model = tflearn.DNN(net, tensorboard_verbose=0)
        model.fit(X_train, y_train, n_epoch=epoch, shuffle=True,
              validation_set=(X_test,y_test),
              show_metric=True, batch_size=32,
              run_id='AMP_cnn')
        # Store the fold's predictions in the array that is returned.
        pred_prob[test_index] = model.predict(X_test)

    return pred_prob

In [12]:
# Compute the accuracy (ACC) and the area under the ROC curve (AUC)
def metric(y, predprob):
    """Print accuracy and ROC AUC for two-column predictions.

    Args:
        y: 2-D label array; column 0 is used as the ground truth.
        predprob: 2-D array of predicted scores; column 0 is the score
            compared against column 0 of ``y``.

    A sample is hard-assigned to column 0 only when its column-0 score is
    strictly greater than its column-1 score; ties go to column 1.
    """
    n_samples = len(y)
    first_wins = predprob[:n_samples, 0] > predprob[:n_samples, 1]
    hard_first = first_wins.astype(float)

    accuracy = accuracy_score(y[:, 0], hard_first)
    print("accuracy={}".format(accuracy))
    fpr, tpr, _ = roc_curve(y[:,0],predprob[:,0],pos_label=1)
    print("AUC={}".format(auc(fpr,tpr)))

### 两个通道数据的训练

In [None]:
# Load the two-channel dataset
X,y = getTwoChannelsArray()
# Two-column labels: rows 0-799 mark column 0, rows 800-1599 mark column 1
yy = np.zeros((1600,2))
yy[:800, 0] = 1
yy[800:, 1] = 1
# Shuffle, then cross-validate
X,y = shuffle(X,yy)
predprob = cross_validate(X,y,n_splits=5,num_classes=2,channels=2,epoch=20)
metric(y, predprob) # print: 0.81375, 0.883389

### 仅hmmer profile一个通道数据的训练

In [None]:
# Load the HMMER-profile features as a single-channel dataset
X1,y = load_hmm_prof()
X1 = X1.reshape([-1,50,20,1])
# Two-column labels: rows 0-799 mark column 0, rows 800-1599 mark column 1
y = np.zeros((1600,2))
y[:800, 0] = 1
y[800:, 1] = 1
# Shuffle, then cross-validate
X1,y = shuffle(X1,y)
predprob = cross_validate(X1,y,n_splits=5,channels=1)
metric(y, predprob)# print: 0.795, 0.868684

### 三个通道数据训练

In [None]:
# Load the three-channel dataset
X,y = getThreeChannelsArray()
# Two-column labels: rows 0-799 mark column 0, rows 800-1599 mark column 1
yy = np.zeros((1600,2))
yy[:800, 0] = 1
yy[800:, 1] = 1
# Shuffle, then cross-validate
X,y = shuffle(X,yy)
predprob = cross_validate(X,y,n_splits=5,num_classes=2,channels=3)
metric(y, predprob)# accuracy=0.820625 AUC=0.9052437499999999

Training Step: 1258  | total loss: 0.04410 | time: 64.577s
| RMSProp | epoch: 032 | loss: 0.04410 - acc: 0.9888 -- iter: 0576/1280


## 其它，包括测试代码段

In [None]:
# Scratch cell: reset the TF graph, build a model and predict on a test split.
# NOTE(review): `net`, `X_train`, `y_train`, `X_test` and `y_test` are not
# defined at top level in the part of the notebook shown here -- this cell
# presumably relies on state from cells that were removed; confirm before running.
tf.reset_default_graph()
model = net(X_train, y_train, X_test, y_test)
predprob = model.predict(X_test)

In [None]:
# Scratch cell: relies on `model` and `X_test` left over from earlier cells.
pred=model.predict(X_test)

In [None]:
# Scratch cell: ask the model for the label of the last test sample only
# (wrapped in a list, presumably to form a batch of one).
predLabel = model.predict_label([X_test[-1]])

In [None]:
# Display the predictions computed in an earlier cell.
pred

In [None]:
# Scratch check: a (3, 4) array flattens to 12 elements.
# np.ndarray allocates without initialising, so only the shape is meaningful.
t1=np.ndarray([3,4])
t2=t1.reshape(12)
print(t2.shape)

In [None]:
# Scratch check: the first two rows of t1 (8 values) reshape to (4, 2) and
# then flatten to 8 elements. Uses `t1` from the previous cell.
t3=t1[:2,:].reshape([-1,2])
t4=t3.reshape(8)
print(t4.shape)

In [None]:
# Scratch cell: build the physico-chemical features on their own for inspection.
X,y = AAPhyChemOneHot()

In [None]:
# Display the feature row of the first sample.
X[0]

In [None]:
# Scratch arithmetic: sum of 19 literal values (their meaning is not stated here).
93.7+173.7+250.4+215.2+146.3+197.6+142.6+228.6+135.2+177.7+109.5+182.9+142.1+52.6+271.6+188.1+239.9+182.2+157.2

In [None]:
# Scratch arithmetic: 3287.1 appears to be the total from the cell above.
# NOTE(review): that sum has 19 terms but the divisor here is 20 -- confirm
# whether a value is missing or the divisor should be 19.
3287.1/20