# 导入库及定义共享函数

In [None]:
import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.estimator import regression
from tflearn.layers.normalization import local_response_normalization
import tensorflow as tf

## 定义网络结构

In [None]:
# Network definition
# Net trained by convnet-mnist
def convnet_mnist():
    """Build the convnet-mnist feature-extraction graph.

    The graph takes a [None, 28, 28, 1] input named 'input', applies two
    conv(3x3, relu, L2) -> max-pool(2) -> local-response-normalization
    stages with 32 and 64 filters, then a 128-unit tanh layer, dropout(0.8)
    and a 256-unit tanh layer named 'feature'.

    Returns:
        The output of the final 256-unit 'feature' layer.
    """
    layer = input_data(shape=[None, 28, 28, 1], name='input')

    # The two convolutional stages differ only in their filter count.
    for n_filters in (32, 64):
        layer = conv_2d(layer, n_filters, 3, activation='relu', regularizer='L2')
        layer = max_pool_2d(layer, 2)
        layer = local_response_normalization(layer)

    layer = fully_connected(layer, 128, activation='tanh')
    layer = dropout(layer, 0.8)
    return fully_connected(layer, 256, activation='tanh', name='feature')

In [None]:
# Network definition
# Net trained by cifar10-convnet-mnist
# Convolutional network building
def cifar10_convnet_mnist():
    """Build the cifar10-style convolutional graph without a classifier head.

    Layers, in order, on a [None, 28, 28, 1] input: conv(32, 3x3, relu),
    max-pool(2), dropout(0.75), two conv(64, 3x3, relu) layers, max-pool(2),
    dropout(0.5) and a 512-unit relu fully connected layer.

    Returns:
        The output of the 512-unit fully connected layer.
    """
    layer = input_data(shape=[None, 28, 28, 1])

    # First stage: a single 32-filter convolution.
    layer = conv_2d(layer, 32, 3, activation='relu')
    layer = max_pool_2d(layer, 2)
    layer = dropout(layer, 0.75)

    # Second stage: two stacked 64-filter convolutions.
    layer = conv_2d(layer, 64, 3, activation='relu')
    layer = conv_2d(layer, 64, 3, activation='relu')
    layer = max_pool_2d(layer, 2)
    layer = dropout(layer, 0.5)

    features = fully_connected(layer, 512, activation='relu')

    # The classification head below is commented out in the source, so the
    # 512-unit layer is what this function returns:
    #   dropout(0.5)
    #   fully_connected(6, activation='softmax', restore=False)
    #   regression(optimizer='adam', loss='categorical_crossentropy',
    #              learning_rate=0.001)
    return features

## 把特征向量写入到arff格式的文件

把抽取的特征写入到arff格式的文件<br>
arffname: arff file's name<br>
features: 抽取出的特征<br>
label：样本标签（0或1）<br>
filemodel：文件读写模式，a,a+,w,r等等

In [None]:
def writeOneClassFeaturesToArffFile(arffame, features, label, filemodel):
    """Write the feature vectors of a single class to an ARFF file.

    Args:
        arffame: Path of the ARFF file to open. The spelling of this
            parameter name is kept for backward compatibility.
        features: Sequence of feature vectors, one per sample.
        label: Class label appended as the last column of every row.
        filemodel: Mode string passed to ``open``. The ARFF header is
            written only when it is exactly ``'w'``.
    """
    # Use the parameter itself; the previous body referenced a global named
    # `arffname`, which raised NameError or silently wrote to whatever path
    # that global happened to hold.
    with open(arffame, filemodel) as arff_file:
        if filemodel == 'w':
            # The attribute count is only needed for the header, so an empty
            # `features` no longer crashes in append mode.
            num_features = len(features[0]) if len(features) else 0

            arff_file.write('@relation relationship\n')
            for i in range(1, num_features + 1):
                arff_file.write('@attribute ' + 'Att' + str(i) + ' numeric\n')
            arff_file.write('@attribute class {0,1}\n')
            arff_file.write('@data\n\n')

        label_text = str(label)
        for sample in features:
            row = [str(value) for value in sample]
            row.append(label_text)
            arff_file.write(",".join(row))
            arff_file.write('\n')

## 从保存图像的文件夹中读入数据

从图像读入数据。<br>
filepath --保存每个样本序列CA图像的文件夹路径。在此路径下有N个序列的CA图像<br>
num_feature --图像行*列的值<br>
label --在该文件夹下所保存的样本的标记（注意：下面的函数签名中并没有该参数，标签需由调用方另行提供）

In [None]:
import os
import numpy as np
from PIL import Image
def loadOneClassImageArray(filepath,num_feature):
    """Load every file in a directory as a flattened image row.

    Args:
        filepath: Directory whose entries are each opened as an image.
        num_feature: Number of values per image (rows * columns); every
            image is reshaped to this length.

    Returns:
        A float32 array of shape (N, num_feature), where N is the number of
        entries returned by ``os.listdir(filepath)``. Rows follow the order
        of that listing.
    """
    files = os.listdir(filepath)
    X = np.empty((len(files), num_feature), dtype=np.float32)

    for i, file_name in enumerate(files):
        # The old code computed an unused `key` via file.index('.'), which
        # raised ValueError for any file name without a dot; it is dropped.
        fn = os.path.join(filepath, file_name)
        # The context manager closes the image file handle after reading.
        with Image.open(fn, "r") as img:
            X[i] = np.array(img).reshape((1, num_feature))
    return X

## 载入json文件中的数据

In [None]:
# 从json格式文件中载入数据
import numpy as np
def loadJsonData(file):
    """Load a text file of comma-separated values into a 2-D array.

    Despite the name, no JSON parsing is done: each line is stripped of
    newline characters and split on ','.

    Args:
        file: Path of the text file to read.

    Returns:
        A numpy array built from the per-line lists of string fields. The
        fields are not converted to numbers here.
    """
    # `with` closes the handle; the old `for line in open(...)` leaked it.
    with open(file, 'r') as fh:
        rows = [line.replace("\n", "").split(",") for line in fh]
    return np.array(rows)
    

# 从预训练的网络抽取6个活性的抗菌肽CA特征

In [None]:
# Load samples X and targets Y through the project-local `load_data` helper
# (presumably images from the img_60 folder plus their JSON targets; confirm
# against prepareDataset), then reshape X to the (N, 28, 28, 1) layout that
# the networks above declare as input.
from prepareDataset import load_data
X,Y = load_data('e:/repoes/ampnet/data/img_60/', 'e:/repoes/ampnet/data/benchmark_60_Targets.json')
X = X.reshape((-1,28,28,1))

In [None]:
# Net trained by convnet-mnist:
# build the graph, wrap it in a tflearn DNN and load the saved weights only.
net = convnet_mnist()
model = tflearn.DNN(net, tensorboard_verbose=0)
model.load('e:/repoes/ampnet/model/convnet_mnist', weights_only=True)

In [None]:
# Extract features and write them to an ARFF file
features=model.predict(X)
# `with` guarantees the file is closed even if a write fails part-way.
with open('amp_convnet_mnist_features.arff','w') as arff_file:
    arff_file.write('@relation relationship\n')
    # Take the attribute count from the extracted features (256 for the
    # convnet_mnist 'feature' layer) so the header always matches the rows.
    for i in range(1, len(features[0]) + 1):
        line = '@attribute ' + 'Att' + str(i) + ' numeric\n'
        arff_file.write(line)
    # NOTE(review): a single class attribute is declared, but every value of
    # Y[i] is appended to the row below. Confirm Y[i] holds one value.
    arff_file.write('@attribute class {0,1}\n')
    arff_file.write('@data\n\n')

    for i, feature_row in enumerate(features):
        line = [str(f) for f in feature_row]
        line.extend(str(int(y)) for y in Y[i])
        arff_file.write(",".join(line))
        arff_file.write('\n')

In [None]:
# Net trained by cifar10-convnet-mnist
# Start from an empty default graph, as the later reload cell does. Without
# this the new layers are added to the graph that still contains the
# convnet_mnist network built earlier, so its variables would no longer line
# up with the names stored in the checkpoint.
tf.reset_default_graph()
# Convolutional network building
network = cifar10_convnet_mnist()
# Wrap the graph in a tflearn DNN and load the saved weights only
model = tflearn.DNN(network, tensorboard_verbose=0)
model.load('e:/repoes/ampnet/model/cifar10_cnn_mnist', weights_only=True)

In [None]:
# Extract features with the currently loaded model and write them to ARFF
features=model.predict(X)
# `with` guarantees the file is closed even if a write fails part-way.
with open('amp_cifa10_cnn_mnist_features.arff','w') as arff_file:
    arff_file.write('@relation relationship\n')
    # Take the attribute count from the extracted features (512 for the
    # cifar10_convnet_mnist output layer) so the header matches the rows.
    for i in range(1, len(features[0]) + 1):
        line = '@attribute ' + 'Att' + str(i) + ' numeric\n'
        arff_file.write(line)
    # NOTE(review): a single class attribute is declared, but every value of
    # Y[i] is appended to the row below. Confirm Y[i] holds one value.
    arff_file.write('@attribute class {0,1}\n')
    arff_file.write('@data\n\n')

    for i, feature_row in enumerate(features):
        line = [str(f) for f in feature_row]
        line.extend(str(int(y)) for y in Y[i])
        arff_file.write(",".join(line))
        arff_file.write('\n')

In [None]:
# Run the currently loaded model on X and keep its raw output.
# NOTE(review): the loaded networks end at a feature layer, not a classifier
# head, so despite the name these are presumably feature vectors. Confirm.
Y_pred = model.predict(X)

# 从预训练的网络抽取正样本抗菌肽（AMPs）和负样本非抗菌肽（notAMPs）的特征

In [None]:
# Clear the default graph before building a new network in it.
tf.reset_default_graph()
# Alternative network, currently disabled:
#network = cifar10_convnet_mnist()
network = convnet_mnist()
# Wrap the graph in a tflearn DNN and load the saved weights only
model = tflearn.DNN(network, tensorboard_verbose=0)
model.load('e:/repoes/ampnet/model/convnet_mnist', weights_only=True)

In [None]:
# Extract features and write them to file
# Positive class (AMPs): written with mode 'w', which also emits the header.
#ampX = loadOneClassImageArray('e:/repoes/ampnet/data/img/AMPs_50',784)
ampX = loadJsonData("E:\\Repoes\\AMPnet\\data\\AMPs_50_CA_array.json")
ampX = ampX.reshape((-1,28,28,1))
features1 = model.predict(ampX)
arffname = 'e:/repoes/ampnet/amp_and_notamp_alnex.arff'
writeOneClassFeaturesToArffFile(arffname, features1, 1, 'w' )

# Negative class (notAMPs): appended to the same file with mode 'a'.
#notampX = loadOneClassImageArray('e:/repoes/ampnet/data/img/notAMPs_50',784)
notampX = loadJsonData("E:\\Repoes\\AMPnet\\data\\notAMPs_50_CA_array.json")
# Reshape the negative samples just loaded. The old line reshaped `ampX`,
# which discarded them and wrote the positive features again under label 0.
notampX = notampX.reshape((-1,28,28,1))
features2 = model.predict(notampX)
writeOneClassFeaturesToArffFile(arffname, features2, 0, 'a' )

In [None]:
import numpy as np
from scipy.io import arff
from tflearn.data_utils import shuffle, to_categorical

# Read the ARFF file back: the first 512 columns of each record become one
# row of X, and the last column becomes the integer class in Y.
data, meta = arff.loadarff('e:/repoes/ampnet/amp_and_notamp.arff')
n = len(data)
X = np.ndarray((n, 512))
Y = np.zeros(n)
for i, d in enumerate(data):
    for j in range(512):
        X[i][j] = float(d[j])
    Y[i] = int(d[-1])

# Shuffle samples and labels together, then one-hot encode the two classes.
X, Y = shuffle(X, Y)
Y = to_categorical(Y, 2)

In [None]:
def buildnet():
    """Build a single-layer softmax classifier over 512-dimensional inputs.

    Returns:
        A ``tflearn.DNN`` wrapping a [None, 512] input, a 2-unit softmax
        layer and an adam / categorical cross-entropy regression layer with
        learning rate 0.001.
    """
    inputs = input_data(shape=[None, 512])
    softmax_out = fully_connected(inputs, 2, activation='softmax')
    trainer = regression(
        softmax_out,
        optimizer='adam',
        loss='categorical_crossentropy',
        learning_rate=0.001,
    )
    return tflearn.DNN(trainer)

In [None]:
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score, auc, roc_curve
import tensorflow as tf
# Leave-one-out evaluation: train a fresh classifier on all samples but one
# and store its prediction for the held-out sample.
loo = LeaveOneOut()
# One row of class probabilities per sample, sized from X instead of a
# hard-coded 1600.
y_pred = np.zeros((len(X), 2))
for train_index, test_index in loo.split(X):
    print("\r In predicting {}".format(test_index))
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    # A new graph per fold so models from earlier folds do not accumulate.
    tf.reset_default_graph()
    model = buildnet()
    model.fit(X_train, y_train, n_epoch=10, shuffle=True,
             show_metric=True, batch_size=64, snapshot_step=100,
             snapshot_epoch=False)
    y_pred[test_index] = model.predict(X_test)

# Y is one-hot (see to_categorical in the loading cell) and y_pred holds
# two-column outputs, so convert both to class indices for accuracy and use
# the class-1 column as the ROC score. The old code referenced an undefined
# `y` and passed the 2-D arrays straight to the metrics.
y_true = np.argmax(Y, axis=1)
y_label = np.argmax(y_pred, axis=1)
accuracy = accuracy_score(y_true, y_label)
fpr,tpr,thresholds = roc_curve(y_true, y_pred[:, 1], pos_label=1)
area = auc(fpr,tpr)

# 从预训练的网络中抽取HMMer的profile特征

## 读入HMMER执行jackhmmer后产生的chk-1文件

In [None]:
# Keep the first 50 amino acids of each sequence (50 * 20 = 1000 features).
# Sequences shorter than 50 are zero-padded; longer ones are truncated.
def load_hmm_prof(files=None, num_features=1000):
    """Load HMM profile vectors from JSON files into a fixed-width matrix.

    Each file must contain a JSON object whose values are flat numeric
    lists. Every list becomes one row, zero-padded or truncated to
    ``num_features`` values.

    Args:
        files: Paths of the JSON files to read. Samples from the first file
            are labelled 1, samples from any later file 0. Defaults to the
            AMP / notAMP benchmark files.
        num_features: Width of each output row.

    Returns:
        A tuple ``(X, y)``: ``X`` is a float array of shape
        (total_samples, num_features) and ``y`` a float array of 0/1 labels.
        With the default files holding 800 entries each, this matches the
        previous fixed 1600-row layout.
    """
    # Local import: `json` is not imported at file level in the visible code.
    import json

    if files is None:
        files = ['e:/repoes/ampnet/data/benchmark/AMPs_50_hmm_profil.json',
                 'e:/repoes/ampnet/data/benchmark/notAMPs_50_hmm_profil.json']

    profiles = []
    labels = []
    for file_idx, path in enumerate(files):
        # `with` closes the file even if JSON parsing fails.
        with open(path, 'r') as fr:
            p = json.load(fr)
        # Label by source file rather than by a fixed row index, so a file
        # with more or fewer entries than expected cannot shift the labels.
        label = 1 if file_idx == 0 else 0
        for ary in p.values():
            profiles.append(ary)
            labels.append(label)

    # Zero-initialised, so padding is implicit and no row is left unfilled.
    X = np.zeros((len(profiles), num_features))
    for k, ary in enumerate(profiles):
        c = min(len(ary), num_features)
        X[k, :c] = ary[:c]
    y = np.array(labels, dtype=np.float64)

    return X, y


In [None]:
# Extract features and write them to file
#ampX = loadOneClassImageArray('e:/repoes/ampnet/data/img/AMPs_50',784)
X,y = load_hmm_prof()
X = X.reshape((-1,50,20,1))
# NOTE(review): the HMM profile tensor X loaded above is never used below.
# The rest of this cell still predicts on `ampX` / `notampX` from the CA
# cell and writes to the same ARFF path. The (50, 20, 1) shape also differs
# from the 28x28x1 input declared by the networks in this file. Confirm the
# intended model and inputs before relying on this cell's output.
features1 = model.predict(ampX)
arffname = 'e:/repoes/ampnet/amp_and_notamp_alnex.arff'
writeOneClassFeaturesToArffFile(arffname, features1, 1, 'w' )

#notampX = loadOneClassImageArray('e:/repoes/ampnet/data/img/notAMPs_50',784)
notampX = loadJsonData("E:\\Repoes\\AMPnet\\data\\notAMPs_50_CA_array.json")
# Reshape the negative samples just loaded. The old line reshaped `ampX`,
# which discarded them and wrote the positive features again under label 0.
notampX = notampX.reshape((-1,28,28,1))
features2 = model.predict(notampX)
writeOneClassFeaturesToArffFile(arffname, features2, 0, 'a' )