In [1]:
### env = 'tf24-GPU (Python 3.7.13)'
import tensorflow as tf
import numpy as np
import pandas as pd
from pkg.model import T3SEClassEstimator
from pkg.loss import focal_loss
from sklearn.utils import shuffle
from sklearn.model_selection import KFold

In [None]:
### load train data

# Read the pre-processed positive / negative training arrays and append a trailing
# axis of size 1 to each.
X_pos_onehot = tf.expand_dims(np.load("..//dataset_process//train-pos-processed.npy"), axis=-1)
X_neg_onehot = tf.expand_dims(np.load("..//dataset_process//train-neg-processed.npy"), axis=-1)
print('pos_data-onehot shape:', X_pos_onehot.shape)
print('neg_data-onehot shape:', X_neg_onehot.shape)

### load test data

# The raw array is kept under its own name; the model-facing tensor gets the same
# trailing axis as the training data.
T3SE_d_onehot = np.load("..//dataset_process//test-processed.npy")
testX_d_onehot = tf.expand_dims(T3SE_d_onehot, axis=-1)
print('testX_d_onehot shape: ', testX_d_onehot.shape)

# Labels are generated, not loaded: 100 ones followed by 100 zeros, then one-hot encoded.
# NOTE(review): assumes the test file holds exactly 100 positives followed by 100
# negatives — confirm against how test-processed.npy was built.
testY_d = pd.get_dummies(np.repeat([1.0, 0.0], 100)).values  # one-hot
print('testY_d shape:        ', testY_d.shape)

In [None]:
# 5 fold cross-validation (all species)
import time  # hoisted to the top of the cell; it used to be re-imported on every fold

# Shuffle each class with a fixed seed.  The shuffled copies get their own names
# (X_neg_cv / X_pos_cv) so that the X_neg_onehot / X_pos_onehot tensors produced by the
# loading cell stay in file order.  Overwriting them here made the E. coli cell's
# positional slice X_neg_onehot[3384:4258] pick arbitrary rows once this cell had run,
# and made every re-run of this cell shuffle already-shuffled data.
X_neg_onehot_np = X_neg_onehot.numpy()
index = list(range(len(X_neg_onehot_np)))
np.random.seed(31)
np.random.shuffle(index)
X_neg_onehot_np_random = X_neg_onehot_np[index]
X_neg_cv = tf.convert_to_tensor(X_neg_onehot_np_random, dtype=tf.int32)

X_pos_onehot_np = X_pos_onehot.numpy()
index = list(range(len(X_pos_onehot_np)))
np.random.seed(31)
np.random.shuffle(index)
X_pos_onehot_np_random = X_pos_onehot_np[index]
X_pos_cv = tf.convert_to_tensor(X_pos_onehot_np_random, dtype=tf.int32)

# Positives first, then negatives; labels are built in the same order and one-hot encoded.
X_onehot = np.append(X_pos_cv, X_neg_cv, axis=0)
Y = np.append(np.ones(len(X_pos_cv)), np.zeros(len(X_neg_cv)))
Y = pd.get_dummies(Y).values  # label one-hot encoding

# Fraction of positives.  Loop-invariant, so computed once; it is only consumed by the
# commented-out weighted focal loss below (the active loss uses alpha=0.5).
alpha = len(X_pos_cv)/(len(X_pos_cv)+len(X_neg_cv)*1)

kf = KFold(n_splits=5, shuffle=True, random_state=31)
for fold, (train_index, test_index) in enumerate(kf.split(X_onehot), start=1):
    # split train data
    trainX_onehot, trainY = X_onehot[train_index], Y[train_index]
    testX_onehot, testY = X_onehot[test_index], Y[test_index]
    print('fold:', fold)
    print('trainX_onehot shape:', trainX_onehot.shape, 'trainY shape:', trainY.shape)
    print('testX_onehot shape: ', testX_onehot.shape , 'testY shape:', testY.shape)

    # train model
    # loss_fun = focal_loss(gamma=[2, 2], alpha=alpha)
    loss_fun = focal_loss(gamma=[1, 1], alpha=0.5)
    clf = T3SEClassEstimator(n_outputs=2, fmap_shape1=(200, 20, 1), dense_layers=[256, 32], epochs=8000, monitor='val_auc', metric='PRC',  # monitor = 'val_loss', 'val_auc'
                            gpuid=0, batch_size=128, lr=1e-4, decay=1e-3, loss=loss_fun)  # metric='ACC', 'PRC', 'ROC'
    clf.patience = 22
    # NOTE(review): the held-out fold is also what fit() receives as its 3rd/4th
    # arguments; if those drive the 'val_auc' monitoring, the fold counts written below
    # are optimistic — confirm against T3SEClassEstimator.fit.
    clf.fit(trainX_onehot[:,:200,:,:], trainY, (testX_onehot[:,:200,:,:], testX_onehot[:,:200,:,:]), (testY, testY))
    print('Best epochs: %.2f, Best loss: %.2f' % (clf._performance.best_epoch, clf._performance.best))

    # save model
    # NOTE(review): the file name has minute resolution, so two folds finishing within
    # the same minute would write to the same path.
    curr_time = (time.strftime("%m-%d-%H%M",time.localtime()))
    # clf._model.save('./saved_model/'+curr_time+'-5fold.h5')
    clf._model.save('./saved_model/'+curr_time+'-5fold-NoFocalLoss.h5')

    # record result: round the predicted scores and count the confusion-matrix cells
    # using column 1 of both the prediction and the one-hot label.
    proba1 = clf._model.predict(testX_onehot[:,:200,:,:])
    pre_dual = np.round(proba1)
    predicted_pos = pre_dual[:, 1].astype(int)
    actual_pos = testY[:, 1].astype(int)
    TP = int(np.sum((predicted_pos == 1) & (actual_pos == 1)))
    FN = int(np.sum((predicted_pos == 0) & (actual_pos == 1)))
    TN = int(np.sum((predicted_pos == 0) & (actual_pos == 0)))
    FP = int(np.sum((predicted_pos == 1) & (actual_pos == 0)))
    with open('./saved_model/5fold-result.txt', 'a+')as f:
        f.write('\n'+curr_time+'\n')
        f.write('TP\tFN\tTN\tFP\n')
        f.write(str(TP)+'\t'+str(FN)+'\t'+str(TN)+'\t'+str(FP))

In [None]:
# 5 fold cross-validation (E.coli. only)
import time  # hoisted to the top of the cell; it used to be re-imported on every fold

# The E. coli negatives are selected purely by row position in the negative set.
ECOLI_NEG_START, ECOLI_NEG_STOP = 3384, 4258
# Fail loudly instead of silently slicing a short/empty range, which is what happens
# when X_neg_onehot has already been replaced by a subset (e.g. this cell was re-run).
if len(X_neg_onehot) < ECOLI_NEG_STOP:
    raise ValueError(
        'X_neg_onehot has only %d rows but rows %d:%d are required; '
        're-run the data-loading cell before this one.'
        % (len(X_neg_onehot), ECOLI_NEG_START, ECOLI_NEG_STOP)
    )
# NOTE(review): the positional slice is only meaningful while X_neg_onehot is still in
# file order; if an earlier cell has shuffled it in place these rows are arbitrary.
# X_neg_onehot = X_neg_onehot[3384:4258]  # E.coli only
X_neg_onehot_np = X_neg_onehot[ECOLI_NEG_START:ECOLI_NEG_STOP].numpy()
index = list(range(len(X_neg_onehot_np)))
np.random.seed(31)
np.random.shuffle(index)
X_neg_onehot_np_random = X_neg_onehot_np[index]
# The globals are deliberately still overwritten so that later cells keep operating on
# the subset selected here, exactly as before.
X_neg_onehot = tf.convert_to_tensor(X_neg_onehot_np_random, dtype=tf.int32)

X_pos_onehot_np = X_pos_onehot.numpy()
index = list(range(len(X_pos_onehot_np)))
np.random.seed(31)
np.random.shuffle(index)
X_pos_onehot_np_random = X_pos_onehot_np[index]
X_pos_onehot = tf.convert_to_tensor(X_pos_onehot_np_random, dtype=tf.int32)

# Positives first, then negatives; labels are built in the same order and one-hot encoded.
X_onehot = np.append(X_pos_onehot, X_neg_onehot, axis=0)
Y = np.append(np.ones(len(X_pos_onehot)), np.zeros(len(X_neg_onehot)))
Y = pd.get_dummies(Y).values  # label one-hot encoding

# Fraction of positives, passed to the focal loss as alpha.  Loop-invariant, so it and
# the loss object are built once instead of once per fold.
alpha = len(X_pos_onehot)/(len(X_pos_onehot)+len(X_neg_onehot)*1)
loss_fun = focal_loss(gamma=[2, 2], alpha=alpha)

kf = KFold(n_splits=5, shuffle=True, random_state=31)
for fold, (train_index, test_index) in enumerate(kf.split(X_onehot), start=1):
    # split train data
    trainX_onehot, trainY = X_onehot[train_index], Y[train_index]
    testX_onehot, testY = X_onehot[test_index], Y[test_index]
    print('fold:', fold)
    print('trainX_onehot shape:', trainX_onehot.shape, 'trainY shape:', trainY.shape)
    print('testX_onehot shape: ', testX_onehot.shape , 'testY shape:', testY.shape)

    # train model
    clf = T3SEClassEstimator(n_outputs=2, fmap_shape1=(200, 20, 1), dense_layers=[256, 32], epochs=8000, monitor='val_auc', metric='PRC',  # monitor = 'val_loss', 'val_auc'
                            gpuid=0, batch_size=128, lr=1e-4, decay=1e-3, loss=loss_fun)  # metric='ACC', 'PRC', 'ROC'
    clf.patience = 22
    # NOTE(review): the held-out fold is also what fit() receives as its 3rd/4th
    # arguments; if those drive the 'val_auc' monitoring, the fold counts written below
    # are optimistic — confirm against T3SEClassEstimator.fit.
    clf.fit(trainX_onehot[:,:200,:,:], trainY, (testX_onehot[:,:200,:,:], testX_onehot[:,:200,:,:]), (testY, testY))
    print('Best epochs: %.2f, Best loss: %.2f' % (clf._performance.best_epoch, clf._performance.best))

    # save model
    # NOTE(review): the file name has minute resolution, so two folds finishing within
    # the same minute would write to the same path.
    curr_time = (time.strftime("%m-%d-%H%M",time.localtime()))
    clf._model.save('./saved_model/'+curr_time+'-5fold.h5')

    # record result: round the predicted scores and count the confusion-matrix cells
    # using column 1 of both the prediction and the one-hot label.
    proba1 = clf._model.predict(testX_onehot[:,:200,:,:])
    pre_dual = np.round(proba1)
    predicted_pos = pre_dual[:, 1].astype(int)
    actual_pos = testY[:, 1].astype(int)
    TP = int(np.sum((predicted_pos == 1) & (actual_pos == 1)))
    FN = int(np.sum((predicted_pos == 0) & (actual_pos == 1)))
    TN = int(np.sum((predicted_pos == 0) & (actual_pos == 0)))
    FP = int(np.sum((predicted_pos == 1) & (actual_pos == 0)))
    with open('./saved_model/5fold-result.txt', 'a+')as f:
        f.write('\n'+curr_time+'\n')
        f.write('TP\tFN\tTN\tFP\n')
        f.write(str(TP)+'\t'+str(FN)+'\t'+str(TN)+'\t'+str(FP))

In [None]:
### split train data

# Stack positives on top of negatives along the sample axis, and build the matching
# label vector (ones for the positives, zeros for the negatives) before one-hot encoding.
X_onehot = np.concatenate((np.asanyarray(X_pos_onehot), X_neg_onehot), axis=0)
Y = np.concatenate((np.ones(len(X_pos_onehot)), np.zeros(len(X_neg_onehot))))
Y = pd.get_dummies(Y).values  # one-hot

# split dataset
def Rdsplit(df, random_state=31, split_size=[1, 0, 0]):
    """Randomly partition the row positions of ``df`` into train / valid / test.

    Args:
        df: any sized object; only ``len(df)`` is used.
        random_state: seed forwarded to ``shuffle``.
        split_size: ``[train, valid, test]`` fractions.  Only the valid and test
            entries are read; the train part is whatever remains after them.

    Returns:
        ``(train_idx, valid_idx, test_idx)`` — slices of one shuffled index array.
    """
    total = len(df)
    order = shuffle(np.arange(total), random_state=random_state)
    n_test = int(total * split_size[2])
    n_valid = int(total * split_size[1])
    # The shuffled order is consumed front to back: test first, then valid, then train.
    cut = n_test + n_valid
    return order[cut:total], order[n_test:cut], order[0:n_test]

# With the default split_size every sample lands in the training part and the
# valid/test parts are empty.
train_idx, valid_idx, test_idx = Rdsplit(Y)

trainX_onehot, trainY = X_onehot[train_idx], Y[train_idx]
validX_onehot, validY = X_onehot[valid_idx], Y[valid_idx]
testX_onehot, testY = X_onehot[test_idx], Y[test_idx]

print('trainX_onehot shape:', trainX_onehot.shape, 'trainY shape:', trainY.shape)
print('validX_onehot shape:', validX_onehot.shape, 'validY shape:', validY.shape)
print('testX_onehot shape: ', testX_onehot.shape , 'testY shape: ', testY.shape)

In [None]:
import time

# Fraction of positives among the current positive/negative tensors; used as the focal
# loss alpha.
alpha = len(X_pos_onehot)/(len(X_pos_onehot)+len(X_neg_onehot)*1)  # modulate the imbalance of pos/neg = 1:1
loss_fun = focal_loss(gamma=[2, 2], alpha=alpha)

estimator_options = dict(
    n_outputs=2,
    fmap_shape1=(200, 20, 1),
    dense_layers=[256, 32],
    epochs=8000,
    monitor='val_auc',
    metric='ACC',
    gpuid=0,
    batch_size=128,
    lr=1e-4,
    decay=1e-3,
    loss=loss_fun,
)
clf = T3SEClassEstimator(**estimator_options)  # train at least 20 Epochs
clf.patience = 21  # no less than 20

# Only the first 200 positions of each sample are fed to the model.  The independent
# test tensors and labels are what fit() receives as its 3rd/4th arguments.
final_train_inputs = trainX_onehot[:,:200,:,:]
final_heldout_inputs = testX_d_onehot[:,:200,:,:]
clf.fit(final_train_inputs, trainY, (final_heldout_inputs, final_heldout_inputs), (testY_d, testY_d))
print('Best epochs: %.2f, Best loss: %.2f' % (clf._performance.best_epoch, clf._performance.best))

# Save under a month-day-hourminute stamp.
curr_time = time.strftime("%m-%d-%H%M", time.localtime())
clf._model.save('./saved_model/'+curr_time+'.h5')

In [None]:
import os

# Only real '.h5' files are candidates ('h5' in name also matched unrelated files), and
# the list is sorted so the report order does not depend on os.listdir's ordering.
model_list = sorted(name for name in os.listdir('./saved_model') if name.endswith('.h5'))

# The loss must exist before the estimator is built: previously clf1 was constructed one
# line before loss_fun was assigned, so this cell either raised NameError on a fresh
# kernel or silently picked up whatever loss_fun an earlier cell had left behind.
loss_fun = focal_loss(gamma=[1, 1], alpha=0.5)
clf1 = T3SEClassEstimator(n_outputs=2, fmap_shape1=(200, 20, 1), dense_layers=[256, 32],
                           gpuid=0, batch_size=128, lr=1e-4, decay=1e-3, loss=loss_fun)

for saved_model_name in model_list:
    # The saved models reference the custom loss by the name 'focal_loss_fixed', so it
    # has to be supplied when deserialising.
    clf1._model = tf.keras.models.load_model('./saved_model/'+saved_model_name, custom_objects={'focal_loss_fixed':loss_fun})
    proba1 = clf1._model.predict(testX_d_onehot[:,:200,:,:])
    pre_dual = np.round(proba1)
    # Column sums of the rounded scores = number of test samples assigned to each output.
    print(saved_model_name, pre_dual.sum(axis=0))

In [None]:
# test single fasta

# Alternative input: score one hard-coded sequence instead of reading a file.
# test_single_fasta = 'MTTLTTRQIQLAHAWTSVHTGAGLALDWVADVAEKVEEIATKADALSRDLHRARNLSRSLGRVSTTPMGIGFFGLSQAGKSYLISALAADEKGQLLTRLGT'
# fastas = [test_single_fasta]

# Read the FASTA file as a list of raw lines (line endings kept).
with open('test.fasta') as f:
    fastas = list(f)

def Encode(data):
    """One-hot encode a sequence over the 20-letter alphabet 'ACDEFGHIKLMNPQRSTVWY'.

    Args:
        data: iterable of single characters (typically a string).

    Returns:
        A list with one 20-element list of 0/1 ints per character.  A character that
        is not in the alphabet yields an all-zero row.
    """
    alphabet = 'ACDEFGHIKLMNPQRSTVWY'
    width = len(alphabet)
    position_of = {residue: pos for pos, residue in enumerate(alphabet)}

    rows = []
    for residue in data:
        row = [0] * width
        pos = position_of.get(residue, -1)  # -1 marks "not in the alphabet"
        if pos >= 0:
            row[pos] = 1
        rows.append(row)
    return rows

import os

# Encode every sequence line of `fastas`, truncated or 'X'-padded to exactly 1000 characters.
# NOTE(review): each non-header line is treated as one complete sequence, so a FASTA
# record wrapped over several lines would be scored as several sequences — confirm the
# input is one sequence per line.
onehot_out = []
for fasta_line in fastas:
    sequence = fasta_line.strip()  # also drops '\r' and stray spaces, not just '\n'
    # Skip header lines, and skip blank lines: a blank line used to be padded into an
    # all-'X' pseudo-sequence and scored like a real one.
    if not sequence or '>' in sequence:
        continue
    # 'X' is not in Encode's alphabet, so the padding becomes all-zero rows.
    sequence = sequence[:1000].ljust(1000, 'X')
    onehot_out.append(Encode(sequence))
if not onehot_out:
    # Without this, the 3-D indexing below fails with an opaque IndexError.
    raise ValueError('no sequence lines found in the FASTA input')
onehot_out = np.array(onehot_out)

model_list = [i for i in os.listdir('./saved_model') if 'h5' in i]

loss_fun = focal_loss(gamma=[2, 2], alpha=0.5)
clf1 = T3SEClassEstimator(n_outputs=2, fmap_shape1=(200, 20, 1), dense_layers=[256, 32], epochs=8000, monitor='val_auc', metric='ACC',
                           gpuid=0, batch_size=128, lr=1e-4, decay=1e-3, loss=loss_fun)

# Keep the first 200 positions and add the trailing axis of size 1 explicitly, so the
# array has the same 4-D layout that every other predict() call in this notebook passes.
fasta_inputs = onehot_out[:, :200, :, np.newaxis]
for saved_model_name in model_list:
    clf1._model = tf.keras.models.load_model('./saved_model/'+saved_model_name, custom_objects={'focal_loss_fixed':loss_fun})
    proba1 = clf1._model.predict(fasta_inputs)
    pre_dual = np.round(proba1)
    # Column sums of the rounded scores = number of sequences assigned to each output.
    print(saved_model_name, sum(pre_dual))
    # print(*list((proba1[:,1])))
    # print(*list((proba1[:,1])))
