# Import packages and define functions

In [None]:
import numpy as np
import re
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras import layers
from sklearn.utils import shuffle
from sklearn import metrics
from sites_transformer import Encoder, create_padding_mask
from tools import plot_history

In [None]:
def seq_position_encode(seq):
    """Encode a residue sequence as a 21-dimensional position-weighted vector.

    The characters X, Z, U and B are stripped first, so positions are counted
    on the cleaned sequence. Each remaining residue contributes a weight to
    the slot given by its index in the 21-symbol alphabet
    ``'#ARNDCQEGHILKMFPSTWYV'``: ``exp(sin(pos))`` at even positions and
    ``exp(cos(pos))`` at odd positions. The per-position contributions are
    summed over the whole sequence.

    Args:
        seq: Sequence string made of alphabet symbols (plus X/Z/U/B, which
            are ignored).

    Returns:
        A float NumPy array of shape ``(21,)``; all zeros when nothing is
        left after stripping.

    Raises:
        ValueError: If a remaining character is not in the alphabet
            (propagated from ``str.index``).
    """
    alphabet = '#ARNDCQEGHILKMFPSTWYV'
    cleaned = re.sub('[XZUB]', "", seq)

    # One row per position, one column per alphabet symbol.
    weights = np.zeros((len(cleaned), 21))
    for pos, residue in enumerate(cleaned):
        col = alphabet.index(residue)
        if pos % 2 == 0:
            weight = np.exp(np.sin(pos))
        else:
            weight = np.exp(np.cos(pos))
        weights[pos][col] = weight

    # Collapse the position axis so the result has a fixed length of 21.
    return np.sum(weights, axis=0)
            

In [None]:
def load_seq_data(file):
    """Load positive and negative sequences from an ``.npz`` archive and encode them.

    The archive must contain the arrays ``'pos'`` and ``'neg'``; every entry
    of each array is passed through ``seq_position_encode``.

    Args:
        file: Path (or file-like object) of the ``.npz`` archive.

    Returns:
        A tuple ``(x_pos, x_neg)`` of float arrays with shapes
        ``(len(pos), 21)`` and ``(len(neg), 21)``.
    """
    # NOTE(security): allow_pickle=True lets the archive run arbitrary code
    # while being unpickled -- only load archives from a trusted source.
    # The context manager closes the underlying file handle, which the lazy
    # NpzFile object would otherwise keep open.
    with np.load(file, allow_pickle=True) as data:
        posseqs, negseqs = data['pos'], data['neg']

    def _encode_all(seqs):
        # Encode every sequence into one row of an (n, 21) matrix.
        encoded = np.empty((len(seqs), 21))
        for row, seq in enumerate(seqs):
            encoded[row] = seq_position_encode(seq)
        return encoded

    return _encode_all(posseqs), _encode_all(negseqs)

# Prepare data

In [None]:
# Encode the training and test archives into per-class feature matrices.
x_pos_train, x_neg_train = load_seq_data('PDNA_543_train_15.npz')
x_pos_test, x_neg_test = load_seq_data('PDNA_543_test_15.npz')

# Stack the test set with the positive rows first. Note the label convention:
# positives are class 0 and negatives are class 1, then one-hot encoded.
x_test = np.concatenate((x_pos_test, x_neg_test))
y_test = [0] * len(x_pos_test) + [1] * len(x_neg_test)
y_test = keras.utils.to_categorical(y_test, num_classes=2)

In [None]:
# Disabled: replicate the positive rows 14 times (apparently a manual
# oversampling alternative -- confirm before re-enabling).
#x_pos_train = np.tile(x_pos_train, reps=(14,1))

# Positives first (label 0), negatives after (label 1); labels stay a plain
# list of ints at this point.
x_train = np.concatenate((x_pos_train, x_neg_train))
y_train = [0] * len(x_pos_train) + [1] * len(x_neg_train)


In [None]:
# Over-sampling: resample the training set with SMOTE from imbalanced-learn.
from imblearn.over_sampling import SMOTE

# Fixed random_state so the resampling is repeatable between runs.
sm = SMOTE(random_state=42)
# Resampling replaces x_train / y_train in place of the original arrays.
x_train, y_train = sm.fit_resample(x_train, y_train) 
# One-hot encode the labels only after resampling, to match the 2-unit
# softmax output used by the model.
y_train = keras.utils.to_categorical(y_train, num_classes=2)

# Training and Testing

In [None]:
def DNN_train(x_train, y_train, x_test, y_test, epochs=20, batch_size=100, x_weight=None):
    """Build, train and apply a fully connected classifier on 21-feature inputs.

    The network is Dense(300) -> Dropout(0.3) -> Dense(1000) -> Dropout(0.5)
    -> Dense(300) -> Dropout(0.25) -> Dense(2, softmax), compiled with the
    Adam optimizer and categorical cross-entropy.

    Args:
        x_train: Training features with 21 columns.
        y_train: One-hot training labels with 2 columns.
        x_test: Features to predict on after training.
        y_test: Unused by this function; kept for interface compatibility.
        epochs: Number of training epochs.
        batch_size: Mini-batch size used during training.
        x_weight: Optional per-sample weights. When given they are passed to
            ``fit`` as ``sample_weight`` and 10% of the data is held out for
            validation; otherwise 20% is held out.

    Returns:
        A tuple ``(y_pred, history)``: the predicted class index for each row
        of ``x_test`` and the Keras ``History`` object returned by ``fit``.
    """
    inputs = layers.Input(shape=(21, ))
    x = layers.Dense(300, activation='relu')(inputs)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(1000, activation="relu")(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Dense(300, activation='relu')(x)
    x = layers.Dropout(0.25)(x)
    outputs = layers.Dense(2, activation="softmax")(x)

    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile("adam", "categorical_crossentropy", metrics=["accuracy"])

    model.summary()

    # Identity check: `x_weight == None` is evaluated element-wise for NumPy
    # arrays and makes the `if` raise "truth value of an array is ambiguous".
    if x_weight is None:
        history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)
    else:
        # NOTE(review): the validation split differs from the unweighted
        # branch (0.1 vs 0.2); kept as in the original.
        history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, sample_weight=x_weight,
                            validation_split=0.1)

    # Plotting the history is left to the caller.

    # Turn the softmax probabilities into class indices.
    prediction = model.predict(x_test)
    y_pred = np.argmax(prediction, axis=1)

    return y_pred, history

# training and testing

In [None]:
# Training hyper-parameters for this run (override the DNN_train defaults).
epochs = 50
batch_size = 200
# Shuffle features and labels together; the rows were concatenated class by
# class, and fit() is called with a validation_split.
x_train, y_train = shuffle(x_train, y_train)
# Reset Keras global state before building a new model (useful when the cell
# is re-run).
K.clear_session()
y_pred, history = DNN_train(x_train, y_train, x_test, y_test, batch_size=batch_size, epochs=epochs)

# Metrics

In [None]:
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# plot_history comes from the project-local `tools` module; presumably it
# plots the training curves stored in `history` -- confirm against tools.py.
plot_history(history)

In [None]:
# Evaluate the predictions on the test set.
# Recover the class indices from the one-hot test labels.
y_true = y_test.argmax(axis=1)

cm = metrics.confusion_matrix(y_true, y_pred)
acc = metrics.accuracy_score(y_true, y_pred)
mcc = metrics.matthews_corrcoef(y_true, y_pred)

# Report confusion matrix, accuracy and Matthews correlation coefficient.
for label, value in (("cm: ", cm), ("accuracy: ", acc), ("MCC: ", mcc)):
    print(label, value)