In [1]:
import sys
# Make the project-local helper modules importable.
# NOTE(review): the first two entries are absolute paths on the original
# author's machine; adjust them (or install the helpers as a package) before
# running this notebook elsewhere.
sys.path.extend([
    'e:/Repoes/jci/',
    'e:/Repoes/jci/bio/',
    '..',
])

In [2]:
from Bio import SeqIO

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import numpy as np

from nlp_transformer import Encoder, create_padding_mask
from prepare_seq import protseq_to_vec
from tools import displayMetrics, displayMLMetrics, plot_history

from tensorflow import keras
from tensorflow.keras import layers, callbacks

In [3]:
def buildModel(maxlen, vocab_size, embed_dim, num_heads, ff_dim, 
               num_blocks, droprate, fl_size, num_classes):
    """
    Assemble and compile the transformer-encoder sequence classifier.

    Parameters
    ----------
    maxlen:
        length of the input token vector; also passed to the encoder as
        ``max_seq_len``
    vocab_size:
        passed to the encoder as ``input_vocab_size``
    embed_dim:
        passed to the encoder as ``d_model``
    num_heads:
        passed to the encoder as ``n_heads``
    ff_dim:
        passed to the encoder as ``ffd``
    num_blocks:
        passed to the encoder as ``n_layers``
    droprate:
        dropout rate used by the encoder and by both Dropout layers of the
        classification head
    fl_size:
        number of units in the ReLU dense layer of the head
    num_classes:
        number of units in the softmax output layer

    Returns
    -------
    model:
        keras.Model compiled with the "adam" optimizer,
        "categorical_crossentropy" loss and an "accuracy" metric

    """
    token_ids = layers.Input(shape=(maxlen,))

    # The mask is derived from the raw token ids and handed to the encoder.
    pad_mask = create_padding_mask(token_ids)
    encoder_config = dict(
        n_layers=num_blocks,
        d_model=embed_dim,
        n_heads=num_heads,
        ffd=ff_dim,
        input_vocab_size=vocab_size,
        max_seq_len=maxlen,
        dropout_rate=droprate,
    )
    # NOTE(review): the second positional argument is presumably the encoder's
    # `training` flag, hard-wired to False -- confirm against
    # nlp_transformer.Encoder.
    features = Encoder(**encoder_config)(token_ids, False, pad_mask)

    # Classification head: pool over the sequence axis, then a small MLP.
    head_layers = (
        layers.GlobalMaxPooling1D(),
        layers.Dropout(droprate),
        layers.Dense(fl_size, activation="relu"),
        layers.Dropout(droprate),
        layers.Dense(num_classes, activation="softmax"),
    )
    for head_layer in head_layers:
        features = head_layer(features)

    classifier = keras.Model(inputs=token_ids, outputs=features)
    classifier.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    return classifier

In [4]:
def _read_filtered_seqs(fasta_path, min_len, random_state):
    """Read a FASTA file, shuffle its records with a fixed seed and return
    the sequences (as str) whose length is at least ``min_len``."""
    records = shuffle(list(SeqIO.parse(fasta_path, 'fasta')),
                      random_state=random_state)
    as_text = (str(record.seq) for record in records)
    return [seq for seq in as_text if len(seq) >= min_len]


def load_seqs(min_len=50, random_state=42):
    """
    Read Enzyme and not-Enzyme protein sequences and build a labelled set.

    According to the original author's note, the FASTA files hold sequences
    with less than 40% similarity to each other (not verified here).
    Sequences shorter than ``min_len`` are dropped. The not-Enzyme set is
    then shuffled and truncated to at most the number of Enzyme sequences.

    Every shuffle is seeded with ``random_state``, so repeated calls return
    the same sequences in the same order. Previously the not-Enzyme
    subsample was drawn with an unseeded shuffle and changed on every run.

    Parameters
    ----------
    min_len:
        minimum sequence length to keep (default 50)
    random_state:
        seed passed to every ``sklearn.utils.shuffle`` call (default 42)

    Returns
    -------
    seqs:
        protein sequences, Enzyme sequences first, then not-Enzyme
    labels:
        1 for Enzyme, 0 for not Enzyme, aligned with ``seqs``

    """
    Enzyme_seqs = _read_filtered_seqs('data/EC_40.fasta', min_len, random_state)
    notEnzyme_seqs = _read_filtered_seqs('data/NotEC_40.fasta', min_len, random_state)

    # Seeded so the negative subsample is reproducible.
    notEnzyme_seqs = shuffle(notEnzyme_seqs, random_state=random_state)
    # Slicing keeps at most len(Enzyme_seqs) negatives; if there are fewer
    # negatives than positives, all of them are kept.
    notEnzyme_seqs = notEnzyme_seqs[:len(Enzyme_seqs)]

    seqs = Enzyme_seqs + notEnzyme_seqs
    labels = [1] * len(Enzyme_seqs) + [0] * len(notEnzyme_seqs)

    return seqs, labels


In [5]:
def transformer_predictor(X_train, y_train, X_test, y_test, modelfile, params):
    """
    Build a fresh model, train it and return its predictions on X_test.

    Parameters
    ----------
    X_train, y_train:
        training inputs and targets passed to ``model.fit``
    X_test, y_test:
        used as ``validation_data`` during training; X_test is also the
        input for the final prediction
    modelfile:
        path where the checkpoint callback writes the weights whenever
        ``val_loss`` improves
    params:
        dict holding the architecture settings expected by ``buildModel``
        plus 'batch_size' and 'epochs'

    Returns
    -------
    score:
        output of ``model.predict(X_test)``

    """
    # Reset Keras' global state before building a new model.
    keras.backend.clear_session()

    # Keys listed in the positional order of buildModel's signature.
    architecture_keys = ('maxlen', 'vocab_size', 'embed_dim', 'num_heads',
                         'ff_dim', 'num_blocks', 'droprate', 'fl_size',
                         'num_classes')
    model = buildModel(*(params[key] for key in architecture_keys))
    model.summary()

    best_weights_saver = callbacks.ModelCheckpoint(
        modelfile,
        monitor='val_loss',
        save_best_only=True,
        save_weights_only=True,
        verbose=1,
    )
    fit_options = dict(
        batch_size=params['batch_size'],
        epochs=params['epochs'],
        validation_data=(X_test, y_test),
        callbacks=[best_weights_saver],
    )
    history = model.fit(X_train, y_train, **fit_options)

    plot_history(history)

    # NOTE: the best checkpoint is written to `modelfile` but is not loaded
    # back here, so the prediction below uses the weights of the last epoch.
    return model.predict(X_test)

In [6]:
# Hyper-parameters of the transformer network and its training run.
params = {
    'vocab_size': 24,
    'maxlen': 500,
    'embed_dim': 16,     # Embedding size for each token
    'num_heads': 4,      # Number of attention heads
    'ff_dim': 128,       # Hidden layer size in feed forward network inside transformer
    'num_blocks': 12,
    'droprate': 0.2,
    'fl_size': 96,
    'num_classes': 2,
    'epochs': 20,
    'batch_size': 32,
}

In [7]:
# Read the labelled sequences.
seqs, labels = load_seqs()

# Hold out 30% of the samples as the test split; stratifying on the labels
# keeps the class ratio the same in both splits, and the fixed seed makes the
# split repeatable.
(seqs_train, seqs_test,
 labels_train, labels_test) = train_test_split(
    seqs, labels,
    test_size=0.3,
    random_state=42,
    stratify=labels,
)

In [8]:
# Transform each protein sequence into a word vector of length
# params['maxlen'] (padding applied at the end), train split first.
X_train, X_test = (
    protseq_to_vec(split_seqs, padding_position="post", maxlen=params['maxlen'])
    for split_seqs in (seqs_train, seqs_test)
)

# One-hot encode the integer class labels.
y_train, y_test = (
    keras.utils.to_categorical(split_labels, params['num_classes'])
    for split_labels in (labels_train, labels_test)
)

In [None]:
# Train the model and collect its scores on the test split.
modelfile = './model/ec/ec_trainsformer_{}_{}.h5'.format(params["maxlen"], "pos")
score = transformer_predictor(
    X_train, y_train,
    X_test, y_test,
    modelfile, params,
)

# Reduce score rows and one-hot targets to class indices before computing
# the metrics.
pred = np.argmax(score, axis=1)
displayMetrics(np.argmax(y_test, axis=1), pred)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 500)]             0         
_________________________________________________________________
tf_op_layer_Equal (TensorFlo [(None, 500)]             0         
_________________________________________________________________
tf_op_layer_Cast (TensorFlow [(None, 500)]             0         
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 1, 1, 500)]       0         
_________________________________________________________________
encoder (Encoder)            (None, 500, 16)           61824     
_________________________________________________________________
global_max_pooling1d (Global (None, 16)                0         
_________________________________________________________________
dropout_25 (Dropout)         (None, 16)                0     