In [1]:
import numpy as np
import pandas as pd

In [52]:
def load_next_char_pairs(path):
    """Expand every peptide in `path` into (prefix, next-character) pairs.

    Each non-empty line is framed as '$' + peptide + '.', where '$' marks the
    start and '.' marks the end. For a framed string of length n this yields
    n - 1 pairs: the prefix framed[:i] and the character framed[i] that
    follows it, for i = 1 .. n - 1 (so the final target is always '.').

    Args:
        path: Text file with one peptide sequence per line.

    Returns:
        A (prefixes, targets) tuple of equally long lists of strings.
    """
    prefixes, targets = [], []
    # `with` closes the file deterministically; iterating over a bare open()
    # left the handle open until garbage collection.
    with open(path) as handle:
        for raw_line in handle:
            peptide = raw_line.strip()
            if not peptide:
                # A blank line would otherwise become a bogus '$' -> '.' sample.
                continue
            framed = '$' + peptide + '.'
            for i in range(1, len(framed)):
                prefixes.append(framed[:i])
                targets.append(framed[i])
    return prefixes, targets


X_train_raw = []
y_train_raw = []

X_test_raw = []
y_test_raw = []

# The "internal" files feed the training split.
for path in (
    '../data/raw/anti_cp/anticp2_main_internal_positive.txt',
    '../data/raw/anti_cp/anticp2_alternate_internal_positive.txt',
):
    prefixes, targets = load_next_char_pairs(path)
    X_train_raw.extend(prefixes)
    y_train_raw.extend(targets)

# The "validation" files feed the test split.
for path in (
    '../data/raw/anti_cp/anticp2_main_validation_positive.txt',
    '../data/raw/anti_cp/anticp2_alternate_validation_positive.txt',
):
    prefixes, targets = load_next_char_pairs(path)
    X_test_raw.extend(prefixes)
    y_test_raw.extend(targets)

In [53]:
from tensorflow import keras

In [68]:
# Character-level tokenizer, fitted on the training prefixes only.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(X_train_raw)

# Convert every prefix to integer ids, then bring all rows to 60 positions.
train_token_ids = tokenizer.texts_to_sequences(X_train_raw)
test_token_ids = tokenizer.texts_to_sequences(X_test_raw)
X_train = keras.preprocessing.sequence.pad_sequences(train_token_ids, maxlen=60)
X_test = keras.preprocessing.sequence.pad_sequences(test_token_ids, maxlen=60)

In [69]:
# Display the raw next-character targets collected from the validation files.
y_test_raw

['F',
 'L',
 'W',
 'W',
 'L',
 'F',
 'K',
 'W',
 'A',
 'W',
 'K',
 '.',
 'F',
 'A',
 'K',
 'L',
 'A',
 'K',
 'K',
 'A',
 'L',
 'A',
 'K',
 'L',
 'L',
 '.',
 'G',
 'L',
 'F',
 'D',
 'I',
 'V',
 'K',
 'K',
 'I',
 'A',
 'G',
 'H',
 'I',
 'A',
 'G',
 'S',
 'I',
 '.',
 'V',
 'N',
 'F',
 'K',
 'K',
 'L',
 'L',
 'G',
 'K',
 'L',
 'L',
 'K',
 'V',
 'V',
 'K',
 '.',
 'W',
 'K',
 'K',
 'I',
 'P',
 'K',
 'F',
 'L',
 'H',
 'L',
 'L',
 'K',
 'K',
 'F',
 '.',
 'E',
 'Q',
 'C',
 'G',
 'R',
 'Q',
 'A',
 'G',
 'G',
 'K',
 'L',
 'C',
 'P',
 'N',
 'N',
 'L',
 'C',
 'C',
 'S',
 'Q',
 'Y',
 'G',
 'W',
 'C',
 'G',
 'S',
 'S',
 'D',
 'D',
 'Y',
 'C',
 'S',
 'P',
 'S',
 'K',
 'N',
 'C',
 'Q',
 'S',
 'N',
 'C',
 'K',
 'G',
 'G',
 'G',
 '.',
 'E',
 'A',
 'D',
 'E',
 'P',
 'L',
 'W',
 'L',
 'Y',
 'K',
 'G',
 'D',
 'N',
 'I',
 'E',
 'R',
 'A',
 'P',
 'T',
 'T',
 'A',
 'D',
 'H',
 'P',
 'I',
 'L',
 'P',
 'S',
 'I',
 'I',
 'D',
 'D',
 'V',
 'K',
 'L',
 'D',
 'P',
 'N',
 'R',
 'R',
 'Y',
 'A',
 '.',
 'F',
 'V',
 'G'

In [70]:
from sklearn.preprocessing import LabelEncoder

# Map each target character to an integer class id; fitted on the training
# targets only, so a character unseen in training makes transform() raise.
le = LabelEncoder()
le.fit(y_train_raw)

# Pin the one-hot width to the number of fitted classes. Without num_classes,
# to_categorical infers the width from the largest id present in its input, so
# a split lacking the last class would get fewer columns than the other split
# and than the model's output layer.
num_classes = len(le.classes_)
y_train = keras.utils.to_categorical(le.transform(y_train_raw), num_classes=num_classes)
y_test  = keras.utils.to_categorical(le.transform(y_test_raw), num_classes=num_classes)

In [71]:
# Show the fitted classes; their order defines the column order of the one-hot targets.
le.classes_

array(['.', 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N',
       'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'], dtype='<U1')

In [72]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [73]:
class TransformerBlock(layers.Layer):
    """Self-attention block followed by a two-layer feed-forward network.

    Both sub-layers are wrapped as: dropout -> residual add -> layer
    normalization. The output has the same last dimension as the input
    (`embed_dim`), so blocks can be stacked.

    Args:
        embed_dim: Size of the last axis of the input; also used as `key_dim`
            of the attention layer and as the width of the final Dense layer.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden Dense layer in the feed-forward network.
        rate: Dropout rate applied after attention and after the feed-forward
            network.
        **kwargs: Forwarded to `layers.Layer` (e.g. `name`), which is also
            what `from_config` relies on when the layer is re-created.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can reproduce the layer.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs`.

        `training` defaults to None so the layer can also be called directly
        without the flag; it is passed through to both dropout layers.
        """
        # Query and value are the same tensor, i.e. self-attention.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config


In [74]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of distinct positions the position embedding can
            represent (its `input_dim`).
        vocab_size: Number of distinct token ids the token embedding can
            represent (its `input_dim`).
        embed_dim: Output size of both embeddings.
        **kwargs: Forwarded to `layers.Layer` (e.g. `name`), which is also
            what `from_config` relies on when the layer is re-created.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can reproduce the layer.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids `x` and add the embedding of each position index."""
        # Positions are 0 .. L-1, where L is the size of the last axis of x.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        position_vectors = self.pos_emb(positions)
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config


In [129]:
# Stop when the monitored loss has not improved for 5 consecutive epochs.
# NOTE(review): this monitors the *training* loss even though validation data
# is passed to fit(); switch to monitor='val_loss' if the goal is to stop on
# generalization rather than on convergence — confirm intent.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

# Embedding table size; presumably chosen to exceed the largest id the
# tokenizer can emit — TODO confirm against len(tokenizer.word_index) + 1.
vocab_size = 24
# Must match the maxlen used when padding X_train / X_test.
maxlen = 60

embed_dim = 16
num_heads = 4
ff_dim = 32

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)

# Three stacked transformer blocks on top of the token + position embedding.
x = embedding_layer(inputs)
transformer_block_1 = TransformerBlock(embed_dim, num_heads, ff_dim)
transformer_block_2 = TransformerBlock(embed_dim, num_heads, ff_dim)
transformer_block_3 = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block_1(x)
x = transformer_block_2(x)
x = transformer_block_3(x)
# Collapse the sequence axis into one vector per example.
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)

# One softmax unit per next-character class known to the label encoder.
outputs = layers.Dense(len(le.classes_), activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
# The targets are one-hot rows over mutually exclusive classes with a softmax
# output, so the matching loss is categorical cross-entropy. The previous
# "binary_crossentropy" scored every output unit as an independent yes/no
# problem and may also cause "accuracy" to be computed as binary accuracy.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.fit(X_train, y_train, validation_data = (X_test, y_test), shuffle = True, epochs = 50, callbacks=[callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f6dcc0fc640>

In [151]:
def sample(a, temperature=1.0):
    """Draw one class index from probability vector `a` after temperature scaling.

    The probabilities are converted to log space, divided by `temperature`,
    renormalized with a softmax, and a single multinomial draw is taken.
    Temperatures below 1 sharpen the distribution; above 1 flatten it.

    Args:
        a: 1-D sequence of non-negative probabilities.
        temperature: Positive scaling factor applied to the log-probabilities.

    Returns:
        The index of the drawn class.

    Raises:
        ValueError: If `temperature` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    probs = np.asarray(a, dtype='float64')
    # log(0) = -inf is intended here (a zero-probability class stays at zero),
    # so suppress the divide-by-zero warning instead of letting it spam output.
    with np.errstate(divide='ignore'):
        logits = np.log(probs) / temperature
    # Shift by the maximum before exponentiating. Without this, a small
    # temperature makes every exp() underflow to 0 and the normalization
    # becomes 0/0 = NaN, which np.random.multinomial rejects.
    logits = logits - np.max(logits)
    weights = np.exp(logits)
    probs = weights / np.sum(weights)
    return np.argmax(np.random.multinomial(1, probs, 1))

def generate_sequence(min_len=5, temperature=1.0):
    """Sample one peptide from the next-character model.

    Generation starts from the '$' start marker and appends one sampled
    character at a time until the '.' end marker is drawn or the sequence
    exceeds 60 characters. Candidates shorter than `min_len` (markers
    excluded) are discarded and generation restarts.

    Relies on the notebook-level `tokenizer`, `model`, `le` and `sample`.

    Args:
        min_len: Minimum accepted peptide length, markers excluded.
        temperature: Sampling temperature forwarded to `sample`.

    Returns:
        The generated peptide with the '$' and '.' markers removed.

    Raises:
        ValueError: If `min_len` exceeds the longest peptide that can be
            generated, which would otherwise loop forever.
    """
    max_len = 60  # same width as the maxlen used to pad the training prefixes
    if min_len > max_len:
        raise ValueError(f"min_len must be <= {max_len}, got {min_len}")

    while True:
        sequence = '$'
        while True:
            tokenized = keras.preprocessing.sequence.pad_sequences(
                tokenizer.texts_to_sequences([sequence]), maxlen=max_len
            )
            probabilities = model.predict(tokenized, verbose=False)[0]
            class_index = sample(probabilities, temperature=temperature)
            next_token = le.inverse_transform([class_index])[0]
            sequence += next_token
            if next_token == '.' or len(sequence) > max_len:
                break
        # Measure the peptide itself: a candidate cut off by the length cap has
        # no trailing '.', so `len(sequence) - 2` under-counted it by one.
        peptide = sequence.strip('$').strip('.')
        if len(peptide) >= min_len:
            return peptide

In [152]:
# Smoke test: sample one next-character class index for the bare start marker.
# `tokenized` only ever existed as a local inside generate_sequence(), so this
# cell raised NameError on a fresh kernel; build the model input explicitly.
tokenized = keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(['$']), maxlen=60)
sample(model.predict(tokenized)[0])



9

In [153]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [154]:
import pickle

def create_classifier(weights_path='../data/models/transformer.h5'):
    """Rebuild the two-class peptide classifier and load its saved weights.

    The architecture is token + position embedding -> global average pooling
    -> dropout -> Dense(20, relu) -> dropout -> Dense(2, softmax). It has to
    match the architecture the weights file was saved from.

    Args:
        weights_path: Path of the weights file to load. Defaults to the
            location this notebook has always used.

    Returns:
        The compiled Keras model with the weights loaded.
    """
    vocab_size = 21
    maxlen = 50
    embed_dim = 20

    inputs = layers.Input(shape=(maxlen,))
    embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)

    x = embedding_layer(inputs)
    # NOTE(review): the original cell also instantiated
    # TransformerBlock(embed_dim, num_heads=2, ff_dim=32) here but never applied
    # it to `x`, so it contributed nothing to the model. The dead instantiation
    # is removed rather than wired in, because the graph has to stay compatible
    # with the saved weights; applying the block presumably requires weights
    # from a model trained with it — confirm against the training notebook.
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(0.1)(x)
    x = layers.Dense(20, activation="relu")(x)
    x = layers.Dropout(0.1)(x)

    outputs = layers.Dense(2, activation="softmax")(x)

    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    model.load_weights(weights_path)
    return model

# Load the pickled tokenizer that classify_peptide() uses to encode sequences.
with open('../data/models/classifier_tokenizer.pickle', 'rb') as tokenizer_file:
    classifier_tokenizer = pickle.load(tokenizer_file)

# Load the pickled label encoder saved next to the classifier.
with open('../data/models/label_encoder.pickle', 'rb') as encoder_file:
    classifier_label_encoder = pickle.load(encoder_file)
    
def classify_peptide(model_classifier, sequence):
    """Score one peptide string with the classifier.

    The sequence is encoded with the notebook-level `classifier_tokenizer`,
    padded to 50 positions and passed through `model_classifier`.

    Returns:
        The value in column 1 of the model's prediction for this sequence
        (presumably the positive-class probability — confirm against the
        label encoder's class order).
    """
    token_ids = classifier_tokenizer.texts_to_sequences([sequence])
    padded = keras.preprocessing.sequence.pad_sequences(token_ids, maxlen = 50)
    predictions = model_classifier.predict(padded, verbose=False)
    return predictions[0, 1]

# Build the classifier once; the generation loop below reuses this instance.
model_classifier = create_classifier()


0.9988513

In [None]:
# Generate 1000 candidates; for each one the classifier scores above 0.95,
# print a '>index score' header line followed by the sequence itself.
for i in range(1000):
    candidate = generate_sequence()
    score = classify_peptide(model_classifier, candidate)
    if not score > 0.95:
        continue
    print(f'>{i}', score)
    print(candidate)

>0 0.9999058
FFTCGSTCGHAGGCTASCWCGGKNGCGTEYTGCCWPPPTC
>1 0.9948495
CHWHETLKCI
>2 0.9996086
FLPIIATIAAKVIPKFFCKISKKC
>3 0.9994405
FAKLLAKLAKKFAKL
>4 0.99996734
KWKLFKKIPKFLGLFKKF
>5 0.9991498
KWKLFKKIDQAKNLAKVLKALKTVGQ
>6 0.998547
WTGTLHCAISKAPKVGGCLRGISC
>7 0.99200433
PERIPIIGQAFIESRYWLCFLNCGPWCKL
>8 0.9993604
FLGTLKGISKAAKKAAKGLANTAECKLTSKC
>9 0.98034644
YKKRIGRIIRRIRLVLG
>10 0.99999714
AWKKAWKAWKKAWKKAAKKW
>11 0.999652
FAKLLAKALKKLAKKL
>12 0.96764886
CCPNTTGTGLWAIRKNA
>14 0.99716824
FLPLLAIVALKVKPNGIKPNIGTIVCSLKC
>15 0.9973219
GMWKKILKKIIR
>17 0.99980575
AKKFAKKALKLAKKL
>18 0.99973553
KWKLFKKIPKFLNAITKF
>20 0.9987122
FAKLLAKLAKKF
>21 0.9544156
GPWRKPL
>23 0.99966264
GVGSIIKNIRCNCCKSKRNKGGYGACRA
>24 0.9992588
FAKLLAKLAKKLAKL
>25 0.9999646
KWKLFKKIPKFLGLFWKF
>28 0.9934644
ASCRCNEANYGGPIWGKNCGKRGITGYLIRVLR
>32 0.999987
AWKWAWKWAKAAKKWAKKAA
>33 0.9986022
FFGGTIWRATKGAGCAFECALHRRPCY
>34 0.9997651
FLPVILGFLGNIIKSIFCFIKKKC
>35 0.99984884
FAKLLAKLAKKLAKLAKKL
>36 0.99965334
GGWLLFFKKPTKKLFGSI

>304 0.9998727
KWKLFKKIPKFLKAF
>305 0.9938513
FAKVLAKLAKLL
>306 0.9943506
MWKRIWAWFL
>307 0.99533623
GFGRLLKCARQPACHLTDKA
>308 0.9560605
GAWKAKAAVPIGGMFAKIGSAVKSYLM
>309 0.9996655
GFGCSESCVFIPCITAVIGCGCKSKVCC
>310 0.99998426
SWKKTLKKIFKKIWKKLKKVL
>311 0.9994312
LGSWTKAISPGILCGGTKIFCGGSIGCCGGS
>312 0.99970883
KWKLFKKIPKFLSLFPIF
>313 0.9955402
FLPIITKFLPSIIWKLLKAI
>314 0.99603146
FAKLLAKLAKL
>315 0.99872404
FAKLFKKLAKL
>316 0.99578947
IWCLPKYKRG
>318 0.9933456
HPPWRAWGWMK
>319 0.99591994
FAKLLAKLAKLA
>320 0.99603146
FAKLLAKLAKL
>321 0.9977774
ATCSLTCDNFTKVTQSGCFKAKECCPSKMKCHPI
>322 0.9998344
KWKLFKKIPKFLTLAGKF
>323 0.99603146
FAKLLAKLAKL
>324 0.9858279
AWKLAW
>325 0.9996488
FAKKLAKLAKKLAKLAL
>326 0.9926899
RLPPGWRKLLW
>327 0.9568823
FLPIIAGVLGKIF
>328 0.9996891
WKLFKKIPKFLSLAGKF
>329 0.99996436
KWKLFKKIPKFLHLAKKF
>330 0.9985971
PPAWGAAWGAAWSTLRWALQRAFRCDDRKKPCRGNPKNGPIVHTMTCK
>331 0.9999608
KWKLFKKIPKFLHAAKKF
>332 0.9984363
FAKLLAKLAKKLL
>333 0.99838126
WKALKKIPRALK
>335 0.99550277
FAKLL

>602 0.99999666
AWKWAWKAAWAKWWKKAAKWAK
>604 0.99983466
KWKLFKKIPKFLSLFSKF
>605 0.9838193
GLFGIIKKVASVIKPL
>607 0.9998349
LALALKALKKLLKKLKKLL
>608 0.9988193
FAKLLAKLAKKFL
>609 0.9993142
FAKLLAKLAKKLAKLAL
>611 0.999684
FAKLLAKALKKLKKLL
>612 0.9999795
GKWKLAKKVLAAVFKNIFKSIIGVIKKRLKK
>616 0.9999702
KWKLFKKIRKLLKKALLKLL
>617 0.99603146
FAKLLAKLAKL
>619 0.99991655
WKLFKKIPKFLHAAKKF
>620 0.99572504
KMKLALKLLGKL
>621 0.9988193
FAKLLAKLAKKFL
>623 0.99840826
FLPIIAAVVAKVFPGIFCAISKKC
>624 0.9987494
WALGPGKIAGKALLAAAKKVFGAS
>625 0.99983466
KWKLFKKIPKFLSLFSKF
>626 0.9998161
GFFKAIKKIFKIIKKGL
>627 0.99958104
GLFKVLKKIAKVLKKFA
>628 0.99725276
FLPLVANALSNVIPWIWCLISKKC
>631 0.9641935
SWPLKASKRLAR
>633 0.9987557
ALPKGAYKLALKIQRWGKKFTRR
>635 0.9992588
FAKLLAKALKKLAKL
>637 0.99961686
FAKALAKALKKLAKKL
>640 0.99998236
AWKWWWLKKLAKAAKALKKAL
>641 0.9964101
CKRRQQLRRYCGYRAKC
>643 0.99600464
EKKATTCILVGGACTFKGWMSLPHK
>644 0.95473313
AAKLKLA
>645 0.99983346
FWKLFKKIPNLVGKILKKF
>646 0.99952066
ITCSCISSIIQWKGNTGPP