In [1]:
from Bio import SeqIO
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler
from tqdm.notebook import tqdm
import numpy as np
import os
import re
import string
import random
import pandas as pd
import glob

2022-11-29 10:14:42.043542: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-29 10:14:42.353173: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-29 10:14:43.234604: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-11-29 10:14:43.234660: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [2]:
# Gather every line of swissprot.fasta that does not contain '>' as a class-0 row.
# NOTE(review): lines are consumed one at a time, so a record wrapped over several
# lines yields several rows — confirm that is intended. The frame name assigned
# below is also rebound by the AntiCP loading cell further down.
with open("../data/raw/uniprot/swissprot.fasta") as reader:
    uniprot_rows = [
        {"sequence": fasta_line.strip('\n'), "class": 0}
        for fasta_line in reader
        if '>' not in fasta_line
    ]

df_anticp_raw = pd.DataFrame(uniprot_rows, columns=["sequence", "class"])

In [3]:
# Character-level tokenizer: every character of an input text becomes one token.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
# Fix the vocabulary from a single string holding the 20 letters
# ACDEFGHIKLMNPQRSTVWY plus the '.' character.
tokenizer.fit_on_texts(['ACDEFGHIKLMNPQRSTVWY.'])

In [4]:
# Numeric settings for a model.
# NOTE(review): none of these four names is read by any visible cell; the names
# suggest an attention-style configuration — confirm whether they are still needed.
num_words = 20
embed_dim = 16
num_heads = 3
ff_dim    = 32

In [5]:
import glob

# Directory holding the AntiCP2 "alternate" text files (one sequence per line).
ANTICP_DIR = "../data/raw/anti_cp"

# (file name, class label) pairs, listed in the order the rows must be appended:
# internal negative/positive first, then validation negative/positive.
anticp_sources = [
    ("anticp2_alternate_internal_negative.txt", 0),
    ("anticp2_alternate_internal_positive.txt", 1),
    ("anticp2_alternate_validation_negative.txt", 0),
    ("anticp2_alternate_validation_positive.txt", 1),
]

anticp_rows = []
for file_name, label in anticp_sources:
    with open(f"{ANTICP_DIR}/{file_name}") as reader:
        # Only the trailing newline is stripped; every line becomes a row.
        anticp_rows.extend(
            {"sequence": line.strip('\n'), "class": label} for line in reader
        )

# Disabled data source (kept for reference): tumorhope CSVs as extra positives.
# for csv_file in glob.glob("../data/raw/tumorhope/*.csv"):
#     df = pd.read_csv(csv_file)
#     for r, row in df.iterrows():
#         anticp_rows.append({"sequence": row.Sequence, "class": 1})

df_anticp_raw = pd.DataFrame(anticp_rows, columns=["sequence", "class"])
df_anticp_raw.head(10)

Unnamed: 0,sequence,class
0,LYHEKYKVVEL,0
1,RKAVLLEEQGIEWKPEDTARPSGPREGGRRDGGRDG,0
2,YAAIPLGAAIGALTSGQLAHSVRPGLIMLVSTVGSFLAVGLFAIMPV,0
3,LLINKSPEERAALAEERTEGGTPLLIA,0
4,AAVLVLIHAAVRRSDNLFLDEEAAAVTEASGLMSYPS,0
5,DRVMQELTEYELVPEAWGGDTIFAPISAKFGEGL,0
6,ILSRVGDGTQDNLSGCEK,0
7,ELAKRWFT,0
8,ESEVLTPADEVFHLNKSDYTVPFVCGCRDLGEAAR,0
9,GSDVAVNGSFPTIYRNYSNSVPYERRITTLLQWLDLPKAD,0


In [6]:
# Show the last ten rows of the combined frame.
df_anticp_raw.tail(10)

Unnamed: 0,sequence,class
1930,KRAKAAGGWSHWSPWSSC,1
1931,AAKKWAKAKWAKAKKWAKAA,1
1932,FLPLIGRVLSGIL,1
1933,PAWFKARRWAWRMLKKAA,1
1934,KWKSFLKTFKSLKKTVLHTLLKLISS,1
1935,FAKKLAKKLAKLL,1
1936,FALALKALKK,1
1937,GLFAVIKKVASVIKGL,1
1938,ADRGWIKTLTKDCPNVISSICAGTIITACKNCA,1
1939,ESEFDRQEYEECKRQCMQLETSGQMRRCVSQCDKRFEEDIDWSKYDNQD,1


In [7]:
# (rows, columns) of the combined frame.
df_anticp_raw.shape

(1940, 2)

In [8]:
stop_token = '.'
max_len = 50
# Fixed seed so the undersampler keeps the same rows on every run.
UNDERSAMPLE_SEED = 42

# Read features and labels straight from the frame columns instead of looping
# over rows. X stays a single-column 2-D array because the sampler expects
# (n_samples, n_features); y is passed 1-D, which is the shape it expects.
X_anticp = np.array(df_anticp_raw['sequence'].tolist()).reshape((-1, 1))
y_anticp = df_anticp_raw['class'].to_numpy(dtype=float)

X_anticp, y_anticp = RandomUnderSampler(
    random_state=UNDERSAMPLE_SEED
).fit_resample(X_anticp, y_anticp)

In [9]:
# Collapse the single-column array to 1-D. reshape(-1) gives the same result as
# taking column 0 here, and re-running the cell is harmless (indexing [:, 0] on
# an already 1-D array raises IndexError).
X_anticp = X_anticp.reshape(-1)

In [10]:
# Make the labels a column vector of shape (n, 1).
y_anticp = y_anticp.reshape((-1,1))

In [11]:
# Fixed seed so the train/test partition is the same on every run.
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    X_anticp, y_anticp, random_state=SPLIT_SEED
)

# Character-level tokenizer whose vocabulary is the 20 residue letters plus '.'.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(['ACDEFGHIKLMNPQRSTVWY.'])

X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_test_tokens  = tokenizer.texts_to_sequences(X_test)

# Bring every token list to exactly max_len entries (pad_sequences defaults are
# used for the padding/truncation side).
X_train_tokens_padded = keras.preprocessing.sequence.pad_sequences(X_train_tokens, maxlen=max_len)
X_test_tokens_padded  = keras.preprocessing.sequence.pad_sequences(X_test_tokens, maxlen=max_len)

In [12]:
# (number of training sequences, max_len) after padding.
X_train_tokens_padded.shape

(1455, 50)

In [13]:
def create_model(vocab_size=None, input_length=50):
    """Build and compile the 1-D convolutional binary classifier.

    Layer stack, in order:
      * Embedding(vocab_size, 20)
      * Conv1D with (filters, kernel) = (512, 8), (256, 8), (128, 8), (64, 8),
        (32, 4), each followed by Dropout(0.5)
      * Flatten, Dropout(0.5)
      * Dense 256, 128, 64 with relu, each followed by Dropout(0.5)
      * Dense(1, sigmoid)

    Args:
        vocab_size: Size of the embedding vocabulary. When None, it falls back
            to ``len(tokenizer.word_counts) + 1`` using the notebook-level
            ``tokenizer`` (the value the body previously hard-wired).
        input_length: Length of each padded token sequence. Defaults to 50,
            the value the body previously hard-wired.

    Returns:
        A compiled ``keras.models.Sequential`` (adam optimizer,
        binary cross-entropy loss, 'acc' metric).
    """
    if vocab_size is None:
        vocab_size = len(tokenizer.word_counts) + 1

    model = keras.models.Sequential()
    model.add(
        keras.layers.Embedding(vocab_size, 20, input_length=input_length)
    )

    # NOTE(review): the Conv1D layers are created without an activation argument.
    for filters, kernel_size in ((512, 8), (256, 8), (128, 8), (64, 8), (32, 4)):
        model.add(keras.layers.Conv1D(filters, kernel_size))
        model.add(keras.layers.Dropout(0.5))

    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dropout(0.5))

    for units in (256, 128, 64):
        model.add(keras.layers.Dense(units, activation='relu'))
        model.add(keras.layers.Dropout(0.5))

    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

    return model


In [14]:
# create_model declares vocab_size and input_length as parameters, so pass them
# explicitly: +1 because token index 0 is reserved for padding.
classification_model = create_model(len(tokenizer.word_counts) + 1, max_len)

# The test split doubles as the per-epoch validation set.
history = classification_model.fit(
    X_train_tokens_padded,
    y_train,
    batch_size=8,
    epochs=20,
    validation_data=(X_test_tokens_padded, y_test)
)

2022-11-29 10:14:49.000679: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-29 10:14:49.056349: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-11-29 10:14:49.056366: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-11-29 10:14:49.057150: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [21]:
from tqdm.notebook import tqdm

def anticp_proba(sequence):
    """Score a single sequence string with ``classification_model``.

    The string is tokenized with the notebook-level ``tokenizer``, padded to
    ``max_len`` and passed through the model; the first value of the first
    prediction row is returned.
    """
    encoded = tokenizer.texts_to_sequences([sequence])
    model_input = keras.preprocessing.sequence.pad_sequences(encoded, maxlen=max_len)
    predictions = classification_model.predict(model_input, verbose=0)
    return predictions[0][0]

def mutate(sequence, factor=0.01, iterations=1_000, score_fn=None):
    """Randomly mutate a sequence, keeping changes that raise its score.

    Each iteration replaces one randomly chosen position with a random symbol
    from the 20-letter alphabet plus '.', drops any character outside that
    alphabet, cuts the string at the first '.', and re-appends a single '.'.
    A candidate that scores strictly higher than the current sequence is
    printed in FASTA-like form and accepted; any other candidate is accepted
    with probability ``factor``.

    Args:
        sequence: Starting sequence string.
        factor: Probability of accepting a candidate that does not improve
            the score.
        iterations: Number of mutation attempts.
        score_fn: Optional callable mapping a sequence string to a score.
            Defaults to ``anticp_proba``.

    Returns:
        The sequence held when the loop ends (ending in '.' once any candidate
        has been accepted). It is not necessarily the best-scoring one seen,
        because non-improving candidates can be accepted.

    Raises:
        ValueError: From ``random.randrange`` if ``sequence`` is empty and
            ``iterations`` is positive.
    """
    score = anticp_proba if score_fn is None else score_fn
    aas = 'ACDEFGHIKLMNPQRSTVWY.'

    # The score only changes when a candidate is accepted, so keep it cached
    # instead of re-scoring the unchanged sequence on every iteration.
    current_prob = score(sequence)

    for iteration in range(iterations):
        sequence_aas = list(sequence)
        mutation_idx = random.randrange(0, len(sequence))
        sequence_aas[mutation_idx] = random.choice(aas)
        # Drawing '.' truncates the sequence there; overwriting the trailing '.'
        # with a letter lengthens it by one, since '.' is re-appended.
        new_sequence = ''.join(aa for aa in sequence_aas if aa in aas).split('.')[0] + '.'
        if new_sequence == '.':
            # The mutation emptied the sequence; discard it.
            continue

        new_prob = score(new_sequence)
        if new_prob > current_prob:
            print(f'>mutated_{iteration}:{new_prob}', new_sequence.replace('.',''), sep='\n')
            accepted = True
        else:
            accepted = random.random() < factor

        if accepted:
            sequence, current_prob = new_sequence, new_prob

    return sequence

In [None]:
# Run the mutation search from this starting sequence; the returned string is the cell output.
mutate('LLGDFFRKSKEKIGKEFKRIVQRIKDFLRNLVPRTES')

In [22]:
sequence = 'LLGDFFRKSKEKIGKEFKRIVQRIKDFLRNLVPRTES'

print('>original\n'+sequence)
# Use the search function defined above; no visible cell defines `halucinate`,
# and the recorded output of this cell matches what `mutate` prints.
mutate(sequence)

>original
LLGDFFRKSKEKIGKEFKRIVQRIKDFLRNLVPRTES
>mutated_104:0.8197231888771057
LLGDFFRKSK
>mutated_105:0.8428837656974792
LLGDFFRKHK
>mutated_107:0.8791126608848572
LLGAFFRKHK
>mutated_111:0.8816781640052795
FLGAFFRKHK
>mutated_113:0.8817476034164429
FLGAFFMKHK
>mutated_115:0.9175047874450684
FLGAFFKKHK
>mutated_118:0.9304203391075134
FLHAFFKKHK
>mutated_122:0.9436076283454895
FLHAFFKKHKH
>mutated_129:0.9498093724250793
FLHKFFKKHKH
>mutated_139:0.9543346166610718
FLHKFFKKHKK
>mutated_148:0.9619104266166687
FLHKFFKKCKK
>mutated_150:0.9667935967445374
FLHKFFKKCKKL
>mutated_152:0.9717153310775757
FLHKFWKKCKKL
>mutated_154:0.9735897183418274
FDHKFWKKCKKL
>mutated_157:0.974625289440155
FDHKKWKKCKKL
>mutated_179:0.9755672216415405
FDIKKWKKCKKL
>mutated_189:0.978655219078064
FDIWKWKKCKKL
>mutated_195:0.9800183176994324
FDWWKWKKCKKL
>mutated_240:0.9803245067596436
FDWWKWKKKKKL
>mutated_244:0.9829104542732239
FDWWKWKKKKKK
>mutated_268:0.9848653078079224
FDWWWWKKKKKK
>mutated_337:0.985122203826

'CFDWKCWWWCCCC.'

In [None]:
# NOTE(review): `df_hallucinated` is not assigned in any visible cell — confirm
# it is defined before this cell is run, otherwise this raises NameError.
df_hallucinated