In [1]:
import tensorflow as tf
import keras
from keras import layers
import os
import csv
import numpy as np



In [2]:
def read(filepath:str, label=None) -> list:
    '''
    Reads dataset in .fa format.

    Every line that does not start with ">" (header lines) is treated as one
    sequence. Surrounding whitespace is stripped and blank lines are skipped,
    so they never turn into empty sequences. Sequences wrapped over several
    lines are NOT joined: each line becomes its own entry.

    Args:
        filepath: path of the file to read.
        label: unused; kept so existing callers that pass it keep working.

    Returns:
        A list of sequence strings in file order (empty list for an empty file).
    '''
    # The context manager closes the file; no explicit close() is needed.
    with open(filepath, 'r') as file:
        stripped_lines = (line.strip() for line in file)
        return [
            line for line in stripped_lines
            if line and not line.startswith(">")
        ]

# Position in this string is the integer code of each residue letter:
# 'X' -> 0, then 'A' -> 1, 'C' -> 2, ... 'Y' -> 20.
_AMINO_ALPHABET = 'XACDEFGHIKLMNPQRSTVWY'
# Built once at definition time instead of on every call.
_AMINO_TO_INDEX = {residue: code for code, residue in enumerate(_AMINO_ALPHABET)}


def conv_amino_to_vector(sequence):
    '''
    Converts a sequence of residue letters into a list of integer codes.

    Lookup is case-insensitive ('a' and 'A' both map to 1).

    Args:
        sequence: string (or any iterable of single-character strings).

    Returns:
        A list of ints in the range 0..20, one per input character.

    Raises:
        KeyError: if a character is not one of the 21 supported letters.
    '''
    encoded = []
    for position, residue in enumerate(sequence):
        try:
            encoded.append(_AMINO_TO_INDEX[residue.upper()])
        except KeyError:
            # Same exception type as a plain dict lookup, but says where it failed.
            raise KeyError(
                f"unsupported residue {residue!r} at position {position}"
            ) from None
    return encoded

In [3]:
# Load the two training sets with the same reader: the AMP file first, then the DECOY file.
raw_data, raw_data_neg = map(read, ('data/AMP.tr.fa', 'data/DECOY.tr.fa'))
# Cell output: number of sequences read from the DECOY file.
len(raw_data_neg)

712

In [4]:
# Number of sequences per batch; used for the Dataset generator, the TensorSpec
# shapes of the tf.data pipeline, and the model.fit call further down.
batch_size = 12
class Dataset:
    '''
    Callable batch generator over labelled, integer-encoded sequences.

    Each sequence is left-padded with 'X' to ``seq_len`` characters, encoded
    with ``conv_amino_to_vector`` and stored with its label (1 for positives,
    0 for negatives). Calling the instance returns a generator that yields one
    full pass of ``(X, y)`` batches in the current shuffled order; the order is
    reshuffled only after the pass is finished.

    Args:
        batch_size: number of samples per batch. Samples that do not fill a
            whole batch are dropped in each pass.
        positives: sequences labelled 1. Defaults to the notebook-level
            ``raw_data`` list.
        negatives: sequences labelled 0. Defaults to the notebook-level
            ``raw_data_neg`` list.
        seq_len: fixed length every sequence is padded to.

    Raises:
        ValueError: if a sequence is longer than ``seq_len`` (padding never
            truncates, so such a sequence would produce ragged batches).
    '''
    def __init__(self, batch_size=32, positives=None, negatives=None, seq_len=200):
        self.batch_size = batch_size # specifies how big to make batch
        self.seq_len = seq_len
        # Fall back to the notebook globals so Dataset(batch_size=...) keeps working.
        if positives is None:
            positives = raw_data
        if negatives is None:
            negatives = raw_data_neg

        self.data_train = []
        # Positives first, then negatives, matching the original storage order.
        for label, sequences in ((1, positives), (0, negatives)):
            for seq in sequences:
                if len(seq) > seq_len:
                    raise ValueError(
                        f"sequence of length {len(seq)} exceeds seq_len={seq_len}"
                    )
                padded = seq.rjust(seq_len, 'X')
                self.data_train.append((conv_amino_to_vector(padded), label))

        self.indices = np.arange(len(self.data_train))
        np.random.shuffle(self.indices)

    def __len__(self):
        '''Number of full batches in one pass.'''
        return len(self.data_train) // self.batch_size

    def __getitem__(self, index):
        '''
        Returns batch number ``index`` of the current shuffled order as
        ``(X, y)`` int32 arrays of shape (batch, seq_len) and (batch,).
        '''
        start = index*self.batch_size
        Xs = []
        Ys = []
        for i in self.indices[start:start+self.batch_size]:
            x,y = self.data_train[i]
            Xs.append(x)
            Ys.append(y)
        return np.array(Xs, dtype = np.int32), np.array(Ys, dtype = np.int32)

    def __call__(self):
        '''Yields every full batch once, then reshuffles for the next pass.'''
        for i in range(len(self)):
            yield self[i]
        # Must stay outside the loop: reshuffling between batches would let
        # samples repeat or be skipped within the same pass.
        self.on_epoch_end()

    def on_epoch_end(self):
        '''Shuffles the sample order in place.'''
        np.random.shuffle(self.indices)
# Wrap the Python batch generator in a tf.data pipeline. Every element is one
# already-batched (X, y) pair, so both specs carry the batch dimension.
dataset = tf.data.Dataset.from_generator(
    Dataset(batch_size = batch_size),
    output_signature=(
        tf.TensorSpec(shape=(batch_size, 200), dtype=tf.int32),
        # Trailing comma makes this a real 1-tuple; `(batch_size)` is just an int.
        tf.TensorSpec(shape=(batch_size,), dtype=tf.int32),
    )
)

# Sanity check: print the first batch only.
for x,y in dataset:
    print(x, y)
    break

tf.Tensor(
[[ 0  0  0 ... 16 18 17]
 [ 0  0  0 ... 12 19  2]
 [ 0  0  0 ...  6 16  6]
 ...
 [ 0  0  0 ...  3 10 10]
 [ 0  0  0 ...  9 10 13]
 [ 0  0  0 ...  4 14  8]], shape=(12, 200), dtype=int32) tf.Tensor([0 1 0 0 1 0 0 0 1 0 0 0], shape=(12,), dtype=int32)


In [5]:
# The encoding produces integer tokens 0..20, so the embedding table needs 21
# rows; the sequence length (200) belongs to the input shape, not the vocabulary.
VOCAB_SIZE = 21
SEQ_LEN = 200

model = keras.Sequential(
    [
        keras.Input(shape = (SEQ_LEN,), dtype = "int32"),
        layers.Embedding(VOCAB_SIZE, 128, name="embed"),
        layers.Conv1D(filters = 64, kernel_size = 16, activation="relu", name="conv"),
        layers.MaxPooling1D(pool_size = 5, name = 'pooling'),
        layers.LSTM(units = 100, unroll = True, stateful = False, dropout = 0.1, name = 'lstm'),
        # Single sigmoid unit: probability of the positive class (label 1).
        layers.Dense(1, activation = 'sigmoid')
    ]
)

adam = keras.optimizers.Adam()
# `metrics` is passed as a list, the form documented for Model.compile.
model.compile(loss='binary_crossentropy', metrics = ['accuracy'], optimizer=adam)

In [6]:
# `dataset` already yields complete batches, so `batch_size` is not passed here
# (Keras documents that it must not be specified for dataset/generator inputs).
model.fit(dataset, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x14c377bb0d0>

In [7]:
# Fetch a single batch directly instead of iterating the whole dataset just to
# keep its last element.
# NOTE: this batch comes from the data the model was fitted on, so this is a
# smoke test of the predictions, not an evaluation.
x, y = next(iter(dataset))
print(model.predict(x),y)

[[9.9657983e-01]
 [9.7498298e-01]
 [9.9999976e-01]
 [9.9999875e-01]
 [9.9999267e-01]
 [9.9981344e-01]
 [4.3373186e-02]
 [2.4834611e-03]
 [9.9999976e-01]
 [4.0834875e-05]
 [9.9179977e-01]
 [4.8334587e-05]] tf.Tensor([1 1 1 1 1 1 0 0 1 0 1 0], shape=(12,), dtype=int32)
