### Construct simple CNN model (1D Convolutional Network)

In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Layer, Input, Embedding, Conv1D, BatchNormalization, LeakyReLU, MaxPooling1D, Flatten, Dense, Softmax
from tensorflow.keras.optimizers import Adam

2022-06-08 04:01:58.984500: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


#### Prepare Training Datset

In [5]:
train_5k = pd.read_feather('./../dataset/objs/train_5k.ftr')
test_5k = pd.read_feather('./../dataset/objs/test_5k.ftr')

train_5k, validate_5k = train_test_split(train_5k, train_size = 0.8, random_state=42)

train_X = train_5k['Sequence']
train_y = train_5k['Class']

validate_X = validate_5k['Sequence']
validate_y = validate_5k['Class']

test_X = test_5k['Sequence']
test_y = test_5k['Class']

print(train_5k.head(), '\n') # 80% of train set
print(validate_5k.head(), '\n') # 20% of train set
print(test_5k.head(), '\n') # 100% of test set
print(train_5k.shape, validate_5k.shape, test_5k.shape, sep='\t')

                                                 Sequence        RBP Class
80583   GTCTGCTGTCTCTACACCATCTCCATCATGGGCAATACCACCATCC...   hnRNPC-2     0
11124   AAAAAAAGTTCAAGAGTGTAACTAGTTCACCCCAAGGTAGTGTGTG...     Ago2-1     0
26845   TGAAGATGAGAAACTTCAAGGCAAGATTAACGATGAGGACAAACAG...  eIF4III-1     1
81415   CCCACCCCATCCCAGGTCACCACCTGGCTGAACCCAGGTCCCCGAC...   hnRNPC-2     0
130220  CTGTTCCTATATGCTTCTTAGAATCCTTAAGCCACCTCTCTTGCCT...     TDP-43     0 

                                                 Sequence           RBP Class
58619   GCAGACTTACCATGCCAAAGTGAGCTCTCTTCAGAAGATTCTTTTG...         ESWR1     1
6660    AAATTTGAATAGGAATTGGGTATGAAATCATACAAAGATGATCTAT...    Ago2-MNase     0
42266   GTAGCACCCCGAAGTAGAGCTTTCTGCTCTGCTCCTGGAAAAGGCT...  ELAVL1-MNase     1
142944  CCAGGCGGGGTCAGTGTTGCGCACTGGGGATAGTGCCTCTGCTCGG...         TIAL1     0
119316  ATGTGTAGTCATGGTTTTGATTTTTATTTACACCTTTTGAAATTTG...           QKI     0 

                                            Sequence      RBP Class
0  TTAATTG

In [3]:
set([x for seq in train_X.tolist() for x in seq])

{'A', 'C', 'G', 'N', 'T'}

#### One Hot Encoding & Embedding

In [6]:
# Check bases
bases = ''.join(sorted(set([x for seq in train_X.tolist() for x in seq])))

base_dict = dict((v, k) for (k, v) in dict(enumerate(bases)).items())

train_X_int = [[bases.index(c) for c in seq] for seq in train_X.tolist()]

train_X_onehot = np.eye(len(bases))[train_X_int]

In [40]:
print(bases)
print(train_X_onehot.shape)

ACGNT
(124000, 101, 5)


In [14]:
train_y.head()

80583     0
11124     0
26845     1
81415     0
130220    0
Name: Class, dtype: category
Categories (2, int64): [0, 1]

#### Model Construction

##### Model Configuration

In [13]:
model_config = dict()
model_config['num_filters'] = 64
model_config['kernel_size'] = 3 # codon length == 3
model_config['strides'] = 3 # next codon starts at 3 + previous_codon
model_config['pad_opt'] = 'same'

##### Set Hyperparameters

In [8]:
epoch = 50
lr = 0.001

##### Make 1D-CNN Block

In [9]:
class block1d(Layer):
    def __init__(self, **kwargs): # kwargs allow giving parameters
        super(block1d, self).__init__()

        self.conv1d = Conv1D(
            filters=num_filters,
            kernel_size=kernel_size,
            strides=strides,
            padding=pad_opt
        )
        self.bnrm = BatchNormalization()
        self.lrelu = LeakyReLU()
        self.mxpl1d = MaxPooling1D()
    
    def call(self, inputs):
        x = self.conv1d(inputs)
        x = self.bnrm(x)
        x = self.lrelu(x)
        x = self.mxpl1d(x)

        return x

In [None]:
class blockMLP(Layer):
    def __init__(self):
        super(blockMLP, self).__init__()

        self.dense

##### Define Model using pre-defined Blocks

In [11]:
# 1D Convolution Network with One Hot Encoding
class model_1dcnn_onehot(Model):

    def __init__(self):
        super(model_1dcnn_onehot, self).__init__()

        self.num_blocks = 3
        self.blocks1d = [block1d() for i in range(self.num_blocks)] # stack up 3 1D-CNN layers

        self.flatten = Flatten()
        self.dense_h1 = Dense(64)
        self.dense_out = Dense(2) # use 2 output layers for Softmax() ; 1 output for sigmoid()
        self.softmax = Softmax()

    def call(self, inputs):
        # convolution layer
        x = inputs
        for blk in self.blocks1d:
            x = blk(x)

        # FC layer
        x = self.flatten(x)
        x = self.dense_h1(x)
        x = self.dense_out(x)
        
        output = self.softmax(x)

        return output

In [None]:
# 1D Convolution Network with Embedding Layer
class model_1dcnn_embed(Model):

    def __init__(self):
        super(model_1dcnn_embed, self).__init__()

In [14]:
num_filters = 64
kernel_size = 3 # codon length == 3
strides = 3 # next codon starts at 3 + previous_codon
pad_opt = 'same'

test_model = model_1dcnn_onehot()
test_model.build(input_shape=(None, 101, 5))

ValueError: in user code:

    /tmp/ipykernel_6024/3545025351.py:19 call  *
        x = self.mxpl1d(x)
    /home/myrmecia/anaconda3/envs/ML/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:1012 __call__  **
        outputs = call_fn(inputs, *args, **kwargs)
    /home/myrmecia/anaconda3/envs/ML/lib/python3.8/site-packages/tensorflow/python/keras/layers/pooling.py:75 call
        outputs = self.pool_function(
    /home/myrmecia/anaconda3/envs/ML/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /home/myrmecia/anaconda3/envs/ML/lib/python3.8/site-packages/tensorflow/python/keras/backend.py:5696 pool2d
        x = nn.max_pool(
    /home/myrmecia/anaconda3/envs/ML/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /home/myrmecia/anaconda3/envs/ML/lib/python3.8/site-packages/tensorflow/python/ops/nn_ops.py:4606 max_pool
        return gen_nn_ops.max_pool(
    /home/myrmecia/anaconda3/envs/ML/lib/python3.8/site-packages/tensorflow/python/ops/gen_nn_ops.py:5327 max_pool
        _, _, _op, _outputs = _op_def_library._apply_op_helper(
    /home/myrmecia/anaconda3/envs/ML/lib/python3.8/site-packages/tensorflow/python/framework/op_def_library.py:748 _apply_op_helper
        op = g._create_op_internal(op_type_name, inputs, dtypes=None,
    /home/myrmecia/anaconda3/envs/ML/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py:590 _create_op_internal
        return super(FuncGraph, self)._create_op_internal(  # pylint: disable=protected-access
    /home/myrmecia/anaconda3/envs/ML/lib/python3.8/site-packages/tensorflow/python/framework/ops.py:3528 _create_op_internal
        ret = Operation(
    /home/myrmecia/anaconda3/envs/ML/lib/python3.8/site-packages/tensorflow/python/framework/ops.py:2015 __init__
        self._c_op = _create_c_op(self._graph, node_def, inputs,
    /home/myrmecia/anaconda3/envs/ML/lib/python3.8/site-packages/tensorflow/python/framework/ops.py:1856 _create_c_op
        raise ValueError(str(e))

    ValueError: Negative dimension size caused by subtracting 2 from 1 for '{{node block1d_3/max_pooling1d_2/MaxPool}} = MaxPool[T=DT_FLOAT, data_format="NHWC", explicit_paddings=[], ksize=[1, 2, 1, 1], padding="VALID", strides=[1, 2, 1, 1]](block1d_3/max_pooling1d_2/ExpandDims)' with input shapes: [?,1,1,64].


In [None]:
# User might give customly separated train / test set
# user might give 
def train_model(df, model_config, epoch, learning_rate, onehot=True, save=False):

In [None]:
inputs = Input(shape=())