### Construct simple CNN model (1D Convolutional Network)

In [81]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Layer, Input, Embedding, Conv1D, BatchNormalization, ReLU, LeakyReLU, MaxPooling1D, Flatten, Dense, Softmax, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

#### Prepare Training Dataset

In [2]:
# Read the train / test tables from their feather files.
train_5k = pd.read_feather('./../dataset/objs/train_5k.ftr')
test_5k = pd.read_feather('./../dataset/objs/test_5k.ftr')

# Carve a validation set out of the training table: 80% stays in train_5k,
# the remaining 20% becomes validate_5k (fixed seed so the split is repeatable).
train_5k, validate_5k = train_test_split(train_5k, train_size=0.8, random_state=42)

# Separate each table into inputs (sequence strings) and targets (class labels).
train_X, train_y = train_5k['Sequence'], train_5k['Class']
validate_X, validate_y = validate_5k['Sequence'], validate_5k['Class']
test_X, test_y = test_5k['Sequence'], test_5k['Class']

# Preview the first rows of train (80% of train set), validation (20% of
# train set) and test (100% of test set), then the three table shapes.
for frame in (train_5k, validate_5k, test_5k):
    print(frame.head(), '\n')
print(train_5k.shape, validate_5k.shape, test_5k.shape, sep='\t')

                                                 Sequence        RBP Class
80583   GTCTGCTGTCTCTACACCATCTCCATCATGGGCAATACCACCATCC...   hnRNPC-2     0
11124   AAAAAAAGTTCAAGAGTGTAACTAGTTCACCCCAAGGTAGTGTGTG...     Ago2-1     0
26845   TGAAGATGAGAAACTTCAAGGCAAGATTAACGATGAGGACAAACAG...  eIF4III-1     1
81415   CCCACCCCATCCCAGGTCACCACCTGGCTGAACCCAGGTCCCCGAC...   hnRNPC-2     0
130220  CTGTTCCTATATGCTTCTTAGAATCCTTAAGCCACCTCTCTTGCCT...     TDP-43     0 

                                                 Sequence           RBP Class
58619   GCAGACTTACCATGCCAAAGTGAGCTCTCTTCAGAAGATTCTTTTG...         ESWR1     1
6660    AAATTTGAATAGGAATTGGGTATGAAATCATACAAAGATGATCTAT...    Ago2-MNase     0
42266   GTAGCACCCCGAAGTAGAGCTTTCTGCTCTGCTCCTGGAAAAGGCT...  ELAVL1-MNase     1
142944  CCAGGCGGGGTCAGTGTTGCGCACTGGGGATAGTGCCTCTGCTCGG...         TIAL1     0
119316  ATGTGTAGTCATGGTTTTGATTTTTATTTACACCTTTTGAAATTTG...           QKI     0 

                                            Sequence      RBP Class
0  TTAATTG

In [3]:
# Distinct characters that occur anywhere in the training sequences.
{base for seq in train_X.tolist() for base in seq}

{'A', 'C', 'G', 'N', 'T'}

#### One Hot Encoding & Embedding

In [4]:
# Alphabet of characters seen in the training sequences, as one sorted string.
bases = ''.join(sorted({base for seq in train_X.tolist() for base in seq}))

# Lookup table: base character -> its position in `bases`.
base_dict = {base: position for position, base in enumerate(bases)}

# Integer-encode every sequence via the lookup table ...
train_X_int = [[base_dict[c] for c in seq] for seq in train_X.tolist()]

# ... then pick rows of an identity matrix to obtain the one-hot encoding.
identity = np.eye(len(bases))
train_X_onehot = identity[train_X_int]

In [5]:
# Show the alphabet and the shape of the one-hot tensor, one per line.
print(bases, train_X_onehot.shape, sep='\n')

ACGNT
(124000, 101, 5)


In [6]:
# Peek at the first few training labels (the 'Class' column of train_5k).
train_y.head()

80583     0
11124     0
26845     1
81415     0
130220    0
Name: Class, dtype: category
Categories (2, int64): [0, 1]

##### Try using only a single RBP

In [57]:
# Re-read both tables from disk so that `train_5k` is the complete training
# table again (the earlier split cell rebinds it to the 80% portion).
train_5k, test_5k = map(
    pd.read_feather,
    ('./../dataset/objs/train_5k.ftr', './../dataset/objs/test_5k.ftr'),
)

In [63]:
# Keep only the rows whose RBP column is 'Ago-EIF', for train and test alike.
ago_eif_train = train_5k.loc[train_5k['RBP'] == 'Ago-EIF']
ago_eif_test = test_5k.loc[test_5k['RBP'] == 'Ago-EIF']

# Row / column count of the filtered training table.
ago_eif_train.shape

(5000, 3)

In [70]:
# First ten Ago-EIF rows of the training table.
train_5k.loc[train_5k['RBP'] == 'Ago-EIF'].head(10)

Unnamed: 0,Sequence,RBP,Class
0,AAGGGGCTAGGATGAGTTTCTGAATCTCCCAAGGGCGAGATTTCGG...,Ago-EIF,0
1,CCTGAAGCAGCAAGTGAGCGGGCTGGAGGGTGTGCAGGACGACCTG...,Ago-EIF,0
2,CACGGCTCCCCCTCGGCCTATTACACGCGTGCGCAGCCAGGCCTCG...,Ago-EIF,0
3,ATACAAGCAGGAGCACATCGCTCTTTTATGAAAGCCCTTCAACATT...,Ago-EIF,0
4,TCCCTTCAAAGGCGACAGACCCAAGCCCACGTCAGGAGAGGAGCGT...,Ago-EIF,0
5,GCTTTGCAACCCTTGTGTTACGGTGCACAGGTGTGCAAAAATTCTC...,Ago-EIF,0
6,TGGGTGATTCCTGAGCAAGCATGCTGCTGTCTCTCTGGCTCTGGGG...,Ago-EIF,0
7,TGCAGACCCCTGCGGCCAGGGCGAGGACGGATCTGAGCAGCTGGGC...,Ago-EIF,0
8,CAGCTGCTGTGGAAAATAGTCTGGCAGTTTCTCAACAATTACACAG...,Ago-EIF,0
9,TTATGGCCAACACTTCCATTTATTTATCAACAGATTCACCCGTGTC...,Ago-EIF,0


In [95]:
# Restrict both tables to the Ago-EIF rows, computing each selection once.
# A fixed split (first 4000 rows for training, the remainder as
# valid_X / valid_y) was tried before and is disabled here: every Ago-EIF
# training row goes into train_X / train_y.
ago_train_rows = train_5k[train_5k['RBP'] == 'Ago-EIF']
ago_test_rows = test_5k[test_5k['RBP'] == 'Ago-EIF']

train_X, train_y = ago_train_rows['Sequence'], ago_train_rows['Class']
test_X, test_y = ago_test_rows['Sequence'], ago_test_rows['Class']


def _to_base_indices(sequences):
    """Return, for each sequence string, the list of its characters' positions in `bases`."""
    return [[bases.index(c) for c in seq] for seq in sequences.tolist()]


# Integer-encode, then select identity-matrix rows to one-hot encode.
train_X_int = _to_base_indices(train_X)
train_X_onehot = np.eye(len(bases))[train_X_int]

test_X_int = _to_base_indices(test_X)
test_X_onehot = np.eye(len(bases))[test_X_int]

In [92]:
# `valid_X` is otherwise only assigned in commented-out code, so on a fresh
# kernel (Restart & Run All) this cell failed with NameError. Rebuild it here
# the same way that disabled code did: the Ago-EIF training sequences from
# row position 4000 onward.
# NOTE(review): this is an inspection-only preview; nothing else in the
# visible notebook reads `valid_X`.
valid_X = train_5k[train_5k['RBP'] == 'Ago-EIF'][4000:]['Sequence']

# Summary statistics (count / unique / top / freq) of the first five sequences.
valid_X.head().describe()

count                                                     5
unique                                                    5
top       CTATAAAGTTCAGGACAGTTTGAAATAAAACCCAGGAAACAAGATT...
freq                                                      1
Name: Sequence, dtype: object

#### Model Construction

In [50]:
# Functional-API CNN over one-hot sequences of shape (101 positions, 5 channels).
mymodel_inputs = Input(shape=(101, 5))

# Two convolutional stages that differ only in filter count and kernel width:
# Conv1D -> BatchNorm -> ReLU -> MaxPool (stride 2) -> light dropout.
x = mymodel_inputs
for n_filters, kernel_width in ((64, 19), (128, 5)):
    x = Conv1D(filters=n_filters, kernel_size=kernel_width, strides=1, padding='same')(x)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    x = MaxPooling1D(strides=2)(x)
    x = Dropout(.1)(x)

# Classifier head: flatten, one 512-unit hidden layer with heavier dropout,
# then a 2-unit layer turned into class probabilities by softmax.
x = Flatten()(x)
x = Dense(units=512)(x)
x = BatchNormalization()(x)
x = ReLU()(x)
x = Dropout(.5)(x)
x = Dense(units=2)(x)
mymodel_outputs = Softmax()(x)

mymodel = Model(inputs=mymodel_inputs, outputs=mymodel_outputs)

In [51]:
# Print the layer-by-layer table (output shapes, parameter counts) of the functional model.
mymodel.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 101, 5)]          0         
                                                                 
 conv1d_17 (Conv1D)          (None, 101, 64)           6144      
                                                                 
 batch_normalization_22 (Bat  (None, 101, 64)          256       
 chNormalization)                                                
                                                                 
 re_lu_10 (ReLU)             (None, 101, 64)           0         
                                                                 
 max_pooling1d_16 (MaxPoolin  (None, 50, 64)           0         
 g1D)                                                            
                                                                 
 dropout_6 (Dropout)         (None, 50, 64)            0   

In [96]:
# Same architecture as the functional model, expressed as a Sequential stack.
# The first layer carries the input shape: 101 positions x 5 one-hot channels.
conv_stage_1 = [
    Conv1D(filters=64, kernel_size=19, strides=1, padding='same', input_shape=(101, 5)),
    BatchNormalization(),
    ReLU(),
    MaxPooling1D(strides=2),
    Dropout(.1),
]

conv_stage_2 = [
    Conv1D(filters=128, kernel_size=5, strides=1, padding='same'),
    BatchNormalization(),
    ReLU(),
    MaxPooling1D(strides=2),
    Dropout(.1),
]

# Dense classifier head ending in a 2-way softmax.
classifier_head = [
    Flatten(),
    Dense(units=512),
    BatchNormalization(),
    ReLU(),
    Dropout(.5),
    Dense(units=2),
    Softmax(),
]

mymodel2 = tf.keras.models.Sequential(conv_stage_1 + conv_stage_2 + classifier_head)

In [97]:
# Print the layer-by-layer table (output shapes, parameter counts) of the Sequential model.
mymodel2.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_31 (Conv1D)          (None, 101, 64)           6144      
                                                                 
 batch_normalization_43 (Bat  (None, 101, 64)          256       
 chNormalization)                                                
                                                                 
 re_lu_31 (ReLU)             (None, 101, 64)           0         
                                                                 
 max_pooling1d_30 (MaxPoolin  (None, 50, 64)           0         
 g1D)                                                            
                                                                 
 dropout_27 (Dropout)        (None, 50, 64)            0         
                                                                 
 conv1d_32 (Conv1D)          (None, 50, 128)          

In [102]:
# The network ends in Dense(2) + Softmax and the targets are a single integer
# class id (0 or 1) per sequence, so the matching loss is *sparse categorical*
# cross-entropy. 'binary_crossentropy' expects targets with the same shape as
# the output (one probability per label); with a (batch, 2) softmax output and
# one label per sample it either fails on the shape mismatch or compares the
# same label against both output units, depending on the Keras version.
# With this loss, the 'accuracy' metric is resolved to its sparse categorical form.
mymodel2.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
#early_stopping = EarlyStopping(monitor='accuracy', patience=20)

In [None]:
# `train_y` is a pandas categorical Series; hand Keras a plain integer array.
labels = train_y.astype('int32').to_numpy()

# Keras' `validation_split` takes the LAST fraction of the arrays before any
# shuffling. The Ago-EIF rows look ordered by class (the first rows displayed
# are all class 0 - TODO confirm), in which case the tail 20% would not be
# representative. Use an explicit shuffled, class-stratified 80/20 split
# instead, with a fixed seed so the run is repeatable.
fit_X, val_X, fit_y, val_y = train_test_split(
    train_X_onehot,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

# Early stopping is still disabled, as before: #, callbacks=[early_stopping]
mymodel2.fit(fit_X, fit_y, epochs=500, validation_data=(val_X, val_y))

In [None]:
# Work-in-progress third model: so far a single Conv1D layer with one filter
# and a kernel of width 10.
mymodel3 = tf.keras.Sequential([Conv1D(filters=1, kernel_size=10, strides=1)])