In [1]:
import pandas as pd

In [2]:
positive_seqs = pd.read_csv('./../dataset/lin28/uniq.SRR458758.txt', header=None, names=['Sequence'])

In [3]:
positive_seqs_ln = positive_seqs[positive_seqs['Sequence'].str.len() > 70]

positive_seqs_ln.head()

Unnamed: 0,Sequence
56,12 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...
57,31 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...
58,35 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...
59,45 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...
60,117 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...


In [4]:
positive_seqs_ln.shape

(2257841, 1)

In [4]:
# random sampling
seq_pos = positive_seqs_ln.sample(n=10000, replace=False, random_state=42)
seq_pos['Sequence'] = [''.join(filter(str.isalpha, seq)) for seq in seq_pos['Sequence']]

In [5]:
seq_len = [len(x) for x in seq_pos['Sequence']]
max(seq_len)

78

In [6]:
# make every sequence same length
MAX_LEN = 78

def padding_seq(seq):
    gap = MAX_LEN - len(seq)
    res = 'N'*int(gap/2) + seq + 'N'*int(gap/2)
    
    if gap % 2 == 1:
        res += 'N'
    
    return res

In [7]:
seqs = [padding_seq(seq) for seq in seq_pos['Sequence']]

In [8]:
seqs[:5]

['GTTGTAAAACCTGTTCACCAGCACCTGCGGAATTGTTGCCCTGCCTTCATTTCTTTCCCAGCCTGTAGGCACCATCAA',
 'ATTCTCTTCTGTGTGTCACCAAGTTCAGGCCTGGTGGCCAGGCCTGCTCTGTAGGCACCATCAATCGTATGCCGTCTT',
 'NAGAACGGCTGGTCTTTCATCAGAATTCCAGGCAAGAAGAGCCTCATCACTGACAGTGGAAAGCCTGTAGGCACCANN',
 'TGGGATGTGTGTGTGTGTTACTGGGATGGGGTGAGTGTGTGTGTGTGTGTGCTCTCTCGCGCATATGCACACAAGCCT',
 'AGCTTAACTCATTTGAATGTTGTTGCTTCTGAGTTTAGGCTAACGGAAGTCTGTAGGCACCATCAATTCGTATGCCGT']

In [9]:
seq_pos['Sequence'] = seqs
seq_pos['y'] = '1'

In [10]:
print(max([len(x) for x in seq_pos['Sequence']]), min([len(x) for x in seq_pos['Sequence']]))

78 78


In [11]:
# random sequence for negative sample
import random

seq_len = 78

ranseq = []
for i in range(10000):
    ranseq.append(''.join(random.choices(['A', 'C', 'T', 'G', 'N'], k=seq_len)))

In [12]:
ranseq[:5]

['AACTNTCAGCCTNCATTNGANGTANTTGGNGGGNCAACGCCCATNNGCCACTGTGAGGTNANNAAGTTTGGTGNCAAG',
 'TGAGAGTATGNCGGNTTTTGTTTNNGAGTCGATGATTTGTCCNNGACGCTACTCNNNCCTNNAAAGCNTAGTGCANTT',
 'TTACNCTCGCGCTCGCTTATGTGCTGTTTNTNAANCTAGGGGCTGTAATGAGCATTTNTNACNGNGGNANTTAGGCNA',
 'TCAGAGATGCNCATATCTNGNGGNCGGGCNATTTNGTCGCNCNAAATTNCNAGNTNTANGTTGNNANTCGCGTNNNAC',
 'TANAAAAGCANGAAAAATNGGCNGNCCNCCAGTCNNCCTTTAANGNTANATNCNANCNCNTGNTNGTNGTGCNCNGGA']

In [13]:
seq_neg = pd.DataFrame(ranseq, columns=['Sequence'])
seq_neg['y'] = '0'

seq_neg.head()

Unnamed: 0,Sequence,y
0,AACTNTCAGCCTNCATTNGANGTANTTGGNGGGNCAACGCCCATNN...,0
1,TGAGAGTATGNCGGNTTTTGTTTNNGAGTCGATGATTTGTCCNNGA...,0
2,TTACNCTCGCGCTCGCTTATGTGCTGTTTNTNAANCTAGGGGCTGT...,0
3,TCAGAGATGCNCATATCTNGNGGNCGGGCNATTTNGTCGCNCNAAA...,0
4,TANAAAAGCANGAAAAATNGGCNGNCCNCCAGTCNNCCTTTAANGN...,0


In [14]:
seq_pos.head()

Unnamed: 0,Sequence,y
7601271,GTTGTAAAACCTGTTCACCAGCACCTGCGGAATTGTTGCCCTGCCT...,1
5073481,ATTCTCTTCTGTGTGTCACCAAGTTCAGGCCTGGTGGCCAGGCCTG...,1
2669291,NAGAACGGCTGGTCTTTCATCAGAATTCCAGGCAAGAAGAGCCTCA...,1
8012620,TGGGATGTGTGTGTGTGTTACTGGGATGGGGTGAGTGTGTGTGTGT...,1
3455182,AGCTTAACTCATTTGAATGTTGTTGCTTCTGAGTTTAGGCTAACGG...,1


In [15]:
seqs = pd.concat([seq_pos, seq_neg])
seqs = seqs.sample(frac=1).reset_index(drop=True)
seqs.head()

Unnamed: 0,Sequence,y
0,GAAGGTTTATCTCCTATACGCCTGCGCCTTCNTNNACANAAGCTCG...,0
1,AGGAAGAACGACCAAAGGAAAAGAGGGGACGCCCTCGGCACGGTGG...,1
2,GNNCNANNCGCTCTGAGTCGTCGCNGGGAAAANTNACCNACNAGAG...,0
3,TCCCGATGAGCATCAGGAAGGCGAAGCTGAGTGTGCGGGGGGGGGT...,1
4,CTCGCGCCGGGCCGTACCCATATCCGCAGCAGGTCTCCAAGGTGAA...,1


In [16]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model, regularizers, constraints
from tensorflow.keras.layers import Layer, Input, Embedding, Conv1D, Conv2D, BatchNormalization, ReLU, LeakyReLU, MaxPooling1D, Flatten, Dense, Softmax, Dropout, MaxPooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

2022-06-10 07:03:11.451654: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [17]:
train_X = seqs['Sequence'][:15000]
train_y = seqs['y'][:15000]

test_X = seqs['Sequence'][15000:]
test_y = seqs['y'][15000:]

In [18]:
BASE=sorted("ACGTN")

train_X_int = [[BASE.index(c) for c in seq] for seq in train_X.tolist()]
train_X_onehot = np.eye(len(BASE))[train_X_int]

test_X_int = [[BASE.index(c) for c in seq] for seq in test_X.tolist()]
test_X_onehot = np.eye(len(BASE))[test_X_int]

train_X_onehot_ = train_X_onehot[:, :, :, np.newaxis]
test_X_onehot_ = test_X_onehot[:, :, :, np.newaxis]

In [77]:
train_X_onehot_.shape

(15000, 78, 5, 1)

In [78]:
test_X_onehot_.shape

(5000, 78, 5, 1)

In [19]:
train_X_onehot_ = np.array(train_X_onehot_)
test_X_onehot_ = np.array(test_X_onehot_)

train_y = train_y.astype('int32')
test_y = test_y.astype('int32')

In [21]:
train_y[:5]

0    0
1    1
2    0
3    1
4    1
Name: y, dtype: int32

In [24]:
mymodel = tf.keras.Sequential([
    Conv2D(1, (14, 5), padding='same', strides=1, input_shape=(78, 5, 1), kernel_regularizer=regularizers.l1(3e-3), kernel_constraint=constraints.NonNeg()),
    BatchNormalization(),
    ReLU(),
    MaxPooling2D(),
    
    Flatten(),
    Dense(1, activation='sigmoid')
])

2022-06-10 07:05:39.368283: W tensorflow/core/common_runtime/bfc_allocator.cc:433] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.0KiB (rounded to 1280)requested by op ScratchBuffer
Current allocation summary follows.
2022-06-10 07:05:39.368444: I tensorflow/core/common_runtime/bfc_allocator.cc:972] BFCAllocator dump for GPU_0_bfc
2022-06-10 07:05:39.368482: I tensorflow/core/common_runtime/bfc_allocator.cc:979] Bin (256): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2022-06-10 07:05:39.368505: I tensorflow/core/common_runtime/bfc_allocator.cc:979] Bin (512): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2022-06-10 07:05:39.368559: I tensorflow/core/common_runtime/bfc_allocator.cc:979] Bin (1024): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2022-06-10 07:05:39.368714: 

FailedPreconditionError: Failed to allocate scratch buffer for device 0 [Op:VarHandleOp] name: Variable/

In [21]:
mymodel.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 78, 5, 1)          71        
_________________________________________________________________
batch_normalization (BatchNo (None, 78, 5, 1)          4         
_________________________________________________________________
re_lu (ReLU)                 (None, 78, 5, 1)          0         
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 39, 2, 1)          0         
_________________________________________________________________
flatten (Flatten)            (None, 78)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 79        
Total params: 154
Trainable params: 152
Non-trainable params: 2
__________________________________________________________

In [22]:
mymodel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [38]:
mymodel.fit(train_X_onehot_, train_y, epochs=20, validation_split=0.2)

Epoch 1/20


2022-06-10 07:00:40.359769: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10
2022-06-10 07:00:40.814453: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.7


: 

: 