In [1]:

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.regularizers import L2

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import StratifiedKFold


SEED = 123

NO_CLASSES = 20
N_FOLDS = 3

MAX_LENGTH = 100
BATCH_SIZE = 256


train = pd.read_csv('Train.csv')

train = train.iloc[:100000, :]

test = pd.read_csv('Test.csv')


In [2]:

train['seq_char_count'] = train['SEQUENCE'].apply(lambda x: len(x))

codes = {code for seq in train['SEQUENCE'] for code in seq}


def create_dict(codes):
  char_dict = {}
  for index, val in enumerate(codes):
    char_dict[val] = index+1

  return char_dict

char_dict = create_dict(codes)

print(char_dict)
print("Dict Length:", len(char_dict))


def integer_encoding(data):
  """
  - Encodes code sequence to integer values.
  - 20 common amino acids are taken into consideration
    and rest 4 are categorized as 0.
  """
  
  encode_list = []
  for row in data['SEQUENCE'].values:
    row_encode = []
    for code in row:
      row_encode.append(char_dict.get(code, 0))
    encode_list.append(np.array(row_encode))
  
  return encode_list
  
train_encode = integer_encoding(train) 

train_pad = pad_sequences(train_encode, maxlen=MAX_LENGTH, padding='post', truncating='post')

print(train_pad.shape)




{'H': 1, 'C': 2, 'V': 3, 'U': 4, 'N': 5, 'Y': 6, 'B': 7, 'D': 8, 'Q': 9, 'F': 10, 'L': 11, 'S': 12, 'T': 13, 'G': 14, 'K': 15, 'E': 16, 'I': 17, 'P': 18, 'W': 19, 'A': 20, 'X': 21, 'R': 22, 'Z': 23, 'M': 24}
Dict Length: 24
(100000, 100)


In [3]:
# One hot encoding of sequences
X = to_categorical(train_pad)
print(X.shape) 

y = train['LABEL'].str.replace('[A-Za-z]', '').astype(int)



(100000, 100, 25)


In [4]:
def residual_block(data, filters, d_rate):
  """
  _data: input
  _filters: convolution filters
  _d_rate: dilation rate
  """

  shortcut = data

  bn1 = tf.keras.layers.BatchNormalization()(data)
  act1 = tf.keras.layers.Activation('relu')(bn1)
  conv1 = tf.keras.layers.Conv1D(filters, 1, dilation_rate=d_rate, padding='same', kernel_regularizer=tf.keras.regularizers.L2(0.001))(act1)

  #bottleneck convolution
  bn2 = tf.keras.layers.BatchNormalization()(conv1)
  act2 = tf.keras.layers.Activation('relu')(bn2)
  conv2 = tf.keras.layers.Conv1D(filters, 3, padding='same', kernel_regularizer=tf.keras.regularizers.L2(0.001))(act2)

  #skip connection
  x = tf.keras.layers.Add()([conv2, shortcut])

  return x

In [5]:
# input
x_input = tf.keras.Input(shape=(100, 25))

#initial conv
conv = tf.keras.layers.Conv1D(128, 1, padding='same')(x_input) 

# per-residue representation
res1 = residual_block(conv, 128, 2)
res2 = residual_block(res1, 128, 3)

x = tf.keras.layers.MaxPooling1D(3)(res2)
x = tf.keras.layers.Dropout(0.5)(x)

# softmax classifier
x = tf.keras.layers.Flatten()(x)
x_output = tf.keras.layers.Dense(NO_CLASSES, activation='softmax', kernel_regularizer=tf.keras.regularizers.L2(0.0001))(x)

model = tf.keras.Model(inputs=x_input, outputs=x_output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 100, 25)]    0                                            
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 100, 128)     3328        input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 100, 128)     512         conv1d[0][0]                     
__________________________________________________________________________________________________
activation (Activation)         (None, 100, 128)     0           batch_normalization[0][0]        
_______________________________________________________________________________________

In [6]:

es = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3)

skf = StratifiedKFold(n_splits=N_FOLDS)

score_list = []
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", len(train_index), "TEST:", len(test_index))
    X_train, X_val = X[train_index], X[test_index]
    y_train, y_val = y[train_index], y[test_index]
    y_train = to_categorical(y_train)
    y_val = to_categorical(y_val)
    history = model.fit(X_train, y_train, epochs=50, batch_size=BATCH_SIZE,
                       validation_data=(X_val, y_val), callbacks=[es])
    best_score = max(history.history['val_accuracy'])
    score_list.append(best_score)


TRAIN: [32259 32260 32264 ... 99997 99998 99999] TEST: [    0     1     2 ... 35394 35406 35517]
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
TRAIN: [    0     1     2 ... 99997 99998 99999] TEST: [32259 32260 32264 ... 68264 68367 68376]
Epoch 1/50
Epoch 2/50
  7/261 [..............................] - ETA: 1:13 - loss: 0.9130 - accuracy: 0.8153

KeyboardInterrupt: 