# Mount Dataset

In [0]:
# Mount Google Drive into the Colab filesystem at /gdrive.
from google.colab import drive
drive.mount('/gdrive')
# Switch the working directory to the shared dataset folder: the dataset paths
# ('set2/...') and the output files written later in this notebook are relative.
%cd /gdrive/Shared\ drives/CS230/datasets

# Load required packages

In [0]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
# Seeds NumPy's global RNG.
# NOTE(review): no TensorFlow/Keras-specific seed is set in the cells shown here;
# whether this alone makes training repeatable depends on the backend version —
# TODO confirm if exact reproducibility is required.
np.random.seed(0) # Reproducibility

In [0]:
from tensorflow.python.client import device_lib

In [0]:
# Sanity check: display the GPU device name TensorFlow reports for this runtime
# (expected to be an empty string when no GPU is attached — confirm against TF docs).
tf.test.gpu_device_name()

In [0]:
# Sanity check: display the devices TensorFlow lists for this runtime.
# NOTE(review): device_lib is imported from tensorflow.python.client, which looks
# like an internal module path — verify it is still available after TF upgrades.
device_lib.list_local_devices()

# Set Global Variables

In [0]:
# Residue letter -> integer code; the code is the letter's position in this
# alphabetically ordered string of the 20 amino-acid symbols.
prot_lookup = {residue: code for code, residue in enumerate('ACDEFGHIKLMNPQRSTVWY')}

# Nucleotide letter -> integer code (also the one-hot channel index).
attx_lookup = {base: code for code, base in enumerate('ATGC')}

# Fixed sequence lengths every sample is padded to.
max_prot_len = 1700
max_attx_len = 150

# Vocabulary sizes of the two alphabets above.
prot_dim = 20
attx_dim = 4

# Data Preprocessing

In [0]:
def generate_set(line_list):
  """Encode dataset lines into fixed-size NumPy arrays.

  Each line must hold five whitespace-separated fields. The first two are
  ignored; the third is the ATTx nucleotide sequence, the fourth the protein
  sequence and the fifth an integer label.

  Sequences shorter than `max_attx_len` / `max_prot_len` are right-padded with
  zeros; longer ones are truncated to those lengths (previously they raised an
  IndexError).

  NOTE(review): in the integer-coded arrays the padding value 0 is also the
  code of 'A' in both lookup tables, so padding and 'A' are indistinguishable
  there. Only the one-hot array keeps padding distinct (all-zero rows).

  Args:
    line_list: list of data lines (header already removed by the caller).

  Returns:
    Tuple `(attx_set_1h, attx_set, prot_set, label_set)` of float arrays with
    shapes `(n, max_attx_len, attx_dim, 1)`, `(n, max_attx_len)`,
    `(n, max_prot_len)` and `(n, 1)`, where `n == len(line_list)`.

  Raises:
    ValueError: if a line does not split into exactly five fields or the
      label is not an integer.
    KeyError: if a sequence contains a symbol missing from its lookup table.
  """
  n_rows = len(line_list)

  attx_set_1h = np.zeros((n_rows, max_attx_len, attx_dim, 1))
  attx_set = np.zeros((n_rows, max_attx_len))
  prot_set = np.zeros((n_rows, max_prot_len))
  label_set = np.zeros((n_rows, 1))

  for i, line in enumerate(line_list):
    _, _, attx, prot, label = line.strip().split()
    label_set[i, 0] = int(label)

    # Translate symbols to integer codes, clipping to the fixed array width.
    attx_codes = np.asarray([attx_lookup[base] for base in attx[:max_attx_len]],
                            dtype=np.intp)
    prot_codes = np.asarray([prot_lookup[res] for res in prot[:max_prot_len]],
                            dtype=np.intp)

    # One vectorised write per array instead of a Python loop per character.
    attx_set[i, :attx_codes.size] = attx_codes
    attx_set_1h[i, np.arange(attx_codes.size), attx_codes, 0] = 1
    prot_set[i, :prot_codes.size] = prot_codes

  # Fraction of positive labels, kept as a length-1 array so the printed form
  # is unchanged; an empty input reports NaN instead of dividing by zero.
  if n_rows:
    pos_frac = label_set.sum(axis=0) / n_rows
  else:
    pos_frac = np.array([np.nan])
  print('Pos Frac: ', pos_frac)
  return attx_set_1h, attx_set, prot_set, label_set
    
# Locations of the three TSV splits, relative to the current working directory.
train_path = 'set2/attx_protein_binding/train_attx_protein_binding.tsv'
test_path = 'set2/attx_protein_binding/test_attx_protein_binding.tsv'
dev_path = 'set2/attx_protein_binding/dev_attx_protein_binding.tsv'


def _read_lines(path):
  """Return all lines of the text file at `path` as a list of strings."""
  with open(path) as handle:
    return handle.readlines()


def _print_shapes(*arrays):
  """Print the shape of each given array on its own line."""
  for array in arrays:
    print(array.shape)


# Read every split up front, before any of them is encoded.
train_list = _read_lines(train_path)
test_list = _read_lines(test_path)
dev_list = _read_lines(dev_path)

# The [1:] slices skip the first line of each file (presumably a header row).
print('Generating Train Set')
train_attx_1h, train_attx, train_prot, train_label = generate_set(train_list[1:])
_print_shapes(train_attx_1h, train_attx, train_prot, train_label)

print('\nGenerating Test Set')
test_attx_1h, test_attx, test_prot, test_label = generate_set(test_list[1:])
_print_shapes(test_attx_1h, test_attx, test_prot, test_label)

print('\nGenerating Dev Set')
dev_attx_1h, dev_attx, dev_prot, dev_label = generate_set(dev_list[1:])
_print_shapes(dev_attx_1h, dev_attx, dev_prot, dev_label)

# Model Training Setup

In [0]:
from keras.layers import *
from keras.models import Model
from keras.utils import plot_model
from keras import optimizers
from keras import regularizers

In [0]:
def plot_loss(history):
  """Plot accuracy and loss curves for the training and dev sets.

  Two figures are shown one after the other: accuracy per epoch, then loss
  per epoch, each with a 'Train' and a 'Dev' line.

  Args:
    history: mapping of metric name -> per-epoch values, e.g. the `.history`
      attribute of the object returned by `model.fit`. Must contain 'loss',
      'val_loss', and the accuracy metric under either 'acc'/'val_acc' or
      'accuracy'/'val_accuracy'.

  Raises:
    KeyError: if a required metric is missing from `history`.
  """
  # The key used for metrics=['accuracy'] depends on the Keras version: older
  # releases log it as 'acc', newer ones (Keras >= 2.3, tf.keras 2.x) as
  # 'accuracy'. Accept both so the plot does not break on upgrade.
  acc_key = 'acc' if 'acc' in history else 'accuracy'
  val_acc_key = 'val_' + acc_key

  # Plot training & validation accuracy values
  plt.plot(history[acc_key])
  plt.plot(history[val_acc_key])
  plt.title('Model accuracy')
  plt.ylabel('Accuracy')
  plt.xlabel('Epoch')
  plt.legend(['Train', 'Dev'], loc='upper left')
  plt.show()

  # Plot training & validation loss values
  plt.plot(history['loss'])
  plt.plot(history['val_loss'])
  plt.title('Model loss')
  plt.ylabel('Loss')
  plt.xlabel('Epoch')
  plt.legend(['Train', 'Dev'], loc='upper left')
  plt.show()

# Hybrid LSTM-CNN Classifier

In [0]:
# Two-branch binary classifier: an LSTM summarises the integer-coded protein,
# a small CNN summarises the one-hot ATTx sequence, and the concatenated
# features go through a dense head ending in a single sigmoid unit.

# L2 penalty applied to the kernel of every dense layer in the head.
dense_l2_weight = 0.002

# Protein LSTM Embedding Layer
# Residue codes -> learned 20-d embedding -> 100-unit LSTM output.
# NOTE(review): no masking is configured, and code 0 is both the padding value
# and 'A' in prot_lookup, so padded positions are processed like real residues.
prot_input = Input(shape=(max_prot_len,), dtype='int32')
embedding_prot = Embedding(output_dim=20,
                           input_dim=prot_dim,
                           input_length=max_prot_len)(prot_input)
prot_out = LSTM(100)(embedding_prot)

# ATTx Convolutional Network
# Input is the one-hot sequence treated as a (length, 4, 1) single-channel image.
attx_input = Input(shape=(max_attx_len, attx_dim, 1), dtype='float32')

# The 4x4 'valid' kernel spans all attx_dim (= 4) one-hot columns, so the width
# axis collapses to 1 here; later kernels and pools act along the length only.
conv1 = Conv2D(64,
               kernel_size=(4, 4),
               padding='valid',
               strides=1,
               activation='relu')(attx_input)

pool1 = MaxPooling2D(pool_size=(2, 1), padding='valid')(conv1)

conv2 = Conv2D(64,
               kernel_size=(5, 1),
               padding='valid',
               strides=1,
               activation='relu')(pool1)

pool2 = MaxPooling2D(pool_size=(2, 1), padding='valid')(conv2)

conv3 = Conv2D(64,
               kernel_size=(5, 1),
               padding='valid',
               strides=1,
               activation='relu')(pool2)

drop = Dropout(0.15)(conv3)
attx_out = Flatten()(drop)

# Dense head on the joined features. The regularizers come from the same
# standalone `keras` package as the layers, rather than from `tensorflow.keras`,
# so objects from the two Keras implementations are not mixed in one model.
combined = concatenate([attx_out, prot_out])
dense1 = Dense(1024, activation='relu', name='Dense1',
               kernel_regularizer=regularizers.l2(dense_l2_weight))(combined)
dense2 = Dense(1024, activation='relu', name='Dense2',
               kernel_regularizer=regularizers.l2(dense_l2_weight))(dense1)
pred = Dense(1, activation='sigmoid', name='Dense3',
             kernel_regularizer=regularizers.l2(dense_l2_weight))(dense2)

# Input order here fixes the order of the arrays passed to fit/predict.
model_b = Model(inputs=[prot_input, attx_input], outputs=[pred])

In [0]:
# Compile with Adam and binary cross-entropy, tracking accuracy.
adam_b = optimizers.Adam(lr=0.005)
model_b.compile(
    loss='binary_crossentropy',
    optimizer=adam_b,
    metrics=['accuracy'],
)

# Print the layer table and write an architecture diagram to model_b.png.
model_b.summary()
plot_model(model_b, to_file='model_b.png', show_shapes=True, show_layer_names=False)

# Inputs are listed protein first, one-hot ATTx second, matching the order in
# which the model's inputs were declared.
train_inputs = [train_prot, train_attx_1h]
dev_inputs = [dev_prot, dev_attx_1h]

history_b = model_b.fit(
    train_inputs,
    train_label,
    batch_size=2048,
    epochs=50,
    validation_data=(dev_inputs, dev_label),
)

# Persist the trained model, then plot the recorded accuracy and loss curves.
model_b.save('model_b.h5')
plot_loss(history_b.history)