# Mount Dataset

In [1]:
# Mount Google Drive inside the Colab VM so the shared dataset folder is reachable.
from google.colab import drive
drive.mount('/gdrive')
# Work from the shared dataset directory: the relative paths used later in this
# notebook (TSV splits, saved model files) are resolved against it.
%cd /gdrive/Shared\ drives/CS230/datasets

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/Shared drives/CS230/datasets


# Load Required Packages

In [0]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
np.random.seed(0) # Reproducibility

In [0]:
from tensorflow.python.client import device_lib

In [4]:
# Ask TensorFlow for the name of a visible GPU device. The recorded output is
# the empty string, consistent with the CPU-only device list shown further down.
tf.test.gpu_device_name()

''

In [5]:
# Enumerate every device TensorFlow can see in this session (the recorded
# output lists only CPU and XLA_CPU entries).
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 17629093523220778877, name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 16019287805258879659
 physical_device_desc: "device: XLA_CPU device"]

# Set Global Variables

In [0]:
# Alphabets: every amino-acid letter / ATTx base is mapped to an integer code
# equal to its position in the corresponding string.
_AMINO_ACIDS = 'ACDEFGHIKLMNPQRSTVWY'
_ATTX_BASES = 'ATGC'
prot_lookup = {residue: code for code, residue in enumerate(_AMINO_ACIDS)}
attx_lookup = {base: code for code, base in enumerate(_ATTX_BASES)}

# Array sizes: max_prot_len is the number of columns in the protein feature
# vector (one per shuffled k-gram index map); max_attx_len is the number of
# positions reserved for an ATTx sequence.
max_prot_len = 100
max_attx_len = 150

# Alphabet sizes and the k-gram length used for the protein features.
prot_dim = 20
attx_dim = 4
k = 4

# Amino-acid letters listed in code order.
aa_list = list(prot_lookup)

def gen_kgrams(k, alphabet=None):
  """Enumerate every length-``k`` string over ``alphabet``.

  Args:
    k: Length of each gram; must be at least 1.
    alphabet: Iterable of single-symbol strings. When omitted, the
      module-level ``aa_list`` is used (looked up at call time).

  Returns:
    A list of ``len(alphabet) ** k`` strings. The first symbol varies slowest,
    which is the same ordering the earlier recursive implementation produced.

  Raises:
    ValueError: If ``k`` is smaller than 1 (previously such values silently
      yielded the 1-grams).
  """
  if k < 1:
    raise ValueError('k must be >= 1, got {}'.format(k))
  symbols = list(aa_list if alphabet is None else alphabet)

  # Grow the grams one symbol at a time by prefixing every symbol to every
  # shorter gram; starting from the empty string gives the 1-grams first.
  grams = ['']
  for _ in range(k):
    grams = [symbol + suffix for symbol in symbols for suffix in grams]
  return grams

grams = gen_kgrams(k)

# Build one k-gram -> rank mapping per protein feature column. Each mapping is
# the position of every k-gram in a freshly shuffled copy of the gram list.
#
# A dedicated, seeded generator is used on purpose: np.random.seed() does not
# seed the standard-library `random` module, so the previous random.shuffle()
# calls produced different mappings on every kernel start, which made the
# protein features (and any checkpoint trained on them) irreproducible.
shuffle_rng = random.Random(0)

idx_maps = []
for _ in range(max_prot_len):
  # In-place shuffle; each pass reshuffles the order left by the previous one.
  shuffle_rng.shuffle(grams)
  idx_maps.append({gram: idx for idx, gram in enumerate(grams)})

# Data Preprocessing

In [0]:
from tqdm import tqdm

def generate_set(line_list):
  """Convert raw dataset lines into the arrays consumed by the model.

  Every line must contain five whitespace-separated fields. The first two are
  ignored; the remaining three are read as the ATTx sequence, the protein
  sequence and an integer label.

  Args:
    line_list: List of data lines (callers in this notebook pass the file
      contents without the first line).

  Returns:
    Tuple ``(attx_set_1h, attx_set, prot_set, label_set)`` where
      attx_set_1h: shape (n, max_attx_len, attx_dim, 1), one-hot ATTx bases.
      attx_set: shape (n, max_attx_len), integer ATTx codes. Positions past
        the end of a sequence stay 0, which is also the code of 'A'.
      prot_set: shape (n, max_prot_len); column j holds the smallest
        ``idx_maps[j]`` value among the protein's k-grams.
      label_set: shape (n, 1), the integer labels.

  Raises:
    ValueError: If a line does not have exactly five fields, or a protein is
      shorter than ``k`` and therefore has no k-grams.
    KeyError: If a base is missing from ``attx_lookup`` or a k-gram is missing
      from ``idx_maps``.
    IndexError: If an ATTx sequence is longer than ``max_attx_len``.
  """
  n_examples = len(line_list)
  attx_set_1h = np.zeros((n_examples, max_attx_len, attx_dim, 1))
  attx_set = np.zeros((n_examples, max_attx_len))
  prot_set = np.zeros((n_examples, max_prot_len))
  label_set = np.zeros((n_examples, 1))

  for i in tqdm(range(n_examples)):
    _, _, attx, prot, label = line_list[i].strip().split()
    label_set[i, 0] = int(label)

    for pos, base in enumerate(attx):
      code = attx_lookup[base]
      attx_set_1h[i, pos, code, 0] = 1
      attx_set[i, pos] = code

    # Distinct k-grams only: duplicates cannot change a minimum, and the
    # comprehension uses its own index name so the row index `i` is never
    # shadowed.
    prot_grams = {prot[start:start + k] for start in range(len(prot) - (k - 1))}
    if not prot_grams:
      raise ValueError(
          'protein sequence on line {} has length {}, shorter than k={}'.format(
              i, len(prot), k))
    for j in range(max_prot_len):
      idx_map = idx_maps[j]
      prot_set[i, j] = min(idx_map[gram] for gram in prot_grams)

  # Fraction of positive labels; skipped for an empty set, where the ratio is
  # undefined (this used to raise ZeroDivisionError).
  if n_examples:
    print('Pos Frac: ', label_set.sum(axis=0) / n_examples)
  return attx_set_1h, attx_set, prot_set, label_set
    
# Locations of the three splits, relative to the current working directory.
train_path = 'set2/attx_protein_binding/train_attx_protein_binding.tsv'
test_path = 'set2/attx_protein_binding/test_attx_protein_binding.tsv'
dev_path = 'set2/attx_protein_binding/dev_attx_protein_binding.tsv'


def _read_lines(tsv_path):
  """Return every line of ``tsv_path`` (first line included) as a list."""
  with open(tsv_path) as handle:
    return handle.readlines()


def _print_shapes(*arrays):
  """Print the shape of each array on its own line."""
  for array in arrays:
    print(array.shape)


# Read all three files before any feature generation starts.
train_list = _read_lines(train_path)
test_list = _read_lines(test_path)
dev_list = _read_lines(dev_path)

# The first line of each file is skipped (presumably a header row - confirm
# against the TSV files).
print('Generating Train Set')
train_attx_1h, train_attx, train_prot, train_label = generate_set(train_list[1:])
_print_shapes(train_attx_1h, train_attx, train_prot, train_label)

print('\nGenerating Test Set')
test_attx_1h, test_attx, test_prot, test_label = generate_set(test_list[1:])
_print_shapes(test_attx_1h, test_attx, test_prot, test_label)

print('\nGenerating Dev Set')
dev_attx_1h, dev_attx, dev_prot, dev_label = generate_set(dev_list[1:])
_print_shapes(dev_attx_1h, dev_attx, dev_prot, dev_label)

In [0]:
# Divide the protein features of every split by the same constant.
# NOTE(review): the stored values are k-gram ranks taken from idx_maps, whose
# range is the number of k-grams rather than max_prot_len - confirm that
# max_prot_len is the intended divisor.
# NOTE(review): this cell rebinds its own inputs, so running it twice divides
# twice; re-run the preprocessing cell first if it must be repeated.
prot_scale = float(max_prot_len)
train_prot, test_prot, dev_prot = (
    split / prot_scale for split in (train_prot, test_prot, dev_prot)
)

# Model Training Setup

In [8]:
from keras.layers import *
from keras.models import Model
from keras.utils import plot_model
from keras import optimizers

Using TensorFlow backend.


In [0]:
def plot_loss(history):
  """Show the accuracy curves, then the loss curves, of a training run.

  Args:
    history: Mapping holding per-epoch sequences under the keys 'acc',
      'val_acc', 'loss' and 'val_loss'.
  """
  # (train key, dev key, figure title, y-axis label) for each figure, in the
  # order the figures are shown.
  panels = (
      ('acc', 'val_acc', 'Model accuracy', 'Accuracy'),
      ('loss', 'val_loss', 'Model loss', 'Loss'),
  )
  for train_key, dev_key, heading, y_label in panels:
    # Training curve first, dev curve second, matching the legend order.
    plt.plot(history[train_key])
    plt.plot(history[dev_key])
    plt.title(heading)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Dev'], loc='upper left')
    plt.show()

# Jaccard-CNN Classifier

In [0]:
# Protein branch: the protein feature vector is fed in as-is; no embedding or
# recurrent layer is applied before it is concatenated with the ATTx features.
prot_input = Input(shape=(max_prot_len,), dtype='float32')

# ATTx branch: one-hot encoded sequence treated as a single-channel image.
attx_input = Input(shape=(max_attx_len, attx_dim, 1), dtype='float32')


def _conv_relu(kernel_size, tensor):
  """Apply a 64-filter, stride-1, 'valid'-padded ReLU Conv2D to ``tensor``."""
  return Conv2D(64,
                kernel_size=kernel_size,
                padding='valid',
                strides=1,
                activation='relu')(tensor)


def _pool_pairs(tensor):
  """Max-pool ``tensor`` with a (2, 1) window and 'valid' padding."""
  return MaxPooling2D(pool_size=(2, 1), padding='valid')(tensor)


def _dense_l2(units, activation, name, tensor):
  """Apply a named Dense layer with an L2(0.002) kernel regulariser."""
  return Dense(units,
               activation=activation,
               name=name,
               kernel_regularizer=keras.regularizers.l2(0.002))(tensor)


# The first kernel is 4 wide, the same as the one-hot axis (attx_dim), so with
# 'valid' padding that axis collapses to width 1; later kernels are 1 wide.
conv1 = _conv_relu((4, 4), attx_input)
pool1 = _pool_pairs(conv1)
conv2 = _conv_relu((5, 1), pool1)
pool2 = _pool_pairs(conv2)
conv3 = _conv_relu((5, 1), pool2)

drop = Dropout(0.15)(conv3)
attx_out = Flatten()(drop)

# Classifier head on the concatenated protein and ATTx features.
combined = concatenate([prot_input, attx_out])
dense1 = _dense_l2(1024, 'relu', 'Dense1', combined)
dense2 = _dense_l2(1024, 'relu', 'Dense2', dense1)
pred = _dense_l2(1, 'sigmoid', 'Dense3', dense2)

model_c = Model(inputs=[prot_input, attx_input], outputs=[pred])

In [0]:
from keras.callbacks import ModelCheckpoint

# Adam optimiser (learning rate 0.003) with binary cross-entropy loss;
# accuracy is tracked as an additional metric.
adam_c = optimizers.Adam(lr=0.003)
model_c.compile(optimizer=adam_c, loss='binary_crossentropy', metrics = ['accuracy'])
model_c.summary()
# Write a diagram of the architecture (tensor shapes shown, layer names
# hidden) to model_c.png.
plot_model(model_c, show_shapes = True, show_layer_names = False, to_file='model_c.png')


# Checkpoint file names embed the epoch number and the dev accuracy. With
# save_best_only=True and mode='max', a file is written only when val_acc
# improves on the best value seen so far.
filepath="model_c_{epoch:02d}_{val_acc:.2f}.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
                  
# Train for 1000 epochs with batches of 2048, validating on the dev split after
# every epoch. The input order [protein, ATTx one-hot] matches the order of
# the model's inputs.
history_c = model_c.fit([train_prot, train_attx_1h], train_label, epochs=1000, batch_size=2048, 
                    validation_data = ([dev_prot, dev_attx_1h], dev_label), callbacks=[checkpoint])
# Save the model as it stands after the last epoch (separate from the
# best-val_acc checkpoints written by the callback), then plot the curves.
model_c.save('model_c_long.h5')
plot_loss(history_c.history)

In [0]:
from keras.models import load_model
# NOTE(review): the checkpoint name is hard-coded. It follows the
# "model_c_{epoch:02d}_{val_acc:.2f}.h5" pattern used by the training cell, so
# it presumably refers to epoch 92 / val_acc 0.80 of one particular run; a
# fresh training run is not guaranteed to produce a file with this name.
model = load_model('model_c_92_0.80.h5')
# Round the model outputs (presumably sigmoid probabilities) to the nearest
# integer to obtain hard class predictions for the test split.
predictions = np.round(model.predict([test_prot, test_attx_1h]))

In [0]:
# Confusion-matrix counts for the test split, computed with boolean masks
# instead of a per-example Python loop. Values that are neither exactly 0 nor
# exactly 1 fall into no bucket, as before.
pred_flat = np.asarray(predictions).ravel()
label_flat = np.asarray(test_label).ravel()

pred_pos, pred_neg = pred_flat == 1, pred_flat == 0
label_pos, label_neg = label_flat == 1, label_flat == 0

true_pos = int(np.sum(pred_pos & label_pos))
true_neg = int(np.sum(pred_neg & label_neg))
false_pos = int(np.sum(pred_pos & label_neg))
false_neg = int(np.sum(pred_neg & label_pos))


def _safe_ratio(numerator, denominator):
  """Return numerator / denominator, or NaN when the denominator is zero."""
  return numerator / denominator if denominator else float('nan')


print('True Positive: ', true_pos)
print('True Negative: ', true_neg)
print('False Positive: ', false_pos)
print('False Negative: ', false_neg)

# Precision / recall are undefined when the model predicts no positives or the
# labels contain none; report NaN in that case instead of crashing with
# ZeroDivisionError.
print('Accuracy: ', _safe_ratio(true_pos + true_neg, false_pos + false_neg + true_pos + true_neg))
print('Precision: ', _safe_ratio(true_pos, false_pos + true_pos))
print('Recall: ', _safe_ratio(true_pos, false_neg + true_pos))
