In [17]:
from __future__ import absolute_import, division, print_function, unicode_literals
import pandas as pd

# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow.keras import regularizers
import random
from tensorflow import keras
from tensorflow.keras.layers import Dropout

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.layers.core import Activation
from keras.layers.core import Flatten
from keras.layers.core import Dropout
from keras.layers.core import Dense
from keras import backend as K

# https://www.tensorflow.org/tutorials/keras/classification
# print(tf.__version__)

# from google.colab import drive
# drive.mount('/gdrive')
# %cd /gdrive/Shared\ drives/CS230/datasets


Using TensorFlow backend.


In [2]:
class_names = ["attB", "attP"]

In [30]:
char2index = {'A':[1, 0, 0, 0], 'C':[0, 1, 0, 0], 'G':[0, 0, 1, 0], 'T':[0, 0, 0, 1]}
# index2char = {1:'A', 2:'C', 3:'G', 4:'T'}
def load_data(path):
  df = pd.read_csv(path, sep='\t')
  X = [[char2index[char] for char in seq] for seq in df.attb]
  Y = [[char2index[char] for char in seq] for seq in df.attp]
  return X, Y

train_attb, train_attp = load_data('set1/attB2attP/train.tsv')
dev_attb, dev_attp = load_data('set1/attB2attP/dev.tsv')
test_attb, test_attp = load_data('set1/attB2attP/test.tsv')

def pad_zeros(data, max_pad=160):
  padded_data = []
  for row in data:
    if max_pad - len(row) > 0:
        row = row + [[0, 0, 0, 0]]*(max_pad - len(row))
    padded_data.append(row)
  return np.array(padded_data)


def get_y(attb, attp):
    y_to_return = np.array(len(attb) * [0] + len(attp) * [1])
    return y_to_return.reshape(y_to_return.shape[0], 1)

print(len(train_attb))
print(len(train_attp))
print(len(train_attb + train_attp))
train_x = train_attb + train_attp
dev_x = dev_attb + dev_attp
test_x = test_attb + test_attp


train_x, train_y = pad_zeros(train_x), get_y(train_attb, train_attp)
dev_x, dev_y = pad_zeros(dev_x), get_y(dev_attb, dev_attp)
test_x, test_y = pad_zeros(test_x), get_y(test_attb, test_attp)

train_x = pad_zeros(train_x)
dev_x = pad_zeros(dev_x)
test_x = pad_zeros(test_x)

print(train_y.shape)
print(train_x.shape)
print(train_x.shape)
print(test_x.shape)

20301
20301
40602
(40602, 1)
(40602, 160, 4)
(40602, 160, 4)
(11404, 160, 4)


In [50]:
def get_model(l1_val=None, middle_layer_val=128, middle_layer_activation="relu"):
    model_to_return = keras.Sequential([
        keras.layers.Flatten(input_shape=(160, 4))
    ])
    if l1_val is not None:
        # Can also do: kernel_regularizer=regularizers.l2(0.01)
        # Kernel regularizer does weight decay. If we are going to try this, we must shuffle the attb and attp ordering
        # because right now the attbs come first, then the attps.
        # https://stackoverflow.com/questions/44495698/keras-difference-between-kernel-and-activity-regularizers
        model_to_return.add(keras.layers.Dense(middle_layer_val, input_dim=160,
                                               activation=middle_layer_activation,
                                               activity_regularizer=regularizers.l1(l1_val)))
    else:
        model_to_return.add(keras.layers.Dense(middle_layer_val, activation=middle_layer_activation))
        # model_to_return.add(Dropout(0.9999999999999))

    model_to_return.add(keras.layers.Dense(2, activation='softmax'))
    return model_to_return

def fit_model(model, train_x_arr, train_y_arr, verbose=2):
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(train_x_arr, train_y_arr, epochs=5, shuffle=True, verbose=verbose)

def eval_model(model, test_x_arr, test_y_arr):
    return model.evaluate(test_x_arr,  test_y_arr, verbose=0)

In [51]:
model = get_model()
fit_model(model, train_x, train_y)
dev_loss, dev_acc = eval_model(model, dev_x, dev_y)
print('Test accuracy:', dev_acc)

Train on 40602 samples
Epoch 1/5
40602/40602 - 2s - loss: 0.4959 - accuracy: 0.7529
Epoch 2/5
40602/40602 - 2s - loss: 0.3296 - accuracy: 0.8539
Epoch 3/5
40602/40602 - 2s - loss: 0.2244 - accuracy: 0.9089
Epoch 4/5
40602/40602 - 2s - loss: 0.1487 - accuracy: 0.9451
Epoch 5/5
40602/40602 - 2s - loss: 0.0954 - accuracy: 0.9695
Test accuracy: 0.84628433


In [None]:
predictions = model.predict(test_x)

In [None]:
def plot_image(i, predictions_array, true_label, img):
    predictions_array, true_label, img = predictions_array, true_label[i], img[i]
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])

    plt.imshow(img, cmap=plt.cm.binary)

    predicted_label = np.argmax(predictions_array)
    if predicted_label == true_label:
        color = 'blue'
    else:
        color = 'red'
    print(true_label)
    plt.xlabel("{} {:2.0f}% ({})".format(class_names[predicted_label],
                                100*np.max(predictions_array),
                                class_names[true_label[0]]),
                                color=color)

i = 0
plt.figure(figsize=(6,3))
plt.subplot(1,2,1)
plot_image(i, predictions[i], test_y, test_x)
plt.show()
print(predictions.shape)

In [None]:
plt.figure()
plt.imshow(train_x[0])
plt.grid(False)
plt.show()

In [None]:
plt.figure(figsize=(45,45))
range_to_show = 20
for i in range(range_to_show):
    plt.subplot(4, 5, i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    item_to_get = len(train_x) - i if int(i / 5) % 2 == 1 else i
    plt.imshow(train_x[item_to_get])
    plt.xlabel(class_names[train_y[item_to_get][0]])
plt.show()


## Regularization and Data Augmentation

We now have a training accuracy of 99.3% and a test accuracy of 85.6%. We see that we have high variance and perhaps are overfitting our training set.

In [None]:
def get_hyperparameters(simple=False):
    activations = ["relu", "sigmoid"]
    layer_vals = [8, 32, 64, 128]
    l1_vals = np.random.exponential(scale=0.015, size=(5,)).reshape(5, 1)
    hyperparameters_to_return = []
    print(l1_vals)
    if simple:
        return [{
                    'activation': activations[0], 
                    'layer_val': 128,
                    'l1': 0.05
                },
                {
                    'activation': activations[1], 
                    'layer_val': 128,
                    'l1': 0.05
                }]

    for act in activations:
        for layer_val in layer_vals:
            for l_val in l1_vals:
                hyperparameters_to_return.append({
                    'activation': act, 
                    'layer_val': layer_val,
                    'l1': l_val
                })
    return hyperparameters_to_return

In [None]:
dev_accuracies = []
hyperparameters = get_hyperparameters(simple=False)
print(hyperparameters)
print("We are testing " + str(len(hyperparameters)) + ".")
for param in hyperparameters:
    print(param)
for hyper_params in hyperparameters:
    model_extended = get_model(l1_val=hyper_params['l1'],
                               middle_layer_val=hyper_params['layer_val'], 
                               middle_layer_activation=hyper_params['activation'])
    fit_model(model_extended, 
              train_x, 
              train_y,
              verbose=0)
    dev_loss, dev_acc = eval_model(model_extended, dev_x, dev_y)
    dev_accuracies.append(dev_acc)
    print(dev_accuracies)
    print(hyper_params)
    print('\dev accuracy:', dev_acc)

dev_accuracies = np.array(dev_accuracies)
print(dev_accuracies)
print(max(dev_accuracies))
index_of_hyper_to_get = dev_accuracies.argmax(axis=0)
print(hyperparameters[index_of_hyper_to_get])

## CNN

In [57]:
def get_model_cnn():
    model_to_return = Sequential()
    model_to_return.add(Conv2D(64, kernel_size=3, activation='relu', input_shape=(160, 4, 1)))
    model_to_return.add(Conv2D(32, kernel_size=3, activation='relu'))
    model_to_return.add(Flatten())
    model_to_return.add(Dense(10, activation='softmax'))
    return model_to_return

In [58]:
# train_x_cnn, train_y_cnn = np.array(train_x), np.array(train_y)
# print(train_x_cnn.shape)
model = get_model_cnn()
# train_x, train_y = train_x.reshape(train_x[0], train_x[1], train_x[2], 1), train_y.reshape(train_y[0], train_y[1], train_y[2], 1)
fit_model(model, train_x_cnn, train_y_cnn)
dev_loss, dev_acc = eval_model(model, dev_x, dev_y)
print('Test accuracy:', dev_acc)

ValueError: Negative dimension size caused by subtracting 3 from 2 for 'conv2d_43/convolution' (op: 'Conv2D') with input shapes: [?,158,2,64], [3,3,64,32].