# DeepSEA

https://mit6874.github.io/assets/misc/l10/deep-sea.pdf

https://github.com/jisraeli/DeepSEA/blob/master/2_model.lua

Model architecture (from the paper's supplementary material):
1. conv (320 kernels, kernel size 8, stride 1)
2. pooling (window 4, step 4)
3. conv (480 kernels, kernel size 8, stride 1)
4. pooling (window 4, step 4)
5. conv (960 kernels, kernel size 8, stride 1)
6. dense (925 neurons)
7. sigmoid output (but 12 neurons before this?)

- dropout after layer 2 (20%), after layer 4 (20%), and after layer 5 (50%).
- L2 regularization ($\lambda_1$): 5e-07
- L1 sparsity ($\lambda_2$): 1e-08
- max kernel norm ($\lambda_3$): 0.9

In [None]:
# Optional: load the nb_black IPython extension so that code cells are
# automatically formatted with the "black" formatter. Safe to skip.
%load_ext nb_black

In [None]:
# Download the synthetic dataset into the working directory.
# --timestamping makes wget skip the download when the local copy is already
# up to date, so re-running this cell is cheap.
!wget --timestamping https://www.dropbox.com/s/c3umbo5y13sqcfp/synthetic_dataset.h5

In [None]:
from pathlib import Path
import h5py
import numpy as np

data_path = Path("synthetic_dataset.h5")

# Read every split fully into memory while the file is open.
# Inputs are cast to float32 and have their last two axes swapped so that the
# resulting layout is (sequences, positions, alphabet), as unpacked below.
# Training labels are float32; validation/test labels are int32.
with h5py.File(data_path, "r") as dataset:
    x_train, x_valid, x_test = (
        dataset[key][:].astype(np.float32).transpose([0, 2, 1])
        for key in ("X_train", "X_valid", "X_test")
    )
    y_train = dataset["Y_train"][:].astype(np.float32)
    y_valid, y_test = (
        dataset[key][:].astype(np.int32) for key in ("Y_valid", "Y_test")
    )

# N sequences, each L nucleotides long, over an alphabet of size A.
N, L, A = x_train.shape
print(f"{N} sequences, {L} nts per sequence, {A} nts in alphabet")

## create model

In [None]:
import tensorflow as tf

tfk = tf.keras
tfkl = tf.keras.layers

# Number of output labels (one sigmoid unit per label).
# NOTE(review): presumably matches the second dimension of Y_train — confirm.
n_classes = 12

# Kernel regularisation from the DeepSEA supplement:
# L1 sparsity 1e-08, L2 weight decay 5e-07, max kernel norm 0.9.
l1_l2 = tfk.regularizers.l1_l2(l1=1e-08, l2=5e-07)
maxnorm = tfk.constraints.max_norm(0.9)

# DeepSEA-style network: three Conv1D stages, a dense layer, and a sigmoid
# output. The model is bound to `model` so the compile/fit cells can use it.
model = tfk.Sequential(
    [
        # 1. conv (320 kernels, kernel size 8, stride 1)
        tfkl.Conv1D(
            filters=320,
            kernel_size=8,
            strides=1,
            activation=tf.nn.relu,
            kernel_regularizer=l1_l2,
            kernel_constraint=maxnorm,
            input_shape=(L, A),
        ),
        # 2. pooling (window 4, step 4), followed by 20% dropout
        tfkl.MaxPool1D(pool_size=4, strides=4),
        tfkl.Dropout(0.2),
        # 3. conv (480 kernels, kernel size 8, stride 1)
        tfkl.Conv1D(
            filters=480,
            kernel_size=8,
            strides=1,
            activation=tf.nn.relu,
            kernel_regularizer=l1_l2,
            kernel_constraint=maxnorm,
        ),
        # 4. pooling (window 4, step 4), followed by 20% dropout
        tfkl.MaxPool1D(pool_size=4, strides=4),
        tfkl.Dropout(0.2),
        # 5. conv (960 kernels, kernel size 8, stride 1), followed by 50% dropout
        tfkl.Conv1D(
            filters=960,
            kernel_size=8,
            strides=1,
            activation=tf.nn.relu,
            kernel_regularizer=l1_l2,
            kernel_constraint=maxnorm,
        ),
        tfkl.Dropout(0.5),
        # Collapse (steps, channels) into one feature vector per sequence.
        # Without this, Dense acts on the last axis only and the model would
        # emit one prediction per remaining position instead of per sequence.
        tfkl.Flatten(),
        # 6. dense (925 neurons)
        tfkl.Dense(925, activation=tf.nn.relu),
        # 7. sigmoid output, one independent probability per label
        tfkl.Dense(n_classes, activation=tf.nn.sigmoid),
    ]
)
model.summary()

In [None]:
# Area under the ROC curve and under the precision-recall curve.
# The names given here are what Keras reports in the training logs (validation
# values get a "val_" prefix).
metrics = [
    tfk.metrics.AUC(curve=curve, name=name)
    for curve, name in (("ROC", "auroc"), ("PR", "aupr"))
]

# Binary cross-entropy on probabilities (from_logits=False), optimised with
# Adam at a learning rate of 1e-3.
model.compile(
    loss=tfk.losses.BinaryCrossentropy(from_logits=False),
    optimizer=tfk.optimizers.Adam(learning_rate=0.001),
    metrics=metrics,
)

In [None]:
# Both callbacks watch "val_aupr" and treat higher as better (mode="max").

# Stop training after 20 epochs without improvement.
# NOTE(review): restore_best_weights=False means the model keeps the weights of
# the final epoch rather than the best one — confirm this is intended.
early_stopping = tfk.callbacks.EarlyStopping(
    monitor="val_aupr",
    mode="max",
    patience=20,
    restore_best_weights=False,
    verbose=1,
)

# Multiply the learning rate by 0.2 after 5 epochs without improvement,
# never going below 1e-7.
reduce_lr = tfk.callbacks.ReduceLROnPlateau(
    monitor="val_aupr",
    mode="max",
    factor=0.2,
    patience=5,
    min_lr=1e-7,
    verbose=1,
)

callbacks = [early_stopping, reduce_lr]

# train: up to 100 epochs in shuffled mini-batches of 100, evaluating on the
# validation split after every epoch; verbose=2 prints one line per epoch.
history: tfk.callbacks.History = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_valid, y_valid),
    epochs=100,
    batch_size=100,
    shuffle=True,
    callbacks=callbacks,
    verbose=2,
)