# Building a convolutional neural network model to predict terminator strength

## Import modules and define functions

Import the required modules:

In [1]:
import logging
import os
import re
from collections import OrderedDict
from dataclasses import dataclass
from io import TextIOBase
from typing import Optional

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.layers as kl
from tensorflow import keras as k
from tqdm.notebook import tqdm

Enable GPU memory growth:

In [2]:
# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # The same memory-growth setting has to be applied to every GPU
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Raised when memory growth is changed after the GPUs were initialized
        print(err)

1 Physical GPUs 1 Logical GPUs


2022-07-23 09:27:49.939337: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-23 09:27:49.979884: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-23 09:27:49.980318: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-23 09:27:49.981031: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate

Define a function to one-hot encode the DNA sequences:

In [3]:
# Lookup table mapping each recognized base to its one-hot row (column order: A, C, G, T).
BASE_1HOT = OrderedDict((
    ("A", np.array([1, 0, 0, 0])),
    ("C", np.array([0, 1, 0, 0])),
    ("G", np.array([0, 0, 1, 0])),
    ("T", np.array([0, 0, 0, 1]))
))

def one_hot_encoding(seq):
    """One-hot encode a DNA sequence.

    Parameters
    ----------
    seq : str
        Sequence to encode; only the keys of ``BASE_1HOT`` ("A", "C", "G",
        "T", upper case) are recognized.

    Returns
    -------
    numpy.ndarray
        ``int8`` array of shape ``(len(seq), 4)``. Rows belonging to an
        unrecognized character are left as all zeros and an error is logged
        for each of them.
    """
    encoded = np.zeros(shape = (len(seq), 4), dtype = 'int8')
    for i, base in enumerate(seq):
        one_hot_row = BASE_1HOT.get(base)
        if one_hot_row is None:
            # Keep the all-zero row and report the problem instead of aborting
            # the encoding of the whole data set. Requires the `logging` module.
            logging.error(
                f"Unrecognized base encountered during one-hot encoding: '{base}'"
            )
            continue
        encoded[i, :] = one_hot_row
    return encoded

## Load and convert the data to the required format

Load the experimental data and split into training and test set:

In [4]:
# Read the tab-separated measurements; the first row holds the column names.
data_term = pd.read_csv('terminator_data.tsv', sep = '\t', header = 0)

# Partition the rows by the value of their `set` column ...
train_rows = data_term.query('set == "train"')
test_rows = data_term.query('set == "test"')

# ... and renumber each partition from zero.
data_train = train_rows.reset_index(drop = True)
data_test = test_rows.reset_index(drop = True)

One-hot encode the terminator sequences:

In [5]:
# Encode every sequence and stack the per-sequence matrices along a new first axis.
train_sequences = np.stack([
    one_hot_encoding(sequence)
    for sequence in tqdm(data_train['sequence'], desc = 'Encoding training sequences')
])
test_sequences = np.stack([
    one_hot_encoding(sequence)
    for sequence in tqdm(data_test['sequence'], desc = 'Encoding test sequences')
])

Encoding training sequences:   0%|          | 0/48172 [00:00<?, ?it/s]

Encoding test sequences:   0%|          | 0/5318 [00:00<?, ?it/s]

Convert the enrichment value to an array of the correct shape.

In [6]:
# Turn the `enrichment` column of each split into a column vector (one row per sequence).
train_enrichment, test_enrichment = (
    np.array(split[['enrichment']]).reshape(-1, 1)
    for split in (data_train, data_test)
)

## Build the models

Define a bidirectional convolutional layer stack, inspired by DeepGMAP (https://doi.org/10.1371/journal.pone.0235748):

In [7]:
class BiConv1D(kl.Layer):
    """Stack of 1D convolutions applied with the learned and with flipped kernels.

    Every layer of the stack owns one kernel of shape
    ``(kernel_size, in_channels, filters)`` and one bias vector. The input runs
    through two parallel branches:

    * the forward branch convolves with the kernels as stored;
    * the reverse branch convolves with the same kernels flipped along their
      input-channel and filter axes (``tf.reverse(kernel, axis = [1, 2])``).

    Both branches share the biases, apply ReLU followed by dropout after each
    convolution, and their final outputs are added element-wise.

    Parameters
    ----------
    filters : int
        Number of output filters of every convolution.
    kernel_size : int
        Length of the convolution window.
    layers : int, default 2
        Number of stacked convolutions per branch (at least 1).
    stride : int, default 1
        Stride passed to ``tf.nn.conv1d``.
    dropout_rate : float, default 0.15
        Dropout rate applied after every ReLU while training.

    Raises
    ------
    ValueError
        If ``layers`` is smaller than 1 or ``dropout_rate`` is outside [0, 1].
    """

    def __init__(self, filters, kernel_size, layers = 2, stride = 1, dropout_rate = 0.15):
        super().__init__()
        self.filters = filters
        self.kernel_size = kernel_size
        if layers < 1:
            raise ValueError("At least one layer needed")
        self.layers = layers
        if (dropout_rate < 0) or (dropout_rate > 1):
            raise ValueError("Dropout rate must be a float between 0 and 1")
        self.dropout_rate = dropout_rate
        self.stride = stride
        # A Dropout sublayer honours the `training` flag; a bare tf.nn.dropout
        # call would keep dropping activations during evaluation and prediction.
        self.dropout = kl.Dropout(dropout_rate)

    def build(self, input_shape):
        """Create one kernel and one zero-initialized bias per layer."""
        self.kernels = []
        self.biases = []
        for layer in range(self.layers):
            # NOTE(review): every layer's kernel is created with
            # input_shape[-1] input channels, although layers after the first
            # receive `filters` channels from the previous layer - confirm
            # that this is intended before changing the weight shapes.
            self.kernels.append(self.add_weight(
                name = f"kernel{layer}",
                shape = (self.kernel_size, input_shape[-1], self.filters),
                trainable = True,
                initializer = k.initializers.GlorotUniform()
            ))
            self.biases.append(self.add_weight(
                name = f"bias{layer}",
                shape = (self.filters,),
                trainable = True,
                initializer = k.initializers.Zeros()
            ))

    def _conv_block(self, x, kernel, bias, training):
        """Apply convolution, bias, ReLU and (training-only) dropout to ``x``."""
        x = tf.nn.conv1d(x, kernel, stride = self.stride, padding = 'SAME')
        x = tf.add(x, bias)
        return self.dropout(tf.nn.relu(x), training = training)

    def call(self, input, training = None):
        """Run both branches over ``input`` and return their element-wise sum.

        Parameters
        ----------
        input : tensor
            Input passed to ``tf.nn.conv1d`` in both branches.
        training : bool or None
            Forwarded to the dropout sublayer so that dropout is only active
            while training.
        """
        x_fwd = input
        x_rev = input
        for kernel, bias in zip(self.kernels, self.biases):
            x_fwd = self._conv_block(x_fwd, kernel, bias, training)
            # The reverse branch must be built from its own convolution output;
            # adding the bias to x_fwd here would discard that convolution.
            # NOTE(review): the flip covers the channel and filter axes only,
            # not the position axis (0) - confirm that this is the intended
            # "reverse" kernel.
            x_rev = self._conv_block(
                x_rev, tf.reverse(kernel, axis = [1, 2]), bias, training
            )

        return tf.math.add(x_fwd, x_rev)

Define a function to build the bidirectional model:

In [8]:
def build_bidirectional_model(motif_kernel: np.ndarray):
    """Assemble the bidirectional CNN and seed its first kernel with ``motif_kernel``.

    Parameters
    ----------
    motif_kernel : np.ndarray
        Initial values for the first ``BiConv1D`` kernel. Axis 0 is used as the
        kernel size and axis 2 as the number of filters.

    Returns
    -------
    keras Model
        Uncompiled model named "BiDirectionalCNN" that maps a ``(170, 4)``
        input to a single output value.
    """
    kernel_size = motif_kernel.shape[0]
    n_filters = motif_kernel.shape[2]

    inputs = kl.Input((170, 4))
    biconv = BiConv1D(filters = n_filters, kernel_size = kernel_size, layers = 2)

    # Layers applied, in this order, on top of the bidirectional stack
    head = (
        kl.Conv1D(filters = 128, kernel_size = 13, padding = 'same', activation = 'relu'),
        kl.Dropout(0.15),
        kl.Flatten(),
        kl.Dense(64),
        kl.BatchNormalization(),
        kl.Activation('relu'),
        kl.Dense(1),
    )

    x = biconv(inputs)
    for head_layer in head:
        x = head_layer(x)

    model = k.Model(inputs = inputs, outputs = x, name = "BiDirectionalCNN")

    # Overwrite the first kernel of the bidirectional stack with the supplied motifs
    biconv.kernels[0].assign(motif_kernel)
    return model

Initialize the kernel weights:

In [9]:
# Draw the initial first-layer kernel (kernel size 13, 4 input channels, 128 filters)
# from a Glorot uniform initializer and keep it as a NumPy array.
glorot_uniform = k.initializers.GlorotUniform()
kernel = glorot_uniform(shape = (13, 4, 128)).numpy()

Build and compile the model:

In [10]:
# Build the network from the initial kernel, then configure it for regression:
# mean squared error is both the training loss and the reported metric.
model = build_bidirectional_model(kernel)
model.compile(optimizer = 'Adam', loss = 'mean_squared_error', metrics = ['mean_squared_error'])

Define training parameters:

In [11]:
# Both callbacks watch the validation loss (the Keras default monitor):
# stop after 5 epochs without improvement, lower the learning rate after 3.
earlyStop = k.callbacks.EarlyStopping(monitor = 'val_loss', patience = 5)
reduceLR = k.callbacks.ReduceLROnPlateau(monitor = 'val_loss', patience = 3)

Train the model:

In [12]:
model_dir = 'models/model_BiConv_term'

if not os.path.isdir(model_dir):
    # no saved model yet: train on the training set, holding out 10 % for validation
    model.fit(
        x = train_sequences,
        y = train_enrichment,
        batch_size = 128,
        epochs = 25,
        verbose = 1,
        callbacks = [earlyStop, reduceLR],
        validation_split = 0.1
    )

    # keep the trained model so later runs can skip training
    model.save(model_dir)
else:
    # reuse the model saved by an earlier run
    model = k.models.load_model(model_dir)

Epoch 1/25


2022-07-23 09:27:57.401207: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8400
2022-07-23 09:27:58.273395: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-07-23 09:27:58.509366: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.10GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-07-23 09:27:58.509447: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.10GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.


  5/339 [..............................] - ETA: 9s - loss: 12.8779 - mean_squared_error: 12.8779 

2022-07-23 09:27:58.968781: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.10GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-07-23 09:27:58.968853: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.10GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.




2022-07-23 09:28:11.496973: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.86GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-07-23 09:28:11.497043: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.86GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.


Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25


2022-07-23 09:31:19.062787: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: models/model_BiConv_term/assets


## Evaluate the model

Predict enrichment for the test set:

In [13]:
# Run the model on the encoded test sequences and wrap the output in a
# single-column data frame named `prediction`.
predicted_enrichment = pd.DataFrame(
    model.predict(np.stack(test_sequences)),
    columns = ['prediction']
)

2022-07-23 09:31:20.038705: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.03GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-07-23 09:31:20.038764: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.03GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.


Add predicted values to the dataframe:

In [14]:
# Put the predictions next to the test-set columns, then drop the raw sequences.
test_with_predictions = pd.concat([data_test, predicted_enrichment], axis = 1)
data_pred = test_with_predictions.drop(columns = 'sequence')

# Write the result as a tab-separated file without the row index.
data_pred.to_csv('terminators_pred_BiConv.tsv', sep = '\t', index = False)