In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import time

import sys
sys.path.append("../")
import fastISM

In [2]:
!nvidia-smi

Tue Feb  1 00:38:02 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.73.01    Driver Version: 460.73.01    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    52W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
!nvidia-smi -L

GPU 0: A100-SXM4-40GB (UUID: GPU-b593d456-e6a0-80b7-b80e-c06dab2dca6a)


In [4]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Mon_May__3_19:15:13_PDT_2021
Cuda compilation tools, release 11.3, V11.3.109
Build cuda_11.3.r11.3/compiler.29920130_0


In [5]:
# get test seqs (randomly sampled from hg38 chr1-22)
# !wget http://mitra.stanford.edu/kundaje/surag/fastISM/test_long.seq.npy

In [2]:
# load the test sequences (see the commented-out wget in the previous cell);
# the trailing expression displays the array's shape as the cell output
seqs = np.load("test_long.seq.npy")
seqs.shape

(100, 196608, 4)

In [3]:
def time_ism(ism_model, batch_sizes, seqlen):
    """Time a four-base ISM pass of `ism_model` for each batch size.

    For every batch size `b`, the first `b` entries of the notebook-level
    `seqs` array (cut to the first `seqlen` positions) are converted to a
    tensor, scored once as an untimed warm-up, and then scored four times
    under the timer -- once per one-hot replacement base. One
    "BATCH / TIME / PER 100" line is printed per batch size, followed by the
    lowest per-100-sequences time seen (nothing is printed for an empty
    `batch_sizes`).

    Args:
        ism_model: callable ISM wrapper exposing `.model.inputs`; called as
            `ism_model(x, replace_with=...)`.
        batch_sizes: iterable of positive ints, each at most `seqs.shape[0]`.
        seqlen: number of leading positions of every sequence to score.

    Returns:
        None. Results are reported via `print` only.

    Raises:
        AssertionError: if a batch size is not in (0, seqs.shape[0]].
    """
    # one-hot bases substituted during the timed passes (same order as before)
    replacements = ([0,0,0,1], [0,0,1,0], [0,1,0,0], [1,0,0,0])

    times = []
    per_100 = []
    for b in batch_sizes:
        assert 0 < b <= seqs.shape[0], \
            "batch size {} must be in (0, {}]".format(b, seqs.shape[0])

        # convert once, outside the timed region; the tensor is reused below
        x = tf.constant(seqs[:b, :seqlen], dtype=ism_model.model.inputs[0].dtype)

        # dry run -- required as first batch slower for setting up
        #            and variable batch sizes (due to varying number
        #            of seqs that need to be mutated at a position)
        #            also slows down first call
        ism_model(x, [0,0,0,1])

        # perf_counter is monotonic and high-resolution, unlike time.time,
        # which can jump if the system clock is adjusted mid-benchmark
        t = time.perf_counter()

        # NOTE: computations are only performed at those positions
        # at which the existing base != replace_with
        for base in replacements:
            ism_model(x, replace_with=base)

        times.append(time.perf_counter()-t)

        per_100.append((times[-1]/b)*100)
        print("BATCH: {}\tTIME: {:.2f}\tPER 100: {:.2f}".format(b, times[-1], (times[-1]/b)*100))

    # min() of an empty list raises ValueError, so only summarise real runs
    if per_100:
        print("BEST PER 100: {:.2f}".format(min(per_100)))

In [4]:
# modified from https://github.com/deepmind/deepmind-research/blob/master/enformer/enformer.py

class AttentionPooling1D(tf.keras.layers.Layer):
    """Softmax-weighted pooling over non-overlapping windows of the length axis.

    Each window of `pool_size` consecutive positions is collapsed to a single
    position by a weighted sum. The weights are a softmax, taken across the
    window, of `inputs @ w`, where `w` is a trainable
    (num_features, num_features) matrix created in `build`.

    The `strides`, `padding` and `data_format` attributes mirror the
    attributes of a regular pooling layer and are exported by `get_config`
    (NOTE(review): presumably read by fastISM for layers registered as
    pooling layers -- confirm against fastISM).
    """
    def __init__(self,
                 pool_size: int = 2,
                 per_channel: bool = True,
                 w_init_scale: float = 2.0,
                 **kwargs):
        """Create the pooling layer.

        Args:
            pool_size: Pooling size, same as in Max/AvgPooling. Also used as
                the stride.
            per_channel: Stored and exported by `get_config` only; `build`
                and `call` in this implementation do not read it.
            w_init_scale: Stored and exported by `get_config` only; the
                weight is initialised with "random_normal" regardless.
            **kwargs: `name`, `dtype` and `trainable` are forwarded to the
                base layer. Any other key is accepted and ignored, exactly
                as before, so existing callers keep working.
        """
        # Previously every keyword (including `name`) was silently dropped.
        # Forward only the standard base-layer options; forwarding unknown
        # keys would make the base constructor raise.
        base_kwargs = {key: kwargs[key]
                       for key in ("name", "dtype", "trainable")
                       if key in kwargs}
        super().__init__(**base_kwargs)
        self.pool_size = pool_size
        self._per_channel = per_channel
        self._w_init_scale = w_init_scale
        self._logit_linear = None

        # need for pooling layer
        self.strides = self.pool_size
        self.padding = "valid" # here we are using padding of 2 on multiples of 2 so it's ok
        self.data_format = "channels_last"

    def build(self, input_shape):
        """Create the (num_features, num_features) logit weight matrix."""
        # unpacking also enforces a rank-3 (batch, length, features) input
        _, _, num_features = input_shape
        self.w = self.add_weight(
            shape=(num_features, num_features),
            initializer="random_normal",
            trainable=True,
        )

    def get_config(self):
        """Return the base config extended with this layer's attributes.

        The key names are kept exactly as they were, since external code may
        look them up by name.
        """
        config = super().get_config()
        config.update({
            "pool_size": self.pool_size,
            "_per_channel": self._per_channel,
            "_w_init_scale": self._w_init_scale,
            "_logit_linear": self._logit_linear,
            "data_format": self.data_format,
            "strides": self.strides,
            "padding": self.padding
        })
        return config

    @tf.function(jit_compile=True)
    def call(self, inputs):
        """Pool `inputs` of shape (batch, length, num_features).

        Returns a tensor of shape
        (batch, length // pool_size, num_features), or `inputs` unchanged
        when the static length is unknown.
        """
        _, length, num_features = inputs.shape

        # identity test: `== None` relies on __eq__ and is not a reliable None check
        if length is None: # this can happen at when creating fast_ism_model
            return inputs # don't do anything for now

        # split the length axis into (windows, positions-within-window)
        windows = tf.reshape(
            inputs,
            (-1, length // self.pool_size, self.pool_size, num_features))
        # softmax across the positions of each window, then weighted sum
        weights = tf.nn.softmax(tf.matmul(windows, self.w), axis=-2)
        return tf.reduce_sum(windows * weights, axis=-2)

In [5]:
class PointwiseResConv(keras.layers.Layer):
    """Point-wise (kernel size 1) convolution residual layer.

    Computes `inputs + Conv1D(c, 1)(gelu(BatchNorm(inputs)))`. Because every
    step acts on each position independently, the layer is essentially
    see-through along the length axis. `c` must equal the number of input
    channels for the residual addition to be valid.
    """
    def __init__(self, c, **kwargs):
        """Create the sub-layers.

        Args:
            c: number of output channels of the point-wise convolution.
            **kwargs: `name`, `dtype` and `trainable` are forwarded to the
                base layer; any other key is accepted and ignored, as before.
        """
        # Previously every keyword (including `name`) was silently dropped.
        base_kwargs = {key: kwargs[key]
                       for key in ("name", "dtype", "trainable")
                       if key in kwargs}
        super(PointwiseResConv, self).__init__(**base_kwargs)
        self.c = c
        self.bn = keras.layers.BatchNormalization()
        self.act = keras.layers.Activation('gelu')
        self.conv = keras.layers.Conv1D(self.c, 1)
        # built once here rather than instantiating a new Add on every call
        self.residual_add = keras.layers.Add()

    def get_config(self):
        """Return the base config extended with the channel count `c`."""
        config = super().get_config()
        config.update({
            "c": self.c,
        })
        return config

    def call(self, inputs):
        """Apply BatchNorm -> GELU -> 1x1 conv and add the result to `inputs`."""
        x = self.bn(inputs)
        x = self.act(x)
        x = self.conv(x)
        return self.residual_add([x, inputs])

In [6]:
def conv_block(c, w, padding, x_input):
    """Apply BatchNormalization -> GELU -> Conv1D to `x_input`.

    Args:
        c: number of convolution filters.
        w: convolution kernel width.
        padding: padding mode handed to Conv1D.
        x_input: tensor the three freshly created layers are applied to.

    Returns:
        The output tensor of the Conv1D layer.
    """
    # layers are instantiated in the same order as they are applied
    stages = (
        keras.layers.BatchNormalization(),
        keras.layers.Activation('gelu'),
        keras.layers.Conv1D(c, w, padding=padding),
    )
    out = x_input
    for stage in stages:
        out = stage(out)
    return out

def enformer(inlen=196608, C=1536, num_conv=6, num_tx=11, crop_each=320, out_dim=5313):
    """Build an Enformer-like Keras model used for benchmarking ISM.

    Layout: stem convolution (width 15) with a point-wise residual block and
    pooling -> `num_conv` conv-tower stages (conv block, point-wise residual
    block, pooling) -> identity stop layer -> `num_tx` pre-norm
    attention + feed-forward blocks, each with a residual connection ->
    optional cropping -> point-wise conv head with softplus outputs.

    Every pooling step halves the length, so the trunk runs on
    `inlen / 2**(num_conv + 1)` positions.

    Args:
        inlen: input sequence length; the input has 4 channels.
        C: channel width of the transformer trunk. The stem uses `C//2`
            channels and the tower grows geometrically from there; its last
            stage must produce `C` channels for the residual additions
            against `Dense(C)` to be shape-compatible.
        num_conv: number of conv-tower stages after the stem.
        num_tx: number of attention + feed-forward blocks.
        crop_each: positions cropped from each end before the head
            (skipped when not positive).
        out_dim: number of output channels.

    Returns:
        A `keras.Model` mapping the (inlen, 4) input to the head output.
    """
    inp = keras.layers.Input((inlen,4))

    x = keras.layers.Conv1D(C//2, 15, padding='same')(inp)
    x = PointwiseResConv(C//2)(x)
    x = AttentionPooling1D(2, True)(x)

    # channel counts C//2 * 2**(i/num_conv) for i = 1..num_conv, truncated to int
    tower_chans = [int((C//2)*(2**(1/num_conv))**i) for i in range(1,num_conv+1)]
    for ci in tower_chans:
        x = conv_block(ci, 5, 'same', x)
        x = PointwiseResConv(ci)(x)
        x = AttentionPooling1D(2, True)(x)

    # just an identity layer, using this as STOP_LAYER
    # there's an edge case that needs to be taken care of
    # when the stop layer is within a branch
    x = keras.layers.Layer()(x)

    for i in range(num_tx):
        # attention sub-block with residual connection
        y = keras.layers.LayerNormalization()(x)

        # using more heads than that in paper
        # this is because keras MHA doesn't apply positional
        # encoding, so increasing number of heads here matches
        # the time taken by DeepMind's MHA block (timed separately, not shown here)
        y = keras.layers.MultiHeadAttention(num_heads=20, key_dim=64, value_dim=C//8)(y, y)
        x = keras.layers.Add()([x,y])

        # feed-forward sub-block with residual connection
        y = keras.layers.LayerNormalization()(x)
        y = keras.layers.Dense(C*2, activation='relu')(y)
        y = keras.layers.Dense(C)(y)
        # the sum is the block output; it was previously overwritten by a
        # trailing `x=y`, which discarded this residual connection
        x = keras.layers.Add()([x,y])

    if crop_each>0:
        x = keras.layers.Cropping1D(crop_each)(x)

    x = conv_block(C*2, 1, 'valid', x)
    x = keras.layers.Activation('gelu')(x)

    x = keras.layers.Conv1D(out_dim, 1, padding='valid', activation='softplus')(x)

    m = keras.Model(inputs=inp, outputs=x)
    return m

In [7]:
# register the custom layers with fastISM by class name:
#   AttentionPooling1D -> added to the pooling-layer set
#   "Layer" (the identity layer enformer() inserts after the conv tower)
#                      -> added to the stop-layer set
#   PointwiseResConv   -> added to the see-through-layer set
fastISM.fast_ism_utils.POOLING_LAYERS.add("AttentionPooling1D")
fastISM.fast_ism_utils.STOP_LAYERS.add("Layer")
fastISM.fast_ism_utils.SEE_THROUGH_LAYERS.add("PointwiseResConv")

### Full Enformer

In [8]:
# full-size model; out_dim=5313 and num_tx=11 are also enformer()'s defaults
model = enformer(out_dim=5313, num_tx=11)

2022-02-04 21:42:34.532956: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-02-04 21:42:35.042297: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38444 MB memory:  -> device: 0, name: A100-SXM4-40GB, pci bus id: 0000:00:04.0, compute capability: 8.0


In [9]:
model.count_params()

354722383

In [10]:
# random inputs used only to time plain forward passes in this section;
# at most 7 sequences are ever sliced from it, so 100 (the size used in the
# other sections) avoids allocating 10x more random data than is needed
loltf = tf.constant(np.random.random((100,196608,4)), dtype=model.input.dtype)

In [11]:
model(loltf[:6], training=False).numpy().shape

2022-02-04 21:42:47.907581: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8200
2022-02-04 21:42:49.820850: I tensorflow/stream_executor/cuda/cuda_blas.cc:1774] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2022-02-04 21:42:49.884603: I tensorflow/compiler/xla/service/service.cc:171] XLA service 0x55b0b21d5ad0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2022-02-04 21:42:49.884641: I tensorflow/compiler/xla/service/service.cc:179]   StreamExecutor device (0): A100-SXM4-40GB, Compute Capability 8.0
2022-02-04 21:42:49.891528: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:237] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2022-02-04 21:42:55.382691: I tensorflow/compiler/jit/xla_compilation_cache.cc:351] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.




(6, 896, 5313)

In [12]:
%timeit model(loltf[:6], training=False).numpy().shape

680 ms ± 1.18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
%timeit model(loltf[:7], training=False).numpy().shape

1 s ± 1.09 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


**Scoring 100bp (fastISM)**

In [13]:
# fastISM wrapper restricted to the central 100 positions of the input:
# change_ranges holds one width-1 (start, end) range per position.
# NOTE(review): test_correctness=False presumably skips fastISM's built-in
# comparison against naive ISM at construction time -- confirm.
fast_ism_model = fastISM.FastISM(model,
                                 test_correctness=False, 
                                 change_ranges=[(x,x+1) for x in range(model.input_shape[1]//2-50, model.input_shape[1]//2 + 50)])

In [15]:
time_ism(fast_ism_model, [10], 196608)

BATCH: 10	TIME: 164.15	PER 100: 1641.52
BEST PER 100: 1641.52


**Scoring 100bp (NaiveISM)**

In [16]:
# naive-ISM baseline over the same central 100 positions
# (one width-1 (start, end) range per position)
naive_ism_model = fastISM.NaiveISM(model, change_ranges=[(x,x+1) for x in range(model.input_shape[1]//2-50, model.input_shape[1]//2 + 50)])

In [17]:
time_ism(naive_ism_model, [6,5,7], 196608)

BATCH: 6	TIME: 227.50	PER 100: 3791.72
BATCH: 5	TIME: 197.11	PER 100: 3942.30
BATCH: 7	TIME: 281.97	PER 100: 4028.08
BEST PER 100: 3791.72


### Full Enformer, Small output

In [8]:
# same architecture as the full model (all other arguments left at their
# defaults), but with only 20 output channels
model = enformer(out_dim=20)

2022-02-04 22:55:58.212698: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-02-04 22:55:58.728247: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38444 MB memory:  -> device: 0, name: A100-SXM4-40GB, pci bus id: 0000:00:04.0, compute capability: 8.0


In [9]:
loltf = tf.constant(np.random.random((100,196608,4)), dtype=model.input.dtype)

In [10]:
model(loltf[:6], training=False).numpy().shape

2022-02-04 22:56:07.254938: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8200
2022-02-04 22:56:09.189477: I tensorflow/stream_executor/cuda/cuda_blas.cc:1774] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2022-02-04 22:56:09.253316: I tensorflow/compiler/xla/service/service.cc:171] XLA service 0x55ccc643e6b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2022-02-04 22:56:09.253353: I tensorflow/compiler/xla/service/service.cc:179]   StreamExecutor device (0): A100-SXM4-40GB, Compute Capability 8.0
2022-02-04 22:56:09.260807: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:237] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2022-02-04 22:56:14.762937: I tensorflow/compiler/jit/xla_compilation_cache.cc:351] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.




(6, 896, 20)

In [11]:
%timeit model(loltf[:6], training=False).numpy().shape

597 ms ± 449 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
%timeit model(loltf[:7], training=False).numpy().shape

904 ms ± 521 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


**Scoring 1000bp (fastISM)**

In [11]:
# fastISM wrapper restricted to the central 1000 positions of the input:
# change_ranges holds one width-1 (start, end) range per position
fast_ism_model = fastISM.FastISM(model, 
                                 test_correctness=False, 
                                 change_ranges=[(x,x+1) for x in range(model.input_shape[1]//2-500, model.input_shape[1]//2 + 500)])

In [12]:
time_ism(fast_ism_model, [11], 196608)

2022-02-04 22:21:11.690892: W tensorflow/core/kernels/gpu_utils.cc:49] Failed to allocate memory for convolution redzone checking; skipping this check. This is benign and only means that we won't check cudnn for out-of-bounds reads and writes. This message will only be printed once.


BATCH: 11	TIME: 1109.47	PER 100: 10086.06
BEST PER 100: 10086.06


**Scoring 100bp (NaiveISM)**

In [12]:
# naive-ISM baseline over the central 100 positions
# (one width-1 (start, end) range per position)
naive_ism_model = fastISM.NaiveISM(model, 
                                   change_ranges=[(x,x+1) for x in \
                                                  range(model.input_shape[1]//2-50, 
                                                        model.input_shape[1]//2 + 50)])

In [14]:
time_ism(naive_ism_model, [6,5], 196608)

BATCH: 6	TIME: 193.76	PER 100: 3229.29
BATCH: 5	TIME: 168.71	PER 100: 3374.21
BEST PER 100: 3229.29


**Scoring 1000bp (NaiveISM)**

In [21]:
# naive-ISM baseline over the central 1000 positions
# (one width-1 (start, end) range per position)
naive_ism_model = fastISM.NaiveISM(model, 
                                   change_ranges=[(x,x+1) for x in \
                                                  range(model.input_shape[1]//2 - 500, 
                                                        model.input_shape[1]//2 + 500)])

In [22]:
time_ism(naive_ism_model, [6], 196608)

BATCH: 6	TIME: 1918.78	PER 100: 31979.73
BEST PER 100: 31979.73


**Correctness**

In [13]:
# fastISM scores for the first 5 random inputs; no replace_with is passed,
# so the wrapper's default replacement is used
fism_out = fast_ism_model(loltf[:5])

In [14]:
# naive-ISM scores for the same 5 inputs, used as the reference below
nism_out = naive_ism_model(loltf[:5])

In [17]:
# np.isclose scales its relative tolerance by the second argument only, so
# the comparison is not symmetric; check both argument orders
np.all(np.isclose(fism_out, nism_out, atol=1e-2)), np.all(np.isclose(nism_out, fism_out, atol=1e-2))

(True, True)

In [16]:
# fraction of output elements that agree at the tighter atol=1e-3
# (again evaluated in both argument orders)
np.mean(np.isclose(fism_out, nism_out, atol=1e-3)), np.mean(np.isclose(nism_out, fism_out, atol=1e-3))

(0.99999921875, 0.99999921875)

In [18]:
fism_out[0,0,0]

array([0.44960645, 0.5226613 , 0.71340764, 0.76874745, 0.6769167 ,
       0.7781925 , 0.88031006, 0.5711825 , 0.8310388 , 0.5729828 ,
       0.73103267, 0.4928353 , 0.82416934, 0.52590275, 0.6440538 ,
       0.71847594, 0.65493226, 0.89954627, 0.710739  , 0.525694  ],
      dtype=float32)

In [19]:
nism_out[0,0,0]

array([0.44960108, 0.5222728 , 0.71331096, 0.7684644 , 0.6770802 ,
       0.77832896, 0.8805904 , 0.5711045 , 0.83050287, 0.57302916,
       0.731101  , 0.49275845, 0.8243413 , 0.526146  , 0.6440768 ,
       0.7184003 , 0.65477055, 0.89942145, 0.7109796 , 0.52549505],
      dtype=float32)

In [20]:
# rank correlation between the flattened naive-ISM and fastISM outputs
import scipy.stats
scipy.stats.spearmanr(nism_out.ravel(), fism_out.ravel())

SpearmanrResult(correlation=0.9999848592062245, pvalue=0.0)

### Enformer 4 Tx, small output

In [8]:
# reduced model: inlen, C and crop_each equal enformer()'s defaults; only
# num_tx (4 attention blocks) and out_dim (20 output channels) differ
model = enformer(inlen=196608, C=1536, num_tx=4, crop_each=320, out_dim=20)

2022-02-04 23:41:49.507717: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-02-04 23:41:50.022467: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38444 MB memory:  -> device: 0, name: A100-SXM4-40GB, pci bus id: 0000:00:04.0, compute capability: 8.0


In [9]:
loltf = tf.constant(np.random.random((100,196608,4)), dtype=model.input.dtype)

In [10]:
model(loltf[:6], training=False).numpy().shape

2022-02-04 23:41:53.246607: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8200
2022-02-04 23:41:55.201712: I tensorflow/stream_executor/cuda/cuda_blas.cc:1774] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2022-02-04 23:41:55.266975: I tensorflow/compiler/xla/service/service.cc:171] XLA service 0x556b3da73b40 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2022-02-04 23:41:55.267012: I tensorflow/compiler/xla/service/service.cc:179]   StreamExecutor device (0): A100-SXM4-40GB, Compute Capability 8.0
2022-02-04 23:41:55.274264: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:237] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2022-02-04 23:42:00.782316: I tensorflow/compiler/jit/xla_compilation_cache.cc:351] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.




(6, 896, 20)

In [13]:
%timeit model(loltf[:6], training=False).numpy().shape

500 ms ± 636 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
# sudden jump, not sure why
%timeit model(loltf[:7], training=False).numpy().shape

790 ms ± 434 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


**Scoring 1000bp (fastISM)**

In [15]:
# fastISM wrapper restricted to the central 1000 positions of the input:
# change_ranges holds one width-1 (start, end) range per position
fast_ism_model = fastISM.FastISM(model, test_correctness=False, 
                                 change_ranges=[(x,x+1) for x in range(model.input_shape[1]//2 - 500, 
                                                                       model.input_shape[1]//2 + 500)])

In [16]:
time_ism(fast_ism_model, [12], 196608)

2022-02-04 23:43:11.845525: W tensorflow/core/kernels/gpu_utils.cc:49] Failed to allocate memory for convolution redzone checking; skipping this check. This is benign and only means that we won't check cudnn for out-of-bounds reads and writes. This message will only be printed once.


BATCH: 12	TIME: 585.38	PER 100: 4878.16
BEST PER 100: 4878.16


**Scoring 100bp (NaiveISM)**

In [17]:
# naive-ISM baseline over the central 100 positions
# (one width-1 (start, end) range per position)
naive_ism_model = fastISM.NaiveISM(model, 
                                   change_ranges=[(x,x+1) for x in range(model.input_shape[1]//2 - 50, 
                                                                         model.input_shape[1]//2 + 50)])

In [18]:
time_ism(naive_ism_model, [6, 5], 196608)

BATCH: 6	TIME: 161.99	PER 100: 2699.83
BATCH: 5	TIME: 141.66	PER 100: 2833.28
BEST PER 100: 2699.83


**Scoring 1000bp (NaiveISM)**

In [19]:
# naive-ISM baseline over the central 1000 positions
# (one width-1 (start, end) range per position)
naive_ism_model = fastISM.NaiveISM(model, 
                                   change_ranges=[(x,x+1) for x in range(model.input_shape[1]//2 - 500, 
                                                                         model.input_shape[1]//2 + 500)])

In [20]:
time_ism(naive_ism_model, [6], 196608)

BATCH: 6	TIME: 1601.63	PER 100: 26693.76
BEST PER 100: 26693.76
