In [1]:
import sys
import os
import math
import sacred
import tensorflow as tf
import numpy as np

from sacred import Experiment
from sacred.observers import FileStorageObserver
from sacred import Ingredient

import keras 
import keras_genomics
import keras.layers as k1

from keras import backend as K 
from keras.layers.core import Dropout 
from keras.layers.core import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers import Input
from keras.engine import Layer
from keras.models import Sequential 
from keras.engine.base_layer import InputSpec
from keras.models import Model
from keras.models import load_model
from keras.initializers import Initializer
from keras.utils import conv_utils
from scipy.stats import spearmanr

Using TensorFlow backend.


In [2]:
from seqdataloader.batchproducers import coordbased
import gzip
import numpy as np

class ColsInBedFile(
    coordbased.coordstovals.core.AbstractSingleNdarrayCoordsToVals):
    def __init__(self, gzipped_bed_file, **kwargs):
        super(ColsInBedFile, self).__init__(**kwargs)
        self.gzipped_bed_file = gzipped_bed_file
        coords_to_vals = {}
        for row in gzip.open(gzipped_bed_file, 'rb'):
            row = row.decode("utf-8").rstrip()
            split_row = row.split("\t")
            chrom_start_end = split_row[0]+":"+split_row[1]+"-"+split_row[2]
            vals = np.array([float(x) for x in split_row[4:]])
            coords_to_vals[chrom_start_end] = vals
        self.coords_to_vals = coords_to_vals
        
    def _get_ndarray(self, coors):
        to_return = []
        for coor in coors:
            chrom_start_end = (coor.chrom+":"
                               +str(coor.start)+"-"+str(coor.end))
            to_return.append(self.coords_to_vals[chrom_start_end])
        return np.array(to_return)
    
    
inputs_coordstovals = coordbased.coordstovals.fasta.PyfaidxCoordsToVals(
  genome_fasta_path= '/mnt/data/annotations/by_release/hg38/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta',
  center_size_to_use=1000)

targets_coordstovals = ColsInBedFile(
       gzipped_bed_file="summits_with_signal.bed.gz")
            
keras_train_batch_generator = coordbased.core.KerasBatchGenerator(
    coordsbatch_producer=coordbased.coordbatchproducers.SimpleCoordsBatchProducer(
      bed_file="train_summits_with_signal.bed.gz",
      #coord_batch_transformer=coordbased.coordbatchtransformers.ReverseComplementAugmenter(),
      batch_size=64,
      shuffle_before_epoch=True,
      seed=1234
    ),
    inputs_coordstovals=inputs_coordstovals,
    targets_coordstovals=targets_coordstovals
)


keras_valid_batch_generator = coordbased.core.KerasBatchGenerator(
    coordsbatch_producer = coordbased.coordbatchproducers.SimpleCoordsBatchProducer(
        bed_file="valid_summits_with_signal.bed.gz", 
        batch_size=64, 
        shuffle_before_epoch=True, 
        seed=1234
    ),
    inputs_coordstovals=inputs_coordstovals, 
    targets_coordstovals=targets_coordstovals
)

keras_test_batch_generator = coordbased.core.KerasBatchGenerator(
    coordsbatch_producer = coordbased.coordbatchproducers.SimpleCoordsBatchProducer(
        bed_file="test_summits_with_signal.bed.gz", 
        batch_size = 64, 
        shuffle_before_epoch = True, 
        seed = 1234
    ), 
    inputs_coordstovals = inputs_coordstovals, 
    targets_coordstovals = targets_coordstovals
)


keras_train_batch_generator_augment = coordbased.core.KerasBatchGenerator(
    coordsbatch_producer=coordbased.coordbatchproducers.SimpleCoordsBatchProducer(
      bed_file="train_summits_with_signal.bed.gz",
      coord_batch_transformer=coordbased.coordbatchtransformers.ReverseComplementAugmenter(),
      batch_size=128,
      shuffle_before_epoch=True,
      seed=1234
    ),
    inputs_coordstovals=inputs_coordstovals,
    targets_coordstovals=targets_coordstovals
)


keras_valid_batch_generator_augment = coordbased.core.KerasBatchGenerator(
    coordsbatch_producer = coordbased.coordbatchproducers.SimpleCoordsBatchProducer(
        bed_file="valid_summits_with_signal.bed.gz",
        coord_batch_transformer=coordbased.coordbatchtransformers.ReverseComplementAugmenter(),
        batch_size=128, 
        shuffle_before_epoch=True, 
        seed=1234
    ),
    inputs_coordstovals=inputs_coordstovals, 
    targets_coordstovals=targets_coordstovals
)

keras_test_batch_generator_augment = coordbased.core.KerasBatchGenerator(
    coordsbatch_producer = coordbased.coordbatchproducers.SimpleCoordsBatchProducer(
        bed_file="test_summits_with_signal.bed.gz",
        coord_batch_transformer=coordbased.coordbatchtransformers.ReverseComplementAugmenter(),
        batch_size = 128, 
        shuffle_before_epoch = True, 
        seed = 1234
    ), 
    inputs_coordstovals = inputs_coordstovals, 
    targets_coordstovals = targets_coordstovals
)

In [14]:
y_test = np.array([val for batch in keras_test_batch_generator for val in batch[1]], dtype = 'float32') 
y_test_augment = np.array([val for batch in keras_test_batch_generator_augment for val in batch[1]], dtype = 'float32')

In [4]:
class AveragePool(Initializer): 
    def __call__(self, shape, dtype = None): 
        return K.constant(1/(shape[0]), shape=shape, dtype=dtype)

class WeightDistConv(Conv1D): 
    def __init__(self, filters, 
                kernel_size, 
                strides =1, 
                padding = 'valid', 
                data_format = 'channels_last',
                dilation_rate = 1, 
                activation = None, 
                use_bias = False, 
                kernel_initializer = AveragePool(), 
                bias_initializer = 'zeros', 
                kernel_regularizer = None, 
                bias_regularizer = None, 
                activity_regularizer = None, 
                kernel_constraint = None,
                bias_constraint = None, 
                **kwargs): 
        super(WeightDistConv, self).__init__(
            filters=filters, 
            kernel_size=kernel_size, 
            strides = strides, 
            padding=padding,
            data_format=data_format,
            dilation_rate=dilation_rate,
            activation=activation,
            use_bias=False,
            kernel_initializer=kernel_initializer,
            bias_initializer=bias_initializer,
            kernel_regularizer=kernel_regularizer,
            bias_regularizer=bias_regularizer,
            activity_regularizer=activity_regularizer,
            kernel_constraint=kernel_constraint,
            bias_constraint=bias_constraint,
            **kwargs) 


    def build(self, input_shape): 
        self.bias = None
        self.filters = input_shape[-1]
        if self.data_format == 'channels_first':
            channel_axis = 1
        else:
            channel_axis = -1
        if input_shape[channel_axis] is None:
            raise ValueError('The channel dimension of the inputs '
                             'should be defined. Found `None`.')
        input_dim = input_shape[channel_axis]
        kernel_shape = self.kernel_size + (self.filters,)
        self.kernel = self.add_weight(shape=kernel_shape,
                                        initializer = self.kernel_initializer, 
                                        name ='kernel',
                                        regularizer = self.kernel_regularizer, 
                                        constraint = self.kernel_constraint)

        self.input_spec = InputSpec(ndim=3,
                                    axes={channel_axis: input_dim})
        self.num_input_channels = input_shape[1]
        self.built = True
       
      
    #Layer's logic
    def call(self, inputs):
        result = []
        for x in range(self.kernel_size[0]): 
            result.append((self.kernel[x][:,None]*K.eye(self.filters))[None,:,:])

        curr_kernel = K.concatenate(result, axis = 0)
        print("curr kernel: ", curr_kernel)
        outputs = K.conv1d(inputs, curr_kernel,
                         strides=self.strides[0],
                         padding=self.padding,
                         data_format=self.data_format,
                         dilation_rate=self.dilation_rate[0])

        if (self.activation is not None):
            outputs = self.activation(outputs)

        return outputs
  
    def compute_output_shape(self, input_shape):
        length = conv_utils.conv_output_length(input_length = self.num_input_channels, 
                                               filter_size = self.filters,
                                               padding=self.padding,
                                               stride=self.strides[0])
        return (input_shape[0],length, self.filters)

In [5]:
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import random_ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops
from tensorflow.python.framework import ops
import numbers
from tensorflow.python.framework import tensor_util
def _get_noise_shape(x, noise_shape):
  # If noise_shape is none return immediately.
  if noise_shape is None:
    return array_ops.shape(x)

  try:
    # Best effort to figure out the intended shape.
    # If not possible, let the op to handle it.
    # In eager mode exception will show up.
    noise_shape_ = tensor_shape.as_shape(noise_shape)
  except (TypeError, ValueError):
    return noise_shape

  if x.shape.dims is not None and len(x.shape.dims) == len(noise_shape_.dims):
    new_dims = []
    for i, dim in enumerate(x.shape.dims):
      if noise_shape_.dims[i].value is None and dim.value is not None:
        new_dims.append(dim.value)
      else:
        new_dims.append(noise_shape_.dims[i].value)
    return tensor_shape.TensorShape(new_dims)

  return noise_shape

class MCRCDropout(Layer):
    """Applies MC Dropout to the input.
       The applied noise vector is symmetric to reverse complement symmetry
       Class structure only slightly adapted 
    Dropout consists in randomly setting
    a fraction `rate` of input units to 0 at each update during training time,
    which helps prevent overfitting.
    Remains active ative at test time so sampling is required
    # Arguments
        rate: float between 0 and 1. Fraction of the input units to drop.
        noise_shape: 1D integer tensor representing the shape of the
            binary dropout mask that will be multiplied with the input.
            For instance, if your inputs have shape
            `(batch_size, timesteps, features)` and
            you want the dropout mask to be the same for all timesteps,
            you can use `noise_shape=(batch_size, 1, features)`.
        seed: A Python integer to use as random seed.
    # References
        - [Dropout: A Simple Way to Prevent Neural Networks from Overfitting](http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf)
    """
    def __init__(self, rate, noise_shape=None, seed=None, **kwargs):
        super(MCRCDropout, self).__init__(**kwargs)
        self.rate = min(1., max(0., rate))
        self.noise_shape = noise_shape
        self.seed = seed
        self.supports_masking = True
        
    def build(self, input_shape):
        self.num_input_chan = input_shape[2]
        super(MCRCDropout, self).build(input_shape)

    def _get_noise_shape(self, inputs):
        if self.noise_shape is None:
            return self.noise_shape

        symbolic_shape = K.shape(inputs)
        noise_shape = [symbolic_shape[axis] if shape is None else shape
                       for axis, shape in enumerate(self.noise_shape)]
        return tuple(noise_shape)

    def call(self, inputs, training=None):
        if 0. < self.rate < 1.:
            import numpy as np
            noise_shape = self._get_noise_shape(inputs)
            x = inputs
            seed = self.seed
            keep_prob = 1. - self.rate
            if seed is None:
                seed = np.random.randint(10e6)
            # the dummy 1. works around a TF bug
            # (float32_ref vs. float32 incompatibility)
            x= x*1
            name = None
            with ops.name_scope(name, "dropout", [x]) as name:
                x = ops.convert_to_tensor(x, name="x")
                if not x.dtype.is_floating:
                    raise ValueError("x has to be a floating point tensor since it's going to"
                       " be scaled. Got a %s tensor instead." % x.dtype)
                if isinstance(keep_prob, numbers.Real) and not 0 < keep_prob <= 1:
                    raise ValueError("keep_prob must be a scalar tensor or a float in the "
                       "range (0, 1], got %g" % keep_prob)
                keep_prob = ops.convert_to_tensor(
                             keep_prob, dtype=x.dtype, name="keep_prob")
                keep_prob.get_shape().assert_is_compatible_with(tensor_shape.scalar())

                # Do nothing if we know keep_prob == 1
                if tensor_util.constant_value(keep_prob) == 1:
                    return x

                noise_shape = _get_noise_shape(x, noise_shape)
                # uniform [keep_prob, 1.0 + keep_prob)
                random_tensor = keep_prob
                random_tensor += random_ops.random_uniform(
                noise_shape, seed=seed, dtype=x.dtype)
               
                # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob)
                binary_tensor = math_ops.floor(random_tensor)
                dim = binary_tensor.shape[2]//2

                symmetric_binary = K.concatenate(
                    tensors = [
                      binary_tensor[:,:,int(self.num_input_chan/2):], 
                      binary_tensor[:,:,int(self.num_input_chan/2):][::,::-1,::-1]], 
                  axis=2)
                ret = math_ops.div(x, keep_prob) * symmetric_binary
                
                return ret


    def get_config(self):
        config = {'rate': self.rate,
                  'noise_shape': self.noise_shape,
                  'seed': self.seed}
        base_config = super(MCRCDropout, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def compute_output_shape(self, input_shape):
        return input_shape

In [6]:
class RevCompSpatialDropout1D(Dropout): 
    def __init__(self, rate,**kwargs): 
        super(RevCompSpatialDropout1D, self).__init__(rate, **kwargs)
        self.seed = 3
        self.input_spec = InputSpec(ndim = 3)

    def _get_noise_shape(self, inputs): 
        input_shape = K.shape(inputs)
        noise_shape = (input_shape[0], 1, 1, int(self.num_input_chan/2)) 
        return noise_shape
        
    def build(self, input_shape):
        self.num_input_chan = input_shape[2]
        self.input_len = input_shape[1]
        super(RevCompSpatialDropout1D, self).build(input_shape)

    def call(self, inputs, training=None): 
        inputs_fwdandrevconcat = K.concatenate(
                tensors = [
                    inputs[:,:,None,:int(self.num_input_chan/2)],
                    inputs[:,:,None,int(self.num_input_chan/2):][:,:,:,::-1]],
                axis=2)

        if 0. < self.rate < 1.: 
            noise_shape = self._get_noise_shape(inputs)
            def dropped_inputs(): 
                dropped = K.dropout(inputs_fwdandrevconcat,
                                    self.rate, noise_shape, seed = self.seed)
                dropped = K.reshape(dropped, (-1, int(self.input_len), int(self.num_input_chan)))
                return K.concatenate(
                    tensors = [
                        dropped[:,:,:int(self.num_input_chan/2)],
                        dropped[:,:,int(self.num_input_chan/2):][:,:,::-1]],
                    axis=-1)

            return K.in_train_phase(dropped_inputs, inputs, training = training)

        return inputs

In [7]:
class RevCompSumPool(Layer): 
    def __init__(self, **kwargs): 
        super(RevCompSumPool, self).__init__(**kwargs)

    def build(self, input_shape):
        self.num_input_chan = input_shape[2]
        super(RevCompSumPool, self).build(input_shape)

    def call(self, inputs): 
        #divide by sqrt 2 for variance preservation
        inputs = (inputs[:,:,:int(self.num_input_chan/2)] + inputs[:,:,int(self.num_input_chan/2):][:,::-1,::-1])/(1.41421356237)
        return inputs
      
    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[1], int(input_shape[2]/2))

In [8]:
from numpy.random import seed
from tensorflow import set_random_seed
from keras.callbacks import EarlyStopping, History, ModelCheckpoint
from sacred import Experiment, utils

In [9]:
ex = Experiment("Augment Models", interactive = True)

MODEL_DIR = "/users/hannahgz/revcomp_experiments/sacred_runs/augment_spatial_dropout/"
ex.observers.append(
    sacred.observers.FileStorageObserver.create(MODEL_DIR)
)
ex.captured_out_filter = utils.apply_backspaces_and_linefeeds

In [10]:
@ex.config 
def config(): 
    # Number of convolutional layers to apply
    num_conv = 3
    
    # Filters 
    filters = 15
    
    # Kernel Size 
    kernel_size = 15 
    
    # Whether to add weight dist layer before conv layers 
    weight_dist = False 
    
    # Pool Size for Max Pooling 
    pool_size = 40 
    
    # Strides for Max Pooling 
    strides = 40 
    
    # Whether to use rc layers or normal layers 
    rev_comp = False 
    
    # Whether the model is siamese or not 
    siamese = False
    
    # Whether to use regular dropout 
    dropout = False 
    
    # Whether to use RevCompSpatialDropout1D 
    spatial_dropout = False 
    
    # Whether to use MCRCDropout 
    mc_dropout = False
    
    # Dropout Rate: used for all types of dropout
    dropout_rate = 0.2
    
    # Type of Pooling, options are: 'max', 'avg', 'weight_dist' 
    pooling = 'max'
    
    # Units in dense layer 
    units = 100
    
    # Whether or not to use a siamese model
    siamese = False
    
    #Number of epochs to train
    num_epochs = 300
    
    #Patience for early stopping 
    patience = 60 
    
    #Training seed 
    seed_num = 1000
    
    #Filename for saving the model
    filename = ""

In [16]:
import os 
os.environ['CUDA_VISIBLE_DEVICES'] = '5'

In [17]:
@ex.main
def main(
    _run, _log, num_conv, filters, kernel_size, weight_dist, pool_size, 
    strides, rev_comp, dropout, dropout_rate, spatial_dropout, 
    mc_dropout, pooling, units, augment, siamese, num_epochs, patience, 
    seed_num, filename
):
    seed(seed_num)
    set_random_seed(seed_num)
    print(keras_train_batch_generator_augment)
#     train_batch_generator = keras_train_batch_generator
#     test_batch_generaotr = keras_test_batch_generator
#     print(test_batch_generator)
#     valid_batch_generator = keras_valid_batch_generator

#     if augment: 
#         train_batch_generator = keras_train_batch_generator_augment 
#         test_batch_generaotr = keras_test_batch_generator_augment
#         valid_batch_generator = keras_valid_batch_generator_augment

    model = keras.models.Sequential()
    #Convolutional layers and dropout
    for i in range(num_conv): 
        if i > 0 and i < num_conv - 1 and weight_dist: 
            model.add(WeightDistConv(filters = filters, kernel_size = kernel_size, 
                                input_shape = keras_train_batch_generator_augment[0][0].shape[1:], padding = "same"))
        if not rev_comp: 
            model.add(k1.Conv1D(filters=filters, kernel_size=kernel_size,
                          input_shape=keras_train_batch_generator_augment[0][0].shape[1:],
                          padding="same")) 
        else: 
            model.add(keras_genomics.layers.RevCompConv1D(filters=filters, 
                                                          kernel_size=kernel_size, 
                                                          input_shape=keras_train_batch_generator_augment[0][0].shape[1:], 
                                                          padding="same"))
        model.add(k1.core.Activation("relu"))
        
        #Don't apply dropout before the pooling layer
        if i != num_conv - 1: 
            if dropout: 
                model.add(k1.Dropout(dropout_rate))
            elif spatial_dropout:
                if rev_comp: 
                    model.add(RevCompSpatialDropout1D(dropout_rate)) 
                else: 
                    model.add(k1.SpatialDropout1D(dropout_rate))
            elif mc_dropout: 
                model.add(MCRCDropout(dropout_rate))

    #Only needed with rc model
    if rev_comp: 
        model.add(RevCompSumPool())
    
    #Pooling layers
    if pooling == 'max': 
        model.add(k1.pooling.MaxPooling1D(pool_size = pool_size,padding = "same", strides = strides))
    elif pooling == 'avg':
        model.add(k1.pooling.AveragePooling1D(pool_size = pool_size, padding = "same", strides = strides))
    elif pooling == 'weight_dist':
        model.add(WeightDistConv(filters = filters, kernel_size = kernel_size, 
                                input_shape = keras_train_batch_generator_augment[0][0].shape[1:], padding = "same"))
    
    model.add(Flatten())
    
    #Fully-connected layers 
    model.add(keras_genomics.layers.core.Dense(units = units, activation = "relu"))
    model.add(keras_genomics.layers.core.Dense(units = 1))  
    
    if siamese: 
        main_input = Input(shape=keras_train_batch_generator_augment[0][0].shape[1:])
        rev_input = Input(shape=keras_train_batch_generator_augment[0][0].shape[1:])
        rev_input = RevComp()(main_input)
        main_output = model(main_input)
        rev_output = model(rev_input)
        avg = k1.Average()([main_output, rev_output])
        model = Model(inputs = main_input, outputs = avg)

    output_dir = os.path.join(MODEL_DIR, str(_run._id))

    model.summary()
    model.compile(optimizer="adam", loss="mean_squared_error")
    early_stopping_callback = keras.callbacks.EarlyStopping(
                              monitor = 'val_loss',
                              patience = patience,
                              restore_best_weights=True)
    model.fit_generator(generator = keras_train_batch_generator_augment, 
                           epochs = num_epochs, callbacks = [early_stopping_callback],
                           validation_data = keras_valid_batch_generator_augment)
    model.set_weights(early_stopping_callback.best_weights)
    
    filename = (filename + '_%s.h5' % seed_num, str(seed_num))[0]
    model.save(filename)
    
    y_pred = model.predict_generator(keras_test_batch_generator_augment)

    
    rho, pval = spearmanr(y_test_augment, y_pred)
    print('correlation ', rho)
    print('p-value', pval)
    _run.log_scalar('correlation', rho)
    _run.log_scalar('p-value', pval)

In [None]:
ex.run(config_updates = {'augment': True, 'spatial_dropout': True, 'filename': 'augment_spatial_dropout'})

W0725 12:09:58.066308 140166411724544 initialize.py:192] Added new config entry: "augment"


<seqdataloader.batchproducers.coordbased.core.KerasBatchGenerator object at 0x7f7a9385c128>
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_4 (Conv1D)            (None, 1000, 15)          915       
_________________________________________________________________
activation_4 (Activation)    (None, 1000, 15)          0         
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 1000, 15)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 1000, 15)          3390      
_________________________________________________________________
activation_5 (Activation)    (None, 1000, 15)          0         
_________________________________________________________________
spatial_dropout1d_4 (Spatial (None, 1000, 15)          0         
__________________________________________________

In [None]:
ex.run(config_updates = {'augment': True, 'spatial_dropout': True, 'weight_dist': True, 'filename': 'augment_spatial_dropout_weight_dist'})

In [None]:
ex.run(config_updates = {'augment': True, 'dropout': True})