In [2]:
from __future__ import division, print_function

In [3]:
from seqdataloader.batchproducers import coordbased
import gzip
import numpy as np
class SiameseAugmenter(coordbased.coordbatchtransformers.AbstractCoordBatchTransformer):
  """Batch transformer that replaces every coordinate by its reverse complement.

  Note that the returned batch contains ONLY the reverse-complemented
  coordinates; the original coordinates are not kept alongside them.
  """
  def __call__(self, coords):
    """Return a new list holding ``get_revcomp()`` of each entry in ``coords``."""
    revcomp_coords = []
    for coord in coords:
      revcomp_coords.append(coord.get_revcomp())
    return revcomp_coords

In [4]:
class ColsInBedFile(
    coordbased.coordstovals.core.AbstractSingleNdarrayCoordsToVals):
    """Look up target values for coordinates from a gzipped, tab-separated BED file.

    The whole file is read once at construction time. Each row is keyed by
    its first three columns (formatted as ``chrom:start-end``) and mapped to
    a float array built from column index 4 onwards (the column at index 3
    is ignored).

    Args:
        gzipped_bed_file: path to the gzipped BED file to load.
        **kwargs: forwarded unchanged to the parent class constructor.
    """

    @staticmethod
    def _region_key(chrom, start, end):
        """Build the dictionary key shared by the loader and the lookup."""
        return chrom + ":" + str(start) + "-" + str(end)

    def __init__(self, gzipped_bed_file, **kwargs):
        super(ColsInBedFile, self).__init__(**kwargs)
        self.gzipped_bed_file = gzipped_bed_file
        coords_to_vals = {}
        # The context manager guarantees the file handle is closed, even if
        # a malformed row raises while parsing.
        with gzip.open(gzipped_bed_file, 'rb') as bed_fh:
            for row in bed_fh:
                row = row.decode("utf-8").rstrip()
                if not row:
                    # Tolerate blank / trailing empty lines.
                    continue
                split_row = row.split("\t")
                chrom_start_end = self._region_key(
                    split_row[0], split_row[1], split_row[2])
                vals = np.array([float(x) for x in split_row[4:]])
                coords_to_vals[chrom_start_end] = vals
        self.coords_to_vals = coords_to_vals

    def _get_ndarray(self, coors):
        """Return the stored value arrays for ``coors``, stacked in order.

        Args:
            coors: iterable of objects exposing ``chrom``, ``start`` and
                ``end`` attributes.

        Returns:
            ``np.array`` with one entry per coordinate, in input order.

        Raises:
            KeyError: if a coordinate has no matching row in the BED file.
        """
        to_return = []
        for coor in coors:
            chrom_start_end = self._region_key(
                coor.chrom, coor.start, coor.end)
            try:
                to_return.append(self.coords_to_vals[chrom_start_end])
            except KeyError:
                # Same exception type as before, but say which region and
                # which file were involved.
                raise KeyError(
                    "No values for region %s in %s"
                    % (chrom_start_end, self.gzipped_bed_file))
        return np.array(to_return)
    
    
# Reference genome used to turn coordinates into model inputs.
genome_fasta_path = '/mnt/data/annotations/by_release/hg38/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta'
# BED file holding the signal values for all summits (train/valid/test).
signal_bed_path = "summits_with_signal.bed.gz"

# Inputs: sequence taken from the genome FASTA, using a 1000-wide centre.
inputs_coordstovals = coordbased.coordstovals.fasta.PyfaidxCoordsToVals(
    genome_fasta_path=genome_fasta_path,
    center_size_to_use=1000,
)

# Targets: the value columns stored in the BED file.
targets_coordstovals = ColsInBedFile(gzipped_bed_file=signal_bed_path)
            
# Training batches: 64 coordinates each, reshuffled before every epoch
# with a fixed seed.
train_coords_producer = coordbased.coordbatchproducers.SimpleCoordsBatchProducer(
    bed_file="train_summits_with_signal.bed.gz",
    batch_size=64,
    shuffle_before_epoch=True,
    seed=1234,
)
keras_train_batch_generator = coordbased.core.KerasBatchGenerator(
    coordsbatch_producer=train_coords_producer,
    inputs_coordstovals=inputs_coordstovals,
    targets_coordstovals=targets_coordstovals,
)

# Validation batches: same settings as training, read from the validation
# BED file.
valid_coords_producer = coordbased.coordbatchproducers.SimpleCoordsBatchProducer(
    bed_file="valid_summits_with_signal.bed.gz",
    batch_size=64,
    shuffle_before_epoch=True,
    seed=1234,
)
keras_valid_batch_generator = coordbased.core.KerasBatchGenerator(
    coordsbatch_producer=valid_coords_producer,
    inputs_coordstovals=inputs_coordstovals,
    targets_coordstovals=targets_coordstovals,
)

# Test batches. Shuffling is disabled on purpose: the true labels and the
# model predictions are collected in two separate passes over this generator,
# so the batch order must be the same on every pass (including when the
# prediction cell is re-run). Shuffling brings no benefit at evaluation time.
keras_test_batch_generator = coordbased.core.KerasBatchGenerator(
    coordsbatch_producer = coordbased.coordbatchproducers.SimpleCoordsBatchProducer(
        bed_file="test_summits_with_signal.bed.gz", 
        batch_size = 64, 
        shuffle_before_epoch = False, 
        seed = 1234
    ), 
    inputs_coordstovals = inputs_coordstovals, 
    targets_coordstovals = targets_coordstovals
)

In [5]:
# Gather the targets (element 1 of every batch) of the whole test set,
# one row per example, in generator order.
test_targets = []
for test_batch in keras_test_batch_generator:
    test_targets.extend(test_batch[1])
y_test = np.array(test_targets, dtype='float32')

In [6]:
import keras 
import keras_genomics
import numpy as np
import keras.layers as k1

from keras import backend as K 
from keras.layers.core import Dropout 
from keras.layers.core import Flatten
from keras.layers import Input
from keras.engine import Layer
from keras.models import Sequential 
from keras.engine.base_layer import InputSpec
from keras.models import Model
from keras.models import load_model

In [7]:
from numpy.random import seed
from tensorflow import set_random_seed
from keras.callbacks import EarlyStopping, History, ModelCheckpoint

# Convolution hyper-parameters and the length of the input sequences.
kernel_size = 15
filters = 15
input_length = 1000

# Seed both NumPy and TensorFlow with the same value; the value is also
# reused later in the name of the saved model file.
seed_num = 1000
seed(seed_num)
set_random_seed(seed_num)

In [10]:
class RevComp(Layer):
    """Keras layer that reverses its input along axes 1 and 2.

    Presumably intended for one-hot encoded DNA of shape
    (batch, length, channels) whose channel order makes a channel flip equal
    to complementing the bases -- TODO confirm the channel order.
    The layer has no weights and keeps the input shape.
    """

    def __init__(self, **kwargs):
        super(RevComp, self).__init__(**kwargs)

    def build(self, input_shape):
        # Nothing to create; defer to the base class.
        super(RevComp, self).build(input_shape)

    def call(self, inputs):
        """Return ``inputs`` with axis 1 and axis 2 both reversed."""
        flipped = slice(None, None, -1)
        return inputs[:, flipped, flipped]

    def compute_output_shape(self, input_shape):
        # Reversing axes does not change any dimension.
        return input_shape

In [9]:
import os

# Expose only the GPU with index 2 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [None]:
# Shape of a single input example, read ONCE from the first training batch
# (indexing the generator loads a batch, so avoid repeating it).
seq_input_shape = keras_train_batch_generator[0][0].shape[1:]

# Shared sub-network applied to both the forward and the reversed input.
s_model = Sequential([
    k1.Conv1D(filters=filters, kernel_size=kernel_size,
            input_shape=seq_input_shape, padding="same"), 
#     k1.BatchNormalization(), 
    k1.core.Activation("relu"),
#     k1.Conv1D(filters=filters, kernel_size=kernel_size,
#               padding="same"), 
#     k1.BatchNormalization(), 
#     k1.core.Activation("relu"),
#     k1.Conv1D(filters=filters, kernel_size=kernel_size,
#               padding="same"), 
#     k1.BatchNormalization(), 
#     k1.core.Activation("relu"),
    k1.pooling.MaxPooling1D(pool_size=40,padding="same",
                                               strides=40), 
    Flatten(), 
#     k1.Dense(units = 100, activation = "relu"),
    k1.Dense(units = 1)
], name = "shared_layers")

s_model.summary()

# The model has a single input; the second branch is derived from it by the
# RevComp layer (no separate Input placeholder is needed).
main_input = Input(shape=seq_input_shape)
rev_input = RevComp()(main_input)

# Same weights for both branches, then average the two scalar outputs.
main_output = s_model(main_input)
rev_output = s_model(rev_input)

avg = k1.Average()([main_output, rev_output])
siamese_model = Model(inputs = main_input, outputs = avg)

# NOTE(review): `merged` is not connected to `siamese_model` and is not used
# in this cell; kept only in case other cells reference it.
merged = keras.layers.concatenate([main_output, rev_output])

siamese_model.compile(optimizer="adam", loss="mean_squared_error")
early_stopping_callback = keras.callbacks.EarlyStopping(
                              monitor='val_loss',
                              patience= 60,
                              restore_best_weights=True)
siamese_model.fit_generator(generator= keras_train_batch_generator, 
                           epochs=300, callbacks=[early_stopping_callback],
                           validation_data=keras_valid_batch_generator)
# Explicitly load the best weights seen during training; presumably needed
# because the callback may only restore them when training stops early --
# TODO confirm for the installed Keras version.
siamese_model.set_weights(early_stopping_callback.best_weights)  

# Save under a seed-specific name and reload to check that the file (with
# the custom RevComp layer) round-trips.
siamese_filename = 'siamese_%s.h5' % seed_num
siamese_model.save(siamese_filename)
custom_objects = {"RevComp":RevComp}
siamese_model_final = load_model(siamese_filename, custom_objects=custom_objects)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_3 (Conv1D)            (None, 1000, 15)          915       
_________________________________________________________________
activation_3 (Activation)    (None, 1000, 15)          0         
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 25, 15)            0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 375)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 376       
Total params: 1,291
Trainable params: 1,291
Non-trainable params: 0
_________________________________________________________________
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 

In [None]:
# Predict on every batch of the test generator, in generator order.
y_pred_siamese = siamese_model.predict_generator(
    generator=keras_test_batch_generator
)

In [None]:
import matplotlib.pyplot as plt
from scipy.stats import spearmanr

# Scatter of predictions against true labels; low alpha keeps dense regions
# readable.
plt.scatter(x=y_test, y=y_pred_siamese, alpha=0.1)
plt.xlabel("True Labels")
plt.ylabel("Predicted Labels")
plt.show()

# Rank correlation between true and predicted values.
rank_correlation = spearmanr(y_test, y_pred_siamese)
print(rank_correlation)