# Gan Baseline

#### NOTE: Need to activate genomelake environment before this code. Simply type 'genomelake' in terminal.

In [1]:
%env CUDA_VISIBLE_DEVICES=1,2
import os, sys
sys.path.append("..")
import random
# custom file path package
from data import Data_Directories
# custom utility package
from utils.compute_util import *
# package for genomic data
from pybedtools import Interval, BedTool
from genomelake.extractors import ArrayExtractor, BigwigExtractor
# package for plotting
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats.stats import pearsonr,spearmanr
import tensorflow as tf

env: CUDA_VISIBLE_DEVICES=1,2


Using TensorFlow backend.


In [2]:
window_size = 2001
process_all = False
sample_num = 1000

In [3]:
# retrieve data
data = Data_Directories()
print data.intervals.keys()
print data.input_atac['day0'].keys()
print data.output_histone['day0'].keys()

['day6', 'day3', 'day0']
['100', '140']
['H3K27me3', 'H3K4me1', 'H3K27ac']


In [4]:
# get intervals for day0 data
day0_intervals = list(BedTool(data.intervals['day0']))
print '# of Intervals Extracted for day0: {}'.format(len(day0_intervals))

# of Intervals Extracted for day0: 267226


In [5]:
# create an ArrayExtractor for ATAC-seq for day0 with 140 base pairs
bw_140bp_day0 = ArrayExtractor(data.input_atac['day0']['140'])
print 'Finished extracting bigwig for day0, 140bp'

Finished extracting bigwig for day0, 140bp


In [6]:
# create a BigWigExtractor for histone makr 'H3K27ac' for day0
bw_histone_mark_day0 = BigwigExtractor(data.output_histone['day0']['H3K27ac'])
print 'Finished extracting bigwig for day0, 140bp'

Finished extracting bigwig for day0, 140bp


In [7]:
# normalize day0 intervals
normalized_day0_intervals = [normalize_interval(interval, window_size) for interval in day0_intervals if normalize_interval(interval, window_size)]
print 'Finished normalizing day0 intervals!'

Finished normalizing day0 intervals!


In [8]:
assert (len(day0_intervals)==len(normalized_day0_intervals))
print "Examples of original intervals"
print [(int(_interval.start)+int(_interval[-1]), [int(_interval.start), int(_interval.end)])
       for _interval in day0_intervals[:3]]
print "Examples of normalized intervals with window size of {}".format(window_size)
print [([int(_interval.start), int(_interval.end)])
       for _interval in  normalized_day0_intervals[:3]]

Examples of original intervals
[(123412027, [123411855, 123412989]), (123411941, [123411855, 123412989]), (131908564, [131908487, 131910071])]
Examples of normalized intervals with window size of 2001
[[123411027, 123413028], [123410941, 123412942], [131907564, 131909565]]


In [9]:
atac_seq_day0 = bw_140bp_day0(normalized_day0_intervals)
print atac_seq_day0.shape

(267226, 2001, 5)


In [10]:
#TODO: put this into utils if possible
def prune_invalid_intervals(intervals, bigwig_file):
    for _interval in intervals[:]:
        try:
            bigwig_file([_interval])
        except:
            intervals.remove(_interval)
            pass
        
print "Before pruning day0: {}".format(len(normalized_day0_intervals))
prune_invalid_intervals(normalized_day0_intervals, bw_140bp_day0)
print "After pruning day0: {}".format(len(normalized_day0_intervals))

Before pruning day0: 267226
After pruning day0: 267226


In [11]:
print "Dimension of ATAC-seq signal: {}".format(bw_140bp_day0(normalized_day0_intervals[:1]).shape)

Dimension of ATAC-seq signal: (1, 2001, 5)


In [12]:
print "Dimension of histone mark signal: {}".format(bw_histone_mark_day0(normalized_day0_intervals[:1]).shape)

Dimension of histone mark signal: (1, 2001)


In [13]:
# replace nan values with zeros and convert it to p-values
histone_mark_day0 = np.nan_to_num(bw_histone_mark_day0(normalized_day0_intervals))
print histone_mark_day0.shape

(267226, 2001)


In [14]:
histone_mark_day0 = np.expand_dims(histone_mark_day0, axis=2)
print histone_mark_day0.shape

(267226, 2001, 1)


In [15]:
print "Example histone mark signal"
print "\tRaw value: {}".format(bw_histone_mark_day0(normalized_day0_intervals[:1])[0][:5].reshape(-1))

Example histone mark signal
	Raw value: [ 0.01014  0.01014  0.01014  0.02435  0.02435]


In [16]:
from keras.layers import AveragePooling1D, Input, Dense, Conv1D, Dropout, BatchNormalization, Activation, ZeroPadding1D, Reshape, Flatten
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.convolutional import UpSampling2D, Conv2D
from keras.models import Sequential, Model
from keras import optimizers
from keras import metrics
from keras import losses
from keras import backend as K
from keras.callbacks import Callback, TensorBoard, ReduceLROnPlateau, ModelCheckpoint
from keras.optimizers import Adam, SGD

In [17]:
smooth_rate = 0.05
dropout_rate = 0.5
# parameters for first conv layer
hidden_filters_1 = 32
hidden_kernel_size_1 = window_size
# parameters for second conv layer
output_filters = 1
output_kernel_size = 16
# parameters for training
batch_size = 128
num_epochs = 200
evaluation_freq = 10

In [18]:
# Helper functions for writing the scores into bigwig file
from itertools import izip
from itertools import groupby
import subprocess

def interval_key(interval):
    return (interval.chrom, interval.start, interval.stop)

def merged_scores(scores, intervals, merge_type):
    # A generator that returns merged intervals/scores
    # Scores should have shape: #examples x #categories x #interval_size
    # Second dimension can be omitted for a 1D signal
    signal_dims = scores.ndim - 1
    assert signal_dims in {1, 2}

    # Only support max for now
    assert merge_type == 'max'
    score_first_dim = 1 if signal_dims == 1 else scores.shape[1]

    dtype = scores.dtype

    sort_idx, sorted_intervals = \
        zip(*sorted(enumerate(intervals),
                    key=lambda item: interval_key(item[1])))
    sorted_intervals = BedTool(sorted_intervals)

    # Require at least 1bp overlap
    # Explicitly convert to list otherwise it will keep opening a file when
    # retrieving an index resulting in an error (too many open files)
    interval_clust = list(sorted_intervals.cluster(d=-1))
    for _, group in groupby(izip(sort_idx, interval_clust),
                            key=lambda item: item[1].fields[-1]):
        idx_interval_pairs = list(group)
        group_idx, group_intervals = zip(*idx_interval_pairs)

        if len(idx_interval_pairs) == 1:
            yield group_intervals[0], scores[group_idx[0], ...]
        else:
            group_chrom = group_intervals[0].chrom
            group_start = min(interval.start for interval in group_intervals)
            group_stop = max(interval.stop for interval in group_intervals)

            # This part needs to change to support more merge_types (e.g. mean)
            group_score = np.full((score_first_dim, group_stop - group_start),
                                  -np.inf, dtype)
            for idx, interval in idx_interval_pairs:
                slice_start = interval.start - group_start
                slice_stop = slice_start + (interval.stop - interval.start)
                group_score[..., slice_start:slice_stop] = \
                    np.maximum(group_score[..., slice_start:slice_stop],
                               scores[idx, ...])
            if signal_dims == 1:
                group_score = group_score.squeeze(axis=0)
            yield Interval(group_chrom, group_start, group_stop), group_score
            
def interval_score_pairs(intervals, scores, merge_type):
    return (izip(intervals, scores) if merge_type is None
            else merged_scores(scores, intervals, merge_type))

def _write_1D_deeplift_track(scores, intervals, file_prefix, merge_type='max',
                             CHROM_SIZES='/mnt/data/annotations/by_release/hg19.GRCh37/hg19.chrom.sizes'):
    assert scores.ndim == 2

    bedgraph = file_prefix + '.bedGraph'
    bigwig = file_prefix + '.bw'

    print 'Writing 1D track of shape: {}'.format(scores.shape)
    print 'Writing to file: {}'.format(bigwig)

    with open(bedgraph, 'w') as fp:
        for interval, score in interval_score_pairs(intervals, scores,
                                                    merge_type):
            chrom = interval.chrom
            start = interval.start
            for score_idx, val in enumerate(score):
                fp.write('%s\t%d\t%d\t%g\n' % (chrom,
                                               start + score_idx,
                                               start + score_idx + 1,
                                               val))
    print 'Wrote bedgraph.'

    try:
        output = subprocess.check_output(
            ['wigToBigWig', bedgraph, CHROM_SIZES, bigwig],
            stderr=subprocess.STDOUT)
        print 'wigToBigWig output: {}'.format(output)
    except subprocess.CalledProcessError as e:
        print 'wigToBigWig terminated with exit code {}'.format(
            e.returncode)
        print 'output was:\n' + e.output

    print 'Wrote bigwig.'

In [19]:
model_dir = os.path.join("models", "gan_fixed_smooth_inter")
log_dir = os.path.join("logs", "gan_fixed_smooth_inter")
srv_dir = os.path.join("/srv", "www", "kundaje", "jesikmin", "gan_fixed_smooth_inter")
#srv_dir = os.path.join("/users", "jesikmin", "gan_fixed_smooth_2")
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
if not os.path.exists(srv_dir):
    os.makedirs(srv_dir)

In [20]:
class GAN():
    def __init__(self):
        
        self.window_size = window_size
        self.channels = 5
        self.input_shape = (self.window_size, self.channels,)
        self.output_shape = (self.window_size, 1,)

        optimizer = Adam(0.002, beta_1=0.5)
        doptimizer = Adam(0.001, beta_1=0.5)
        #doptimizer = SGD(0.001)

        # Build and compile the discriminator
        self.discriminator = self.build_discriminator()
        self.discriminator.compile(loss='binary_crossentropy', 
                                   optimizer=doptimizer,
                                   metrics=['accuracy'])

        # Build and compile the generator
        self.generator = self.build_generator()
        self.generator.compile(loss='binary_crossentropy',
                               optimizer=optimizer)

        # The generator takes noise as input and generated imgs
        z = Input(shape=self.input_shape)
        img = self.generator(z)

        # For the combined model we will only train the generator
        self.discriminator.trainable = False

        # The valid takes generated images as input and determines validity
        valid = self.discriminator(img)

        # The combined model  (stacked generator and discriminator) takes
        # noise as input => generates images => determines validity 
        self.combined = Model(z, valid)
        self.combined.compile(loss='binary_crossentropy',
                              optimizer=optimizer)
        print "Combined Model"
        print self.combined.summary()

    def build_generator(self):

        noise_shape = self.input_shape
        
        model = Sequential()

        model.add(Conv1D(hidden_filters_1,
                         hidden_kernel_size_1,
                         padding="same",
                         strides=1,
                         input_shape=noise_shape,
                         activation='relu'))
        model.add(Dropout(dropout_rate))

        model.add(Conv1D(output_filters,
                         output_kernel_size,
                         padding='same',
                         strides=1))
        model.add(Activation('linear'))
        
        print "Generator"
        model.summary()

        noise = Input(shape=noise_shape)
        img = model(noise)

        return Model(noise, img)

    def build_discriminator(self):
        model = Sequential()
        
        model.add(Conv1D(hidden_filters_1,
                         200,
                         padding="valid",
                         strides=1,
                         input_shape=self.output_shape))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dropout(dropout_rate))
        model.add(BatchNormalization(momentum=0.8))
        
        model.add(AveragePooling1D(25))
        model.add(Flatten())
        model.add(Dense(int(window_size/16)))
        model.add(LeakyReLU(alpha=0.2)) 
        
        model.add(Dense(1, activation='sigmoid'))
        
        print "Discriminator"
        model.summary()

        img = Input(shape=self.output_shape)
        validity = model(img)

        return Model(img, validity)

    def train(self, epochs, batch_size, X_train, y_train):
        
        max_pearson = -1.0

        half_batch = int(batch_size / 2)
            
        d_loss_real, d_loss_fake, g_loss = [1, 0], [1, 0], [1, 0]
            
        for epoch in range(epochs):
            
            # list for storing losses/accuracies for both discriminator and generator
            d_losses, d_accuracies, g_losses = [], [], []
            
            # sufficient number of minibatches for each epoch
            for _minibatch_idx in range(128):

                # ---------------------
                #  Train Discriminator
                # ---------------------

                # Select a random half batch of images
                dis_idx = np.random.randint(0, y_train.shape[0], half_batch)
                imgs = y_train[dis_idx]
                dis_noise = X_train[dis_idx]

                # Generate a half batch of new images
                gen_imgs = self.generator.predict(dis_noise)
                
                # Train the discriminator with label smoothing
                #smoothed_idx = np.random.choice(half_batch, int(half_batch*smooth_rate), replace=False)
                smoothed_labels = np.ones((half_batch, 1))
                #smoothed_labels[smoothed_idx] = 0
                
                # Train the discriminator
                all_imgs = np.concatenate([imgs, gen_imgs])
                all_labels = np.concatenate([smoothed_labels, np.zeros((half_batch, 1))])
                if _minibatch_idx % 32 == 0:
                    d_loss = self.discriminator.train_on_batch(all_imgs, all_labels)                     
                    d_losses.append(d_loss[0])
                    d_accuracies.append(d_loss[1])

                
#                 if epoch % 5 == 0:
#                     d_loss = self.discriminator.train_on_batch(all_imgs, all_labels)
#                 else:
#                     d_loss = (-1, -1)
                
#                 d_loss_real = self.discriminator.train_on_batch(imgs, smoothed_labels)
#                 d_loss_fake = self.discriminator.train_on_batch(gen_imgs, np.zeros((half_batch, 1)))
                    
                # take the average of each loss and accuracy
                #d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
                
                # ---------------------
                #  Train Generator
                # ---------------------
                
                gen_idx = np.random.randint(0, y_train.shape[0], batch_size)
                gen_noise = X_train[gen_idx]
                #noise = np.random.normal(0, 1, (batch_size, 100))

                # The generator wants the discriminator to label the generated samples
                # as valid (ones)
                valid_y = np.array([1] * batch_size)

                # Train the generator
                g_loss = self.combined.train_on_batch(gen_noise, valid_y)
                
                g_losses.append(g_loss)
                
            # convert each histories into numpy arrays to get means
            d_losses = np.array(d_losses)
            d_accuracies = np.array(d_accuracies)
            g_losses = np.array(g_losses)
            
            predictions = self.generator.predict(X_train).flatten()
            avg_pearson = pearsonr(predictions, y_train.flatten())
            print "Pearson R on Train set: {}".format(avg_pearson)
            
            val_predictions = self.generator.predict(X_val).flatten()
            avg_val_pearson = pearsonr(val_predictions, y_val.flatten())
            print "Pearson R on Val set: {}".format(avg_val_pearson)
            
#             predictions = self.generator.predict(X_train)
#             pearsons = []
#             for pred_idx in range(len(predictions)):
#                 prediction = predictions[pred_idx]
#                 pearsons.append(pearsonr(prediction, y_train[pred_idx]))
#             avg_pearson = np.array(pearsons).mean()
#             print "Pearson R on Train set: {}".format(avg_pearson)
            
#             val_predictions = self.generator.predict(X_val)
#             val_pearsons = []
#             for val_pred_idx in range(len(val_predictions)):
#                 prediction = val_predictions[val_pred_idx]
#                 val_pearsons.append(pearsonr(prediction, y_val[val_pred_idx]))
#             avg_val_pearson = np.array(val_pearsons).mean()
#             print "Pearson R on Val set: {}".format(avg_val_pearson)  
            
            if max_pearson < avg_val_pearson:
                print "Perason on val improved from {} to {}".format(max_pearson, avg_val_pearson)
                _write_1D_deeplift_track(predictions.reshape(7500, 2001),
                                         normalized_day0_intervals[:7500], os.path.join(srv_dir, 'train'))
                _write_1D_deeplift_track(val_predictions.reshape(2500, 2001),
                                         normalized_day0_intervals[7500:10000], os.path.join(srv_dir, 'val'))
                f = open(os.path.join(srv_dir, 'meta.txt'), 'wb')
                f.write(str(epoch) + " " + str(avg_pearson) + "  " + str(avg_val_pearson))
                f.close()
                max_pearson = avg_val_pearson
            
            # Plot the progress
            print ("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_losses.mean(), 100.0*d_accuracies.mean(), g_losses.mean()))

In [21]:
def pearson(y_true, y_pred):
    x = y_true
    y = y_pred
    mx = K.mean(x)
    my = K.mean(y)
    xm, ym = x-mx, y-my
    r_num = K.sum(np.multiply(xm,ym))
    r_den = K.sqrt(np.multiply(K.sum(K.square(xm)), K.sum(K.square(ym))))
    r = r_num / r_den
    r = K.maximum(K.minimum(r, 1.0), -1.0)
    return K.square(r)

In [22]:
print "Fitting the model..."
X_train, y_train = atac_seq_day0[:7500], histone_mark_day0[:7500]
X_val, y_val = atac_seq_day0[7500:10000], histone_mark_day0[7500:10000]
X_test, y_test = atac_seq_day0[10000:11000], histone_mark_day0[10000:11000]

gan = GAN()
gan.train(num_epochs, batch_size, X_train, y_train)

Fitting the model...
Discriminator
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 1802, 32)          6432      
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 1802, 32)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1802, 32)          0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 1802, 32)          128       
_________________________________________________________________
average_pooling1d_1 (Average (None, 72, 32)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 2304)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 125) 

  'Discrepancy between trainable weights and collected trainable'


Pearson R on Train set: (0.21808229, 0.0)
Pearson R on Val set: (0.24442413, 0.0)
Perason on val improved from -1.0 to (0.24442413, 0.0)
Writing 1D track of shape: (7500, 2001)
Writing to file: /srv/www/kundaje/jesikmin/gan_fixed_smooth_inter/train.bw
Wrote bedgraph.
wigToBigWig output: 
Wrote bigwig.
Writing 1D track of shape: (2500, 2001)
Writing to file: /srv/www/kundaje/jesikmin/gan_fixed_smooth_inter/val.bw
Wrote bedgraph.
wigToBigWig output: 
Wrote bigwig.
0 [D loss: 0.521966, acc.: 68.95%] [G loss: 0.498337]
Pearson R on Train set: (0.22742966, 0.0)
Pearson R on Val set: (0.27246478, 0.0)
Perason on val improved from (0.24442413, 0.0) to (0.27246478, 0.0)
Writing 1D track of shape: (7500, 2001)
Writing to file: /srv/www/kundaje/jesikmin/gan_fixed_smooth_inter/train.bw
Wrote bedgraph.
wigToBigWig output: 
Wrote bigwig.
Writing 1D track of shape: (2500, 2001)
Writing to file: /srv/www/kundaje/jesikmin/gan_fixed_smooth_inter/val.bw
Wrote bedgraph.
wigToBigWig output: 
Wrote bigwig.

Pearson R on Train set: (-0.010231514, 0.0)
Pearson R on Val set: (-0.0072526131, 3.5483556304150269e-59)
51 [D loss: 0.132095, acc.: 97.46%] [G loss: 0.207261]
Pearson R on Train set: (-0.010145585, 0.0)
Pearson R on Val set: (-0.00705984, 3.6215430006797657e-56)
52 [D loss: 0.138998, acc.: 99.80%] [G loss: 0.176799]
Pearson R on Train set: (0.0065967212, 4.7510037141112479e-144)
Pearson R on Val set: (0.0067838477, 5.3293033999456077e-52)
53 [D loss: 0.279361, acc.: 92.19%] [G loss: 0.167311]
Pearson R on Train set: (-0.0091221659, 1.4504641566450323e-273)
Pearson R on Val set: (-0.0061701122, 2.5355877145185826e-43)
54 [D loss: 0.306994, acc.: 83.01%] [G loss: 0.189405]
Pearson R on Train set: (-0.0040875236, 1.7865013874545097e-56)
Pearson R on Val set: (-0.0022498854, 4.8498609337305816e-07)
55 [D loss: 0.284725, acc.: 84.57%] [G loss: 0.253865]
Pearson R on Train set: (0.00043860308, 0.089295261197624876)
Pearson R on Val set: (0.00034221285, 0.44403202024605648)
56 [D loss: 0.83

Pearson R on Val set: (-0.021729708, 0.0)
100 [D loss: 0.280750, acc.: 89.06%] [G loss: 0.021816]
Pearson R on Train set: (-0.018187629, 0.0)
Pearson R on Val set: (-0.017417163, 0.0)
101 [D loss: 0.329854, acc.: 91.21%] [G loss: 0.022455]
Pearson R on Train set: (-0.017086612, 0.0)
Pearson R on Val set: (-0.01663452, 5.1079063985345688e-303)
102 [D loss: 0.310889, acc.: 87.70%] [G loss: 0.021550]
Pearson R on Train set: (-0.015686397, 0.0)
Pearson R on Val set: (-0.01527005, 1.1122715378722694e-255)
103 [D loss: 0.309936, acc.: 81.64%] [G loss: 0.019271]
Pearson R on Train set: (-0.023582201, 0.0)
Pearson R on Val set: (-0.022618582, 0.0)
104 [D loss: 0.511451, acc.: 77.54%] [G loss: 0.020037]
Pearson R on Train set: (-0.017016815, 0.0)
Pearson R on Val set: (-0.01622287, 2.6050145035380216e-288)
105 [D loss: 0.473507, acc.: 76.76%] [G loss: 0.019843]
Pearson R on Train set: (-0.018535525, 0.0)
Pearson R on Val set: (-0.017758345, 0.0)
106 [D loss: 0.312426, acc.: 88.67%] [G loss: 0.0

Pearson R on Train set: (-0.0026421566, 1.3733652973956951e-24)
Pearson R on Val set: (-0.0042502969, 1.9743385899368568e-21)
151 [D loss: 0.342247, acc.: 80.86%] [G loss: 0.010188]
Pearson R on Train set: (-0.002497324, 3.8687893560164975e-22)
Pearson R on Val set: (-0.0040969132, 5.0332908202846075e-20)
152 [D loss: 0.313559, acc.: 79.88%] [G loss: 0.010722]
Pearson R on Train set: (-0.0023105687, 3.5214166740941946e-19)
Pearson R on Val set: (-0.003884983, 3.6471725806653336e-18)
153 [D loss: 0.189110, acc.: 92.58%] [G loss: 0.009913]
Pearson R on Train set: (-0.0020029086, 8.5481664306054942e-15)
Pearson R on Val set: (-0.0035687785, 1.4392324970415583e-15)
154 [D loss: 0.272010, acc.: 85.94%] [G loss: 0.009509]
Pearson R on Train set: (-0.0018621478, 5.4381300155507138e-13)
Pearson R on Val set: (-0.0033708219, 4.7253474543785242e-14)
155 [D loss: 0.287797, acc.: 79.69%] [G loss: 0.009830]
Pearson R on Train set: (-0.0017647959, 8.1016559903375798e-12)
Pearson R on Val set: (-0.00

Pearson R on Val set: (-0.0032405765, 4.230398562091208e-13)
196 [D loss: 0.401241, acc.: 85.55%] [G loss: 0.014472]
Pearson R on Train set: (-0.0010070893, 9.5631787806564619e-05)
Pearson R on Val set: (-0.0032629427, 2.9207012468608062e-13)
197 [D loss: 0.421456, acc.: 84.38%] [G loss: 0.014170]
Pearson R on Train set: (-0.00096824579, 0.00017617442675321358)
Pearson R on Val set: (-0.003220258, 5.9104249722055824e-13)
198 [D loss: 0.386277, acc.: 85.55%] [G loss: 0.013624]
Pearson R on Train set: (-0.00098533649, 0.0001350040949871981)
Pearson R on Val set: (-0.003213177, 6.6378522794427226e-13)
199 [D loss: 0.385177, acc.: 86.52%] [G loss: 0.013391]
