In [2]:
#uncomment the lines below if you are running this tutorial from Google Colab 
#!pip install "dragonn>=0.2.3"

In [3]:
# Pin the NumPy and TensorFlow random seeds up front so that repeated runs of
# this notebook produce reproducible results.
RANDOM_SEED = 1234

from numpy.random import seed
seed(RANDOM_SEED)
from tensorflow import set_random_seed
set_random_seed(RANDOM_SEED)

In [4]:
# Load the dragonn tutorial utilities.
# The autoreload extension (mode 2) re-imports modules before each cell is run,
# so edits to library code are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook.
%matplotlib inline
import warnings
# Silence all Python warnings to keep the tutorial output readable.
warnings.filterwarnings('ignore')


from dragonn.tutorial_utils import *

Using TensorFlow backend.
  from numpy.core.umath_tests import inner1d


## Input data <a name='1'></a>
<a href=#outline>Home</a>

Tutorials 1 - 3 used simulated data generated with the simdna package. In this tutorial, we will examine how well CNNs are able to predict transcription factor binding for four TFs in vivo. 

We will learn to predict transcription factor binding for four transcription factors in the GM12878 cell line (one of the Tier 1 cell lines for the ENCODE project). First, we download the narrowPeak bed files for each of these transcription factors. You can skip the following code block if you already have the data downloaded. 

In [None]:
## SPI1, optimal IDR thresholded peaks, Myers lab, hg19
# https://www.encodeproject.org/experiments/ENCSR000BGQ/
!wget -O SPI1.narrowPeak.gz http://mitra.stanford.edu/kundaje/projects/dragonn/dragonn_gm12878_pipeline/spi1_ENCSR000BGQ/cromwell-executions/chip/bb0c3c5a-3889-43fe-a218-05851cecc74a/call-reproducibility_idr/execution/optimal_peak.regionPeak.gz
    
## Download "ambiguous" peak sets -- these peaks are in the optimal overlap set across replicates, but are not
## found to be reproducible at a high confidence (p<0.05) by IDR 
! wget -O SPI1.ambiguous.gz http://mitra.stanford.edu/kundaje/projects/dragonn/SPI1.ambiguous.gz
    
## Download the hg19 chromsizes file (We only use chroms 1 -22, X, Y for training)
!wget https://github.com/kundajelab/dragonn/blob/keras_2.2_tensorflow_1.6_purekeras/paper_supplement/hg19.chrom.sizes
    
## Download the hg19 fasta reference genome (and corresponding .fai index)
!wget http://mitra.stanford.edu/kundaje/projects/dragonn/hg19.genome.fa.gz
!wget http://mitra.stanford.edu/kundaje/projects/dragonn/hg19.genome.fa.fai 


## Generating positive and negative bins for genome-wide training <a name='2'></a>
<a href=#outline>Home</a>

We will use the *genomewide_labels* function from the [seqdataloader](https://github.com/kundajelab/seqdataloader) package to generate positive and negative labels for the TF-ChIPseq peaks across the genome. We will treat each sample as a task for the model and compare the performance of the model on the SPI1 task in the single-tasked and multi-tasked settings.

In [5]:
from seqdataloader import * 

In [6]:
## seqdataloader accepts an input file, which we call tasks.tsv, with task names in column 1, the corresponding
## peak files in column 2, skip column 3 (which will be used for regression in Tutorial 5), and ambiguous peaks in 
## column4 
!echo "SPI1\tSPI1.narrowPeak.gz\t\tSPI1.ambiguous.gz" >> tasks.tsv 
! cat tasks.tsv

CTCF	CTCF.narrowPeak.gz		CTCF.ambiguous.gz
SPI1	SPI1.narrowPeak.gz		SPI1.ambiguous.gz
SIX5	SIX5.narrowPeak.gz		SIX5.ambiguous.gz
ZNF143	ZNF143.narrowPeak.gz		ZNF143.ambiguous.gz


With the parameter configuration below, seqdataloader splits the genome into 1kb regions, with a stride of 50. Each 1kb region is centered at a 200 bp bin, with a left flank of 400 bases and a right flank of 400 bases. 

* Each 200 bp bin is labeled as positive if a narrowPeak summit overlaps with it. 

* The bin is labeled ambiguous (label = -1) and excluded from training if there is some overlap with the narrowPeak, but the peak summit does not lie in that overlap. 

* The bin is labeled negative if there is no overlap with the narrowPeak. 

In [None]:

# Label the training chromosomes (everything except chr1, chr2 and chr19, which
# the validation and test cells keep for themselves), storing only positive bins.
# Bin geometry: 200 bp bins with 400 bp flanks on each side (1 kb regions), stride 50.
positives_train_set_params=dict(
    store_positives_only=True,
    task_list="tasks.tsv",
    outf="positives.TF.train.hdf5",
    output_type='hdf5',
    chrom_sizes='hg19.chrom.sizes',
    chroms_to_exclude=['chr1','chr2','chr19'],
    bin_stride=50,
    left_flank=400,
    right_flank=400,
    bin_size=200,
    threads=20,
    subthreads=2,
    allow_ambiguous=True,
    output_hdf5_low_mem=True,
    labeling_approach='peak_summit_in_bin_classification',
)
genomewide_labels(positives_train_set_params)

In [None]:
# Label the validation chromosome (chr1 only), storing only positive bins.
# Bin geometry matches the training set: 200 bp bins, 400 bp flanks, stride 50.
positives_valid_set_params={
    'store_positives_only':True,
    'task_list':"tasks.tsv",
    'outf':"positives.TF.valid.hdf5",
    'output_type':'hdf5',
    'chrom_sizes':'hg19.chrom.sizes',
    # Pass a list, as the other label-generation cells do for their chromosome
    # arguments; a bare string is not a list of chromosome names.
    'chroms_to_keep':['chr1'],
    'bin_stride':50,
    'left_flank':400,
    'right_flank':400,
    'bin_size':200,
    'threads':20,
    'subthreads':2,
    'allow_ambiguous':True,
    'output_hdf5_low_mem':True,
    'labeling_approach':'peak_summit_in_bin_classification'
    }
genomewide_labels(positives_valid_set_params)


In [None]:
# Label the held-out test chromosomes (chr2 and chr19), storing only positive bins.
# Bin geometry: 200 bp bins with 400 bp flanks on each side (1 kb regions), stride 50.
positives_test_set_params=dict(
    store_positives_only=True,
    task_list="tasks.tsv",
    outf="positives.TF.test.hdf5",
    output_type='hdf5',
    chrom_sizes='hg19.chrom.sizes',
    chroms_to_keep=['chr2','chr19'],
    bin_stride=50,
    left_flank=400,
    right_flank=400,
    bin_size=200,
    threads=20,
    subthreads=2,
    allow_ambiguous=True,
    output_hdf5_low_mem=True,
    labeling_approach='peak_summit_in_bin_classification',
)
genomewide_labels(positives_test_set_params)


In [8]:
# With 'store_positives_only' set to True, the code generates all bins for each task that are labeled positive.
# NOTE(review): the output appears to be written per task, prefixed with the task name
# ("SPI1." + the 'outf' value) -- confirm against seqdataloader.
# pandas is imported explicitly so this cell does not depend on `pd` leaking in through a star-import.
import pandas as pd
# Preview the first 10 rows of the SPI1 training positives.
pd.read_hdf("SPI1.positives.TF.train.hdf5",start=0,stop=10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SPI1
CHR,START,END,Unnamed: 3_level_1
chr3,129600,130600,1.0
chr3,129650,130650,1.0
chr3,129700,130700,1.0
chr3,129750,130750,1.0
chr3,260300,261300,1.0
chr3,260350,261350,1.0
chr3,260400,261400,1.0
chr3,260450,261450,1.0
chr3,319100,320100,1.0
chr3,319150,320150,1.0


In [10]:
from dragonn.generators import * 

In [24]:
#To prepare for model training, we import the necessary functions and submodules from keras
from keras.models import Sequential
from keras.layers.core import Dropout, Reshape, Dense, Activation, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.optimizers import Adadelta, SGD, RMSprop;
import keras.losses;
from keras.constraints import maxnorm;
from keras.layers.normalization import BatchNormalization
from keras.regularizers import l1, l2
from keras.callbacks import EarlyStopping, History, TensorBoard 
from keras import backend as K 
K.set_image_data_format('channels_last')

#we use a custom binary cross-entropy loss that can handle ambiguous labels (denoted with -1 ) and exclude them 
# from the loss calculation 
from dragonn.custom_losses import get_ambig_binary_crossentropy

In [12]:
from concise.metrics import tpr, tnr, fpr, fnr, precision, f1
from keras.constraints import max_norm

def initialize_model(ntasks=1):
    """Build and compile the convolutional classifier used in this tutorial.

    The network stacks three Conv2D -> BatchNormalization -> ReLU stages,
    max-pools along the sequence axis, flattens, and passes the result through
    one hidden dense layer (batch-normalized, ReLU, dropout 0.2) into a
    sigmoid output layer.

    Args:
        ntasks: number of units in the final sigmoid layer (one per task).

    Returns:
        A compiled keras ``Sequential`` model taking inputs of shape
        (1, 1000, 4), trained with the Adam optimizer and the custom binary
        cross-entropy loss that ignores labels equal to -1.
    """
    network = Sequential()

    # Only the first convolution declares the input shape and carries the
    # max-norm kernel constraint; the remaining stages differ in kernel width only.
    # NOTE(review): the Keras docs suggest axis=[0, 1, 2] to constrain each
    # Conv2D filter; axis=-1 is kept exactly as in the original -- confirm intent.
    conv_stages = (
        {"kernel_size": (1, 15),
         "kernel_constraint": max_norm(7.0, axis=-1),
         "input_shape": (1, 1000, 4)},
        {"kernel_size": (1, 15)},
        {"kernel_size": (1, 13)},
    )
    for stage_kwargs in conv_stages:
        network.add(Conv2D(filters=50, padding="same", **stage_kwargs))
        network.add(BatchNormalization(axis=-1))
        network.add(Activation('relu'))

    # Pool along the sequence axis, then flatten for the dense head.
    network.add(MaxPooling2D(pool_size=(1, 40)))
    network.add(Flatten())

    # Hidden dense layer with batch normalization, ReLU and dropout.
    network.add(Dense(50))
    network.add(BatchNormalization(axis=-1))
    network.add(Activation('relu'))
    network.add(Dropout(0.2))

    # One sigmoid output per task.
    network.add(Dense(ntasks))
    network.add(Activation("sigmoid"))

    # Custom binary cross-entropy: a label of -1 marks an ambiguous example.
    ambig_loss = get_ambig_binary_crossentropy(-1)
    monitored_metrics = [tpr, tnr, fpr, fnr, precision, f1]

    # Compile with the Adam optimizer and the ambiguity-aware loss.
    network.compile(optimizer='adam', loss=ambig_loss, metrics=monitored_metrics)
    return network

In [19]:
#create the generators
from dragonn.generators import * 

# All four generators share the same reference genome and sampling settings;
# only the per-task positives file differs.
reference_fasta="hg19.genome.fa.gz"
generator_settings=dict(shuffled_ref_negatives=True,upsample=False,batch_size=256)

case1_spi1_train_gen=DataGenerator("SPI1.positives.TF.train.hdf5",reference_fasta,**generator_settings)
case1_spi1_valid_gen=DataGenerator("SPI1.positives.TF.valid.hdf5",reference_fasta,**generator_settings)
case1_ctcf_train_gen=DataGenerator("CTCF.positives.TF.train.hdf5",reference_fasta,**generator_settings)
case1_ctcf_valid_gen=DataGenerator("CTCF.positives.TF.valid.hdf5",reference_fasta,**generator_settings)

We now follow the standard protocol we used in tutorials 1 - 3 to train a keras model, with the exception that we use the fit_generator function in keras, rather than the fit function.

In [25]:
# Early stopping with a patience of 3 epochs, restoring the best weights seen,
# plus a History callback.
early_stopping=EarlyStopping(patience=3,restore_best_weights=True)
callbacks=[early_stopping,History()]

In [26]:
#If you are running this notebook in google colab, uncomment the lines below to observe the model's training 
!mkdir logs
%tensorboard --logdir logs 
tensorboard_visualizer=TensorBoard(log_dir="logs", histogram_freq=0, batch_size=500, write_graph=True, write_grads=False, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None)
callbacks.append(tensorboard_visualizer)

In [27]:
#Train the SPI1 model 
case1_spi1_model=initialize_model()

## Train with the keras fit_generator function for up to 150 epochs; the EarlyStopping
## callback in `callbacks` (patience of 3 epochs) can end training sooner.
spi1_fit_options=dict(
    validation_data=case1_spi1_valid_gen,
    epochs=150,
    verbose=1,
    use_multiprocessing=True,
    workers=40,
    max_queue_size=100,
    callbacks=callbacks,
)
history_case1_spi1=case1_spi1_model.fit_generator(case1_spi1_train_gen,**spi1_fit_options)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150


In [None]:
## Plot the learning curves for SPI1, using the value returned by fit_generator above
from dragonn.tutorial_utils import plot_learning_curve
plot_learning_curve(history_case1_spi1)