# DeepBind Analysis

## Imports

In [1]:
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Input, Maximum, Dense, Dropout, Flatten
from keras.layers import Conv1D, MaxPooling1D
from keras.layers import Conv2D, MaxPooling2D
from keras.models import Model

from keras import backend as K

Using TensorFlow backend.


In [38]:
import os
import numpy as np
import Bio
from Bio import SeqIO
import seaborn as sns
import pandas as pd
%matplotlib inline

from sklearn import model_selection

In [18]:
# Make the analysis directory the working directory so that the relative
# paths used by the cells below resolve inside it.
analysis_dir = '/home/jtao/analysis/deepbind_analysis'
if not os.path.isdir(analysis_dir):
    # makedirs (unlike mkdir) also creates any missing parent directories.
    os.makedirs(analysis_dir)
os.chdir(analysis_dir)

In [2]:
# Training configuration.
batch_size = 128
# Binary task: bound peak (1) vs. background (0). The label and model cells
# below also set num_classes to 2.
num_classes = 2
epochs = 12

## Copy Peak File, Extract FASTA Sequences, and Generate Background

In [9]:
# Copy the ATF3 (vehicle) IDR peak table into the current working directory.
! cp /home/jtao/analysis/ap1_fdr_analysis/idr_peak_files/c57bl6_atf3_veh_idr_peaks.tsv ./

In [129]:
# Re-centre every peak on its midpoint, give it a fixed width of
# target_peak_size, and write the resized table to a new TSV file.
target_peak_size = 200
half_width = target_peak_size // 2

current_peak_frame = pd.read_csv('./c57bl6_atf3_veh_idr_peaks.tsv', sep='\t')

# Midpoint of each peak; a fractional midpoint is rounded up to the next integer.
peakCenters = current_peak_frame['start'].add(current_peak_frame['end']).div(2)
int_peakCenters = np.ceil(peakCenters).astype(int)

new_starts = int_peakCenters.sub(half_width)
new_ends = int_peakCenters.add(half_width)
current_peak_frame = current_peak_frame.assign(start=new_starts, end=new_ends)

current_peak_frame.to_csv('./c57bl6_atf3_veh_resized_peaks.tsv', sep='\t', index=False)

In [130]:
# Convert the resized peak table to BED with pos2bed.pl.
# NOTE: the output file is named *_idr_peaks.bed but is built from the resized peaks.
! pos2bed.pl ./c57bl6_atf3_veh_resized_peaks.tsv > ./c57bl6_atf3_veh_idr_peaks.bed


	Converted 23160 peaks total



In [548]:
# Run extract_sequences.py on the peak BED file with genome mm10, writing the
# result to a FASTA file that the data-loading cells below read.
! /home/jtao/code/tba/model_training/extract_sequences.py ./c57bl6_atf3_veh_idr_peaks.bed mm10 ./c57bl6_atf3_veh_idr_peaks.fasta

/home/jtao/code/tba/model_training
reading genome mm10


In [552]:
# Run generate_background_coordinates.py on the peak BED file, with ./ as the
# second argument. Presumably this is what produces the ./background.fasta read
# further down -- TODO confirm against the script.
# The trailing "#-filterChromosomes chrY" is commented out on the shell command line.
! /home/jtao/code/tba/model_training/generate_background_coordinates.py ./c57bl6_atf3_veh_idr_peaks.bed ./ #-filterChromosomes chrY

filtering out: chrM chrY
reading genome mm10
done reading genome
0 0
target GC: 0.3800343317878802 background GC: 0.3776312864446259 target length: 201 numTargetPositions 2316 backgroundPositions 2316
0 0
target GC: 0.4188299824646237 background GC: 0.41583303408052125 target length: 201 numTargetPositions 2316 backgroundPositions 2316
0 0
target GC: 0.4404232205932298 background GC: 0.43940559859085326 target length: 201 numTargetPositions 2316 backgroundPositions 2316
0 0
target GC: 0.4578405042088684 background GC: 0.45426084725093463 target length: 201 numTargetPositions 2315 backgroundPositions 2315
0 0
target GC: 0.4735631833138448 background GC: 0.4691829010113831 target length: 201 numTargetPositions 2315 backgroundPositions 2315
0 0
target GC: 0.4895179632038591 background GC: 0.4831212710902887 target length: 201 numTargetPositions 2315 backgroundPositions 2315
0 0
target GC: 0.5072371388764302 background GC: 0.5015311250346425 target length: 201 numTargetPositions 2315 backg

## Sequential API Implementation

### Set up Training and Test Data

In [422]:
def convert_sequences_to_array(sequences):
    '''
    One-hot encode a collection of nucleotide sequences.

    inputs: an iterable of sequences, each a string composed of
            A, C, G, T or N; upper- and lower-case letters are
            treated identically
    outputs: a list with one numpy array per input sequence, each of
             shape (len(sequence), 4), where every row encodes one
             nucleotide:
             A = [1, 0, 0, 0]
             C = [0, 1, 0, 0]
             G = [0, 0, 1, 0]
             T = [0, 0, 0, 1]
             N = [0.25, 0.25, 0.25, 0.25]
    raises: KeyError if a sequence contains any other character
    '''

    nucleotide_array_dict = {'A': [1, 0, 0, 0],
                             'C': [0, 1, 0, 0],
                             'G': [0, 0, 1, 0],
                             'T': [0, 0, 0, 1],
                             'N': [0.25, 0.25, 0.25, 0.25]}

    # Accept lower-case bases too (e.g. soft-masked FASTA) by registering
    # each code under both cases.
    lookup = dict(nucleotide_array_dict)
    lookup.update({nuc.lower(): row for nuc, row in nucleotide_array_dict.items()})

    sequence_array_list = []
    for seq in sequences:
        try:
            rows = [lookup[nuc] for nuc in seq]
        except KeyError as err:
            # Same exception type as a plain dict miss, with a clearer message.
            raise KeyError('unsupported nucleotide {!r}; expected one of A, C, G, T, N'
                           .format(err.args[0]))
        # reshape keeps an empty sequence as shape (0, 4) rather than (0,).
        sequence_array_list.append(np.array(rows).reshape(-1, 4))
    return sequence_array_list

In [434]:
# Read the peak (positive) and background (negative) FASTA records.
sequences = []
positive_seqRecords = list(SeqIO.parse('./c57bl6_atf3_veh_idr_peaks.fasta', 'fasta'))
negative_seqRecords = list(SeqIO.parse('./background.fasta', 'fasta'))

# Positives first, then negatives; each sequence string is cut to its first 200 characters.
fasta_seq = [str(record.seq)[:200] for record in positive_seqRecords + negative_seqRecords]

In [435]:

# Class labels in the same order as fasta_seq: 1 for each peak record, 0 for each background record.
labels = np.array([1] * len(positive_seqRecords) + [0] * len(negative_seqRecords))

In [436]:
# One-hot encode every sequence and stack the per-sequence arrays into one array.
sequence_arrays = np.array(convert_sequences_to_array(fasta_seq))

In [437]:
# Hold out 20% of the examples for testing. A fixed random_state makes the
# split itself reproducible across kernel restarts.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    sequence_arrays, labels, test_size=0.2, random_state=0)

In [438]:
num_classes = 2
# One-hot encode the 0/1 labels for the categorical-crossentropy loss.
# Only encode while the labels are still a 1-D class vector, so re-running
# this cell does not encode the already one-hot labels a second time.
if y_train.ndim == 1:
    y_train = keras.utils.to_categorical(y_train, num_classes)
if y_test.ndim == 1:
    y_test = keras.utils.to_categorical(y_test, num_classes)

### Define Model

In [469]:
num_classes = 2
seq_length = 200   # positions per input sequence
kernel_size = 24   # width of each convolution filter

# A 'valid' convolution yields seq_length - kernel_size + 1 = 177 positions.
# The pooling window must span all of them to give one max score per filter
# for the whole sequence; a window of 176 would leave the last position out.
num_positions = seq_length - kernel_size + 1

model = Sequential()
model.add(Conv1D(filters=16,
                 kernel_size=kernel_size,
                 activation='relu',
                 input_shape=(seq_length, 4)))
model.add(MaxPooling1D(pool_size=num_positions))
model.add(Dense(32, activation='relu'))

model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

### Train Model

In [470]:
# Fit the model, using the held-out split as validation data, then report
# the final loss and accuracy on that same split.
fit_options = dict(batch_size=128,
                   epochs=10,
                   verbose=1,
                   validation_data=(x_test, y_test))
model.fit(x_train, y_train, **fit_options)

score = model.evaluate(x_test, y_test, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

Train on 37044 samples, validate on 9262 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.34897923555456006
Test accuracy: 0.8500323904253088


## Functional API Implementation

In [592]:
num_classes = 2
seq_length = 200   # positions per input sequence
kernel_size = 24   # width of each convolution filter

# A 'valid' convolution yields seq_length - kernel_size + 1 = 177 positions.
# The pooling window must span all of them to give one max score per filter
# for the whole sequence; a window of 176 would leave the last position out.
num_positions = seq_length - kernel_size + 1

input_fwd = Input(shape=(seq_length, 4), name='input_fwd')

# No input_shape is needed here: the layer takes its input shape from the
# tensor it is called on.
shared_motif_convolution = Conv1D(filters=16,
                                  kernel_size=kernel_size,
                                  activation='relu')

motif_scores_fwd = shared_motif_convolution(input_fwd)

max_seq_scores = MaxPooling1D(pool_size=num_positions)(motif_scores_fwd)

dense_out = Dense(32, activation='relu')(max_seq_scores)

drop_out = Dropout(0.25)(dense_out)

flattened = Flatten()(drop_out)

predictions = Dense(num_classes, activation='softmax')(flattened)

model = Model(inputs=input_fwd, outputs=predictions)
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

In [587]:
# Fit the functional-API model with the held-out split as validation data.
held_out = (x_test, y_test)
model.fit(x_train, y_train, batch_size=128, epochs=10, verbose=1, validation_data=held_out)

# evaluate() returns the loss followed by the accuracy metric.
score = model.evaluate(x_test, y_test, verbose=0)
for caption, value in (('Test loss:', score[0]), ('Test accuracy:', score[1])):
    print(caption, value)

Train on 37044 samples, validate on 9262 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.3660328049680065
Test accuracy: 0.8416108831785791


## Define Model with Reverse Complement

In [513]:
# Re-read the peak (positive) and background (negative) FASTA records.
sequences = []
positive_seqRecords = list(SeqIO.parse('./c57bl6_atf3_veh_idr_peaks.fasta', 'fasta'))
negative_seqRecords = list(SeqIO.parse('./background.fasta', 'fasta'))
# Forward-strand strings, positives first, each limited to the first 200 positions.
fasta_seq = [str(record.seq[:200]) for record in positive_seqRecords + negative_seqRecords]

In [514]:
# Reverse complement of the first 200 positions of every record, in the same
# order as fasta_seq (positives first, then negatives).
fasta_rc_seq = [str(record[:200].reverse_complement().seq)
                for record in positive_seqRecords + negative_seqRecords]

In [436]:
# One-hot encode the forward-strand sequences and stack them into one array.
sequence_arrays = np.array(convert_sequences_to_array(fasta_seq))

In [517]:
# One-hot encode the reverse-complement sequences and stack them into one array.
sequence_rc_arrays = np.array(convert_sequences_to_array(fasta_rc_seq))

In [543]:
# Split forward arrays, reverse-complement arrays and labels together so each
# example keeps its strand pair and label. A fixed random_state makes the
# split itself reproducible across kernel restarts.
x_train, x_test, x_rc_train, x_rc_test, y_train, y_test = model_selection.train_test_split(
    sequence_arrays, sequence_rc_arrays, labels, test_size=0.2, random_state=0)

In [544]:
num_classes = 2
# One-hot encode the 0/1 labels for the categorical-crossentropy loss.
# Only encode while the labels are still a 1-D class vector, so re-running
# this cell does not encode the already one-hot labels a second time.
if y_train.ndim == 1:
    y_train = keras.utils.to_categorical(y_train, num_classes)
if y_test.ndim == 1:
    y_test = keras.utils.to_categorical(y_test, num_classes)

In [646]:
num_classes = 2
seq_length = 200   # positions per input sequence
kernel_size = 24   # width of each convolution filter

# A 'valid' convolution yields seq_length - kernel_size + 1 = 177 positions.
# The pooling window must span all of them to give one max score per filter
# for the whole sequence; a window of 176 would leave the last position out.
num_positions = seq_length - kernel_size + 1

input_fwd = Input(shape=(seq_length, 4), name='input_fwd')
input_rev = Input(shape=(seq_length, 4), name='input_rev')

# One convolution layer is applied to both inputs, so both strands are scored
# with the same filter weights. No input_shape is needed: the layer takes its
# input shape from the tensors it is called on.
shared_motif_convolution = Conv1D(filters=16,
                                  kernel_size=kernel_size,
                                  activation='relu')

motif_scores_fwd = shared_motif_convolution(input_fwd)
motif_scores_rev = shared_motif_convolution(input_rev)

# Element-wise maximum of the two strands' score maps.
max_strand_scores = Maximum()([motif_scores_fwd, motif_scores_rev])

max_seq_scores = MaxPooling1D(pool_size=num_positions)(max_strand_scores)

dense_out = Dense(32, activation='relu')(max_seq_scores)

drop_out = Dropout(0.25)(dense_out)

flattened = Flatten()(drop_out)

predictions = Dense(num_classes, activation='softmax')(flattened)

model = Model(inputs=[input_fwd, input_rev], outputs=predictions)

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

In [649]:
# Fit the two-input model on paired forward / reverse-complement arrays, using
# the held-out pairs as validation data, then report loss and accuracy on them.
train_inputs = [x_train, x_rc_train]
test_inputs = [x_test, x_rc_test]

model.fit(train_inputs, y_train,
          batch_size=64,
          epochs=10,
          verbose=1,
          validation_data=(test_inputs, y_test))

score = model.evaluate(test_inputs, y_test, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

Train on 37044 samples, validate on 9262 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.28490236515464445
Test accuracy: 0.8827467069361231
