In [1]:
import os
import sys
sys.path.append('/home/kal/TF_models/bin/')
sys.path.append('/home/kal/K27act_models/convolution_model/')
sys.path.append('/home/kal/K27act_models/cg_model/')
sys.path.append('/home/thouis/basenji_embeddings')
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import tf_memory_limit
from zinb import ZINB
import seaborn as sns
import h5py
import matplotlib.pyplot as plt
from keras.utils import plot_model

import pandas as pd
import numpy as np
import itertools
import sequence
import ucscgenome
from tqdm import tqdm
import time
import datagen

from keras.models import Model
from keras.layers import Input, Dense, SpatialDropout1D, Conv1D, Lambda
from keras.optimizers import RMSprop, SGD
from keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from keras import backend as K
import tensorflow as tf

Using TensorFlow backend.
  self.seq = helper.softmax(np.log(dist))


In [2]:
#load in the DNA
genome = ucscgenome.Genome('/home/kal/.ucscgenome/hg19.2bit')

In [3]:
# load in ATAC data
atac_path = '/home/kal/K27act_models/GM_data/ATAC/atac_average.hdf5'
atac = h5py.File(atac_path, 'r')

In [None]:
# load in peaks data
peaks_path = '/home/kal/K27act_models/GM_data/merged_annotated.bed'
peaks = pd.read_table(peaks_path, header=None)
peaks.columns='chr narrowstart narrowend name score atac k27act'.split()

peaks['start'] = (peaks['narrowend'] + peaks['narrowstart'])//2 - 512
peaks['end'] = (peaks['narrowend'] + peaks['narrowstart'])//2 + 512

In [None]:
# preproces the data
for idx, row in tqdm(peaks.iterrows(), total=len(peaks)):
    peaks.set_value(idx, 'nucs', genome[row.chr][row.start:row.end])
    #mini.set_value(idx, 'coverage', atac[row.chr][row.start:row.end])
    
columns='chr start end name score atac k27act nucs'.split()
peaks.to_csv('/home/kal/K27act_models/GM_data/k27act_training_regions.bed', columns=columns, header=None, index=False, sep='\t')

In [4]:
# load in preprocessed data
peaks_path = '/home/kal/K27act_models/GM_data/k27act_training_regions.bed'
peaks = pd.read_table(peaks_path, header=None)
peaks.columns='chr start end name score atac k27act nucs'.split()

In [5]:
# facts about the data
num_training_samples = len(peaks[(peaks.chr != 'chr8')])
print('{} training samples'.format(num_training_samples))

num_testing_samples = len(peaks[(peaks.chr == 'chr8') & (peaks.index%2 == 0)])
print('{} testing samples'.format(num_testing_samples))

num_validaiton_samples = len(peaks[(peaks.chr == 'chr8') & (peaks.index%2 == 1)])
print('{} validation samples'.format(num_validaiton_samples))

140163 training samples
3011 testing samples
3011 validation samples


In [6]:
# directory stuff
out_dir = '/home/kal/K27act_models/convolution_model/'
timestr = time.strftime("%Y%m%d_%H%M%S")
out_path = os.path.join(out_dir, timestr)
os.makedirs(out_path)
# make a file system
weights_path = os.path.join(out_path, 'intermediate_weights')
os.makedirs(weights_path)
history_path = os.path.join(out_path, 'history')
os.makedirs(history_path)

In [7]:
# MODEL
batch_size=32
seqs = Input(batch_shape=(batch_size, 1024, 5))

def add_RC(x):
    RC = K.concatenate([x[:, ::-1, :1], x[:, ::-1, 1:][::-1]], axis=2)
    return K.concatenate([x, RC], axis=0)
def add_RC_shape(s):
    return 2*s[0], s[1], s[2]

rc = Lambda(add_RC, name='add_rc', output_shape=add_RC_shape)(seqs)
conv = Conv1D(32, 32, name='conv_in')(rc)
#for k, n in [[32, 3], [32, 32], [16, 3], [8, 3]]:
#    conv = Conv1D(k, n)(conv)
#    conv = SpatialDropout1D(.05)(conv)
  
#conv = Dense(8)(conv)
#out = Dense(1)(conv)

def max_by_direction(x):
    forward_max = K.max(x[:x.shape[0]//2, :, :], axis=1)
    reverse_max = K.max(x[x.shape[0]//2:, ::-1, :], axis=1)
    return K.maximum(forward_max, reverse_max)
def max_by_direction_shape(s):
    return s[0]//2, 1

predictions = Lambda(max_by_direction, name='max_by_direciton', output_shape=max_by_direction_shape)(conv)
model = Model(inputs=seqs, outputs=predictions)

#zinb loss stuff (only for counts, not log fold change)
#pi_layer = Dense(num_outputs, activation='sigmoid')
#pi = pi_layer(expand)
#zinb = ZINB(pi, theta_init=tf.zeros([1, num_outputs]))

#model.layers[-1].trainable_weights.extend([zinb.theta_variable, *pi_layer.trainable_weights])

#plot model
plot_model(model, to_file=os.path.join(out_path, 'model.png'))

In [None]:
from keras.optimizers import RMSprop, SGD, Adam

opt = RMSprop(lr=1e-5)
model.compile(optimizer=opt, loss='mean_squared_error') 

early_stop = EarlyStopping(monitor='val_loss', patience=100)
filepath = os.path.join(weights_path, 'weights-{epoch:02d}-{val_loss:.3f}.hdf5')
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

losses = model.fit_generator(datagen.batch_gen(peaks, mode='train'), steps_per_epoch=num_training_samples//batch_size, 
                             epochs=20, callbacks=[early_stop, checkpoint], validation_data=datagen.batch_gen(peaks,mode='val'), 
                             validation_steps=num_validaiton_samples//batch_size, verbose=2)


val_hist = losses.history['val_loss']
train_hist = losses.history['loss']

Epoch 1/20
 - 258s - loss: 3044.5503 - val_loss: 532.0186

Epoch 00001: val_loss improved from inf to 532.01857, saving model to /home/kal/K27act_models/convolution_model/20180628_121908/intermediate_weights/weights-01-532.019.hdf5
Epoch 2/20


In [None]:
# final save
model_json = model.to_json()
with open(os.path.join(out_path, 'model.json'), 'w') as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights(os.path.oin(out_path, 'final_model.h5'))
print('Saved model')