In [9]:
import os
import sys
sys.path.append('/home/kal/TF_models/bin/')
sys.path.append('/home/kal/K27act_models/convolution_model/')
sys.path.append('/home/kal/K27act_models/cg_model/')
sys.path.append('/home/thouis/basenji_embeddings')
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import tf_memory_limit
from zinb import ZINB
import seaborn as sns
import h5py
import matplotlib.pyplot as plt
from keras.utils import plot_model

import pandas as pd
import numpy as np
import itertools
import sequence
import ucscgenome
from tqdm import tqdm
import time
import datagen
import sequence
import helper
import viz_sequence
import pickle
import cg_annotations

from keras.models import Model, load_model
from keras.layers import Input, Dense, SpatialDropout1D, Conv1D, Lambda, Dropout, Activation
from keras.optimizers import RMSprop, SGD
from keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from keras import backend as K
import tensorflow as tf

In [3]:
#load in the DNA
genome = ucscgenome.Genome('/home/kal/.ucscgenome/hg19.2bit')

# load in ATAC data
atac_path = '/home/kal/K27act_models/GM_data/ATAC/atac_average.hdf5'
atac = h5py.File(atac_path, 'r')

# load in (semi) preprocessed data
peaks_path = '/home/kal/K27act_models/GM_data/k27act_training_regions.bed'
peaks = pd.read_table(peaks_path, header=None)
peaks.columns='chr start end name log_fold atac k27act nucs'.split()

# facts about the data
num_training_samples = len(peaks[(peaks.chr != 'chr8')])
print('{} training samples'.format(num_training_samples))

num_testing_samples = len(peaks[(peaks.chr == 'chr8') & (peaks.index%2 == 0)])
print('{} testing samples'.format(num_testing_samples))

num_validaiton_samples = len(peaks[(peaks.chr == 'chr8') & (peaks.index%2 == 1)])
print('{} validation samples'.format(num_validaiton_samples))

160363 training samples
3413 testing samples
3414 validation samples


In [5]:
# make score cg for regression matched ml
peaks['score'] = peaks.apply(cg_annotations.gc_frac, axis=1)
#peaks['cpg_frac'] = peaks.apply(cg_annotations.cpg_frac, axis=1)

In [6]:
# directory stuff
out_dir = '/home/kal/K27act_models/cg_model/'
timestr = time.strftime("%Y%m%d_%H%M%S")
out_path = os.path.join(out_dir, timestr + '_cg_ml')
os.makedirs(out_path)
# make a file system
weights_path = os.path.join(out_path, 'intermediate_weights')
os.makedirs(weights_path)
history_path = os.path.join(out_path, 'history')
os.makedirs(history_path)

In [8]:
# model's input
batch_size=32
seqs = Input(batch_shape=(batch_size, 1024, 5))
num_outputs=1

#build the model
def add_RC(x):
    RC = K.concatenate([x[:, ::-1, :1], x[:, ::-1, 1:][::-1]], axis=2)
    return K.concatenate([x, RC], axis=0)
def add_RC_shape(s):
    return 2*s[0], s[1], s[2]

add_rc = Lambda(add_RC, name='add_rc', output_shape=add_RC_shape)
conv = Conv1D(128, 32, name='conv_in')
drop = SpatialDropout1D(.1)
dense = Dense(num_outputs)

def max_by_direction(x):
    forward_max = K.max(x[:x.shape[0]//2, :, :], axis=1)
    reverse_max = K.max(x[x.shape[0]//2:, ::-1, :], axis=1)
    return K.maximum(forward_max, reverse_max)
def max_by_direction_shape(s):
    return s[0]//2, 1

wide_scan = Conv1D(1, 128, use_bias=False, kernel_initializer='ones', trainable=False, name='wide_scan', padding='valid')
take_max = Lambda(max_by_direction, name='max_by_direciton', output_shape=max_by_direction_shape)

predictions = take_max(wide_scan(dense(drop(conv(add_rc(seqs))))))

model = Model(inputs=seqs, outputs=predictions)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (32, 1024, 5)             0         
_________________________________________________________________
add_rc (Lambda)              (64, 1024, 5)             0         
_________________________________________________________________
conv_in (Conv1D)             (64, 993, 128)            20608     
_________________________________________________________________
spatial_dropout1d_2 (Spatial (64, 993, 128)            0         
_________________________________________________________________
dense_2 (Dense)              (64, 993, 1)              129       
_________________________________________________________________
wide_scan (Conv1D)           (64, 866, 1)              128       
_________________________________________________________________
max_by_direciton (Lambda)    (32, 1)                   0         
Total para

In [None]:
from keras.optimizers import RMSprop, SGD, Adam

opt = Adam(beta_1=0.95, lr=1e-5, epsilon=.001)
model.compile(optimizer=opt, loss='mean_squared_error') 

early_stop = EarlyStopping(monitor='val_loss', patience=100)
filepath = os.path.join(weights_path, 'weights-{epoch:02d}-{val_loss:.3f}.hdf5')
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

losses = model.fit_generator(datagen.batch_gen(peaks, mode='train', log=True), 
                             steps_per_epoch=num_training_samples//batch_size, 
                             epochs=20, callbacks=[early_stop, checkpoint], 
                             validation_data=datagen.batch_gen(peaks, mode='val', log=True), 
                             validation_steps=num_validaiton_samples//batch_size, verbose=2)

Epoch 1/20
 - 406s - loss: 8201.1474 - val_loss: 4620.2170

Epoch 00001: val_loss improved from inf to 4620.21704, saving model to /home/kal/K27act_models/cg_model/20180710_135752_cg_ml/intermediate_weights/weights-01-4620.217.hdf5
Epoch 2/20
 - 408s - loss: 5508.0746 - val_loss: 4867.8939

Epoch 00002: val_loss did not improve from 4620.21704
Epoch 3/20


In [None]:
# final save
model.save(os.path.join(out_path, 'final_model.h5'))

# write out history
pickle.dump(losses.history, open(os.path.join(history_path, 'history.pk'), 'wb'))