In [None]:
import os
import sys
sys.path.append('/home/kal/TF_models/bin/')
sys.path.append('/home/kal/K27act_models/convolution_model/')
sys.path.append('/home/kal/K27act_models/cg_model/')
sys.path.append('/home/thouis/basenji_embeddings')
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import tf_memory_limit
from zinb import ZINB
import seaborn as sns
import h5py
import matplotlib.pyplot as plt
from keras.utils import plot_model

import pandas as pd
import numpy as np
import itertools
import sequence
import ucscgenome
from tqdm import tqdm
import time
import datagen
import sequence
import helper
import viz_sequence
import pickle

from keras.models import Model, load_model
from keras.layers import Input, Dense, SpatialDropout1D, Conv1D, Lambda, Dropout, Activation
from keras.optimizers import RMSprop, SGD
from keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from keras import backend as K
import tensorflow as tf

import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)

In [None]:
# Open the genome sequence from the local hg19 2bit file.
genome = ucscgenome.Genome(
    '/home/kal/.ucscgenome/hg19.2bit',
)

In [None]:
# Open the ATAC HDF5 file read-only.
# NOTE(review): the handle stays open; nothing in the visible cells closes it.
atac_path = '/home/kal/K27act_models/GM_data/ATAC/atac_average.hdf5'
atac = h5py.File(atac_path, mode='r')

In [None]:
# Load the preprocessed training regions (headerless table, default tab separator).
peaks_path = '/home/kal/K27act_models/GM_data/k27act_training_regions.bed'
peaks = pd.read_table(peaks_path, header=None)
# note we make the 'k27act' column the 'score' column for the correct training behavior
peaks.columns = 'chr start end name fold_change atac score nucs'.split()

# Overlay histograms of log2(score + 1) for the regions whose name contains
# 'dhs' and for the regions whose name contains 'k27'.
bins = 50
for tag, label in (('dhs', 'DHS'), ('k27', 'K27act')):
    # Vectorised literal substring match (regex=False mirrors the plain `in`
    # test); na=False treats a missing name as "no match" instead of raising.
    tagged = peaks['name'].str.contains(tag, regex=False, na=False)
    plt.hist(np.log2(peaks.loc[tagged, 'score'] + 1), label=label, alpha=0.5, bins=bins)
plt.ylabel('Number of peaks')
plt.xlabel('Log Average Normalized K27act')
# NOTE(review): 'GM128' may be intended as the cell line GM12878 -- confirm.
plt.title('Distribution of k27 acetylation for GM128 experiments')
plt.legend()
plt.show()

In [None]:
# facts about the data
# Rows off chr8 are counted as training; chr8 rows are split by index parity
# into testing (even) and validation (odd).
# NOTE(review): presumably this mirrors the split used inside datagen.batch_gen -- confirm.
on_chr8 = peaks.chr == 'chr8'
off_chr8 = peaks.chr != 'chr8'
even_row = peaks.index % 2 == 0
odd_row = peaks.index % 2 == 1

num_training_samples = int(off_chr8.sum())
print('{} training samples'.format(num_training_samples))

num_testing_samples = int((on_chr8 & even_row).sum())
print('{} testing samples'.format(num_testing_samples))

num_validaiton_samples = int((on_chr8 & odd_row).sum())
print('{} validation samples'.format(num_validaiton_samples))

In [None]:
# directory stuff
# Every run writes into its own timestamped folder under out_dir, with one
# sub-folder for intermediate weights and one for the training history.
out_dir = '/home/kal/K27act_models/convolution_model/'
timestr = time.strftime("%Y%m%d_%H%M%S")
out_path = os.path.join(out_dir, timestr + '_counts_conv')
weights_path = os.path.join(out_path, 'intermediate_weights')
history_path = os.path.join(out_path, 'history')

# make a file system (raises if any of the folders already exists)
for new_dir in (out_path, weights_path, history_path):
    os.makedirs(new_dir)

In [None]:
# MODEL
# The batch size is part of the input's batch_shape, so the model is built
# for batches of exactly this many sequences of shape (1024, 5).
batch_size = 32
seqs = Input(
    batch_shape=(batch_size, 1024, 5),
)

def add_RC(x):
    """Append a reverse-complemented copy of each sequence along the batch axis.

    Args:
        x: 3-D tensor indexed as (batch, position, channel).

    Returns:
        A tensor with twice the batch size: the original batch followed by
        its reverse-complemented counterpart, in the same sample order.

    Channel 0 is only position-reversed. The remaining channels are
    position-reversed and have their channel order flipped.
    NOTE(review): assumes channels 1: are one-hot bases ordered so that
    flipping the channel order maps each base to its complement (e.g.
    A, C, G, T) -- confirm against the data generator.
    """
    # First channel: reverse positions only.
    first_channel_rev = x[:, ::-1, :1]
    # Other channels: reverse positions (axis 1) and channel order (axis 2).
    # A bare `[::-1]` on this slice would flip axis 0 (the batch) instead,
    # pairing each sequence with a different sample and never complementing it.
    rest_rev_comp = x[:, ::-1, 1:][:, :, ::-1]
    RC = K.concatenate([first_channel_rev, rest_rev_comp], axis=2)
    return K.concatenate([x, RC], axis=0)
def add_RC_shape(s):
    """Output shape for ``add_RC``: the first dimension doubled, the next two unchanged.

    Args:
        s: input shape; only its first three entries are read.

    Returns:
        A 3-tuple ``(2 * s[0], s[1], s[2])``.
    """
    batch, length, channels = s[0], s[1], s[2]
    return (2 * batch, length, channels)

# Double the batch: originals followed by the copies produced by add_RC.
conv = Lambda(add_RC, output_shape=add_RC_shape, name='add_rc')(seqs)

# Convolution stack, each layer followed by light spatial dropout.
# NOTE(review): no `activation` is passed to Conv1D or Dense, so the Keras
# defaults apply -- confirm that is intended.
conv_specs = [(128, 8), (64, 32), (64, 3), (32, 3)]  # (filters, kernel_size)
for n_filters, kernel_width in conv_specs:
    conv = Conv1D(filters=n_filters, kernel_size=kernel_width)(conv)
    conv = SpatialDropout1D(rate=.05)(conv)

# Dense head applied to the conv output, reducing to a single output unit.
dense_hidden = Dense(units=32)(conv)
dense_dropped = Dropout(rate=.1)(dense_hidden)
out = Dense(units=1)(dense_dropped)

def max_by_direction(x):
    """Merge the two halves of the batch by taking the best score of either.

    The first half of the batch axis and the second half (read with axis 1
    reversed) are each reduced with a max over axis 1, and the element-wise
    maximum of the two results is returned.

    Args:
        x: 3-D tensor whose first dimension is statically known.

    Returns:
        Tensor with half the batch size and axis 1 reduced away.
    """
    half = x.shape[0] // 2
    forward_half = x[:half, :, :]
    reverse_half = x[half:, ::-1, :]
    best_forward = K.max(forward_half, axis=1)
    best_reverse = K.max(reverse_half, axis=1)
    return K.maximum(best_forward, best_reverse)
def max_by_direction_shape(s):
    """Output shape for ``max_by_direction``: half the first dimension, one value each.

    Args:
        s: input shape; only its first entry is read.

    Returns:
        A 2-tuple ``(s[0] // 2, 1)``.
    """
    merged_batch = s[0] // 2
    return (merged_batch, 1)

# Frozen, bias-free convolution with an all-ones kernel: it adds up the
# single-channel scores over every 128-wide window.
wide_scan = Conv1D(
    filters=1,
    kernel_size=128,
    padding='valid',
    use_bias=False,
    kernel_initializer='ones',
    trainable=False,
    name='wide_scan',
)

# Windowed sums -> max across windows and batch halves -> ReLU.
window_scores = wide_scan(out)
scan = Lambda(
    max_by_direction,
    output_shape=max_by_direction_shape,
    name='max_by_direciton',
)(window_scores)
predictions = Activation('relu')(scan)
model = Model(inputs=seqs, outputs=predictions)

# plot model
model_png = os.path.join(out_path, 'model.png')
plot_model(model, to_file=model_png)

In [None]:
from keras.optimizers import RMSprop, SGD, Adam

# Compile with the optimiser defined right here. Passing the same name that
# was just bound keeps the cell runnable on a fresh kernel (the previous
# `opt` was never defined in this notebook).
opt_1 = Adam(beta_1=0.95, lr=1e-5, epsilon=.001)
model.compile(optimizer=opt_1, loss='mean_absolute_error')

# NOTE(review): patience (100) exceeds the epoch count below (20), so early
# stopping cannot trigger in this run.
early_stop = EarlyStopping(monitor='val_loss', patience=100)
# Save a checkpoint whenever the validation loss improves.
filepath = os.path.join(weights_path, 'weights-{epoch:02d}-{val_loss:.3f}.hdf5')
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# NOTE(review): the model input has a fixed batch size; presumably
# datagen.batch_gen yields batches of `batch_size` -- confirm.
losses = model.fit_generator(datagen.batch_gen(peaks, mode='train', log=True),
                             steps_per_epoch=num_training_samples//batch_size,
                             epochs=20, callbacks=[early_stop, checkpoint],
                             validation_data=datagen.batch_gen(peaks, mode='val', log=True),
                             validation_steps=num_validaiton_samples//batch_size, verbose=2)

# Per-epoch loss curves, kept for plotting.
val_hist = losses.history['val_loss']
train_hist = losses.history['loss']

In [None]:
# final save
# Write the model as it stands after training into the run's output folder.
final_model_path = os.path.join(out_path, 'final_model.h5')
model.save(final_model_path)

In [None]:
# write out history
# Use a context manager so the pickle file is flushed and closed as soon as
# the dump finishes, rather than leaving an open handle behind.
history_file = os.path.join(history_path, 'history.pk')
with open(history_file, 'wb') as handle:
    pickle.dump(losses.history, handle)

In [None]:
# look at loss
# Validation is drawn first, then training, both against the epoch index.
for curve, curve_label in ((val_hist, 'validation'), (train_hist, 'training')):
    plt.plot(curve, label=curve_label)
plt.title('Loss for training model')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()