In [1]:
import os
import sys
sys.path.append('/home/kal/TF_models/bin/')
os.environ['CUDA_VISIBLE_DEVICES'] = '2' # Must be before importing keras!
import tf_memory_limit

from keras.models import Model
from keras.callbacks import ModelCheckpoint, TensorBoard, Callback, LearningRateScheduler
from keras.utils import plot_model
from keras.layers import Input, Lambda, Dense, Conv1D, Activation
from keras.optimizers import SGD, Adam
import keras.backend as K


import numpy as np
import matplotlib.pylab as plt 
from sklearn.metrics import precision_recall_curve
from scipy.integrate import trapz
from tqdm import tqdm
import ucscgenome
import pandas as pd
import pickle
import time

import sequence
import train_TFmodel
import eval_TFmodel
import ctcfgen
import seq_only_gen
import train_seq_regression_convnet
# Load the frozen PWM model; it is used later as the baseline in the precision-recall cell.
pwm_model_dir = '/home/kal/TF_models/seq_only/seq_classifier/pwm_frozen/'
pwm = eval_TFmodel.TFmodel(pwm_model_dir)

Using TensorFlow backend.
  self.seq = helper.softmax(np.log(dist))


Loading model without Bias layer


In [2]:
# Directory that receives this run's weights and history files.
out_path = '/home/kal/TF_models/seq_only/count_regression/9_channel_CTCF'
# exist_ok=True keeps the cell re-runnable: a plain makedirs raises once the
# directory already exists (e.g. after Restart Kernel -> Run All).
os.makedirs(out_path, exist_ok=True)

In [3]:
# Window geometry used by the model and the sequence scrambler.
prediction_window = 256
half_window = prediction_window // 2

# Training table: a headerless, tab-separated file whose columns are named below.
bed_path = '/home/kal/TF_models/data/count_regression/ctcf_regions_9_log.bed'
columns = 'chr start end name score nucs nlog1 nlog2 nlog3 nlog4 nlog5 nlog6 nlog7 nlog8 nlog9'
# Columns used as regression targets.
score_columns = 'nlog1 nlog2 nlog3 nlog4 nlog5 nlog6 nlog7 nlog8 nlog9'.split()

peaks = pd.read_table(bed_path, header=None)
peaks.columns = columns.split()

# chr8 is held out (it feeds the 'test' and 'val' modes of native_gen); every other row is training data.
num_training_examples = sum(peaks['chr'] != 'chr8')

In [4]:
def native_gen(mode='train', once=False):
    """Generate (sequence, score) samples from the global ``peaks`` table.

    Args:
        mode: 'test' selects chr8 rows with an even index label, 'val' selects
            chr8 rows with an odd index label, and any other value selects all
            rows that are not on chr8.
        once: if True, stop after one shuffled pass over the selected rows;
            otherwise reshuffle and keep yielding indefinitely.

    Yields:
        (nucs, score) where ``nucs`` is the row's 'nucs' value and ``score`` is
        the scalar value of the only score column when ``score_columns`` has a
        single entry, or a numpy array with one value per score column otherwise.
    """
    # Bind the table and target columns once so that rebinding the notebook
    # globals in a later cell cannot change what a running generator reads.
    table = peaks
    targets = list(score_columns)

    if mode == 'test':
        indices = [x for x in table[table.chr == 'chr8'].index.values if x % 2 == 0]
    elif mode == 'val':
        indices = [x for x in table[table.chr == 'chr8'].index.values if x % 2 == 1]
    else:
        indices = np.asarray(table[table.chr != 'chr8'].index.values)

    # Nothing to yield: return instead of spinning forever in the loop below.
    if len(indices) == 0:
        return

    single_score = len(targets) == 1
    while True:
        np.random.shuffle(indices)
        for idx in indices:
            # .at is the scalar accessor that replaces the deprecated get_value.
            seq = table.at[idx, 'nucs']
            if single_score:
                # Index with the column label itself, not the whole list.
                yield seq, table.at[idx, targets[0]]
            else:
                yield seq, np.asarray([table.at[idx, c] for c in targets])
        if once:
            return
            
def scrambled_gen(scrambled, mode='train'):
    """Yield shuffled versions of the sequences produced by ``native_gen``.

    Each sequence is cut into consecutive chunks of ``scrambled`` characters,
    the chunks are shuffled, and the result is joined back into a string. If
    ``scrambled`` does not divide ``prediction_window``, a message is printed
    and single characters are shuffled instead.
    """
    kmer = scrambled
    if prediction_window % kmer != 0:
        print(str(kmer) + 'mers do not evenly divide the sequence.')
        kmer = 1
    # Only the sequence is needed; the score that comes with it is ignored.
    for seq, _score in native_gen(mode=mode):
        chunks = np.asarray(list(seq)).reshape((-1, kmer))
        # Shuffles along the first axis only, so the chunks stay intact.
        np.random.shuffle(chunks)
        yield ''.join(chunks.reshape([-1]))
            
            
def pair_gen(mode='train', once=False, batch_size=32):
    """Generate batches of paired native and scrambled samples.

    Each batch holds ``batch_size // 2`` sequences from ``native_gen`` followed
    by ``batch_size // 2`` sequences from ``scrambled_gen(2, ...)``, all encoded
    with ``sequence.encode_to_onehot``.

    Args:
        mode: forwarded to ``native_gen`` and ``scrambled_gen``.
        once: forwarded to ``native_gen``; when True the generator ends as soon
            as the native samples run out (a trailing partial batch is dropped).
        batch_size: total number of sequences per batch.

    Yields:
        (sequences, labels): ``sequences`` is the array of encoded native then
        scrambled sequences; ``labels`` holds the native scores followed by a
        block of zeros of the same shape for the scrambled sequences.

    Raises:
        ValueError: if ``batch_size`` is smaller than 2.
    """
    half_batch = batch_size // 2
    if half_batch < 1:
        raise ValueError('batch_size must be at least 2, got ' + str(batch_size))

    positives = native_gen(mode=mode, once=once)
    negatives = scrambled_gen(2, mode=mode)
    while True:
        pos_seqs = list()
        neg_seqs = list()
        scores = list()
        try:
            for _ in range(half_batch):
                pos_seq, score = next(positives)
                neg_seq = next(negatives)
                pos_seqs.append(sequence.encode_to_onehot(pos_seq))
                neg_seqs.append(sequence.encode_to_onehot(neg_seq))
                scores.append(score)
        except StopIteration:
            # A StopIteration escaping a generator body becomes RuntimeError
            # (PEP 479), so end this generator explicitly instead.
            return
        pos_labels = np.asarray(scores)
        # Size the zero labels from the scores themselves rather than a
        # hard-coded 32 // 2, so any batch_size produces matching shapes.
        neg_labels = np.zeros(pos_labels.shape)
        labels = np.concatenate([pos_labels, neg_labels], axis=0)
        yield np.asarray(pos_seqs + neg_seqs), labels

In [5]:
# Define some params
batch_size = 32
drop_rate = 0.1
# Convolution spec: '_' separates layers, and each layer is '<neurons>.<filter>'.
conv_string = '32.3_32.32_32.3_16.3'
conv_list = [[int(x) for x in cell.split('.')] for cell in conv_string.split('_')]
# One model output per score column.
num_outputs = len(score_columns)
# Get the time-tag for the model.
timestr = time.strftime("%Y%m%d_%H%M%S")
# Make the output sub-directories. exist_ok=True keeps the cell re-runnable;
# a plain makedirs raises once the directories already exist.
weights_path = os.path.join(out_path, 'intermediate_weights')
os.makedirs(weights_path, exist_ok=True)
history_path = os.path.join(out_path, 'history')
os.makedirs(history_path, exist_ok=True)

In [6]:
# Define the model.
# A one-hot input with reverse complement --> series of convolutions --> smoothing --> maximum over directions --> bias.
# NOTE(review): no separate activation layer is added in this cell (final_activation=None) — confirm that is intended.
# Input *is* one hot encoded - and type np.uint8.
# NOTE(review): `input` shadows the Python builtin of the same name for the rest of the notebook.
input = Input(batch_shape=(batch_size, prediction_window, 4))
# Add the reverse complement so the model only has to learn one direction.
# The batch is doubled along axis 0; the copy is reversed along both the position and the channel axis
# (presumably the channel order makes a channel flip equal to base complementing — TODO confirm against sequence.encode_to_onehot).
add_RC_to_batch = Lambda(lambda x: K.concatenate([x, x[:, ::-1, ::-1]], axis=0), output_shape=lambda s: (2 * s[0], s[1], s[2]))  
# Output an activation at each base from convolutions of convolutions.
per_base_score = train_TFmodel.BasicConv(prediction_window, conv_list, final_activation=None, num_outputs=num_outputs, drop_rate=drop_rate) 
# Take a wide-window (50 positions) convolution scan with fixed, all-ones weights to smooth out peaks.
# NOTE(review): a standard Conv1D sums over every input channel for each filter, so with an all-ones kernel
# all num_outputs channels of this layer would carry the same value — confirm that per-channel smoothing was not intended.
wide_scan = Conv1D(num_outputs, 50, use_bias=False, kernel_initializer='ones', trainable=False, name='wide_scan', padding='valid')
# Get the forward/reverse sequence maximum: the first half of the batch is the forward input and the second half
# is the reverse-complement copy. The ::-1 on the position axis does not change the result of a max over that axis.
max_by_direction = Lambda(lambda x: K.maximum(K.max(x[:x.shape[0]//2, :, :], axis=1), K.max(x[x.shape[0]//2:, ::-1, :], axis=1)), name='stackmax', output_shape=lambda s: (s[0] // 2, num_outputs))
# Add a custom bias layer
final_bias = train_TFmodel.Bias(num_outputs, name='bias')

# Build the model by chaining the layers in the order described above.
predictions = final_bias(max_by_direction(wide_scan(per_base_score(add_RC_to_batch(input)))))
model = Model(inputs=[input], outputs=[predictions])
# save a graph of the model configuration
#plot_model(model, to_file=os.path.join(out_path, timestr + '_' + conv_string + '_model.png'), show_shapes=True)

Convolutions used: [[32, 3], [32, 32], [32, 3], [16, 3]] [neurons, filter]


In [None]:
verb = 0
loss_func = 'mean_squared_error'
num_epochs = 3

# Create optimizers for the three learning phases. Each phase uses 1/10th of the
# previous learning rate and a smaller epsilon.
optimizer_1 = Adam(beta_1=0.95, lr=0.0005, epsilon=.1)
optimizer_2 = Adam(beta_1=0.95, lr=0.00005, epsilon=.001)
optimizer_3 = Adam(beta_1=0.95, lr=0.000005, epsilon=.00001)

# Create a callback to save the model weights of the first phase.
checkpath = os.path.join(weights_path, '_'.join([timestr, 'weights_1_{epoch:02d}_{val_loss:.2f}.hdf5']))
checkpointer = ModelCheckpoint(checkpath, verbose=verb, monitor='val_loss', mode='min')
callbacks_list = [checkpointer]

# Create data generators. The model input is built with a fixed batch_shape, so
# pass the same batch_size explicitly instead of relying on pair_gen's default.
traingen = pair_gen(mode='train', batch_size=batch_size)
valgen = pair_gen(mode='val', batch_size=batch_size)

In [None]:
# Train the model in three phases, one per optimizer.
num_batches = len(peaks[peaks.chr != 'chr8']) // batch_size
print(str(num_batches) + ' batches')

# Each Keras 'epoch' covers only 1/50 of the training batches, and each phase
# runs num_epochs * 50 // 3 of them, so the three phases together amount to
# roughly num_epochs passes over the training data.
steps_per_epoch = num_batches // 50
epochs_per_phase = num_epochs * 50 // 3

histories = []
for phase, optimizer in enumerate((optimizer_1, optimizer_2, optimizer_3), start=1):
    # One checkpoint file per epoch, tagged with the phase number.
    checkpath = os.path.join(
        weights_path,
        '_'.join([timestr, 'weights_' + str(phase) + '_{epoch:02d}_{val_loss:.2f}.hdf5']))
    # val_loss is a quantity to minimise, so every phase uses mode='min'.
    checkpointer = ModelCheckpoint(checkpath, verbose=verb, monitor='val_loss', mode='min')
    callbacks_list = [checkpointer]

    # Re-compile so that this phase trains with its own optimizer.
    model.compile(loss=loss_func, optimizer=optimizer, metrics=['mse'])
    histories.append(model.fit_generator(
        traingen,
        steps_per_epoch=steps_per_epoch,
        epochs=epochs_per_phase,
        validation_data=valgen,
        validation_steps=20,
        callbacks=callbacks_list,
        verbose=verb))

# Keep the per-phase names the original cell defined.
History_1, History_2, History_3 = histories

14522 batches


In [None]:
#do this in bash
# Trains a model through train_seq_regression_convnet.make_model, writing to out_path,
# and then loads the result from out_path as `ml_model`.
# NOTE(review): `gen_path` is not defined in any cell visible in this notebook, so this
# cell raises NameError on a fresh kernel unless it is defined elsewhere — confirm.
# NOTE(review): the convolution spec passed here ('32.3_32.32_16.3_8.3') differs from
# the notebook's conv_string ('32.3_32.32_32.3_16.3') — confirm which one is intended.
print('Training model')
train_seq_regression_convnet.make_model(out_path, '32.3_32.32_16.3_8.3', gen_path, loss_func='mean_squared_error', num_epochs=9, verb=0)
ml_model = eval_TFmodel.TFmodel(out_path)

In [None]:
# Load a trained model and the pickled training histories of its three phases.
# NOTE: this rebinds out_path to a different model directory than the one set
# at the top of the notebook.
out_path='/home/kal/TF_models/seq_only/count_regression/test2_CTCF/'
ml_model = eval_TFmodel.TFmodel(out_path)

history_dir = os.path.join(out_path, 'history')
phase_histories = {}
for file_name in os.listdir(history_dir):
    # Find the history pickles: files ending in '1.pk1', '2.pk1' or '3.pk1'.
    for phase in ('1', '2', '3'):
        if file_name.endswith(phase + '.pk1'):
            # pickle.load can execute arbitrary code; only load trusted files.
            with open(os.path.join(history_dir, file_name), 'rb') as infile:
                phase_histories[phase] = pickle.load(infile)
            break

# Fail here with a clear message rather than leaving h1/h2/h3 unbound (or
# holding stale values from an earlier run) for the plotting cell.
missing = [phase for phase in ('1', '2', '3') if phase not in phase_histories]
if missing:
    raise FileNotFoundError(
        'No history pickle found in ' + history_dir + ' for phase(s): ' + ', '.join(missing))

h1 = phase_histories['1']
h2 = phase_histories['2']
h3 = phase_histories['3']

In [None]:
# Plot training and validation loss from the h1, h2 and h3 histories.
plt.plot(eval_TFmodel.group_stats('loss', h1, h2, h3), label='train')
# val_loss is the validation loss, so label it as such to match the title.
plt.plot(eval_TFmodel.group_stats('val_loss', h1, h2, h3), label='validation')
plt.title('Training and Validation Loss for CTCF model')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(loc='upper left')
plt.show()

In [None]:
# Predict on a sample of ATAC regions from K562 with both models; the next
# cell turns these predictions into precision-recall curves.
# NOTE: this rebinds `bed_path` and `peaks`, which earlier cells use for the
# training table; re-run the data cell before creating new training generators.
npreds = dict()
bed_path = '/home/kal/TF_models/data/K562_atac_peaks/final_atac.bed'
peaks = pd.read_table(bed_path, header=None)
# Seed the sample so the evaluation is reproducible, and cap the sample size
# at the table length so short files do not raise.
peaks = peaks.sample(min(1000, len(peaks)), random_state=0)
peaks.columns = 'chr start end ctcf_label . . '.split()
# Drop the regions on chrM.
peaks = peaks[peaks['chr']!='chrM']
print(len(peaks))
pwm_preds = pwm.predict_bed(peaks)
true_labels = peaks['ctcf_label']
ml_preds = ml_model.predict_bed(peaks)

In [None]:
# Precision-recall curves for the PWM model and the ML model on the sampled regions.
pwm_p, pwm_r, pwm_t = precision_recall_curve(peaks['ctcf_label'], pwm_preds, pos_label=1)
p, r, t = precision_recall_curve(peaks['ctcf_label'], ml_preds, pos_label=1)

plt.figure(figsize=(8,6))
plt.title('P-R Curve for CTCF Binding in K562 ATAC Regions')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.plot(pwm_r, pwm_p, label='PWM Model')
plt.plot(r, p, label='ML Model')
plt.legend()
plt.show()

# Area under the ML model's curve. precision_recall_curve returns recall in
# decreasing order, so the trapezoid integral is negated to get a positive area.
print('AOC: ' + str(-trapz(p, r)))