The pipeline will be as follows:

1. Use Peyton's model json architecture file + h5 weights file to generate predictions for all the train/val/test examples
2. Write these predictions to a file. These will be used as labels for the MPRA model
3. Convert sequences to 145bp by taking the central 145bp regions
4. Using these sequences as input, train to predict accessibility outputs
5. Initialize MPRA models with this saved architecture

# Use accessibility model architecture + weights to generate predictions

In [2]:
from keras.models import model_from_json
import json

# json_path = "../model_files/atac_xferlearn_jul24/record_12_model_FbFup_modelJson.json"
# json_path = "../model_files/sharpr_znormed_jul23/record_13_model_bgGhy_modelJson.json"
# json_path = "../model_files/regressionJun24Positives/record_2_model_Yjv2n_modelJson.json"
json_path = "../model_files/atac_xferlearn_jul24/transferlearn_json_keras1.json"
# json_path = "../model_files/atac_xferlearn_jul24/record_3_model_RatMV_modelJson.json"
with open(json_path) as json_file:
    json_string = json.dumps(json.load(json_file))
    model = model_from_json(json_string) 
    
print(model.summary())

Using Theano backend.
Using cuDNN version 6020 on context None
Mapped name None to device cuda: Tesla P100-PCIE-16GB (0000:03:00.0)


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
conv1_layer0 (Convolution1D)     (None, 984, 300)      20700       convolution1d_input_1[0][0]      
____________________________________________________________________________________________________
batchnorm1_layer1 (BatchNormaliz (None, 984, 300)      1200        conv1_layer0[0][0]               
____________________________________________________________________________________________________
activation1_layer2 (Activation)  (None, 984, 300)      0           batchnorm1_layer1[0][0]          
____________________________________________________________________________________________________
conv2_layer3 (Convolution1D)     (None, 978, 250)      525250      activation1_layer2[0][0]         
___________________________________________________________________________________________

In [19]:
import h5py
import numpy as np

# (300, 4, 17, 1) --> (17, 1, 4, 300)
# swap 0,3; 1,2; 0,1: 0, 1, 2, 3 --> 3, 1, 2, 0 --> 3, 2, 1, 0 --> 2, 3, 1, 0

f = h5py.File('../model_files/atac_xferlearn_jul24/record_12_model_FbFup_modelWeights.h5', 'r')

# Checking to see if shapes match up
# # print f.keys()
# # print model.layers[0].name
# print "Previous model weight shapes"
# for i in range(len(f.keys())):
#     layer = 'layer_%d' % i
# #     print layer
# #     print f[layer].keys()
#     for params in f[layer].keys():
#         print f[layer][params].shape

# print "\nNew model weight shapes"
# for w in model.get_weights():
#     print w.shape

weights = []
for i in range(len(f.keys())):
    layer = 'layer_%d' % i
#     if len(f[layer].keys()) == 4:
#         batchnorm_params = f[layer].keys()
#         print batchnorm_params
# #         print f[layer][batchnorm_params[2]].shape
#         weights.append(f[layer][batchnorm_params[0]])
#         weights.append(f[layer][batchnorm_params[1]])
#         weights.append(f[layer][batchnorm_params[2]])
#         weights.append(f[layer][batchnorm_params[3]])
#         continue
    for params in f[layer].keys():
        layer_W = np.array(f[layer][params])
        if len(layer_W.shape) == 4:
            layer_W = np.swapaxes(np.swapaxes(np.swapaxes(layer_W, 0, 3), 1, 2), 0, 1)
        weights.append(layer_W)

model.set_weights(weights)

In [22]:
import h5py
import numpy as np
import time

train_data = h5py.File("../hdf5files/atac_xferlearn_jul24/train_data.hdf5")
X_train = train_data['X']['sequence']
y_train_pred = np.ndarray(shape = (len(X_train), 16))

batch_size = 500
t0 = time.time()
print "Total batches: %d" % (len(X_train)/batch_size + 1)
for i in range(len(X_train)/batch_size + 1):
    if i % 100 == 0 and i > 0:
        print("Batches %d to %d took %.3f sec" % (i-100, i, time.time() - t0))
        t0 = time.time()
        print "On batch %d" % i
    if (i+1)*batch_size > len(X_train):
        y_train_pred[i*batch_size:] = model.predict_on_batch(X_train[i*batch_size:])
    y_train_pred[i*batch_size : (i+1)*batch_size] = model.predict_on_batch(X_train[i*batch_size : (i+1)*batch_size])

Total batches: 1676
Batches 0 to 100 took 66.978
On batch 100
Batches 100 to 200 took 68.213
On batch 200
Batches 200 to 300 took 68.261
On batch 300
Batches 300 to 400 took 67.920
On batch 400
Batches 400 to 500 took 67.196
On batch 500
Batches 500 to 600 took 67.861
On batch 600
Batches 600 to 700 took 67.277
On batch 700
Batches 700 to 800 took 68.547
On batch 800
Batches 800 to 900 took 68.343
On batch 900
Batches 900 to 1000 took 66.979
On batch 1000
Batches 1000 to 1100 took 66.711
On batch 1100
Batches 1100 to 1200 took 71.508
On batch 1200
Batches 1200 to 1300 took 68.118
On batch 1300
Batches 1300 to 1400 took 67.790
On batch 1400
Batches 1400 to 1500 took 70.220
On batch 1500
Batches 1500 to 1600 took 68.200
On batch 1600


In [6]:
import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score
from scipy.stats import spearmanr
import time

def generate_predictions(model, X, ntasks, batchsize=500):
    y_pred = np.ndarray(shape = (len(X), ntasks))
    
    t0 = time.time()
    print "Total batches: %d" % (len(X)/batchsize + 1)
    for i in range(len(X)/batchsize + 1):
        if (i % 100 == 0 or i == len(X)/batchsize) and i > 0:
            print("Batches %d to %d took %.3f sec" % (i-100, i, time.time() - t0))
            t0 = time.time()
            print "On batch %d" % i
        if (i+1)*batchsize > len(X):
            y_pred[i*batchsize:] = model.predict_on_batch(X[i*batchsize:])
        y_pred[i*batchsize : (i+1)*batchsize] = model.predict_on_batch(X[i*batchsize : (i+1)*batchsize])
    
    return y_pred

def evaluate_predictions(y_true, y_pred, ntasks):
    for i in range(ntasks):
        y_true_task = y_true[:, i]
        y_pred_task = y_pred[:, i]
        auroc = roc_auc_score(y_true_task, y_pred_task)
        auprc = average_precision_score(y_true_task, y_pred_task)
        sprmn = spearmanr(y_true_task, y_pred_task)
        print "Task %d: AUROC = %.3f, AUPRC = %.3f, Spearman = %.3f, p = %.3f" % (i, auroc, auprc, sprmn[0], sprmn[1])

In [20]:
val_data = h5py.File("../hdf5files/atac_xferlearn_jul24/valid_data.hdf5")
X_val = val_data['X']['sequence'][:600]

y_pred_val = generate_predictions(model, X_val, batchsize = 500, ntasks = 16)

Total batches: 2
Batches -99 to 1 took 0.384 sec
On batch 1


In [22]:
# y_train_true = train_data['Y']['output']
y_val_true = val_data['Y']['output'][:600]
print np.sum(y_val_true[:, 0])
print np.sum(y_pred_val[:, 0])

# print "Training set evaluation"
# evaluate_predictions(y_train_true, y_train_pred, ntasks=16)
print "\nValidation set evaluation"
evaluate_predictions(y_val_true, y_pred_val, ntasks=16)

184.0
1.19464308713e-25

Validation set evaluation
Task 0: AUROC = 0.496, AUPRC = 0.307, Spearman = -0.047, p = 0.249
Task 1: AUROC = 0.500, AUPRC = 0.240, Spearman = nan, p = nan
Task 2: AUROC = 0.533, AUPRC = 0.379, Spearman = 0.055, p = 0.182
Task 3: AUROC = 0.499, AUPRC = 0.163, Spearman = -0.018, p = 0.659
Task 4: AUROC = 0.466, AUPRC = 0.117, Spearman = -0.067, p = 0.102
Task 5: AUROC = 0.500, AUPRC = 0.055, Spearman = nan, p = nan
Task 6: AUROC = 0.629, AUPRC = 0.200, Spearman = 0.155, p = 0.000
Task 7: AUROC = 0.483, AUPRC = 0.251, Spearman = -0.077, p = 0.058
Task 8: AUROC = 0.438, AUPRC = 0.141, Spearman = -0.102, p = 0.012
Task 9: AUROC = 0.510, AUPRC = 0.258, Spearman = 0.021, p = 0.606
Task 10: AUROC = 0.579, AUPRC = 0.406, Spearman = 0.126, p = 0.002
Task 11: AUROC = 0.500, AUPRC = 0.303, Spearman = nan, p = nan
Task 12: AUROC = 0.484, AUPRC = 0.087, Spearman = -0.053, p = 0.198
Task 13: AUROC = 0.596, AUPRC = 0.190, Spearman = 0.116, p = 0.005
Task 14: AUROC = 0.500, AUP

In [16]:
# On cpu, ~30s for 15 sample batch. On gpu, 0.5 sec.

import time

t0 = time.time()
y = model.predict_on_batch(X_train[i*batch_size : (i+1)*batch_size])
print(time.time() - t0)

0.509535074234


For some reason, the parameter initialization did not seem to work. The model just kept outputting full zeros for all the inputs, which was unfortunate. Not sure why - I thought it had something to do with the batch norm, but it looks like that's not it. So, I'm just going to create 145bp input sequences and train the MPRA model directly on the accessibility data (as opposed to training on the outputs of the accessibility model)

# Convert train/val/test HDF5 files to 145bp inputs 

Each example was previously 1000bp; I'm going to take the $145*5 = 725$ central base pairs in each sequence, and use those as 5 separate training examples. Performance will probably be lower since accessibility probably does require more context than just 145bp.

In [25]:
import h5py
import numpy as np

fnames = ['valid_data.hdf5', 'train_data.hdf5', 'test_data.hdf5']
fnames = ['../hdf5files/atac_xferlearn_jul24/pretrain_' + name for name in fnames]
windows_per_seq = 5
seqlen = 145
start_indices = np.arange(500 - seqlen*windows_per_seq/2, 500 + seqlen*windows_per_seq/2, seqlen)
print start_indices
for fname in fnames:
    print "On file %s" % fname
    f = h5py.File(fname, 'r+')
    
    # Sequences
    sequences = np.array(f['X/sequence'])
    print sequences.shape
    new_sequences = np.ndarray(shape = (windows_per_seq*len(sequences), seqlen, 4))
    for i in range(windows_per_seq):
        new_sequences[np.arange(i, len(new_sequences), windows_per_seq)] = sequences[:, start_indices[i] : start_indices[i] + 145]
    print new_sequences.shape
    del f['X/sequence']
    f.create_dataset('X/sequence', data = new_sequences)
    
    # Labels
    labels = np.array(f['Y/output'])
    print labels.shape
    new_labels = np.repeat(labels, windows_per_seq, axis=0)
    print new_labels.shape
    del f['Y/output']
    f.create_dataset('Y/output', data = new_labels)
    
    f.close()
    

[138 283 428 573 718]
On file ../hdf5files/atac_xferlearn_jul24/pretrain_valid_data.hdf5
(41635, 1000, 4)
(208175, 145, 4)
(41635, 16)
(208175, 16)
On file ../hdf5files/atac_xferlearn_jul24/pretrain_train_data.hdf5
(837977, 1000, 4)
(4189885, 145, 4)
(837977, 16)
(4189885, 16)
On file ../hdf5files/atac_xferlearn_jul24/pretrain_test_data.hdf5
(154967, 1000, 4)
(774835, 145, 4)
(154967, 16)
(774835, 16)


In [29]:
f = h5py.File('../hdf5files/atac_xferlearn_jul24/pretrain_valid_data.hdf5')
print f['X/sequence'].shape
print f['Y/output'].shape
print f['Y/output'][5:10]

(208175, 145, 4)
(208175, 16)
[[ 1.  1.  1.  1.  0.  0.  1.  0.  1.  1.  1.  1.  0.  0.  1.  1.]
 [ 1.  1.  1.  1.  0.  0.  1.  0.  1.  1.  1.  1.  0.  0.  1.  1.]
 [ 1.  1.  1.  1.  0.  0.  1.  0.  1.  1.  1.  1.  0.  0.  1.  1.]
 [ 1.  1.  1.  1.  0.  0.  1.  0.  1.  1.  1.  1.  0.  0.  1.  1.]
 [ 1.  1.  1.  1.  0.  0.  1.  0.  1.  1.  1.  1.  0.  0.  1.  1.]]


# Evaluate predictions of ensembled MPRA models

In [31]:
from keras.models import model_from_json
import json

# Load architecture from any one of the many JSON files
json_path = "../model_files/sharpr_znormed_jul23/record_13_model_bgGhy_modelJson.json"
with open(json_path) as json_file:
    json_string = json.dumps(json.load(json_file))
    model = model_from_json(json_string) 
    
# print(model.summary())

In [None]:
weights_files = ["../model_files/sharpr_znormed_jul23//record_13_model_bgGhy_modelWeights.h5", # 0.191
                 "../model_files/sharpr_znormed_jul23//record_14_model_10Sx5_modelWeights.h5", # 0.186
                 
                ]