In [None]:
import sys
import os
sys.path.append(os.path.abspath("../src/"))
import model.util as model_util
import model.profile_models as profile_models
import model.train_profile_model as train_profile_model
import model.profile_performance as profile_performance
import feature.util as feature_util
import feature.make_profile_dataset as make_profile_dataset
import keras
import tensorflow as tf
import numpy as np
import pandas as pd
import scipy.special
import sklearn
import matplotlib.pyplot as plt
import json
import tqdm
from modisco.visualization import viz_sequence
# Load the notebook-aware tqdm submodule explicitly. Later cells call
# `tqdm.notebook.trange`, which needs this submodule to be imported; doing so
# directly avoids the deprecated `tqdm.tqdm_notebook` call and the throwaway
# one-step progress bar it rendered.
import tqdm.notebook

### Define paths for the model and data of interest

In [None]:
# Locations of the training-path spec (JSON) and the trained model checkpoint
files_spec_path = "/users/amtseng/tfmodisco/data/processed/ENCODE/config/TEAD4/TEAD4_training_paths.json"
model_path = "/users/amtseng/tfmodisco/models/trained_models/TEAD4_fold7/3/model_ckpt_epoch_10.h5"

# Genome reference sequence and chromosome sizes
chrom_sizes = "/users/amtseng/genomes/hg38.canon.chrom.sizes"
reference_fasta = "/users/amtseng/genomes/hg38.fasta"

# Constants: input sequence length, output profile length, number of tasks
input_length, profile_length = 1346, 1000
num_tasks = 1

In [None]:
# Read the chromosome split definitions and select the split keyed "1"
splits_json_path = "/users/amtseng/tfmodisco/data/processed/ENCODE/chrom_splits.json"
with open(splits_json_path, "r") as f:
    splits_json = json.load(f)
split_1 = splits_json["1"]
train_chroms = split_1["train"]
val_chroms = split_1["val"]
test_chroms = split_1["test"]
# Concatenation of the train, validation, and test partitions
all_chroms = train_chroms + val_chroms + test_chroms

In [None]:
# Load the JSON file spec and pull out its peak BED and profile HDF5 entries
with open(files_spec_path, "r") as f:
    files_spec = json.load(f)
peak_beds, profile_hdf5 = files_spec["peak_beds"], files_spec["profile_hdf5"]

In [None]:
# Import the model. The checkpoint references the Keras backend alias and the
# custom profile/count losses by name, so they are passed as `custom_objects`
# for `load_model` to resolve.
custom_objects = {
    "kb": keras.backend,
    "profile_loss": train_profile_model.get_profile_loss_function(num_tasks, profile_length),
    "count_loss": train_profile_model.get_count_loss_function(num_tasks)
}
# Every later cell refers to the loaded network as `model`. Previously only
# `new_model` was bound here, so those cells raised a NameError on a fresh
# kernel (Restart Kernel -> Run All).
model = keras.models.load_model(model_path, custom_objects=custom_objects)
new_model = model  # Alias kept so code using the old name still works

### Data preparation
Use classes from `make_profile_dataset` to prepare positive and negative inputs.

In [None]:
# Number of coordinates requested per batch
batch_size = 128

# Build the data loader over all chromosomes; coordinates are returned
# alongside each batch (`return_coords=True`)
loader = make_profile_dataset.create_data_loader(
    peak_beds,
    profile_hdf5,
    "SummitCenteringCoordsBatcher",
    batch_size,
    reference_fasta,
    chrom_sizes,
    input_length,
    profile_length,
    # The remaining positional options are passed through unchanged; see the
    # signature of `create_data_loader` for their meaning
    # NOTE(review): `True` is presumably the reverse-complement flag — confirm
    1,
    None,
    True,
    0,
    None,
    chrom_set=all_chroms,
    return_coords=True,
)

### Get first layer filter activations

In [None]:
# Pull the weight arrays out of the "dil_conv_1" layer of the loaded model
filters = model.get_layer("dil_conv_1").get_weights()
# Axis 0 of the kernel is taken as the filter width, axis 2 as the filter count
kernel_shape = filters[0].shape
filter_size = kernel_shape[0]
num_filters = kernel_shape[2]
# Number of window positions when a filter of this width slides over the input
# without padding
num_windows = input_length - filter_size + 1

In [None]:
# Build a one-layer model: the input sequence goes straight into a convolution
# configured with the same filter count and width as the existing first layer
filter_model_input = keras.layers.Input(shape=(input_length, 4), name="input_seq")
filter_model_conv = keras.layers.Conv1D(
    filters=num_filters,
    kernel_size=filter_size,
    padding="valid",
    activation="relu",
    dilation_rate=1,
    name="dil_conv_1",
)
# Apply the convolution to the input tensor, then wrap input/output as a model
filter_model_output = filter_model_conv(filter_model_input)
filter_model = keras.Model(inputs=filter_model_input, outputs=filter_model_output)

In [None]:
# Copy the weights extracted from the imported model into the new model's
# "dil_conv_1" layer
first_conv = filter_model.get_layer("dil_conv_1")
first_conv.set_weights(filters)

In [None]:
# Worker count and queue size handed to the enqueuer
workers = 10
queue_size = 20

# Wrap the loader in a multiprocessing enqueuer, start it, and get the batch
# generator from it
enq = keras.utils.OrderedEnqueuer(loader, use_multiprocessing=True)
enq.start(workers, queue_size)
para_batch_gen = enq.get()

In [None]:
# Size the storage for the expected number of examples: every batch is assumed
# to be full, doubled for reverse complements
num_batches = len(enq.sequence)
num_per_batch = 2 * batch_size  # With revcomp
num_samples_exp = num_per_batch * num_batches

# Running count of examples actually written so far
num_samples_seen = 0

# Preallocated (uninitialized) buffers, filled batch by batch
all_coords = np.empty(shape=(num_samples_exp, 3), dtype=object)
all_input_seqs = np.empty(shape=(num_samples_exp, input_length, 4))
all_activations = np.empty(shape=(num_samples_exp, num_windows, num_filters))

In [None]:
# Run every batch through the single-layer filter model, storing the
# coordinates, input sequences, and per-window filter activations
for batch_num in tqdm.notebook.trange(len(enq.sequence)):
    input_seqs, profiles, statuses, coords, peaks = next(para_batch_gen)
    activations = filter_model.predict_on_batch(input_seqs)

    # Rows [start, end) of the preallocated arrays belong to this batch
    start = num_samples_seen
    end = start + input_seqs.shape[0]

    all_coords[start:end] = coords
    all_input_seqs[start:end] = input_seqs
    all_activations[start:end] = activations

    num_samples_seen = end

In [None]:
# Keep only the rows that were actually filled in; the preallocated tail
# beyond `num_samples_seen` was never written
seen = slice(0, num_samples_seen)
all_coords = all_coords[seen]
all_input_seqs = all_input_seqs[seen]
all_activations = all_activations[seen]

In [None]:
# Shut down the enqueuer started for the activation pass, now that one batch
# has been drawn for every entry in `enq.sequence`
enq.stop()

### Get output predictions after nullifying each filter

In [None]:
# For each of the first layer filters, nullify so that the output is always
# just the average activation, then rerun the data to get output predictions.
# NOTE: these buffers are allocated (uninitialized) for `num_samples_exp` rows
# and grow with num_samples_exp * num_filters * profile_length; only the first
# `num_samples_seen` rows are written by the loop below.
all_log_pred_profs = np.empty((num_samples_exp, num_filters, num_tasks, profile_length, 2))
all_log_pred_counts = np.empty((num_samples_exp, num_filters, num_tasks, 2))

first_conv_layer = model.get_layer("dil_conv_1")
# Original weights; each iteration edits a copy, never these arrays
filter_weights = first_conv_layer.get_weights()
workers, queue_size = 10, 20
for filter_index in range(num_filters):
    # Nullify the filter
    filter_weights_copy = [x.copy() for x in filter_weights]
    filter_weights_copy[0][:, :, filter_index] = 0  # Weights to 0
    filter_weights_copy[1][filter_index] = np.mean(all_activations[:, :, filter_index])  # Bias to average

    # Set the weights to nullify the filter
    first_conv_layer.set_weights(filter_weights_copy)

    # A fresh enqueuer is started for every filter
    enq = keras.utils.OrderedEnqueuer(loader, use_multiprocessing=True)
    enq.start(workers, queue_size)
    try:
        para_batch_gen = enq.get()
        num_samples_seen = 0
        for i in tqdm.notebook.trange(len(enq.sequence)):
            input_seqs, profiles, statuses, coords, peaks = next(para_batch_gen)

            start, end = num_samples_seen, num_samples_seen + input_seqs.shape[0]

            # NOTE(review): the second model input is the first `num_tasks`
            # profile tracks — confirm these are the tracks the model expects
            log_pred_profs, log_pred_counts = model.predict_on_batch([input_seqs, profiles[:,:num_tasks,:,:]])

            all_log_pred_profs[start:end, filter_index, :, :, :] = log_pred_profs
            all_log_pred_counts[start:end, filter_index, :, :] = log_pred_counts

            num_samples_seen += input_seqs.shape[0]
    finally:
        # Always stop this filter's enqueuer so its workers are not leaked
        # (previously a new enqueuer was started per filter and never stopped),
        # and put the original weights back so the model is never left with a
        # nullified filter, even if a batch fails
        enq.stop()
        first_conv_layer.set_weights(filter_weights)