### Direct links to results
[Motifs](#motifs)

In [None]:
import h5py
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import deeplift.visualization.viz_sequence as viz_sequence
import pandas as pd
import json
import os
import pyfaidx
import tqdm
# Explicitly load the notebook submodule so that `tqdm.notebook.trange` and
# `tqdm.notebook.tqdm` (used in the cells below) resolve. Calling the
# deprecated `tqdm.tqdm_notebook()` only achieved this as a side effect, and
# also instantiated an unused progress bar.
import tqdm.notebook

In [None]:
# Plotting defaults
# Register the fonts found in the custom font directory so that the "Roboto"
# family requested below can be resolved by name.
_custom_font_paths = font_manager.findSystemFonts(
    fontpaths="/users/amtseng/modules/fonts"
)
if hasattr(font_manager.fontManager, "addfont"):
    # `createFontList` was deprecated in Matplotlib 3.2 and later removed;
    # `FontManager.addfont` is its replacement
    for _font_path in _custom_font_paths:
        try:
            font_manager.fontManager.addfont(_font_path)
        except (KeyError, NotImplementedError, OSError, RuntimeError, ValueError):
            # Keep the registration best-effort: skip font files that cannot
            # be read or parsed instead of aborting the notebook
            continue
else:
    # Older Matplotlib versions without `addfont`
    font_manager.fontManager.ttflist.extend(
        font_manager.createFontList(_custom_font_paths)
    )

# Font sizes and family applied to every figure created by this notebook
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "Roboto",
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)

### Define constants and paths

In [None]:
# Define parameters/fetch arguments
# The path to the filter-activations HDF5 file is taken from the environment;
# a KeyError is raised here if the variable is not set.
filter_activations_path = os.environ["TFM_RESULTS_FILTER_ACTIVATIONS"]
    
print("Path to filter activations: %s" % filter_activations_path)

In [None]:
# Constants/paths
# Length (in bases) of each input sequence; used as the sequence dimension of
# the one-hot arrays built in `extract_motifs`
input_length = 2114
# Width (in bases) of each filter, i.e. of each window it scans; the stored
# activations must have `input_length - filter_width + 1` windows per sequence
filter_width = 21
# FASTA file from which the input sequences are fetched
reference_genome_path = "/users/amtseng/genomes/hg38.fasta"

### Helper functions
For extracting motifs, computing filter influence, and computing information content

In [None]:
def dna_to_one_hot(seqs):
    """
    Converts a list of DNA ("ACGT") sequences to one-hot encodings, where the
    position of 1s is ordered alphabetically by "ACGT". `seqs` must be a list
    of N strings, where every string is the same length L. Returns an N x L x 4
    NumPy array of one-hot encodings, in the same order as the input sequences.
    All bases will be converted to upper-case prior to performing the encoding.
    Any bases that are not "ACGT" will be given an encoding of all 0s.
    An empty list of sequences yields an empty array of shape 0 x 0 x 4.
    Raises an AssertionError if the sequences are not all the same length.
    """
    if len(seqs) == 0:
        return np.zeros((0, 0, 4))

    seq_len = len(seqs[0])
    assert np.all(np.array([len(s) for s in seqs]) == seq_len)

    # Join all sequences together into one long string, all uppercase
    seq_concat = "".join(seqs).upper()

    # Rows 0-3 are the one-hot vectors for A, C, G, T; row 4 is all 0s and is
    # used for every other character
    one_hot_map = np.identity(5)[:, :-1]

    # Convert string into array of character codes
    base_vals = np.frombuffer(seq_concat.encode("utf8"), dtype=np.uint8)

    # Fixed lookup table from character code to row of `one_hot_map`. The
    # mapping must not depend on which bases happen to occur in the input
    # (ranking the observed codes, e.g. with `np.unique`, would shift the
    # indices whenever one of A/C/G/T is absent from the sequences).
    code_to_index = np.full(256, 4, dtype=np.intp)
    for index, base in enumerate("ACGT"):
        code_to_index[ord(base)] = index
    base_inds = code_to_index[base_vals]

    # Get the one-hot encoding for those indices, and reshape back to separate
    return one_hot_map[base_inds].reshape((len(seqs), seq_len, 4))

In [None]:
def extract_motifs(filter_activations_path, reference_genome_path):
    """
    Extracts the motifs that correspond to each filter. Returns an
    F x W x 4 array, where F is the number of filters and W is the width
    of each filter. The order of filters matches those in the saved HDF5/model.

    Arguments:
        `filter_activations_path`: path to an HDF5 file containing an
            "activations" dataset of shape N x 2 x P x F (coordinates,
            strands, window positions, filters) and a "coords" group with
            "coords_chrom", "coords_start", and "coords_end"
        `reference_genome_path`: path to the FASTA file from which the
            sequence of each coordinate is fetched

    Relies on the notebook-level constants `input_length` and `filter_width`.
    A filter's motif is the average of the one-hot windows whose activation is
    at least half of that filter's maximum activation. Windows on the second
    strand are taken from the mirrored position and reverse complemented.
    """
    # Both files are opened as context managers so that they are closed when
    # the function finishes, even if an error occurs part-way through
    with h5py.File(filter_activations_path, "r") as reader:
        activations_reader = reader["activations"]
        num_coords, two, num_windows, num_filters = activations_reader.shape

        assert two == 2
        assert num_windows == input_length - filter_width + 1

        print("Importing coordinates...")
        coords = np.empty((num_coords, 3), dtype=object)
        coords[:, 0] = reader["coords"]["coords_chrom"][:].astype(str)
        coords[:, 1] = reader["coords"]["coords_start"][:]
        coords[:, 2] = reader["coords"]["coords_end"][:]

        print("Fetching one-hot sequences...")
        one_hot_seqs = np.empty((num_coords, input_length, 4))
        batch_size = 128
        num_batches = int(np.ceil(num_coords / batch_size))
        with pyfaidx.Fasta(reference_genome_path) as genome_reader:
            for i in tqdm.notebook.trange(num_batches):
                batch_slice = slice(i * batch_size, (i + 1) * batch_size)
                one_hot_seqs[batch_slice] = dna_to_one_hot([
                    genome_reader[chrom][start:end].seq
                    for chrom, start, end in coords[batch_slice]
                ])

        pfms = np.empty((num_filters, filter_width, 4))
        for filter_index in range(num_filters):
            print("Extracting motif for filter %d..." % filter_index)

            print("\tComputing maximum activation...")
            # Activations of this one filter: N x 2 x P
            acts = activations_reader[:, :, :, filter_index]
            max_act = np.max(acts)

            # Indices (coordinate, strand, position) of all windows activating
            # the filter to at least half of its maximum
            inds = np.where(acts >= 0.5 * max_act)

            # Running sum of the selected windows; kept separate from
            # `num_windows` (the number of window positions per sequence)
            window_sum = np.zeros((filter_width, 4))
            window_count = len(inds[0])
            for coord_index, strand_index, pos_index in tqdm.notebook.tqdm(
                zip(*inds), total=window_count, desc="Extracting windows..."
            ):
                if strand_index == 0:
                    window = one_hot_seqs[
                        coord_index, pos_index : pos_index + filter_width
                    ]
                else:
                    # Reverse complement; the positions are flipped. Flipping
                    # the base axis swaps A<->T and C<->G because the one-hot
                    # channels are ordered "ACGT"
                    window = np.flip(
                        one_hot_seqs[
                            coord_index,
                            input_length - filter_width - pos_index : input_length - pos_index
                        ],
                        axis=(0, 1)
                    )
                window_sum += window

            pfms[filter_index] = window_sum / window_count

    return pfms

In [None]:
def compute_filter_influence(filter_activations_path):
    """
    Extracts the influence of each filter by computing the difference
    in normalized NLL when each filter is nullified.
    Returns an F-array, where F is the number of filters, containing the
    change in average normalized NLL (after nullification - before
    nullification). The order of filters matches those in the saved
    HDF5/model.

    Arguments:
        `filter_activations_path`: path to an HDF5 file containing
            "predictions/norm_nlls" (before nullification) and
            "nullified_predictions/norm_nlls" (after nullification, with
            filters along the second axis)

    NaN entries are ignored when averaging.
    """
    # Read both datasets fully into memory, then close the file right away
    with h5py.File(filter_activations_path, "r") as reader:
        print("Reading in normalized NLLs...")
        before_null_norm_nlls = reader["predictions"]["norm_nlls"][:]
        after_null_norm_nlls = reader["nullified_predictions"]["norm_nlls"][:]

    # Single baseline shared by all filters
    before_null = np.nanmean(before_null_norm_nlls)

    num_filters = after_null_norm_nlls.shape[1]

    influences = []
    for filter_index in tqdm.notebook.trange(num_filters):
        after_null = np.nanmean(after_null_norm_nlls[:, filter_index])
        influences.append(after_null - before_null)

    return np.array(influences)

In [None]:
# Background frequency of each base, against which information content is
# measured
background_freqs = np.array([0.25, 0.25, 0.25, 0.25])


def info_content(track, pseudocount=0.001):
    """
    Computes the information content at each position of an L x 4 track,
    relative to `background_freqs`. Each position is first normalized to
    probabilities over the bases, with `pseudocount` added to every base.
    Returns an L-array.
    """
    base_axis = 1
    num_bases = track.shape[base_axis]

    # Smoothed per-position probabilities over the bases
    position_totals = np.sum(track, axis=base_axis, keepdims=True)
    smoothed_totals = position_totals + (num_bases * pseudocount)
    probs = (track + pseudocount) / smoothed_totals

    # Per-base contribution: p * log2(p / background)
    background = np.expand_dims(background_freqs, axis=0)
    contributions = probs * np.log2(probs / background)

    return np.sum(contributions, axis=base_axis)

### Extract motifs
Extract the motifs derived from each filter, ranked by filter influence.

Deriving a filter's motif: 
1. Identify the top 10000 most well-predicted input sequences, ranked by normalized NLL
2. For each window in each of these sequences, compute the filter activation for each 1st-layer filter
3. A filter's motif is the aggregation of sequence windows which activate that filter to at least half its maximum activation (over the top 10000 most well-predicted inputs)

Deriving a filter's influence:
1. Identify the top 10000 most well-predicted input sequences, ranked by normalized NLL
2. Nullify each filter by setting it to the average activation over these 10000 most well-predicted inputs
3. A filter's influence is the change in average normalized NLL caused by nullification (average after nullification minus average before nullification)

In [None]:
# F x W x 4 array holding one motif per filter, in the filter order of the
# saved HDF5
filter_pfms = extract_motifs(filter_activations_path, reference_genome_path)

In [None]:
# F-array of the change in average normalized NLL when each filter is
# nullified, in the same filter order as `filter_pfms`
filter_influences = compute_filter_influence(filter_activations_path)

<a id="motifs"></a>
### Show motifs

In [None]:
# Show the filters from largest to smallest influence
ranked_filter_indices = np.argsort(filter_influences)[::-1]
for rank, filter_index in enumerate(ranked_filter_indices):
    influence = filter_influences[filter_index]
    print("%d) Filter %d (influence = %.3f)" % (rank, filter_index, influence))

    pfm = filter_pfms[filter_index]
    # Scale the base frequencies at each position by that position's
    # information content before plotting
    position_ic = info_content(pfm)
    pwm = pfm * position_ic[:, np.newaxis]
    viz_sequence.plot_weights(pwm)