In [1]:
# Load IPython's autoreload extension; mode 2 reloads all modules before each
# cell executes, so edits to imported project code are picked up automatically
%load_ext autoreload
%autoreload 2

In [49]:
import sys
import os
sys.path.append(os.path.abspath("../src/"))
import feature.util as feature_util
import h5py
import numpy as np
import tqdm
from collections import OrderedDict
import modisco

### Define constants/paths and import SHAP scores

In [29]:
# Input files: HDF5 of importance scores and the reference genome FASTA
shap_score_hdf5_path = "/users/amtseng/tfmodisco/motifs/SPI1/importance_scores.h5"
reference_fasta = "/users/amtseng/genomes/hg38.fasta"

# Length of sequence used in explanation
padded_size = 400

In [7]:
# Open the importance-score file read-only; the handle is reused by later cells
score_reader = h5py.File(shap_score_hdf5_path, mode="r")

### Create sequence one-hot encoder

In [34]:
# Callable that turns coordinates into one-hot encoded sequence, built from the
# reference FASTA with `padded_size` as the center size to use
coords_to_seq = feature_util.CoordsToSeq(
    reference_fasta,
    center_size_to_use=padded_size,
)

### Read in hypothetical importances and compute actual importances
Prepare them for passing to TF-MoDISco

In [30]:
# Get shapes; we'll need to cut off inputs outside the central `padded_size`.
# The "prof_scores" dataset is unpacked as (num_seqs, input_length, <last axis>).
num_seqs, input_length, _ = score_reader["prof_scores"].shape

# Fail fast with a clear message: if the requested window were longer than the
# stored input, `center_start` would be negative and slicing with it would
# select the wrong region instead of the centered window.
assert padded_size <= input_length, (
    "padded_size (%d) exceeds the stored input length (%d)"
    % (padded_size, input_length)
)

# Bounds [center_start, center_end) of the centered window of length `padded_size`
center_start = (input_length // 2) - (padded_size // 2)
center_end = center_start + padded_size

In [32]:
# Load the hypothetical scores in batches, keeping only the central
# `padded_size` positions of every sequence
hyp_scores = np.empty((num_seqs, padded_size, 4), dtype=np.float64)
batch_size = 1000
num_batches = (num_seqs + batch_size - 1) // batch_size  # Ceiling division
for batch_index in tqdm.tqdm_notebook(range(num_batches)):
    # The final batch may extend past `num_seqs`; slicing clips it
    batch_rows = slice(batch_index * batch_size, (batch_index + 1) * batch_size)
    hyp_scores[batch_rows] = score_reader["prof_scores"][
        batch_rows, center_start:center_end, :
    ]

HBox(children=(IntProgress(value=0, max=104), HTML(value='')))




In [35]:
# Assemble an object array of coordinates with columns (chrom, start, end),
# then one-hot encode the corresponding sequences batch by batch
coords = np.empty((num_seqs, 3), dtype=object)
coords[:, 0], coords[:, 1], coords[:, 2] = (
    score_reader["coords_chrom"][:].astype(str),
    score_reader["coords_start"][:],
    score_reader["coords_end"][:],
)

input_seqs = np.empty((num_seqs, padded_size, 4), dtype=np.float64)

batch_size = 1000
num_batches = (num_seqs + batch_size - 1) // batch_size  # Ceiling division
for batch_index in tqdm.tqdm_notebook(range(num_batches)):
    # The final batch may extend past `num_seqs`; slicing clips it
    batch_rows = slice(batch_index * batch_size, (batch_index + 1) * batch_size)
    input_seqs[batch_rows] = coords_to_seq(coords[batch_rows])

HBox(children=(IntProgress(value=0, max=104), HTML(value='')))




In [36]:
# Actual scores are the element-wise product of the hypothetical scores and
# the one-hot input sequences, so both arrays must have the same shape
assert input_seqs.shape == hyp_scores.shape
act_scores = np.multiply(hyp_scores, input_seqs)

In [39]:
# Wrap each score array in an OrderedDict keyed by the single task name
task_to_hyp_scores = OrderedDict([("task0", hyp_scores)])
task_to_act_scores = OrderedDict([("task0", act_scores)])

### Run TF-MoDISco

In [46]:
# Configure the seqlets-to-patterns factory first so the workflow constructor
# call stays short
patterns_factory = modisco.tfmodisco_workflow.seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory(
    trim_to_window_size=15,
    initial_flank_to_add=5,
    kmer_len=5,
    num_gaps=1,
    num_mismatches=0,
    final_min_cluster_size=60,
)

# Assemble the TF-MoDISco workflow object that is called on the scores below
tfm_workflow = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
    sliding_window_size=15,
    flank_size=5,
    target_seqlet_fdr=0.15,
    seqlets_to_patterns_factory=patterns_factory,
)

In [50]:
# Run the configured workflow on the single task, passing the one-hot
# sequences together with the actual and hypothetical scores
tfm_results = tfm_workflow(
    one_hot=input_seqs,
    hypothetical_contribs=task_to_hyp_scores,
    contrib_scores=task_to_act_scores,
    task_names=["task0"],
)

MEMORY 5.70937344
On task task0
Computing windowed sums on original
Generating null dist
peak(mu)= -0.0012864335230819471
Computing threshold
Thresholds from null dist were -0.2015567604190437  and  0.2727465066418522
Final raw thresholds are -0.2015567604190437  and  0.2727465066418522
Final transformed thresholds are -0.8454202795469766  and  0.8805889319483612


<Figure size 640x480 with 1 Axes>

Got 435068 coords
After resolving overlaps, got 435068 seqlets
Across all tasks, the weakest transformed threshold used was: 0.8454201795469767
MEMORY 8.48314368
435068 identified in total
min_metacluster_size_frac * len(seqlets) = 4350 is more than min_metacluster_size=100.
Using it as a new min_metacluster_size
2 activity patterns with support >= 4350 out of 3 possible patterns
Metacluster sizes:  [313529, 121539]
Idx to activities:  {0: '1', 1: '-1'}
MEMORY 8.487780352
On metacluster 1
Metacluster size 121539 limited to 20000
Relevant tasks:  ('task0',)
Relevant signs:  (-1,)

TfModiscoSeqletsToPatternsFactory: seed=1234
(Round 1) num seqlets: 20000
(Round 1) Computing coarse affmat
MEMORY 8.490078208
Beginning embedding computation
Computing embeddings





Using TensorFlow backend.



Finished embedding computation in 11.34 s
Starting affinity matrix computations
Normalization computed in 4.74 s
Cosine similarity mat computed in 19.92 s
Normalization computed in 3.99 s
Cosine similarity mat computed in 19.07 s
Finished affinity matrix computations in 64.38 s
(Round 1) Compute nearest neighbors from coarse affmat
MEMORY 11.672420352
Computed nearest neighbors in 40.92 s
MEMORY 13.351247872
(Round 1) Computing affinity matrix on nearest neighbors
MEMORY 13.351247872
Launching nearest neighbors affmat calculation job
MEMORY 13.433675776
Parallel runs completed
MEMORY 13.632663552
Job completed in: 242.98 s
MEMORY 16.809373696
Launching nearest neighbors affmat calculation job
MEMORY 16.80769024
Parallel runs completed
MEMORY 16.90116096
Job completed in: 309.63 s
MEMORY 20.077871104
(Round 1) Computed affinity matrix on nearest neighbors in 624.74 s
MEMORY 16.839958528


KeyboardInterrupt: 