Merge pull request #78 from kundajelab/variablelenseqletidentification
Lower Mem AGKM + Variable Len Seqlet ID + Exemplar-Based Hit Scoring
AvantiShri committed Jan 20, 2021
2 parents 17cbafe + e668030 commit 150f4a4
Showing 16 changed files with 4,400 additions and 276 deletions.
2,843 changes: 2,843 additions & 0 deletions examples/simulated_TAL_GATA_deeplearning/TF_MoDISco_TAL_GATA_hitscoring.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion modisco.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: modisco
Version: 0.5.9.2
Version: 0.5.10.0
Summary: TF MOtif Discovery from Importance SCOres
Home-page: https://github.com/kundajelab/tfmodisco
License: UNKNOWN
1 change: 1 addition & 0 deletions modisco.egg-info/SOURCES.txt
@@ -38,6 +38,7 @@ modisco/cluster/phenograph/louvain/osx-hierarchy
modisco/clusterinit/__init__.py
modisco/clusterinit/memeinit.py
modisco/hit_scoring/__init__.py
modisco/hit_scoring/exemplar_based_hitscoring.py
modisco/hit_scoring/fast_hit_scoring.py
modisco/seqlet_embedding/__init__.py
modisco/seqlet_embedding/advanced_gapped_kmer.py
2 changes: 1 addition & 1 deletion modisco/affinitymat/core.py
@@ -1,5 +1,4 @@
from __future__ import division, print_function, absolute_import
from .. import backend as B
import numpy as np
from .. import util as modiscoutil
from .. import core as modiscocore
@@ -629,6 +628,7 @@ def __init__(self, batch_size=50, func_params_size=1000000,
self.progress_update = progress_update

def __call__(self, filters, things_to_scan, min_overlap):
from .. import backend as B
return B.max_cross_corrs(
filters=filters,
things_to_scan=things_to_scan,
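Note on the change above: the module-level `from .. import backend as B` is dropped and the backend is imported inside `__call__` instead, so importing `modisco.affinitymat.core` no longer loads the compute backend eagerly (the same TensorFlow-avoidance motive as the `#don't want to auto-import tensorflow` comment further down). A minimal, self-contained sketch of this deferred-import pattern follows; the class name and the stand-in module are illustrative, not the repo's code.

# Deferred-import sketch: the heavy dependency is imported only when the
# object is actually called, not when the defining module is loaded.
class LazyBackendCaller:
    def __call__(self, x):
        import math  # stand-in for `from .. import backend as B`
        return math.sqrt(x)

print(LazyBackendCaller()(9.0))  # 3.0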
812 changes: 602 additions & 210 deletions modisco/coordproducers.py

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions modisco/core.py
@@ -132,6 +132,11 @@ def get_example_idx_len(self, example_idx):
list(self.track_name_to_data_track.keys())[0]]
.fwd_tracks[example_idx])

@property
def num_examples(self):
return len(self.track_name_to_data_track[
list(self.track_name_to_data_track.keys())[0]].fwd_tracks)

def add_track(self, data_track):
assert type(data_track).__name__=="DataTrack"
if len(self.track_name_to_data_track)==0:
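For context, the new `num_examples` property reports the number of examples by taking `len(fwd_tracks)` from whichever data track comes first in `track_name_to_data_track`. A small runnable sketch of the same pattern follows; the class names (`SimpleDataTrack`, `SimpleTrackSet`) are hypothetical stand-ins, not the repo's classes.

class SimpleDataTrack:
    def __init__(self, fwd_tracks):
        self.fwd_tracks = fwd_tracks

class SimpleTrackSet:
    def __init__(self, track_name_to_data_track):
        self.track_name_to_data_track = track_name_to_data_track

    @property
    def num_examples(self):
        # Mirrors the property added above: length of fwd_tracks on the
        # first data track in the mapping.
        first_name = list(self.track_name_to_data_track.keys())[0]
        return len(self.track_name_to_data_track[first_name].fwd_tracks)

tracks = SimpleTrackSet(
    {"contrib_scores": SimpleDataTrack(fwd_tracks=[[0.1], [0.2], [0.3]])})
print(tracks.num_examples)  # 3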
592 changes: 592 additions & 0 deletions modisco/hit_scoring/exemplar_based_hitscoring.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion modisco/seqlet_embedding/__init__.py
@@ -1,3 +1,3 @@
from . import gapped_kmer
#from . import gapped_kmer #don't want to auto-import tensorflow
from . import advanced_gapped_kmer
from . import onehot_model_output
101 changes: 82 additions & 19 deletions modisco/seqlet_embedding/advanced_gapped_kmer.py
@@ -1,6 +1,5 @@
from __future__ import division, print_function, absolute_import
from ..affinitymat.core import MeanNormalizer
from .. import backend as B
from .core import (AbstractSeqletsToOnedEmbedder,
AbstractSeqletsToOnedEmbedderFactory)
from .. import core as modiscocore
@@ -95,7 +94,8 @@ def unravel_fast_recursively_get_gappedkmersandimp(posbaseimptuples, **kwargs):


def prepare_gapped_kmer_from_contribs(contrib_scores, topn, min_k,
max_k, max_gap, max_len):
max_k, max_gap, max_len,
max_entries):
#get the top n positions
per_pos_imp = np.sum(contrib_scores, axis=-1)
per_pos_bases = np.argmax(contrib_scores, axis=-1)
@@ -122,12 +122,15 @@ def prepare_gapped_kmer_from_contribs(contrib_scores, topn, min_k,
if (len(gapped_kmer_rep) >= min_k):
gapped_kmer_to_totalseqimp[gapped_kmer_rep] = (
gapped_kmer_to_totalseqimp.get(gapped_kmer_rep, 0)
+ gapped_kmer_imp)
return gapped_kmer_to_totalseqimp
+ gapped_kmer_imp/len(gapped_kmer_rep)
)
#only retain the top max_entries entries
return sorted(gapped_kmer_to_totalseqimp.items(),
key=lambda x: -abs(x[1]))[:max_entries]


def prepare_gapped_kmer_from_seqlet(seqlet, topn, min_k,
max_k, max_gap, max_len,
max_k, max_gap, max_len, max_entries,
take_fwd,
onehot_track_name,
toscore_track_names_and_signs):
@@ -140,19 +143,40 @@ def prepare_gapped_kmer_from_seqlet(seqlet, topn, min_k,
for track_name, sign in toscore_track_names_and_signs ], axis=0)
return prepare_gapped_kmer_from_contribs(
contrib_scores=contrib_scores, topn=topn, min_k=min_k,
max_k=max_k, max_gap=max_gap, max_len=max_len)
max_k=max_k, max_gap=max_gap, max_len=max_len,
max_entries=max_entries)


def prepare_gapped_kmer_from_seqlet_and_make_sparse_vec_dat(
seqlet, topn, min_k, max_k, max_gap, max_len, max_entries,
take_fwd, onehot_track_name,
toscore_track_names_and_signs, template_to_startidx):

gapped_kmer_to_totalseqimp = prepare_gapped_kmer_from_seqlet(
seqlet=seqlet, topn=topn, min_k=min_k,
max_k=max_k, max_gap=max_gap, max_len=max_len,
max_entries=max_entries,
take_fwd=take_fwd, onehot_track_name=onehot_track_name,
toscore_track_names_and_signs=toscore_track_names_and_signs)

return gapped_kmer_to_totalseqimp
#return list(gapped_kmer_to_totalseqimp.items())
#return map_agkm_embedding_to_sparsevec(
# gapped_kmer_to_totalseqimp=gapped_kmer_to_totalseqimp,
# template_to_startidx=template_to_startidx)


class AdvancedGappedKmerEmbedderFactory(object):

def __init__(self, topn=20, min_k=4, max_k=6, max_gap=15, max_len=15,
alphabet_size=4, n_jobs=10,
num_filters_to_retain=None):
max_entries=500,
alphabet_size=4, n_jobs=10):
self.topn = topn
self.min_k = min_k
self.max_k = max_k
self.max_gap = max_gap
self.max_len = max_len
self.max_entries = max_entries
self.alphabet_size = alphabet_size
self.n_jobs = n_jobs

@@ -163,6 +187,7 @@ def get_jsonable_config(self):
('max_k', self.max_k),
('max_gap', self.max_gap),
('max_len', self.max_len)
('max_entries', self.max_entries)
('alphabet_size', self.alphabet_size)
('n_jobs', self.n_jobs)
])
@@ -171,6 +196,7 @@ def __call__(self, onehot_track_name, toscore_track_names_and_signs):
return AdvancedGappedKmerEmbedder(
topn=self.topn, min_k=self.min_k, max_k=self.max_k,
max_gap=self.max_gap, max_len=self.max_len,
max_entries=self.max_entries,
alphabet_size=self.alphabet_size,
n_jobs=self.n_jobs,
onehot_track_name=onehot_track_name,
@@ -179,7 +205,7 @@ def __call__(self, onehot_track_name, toscore_track_names_and_signs):

class AdvancedGappedKmerEmbedder(AbstractSeqletsToOnedEmbedder):

def __init__(self, topn, min_k, max_k, max_gap, max_len,
def __init__(self, topn, min_k, max_k, max_gap, max_len, max_entries,
alphabet_size,
n_jobs,
onehot_track_name,
@@ -189,38 +215,75 @@ def __init__(self, topn, min_k, max_k, max_gap, max_len,
self.max_k = max_k
self.max_gap = max_gap
self.max_len = max_len
self.max_entries = max_entries
self.alphabet_size = alphabet_size
self.n_jobs = n_jobs
self.onehot_track_name = onehot_track_name
self.toscore_track_names_and_signs = toscore_track_names_and_signs

def __call__(self, seqlets):

template_to_startidx, embedding_size =\
get_template_to_startidx_and_embedding_size(
max_len=self.max_len, min_k=self.min_k,
max_k=self.max_k, alphabet_size=self.alphabet_size)

#sparse_agkm_embeddings_fwd_dataandcols = (
# Parallel(n_jobs=self.n_jobs, verbose=True)(
# delayed(prepare_gapped_kmer_from_seqlet_and_make_sparse_vec_dat)(
# seqlets[i],
# self.topn, self.min_k,
# self.max_k, self.max_gap,
# self.max_len, True,
# self.onehot_track_name,
# self.toscore_track_names_and_signs,
# template_to_startidx)
# for i in range(len(seqlets)))
#)

#sparse_agkm_embeddings_rev_dataandcols = (
# Parallel(n_jobs=self.n_jobs, verbose=True)(
# delayed(prepare_gapped_kmer_from_seqlet_and_make_sparse_vec_dat)(
# seqlets[i],
# self.topn, self.min_k,
# self.max_k, self.max_gap,
# self.max_len, False, #'False' determines doing rc
# self.onehot_track_name,
# self.toscore_track_names_and_signs,
# template_to_startidx)
# for i in range(len(seqlets)))
#)

advanced_gappedkmer_embeddings_fwd =\
Parallel(n_jobs=self.n_jobs, verbose=True)(
delayed(prepare_gapped_kmer_from_seqlet)(
seqlets[i],
self.topn, self.min_k,
self.max_k, self.max_gap,
self.max_len, True,
self.max_len,
self.max_entries,
True,
self.onehot_track_name,
self.toscore_track_names_and_signs)
for i in range(len(seqlets)))

#from matplotlib import pyplot as plt
#plt.hist([len(x) for x in advanced_gappedkmer_embeddings_fwd], bins=20)
#plt.show()
#assert False
advanced_gappedkmer_embeddings_rev =\
Parallel(n_jobs=self.n_jobs, verbose=True)(
delayed(prepare_gapped_kmer_from_seqlet)(
seqlets[i],
self.topn, self.min_k,
self.max_k, self.max_gap,
self.max_len, False,
self.max_len,
self.max_entries,
False,
self.onehot_track_name,
self.toscore_track_names_and_signs)
for i in range(len(seqlets)))

template_to_startidx, embedding_size =\
get_template_to_startidx_and_embedding_size(
max_len=self.max_len, min_k=self.min_k,
max_k=self.max_k, alphabet_size=self.alphabet_size)

sparse_agkm_embeddings_fwd = get_sparse_mat_from_agkm_embeddings(
agkm_embeddings=advanced_gappedkmer_embeddings_fwd,
template_to_startidx=template_to_startidx,
@@ -265,7 +328,7 @@ def map_agkm_embedding_to_sparsevec(gapped_kmer_to_totalseqimp,
template_to_startidx):
data = []
cols = []
for gkmer, totalimp in gapped_kmer_to_totalseqimp.items():
for gkmer, totalimp in gapped_kmer_to_totalseqimp:
template,offset = get_template_and_offset_from_gkmer(gkmer)
gkmeridx = template_to_startidx[template] + offset
data.append(totalimp)
@@ -288,8 +351,8 @@ def get_sparse_mat_from_agkm_embeddings(agkm_embeddings,
row_ind = []
data = []
col_ind = []
for (this_row_idx,
(single_agkm_data, single_agkm_cols)) in enumerate(all_agkm_data_and_cols):
for (this_row_idx, (single_agkm_data, single_agkm_cols)) in enumerate(
all_agkm_data_and_cols):
data.extend(single_agkm_data)
col_ind.extend(single_agkm_cols)
row_ind.extend([this_row_idx for x in single_agkm_data])
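The key memory-saving change in this file: `prepare_gapped_kmer_from_contribs` now divides each gapped k-mer's summed importance by the k-mer's length (`gapped_kmer_imp/len(gapped_kmer_rep)`) and returns only the `max_entries` entries with the largest absolute importance, as a sorted list of (gkmer, importance) tuples rather than a full dict, which is why `map_agkm_embedding_to_sparsevec` now iterates over tuples instead of `.items()`. A minimal standalone sketch of the truncation step follows; the helper name and demo data are illustrative.

def top_gapped_kmer_entries(gapped_kmer_to_totalseqimp, max_entries):
    # Keep only the max_entries gapped k-mers with the largest |importance|,
    # mirroring the sorted(...)[:max_entries] step in the diff above.
    return sorted(gapped_kmer_to_totalseqimp.items(),
                  key=lambda x: -abs(x[1]))[:max_entries]

demo = {("A", 2, "C"): 0.9, ("G", 1, "T"): -1.4, ("C", 3, "G"): 0.2}
print(top_gapped_kmer_entries(demo, max_entries=2))
# [(('G', 1, 'T'), -1.4), (('A', 2, 'C'), 0.9)]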
42 changes: 25 additions & 17 deletions modisco/tfmodisco_workflow/seqlets_to_patterns.py
@@ -64,6 +64,7 @@ def new_constructor(*args, **kwargs):
assert 'embedder_factory' not in kwargs,\
("Cannot both specify embedder_factory and "
+str(gapped_kmer_kwargs))
from modisco.seqlet_embedding import gapped_kmer
kwargs['embedder_factory'] = (
seqlet_embedding.gapped_kmer
.GappedKmerEmbedderFactory(**gapped_kmer_kwargs))
@@ -85,8 +86,8 @@ def __init__(self, n_cores=4,
initclusterer_factory=None,

embedder_factory=(
seqlet_embedding.gapped_kmer
.GappedKmerEmbedderFactory()),
seqlet_embedding.advanced_gapped_kmer
.AdvancedGappedKmerEmbedderFactory()),

nn_n_jobs=4,
nearest_neighbors_to_compute=500,
@@ -95,15 +96,15 @@ def __init__(self, n_cores=4,
filter_beyond_first_round=False,
skip_fine_grained=False,

tsne_perplexity = 10,
tsne_perplexity=10,
use_louvain=False,
louvain_initclusters_weight=1.0,
n_leiden_iterations_r1=-1,
n_leiden_iterations_r2=-1,
louvain_num_runs_and_levels_r1=[(200,-1)],
louvain_num_runs_and_levels_r2=[(200,-1)],
contin_runs_r1 = 50,
contin_runs_r2 = 50,
contin_runs_r1=50,
contin_runs_r2=50,
final_louvain_level_to_return=1,

frac_support_to_trim_to=0.2,
@@ -570,10 +571,13 @@ def from_hdf5(cls, grp, track_set):
patterns_withoutreassignment = []
cluster_results = None
total_time_taken = grp.attrs["total_time_taken"]
pattern_merge_hierarchy =\
aggregator.PatternMergeHierarchy.from_hdf5(
grp=grp["pattern_merge_hierarchy"],
track_set=track_set)
if ("pattern_merge_hierarchy" in grp):
pattern_merge_hierarchy =\
aggregator.PatternMergeHierarchy.from_hdf5(
grp=grp["pattern_merge_hierarchy"],
track_set=track_set)
else:
pattern_merge_hierarchy = None
return cls(
each_round_initcluster_motifs=each_round_initcluster_motifs,
patterns=patterns,
@@ -590,8 +594,8 @@ def from_hdf5(cls, grp, track_set):

def save_hdf5(self, grp):
grp.attrs["success"] = self.success
grp.attrs["total_time_taken"] = self.total_time_taken
if (self.success):
grp.attrs["total_time_taken"] = self.total_time_taken
if (self.each_round_initcluster_motifs is not None):
self.save_each_round_initcluster_motifs(
grp=grp.create_group("each_round_initcluster_motifs"))
@@ -737,14 +741,15 @@ def __call__(self, seqlets):
if (self.verbose):
print("len(seqlets) is 0 - bailing!")
return SeqletsToPatternsResults(
patterns=None,
seqlets=None,
affmat=None,
each_round_initcluster_motifs=None,
patterns=None,
patterns_withoutreassignment=None,
pattern_merge_hierarchy=None,
cluster_results=None,
total_time_taken=None,
success=False)
success=False,
seqlets=None,
affmat=None)

if (self.verbose):
print("(Round "+str(round_num)+
@@ -896,12 +901,15 @@ def __call__(self, seqlets):
if (self.verbose):
print("No more surviving patterns - bailing!")
return SeqletsToPatternsResults(
each_round_initcluster_motifs=None,
patterns=None,
seqlets=None,
affmat=None,
patterns_withoutreassignment=None,
pattern_merge_hierarchy=None,
cluster_results=None,
total_time_taken=None,
success=False)
success=False,
seqlets=None,
affmat=None)

#Now start merging patterns
if (self.verbose):
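As shown in the hunk above, the workflow's default `embedder_factory` changes from the TensorFlow-backed `GappedKmerEmbedderFactory` to `AdvancedGappedKmerEmbedderFactory`. A hedged usage sketch, assuming the tfmodisco package at this commit is installed; the parameter values simply restate the factory defaults from the advanced_gapped_kmer.py diff above.

from modisco.seqlet_embedding.advanced_gapped_kmer import (
    AdvancedGappedKmerEmbedderFactory)

# max_entries is new in this commit: it caps the number of gapped k-mer
# entries retained per seqlet, lowering the embedding's memory footprint.
embedder_factory = AdvancedGappedKmerEmbedderFactory(
    topn=20, min_k=4, max_k=6, max_gap=15, max_len=15,
    max_entries=500, alphabet_size=4, n_jobs=10)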