Merge pull request #61 from kundajelab/filterembed
Deriving embeddings using conv filter activations rather than gapped kmers
AvantiShri committed May 14, 2020
2 parents e706388 + 77c64cb commit 3697287
Showing 11 changed files with 4,538 additions and 2,487 deletions.
4,592 changes: 2,283 additions & 2,309 deletions examples/simulated_TAL_GATA_deeplearning/TF_MoDISco_TAL_GATA.ipynb

Large diffs are not rendered by default.


4 changes: 2 additions & 2 deletions modisco.egg-info/PKG-INFO
@@ -1,10 +1,10 @@
Metadata-Version: 2.1
Name: modisco
-Version: 0.5.6.5
+Version: 0.5.7.0
Summary: TF MOtif Discovery from Importance SCOres
Home-page: https://github.com/kundajelab/tfmodisco
License: UNKNOWN
Description: Algorithm for discovering consolidated patterns from base-pair-level importance scores
Platform: UNKNOWN
-Provides-Extra: tensorflow with gpu
Provides-Extra: tensorflow
+Provides-Extra: tensorflow with gpu
1 change: 1 addition & 0 deletions modisco/__init__.py
@@ -9,3 +9,4 @@
from . import tfmodisco_workflow
from . import value_provider
from . import clusterinit
+from . import seqlet_embedding
154 changes: 5 additions & 149 deletions modisco/affinitymat/core.py
@@ -92,152 +92,6 @@ def __call__(self, seqlets):
raise NotImplementedError()


class AbstractSeqletsToOnedEmbedder(object):

    def __call__(self, seqlets):
raise NotImplementedError()


class GappedKmerEmbedder(AbstractSeqletsToOnedEmbedder):

def __init__(self, alphabet_size,
kmer_len,
num_gaps,
num_mismatches,
toscore_track_names_and_signs,
normalizer,
batch_size,
num_filters_to_retain=None,
onehot_track_name=None,
progress_update=None):
self.alphabet_size = alphabet_size
self.kmer_len = kmer_len
self.num_gaps = num_gaps
self.num_mismatches = num_mismatches
self.num_filters_to_retain = num_filters_to_retain
self.filters, self.biases = self.prepare_gapped_kmer_filters()
self.onehot_track_name = onehot_track_name
self.toscore_track_names_and_signs = toscore_track_names_and_signs
        assert len(toscore_track_names_and_signs) > 0,\
            "toscore_track_names_and_signs length is 0"
self.normalizer = normalizer
self.batch_size = batch_size
self.progress_update = progress_update
self.require_onehot_match = (True if self.onehot_track_name
is not None else False)
self.gapped_kmer_embedding_func =\
B.get_gapped_kmer_embedding_func(
filters=self.filters,
biases=self.biases,
require_onehot_match=self.require_onehot_match)

def prepare_gapped_kmer_filters(self):
nonzero_position_combos = list(itertools.combinations(
iterable=range(self.kmer_len),
r=(self.kmer_len-self.num_gaps)))
letter_permutations = list(itertools.product(
*[list(range(self.alphabet_size)) for x in
range(self.kmer_len-self.num_gaps)]))
filters = []
biases = []
##removed: unique_nonzero_positions = set()
for nonzero_positions in nonzero_position_combos:
string_representation = [" " for x in range(self.kmer_len)]
for nonzero_position in nonzero_positions:
string_representation[nonzero_position] = "X"
nonzero_positions_string =\
("".join(string_representation)).lstrip().rstrip()
#The logic for using 'unique_nonzero_positions' was that
# ' XX' and 'XX ' are in principle equivalent, so I did not want
# to double-count them. However, ' XX' and 'XX ' would generate
# slightly different embeddings, and if we don't include both, then
# the forward and reverse-complement version of the same seqlet
# would not wind up generating equivalent embeddings...
#So I have decided to just accept the double-counting and
# include both in order to preserve reverse-complement symmetry
##removed: if (nonzero_positions_string not in unique_nonzero_positions):
##removed: unique_nonzero_positions.add(nonzero_positions_string)
for letter_permutation in letter_permutations:
assert len(nonzero_positions)==len(letter_permutation)
the_filter = np.zeros((self.kmer_len, self.alphabet_size))
for nonzero_position, letter\
in zip(nonzero_positions, letter_permutation):
the_filter[nonzero_position, letter] = 1
filters.append(the_filter)
biases.append(-(len(nonzero_positions)-1
-self.num_mismatches))
return np.array(filters), np.array(biases)

def __call__(self, seqlets):
print("Computing embeddings")
sys.stdout.flush()
if (self.require_onehot_match):
onehot_track_fwd, onehot_track_rev =\
modiscocore.get_2d_data_from_patterns(
patterns=seqlets,
track_names=[self.onehot_track_name],
track_transformer=None)

data_to_embed_fwd = np.zeros(
(len(seqlets),
len(list(seqlets)[0]), self.alphabet_size)).astype("float32")
data_to_embed_rev = (np.zeros(
(len(seqlets),
len(list(seqlets)[0]), self.alphabet_size)).astype("float32")
if (onehot_track_rev is not None) else None)
for (track_name, sign) in self.toscore_track_names_and_signs:
fwd_data, rev_data = modiscocore.get_2d_data_from_patterns(
patterns=seqlets,
track_names=[track_name], track_transformer=None)
data_to_embed_fwd += fwd_data*sign
            if (rev_data is not None):
                data_to_embed_rev += rev_data*sign
data_to_embed_fwd = np.array([self.normalizer(x) for x in
data_to_embed_fwd])
if (data_to_embed_rev is not None):
data_to_embed_rev = np.array([self.normalizer(x) for x in
data_to_embed_rev])
common_args = {'batch_size': self.batch_size,
'progress_update': self.progress_update}
if (self.require_onehot_match):
embedding_fwd = self.gapped_kmer_embedding_func(
onehot=onehot_track_fwd,
to_embed=data_to_embed_fwd,
**common_args)
embedding_rev = (self.gapped_kmer_embedding_func(
onehot=onehot_track_rev,
to_embed=data_to_embed_rev,
**common_args)
if (onehot_track_rev is not None) else None)
else:
embedding_fwd = self.gapped_kmer_embedding_func(
to_embed=data_to_embed_fwd,
**common_args)
embedding_rev = (self.gapped_kmer_embedding_func(
to_embed=data_to_embed_rev,
**common_args)
if (onehot_track_rev is not None) else None)
if (self.num_filters_to_retain is not None):
all_embeddings = (np.concatenate(
[embedding_fwd, embedding_rev], axis=0)
if (embedding_rev is not None)
else np.array(embedding_fwd))
embeddings_denominators =\
np.sum(np.abs(all_embeddings) > 0, axis=0).astype("float")
embeddings_denominators += 10.0
embeddings_mean_impact =\
(np.sum(np.abs(all_embeddings), axis=0)/
embeddings_denominators)
top_embedding_indices = [
x[0] for x in sorted(enumerate(embeddings_mean_impact),
key=lambda x: -x[1])][:self.num_filters_to_retain]
embedding_fwd = embedding_fwd[:,top_embedding_indices]
embedding_rev = (embedding_rev[:,top_embedding_indices]
if (embedding_rev is not None) else None)
return embedding_fwd, embedding_rev


class AbstractAffinityMatrixFromOneD(object):

def __call__(self, vecs1, vecs2):
@@ -368,9 +222,11 @@ def __call__(self, seqlets):
#check for enforced symmetry
assert np.max(np.abs(affinity_mat_fwd.T - affinity_mat_fwd))<1e-3,\
np.max(np.abs(affinity_mat_fwd.T - affinity_mat_fwd))
-if (affinity_mat_rev is not None):
-    assert np.max(np.abs(affinity_mat_rev.T - affinity_mat_rev))<1e-3,\
-        np.max(np.abs(affinity_mat_rev.T - affinity_mat_rev))
+#This assert need not hold anymore with filter embeddings, which aren't
+# guaranteed to be revcomp-symmetric...
+#if (affinity_mat_rev is not None):
+#    assert np.max(np.abs(affinity_mat_rev.T - affinity_mat_rev))<1e-3,\
+#        np.max(np.abs(affinity_mat_rev.T - affinity_mat_rev))

cp3_time = time.time()

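For context on the relaxed check above: the forward-strand affinity matrix is still asserted to be symmetric to within 1e-3, while the reverse-strand matrix may now be asymmetric under filter embeddings. Below is a minimal sketch (not part of this commit) of that symmetry check, plus one standard way to enforce exact symmetry if a downstream step needs it, assuming a square NumPy affinity matrix:

import numpy as np

def max_asymmetry(affinity_mat):
    #Largest absolute deviation from symmetry, as in the assert above
    return np.max(np.abs(affinity_mat.T - affinity_mat))

def symmetrize(affinity_mat):
    #Averaging with the transpose is one standard way to make a
    # matrix exactly symmetric
    return 0.5*(affinity_mat + affinity_mat.T)
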
2 changes: 2 additions & 0 deletions modisco/seqlet_embedding/__init__.py
@@ -0,0 +1,2 @@
from . import gapped_kmer
from . import onehot_model_output
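
The new subpackage exposes both embedding approaches side by side. A small usage sketch (not from the commit) of constructing the gapped-kmer factory defined in gapped_kmer.py below; onehot_model_output is not shown in this diff, so it is not illustrated here:

from modisco.seqlet_embedding.gapped_kmer import GappedKmerEmbedderFactory

#These arguments mirror the factory defaults shown in gapped_kmer.py below
factory = GappedKmerEmbedderFactory(kmer_len=8, num_gaps=3, num_mismatches=2)
print(factory.get_jsonable_config())
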
14 changes: 14 additions & 0 deletions modisco/seqlet_embedding/core.py
@@ -0,0 +1,14 @@
from __future__ import division, print_function


class AbstractSeqletsToOnedEmbedderFactory(object):

#Return an instance of AbstractSeqletsToOnedEmbedder
    def __call__(self, onehot_track_name, toscore_track_names_and_signs):
raise NotImplementedError()


class AbstractSeqletsToOnedEmbedder(object):

    def __call__(self, seqlets):
raise NotImplementedError()
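
To make the contract concrete: a factory takes the one-hot track name plus the (track name, sign) pairs to score, and returns an embedder that maps a batch of seqlets to one-dimensional vectors. Here is a minimal sketch (not part of the commit) of a hypothetical subclass, assuming each seqlet's scored data is already available as a 2-d (length x channels) NumPy array:

import numpy as np

from modisco.seqlet_embedding.core import AbstractSeqletsToOnedEmbedder


class SumPoolEmbedder(AbstractSeqletsToOnedEmbedder):
    #Hypothetical embedder, purely for illustration: collapses each seqlet's
    # (length x channels) score matrix into a channels-length vector by
    # summing over positions.

    def __call__(self, seqlets_data):
        embedding_fwd = np.array([np.sum(x, axis=0) for x in seqlets_data])
        #Return a (fwd, rev) pair to match the convention of
        # GappedKmerEmbedder below; no reverse-complement handling here.
        return embedding_fwd, None
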
194 changes: 194 additions & 0 deletions modisco/seqlet_embedding/gapped_kmer.py
@@ -0,0 +1,194 @@
from __future__ import division, print_function, absolute_import
from ..affinitymat.core import MeanNormalizer
from .. import backend as B
from .core import (AbstractSeqletsToOnedEmbedder,
AbstractSeqletsToOnedEmbedderFactory)
from .. import core as modiscocore
import itertools
import numpy as np
import sys
from collections import OrderedDict


class GappedKmerEmbedderFactory(AbstractSeqletsToOnedEmbedderFactory):

def __init__(self, alphabet_size=4, kmer_len=8, num_gaps=3,
num_mismatches=2, gpu_batch_size=20,
num_filters_to_retain=None,
#note: I should experiment to see whether this
# mean normalization helps results...it might not
mean_normalize=True):
self.alphabet_size = alphabet_size
self.kmer_len = kmer_len
self.num_gaps = num_gaps
self.num_mismatches = num_mismatches
self.gpu_batch_size = gpu_batch_size
self.num_filters_to_retain = num_filters_to_retain
self.mean_normalize = mean_normalize
if (self.mean_normalize):
self.normalizer = MeanNormalizer()
else:
self.normalizer = lambda x: x

def get_jsonable_config(self):
return OrderedDict([
('alphabet_size', self.alphabet_size),
('kmer_len', self.kmer_len),
('num_gaps', self.num_gaps),
('num_mismatches', self.num_mismatches),
('gpu_batch_size', self.gpu_batch_size),
('num_filters_to_retain', self.num_filters_to_retain),
('mean_normalize', self.mean_normalize),
])

def __call__(self, onehot_track_name, toscore_track_names_and_signs):
return GappedKmerEmbedder(
alphabet_size=self.alphabet_size,
kmer_len=self.kmer_len,
num_gaps=self.num_gaps,
num_mismatches=self.num_mismatches,
onehot_track_name=onehot_track_name,
toscore_track_names_and_signs=toscore_track_names_and_signs,
normalizer=self.normalizer,
batch_size=self.gpu_batch_size,
num_filters_to_retain=self.num_filters_to_retain)


class GappedKmerEmbedder(AbstractSeqletsToOnedEmbedder):

def __init__(self, alphabet_size,
kmer_len,
num_gaps,
num_mismatches,
toscore_track_names_and_signs,
normalizer,
batch_size,
num_filters_to_retain=None,
onehot_track_name=None,
progress_update=None):
self.alphabet_size = alphabet_size
self.kmer_len = kmer_len
self.num_gaps = num_gaps
self.num_mismatches = num_mismatches
self.num_filters_to_retain = num_filters_to_retain
self.filters, self.biases = self.prepare_gapped_kmer_filters()
self.onehot_track_name = onehot_track_name
self.toscore_track_names_and_signs = toscore_track_names_and_signs
        assert len(toscore_track_names_and_signs) > 0,\
            "toscore_track_names_and_signs length is 0"
self.normalizer = normalizer
self.batch_size = batch_size
self.progress_update = progress_update
self.require_onehot_match = (True if self.onehot_track_name
is not None else False)
self.gapped_kmer_embedding_func =\
B.get_gapped_kmer_embedding_func(
filters=self.filters,
biases=self.biases,
require_onehot_match=self.require_onehot_match)

def prepare_gapped_kmer_filters(self):
nonzero_position_combos = list(itertools.combinations(
iterable=range(self.kmer_len),
r=(self.kmer_len-self.num_gaps)))
letter_permutations = list(itertools.product(
*[list(range(self.alphabet_size)) for x in
range(self.kmer_len-self.num_gaps)]))
filters = []
biases = []
##removed: unique_nonzero_positions = set()
for nonzero_positions in nonzero_position_combos:
string_representation = [" " for x in range(self.kmer_len)]
for nonzero_position in nonzero_positions:
string_representation[nonzero_position] = "X"
nonzero_positions_string =\
("".join(string_representation)).lstrip().rstrip()
#The logic for using 'unique_nonzero_positions' was that
# ' XX' and 'XX ' are in principle equivalent, so I did not want
# to double-count them. However, ' XX' and 'XX ' would generate
# slightly different embeddings, and if we don't include both, then
# the forward and reverse-complement version of the same seqlet
# would not wind up generating equivalent embeddings...
#So I have decided to just accept the double-counting and
# include both in order to preserve reverse-complement symmetry
##removed: if (nonzero_positions_string not in unique_nonzero_positions):
##removed: unique_nonzero_positions.add(nonzero_positions_string)
for letter_permutation in letter_permutations:
assert len(nonzero_positions)==len(letter_permutation)
the_filter = np.zeros((self.kmer_len, self.alphabet_size))
for nonzero_position, letter\
in zip(nonzero_positions, letter_permutation):
the_filter[nonzero_position, letter] = 1
filters.append(the_filter)
biases.append(-(len(nonzero_positions)-1
-self.num_mismatches))
return np.array(filters), np.array(biases)

def __call__(self, seqlets):
print("Computing embeddings")
sys.stdout.flush()
assert self.require_onehot_match #legacy, now just assume it's True
onehot_track_fwd, onehot_track_rev =\
modiscocore.get_2d_data_from_patterns(
patterns=seqlets,
track_names=[self.onehot_track_name],
track_transformer=None)


data_to_embed_fwd = np.zeros(
(len(seqlets),
len(list(seqlets)[0]), self.alphabet_size)).astype("float32")
data_to_embed_rev = (np.zeros(
(len(seqlets),
len(list(seqlets)[0]), self.alphabet_size)).astype("float32")
if (onehot_track_rev is not None) else None)
for (track_name, sign) in self.toscore_track_names_and_signs:
fwd_data, rev_data = modiscocore.get_2d_data_from_patterns(
patterns=seqlets,
track_names=[track_name], track_transformer=None)
data_to_embed_fwd += fwd_data*sign
            if (rev_data is not None):
                data_to_embed_rev += rev_data*sign
data_to_embed_fwd = np.array([self.normalizer(x) for x in
data_to_embed_fwd])
if (data_to_embed_rev is not None):
data_to_embed_rev = np.array([self.normalizer(x) for x in
data_to_embed_rev])
common_args = {'batch_size': self.batch_size,
'progress_update': self.progress_update}
if (self.require_onehot_match):
embedding_fwd = self.gapped_kmer_embedding_func(
onehot=onehot_track_fwd,
to_embed=data_to_embed_fwd,
**common_args)
embedding_rev = (self.gapped_kmer_embedding_func(
onehot=onehot_track_rev,
to_embed=data_to_embed_rev,
**common_args)
if (onehot_track_rev is not None) else None)
else:
embedding_fwd = self.gapped_kmer_embedding_func(
to_embed=data_to_embed_fwd,
**common_args)
embedding_rev = (self.gapped_kmer_embedding_func(
to_embed=data_to_embed_rev,
**common_args)
if (onehot_track_rev is not None) else None)
if (self.num_filters_to_retain is not None):
all_embeddings = (np.concatenate(
[embedding_fwd, embedding_rev], axis=0)
if (embedding_rev is not None)
else np.array(embedding_fwd))
embeddings_denominators =\
np.sum(np.abs(all_embeddings) > 0, axis=0).astype("float")
embeddings_denominators += 10.0
embeddings_mean_impact =\
(np.sum(np.abs(all_embeddings), axis=0)/
embeddings_denominators)
top_embedding_indices = [
x[0] for x in sorted(enumerate(embeddings_mean_impact),
key=lambda x: -x[1])][:self.num_filters_to_retain]
embedding_fwd = embedding_fwd[:,top_embedding_indices]
embedding_rev = (embedding_rev[:,top_embedding_indices]
if (embedding_rev is not None) else None)
return embedding_fwd, embedding_rev
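
As a sanity check on prepare_gapped_kmer_filters above: for the factory defaults (alphabet_size=4, kmer_len=8, num_gaps=3, num_mismatches=2) there are C(8,5) choices of informative positions times 4^5 letter assignments, and each filter's bias makes its activation positive only when enough positions match. A small standalone calculation (not in the commit, requires Python 3.8+ for math.comb) of those numbers:

from math import comb

alphabet_size, kmer_len, num_gaps, num_mismatches = 4, 8, 3, 2
num_nonzero = kmer_len - num_gaps  #5 informative positions per filter

num_position_combos = comb(kmer_len, num_nonzero)  #C(8,5) = 56
num_letter_choices = alphabet_size**num_nonzero    #4**5 = 1024
print(num_position_combos*num_letter_choices)      #57344 filters in total

#Each bias is -(num_nonzero-1-num_mismatches) = -2, so a filter's activation
# (match count plus bias) exceeds zero only when at least
# num_nonzero-num_mismatches = 3 of the 5 informative positions match.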