From 948c4b4ec760a5f352d937bd24149427627d3584 Mon Sep 17 00:00:00 2001 From: Av Shrikumar Date: Sun, 5 Dec 2021 15:24:41 -0600 Subject: [PATCH 1/4] trying to bring down mem usage of leiden step --- modisco/cluster/core.py | 1 - modisco/cluster/run_leiden | 4 ++-- modisco/tfmodisco_workflow/seqlets_to_patterns.py | 15 +++++++++++---- setup.py | 2 +- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/modisco/cluster/core.py b/modisco/cluster/core.py index 2574bba..5dccb70 100644 --- a/modisco/cluster/core.py +++ b/modisco/cluster/core.py @@ -234,7 +234,6 @@ def __call__(self, orig_affinity_mat, initclusters): else: affinity_mat = orig_affinity_mat - the_graph = get_igraph_from_adjacency(adjacency=affinity_mat) best_clustering = None best_quality = None diff --git a/modisco/cluster/run_leiden b/modisco/cluster/run_leiden index 72905c2..f1ec175 100755 --- a/modisco/cluster/run_leiden +++ b/modisco/cluster/run_leiden @@ -14,8 +14,8 @@ def get_igraph(sources_idxs_file, targets_idxs_file, weights_file, n_vertices): weights = np.load(weights_file) g = ig.Graph(directed=None) g.add_vertices(n_vertices) # this adds adjacency.shap[0] vertices - g.add_edges(list(zip(sources.tolist(), targets.tolist()))) - g.es['weight'] = weights.tolist() + g.add_edges(list(zip(sources, targets))) + g.es['weight'] = weights if g.vcount() != n_vertices: print('WARNING: The constructed graph has only ' +str(g.vcount())+' nodes. ' diff --git a/modisco/tfmodisco_workflow/seqlets_to_patterns.py b/modisco/tfmodisco_workflow/seqlets_to_patterns.py index 65a2a4a..05a7207 100644 --- a/modisco/tfmodisco_workflow/seqlets_to_patterns.py +++ b/modisco/tfmodisco_workflow/seqlets_to_patterns.py @@ -79,6 +79,7 @@ class TfModiscoSeqletsToPatternsFactory(object): @legacy_tfmodiscoseqletstopatternsfactory def __init__(self, n_cores=4, + n_cores_mainclustering=None, min_overlap_while_sliding=0.7, #init clusterer factory @@ -138,7 +139,10 @@ def __init__(self, n_cores=4, +" set use_louvain to False") #affinity_mat calculation + if (n_cores_mainclustering is None): + n_cores_mainclustering = n_cores self.n_cores = n_cores + self.n_cores_mainclustering = n_cores_mainclustering self.min_overlap_while_sliding = min_overlap_while_sliding self.embedder_factory = embedder_factory @@ -202,6 +206,7 @@ def get_jsonable_config(self): to_return = OrderedDict([ ('class_name', type(self).__name__), ('n_cores', self.n_cores), + ('n_cores_mainclustering', self.n_cores_mainclustering), ('initclusterer_factory', self.initclusterer_factory.get_jsonable_config()), ('min_overlap_while_sliding', self.min_overlap_while_sliding), @@ -341,7 +346,8 @@ def __call__(self, track_set, onehot_track_name, affinitymat.transformers.LouvainMembershipAverage( n_runs=n_runs, level_to_return=level_to_return, - parallel_threads=self.n_cores, seed=self.seed)) + parallel_threads=self.n_cores_mainclustering, + seed=self.seed)) clusterer_r1 = cluster.core.LouvainCluster( level_to_return=self.final_louvain_level_to_return, affmat_transformer=affmat_transformer_r1, @@ -349,7 +355,7 @@ def __call__(self, track_set, onehot_track_name, verbose=self.verbose, seed=self.seed) else: clusterer_r1 = cluster.core.LeidenClusterParallel( - n_jobs=self.n_cores, + n_jobs=self.n_cores_mainclustering, affmat_transformer=affmat_transformer_r1, numseedstotry=self.contin_runs_r1, n_leiden_iterations=self.n_leiden_iterations_r1, @@ -367,7 +373,8 @@ def __call__(self, track_set, onehot_track_name, affinitymat.transformers.LouvainMembershipAverage( n_runs=n_runs, level_to_return=level_to_return, - parallel_threads=self.n_cores, seed=self.seed)) + parallel_threads=self.n_cores_mainclustering, + seed=self.seed)) clusterer_r2 = cluster.core.LouvainCluster( level_to_return=self.final_louvain_level_to_return, affmat_transformer=affmat_transformer_r2, @@ -376,7 +383,7 @@ def __call__(self, track_set, onehot_track_name, initclusters_weight=self.louvain_initclusters_weight) else: clusterer_r2 = cluster.core.LeidenClusterParallel( - n_jobs=self.n_cores, + n_jobs=self.n_cores_mainclustering, affmat_transformer=affmat_transformer_r2, numseedstotry=self.contin_runs_r2, n_leiden_iterations=self.n_leiden_iterations_r2, diff --git a/setup.py b/setup.py index cb467bc..18a8a91 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ description='TF MOtif Discovery from Importance SCOres', long_description="""Algorithm for discovering consolidated patterns from base-pair-level importance scores""", url='https://github.com/kundajelab/tfmodisco', - version='0.5.16.0', + version='0.5.16.1', packages=find_packages(), package_data={ '': ['cluster/phenograph/louvain/*convert*', 'cluster/phenograph/louvain/*community*', 'cluster/phenograph/louvain/*hierarchy*'] From 78810474e00487b8807ca56cf60c5f623129ee7f Mon Sep 17 00:00:00 2001 From: Av Shrikumar Date: Sun, 5 Dec 2021 15:53:01 -0600 Subject: [PATCH 2/4] trying to bring down mem usage of leiden step --- modisco/cluster/core.py | 27 +++++++++++++++++++++++++-- modisco/util.py | 3 ++- test/test_tfmodisco_workflow.py | 3 ++- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/modisco/cluster/core.py b/modisco/cluster/core.py index 5dccb70..e933b5b 100644 --- a/modisco/cluster/core.py +++ b/modisco/cluster/core.py @@ -11,6 +11,7 @@ import os, re import subprocess from joblib import Parallel, delayed +from ..util import print_memory_use class ClusterResults(object): @@ -105,6 +106,7 @@ def __call__(self, orig_affinity_mat, initclusters): if (self.verbose): print("Beginning preprocessing + Leiden") + print_memory_use() sys.stdout.flush() all_start = time.time() if (self.affmat_transformer is not None): @@ -227,10 +229,17 @@ def __call__(self, orig_affinity_mat, initclusters): if (self.verbose): print("Beginning preprocessing + Leiden") + print_memory_use() sys.stdout.flush() + all_start = time.time() + if (self.affmat_transformer is not None): affinity_mat = self.affmat_transformer(orig_affinity_mat) + if (self.verbose): + print("Affmat transformed") + print_memory_use() + sys.stdout.flush() else: affinity_mat = orig_affinity_mat @@ -249,24 +258,33 @@ def __call__(self, orig_affinity_mat, initclusters): if (initclusters is not None): initclusters_to_try_list.append(True) - #write out the contents of affinity_mat and initclusters if applicable uid = uuid.uuid1().hex sources, targets = affinity_mat.nonzero() weights = affinity_mat[sources, targets] + if (self.verbose): + print("sources, targets, weights extracted") + print_memory_use() + sys.stdout.flush() + np.save(uid+"_sources.npy", sources) np.save(uid+"_targets.npy", targets) np.save(uid+"_weights.npy", weights.A1) #A1 is the same as ravel() + del sources, targets, weights + if (initclusters is not None): np.save(uid+"_initclusters.npy", initclusters) print("initclusters length:",len(initclusters)) for use_initclusters in initclusters_to_try_list: - print("Affmat shape:",affinity_mat.shape[0]) + if (self.verbose): + print("About to launch parallel Leiden runs") + print_memory_use() + sys.stdout.flush() parallel_leiden_results = ( Parallel(n_jobs=self.n_jobs, @@ -278,6 +296,11 @@ def __call__(self, orig_affinity_mat, initclusters): seed*100, self.refine) for seed in toiterover)) + if (self.verbose): + print("Parallel Leiden runs finished") + print_memory_use() + sys.stdout.flush() + for quality,membership in parallel_leiden_results: if ((best_quality is None) or (quality > best_quality)): best_quality = quality diff --git a/modisco/util.py b/modisco/util.py index d1caea8..88f4e01 100644 --- a/modisco/util.py +++ b/modisco/util.py @@ -9,13 +9,14 @@ from sklearn.metrics import average_precision_score, precision_recall_curve from sklearn.isotonic import IsotonicRegression from joblib import Parallel, delayed +import datetime def print_memory_use(): import os import psutil process = psutil.Process(os.getpid()) - print("MEMORY",process.memory_info().rss/1000000000) + print(datetime.now(),"MEMORY",process.memory_info().rss/1000000000) def load_patterns(grp, track_set): diff --git a/test/test_tfmodisco_workflow.py b/test/test_tfmodisco_workflow.py index b29fbff..2c987ea 100644 --- a/test/test_tfmodisco_workflow.py +++ b/test/test_tfmodisco_workflow.py @@ -120,7 +120,8 @@ def test_base_workflow(self): initial_flank_to_add=5, kmer_len=5, num_gaps=1, num_mismatches=0, - final_min_cluster_size=60) + final_min_cluster_size=60, + n_cores=4, n_cores_mainclustering=4) )( task_names=["task0", "task1", "task2"], contrib_scores=task_to_scores, From f9a974917d2f7fc30956b7585fac26e99a7aa95f Mon Sep 17 00:00:00 2001 From: Av Shrikumar Date: Sun, 5 Dec 2021 16:09:08 -0600 Subject: [PATCH 3/4] bugfix --- modisco/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modisco/util.py b/modisco/util.py index 88f4e01..9ccccc5 100644 --- a/modisco/util.py +++ b/modisco/util.py @@ -9,7 +9,7 @@ from sklearn.metrics import average_precision_score, precision_recall_curve from sklearn.isotonic import IsotonicRegression from joblib import Parallel, delayed -import datetime +from datetime import datetime def print_memory_use(): From 587ccd529f99acb710ccce20ab785b798286049e Mon Sep 17 00:00:00 2001 From: Av Shrikumar Date: Wed, 26 Jan 2022 21:28:50 -0800 Subject: [PATCH 4/4] Version bump for mem usage of leiden step --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 18a8a91..c6b4f48 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ description='TF MOtif Discovery from Importance SCOres', long_description="""Algorithm for discovering consolidated patterns from base-pair-level importance scores""", url='https://github.com/kundajelab/tfmodisco', - version='0.5.16.1', + version='0.5.16.2', packages=find_packages(), package_data={ '': ['cluster/phenograph/louvain/*convert*', 'cluster/phenograph/louvain/*community*', 'cluster/phenograph/louvain/*hierarchy*']