Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions src/haddock/clis/re/clustrmsd.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
get_clusters,
get_matrix_path,
iterate_min_population,
order_clusters,
write_clusters,
write_clustrmsd_file,
)
Expand Down Expand Up @@ -168,9 +169,9 @@ def reclustrmsd(
log.info(f"Updated clustering parameters = {clustrmsd_params}")

# processing the clusters
unq_clusters = np.unique(cluster_arr) # contains -1 (unclustered)
clusters = [c for c in unq_clusters if c != -1]
clusters, cluster_arr = order_clusters(cluster_arr)
log.info(f"clusters = {clusters}")
log.info(f"cluster_arr = {cluster_arr}")

clt_dic, cluster_centers = write_clusters(
clusters,
Expand Down Expand Up @@ -285,4 +286,4 @@ def search_previousstep_matrix(clustrmsd_dir: str) -> Optional[Path]:
else:
matrix_json = Path(workflow_dir, previous_step, "rmsd_matrix.json")
if matrix_json.exists():
return matrix_json
return matrix_json
9 changes: 3 additions & 6 deletions src/haddock/modules/analysis/clustrmsd/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@
""" # noqa: E501
from pathlib import Path

import numpy as np

from haddock import log
# from haddock.core.typing import FilePath
from haddock.libs.libclust import (
Expand All @@ -49,6 +47,7 @@
get_dendrogram,
get_matrix_path,
iterate_min_population,
order_clusters,
read_matrix,
write_clusters,
write_clustrmsd_file,
Expand Down Expand Up @@ -115,10 +114,8 @@ def _run(self) -> None:
self.params['min_population'],
)
self.params['min_population'] = min_population

# print clusters
unq_clusters = np.unique(cluster_arr) # contains -1 (unclustered)
clusters = [c for c in unq_clusters if c != -1]

clusters, cluster_arr = order_clusters(cluster_arr)
log.info(f"clusters = {clusters}")

out_filename = Path('cluster.out')
Expand Down
42 changes: 42 additions & 0 deletions src/haddock/modules/analysis/clustrmsd/clustrmsd.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,3 +356,45 @@ def write_clustrmsd_file(clusters, clt_dic, cluster_centers, score_dic, sorted_s
log.info('Saving detailed output to clustrmsd.txt')
with open(output_fname, 'w') as out_fh:
out_fh.write(output_str)


def order_clusters(cluster_arr):
    """
    Order the clusters by population.

    The most populated cluster will be assigned the ID 1, the second most
    populated the ID 2, and so on. Clusters with the same population keep
    the relative order of their original IDs (the smaller original ID gets
    the smaller new ID). Elements labelled -1 (unclustered) are left
    untouched and do not count as a cluster.

    Parameters
    ----------
    cluster_arr : np.ndarray
        Array of cluster IDs, one per element. It is relabelled in place.

    Returns
    -------
    clusters : list
        New cluster IDs, i.e. ``[1, 2, ..., n_clusters]``.

    cluster_arr : np.ndarray
        The same array object that was passed in, holding the new IDs.
    """
    unique_clusters, cluster_counts = np.unique(
        cluster_arr,
        return_counts=True,
        )

    # the unclustered label (-1) never takes part in the ranking
    is_cluster = unique_clusters != -1
    unique_clusters = unique_clusters[is_cluster]
    cluster_counts = cluster_counts[is_cluster]

    # negate the counts to sort by descending population; a stable sort
    # keeps equally populated clusters in ascending order of original ID,
    # so the resulting numbering is reproducible
    sorted_indices = np.argsort(-cluster_counts, kind="stable")
    sorted_clusters = unique_clusters[sorted_indices]

    clusters = list(range(1, len(sorted_clusters) + 1))

    # compute every membership mask BEFORE writing any new ID: old and new
    # IDs share the same value range, so relabelling on the fly could
    # merge two different clusters
    masks = [cluster_arr == old_id for old_id in sorted_clusters]
    for new_id, mask in zip(clusters, masks):
        cluster_arr[mask] = new_id
    return clusters, cluster_arr
9 changes: 8 additions & 1 deletion tests/test_cli_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def test_cli_reclustrmsd():
clustrmsd_tsv = Path(interactive_folder, "clustrmsd.tsv")
assert clustrmsd_tsv.exists()
lines = clustrmsd_tsv.read_text().splitlines()
assert lines[1] == "1\tensemble_4G6M_6_haddock.pdb\tnan\t1"
assert lines[1] == "1\tensemble_4G6M_1_haddock.pdb\tnan\t1"

# clustrmsd.txt
clustrmsd_txt = Path(interactive_folder, "clustrmsd.txt")
Expand All @@ -168,6 +168,13 @@ def test_cli_reclustrmsd():
assert lines[4] == "> criterion=maxclust"
assert lines[5] == "> n_clusters=2"

# cluster.out
cluster_out = Path(interactive_folder, "cluster.out")
assert cluster_out.exists()
lines = cluster_out.read_text().splitlines()
assert lines[0] == "Cluster 1 -> 1 2 3 4 5 7 8 9"
assert lines[1] == "Cluster 2 -> 6 10"

# Test generation of plot
clustrmsd_html_matrix = Path(interactive_folder, "rmsd_matrix.html")
assert clustrmsd_html_matrix.exists()
Expand Down
27 changes: 26 additions & 1 deletion tests/test_module_clustrmsd.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
get_clusters,
get_dendrogram,
iterate_min_population,
order_clusters,
read_matrix,
)
from haddock.modules.analysis.rmsdmatrix import DEFAULT_CONFIG as rmsd_pars
Expand All @@ -25,7 +26,7 @@

@pytest.fixture
def output_list():
"""Clustfcc output list."""
"""Clustrmsd output list."""
return [
"rmsd.matrix",
"rmsd_matrix.json",
Expand Down Expand Up @@ -352,3 +353,27 @@ def test_iterate_min_population():
exp_cluster_arr = np.array([1, 1, -1, -1, -1])
assert obs_min_population == 2
assert (obs_cluster_arr == exp_cluster_arr).all()


def test_order_clusters():
    """Check that order_clusters renumbers clusters by population."""
    expected_ids = [1, 2, 3, 4]
    # each case pairs the input labels with the labels expected afterwards
    cases = (
        # trivial input: already numbered by population
        (
            [1, 1, 2, 3, 4],
            [1, 1, 2, 3, 4],
        ),
        # a less trivial cluster_arr
        (
            [3, 3, 2, 4, 3, 1, 3, 3, 1, 3, 4],
            [1, 1, 4, 3, 1, 2, 1, 1, 2, 1, 3],
        ),
        # a cluster_arr containing unclustered structures (-1)
        (
            [3, 3, 2, 4, -1, -1, 3, 3, -1, -1, 4, 1],
            [1, 1, 4, 2, -1, -1, 1, 1, -1, -1, 2, 3],
        ),
    )
    for labels, relabelled in cases:
        obs_clusters, obs_cluster_arr = order_clusters(np.array(labels))
        assert obs_clusters == expected_ids
        assert (obs_cluster_arr == np.array(relabelled)).all()