haddocking · amjjbonvin · Sep 10, 2024 · Aug 1, 2024 · Aug 13, 2024 · Aug 13, 2024
diff --git a/src/haddock/clis/re/clustrmsd.py b/src/haddock/clis/re/clustrmsd.py
@@ -22,6 +22,7 @@
     get_clusters,
     get_matrix_path,
     iterate_min_population,
+    order_clusters,
     write_clusters,
     write_clustrmsd_file,
     )
@@ -168,9 +169,9 @@ def reclustrmsd(
     log.info(f"Updated clustering parameters = {clustrmsd_params}")
 
     # processing the clusters
-    unq_clusters = np.unique(cluster_arr)  # contains -1 (unclustered)
-    clusters = [c for c in unq_clusters if c != -1]
+    clusters, cluster_arr = order_clusters(cluster_arr)
     log.info(f"clusters = {clusters}")
+    log.info(f"cluster_arr = {cluster_arr}")
 
     clt_dic, cluster_centers = write_clusters(
         clusters,
@@ -285,4 +286,4 @@ def search_previousstep_matrix(clustrmsd_dir: str) -> Optional[Path]:
     else:
         matrix_json = Path(workflow_dir, previous_step, "rmsd_matrix.json")
         if matrix_json.exists():
-            return matrix_json
+            return matrix_json
diff --git a/src/haddock/modules/analysis/clustrmsd/__init__.py b/src/haddock/modules/analysis/clustrmsd/__init__.py
@@ -30,8 +30,6 @@
 """  # noqa: E501
 from pathlib import Path
 
-import numpy as np
-
 from haddock import log
 # from haddock.core.typing import FilePath
 from haddock.libs.libclust import (
@@ -49,6 +47,7 @@
     get_dendrogram,
     get_matrix_path,
     iterate_min_population,
+    order_clusters,
     read_matrix,
     write_clusters,
     write_clustrmsd_file,
@@ -115,10 +114,8 @@ def _run(self) -> None:
                 self.params['min_population'],
                 )
             self.params['min_population'] = min_population
-
-        # print clusters
-        unq_clusters = np.unique(cluster_arr)  # contains -1 (unclustered)
-        clusters = [c for c in unq_clusters if c != -1]
+
+        clusters, cluster_arr = order_clusters(cluster_arr)
         log.info(f"clusters = {clusters}")
 
         out_filename = Path('cluster.out')

diff --git a/src/haddock/modules/analysis/clustrmsd/clustrmsd.py b/src/haddock/modules/analysis/clustrmsd/clustrmsd.py
@@ -356,3 +356,45 @@ def write_clustrmsd_file(clusters, clt_dic, cluster_centers, score_dic, sorted_s
     log.info('Saving detailed output to clustrmsd.txt')
     with open(output_fname, 'w') as out_fh:
         out_fh.write(output_str)
+
+
+def order_clusters(cluster_arr):
+    """
+    Renumber the clusters by decreasing population.
+
+    The most populated cluster gets the ID 1, the second most populated
+    the ID 2, and so on; equally populated clusters keep their original
+    relative order. Unclustered entries (-1) stay -1. The input array is
+    left untouched: a relabelled copy is returned.
+
+    Parameters
+    ----------
+    cluster_arr : np.ndarray
+        Cluster ID of each element (-1 = unclustered).
+
+    Returns
+    -------
+    clusters : list
+        The new cluster IDs, i.e. 1 .. number of clusters.
+
+    cluster_arr : np.ndarray
+        Copy of the input relabelled with the population-ranked IDs.
+    """
+    cluster_arr = np.asarray(cluster_arr)
+    unique_clusters, cluster_counts = np.unique(cluster_arr, return_counts=True)
+    # -1 flags unclustered elements: it must never receive a rank
+    is_cluster = unique_clusters != -1
+    unique_clusters = unique_clusters[is_cluster]
+    cluster_counts = cluster_counts[is_cluster]
+
+    # negated counts sort in descending order; the stable sort keeps ties
+    # deterministic (lowest original ID first)
+    sorted_indices = np.argsort(-cluster_counts, kind="stable")
+    sorted_clusters = unique_clusters[sorted_indices]
+    clusters = list(range(1, len(sorted_clusters) + 1))
+    # masks are built from the original labels and applied to a copy, so
+    # a freshly assigned ID can never be relabelled a second time
+    new_cluster_arr = cluster_arr.copy()
+    for new_id, old_id in zip(clusters, sorted_clusters):
+        new_cluster_arr[cluster_arr == old_id] = new_id
+    return clusters, new_cluster_arr
diff --git a/tests/test_cli_re.py b/tests/test_cli_re.py
@@ -159,7 +159,7 @@ def test_cli_reclustrmsd():
         clustrmsd_tsv = Path(interactive_folder, "clustrmsd.tsv")
         assert clustrmsd_tsv.exists()
         lines = clustrmsd_tsv.read_text().splitlines()
-        assert lines[1] == "1\tensemble_4G6M_6_haddock.pdb\tnan\t1"
+        assert lines[1] == "1\tensemble_4G6M_1_haddock.pdb\tnan\t1"
 
         # clustrmsd.txt
         clustrmsd_txt = Path(interactive_folder, "clustrmsd.txt")
@@ -168,6 +168,13 @@ def test_cli_reclustrmsd():
         assert lines[4] == "> criterion=maxclust"
         assert lines[5] == "> n_clusters=2"
 
+        # cluster.out
+        cluster_out = Path(interactive_folder, "cluster.out")
+        assert cluster_out.exists()
+        lines = cluster_out.read_text().splitlines()
+        assert lines[0] == "Cluster 1 -> 1 2 3 4 5 7 8 9"
+        assert lines[1] == "Cluster 2 -> 6 10"
+
         # Test generation of plot
         clustrmsd_html_matrix = Path(interactive_folder, "rmsd_matrix.html")
         assert clustrmsd_html_matrix.exists()

diff --git a/tests/test_module_clustrmsd.py b/tests/test_module_clustrmsd.py
@@ -15,6 +15,7 @@
     get_clusters,
     get_dendrogram,
     iterate_min_population,
+    order_clusters,
     read_matrix,
     )
 from haddock.modules.analysis.rmsdmatrix import DEFAULT_CONFIG as rmsd_pars
@@ -25,7 +26,7 @@
 
 @pytest.fixture
 def output_list():
-    """Clustfcc output list."""
+    """Clustrmsd output list."""
     return [
         "rmsd.matrix",
         "rmsd_matrix.json",
@@ -352,3 +353,27 @@ def test_iterate_min_population():
     exp_cluster_arr = np.array([1, 1, -1, -1, -1])
     assert obs_min_population == 2
     assert (obs_cluster_arr == exp_cluster_arr).all()
+
+
+def test_order_clusters():
+    """Check that order_clusters renumbers clusters by population."""
+    # (input cluster IDs, expected IDs after re-ordering)
+    cases = [
+        # already ordered by population: nothing changes
+        ([1, 1, 2, 3, 4], [1, 1, 2, 3, 4]),
+        # less trivial input, with equally populated clusters
+        (
+            [3, 3, 2, 4, 3, 1, 3, 3, 1, 3, 4],
+            [1, 1, 4, 3, 1, 2, 1, 1, 2, 1, 3],
+            ),
+        # unclustered structures (-1) keep their -1 label
+        (
+            [3, 3, 2, 4, -1, -1, 3, 3, -1, -1, 4, 1],
+            [1, 1, 4, 2, -1, -1, 1, 1, -1, -1, 2, 3],
+            ),
+        ]
+    # every case holds four clusters, so the new IDs are always 1..4
+    for given, expected in cases:
+        obs_clusters, obs_cluster_arr = order_clusters(np.array(given))
+        assert obs_clusters == [1, 2, 3, 4]
+        assert (obs_cluster_arr == np.array(expected)).all()