Commit
Time complexity calculation, benchmark results, add __version__ and vectorise set_nodes_supplies (#48)

- Time complexity calculation
- Benchmark scripts, notebook and results
- Add __version__
- Use vectorised set_nodes_supplies
joshlk committed Jul 26, 2023
1 parent 16b2347 commit 8dc01a8
Showing 9 changed files with 520 additions and 2 deletions.
1 change: 1 addition & 0 deletions .bumpversion.cfg
@@ -4,3 +4,4 @@ commit = True
tag = True

[bumpversion:file:setup.cfg]
[bumpversion:file:k_means_constrained/__init__.py]
35 changes: 35 additions & 0 deletions README.md
@@ -80,3 +80,38 @@ clf.labels_
```

</details>

# Time complexity and runtime

k-means-constrained is a more complex algorithm than vanilla k-means and therefore takes longer to execute and has worse scaling characteristics.

Given a number of data points $n$ and clusters $c$, the time complexity of:
* k-means: $\mathcal{O}(nc)$
* k-means-constrained<sup>1</sup>: $\mathcal{O}((n^3c+n^2c^2+nc^3)\log(n+c))$

This assumes a constant number of algorithm iterations and data-point features/dimensions.

If you consider the case where $n$ is of the same order as $c$ ($n \sim c$) then:
* k-means: $\mathcal{O}(n^2)$
* k-means-constrained<sup>1</sup>: $\mathcal{O}(n^4\log(n))$
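To make these asymptotics concrete, the two expressions can be evaluated directly. A minimal sketch (function names are illustrative; constant factors and iteration counts are ignored):

```python
from math import log

def kmeans_ops(n, c):
    # Vanilla k-means: O(n*c)
    return n * c

def kmeans_constrained_ops(n, c):
    # k-means-constrained: O((n^3*c + n^2*c^2 + n*c^3) * log(n + c))
    return (n**3 * c + n**2 * c**2 + n * c**3) * log(n + c)

# With c = n/10, doubling n multiplies the k-means cost by exactly 4,
# but the k-means-constrained cost by ~16 times a slowly growing log factor.
n = 1000
kmeans_ratio = kmeans_ops(2 * n, 2 * n // 10) / kmeans_ops(n, n // 10)
constrained_ratio = kmeans_constrained_ops(2 * n, 2 * n // 10) / kmeans_constrained_ops(n, n // 10)
print(kmeans_ratio, round(constrained_ratio, 1))  # 4.0 17.6
```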

Below is a runtime comparison between k-means and k-means-constrained in which the number of iterations, initializations, multi-process pool size and dimension size are fixed, and the number of data points is always ten times the number of clusters ($n=10c$). As shown above, the time complexity is independent of the minimum or maximum cluster size, so no size constraints are applied below.

![Data-points vs execution time for k-means vs k-means-constrained. Data-points=10*clusters. No min/max constraints](https://raw.githubusercontent.com/joshlk/k-means-constrained/master/ect/execution_time.png)

<details>
<summary>System details</summary>

* OS: Linux-5.15.0-75-generic-x86_64-with-glibc2.35
* CPU: AMD EPYC 7763 64-Core Processor
* CPU cores: 120
* k-means-constrained version: 0.7.3
* numpy version: 1.24.2
* scipy version: 1.11.1
* ortools version: 9.6.2534
* joblib version: 1.3.1
* sklearn version: 1.3.0
</details>

---

<sup>1</sup>: The [OR-Tools documentation](https://developers.google.com/optimization/reference/graph/min_cost_flow) states the time complexity of its cost-scaling push-relabel algorithm for the min-cost flow problem as $\mathcal{O}(n^2m\log(nC))$, where $n$ is the number of nodes, $m$ is the number of edges and $C$ is the maximum absolute edge cost.
369 changes: 369 additions & 0 deletions etc/benchmark.ipynb


53 changes: 53 additions & 0 deletions etc/benchmark_k_means.py
@@ -0,0 +1,53 @@
#!/usr/bin/env python3

from argparse import ArgumentParser
from sklearn.cluster import KMeans
import numpy as np
import time
import os

p = ArgumentParser()
p.add_argument("-n", "--data-points", required=True, type=int, help="Number of data-points")
p.add_argument("-d", "--dimensions", required=True, type=int, help="Number of dimensions/features each data-point has")
p.add_argument("-K", "--clusters", required=True, type=int, help="Number of clusters")
p.add_argument("-ge", "--min-cluster-size", default=None, help="Minimum number of data-points assigned to each cluster")
p.add_argument("-le", "--max-cluster-size", default=None, help="Maximum number of data-points assigned to each cluster")
p.add_argument("-s", "--seed", type=int, default=42, help="Random state seed")
p.add_argument("-i", "--info", action='store_true', default=False, help="Print system info. `cpuinfo` is required to be installed.")
args = p.parse_args()

print(f"K-means benchmark: data-points={args.data_points}, dimensions={args.dimensions}, clusters={args.clusters}, min-cluster-size={args.min_cluster_size}, max-cluster-size={args.max_cluster_size}, seed={args.seed}")

if args.info:
import scipy, ortools, joblib, platform, cpuinfo, sklearn, k_means_constrained
print(f"OS: {platform.platform()}")
print(f"CPU: {cpuinfo.get_cpu_info()['brand_raw']}")
print(f"CPU cores: {cpuinfo.get_cpu_info()['count']}")
print(f"k-means-constrained version: {k_means_constrained.__version__}")
print(f"numpy version: {np.__version__}")
print(f"scipy version: {scipy.__version__}")
print(f"ortools version: {ortools.__version__}")
print(f"joblib version: {joblib.__version__}")
print(f"sklearn version: {sklearn.__version__}")

np.random.seed(args.seed)

X = np.random.rand(args.data_points, args.dimensions)

os.environ['OMP_NUM_THREADS'] = '10'  # Used instead of joblib/n_jobs in recent sklearn versions; ideally set before numpy/sklearn are imported so the OpenMP runtime picks it up

t = time.perf_counter()
clf = KMeans(
n_clusters=args.clusters,
random_state=args.seed+1,
algorithm='lloyd',
init='k-means++',
n_init=10,
max_iter=300,
tol=0.0001,
)
clf.fit_predict(X)

total_time = time.perf_counter() - t
print(f"Total time: {total_time:.2f} seconds")

60 changes: 60 additions & 0 deletions etc/benchmark_k_means_constrained.py
@@ -0,0 +1,60 @@
#!/usr/bin/env python3

from argparse import ArgumentParser
import k_means_constrained
import numpy as np
import time
import logging
import os

p = ArgumentParser()
p.add_argument("-n", "--data-points", required=True, type=int, help="Number of data-points")
p.add_argument("-d", "--dimensions", required=True, type=int, help="Number of dimensions/features each data-point has")
p.add_argument("-K", "--clusters", required=True, type=int, help="Number of clusters")
p.add_argument("-ge", "--min-cluster-size", default=None, help="Minimum number of data-points assigned to each cluster")
p.add_argument("-le", "--max-cluster-size", default=None, help="Maximum number of data-points assigned to each cluster")
p.add_argument("-s", "--seed", type=int, default=42, help="Random state seed")
p.add_argument("-i", "--info", action='store_true', default=False, help="Print system info. `cpuinfo` is required to be installed.")
args = p.parse_args()


logging.basicConfig(
level=os.environ.get('LOGLEVEL', 'DEBUG').upper()
)

print(f"K-means-constrained benchmark: data-points={args.data_points}, dimensions={args.dimensions}, clusters={args.clusters}, min-cluster-size={args.min_cluster_size}, max-cluster-size={args.max_cluster_size}, seed={args.seed}")

if args.info:
import scipy, ortools, joblib, platform, cpuinfo, sklearn, k_means_constrained
print(f"OS: {platform.platform()}")
print(f"CPU: {cpuinfo.get_cpu_info()['brand_raw']}")
print(f"CPU cores: {cpuinfo.get_cpu_info()['count']}")
print(f"k-means-constrained version: {k_means_constrained.__version__}")
print(f"numpy version: {np.__version__}")
print(f"scipy version: {scipy.__version__}")
print(f"ortools version: {ortools.__version__}")
print(f"joblib version: {joblib.__version__}")
print(f"sklearn version: {sklearn.__version__}")

np.random.seed(args.seed)

X = np.random.rand(args.data_points, args.dimensions)

t = time.perf_counter()
clf = k_means_constrained.KMeansConstrained(
n_clusters=args.clusters,
size_min=int(args.min_cluster_size) if args.min_cluster_size else None,
size_max=int(args.max_cluster_size) if args.max_cluster_size else None,
random_state=args.seed+1,
    # algorithm='lloyd' is implied by KMeansConstrained
init='k-means++',
n_init=10,
max_iter=300,
tol=0.0001,
n_jobs=10,
)
clf.fit_predict(X)

total_time = time.perf_counter() - t
print(f"Total time: {total_time:.2f} seconds")

File renamed without changes.
Binary file added etc/execution_time.png
1 change: 1 addition & 0 deletions k_means_constrained/__init__.py
@@ -1,5 +1,6 @@

__all__ = ['KMeansConstrained']
__version__ = '0.7.3'

from .k_means_constrained_ import KMeansConstrained

3 changes: 1 addition & 2 deletions k_means_constrained/k_means_constrained_.py
@@ -483,8 +483,7 @@ def solve_min_cost_flow_graph(edges, costs, capacities, supplies, n_C, n_X):
min_cost_flow.add_arcs_with_capacity_and_unit_cost(edges[:, 0], edges[:, 1], capacities, costs)

# Add node supplies
for count, supply in enumerate(supplies):
min_cost_flow.set_node_supply(count, supply)
min_cost_flow.set_nodes_supplies(np.arange(len(supplies)), supplies)

# Find the minimum cost flow between node 0 and node 4.
if min_cost_flow.solve() != min_cost_flow.OPTIMAL:
