In [None]:
import os
import time
import threading

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.optimize import curve_fit
from sklearn.metrics import r2_score

import psutil
from memory_profiler import memory_usage
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler

# sklearn and sklearnex
from sklearn.cluster import KMeans as SKLearnKMeans
from sklearnex import patch_sklearn

Power-law fitting of running time and peak memory for the FSCS sampling method implemented in matlab-kmeans

In [None]:
# Load the finished experiment table (columns include raw_n, feat_num, level,
# comb_idx, elapsed_time, peak_mem_used).
df = pd.read_csv("FSCS_overhead_with_rawsize.csv")

# Retain only rows whose time and memory measurements are strictly positive
has_positive_time = df["elapsed_time"].gt(0)
has_positive_mem = df["peak_mem_used"].gt(0)
df = df.loc[has_positive_time & has_positive_mem]

# Regressors (n, k, d) followed by the two responses (t, m)
n, k, d = (df[column].values for column in ("raw_n", "level", "feat_num"))
t = df["elapsed_time"].values  # Running time (seconds)
m = df["peak_mem_used"].values  # Peak memory (MB)


# Define memory model (power law form)
def mem_model(X, C, beta, a, b, c):
    """Power-law memory model: ``C + beta * n**a * k**b * d**c``.

    Parameters
    ----------
    X : sequence of three items
        Unpacked as ``(n, k, d)``; each item may be a scalar or an array.
    C : float
        Constant offset added to the power-law term.
    beta : float
        Multiplicative coefficient of the power-law term.
    a, b, c : float
        Exponents applied to ``n``, ``k`` and ``d`` respectively.

    Returns
    -------
    The model value, with the same shape as the broadcast inputs.
    """
    size, clusters, features = X
    # Multiply left to right, exactly as the one-line formula would.
    term = beta * (size**a)
    term = term * (clusters**b)
    term = term * (features**c)
    return C + term


# Optimizer starting point, ordered as (C, beta, a, b, c)
p0_mem = [10000, 1e-8, 1.0, 1.0, 0.0]

# Fit the memory model to the observed peak memory
popt_mem, pcov_mem = curve_fit(
    mem_model,
    (n, k, d),
    m,
    p0=p0_mem,
    maxfev=50000,
)
C_m, beta_m, a_m, b_m, c_m = popt_mem

# Evaluate the fitted model on the same data and score it with R²
m_pred = mem_model((n, k, d), C_m, beta_m, a_m, b_m, c_m)
r2_mem = r2_score(m, m_pred)

# Report the fitted parameters and R²
mem_report = [
    "===== Peak Memory Model (Power Law) =====",
    f"C      = {C_m:.2f}",
    f"β      = {beta_m:.4e}",
    f"a (n^a) = {a_m:.4f}",
    f"b (k^b) = {b_m:.4f}",
    f"c (d^c) = {c_m:.4f}",
    f"R²     = {r2_mem:.4f}",
]
print("\n".join(mem_report))


# Define running time model (power law form)
def time_model(X, C, beta, a, b, c):
    """Power-law running-time model: ``C + beta * n**a * k**b * d**c``.

    Parameters
    ----------
    X : sequence of three items
        Unpacked as ``(n, k, d)``; each item may be a scalar or an array.
    C : float
        Constant offset added to the power-law term.
    beta : float
        Multiplicative coefficient of the power-law term.
    a, b, c : float
        Exponents applied to ``n``, ``k`` and ``d`` respectively.

    Returns
    -------
    The model value, with the same shape as the broadcast inputs.
    """
    size, clusters, features = X
    # Build the product step by step, in the same left-to-right order.
    scaled = beta * (size**a)
    scaled = scaled * (clusters**b)
    scaled = scaled * (features**c)
    return C + scaled


# Optimizer starting point, ordered as (C, beta, a, b, c)
p0_time = [100, 1e-12, 1.0, 1.0, 0.0]

# Fit the running-time model to the observed elapsed times
popt_time, pcov_time = curve_fit(
    time_model,
    (n, k, d),
    t,
    p0=p0_time,
    maxfev=50000,
)
C_t, beta_t, a_t, b_t, c_t = popt_time

# Evaluate the fitted model on the same data and score it with R²
t_pred = time_model((n, k, d), C_t, beta_t, a_t, b_t, c_t)
r2_time = r2_score(t, t_pred)

# Report the fitted parameters and R²
time_report = [
    "===== Running Time Model (Power Law) =====",
    f"C      = {C_t:.2f}",
    f"β      = {beta_t:.4e}",
    f"a (n^a) = {a_t:.4f}",
    f"b (k^b) = {b_t:.4f}",
    f"c (d^c) = {c_t:.4f}",
    f"R²     = {r2_time:.4f}",
]
print("\n".join(time_report))

Comparative analysis of MATLAB, scikit-learn, and scikit-learn-intelex (sklearnex)

In [None]:
# Load the class table and standardize its features.
# iloc[:, 1:-1] leaves out the first and the last column before scaling.
grd = pd.read_csv("sampleddata/class_df.csv")
grd_data = grd.iloc[:, 1:-1].values
scaler = StandardScaler()
grd_scaled = scaler.fit_transform(grd_data).astype(np.float32)

# Benchmark parameters
n_repeats = 5  # Repetitions per (raw_n, k) combination
raw_ns = [50000, 80000, 100000]  # List of raw sample sizes
cluster_counts = [5000, 10000, 20000, 50000, 80000]  # Numbers of clusters to test
max_iter = 100
random_state = 2025


# Randomly sample n_samples from the full dataset
def sample_data(data, n_samples, seed=None):
    """Return ``n_samples`` randomly chosen rows of ``data``.

    When ``n_samples`` is at least the number of rows, ``data`` itself is
    returned unchanged (no copy, no shuffling). Otherwise the row indices are
    shuffled with ``seed`` as the random state and the first ``n_samples`` of
    them select the rows.
    """
    total_rows = data.shape[0]
    if n_samples < total_rows:
        order = shuffle(np.arange(total_rows), random_state=seed)
        chosen = order[:n_samples]
        return data[chosen, :]
    return data


# Standardized float32 feature matrix; the benchmark cells draw their
# subsamples from it via sample_data(X, raw_n, random_state).
X = grd_scaled

In [None]:
# CPU memory monitor (for Jupyter environment)


class JupyterMemoryMonitor:
    """Track the peak resident set size (RSS) of the current process.

    A background thread polls ``psutil.Process.memory_info().rss`` between
    ``start()`` and ``stop()`` and remembers the largest value seen.

    Parameters
    ----------
    interval : float, optional
        Seconds to sleep between two samples (default 0.01). Without a pause
        the polling thread would spin at full speed and compete for CPU with
        the workload being measured.
    """

    def __init__(self, interval=0.01):
        self.process = psutil.Process(os.getpid())
        self.peak_memory = 0  # largest RSS seen so far, in bytes
        self.interval = interval
        self._is_running = False
        self._monitor_thread = None

    def start(self):
        """Reset the recorded peak and launch the background sampling thread."""
        self.peak_memory = 0
        self._is_running = True
        self._monitor_thread = threading.Thread(target=self._monitor)
        self._monitor_thread.daemon = True
        self._monitor_thread.start()

    def stop(self):
        """Stop sampling and return the peak RSS in MB."""
        self._is_running = False
        if self._monitor_thread:
            self._monitor_thread.join()
            self._monitor_thread = None
        # One last sample guarantees a reading even when the workload ended
        # before the background thread managed to take its first one.
        self._sample()
        return self.peak_memory / 1024**2  # Unit: MB

    def _sample(self):
        """Read the current RSS and raise the stored peak if it is exceeded."""
        current = self.process.memory_info().rss
        if current > self.peak_memory:
            self.peak_memory = current

    def _monitor(self):
        """Sampling loop executed by the background thread."""
        while self._is_running:
            self._sample()
            time.sleep(self.interval)

In [None]:
# Measurement function


def measure_kmeans(cls, data, **kwargs):
    """
    Measure running time and peak CPU memory of one KMeans configuration.

    Intended for SKLearnKMeans and SKLearnexKMeans. The model is fitted twice:
    once under ``memory_usage`` to record the peak memory of the fit itself,
    and once more to measure wall-clock time. A JupyterMemoryMonitor runs
    across both fits and reports the process-wide peak.

    Parameters
    ----------
    cls : estimator class
        Called with ``n_clusters``, ``max_iter``, ``n_init``, ``algorithm``
        and ``random_state`` keyword arguments; the instance must offer
        ``fit(data)``.
    data : array-like
        Samples passed to ``fit``.
    **kwargs
        ``n_clusters`` (no default), ``max_iter`` (default 100), ``n_init``
        (default 1), ``algorithm`` (default "lloyd"), ``random_state``
        (default None).

    Returns
    -------
    tuple
        ``(elapsed_time_s, peak_cpu_func_mem_MB, peak_cpu_jupyter_mem_MB)``,
        each rounded to two decimals.
    """
    cpu_monitor = JupyterMemoryMonitor()

    def run_kmeans_cpu():
        km = cls(
            n_clusters=kwargs.get("n_clusters"),
            max_iter=kwargs.get("max_iter", 100),
            n_init=kwargs.get("n_init", 1),
            algorithm=kwargs.get("algorithm", "lloyd"),
            random_state=kwargs.get("random_state", None),
        )
        km.fit(data)

    cpu_monitor.start()
    try:
        # Monitor the peak CPU memory inside the function
        mem_usage = memory_usage(proc=run_kmeans_cpu, interval=0.01)

        # Measure running time with a monotonic clock so that system clock
        # adjustments cannot distort the interval
        t0 = time.perf_counter()
        run_kmeans_cpu()
        elapsed = time.perf_counter() - t0
    finally:
        # Always stop the sampling thread, even when fitting raises;
        # otherwise it would keep polling in the background.
        cpu_peak_jup = cpu_monitor.stop()

    return (
        round(elapsed, 2),
        round(max(mem_usage), 2),
        round(cpu_peak_jup, 2),
    )

In [None]:
# Native scikit-learn benchmark.
# The loop variables are deliberately NOT called `k` or `t`: those names hold
# the regressor/response arrays of the power-law fit, and rebinding them here
# would silently change what later cells see.
results_native = []
for raw_n in raw_ns:
    # One subsample per raw_n, shared by every cluster count and repeat
    X_sampled = sample_data(X, raw_n, random_state)
    for n_clusters in cluster_counts:
        if n_clusters > raw_n:
            continue  # more clusters than samples is not a valid configuration
        for repeat in range(n_repeats):
            elapsed_s, mem_func_mb, mem_jup_mb = measure_kmeans(
                SKLearnKMeans,
                X_sampled,
                n_clusters=n_clusters,
                max_iter=max_iter,
                n_init=1,
                algorithm="lloyd",
                random_state=random_state + repeat,  # distinct seed per repeat
            )
            results_native.append(
                {
                    "raw_n": raw_n,
                    "k": n_clusters,
                    "repeat": repeat,
                    "sklearn_time_s": elapsed_s,
                    "sklearn_cpu_func_mem_MB": mem_func_mb,
                    "sklearn_cpu_jupyter_mem_MB": mem_jup_mb,
                }
            )

df_native = pd.DataFrame(results_native).set_index(["raw_n", "k", "repeat"])
display(df_native)

In [None]:
# sklearnex accelerated benchmark.
# patch_sklearn() has to run before the import below so that the name
# SKLearnexKMeans is bound after patching; that ordering is the only reason
# this import does not sit in the imports cell.
patch_sklearn()
from sklearn.cluster import KMeans as SKLearnexKMeans

# The loop variables are deliberately NOT called `k` or `t`: those names hold
# the regressor/response arrays of the power-law fit, and rebinding them here
# would silently change what later cells see.
results_ex = []
for raw_n in raw_ns:
    # One subsample per raw_n, shared by every cluster count and repeat
    X_sampled = sample_data(X, raw_n, random_state)
    for n_clusters in cluster_counts:
        if n_clusters > raw_n:
            continue  # more clusters than samples is not a valid configuration
        for repeat in range(n_repeats):
            elapsed_s, mem_func_mb, mem_jup_mb = measure_kmeans(
                SKLearnexKMeans,
                X_sampled,
                n_clusters=n_clusters,
                max_iter=max_iter,
                n_init=1,
                algorithm="lloyd",
                random_state=random_state + repeat,  # distinct seed per repeat
            )
            results_ex.append(
                {
                    "raw_n": raw_n,
                    "k": n_clusters,
                    "repeat": repeat,
                    "sklearnex_time_s": elapsed_s,
                    "sklearnex_cpu_func_mem_MB": mem_func_mb,
                    "sklearnex_cpu_jupyter_mem_MB": mem_jup_mb,
                }
            )

df_ex = pd.DataFrame(results_ex).set_index(["raw_n", "k", "repeat"])

In [None]:
# Read the MATLAB measurements and expand them onto the same
# (raw_n, k, repeat) grid as the Python benchmarks.
# The three names below are pre-set so that they exist even when loading
# fails; pd.concat silently drops None entries, so a None placeholder is safe
# to pass along.
has_matlab = False
df_matlab = None
df_matlab_reset = None
try:
    # Read MATLAB data
    matlab_df = pd.read_csv("FSCS_overhead_with_rawsize.csv")

    # Keep only rows with strictly positive time and memory
    matlab_df = matlab_df[
        (matlab_df["elapsed_time"] > 0) & (matlab_df["peak_mem_used"] > 0)
    ]

    matlab_results = []

    # Iterate over all combinations of raw_ns and cluster_counts.
    # The cluster count is not called `k` here because that name holds one of
    # the arrays used by the power-law fit.
    for raw_n in raw_ns:
        for n_clusters in cluster_counts:
            if n_clusters > raw_n:
                continue
            # Rows measured for the current raw_n and cluster count
            subset = matlab_df[
                (matlab_df["raw_n"] == raw_n) & (matlab_df["level"] == n_clusters)
            ]

            # A combination without MATLAB data is kept, filled with NaN
            if subset.empty:
                avg_time, avg_mem = np.nan, np.nan
            else:
                avg_time = subset["elapsed_time"].mean()
                avg_mem = subset["peak_mem_used"].mean()

            # Replicate the average n_repeats times so the index matches the
            # per-repeat rows of the Python results
            for repeat in range(n_repeats):
                matlab_results.append(
                    {
                        "raw_n": raw_n,
                        "k": n_clusters,
                        "repeat": repeat,
                        "matlab_time_s": avg_time,
                        # Only one MATLAB memory figure exists, so it fills
                        # both memory columns
                        "matlab_cpu_func_mem_MB": avg_mem,
                        "matlab_cpu_jupyter_mem_MB": avg_mem,
                    }
                )

    # Indexed frame plus a flat copy with raw_n, k, repeat as regular columns
    df_matlab = pd.DataFrame(matlab_results).set_index(["raw_n", "k", "repeat"])
    df_matlab_reset = df_matlab.reset_index()

except Exception as e:
    # Best effort: report the problem and continue without MATLAB results
    print("Failed to read MATLAB results:", e)
    has_matlab = False
    df_matlab = None
    df_matlab_reset = None
else:
    has_matlab = True
    display(df_matlab)

In [None]:
# Merge all results & calculate the mean per (raw_n, k)

# Every frame joined here is indexed by (raw_n, k, repeat), so axis=1
# concatenation lines the rows up on that shared index. The MATLAB frame is
# optional and only included when it was loaded successfully.
frames_to_merge = [df_native, df_ex]
if has_matlab:
    frames_to_merge.append(df_matlab)
df_comb = pd.concat(frames_to_merge, axis=1)

# Keep the index in the file: it carries raw_n, k and repeat
df_comb.to_csv("output.csv")

# Average over the repeats of each (raw_n, k) combination
df_mean = df_comb.groupby(level=["raw_n", "k"]).mean()


# Keep only the time and memory columns
time_memory_columns = [
    col for col in df_mean.columns if "time_s" in col or "mem_MB" in col
]
df_mean_filtered = df_mean[time_memory_columns]

# Save the comparison table (the averaged results, with raw_n and k)
df_mean_filtered.to_csv("tool compare results comparison.csv")

# Display results
print("===== Mean =====")
display(df_mean_filtered)

In [None]:
# matplotlib, seaborn and numpy are already imported in the first cell.

# Re-read the observed running times so that `t` is guaranteed to line up
# with `t_pred`, even if an intermediate cell rebound the name.
t = df["elapsed_time"].values

# Prepare DataFrames: flat table of the per-(raw_n, k) means
df_plot = df_mean_filtered.reset_index()

# Source column -> legend label, for peak memory
mem_cols = {
    "sklearn_cpu_jupyter_mem_MB": "scikit-learn",
    "sklearnex_cpu_jupyter_mem_MB": "scikit-learnex",
}
if has_matlab:
    mem_cols["matlab_cpu_jupyter_mem_MB"] = "MATLAB"
df_mem = df_plot[["raw_n", "k"] + list(mem_cols.keys())].rename(columns=mem_cols)

# Source column -> legend label, for running time
time_cols = {"sklearn_time_s": "scikit-learn", "sklearnex_time_s": "scikit-learnex"}
if has_matlab:
    time_cols["matlab_time_s"] = "MATLAB"
df_time = df_plot[["raw_n", "k"] + list(time_cols.keys())].rename(columns=time_cols)

# Plot settings
sns.set_style("whitegrid")
plt.rcParams.update({"font.size": 18})

# Create 4×2 grid: row 0 holds the fit diagnostics, rows 1-3 the bar charts
fig, axes = plt.subplots(4, 2, figsize=(18, 24))
fig.subplots_adjust(hspace=0.5, wspace=0.3)

# subplot labels (a)-(h)
labels = ["(a)", "(b)", "(c)", "(d)", "(e)", "(f)", "(g)", "(h)"]

# 1st row: observed-vs-predicted scatter plots with R².
# The dashed red diagonal marks a perfect prediction.
axes[0, 0].scatter(m, m_pred, alpha=0.4)
lims = [min(m.min(), m_pred.min()), max(m.max(), m_pred.max())]
axes[0, 0].plot(lims, lims, "r--")
axes[0, 0].set_xlabel("Observed Peak Memory (MB)")
axes[0, 0].set_ylabel("Predicted Peak Memory (MB)")
axes[0, 0].grid(True)
# The fitted model is the power law C + beta * n^a * k^b * d^c
axes[0, 0].set_title(f"(a) Memory Fit\n(Power Law), R²={r2_mem:.3f}", pad=20)

axes[0, 1].scatter(t, t_pred, alpha=0.4)
lims = [min(t.min(), t_pred.min()), max(t.max(), t_pred.max())]
axes[0, 1].plot(lims, lims, "r--")
axes[0, 1].set_xlabel("Observed Time (s)")
axes[0, 1].set_ylabel("Predicted Time (s)")
axes[0, 1].grid(True)
axes[0, 1].set_title(f"(b) Time Fit\n(Power Law), R²={r2_time:.3f}", pad=20)


# Helper for annotation
def annotate_multiples(ax, df_sub, baseline_col, group_width=0.8):
    """Write "<ratio>×" above each bar, relative to the baseline method.

    Parameters
    ----------
    ax : matplotlib Axes
        Axis holding a grouped bar chart with one group per row of ``df_sub``
        (groups at x = 0, 1, 2, ...) and one bar per method column.
    df_sub : pandas.DataFrame
        One row per group. Every column except ``raw_n`` and ``k`` is treated
        as a method whose value is the bar height.
    baseline_col : str
        Method column that the other methods are divided by.
    group_width : float, optional
        Total width shared by the bars of one group. The default 0.8 equals
        seaborn's default ``width`` for ``barplot``, so each of the ``m``
        dodged bars is ``group_width / m`` wide and the labels are centred on
        the bars for any number of methods.
    """
    methods = [col for col in df_sub.columns if col not in ("raw_n", "k")]
    if not methods:
        return

    bar_step = group_width / len(methods)  # distance between neighbouring bars
    middle = (len(methods) - 1) / 2  # method index that sits on the group centre

    for group_pos, (_, row) in enumerate(df_sub.iterrows()):
        baseline = float(row[baseline_col])
        for method_pos, method in enumerate(methods):
            height = float(row[method])
            if np.isnan(height):
                # No bar is drawn for a missing value, so there is nothing to label
                continue
            # A zero baseline has no meaningful ratio
            multiple = height / baseline if baseline else np.nan
            ax.text(
                group_pos + (method_pos - middle) * bar_step,
                height * 1.01,  # just above the top of the bar
                f"{multiple:.1f}×",
                ha="center",
                va="bottom",
                fontsize=15,
            )


def _style_bar_axis(ax, title, ylabel):
    """Apply the title, axis labels and major/minor grid shared by all bar panels."""
    ax.set_title(title, wrap=True, pad=20)
    ax.set_xlabel("Sampling Dataset Level")
    ax.set_ylabel(ylabel)
    ax.grid(True, which="major", linestyle="-", linewidth=1)
    ax.minorticks_on()
    ax.grid(True, which="minor", linestyle="--", linewidth=0.5, alpha=0.7)


def _hatch_bars_at_height(ax, height):
    """Hatch every bar on ``ax`` whose height matches ``height`` (rtol 1e-3)."""
    for patch in ax.patches:
        if np.isclose(patch.get_height(), height, rtol=1e-3):
            patch.set_hatch("//")
            patch.set_edgecolor("black")


# 2nd–4th rows: bar plots per raw_n
for idx, raw_n in enumerate([50000, 80000, 100000], start=1):
    mem_sub = df_mem[df_mem["raw_n"] == raw_n].copy()
    time_sub = df_time[df_time["raw_n"] == raw_n].copy()

    # For raw_n = 100000, k = 80000 the MATLAB bars show the values predicted
    # by the fitted power-law models (at the mean feature count)
    use_prediction = raw_n == 100000 and has_matlab
    if use_prediction:
        pred_m = mem_model((100000, 80000, d.mean()), *popt_mem)
        pred_t = time_model((100000, 80000, d.mean()), *popt_time)
        mem_sub.loc[mem_sub.k == 80000, "MATLAB"] = pred_m
        time_sub.loc[time_sub.k == 80000, "MATLAB"] = pred_t

    # Long format: one row per (k, Method), as seaborn expects for hue grouping
    mem_melt = mem_sub.melt(
        id_vars="k",
        value_vars=list(mem_cols.values()),
        var_name="Method",
        value_name="Memory",
    )
    time_melt = time_sub.melt(
        id_vars="k",
        value_vars=list(time_cols.values()),
        var_name="Method",
        value_name="Time",
    )

    # Memory bar
    axm = axes[idx, 0]
    sns.barplot(data=mem_melt, x="k", y="Memory", hue="Method", ax=axm)
    _style_bar_axis(
        axm,
        f"{labels[2*idx]} Sampling Dataset Level = {raw_n}\nAverage Peak CPU Memory for KMeans",
        "Average Peak CPU Memory (MB)",
    )
    # Annotate each axis exactly once; a second call would stack duplicate labels
    annotate_multiples(axm, mem_sub, baseline_col="scikit-learn")

    # Time bar
    axt = axes[idx, 1]
    sns.barplot(data=time_melt, x="k", y="Time", hue="Method", ax=axt)
    _style_bar_axis(
        axt,
        f"{labels[2*idx+1]} Sampling Dataset Level = {raw_n}\nAverage Running Time for KMeans",
        "Average Time (s)",
    )
    annotate_multiples(axt, time_sub, baseline_col="scikit-learn")

    # Hatch predicted MATLAB bars so they are visually distinct from measurements
    if use_prediction:
        _hatch_bars_at_height(axm, pred_m)
        _hatch_bars_at_height(axt, pred_t)

plt.tight_layout()
plt.show()