In [1]:
import numpy as np
import time
import pandas as pd
from pathlib import Path
from silhouette_upper_bound import upper_bound
from sklearn.metrics import pairwise_distances

In [2]:
# ============================================================
# CONFIGURATION
# ============================================================

# CSV file that receives the timing table written by run_experiments()
OUTPUT_FILE = Path("runtime_results_D_hat.csv")

# Sample counts (n) to benchmark -- adjust as needed
SIZES = [2_000, 5_000, 10_000, 20_000, 30_000, 40_000]

# Dimensionality of the synthetic points
DIM = 32

# How many timed repetitions are averaged for each dataset size
REPEATS = 5

In [3]:
# ============================================================
# RUNTIME MEASUREMENT
# ============================================================
def measure_runtime(func, D, repeats=5):
    """Time ``func(D)`` over several runs and summarise the wall-clock cost.

    Parameters
    ----------
    func : callable
        Single-argument callable to benchmark; its return value is discarded.
    D : object
        The argument handed to ``func`` on every run.
    repeats : int, default 5
        Number of timed calls. Must be at least 1.

    Returns
    -------
    tuple
        ``(mean, std)`` of the per-call runtimes in seconds, as computed by
        ``np.mean`` and ``np.std`` (population standard deviation).

    Raises
    ------
    ValueError
        If ``repeats`` is smaller than 1. Without this guard no call would be
        timed and the statistics would silently come back as ``nan``.
    """
    if repeats < 1:
        raise ValueError(f"repeats must be >= 1, got {repeats}")

    runtimes = []
    for _ in range(repeats):
        start = time.perf_counter()
        func(D)
        end = time.perf_counter()
        runtimes.append(end - start)

    return np.mean(runtimes), np.std(runtimes)

In [4]:
# ============================================================
# MAIN LOOP
# ============================================================
def _construct_d_hat(D):
    """Drop the diagonal of the square matrix ``D`` and sort every row.

    A boolean mask removes the ``n`` diagonal entries, the remaining values
    are reshaped to ``(n, n - 1)`` and ``np.sort`` orders each row ascending
    (its default is the last axis). This is the operation being benchmarked.
    """
    return np.sort(
        D[~np.eye(D.shape[0], dtype=bool)].reshape(D.shape[0], -1)
    )


def run_experiments():
    """Benchmark the construction of D_hat for every size in ``SIZES``.

    For each ``n`` a synthetic Gaussian data set of shape ``(n, DIM)`` is
    generated with a fixed seed, its Euclidean distance matrix is computed,
    and ``_construct_d_hat`` is timed ``REPEATS`` times on that matrix via
    ``measure_runtime``. Progress and timings are printed, and the collected
    results are written to ``OUTPUT_FILE`` as CSV (without the index).

    To benchmark another stage instead, swap the measured callable, e.g.
    ``measure_runtime(upper_bound, D, REPEATS)`` for the full algorithm or
    ``measure_runtime(pairwise_distances, X, REPEATS)`` for building the
    distance matrix, and point ``OUTPUT_FILE`` at a different CSV.

    Returns
    -------
    pandas.DataFrame
        One row per size with the columns ``n_samples``, ``dim``, ``repeats``,
        ``runtime_mean_sec`` and ``runtime_std_sec``.
    """
    results = []

    for n in SIZES:
        print(f"Running n={n} ...")

        # Synthetic Gaussian data; reseeding per size keeps each X reproducible
        np.random.seed(0)
        X = np.random.randn(n, DIM).astype(np.float32)

        # Input of the benchmarked step: the full n x n distance matrix
        D = pairwise_distances(X, metric="euclidean")

        # Constructing D_hat
        mean_rt, std_rt = measure_runtime(_construct_d_hat, D, REPEATS)

        print(f"  mean runtime: {mean_rt:.4f}s   std: {std_rt:.4f}s")

        results.append({
            "n_samples": n,
            "dim": DIM,
            "repeats": REPEATS,
            "runtime_mean_sec": mean_rt,
            "runtime_std_sec": std_rt
        })

        # Release the n x n matrix now; otherwise it stays referenced while the
        # next (larger) matrix is being allocated, inflating peak memory.
        del X, D

    df = pd.DataFrame(results)
    df.to_csv(OUTPUT_FILE, index=False)
    print(f"\nSaved results to: {OUTPUT_FILE.resolve()}")

    return df

In [5]:
# Run the full sweep over SIZES. The function prints per-size timings, writes
# the CSV, and returns the results DataFrame, which is shown as the cell output.
run_experiments()

Running n=2000 ...
  mean runtime: 0.0607s   std: 0.0097s
Running n=5000 ...
  mean runtime: 0.3121s   std: 0.0294s
Running n=10000 ...
  mean runtime: 1.3366s   std: 0.0222s
Running n=20000 ...
  mean runtime: 5.8854s   std: 0.1388s
Running n=30000 ...
  mean runtime: 17.4153s   std: 2.1127s
Running n=40000 ...
  mean runtime: 128.7767s   std: 13.4014s

Saved results to: /Users/hugostrang/projects/silhouette-upper-bound/experiments/runtime_results_D_hat.csv


Unnamed: 0,n_samples,dim,repeats,runtime_mean_sec,runtime_std_sec
0,2000,32,5,0.060692,0.009685
1,5000,32,5,0.312119,0.029441
2,10000,32,5,1.336632,0.022223
3,20000,32,5,5.885444,0.138761
4,30000,32,5,17.41526,2.112666
5,40000,32,5,128.77666,13.40145


In [6]:
import matplotlib.pyplot as plt
import seaborn as sns # Seaborn provides better default styles and color palettes
from matplotlib.ticker import ScalarFormatter

In [7]:
# Use a clean style
plt.style.use("seaborn-v0_8-whitegrid")

# Load the three benchmark result files. Each CSV must provide the columns
# 'n_samples', 'runtime_mean_sec' and 'runtime_std_sec'.
df = pd.read_csv("runtime_results.csv")  # plotted as 'Algorithm 1'
df_distance_matrix = pd.read_csv("runtime_results_distance_matrix.csv")
df_D_hat = pd.read_csv("runtime_results_D_hat.csv")

fig, ax = plt.subplots(figsize=(9, 6))

# Per-series styling. Index 0 is used for Algorithm 1, index 1 for the D_hat
# series (which sets its hollow-circle marker explicitly instead of using
# markers[1]) and index 2 for the dissimilarity matrix. The fourth colour is
# currently unused.
colors = ['#000000', '#969696', '#6baed6', '#31a354']
markers = ['o', '', 'x']
linestyles = ['-', '-', '--']
linewidths = [1.5, 1.5, 1.5]

# --- 1. Plot the actual experimental data WITH error bars (Algorithm 1) ---
ax.errorbar(
    df["n_samples"],
    df["runtime_mean_sec"],
    yerr=df["runtime_std_sec"],
    marker=markers[0],
    linestyle=linestyles[0],
    color=colors[0],
    linewidth=linewidths[0],
    markersize=7,
    capsize=3,
    label='Algorithm 1'
)

# --- 2. Add D_hat data ---
# Raw strings keep the LaTeX backslashes literal; in a normal string "\h" and
# "\D" are invalid escape sequences and trigger a warning on recent Pythons.
ax.errorbar(
    df_D_hat["n_samples"],
    df_D_hat["runtime_mean_sec"],
    yerr=df_D_hat["runtime_std_sec"],
    marker='o',              # The circle marker shape
    markerfacecolor='none',  # Hollow marker to tell it apart from Algorithm 1
    linestyle=linestyles[1],
    color=colors[1],
    linewidth=linewidths[1],
    markersize=7,
    capsize=3,
    label=r'Constructing $\hat{\Delta}$ (part of Algorithm 1)'
)

# --- 3. Reference: distance matrix computation (Dissimilarity Matrix) ---
ax.errorbar(
    df_distance_matrix["n_samples"],
    df_distance_matrix["runtime_mean_sec"],
    yerr=df_distance_matrix["runtime_std_sec"],
    marker=markers[2],
    linestyle=linestyles[2],
    color=colors[2],
    linewidth=linewidths[2],
    markersize=7,
    capsize=3,
    label=r'Computing dissimilarity matrix $\Delta$'
)


# --- 4. Axes, legend and tick formatting ---

ax.set_xlabel("Number of samples ($n$)", fontsize=14)
ax.set_ylabel("Runtime (seconds)", fontsize=14)

# Runtimes span several orders of magnitude, so only the y-axis is logarithmic
#ax.set_xscale('log')
ax.set_yscale('log')

#ax.grid(True, axis="x", which='both', linestyle='--', linewidth=0.5, alpha=0.7)

# Simple boxed legend in the upper-left corner
ax.legend(loc='upper left', frameon=True, shadow=False, borderpad=0.8, fontsize=14, handlelength=3)

# Remove top/right spines and keep ticks on the remaining axes only
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')

# Plain-number tick labels on the x-axis
ax.xaxis.set_major_formatter(ScalarFormatter())

# '%g' prints the log-axis ticks as plain numbers (0.1, 1, 10, ...) rather
# than as powers of ten
form = plt.FormatStrFormatter('%g')
ax.yaxis.set_major_formatter(form)

# Put one x tick at every benchmarked sample size
ax.set_xticks(df["n_samples"].values)
ax.tick_params(axis='x', rotation=45, which='major', labelsize=14)
ax.tick_params(axis='y', which='major', labelsize=14)

plt.tight_layout()
fig.savefig("runtime.pdf", bbox_inches="tight", dpi=300)
plt.close(fig)