In [1]:
# Show the version of the Python interpreter this kernel is running.
import sys
print(sys.version)

3.10.11 (v3.10.11:7d4cc5aa85, Apr  4 2023, 19:05:19) [Clang 13.0.0 (clang-1300.0.29.30)]


This notebook demonstrates why it is necessary to scan all values of $k$ to determine the silhouette upper bound, $1-f(i)$, for datapoint $i$.

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from silhouette_upper_bound import upper_bound_samples
from sklearn.metrics import silhouette_samples
from pathlib import Path
import kmedoids
import json 

In [3]:
def _row_f(row: np.ndarray, n: int, m=1) -> float:

    y = np.sum(row[m - 1 :])
    # Initialize q
    if m == 1:
        x = 0
        q = 1
    else:
        x = np.sum(row[: m - 1])
        q = (x / (m - 1)) / (y / (n - m))

    bound_list = [1 - q]
    k_list = [m]

    for k in range(m + 1, n - m + 1):
        d_to_move = row[k - 2]

        x += d_to_move
        y -= d_to_move

        q_candidate = (x / (k - 1)) / (y / (n - k))

        bound_list.append(1 - q_candidate)
        k_list.append(k)

        if q_candidate < q:
            q = q_candidate

    # max(bound_list) corresponds to upper bound 
    return k_list, bound_list

In [4]:
base_path = Path.cwd()

# Load data: the dataset folder holds three .npy arrays and a JSON parameter file.
# Paths are built with pathlib's "/" operator instead of string interpolation so
# they stay Path objects end to end (np.load and open both accept path-likes).

datasets_path = base_path / "datasets"

folder = datasets_path / "1000-300-5-2"

feature_vectors = np.load(folder / "feature_vectors.npy")
labels = np.load(folder / "labels.npy")
distance_matrix = np.load(folder / "distance_matrix.npy")

# Get parameters (later cells read the "centers" and "n_samples" keys)

params_file = folder / "parameters.json"

with params_file.open("r", encoding="utf-8") as f:
    dataset_parameters = json.load(f)

In [5]:
# k-medoids clustering (PAMSIL) on the precomputed distance matrix, with a fixed seed
random_state = 42

cluster_labels = kmedoids.pamsil(
    diss=distance_matrix,
    medoids=dataset_parameters["centers"],
    random_state=random_state,
).labels

# Per-sample silhouette widths; metric='precomputed' because X is already a distance matrix
silh_samples = silhouette_samples(X=distance_matrix, labels=cluster_labels, metric='precomputed')

In [6]:
# Build D_hat: drop the diagonal entry of every row of the distance matrix,
# then sort each row in ascending order (np.sort works along the last axis).
n_points = distance_matrix.shape[0]
off_diagonal = ~np.eye(n_points, dtype=bool)
D_hat = np.sort(distance_matrix[off_diagonal].reshape(n_points, -1))

In [7]:
# Silhouette upper bounds computed from the full distance matrix
# (presumably one value per data point, since the result is indexed per point below).
samples = upper_bound_samples(distance_matrix)

In [8]:
# Investigate the point with the lowest upper bound:
# argm is its index, and the cell displays the bound itself.
argm = np.argmin(samples)
samples[argm]

np.float64(0.6510639355599324)

In [9]:
# Silhouette width that the same point (index argm) actually achieves
# under the PAMSIL clustering, for comparison with its upper bound.
achieved_silh = silh_samples[argm]
achieved_silh

np.float64(0.6457547899839603)

In [10]:
# Sorted, diagonal-free distances of the selected point
row = D_hat[argm,:]
# Scanned k values and the matching list of 1 - q(k)
k_list, bound_list = _row_f(row, dataset_parameters["n_samples"])

In [11]:
# Generate figure: 1 - q(k) as a function of k, compared with the achieved silhouette width

plt.style.use('seaborn-v0_8-white')

# Values of k that get a vertical reference line and a marker on the x-axis
reference_ks = (2, 200)

# Create a figure and an axes object
fig, ax = plt.subplots(figsize=(9, 6))

# Plot the main data line with style (the first entry, k = 1, is skipped).
# NOTE(review): "\k" in the label looks like a typo -- it is not a variable name in
# LaTeX/mathtext. Confirm the intended notation and correct the string.
ax.plot(
    k_list[1:],
    bound_list[1:],
    label=r"$1-q(\k,k)$",
    color="#336699",
    linewidth=3,
    zorder=2,
)

# Add the horizontal reference line using the 'ax' object
ax.axhline(
    y=achieved_silh,
    color="crimson",
    linestyle="--",
    linewidth=2,
    label="Achieved silhouette width",
)

# Add the vertical reference lines using the 'ax' object
for idx, k_ref in enumerate(reference_ks, start=1):
    ax.axvline(
        x=k_ref,
        color="black",
        linestyle="-.",
        linewidth=1.5,
        label=rf"Reference {idx} ($k={k_ref}$)",
        alpha=0.7,
    )

# Set axis limits to give some breathing room.
# This must happen BEFORE the markers below are placed: they are anchored at the
# lower y-limit, and reading that limit earlier would return the autoscaled value
# (which also shifts after every scatter call) instead of the final one.
ax.set_xlim(-10, 310)
ax.set_ylim(0.5, 0.7)

# Optional: mark the reference values of k at the bottom edge of the axes
y_bottom = ax.get_ylim()[0]
for k_ref in reference_ks:
    ax.scatter(k_ref, y_bottom, color='gray', marker='v', s=100, zorder=3)


# --- Labels, Legend, and Styling Refinements ---

ax.set_xlabel(r"$k$", fontsize=14, fontweight='semibold')

# Add a legend (no title) and put it outside the plot area
ax.legend(
    loc="center left",
    bbox_to_anchor=(1, 0.5),
    frameon=False,
    title="",
    fontsize=14,
)

ax.tick_params(axis='both', which='major', labelsize=14)

# Leave the right 15% of the figure free for the legend
fig.tight_layout(rect=[0, 0, 0.85, 1])

# Make sure the output directory exists so saving works on a fresh checkout
figures_dir = base_path / "figures"
figures_dir.mkdir(parents=True, exist_ok=True)

fig.savefig(figures_dir / "q_search_demo.pdf", bbox_inches="tight")
plt.close(fig)


In [12]:
# ASW: mean of the per-sample silhouette widths over the whole dataset
np.mean(silh_samples)

np.float64(0.6677330849537492)