# Cluster‑Delta Notebook
*Self‑contained, updated 2025-07-14*

Runs end‑to‑end in Colab without manual edits:
1. Reads two Excel files (`plateaus_raw.xlsx`, `plateaus_ops.xlsx`).
2. Computes fractional deltas to α⁻¹, φ, √2, e.
3. Scales the target column (optional) and auto‑derives a suitable `eps` for DBSCAN.
4. Clusters the data, plots results, and outputs per‑cluster delta statistics.
5. Saves `clustered_results.csv` (and optionally a diff vs a previous run).


In [7]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from pathlib import Path


In [8]:

# ============== USER CONFIG (edit if needed) ==============
# Input workbooks; relative paths are resolved against the working directory.
RAW_FILE      = 'plateaus_raw.xlsx'
OPS_FILE      = 'plateaus_ops.xlsx'
NUM_COL       = 'slice'      # numeric column in OPS_FILE to analyse
SCALE_DATA    = True         # standardise NUM_COL before clustering?
MIN_SAMPLES   = 2            # DBSCAN min_samples (sklearn counts the point itself)
# eps is taken as this percentile of each row's nearest-neighbour distance, so
# only roughly PCT_KDIST % of rows have a neighbour within eps; a low value
# therefore labels most rows as noise (cluster -1).
PCT_KDIST     = 5            # percentile of the k-distance (n_neighbors=2 query) to use for eps
EXPORT_NAME   = 'clustered_results.csv'   # CSV that receives ops_df (deltas + cluster labels)
PREVIOUS_CSV  = None         # set to a previous results CSV to diff
NEW_THRESH    = 1e-12        # a value counts as "new" if it is farther than this from every previous value
# ==========================================================


In [9]:

def read_excel_safe(path):
    """Read an Excel workbook into a DataFrame, failing early if it is absent.

    Parameters
    ----------
    path : str or path-like
        Location of the workbook to load.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_excel(path)`` returns with its default arguments.

    Raises
    ------
    FileNotFoundError
        If nothing exists at ``path``; the message tells the user how to
        upload the file in Colab.
    """
    workbook = Path(path)
    if workbook.exists():
        return pd.read_excel(path)
    raise FileNotFoundError(f'Missing {path}. Upload it via Colab → Files sidebar.')

# Load both workbooks up front (raw first, then ops) so a missing upload is
# reported before any computation starts.
raw_df, ops_df = (read_excel_safe(workbook) for workbook in (RAW_FILE, OPS_FILE))
print(f'Loaded raw_df {raw_df.shape}, ops_df {ops_df.shape}')


Loaded raw_df (200, 2), ops_df (200, 1)


In [10]:

# Reference constants; every NUM_COL value is compared against each of them
# and the result is stored in a matching delta_<name> column.
CONST = {
    'alpha_inv': 137.035999084,  # inverse fine-structure constant (looks like the CODATA 2018 value; confirm before updating)
    'phi'      : (1 + 5**0.5)/2,  # golden ratio
    'sqrt2'    : np.sqrt(2),
    'e'        : np.e,  # Euler's number
}

def delta(x, ref):
    """Return the fractional deviation of ``x`` from ``ref``: ``|x - ref| / |ref|``.

    Parameters
    ----------
    x : scalar, numpy array or pandas Series
        Value(s) to compare; anything supporting ``-``, ``abs`` and ``/``.
    ref : scalar
        Non-zero reference value.

    Returns
    -------
    Same kind as ``x``
        Non-negative relative distance to ``ref``.

    Notes
    -----
    The denominator is ``abs(ref)`` so that a negative reference still yields
    a non-negative deviation. A zero ``ref`` is not meaningful here; with
    plain Python numbers it raises ``ZeroDivisionError``.
    """
    return abs(x - ref) / abs(ref)

# Add one delta_<name> column per reference constant. delta() only uses
# arithmetic and abs(), so it is applied to the whole column at once instead
# of calling a Python lambda per row.
for name, ref in CONST.items():
    ops_df[f'delta_{name}'] = delta(ops_df[NUM_COL], ref)


In [11]:

# Feature matrix: the analysed column as an (n_rows, 1) float array.
X = ops_df[[NUM_COL]].values.astype(float)

# Fail with a clear message instead of a cryptic estimator error further down.
if X.shape[0] < 2:
    raise ValueError(
        f'Need at least 2 rows in column {NUM_COL!r} to derive eps, got {X.shape[0]}.'
    )
if not np.isfinite(X).all():
    raise ValueError(
        f'Column {NUM_COL!r} contains NaN or infinite values; clean it before clustering.'
    )

if SCALE_DATA:
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
else:
    X_scaled = X

# Nearest-neighbour distances. The query is run on the fitted points
# themselves, so column 0 is each point's zero distance to itself and
# column 1 is the distance to its closest other point.
# NOTE: the neighbour count is fixed at 2 and does not follow MIN_SAMPLES.
nbrs = NearestNeighbors(n_neighbors=2).fit(X_scaled)
distances, _ = nbrs.kneighbors(X_scaled)
kdist = np.sort(distances[:, 1])
eps_auto = np.percentile(kdist, PCT_KDIST)

# Duplicate values give zero distances; if enough rows are duplicated the
# chosen percentile is 0, which DBSCAN rejects (eps must be > 0).
if eps_auto <= 0:
    positive_kdist = kdist[kdist > 0]
    if positive_kdist.size == 0:
        raise ValueError(
            f'All values in column {NUM_COL!r} are identical; cannot derive a positive eps.'
        )
    eps_auto = positive_kdist[0]  # kdist is sorted, so this is the smallest positive distance
    print(f'Percentile {PCT_KDIST} distance is 0 (duplicate values); '
          'falling back to the smallest positive neighbour distance.')

print(f'Auto‑selected eps = {eps_auto:.6g} (percentile {PCT_KDIST})')


Auto‑selected eps = 0.000334292 (percentile 5)


In [12]:

# Cluster the (optionally scaled) values with the auto-derived radius and
# store each row's label on ops_df; DBSCAN marks noise rows with -1.
model = DBSCAN(min_samples=MIN_SAMPLES, eps=eps_auto)
model.fit(X_scaled)
ops_df['cluster'] = model.labels_
# Label -> row count, largest group first.
print('Cluster counts:', ops_df['cluster'].value_counts().to_dict())


Cluster counts: {-1: 190, 0: 2, 1: 2, 2: 2, 3: 2, 4: 2}


In [None]:

# Optional diff against an earlier export: collect the rows of ops_df whose
# NUM_COL value (rounded to 12 decimals) is farther than NEW_THRESH from
# every value in the previous results file.
new_entries = pd.DataFrame()
if not PREVIOUS_CSV:
    print('Diff step skipped.')
elif not Path(PREVIOUS_CSV).exists():
    # A configured but missing file is reported explicitly rather than
    # looking the same as "no diff requested".
    print(f'Diff step skipped: {PREVIOUS_CSV} not found.')
else:
    prev_df = pd.read_csv(PREVIOUS_CSV)
    if NUM_COL not in prev_df.columns:
        raise KeyError(f'{PREVIOUS_CSV} has no column {NUM_COL!r}; cannot diff.')

    # Sorted, de-duplicated previous values. NaNs are dropped: comparing
    # against NaN is always False, so a single NaN here would otherwise stop
    # every current value from being reported as new.
    prev_vals = np.unique(np.round(prev_df[NUM_COL].dropna().to_numpy(dtype=float), 12))
    curr_vals = np.round(ops_df[NUM_COL].to_numpy(dtype=float), 12)

    if prev_vals.size == 0:
        # Nothing to compare against: every non-NaN current value is new.
        is_new = ~np.isnan(curr_vals)
    else:
        # The closest previous value is one of the two sorted neighbours
        # around the insertion point, so a full pairwise scan is unnecessary.
        pos = np.searchsorted(prev_vals, curr_vals)
        last = prev_vals.size - 1
        below = prev_vals[np.clip(pos - 1, 0, last)]
        above = prev_vals[np.clip(pos, 0, last)]
        nearest_gap = np.minimum(np.abs(curr_vals - below), np.abs(curr_vals - above))
        # NaN current values compare False here and are never flagged as new.
        is_new = nearest_gap > NEW_THRESH

    diff_vals = np.unique(curr_vals[is_new])
    new_entries = ops_df[is_new]
    print(f'Detected {len(new_entries)} new values compared to {PREVIOUS_CSV}')


In [None]:

# Scatter of the analysed column against row index, coloured by cluster label.
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(ops_df.index, ops_df[NUM_COL], c=ops_df['cluster'], s=15)
ax.set_title('Clustered values')
ax.set_xlabel('Index')
ax.set_ylabel(NUM_COL)
plt.show()


In [None]:

# Write the full table (deltas + cluster labels) to EXPORT_NAME.
ops_df.to_csv(EXPORT_NAME, index=False)
print('Results saved to', EXPORT_NAME)

# If the diff step found new rows, write them next to the main export.
if not new_entries.empty:
    # Prefix the file name only, so an EXPORT_NAME that includes a directory
    # (e.g. 'out/results.csv') yields 'out/new_results.csv', not 'new_out/results.csv'.
    export_path = Path(EXPORT_NAME)
    new_name = str(export_path.with_name('new_' + export_path.name))
    new_entries.to_csv(new_name, index=False)
    print('New entries saved to', new_name)
