# Autoscan Notebook v1
*Last updated 2025-07-14*

**Purpose**
This standalone notebook lets you:
1. Upload two Excel files (raw & ops) *or* use autogenerated dummy data.
2. Compute the fractional delta of each value relative to several reference constants (α⁻¹, φ, √2, e).
3. Cluster a chosen numeric column with DBSCAN or MeanShift.
4. (Optionally) detect new values vs a previous results CSV.
5. Export a clean CSV of results.

Everything runs locally inside Colab—no external drives or prior chats needed.

In [None]:

# ============== CONFIGURATION ==============
# Edit these values first, then run the remaining cells from top to bottom.

# Name of the numeric column in the ops data that the later cells analyse.
NUM_COL = 'slice'

# 'euclidean' or 'manhattan' selects DBSCAN with that metric; any other
# value makes the clustering cell use MeanShift instead.
CLUST_METRIC = 'euclidean'

# Passed as DBSCAN's eps or MeanShift's bandwidth, whichever model is used.
BANDWIDTH = 0.001

# File name the results CSV is written to.
EXPORT_NAME = 'autoscan_results.csv'

# CSV from an earlier run to diff against, or None to skip the diff step.
PREVIOUS_CSV = None

# Absolute difference a value must exceed, against every previous value,
# to be reported as new by the diff step.
NEW_THRESHOLD = 1e-12
# ===========================================


In [None]:

import pandas as pd
import numpy as np
from google.colab import files
from pathlib import Path
import io, os

def choose_data():
    """Return ``(raw_df, ops_df)`` from uploaded Excel files or dummy data.

    Opens the Colab upload dialog. When at least two ``.xlsx`` files are
    received (extension matched case-insensitively), the first is read as
    the raw frame and the second as the ops frame; any further files are
    ignored. Otherwise 200 autogenerated values are returned instead.

    Returns:
        tuple: ``(raw_df, ops_df)`` as pandas DataFrames.
    """
    print('▶ Upload two Excel files (raw & ops). If you skip, dummy data will be used.')
    uploaded = files.upload()
    # Case-insensitive match so that e.g. 'DATA.XLSX' is not silently ignored.
    excel_paths = [name for name in uploaded if name.lower().endswith('.xlsx')]

    if len(excel_paths) >= 2:
        raw_name, ops_name = excel_paths[:2]
        raw_df = pd.read_excel(raw_name)
        ops_df = pd.read_excel(ops_name)
        print(f'✔ Loaded {raw_name} and {ops_name}')
        return raw_df, ops_df

    if excel_paths:
        # A single workbook cannot fill both roles; say so instead of
        # discarding the upload without explanation.
        print(f'⚠ Only one .xlsx file received ({excel_paths[0]}); two are required.')

    # Dummy fallback. The ops column is named after NUM_COL when the
    # configuration cell has been run, so the later cells still find it if
    # NUM_COL was changed; 'slice' is the default name.
    num_col = globals().get('NUM_COL', 'slice')
    x = np.linspace(0.0005, 0.015, 200)
    raw_df = pd.DataFrame({'dummy': x})
    ops_df = pd.DataFrame({num_col: x + np.random.normal(0, 1e-4, len(x))})
    print('⚠ Using autogenerated dummy data (200 values).')
    return raw_df, ops_df

# Load the two DataFrames (uploaded workbooks or the dummy fallback).
raw_df, ops_df = choose_data()
# Last expression of the cell: shows the first rows of both frames.
raw_df.head(), ops_df.head()


In [None]:

# Mathematical constants
import numpy as np

# Reference values each entry of the analysed column is compared against.
# The key becomes the suffix of the matching 'delta_<key>' column.
CONST = dict(
    # presumably the inverse fine-structure constant — confirm the source
    alpha_inv=137.035999084,
    # golden ratio, (1 + sqrt(5)) / 2
    phi=(1 + 5**0.5) / 2,
    # square root of two
    sqrt2=2**0.5,
    # Euler's number
    e=np.e,
)

def delta(x, ref):
    """Return the fractional distance of ``x`` from ``ref``.

    Computed as ``|x - ref| / |ref|``. The denominator uses the magnitude
    of ``ref`` so the result is non-negative for negative references too.

    Args:
        x: Value to compare; any object supporting ``-``, ``abs`` and ``/``
            (a scalar or an array-like such as a pandas Series).
        ref: Non-zero reference value.

    Returns:
        The non-negative fractional difference, with the same shape as ``x``.

    Raises:
        ZeroDivisionError: If ``ref`` is a plain Python zero.
    """
    return abs(x - ref) / abs(ref)

# Add one 'delta_<name>' column per reference constant. delta() only uses
# subtraction, abs() and division, so it is applied to the whole column at
# once instead of element by element through apply() and a lambda.
for name, ref in CONST.items():
    ops_df[f'delta_{name}'] = delta(ops_df[NUM_COL], ref)

# Last expression of the cell: preview the frame with the new columns.
ops_df.head()


In [None]:

from sklearn.cluster import DBSCAN, MeanShift
# Single-column 2-D array (n_samples, 1): the double brackets keep the
# column dimension that the clustering estimators are fitted on.
vals = ops_df[[NUM_COL]].values

# DBSCAN handles the two supported metrics; every other CLUST_METRIC value
# falls back to MeanShift, which is not given the metric.
model = (
    DBSCAN(eps=BANDWIDTH, metric=CLUST_METRIC, min_samples=2)
    if CLUST_METRIC in {'euclidean', 'manhattan'}
    else MeanShift(bandwidth=BANDWIDTH)
).fit(vals)

# Store each row's cluster label next to its value and summarise the sizes.
ops_df['cluster'] = model.labels_
print('Cluster counts:\n', ops_df['cluster'].value_counts())


In [None]:

import pandas as pd, numpy as np, os
# Rows of ops_df whose NUM_COL value is not within NEW_THRESHOLD of any
# value from the previous run; stays empty when the diff step is skipped.
new_entries = pd.DataFrame()

if PREVIOUS_CSV and Path(PREVIOUS_CSV).exists():
    prev_df = pd.read_csv(PREVIOUS_CSV)
    if NUM_COL not in prev_df.columns:
        raise KeyError(f'Column {NUM_COL!r} not found in {PREVIOUS_CSV}')

    # Both sides are rounded to 12 decimals before comparing. Missing values
    # are dropped from the previous run: a comparison against NaN is always
    # False, so a single NaN would stop every current value being reported.
    prev_vals = np.unique(
        np.round(prev_df[NUM_COL].dropna().to_numpy(dtype=float), 12)
    )
    curr_vals = np.round(ops_df[NUM_COL].to_numpy(dtype=float), 12)

    if prev_vals.size:
        # prev_vals is sorted (np.unique), so the closest previous value is
        # one of the two neighbours of each insertion point. This replaces
        # the all-pairs comparison with a binary search per value.
        pos = np.searchsorted(prev_vals, curr_vals)
        last = prev_vals.size - 1
        below = prev_vals[np.clip(pos - 1, 0, last)]
        above = prev_vals[np.clip(pos, 0, last)]
        nearest = np.minimum(np.abs(curr_vals - below),
                             np.abs(curr_vals - above))
    else:
        # Nothing to compare against: every real value is new.
        nearest = np.full(curr_vals.shape, np.inf)

    # A value is new when even its closest previous value is farther away
    # than NEW_THRESHOLD. Missing current values are never reported as new.
    is_new = (nearest > NEW_THRESHOLD) & ~np.isnan(curr_vals)
    new_entries = ops_df[is_new]
    print(f'▲ {len(new_entries)} new values compared to {PREVIOUS_CSV}')
elif PREVIOUS_CSV:
    # A file name was configured but the file is missing; report that
    # instead of claiming nothing was provided.
    print(f'Diff step skipped ({PREVIOUS_CSV} not found).')
else:
    print('Diff step skipped (no previous CSV provided).')


In [None]:

import matplotlib.pyplot as plt
# Scatter of the analysed column against the row index, one colour per
# cluster label.
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(ops_df.index, ops_df[NUM_COL], c=ops_df['cluster'])
ax.set_title('Clustered values')
ax.set_xlabel('Index')
ax.set_ylabel(NUM_COL)
plt.show()


In [None]:

# Write the full results table (without the index column).
ops_df.to_csv(EXPORT_NAME, index=False)
print(f'✔ Results saved to {EXPORT_NAME}')
if not new_entries.empty:
    # Prefix only the file name, so an EXPORT_NAME with a directory part
    # ('out/results.csv') yields 'out/new_entries_results.csv' rather than
    # a path inside a non-existent 'new_entries_out' directory.
    export_path = Path(EXPORT_NAME)
    new_name = str(export_path.with_name('new_entries_' + export_path.name))
    new_entries.to_csv(new_name, index=False)
    print(f'✔ New entries saved to {new_name}')
