<a href="https://colab.research.google.com/github/korkutanapa/ANOMALY_DETECTION_TDA_YAHOO_DATASET/blob/main/TDA_codes_for_YAHOO_A3_A4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# FULL CODE FOR UNSUPERVISED ANOMALY DETECTION VIA TDA

In [None]:
# @title necessary libraries
# Install
!pip install -U "numpy>=2.0,<2.3" "scipy>=1.11" ripser persim

In [None]:
# @title necessary libraries
!pip install ruptures

# MAIN CODE TDA AND VAAD

In [1]:
# @title segmentation
import pandas as pd
import numpy as np
import ruptures as rpt
import sys

# Define the file path pattern
DATA_PATTERN = "/content/A3Benchmark-TS{ts_id}.csv"
window_size = 100

print(f"Starting batch changepoint detection for 100 files...")
print(f"Using window size: {window_size}")
print("-" * 50)

# Loop from 1 to 100 (inclusive)
for ts_id in range(1, 101):
    filepath = DATA_PATTERN.format(ts_id=ts_id)
    print(f"\nProcessing file: {filepath}")

    try:
        # --- Load the dataset ---
        df = pd.read_csv(filepath)

        # --- 1. Calculate Rolling Statistics ---
        df['rolling_mean'] = df['value'].rolling(window=window_size, center=True).mean()
        df['rolling_variance'] = df['value'].rolling(window=window_size, center=True).var()

        # --- 2. Run Changepoint Detection ---

        # Create the signal from rolling variance
        signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values

        # Check for bad signal (e.g., file shorter than window size)
        if np.all(np.isnan(signal)):
            print("Signal is all NaN. Skipping detection.")
            continue

        # Use 'Pelt' algorithm
        algo = rpt.Pelt(model="l2").fit(signal)

        # Define a penalty value
        n_samples = len(signal)
        sigma_sq = np.nanvar(signal) # Use nan-safe variance

        # Avoid division by zero or invalid penalty if variance is 0
        if sigma_sq == 0:
            print("Signal variance is zero. Skipping detection.")
            continue

        # Using the 50* multiplier from your code
        penalty_value = 50 * np.log(n_samples) * sigma_sq

        # Predict the changepoints using the penalty
        new_changepoints_idx = algo.predict(pen=penalty_value)

        # Clean up the list (remove the end-of-signal index)
        new_changepoints = [idx for idx in new_changepoints_idx if idx < len(signal)]

        # --- 3. Print Ground Truth and Detection Results ---

        # Get the original changepoints from the file
        original_changepoints = []
        if 'changepoint' in df.columns:
            original_changepoints = df.index[df['changepoint'] == 1].tolist()

        print("\n--- Detection Results ---")
        print(f"Original 'Ground Truth' changepoints: {original_changepoints}")
        print(f"Found {len(new_changepoints)} new changepoints: {new_changepoints}")

        # --- 4. Add new column and Overwrite file ---
        df['new_cp'] = 0
        df.loc[new_changepoints, 'new_cp'] = 1

        print(f"Updating and overwriting {filepath}...")
        # Save the file, overwriting the original. index=False stops
        # pandas from adding an extra 'Unnamed: 0' column.
        df.to_csv(filepath, index=False)
        print("File update complete.")

    except FileNotFoundError:
        print(f"File not found: {filepath}. Skipping.")
    except Exception as e:
        # Catch any other errors during processing
        print(f"An error occurred processing {filepath}: {e}", file=sys.stderr)
        print("Skipping this file.")

print("-" * 50)
print("Batch processing complete.")

Starting batch changepoint detection for 100 files...
Using window size: 100
--------------------------------------------------

Processing file: /content/A3Benchmark-TS1.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS1.csv...
File update complete.

Processing file: /content/A3Benchmark-TS2.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS2.csv...
File update complete.

Processing file: /content/A3Benchmark-TS3.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS3.csv...
File update complete.

Processing file: /content/A3Benchmark-TS4.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS4.csv...
File update complete.

Processing file: /content/A3Benchmark-TS5.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS5.csv...
File update complete.

Processing file: /content/A3Benchmark-TS6.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS6.csv...
File update complete.

Processing file: /content/A3Benchmark-TS7.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS7.csv...
File update complete.

Processing file: /content/A3Benchmark-TS8.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS8.csv...
File update complete.

Processing file: /content/A3Benchmark-TS9.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS9.csv...
File update complete.

Processing file: /content/A3Benchmark-TS10.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS10.csv...
File update complete.

Processing file: /content/A3Benchmark-TS11.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 1 new changepoints: [200]
Updating and overwriting /content/A3Benchmark-TS11.csv...
File update complete.

Processing file: /content/A3Benchmark-TS12.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS12.csv...
File update complete.

Processing file: /content/A3Benchmark-TS13.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS13.csv...
File update complete.

Processing file: /content/A3Benchmark-TS14.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS14.csv...
File update complete.

Processing file: /content/A3Benchmark-TS15.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS15.csv...
File update complete.

Processing file: /content/A3Benchmark-TS16.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS16.csv...
File update complete.

Processing file: /content/A3Benchmark-TS17.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS17.csv...
File update complete.

Processing file: /content/A3Benchmark-TS18.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS18.csv...
File update complete.

Processing file: /content/A3Benchmark-TS19.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS19.csv...
File update complete.

Processing file: /content/A3Benchmark-TS20.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS20.csv...
File update complete.

Processing file: /content/A3Benchmark-TS21.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS21.csv...
File update complete.

Processing file: /content/A3Benchmark-TS22.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS22.csv...
File update complete.

Processing file: /content/A3Benchmark-TS23.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS23.csv...
File update complete.

Processing file: /content/A3Benchmark-TS24.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS24.csv...
File update complete.

Processing file: /content/A3Benchmark-TS25.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS25.csv...
File update complete.

Processing file: /content/A3Benchmark-TS26.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS26.csv...
File update complete.

Processing file: /content/A3Benchmark-TS27.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS27.csv...
File update complete.

Processing file: /content/A3Benchmark-TS28.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS28.csv...
File update complete.

Processing file: /content/A3Benchmark-TS29.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS29.csv...
File update complete.

Processing file: /content/A3Benchmark-TS30.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS30.csv...
File update complete.

Processing file: /content/A3Benchmark-TS31.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 2 new changepoints: [660, 765]
Updating and overwriting /content/A3Benchmark-TS31.csv...
File update complete.

Processing file: /content/A3Benchmark-TS32.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS32.csv...
File update complete.

Processing file: /content/A3Benchmark-TS33.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS33.csv...
File update complete.

Processing file: /content/A3Benchmark-TS34.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS34.csv...
File update complete.

Processing file: /content/A3Benchmark-TS35.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS35.csv...
File update complete.

Processing file: /content/A3Benchmark-TS36.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS36.csv...
File update complete.

Processing file: /content/A3Benchmark-TS37.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS37.csv...
File update complete.

Processing file: /content/A3Benchmark-TS38.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS38.csv...
File update complete.

Processing file: /content/A3Benchmark-TS39.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS39.csv...
File update complete.

Processing file: /content/A3Benchmark-TS40.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS40.csv...
File update complete.

Processing file: /content/A3Benchmark-TS41.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS41.csv...
File update complete.

Processing file: /content/A3Benchmark-TS42.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS42.csv...
File update complete.

Processing file: /content/A3Benchmark-TS43.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS43.csv...
File update complete.

Processing file: /content/A3Benchmark-TS44.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS44.csv...
File update complete.

Processing file: /content/A3Benchmark-TS45.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS45.csv...
File update complete.

Processing file: /content/A3Benchmark-TS46.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS46.csv...
File update complete.

Processing file: /content/A3Benchmark-TS47.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS47.csv...
File update complete.

Processing file: /content/A3Benchmark-TS48.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS48.csv...
File update complete.

Processing file: /content/A3Benchmark-TS49.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS49.csv...
File update complete.

Processing file: /content/A3Benchmark-TS50.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS50.csv...
File update complete.

Processing file: /content/A3Benchmark-TS51.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS51.csv...
File update complete.

Processing file: /content/A3Benchmark-TS52.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 1 new changepoints: [840]
Updating and overwriting /content/A3Benchmark-TS52.csv...
File update complete.

Processing file: /content/A3Benchmark-TS53.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS53.csv...
File update complete.

Processing file: /content/A3Benchmark-TS54.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS54.csv...
File update complete.

Processing file: /content/A3Benchmark-TS55.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS55.csv...
File update complete.

Processing file: /content/A3Benchmark-TS56.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS56.csv...
File update complete.

Processing file: /content/A3Benchmark-TS57.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS57.csv...
File update complete.

Processing file: /content/A3Benchmark-TS58.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS58.csv...
File update complete.

Processing file: /content/A3Benchmark-TS59.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS59.csv...
File update complete.

Processing file: /content/A3Benchmark-TS60.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS60.csv...
File update complete.

Processing file: /content/A3Benchmark-TS61.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS61.csv...
File update complete.

Processing file: /content/A3Benchmark-TS62.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS62.csv...
File update complete.

Processing file: /content/A3Benchmark-TS63.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS63.csv...
File update complete.

Processing file: /content/A3Benchmark-TS64.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 1 new changepoints: [1580]
Updating and overwriting /content/A3Benchmark-TS64.csv...
File update complete.

Processing file: /content/A3Benchmark-TS65.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS65.csv...
File update complete.

Processing file: /content/A3Benchmark-TS66.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS66.csv...
File update complete.

Processing file: /content/A3Benchmark-TS67.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS67.csv...
File update complete.

Processing file: /content/A3Benchmark-TS68.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS68.csv...
File update complete.

Processing file: /content/A3Benchmark-TS69.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS69.csv...
File update complete.

Processing file: /content/A3Benchmark-TS70.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS70.csv...
File update complete.

Processing file: /content/A3Benchmark-TS71.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS71.csv...
File update complete.

Processing file: /content/A3Benchmark-TS72.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS72.csv...
File update complete.

Processing file: /content/A3Benchmark-TS73.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS73.csv...
File update complete.

Processing file: /content/A3Benchmark-TS74.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS74.csv...
File update complete.

Processing file: /content/A3Benchmark-TS75.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS75.csv...
File update complete.

Processing file: /content/A3Benchmark-TS76.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS76.csv...
File update complete.

Processing file: /content/A3Benchmark-TS77.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS77.csv...
File update complete.

Processing file: /content/A3Benchmark-TS78.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 2 new changepoints: [135, 1555]
Updating and overwriting /content/A3Benchmark-TS78.csv...
File update complete.

Processing file: /content/A3Benchmark-TS79.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS79.csv...
File update complete.

Processing file: /content/A3Benchmark-TS80.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS80.csv...
File update complete.

Processing file: /content/A3Benchmark-TS81.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 1 new changepoints: [985]
Updating and overwriting /content/A3Benchmark-TS81.csv...
File update complete.

Processing file: /content/A3Benchmark-TS82.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS82.csv...
File update complete.

Processing file: /content/A3Benchmark-TS83.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 1 new changepoints: [1110]
Updating and overwriting /content/A3Benchmark-TS83.csv...
File update complete.

Processing file: /content/A3Benchmark-TS84.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS84.csv...
File update complete.

Processing file: /content/A3Benchmark-TS85.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS85.csv...
File update complete.

Processing file: /content/A3Benchmark-TS86.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS86.csv...
File update complete.

Processing file: /content/A3Benchmark-TS87.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS87.csv...
File update complete.

Processing file: /content/A3Benchmark-TS88.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS88.csv...
File update complete.

Processing file: /content/A3Benchmark-TS89.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 1 new changepoints: [245]
Updating and overwriting /content/A3Benchmark-TS89.csv...
File update complete.

Processing file: /content/A3Benchmark-TS90.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS90.csv...
File update complete.

Processing file: /content/A3Benchmark-TS91.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS91.csv...
File update complete.

Processing file: /content/A3Benchmark-TS92.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS92.csv...
File update complete.

Processing file: /content/A3Benchmark-TS93.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS93.csv...
File update complete.

Processing file: /content/A3Benchmark-TS94.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS94.csv...
File update complete.

Processing file: /content/A3Benchmark-TS95.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS95.csv...
File update complete.

Processing file: /content/A3Benchmark-TS96.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS96.csv...
File update complete.

Processing file: /content/A3Benchmark-TS97.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS97.csv...
File update complete.

Processing file: /content/A3Benchmark-TS98.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS98.csv...
File update complete.

Processing file: /content/A3Benchmark-TS99.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS99.csv...
File update complete.

Processing file: /content/A3Benchmark-TS100.csv


  signal = df['rolling_variance'].fillna(method='bfill').fillna(method='ffill').values



--- Detection Results ---
Original 'Ground Truth' changepoints: []
Found 0 new changepoints: []
Updating and overwriting /content/A3Benchmark-TS100.csv...
File update complete.
--------------------------------------------------
Batch processing complete.


In [2]:
# @title TDA Feature Extractor (RAW values only — no VAAD/scoring)
# Reads A3Benchmark-TS{1..100}.csv, computes H0-based features per sliding window,
# aligns each window to its right-edge point, and saves raw feature values to CSV.

import os
import numpy as np
import pandas as pd
from ripser import ripser

# ---------------- Tunables ----------------
DATA_PATTERN = "/content/A3Benchmark-TS{ts_id}.csv"   # change if needed
OUT_CSV      = "/content/tda_features_a3.csv"         # output CSV with RAW feature values

VALUE_COL = "value"
LABEL_COL = "anomaly"
CP_CANDIDATES = ["change_point", "changepoint", "cp", "cp_flag"]

# Sliding window + Takens
window_size = 14
time_delay  = 1
dimension   = 7
assert window_size - (dimension - 1) * time_delay > 0, \
    "Choose smaller time_delay*dimension or larger window_size."

# Which TDA feature streams to export (column names will match these keys)
FEATURE_NAMES = [
    "H0_ratio_auc_L1_to_sum",
    "H0_ratio_auc_to_max",
    "H0_ratio_auc_to_l2",

    "H0_bottleneck",
    "tail_share_q90",
    "H0_sum_centroid",
    "H0_L2_norm",
    "PETE_p1.6_q0.5",

    "H0_energy_concentration",   # L2 / sumL
    "H0_dominance_share",        # maxL / sumL
    "H0_tail_curvature_80_90",   # tail90 - tail80
    "H0_centroid_to_energy",     # centroid / L2
    "H0_gini",                   # inequality of lifetimes
]

# ------------- Utilities (TDA + features) -------------
def takens_embed(window: np.ndarray, time_delay: int, dimension: int) -> np.ndarray:
    """
    Return Takens embedding with shape (m, dimension): m points × embedding dimension.
    """
    m = len(window) - (dimension - 1) * time_delay
    if m <= 0:
        raise ValueError("Takens params too large for this window_size.")
    # Each slice has length m; stacking along axis=1 yields (m, dimension)
    return np.stack(
        [window[j : j + m * time_delay : time_delay] for j in range(dimension)],
        axis=1
    )

def _clean_diag_h(diag_h):
    if diag_h is None:
        return np.empty((0, 2), dtype=float)
    arr = np.asarray(diag_h, dtype=float)
    if arr.ndim != 2 or arr.shape[1] != 2 or arr.size == 0:
        return np.empty((0, 2), dtype=float)
    finite_mask = np.isfinite(arr).all(axis=1)
    arr = arr[finite_mask]
    if arr.size == 0:
        return np.empty((0, 2), dtype=float)
    b, d = arr[:, 0], arr[:, 1]
    ok = np.isfinite(d) & (d > b)
    if not np.any(ok):
        return np.empty((0, 2), dtype=float)
    return np.stack([b[ok], d[ok]], axis=1)

try:
    _trapz = np.trapezoid
except AttributeError:
    _trapz = np.trapz

_EPS = 1e-12
def _safe_div(a, b, eps=_EPS): return float(a) / float(b + eps)

def _build_grid_for_time_funcs(arr, n_grid=128):
    if arr.size == 0: return np.linspace(0.0, 1.0, num=n_grid)
    lo = float(np.min(arr[:, 0])); hi = float(np.max(arr[:, 1]))
    if not np.isfinite(lo) or not np.isfinite(hi) or hi <= lo: lo, hi = 0.0, 1.0
    return np.linspace(lo, hi, num=n_grid)

def _lifetimes(arr):
    if arr.size == 0:
        return np.empty((0,), dtype=float)
    return np.maximum(arr[:, 1] - arr[:, 0], 0.0)

def _bottleneck_amp(diag_h):
    """Max lifetime."""
    arr = _clean_diag_h(diag_h)
    L = _lifetimes(arr)
    if L.size == 0: return 0.0
    return float(np.max(L))

def h0_l2_norm(diag_h):
    """sqrt(sum L_i^2)."""
    arr = _clean_diag_h(diag_h)
    L = _lifetimes(arr)
    if L.size == 0:
        return 0.0
    return float(np.sqrt(np.sum(L**2)))

def _auc_tri_max(diag_h):
    """Area under the pointwise max of triangular bars over the birth–death axis (discretized)."""
    arr = _clean_diag_h(diag_h)
    if arr.size == 0:
        return 0.0
    # Fast path: one bar → exact
    if arr.shape[0] == 1:
        L = float(arr[0,1] - arr[0,0])
        return 0.25 * (L * L)

    grid = _build_grid_for_time_funcs(arr, n_grid=64)
    lam1 = np.zeros_like(grid, dtype=float)
    b, d = arr[:,0], arr[:,1]
    for j in range(b.size):
        bj, dj = b[j], d[j]
        m  = 0.5 * (bj + dj)
        h  = 0.5 * (dj - bj)
        if not (np.isfinite(h) and h > 0):
            continue
        tri = np.zeros_like(grid, dtype=float)
        left  = (grid >= bj) & (grid <= m)
        right = (grid >= m)  & (grid <= dj)
        if left.any():  tri[left]  = (grid[left]  - bj) * (h / max(m - bj, 1e-12))
        if right.any(): tri[right] = (dj - grid[right]) * (h / max(dj - m, 1e-12))
        lam1 = np.maximum(lam1, tri)
    return float(_trapz(lam1, grid))

# --- Base H0 features (AUC-family) ---
def h0_ratio_auc_l1_to_sum(diag_h):
    """∫ max triangular envelope / sum lifetimes."""
    arr = _clean_diag_h(diag_h)
    if arr.size == 0: return 0.0
    L = _lifetimes(arr); S = float(L.sum())
    if S <= _EPS: return 0.0
    A = _auc_tri_max(arr)
    return _safe_div(A, S)

def h0_ratio_auc_to_max(diag_h):
    """∫ max triangular envelope / max lifetime."""
    arr = _clean_diag_h(diag_h)
    if arr.size == 0: return 0.0
    A = _auc_tri_max(arr)
    M = _bottleneck_amp(arr)
    return _safe_div(A, M)

def h0_ratio_auc_to_l2(diag_h):
    """∫ max triangular envelope / L2(lifetimes)."""
    arr = _clean_diag_h(diag_h)
    if arr.size == 0: return 0.0
    A  = _auc_tri_max(arr)
    L2 = h0_l2_norm(arr)
    return _safe_div(A, L2)

# --- Other base stats + “streaming-friendly” single-window stats ---
def pete(diag_h, p=1.6, q=0.5):
    """Sum (L^p * |(b+d)/√2|^q) / sum L."""
    arr = _clean_diag_h(diag_h)
    if arr.size == 0: return 0.0
    b, d = arr[:, 0], arr[:, 1]
    L = np.maximum(d - b, 0.0); S = float(L.sum())
    if S <= _EPS: return 0.0
    radial = (b + d) / np.sqrt(2.0)
    num = np.sum((L ** p) * (np.abs(radial) ** q))
    return _safe_div(num, S)

def _tail_share_q(diag_h, q=0.90):
    arr = _clean_diag_h(diag_h)
    L = _lifetimes(arr)
    if L.size == 0: return 0.0
    S = float(L.sum())
    if S <= _EPS: return 0.0
    qv = float(np.quantile(L, q))
    return _safe_div(float(L[L >= qv].sum()), S)

def _tail_share_q90(diag_h): return _tail_share_q(diag_h, q=0.90)
def _tail_share_q80(diag_h): return _tail_share_q(diag_h, q=0.80)

def _sum_centroid(diag_h):
    """Lifetime-weighted |(b+d)/√2| mean / sum lifetimes."""
    arr = _clean_diag_h(diag_h)
    if arr.size == 0: return 0.0
    b, d = arr[:, 0], arr[:, 1]
    L = np.maximum(d - b, 0.0); S = float(L.sum())
    if S <= _EPS: return 0.0
    radial = np.abs((b + d) / np.sqrt(2.0))
    return _safe_div(float(np.sum(radial * L)), S)

def _sumL(diag_h):
    arr = _clean_diag_h(diag_h); L = _lifetimes(arr)
    return float(np.sum(L)) if L.size > 0 else 0.0

def H0_energy_concentration(diag_h):   # L2 / sumL
    L2 = h0_l2_norm(diag_h); S = _sumL(diag_h)
    return _safe_div(L2, S)

def H0_dominance_share(diag_h):        # maxL / sumL
    mx = _bottleneck_amp(diag_h); S = _sumL(diag_h)
    return _safe_div(mx, S)

def H0_tail_curvature_80_90(diag_h):   # tail90 - tail80
    return float(_tail_share_q90(diag_h) - _tail_share_q80(diag_h))

def H0_centroid_to_energy(diag_h):     # centroid / L2
    cen = _sum_centroid(diag_h); L2 = h0_l2_norm(diag_h)
    return _safe_div(cen, L2)

def _gini_from_array(x):
    """Gini of nonnegative array x (0 if empty)."""
    x = np.asarray(x, dtype=float)
    x = x[np.isfinite(x)]
    x = x[x >= 0.0]
    if x.size == 0 or np.allclose(x.sum(), 0.0):
        return 0.0
    xs = np.sort(x); n = xs.size
    cumx = np.cumsum(xs)
    return float(1.0 + 1.0/n - 2.0 * (cumx.sum() / (n * xs.sum())))

def H0_gini(diag_h):
    arr = _clean_diag_h(diag_h)
    L = _lifetimes(arr)
    return _gini_from_array(L)

# ------------- Feature extraction per-window -------------
def extract_h0_features_series(D0_list):
    """Return dict with requested H0 features (one value per window)."""
    n = len(D0_list)
    feats = {
        # AUC-family
        "H0_ratio_auc_L1_to_sum":  np.zeros(n, dtype=float),
        "H0_ratio_auc_to_max":     np.zeros(n, dtype=float),
        "H0_ratio_auc_to_l2":      np.zeros(n, dtype=float),
        # base stats
        "H0_bottleneck":           np.zeros(n, dtype=float),
        "tail_share_q90":          np.zeros(n, dtype=float),
        "H0_sum_centroid":         np.zeros(n, dtype=float),
        "H0_L2_norm":              np.zeros(n, dtype=float),
        "PETE_p1.6_q0.5":          np.zeros(n, dtype=float),
        # streaming-friendly
        "H0_energy_concentration": np.zeros(n, dtype=float),
        "H0_dominance_share":      np.zeros(n, dtype=float),
        "H0_tail_curvature_80_90": np.zeros(n, dtype=float),
        "H0_centroid_to_energy":   np.zeros(n, dtype=float),
        "H0_gini":                 np.zeros(n, dtype=float),
    }
    for i, D0 in enumerate(D0_list):
        clean = _clean_diag_h(D0)
        # AUC-family
        feats["H0_ratio_auc_L1_to_sum"][i] = h0_ratio_auc_l1_to_sum(clean)
        feats["H0_ratio_auc_to_max"][i]    = h0_ratio_auc_to_max(clean)
        feats["H0_ratio_auc_to_l2"][i]     = h0_ratio_auc_to_l2(clean)
        # base stats
        feats["H0_bottleneck"][i]          = _bottleneck_amp(clean)
        feats["tail_share_q90"][i]         = _tail_share_q90(clean)
        feats["H0_sum_centroid"][i]        = _sum_centroid(clean)
        feats["H0_L2_norm"][i]             = h0_l2_norm(clean)
        feats["PETE_p1.6_q0.5"][i]         = pete(clean, p=1.6, q=0.5)
        # streaming-friendly
        feats["H0_energy_concentration"][i] = H0_energy_concentration(clean)
        feats["H0_dominance_share"][i]      = H0_dominance_share(clean)
        feats["H0_tail_curvature_80_90"][i] = H0_tail_curvature_80_90(clean)
        feats["H0_centroid_to_energy"][i]   = H0_centroid_to_energy(clean)
        feats["H0_gini"][i]                 = H0_gini(clean)
    return feats

# ------------- Segmentation helpers -------------
def _resolve_cp_column(df: pd.DataFrame) -> str:
    cols_lower = {c.lower(): c for c in df.columns}
    for cand in CP_CANDIDATES:
        if cand in cols_lower: return cols_lower[cand]
    raise ValueError(f"No change-point column found. Looked for {CP_CANDIDATES}. Found: {list(df.columns)}")

def _segments_from_cp(df: pd.DataFrame, cp_col: str):
    """
    Build half-open segments [start, end) using indices where cp==1 as breakpoints.
    The cp index is treated as the *first* index of the new segment.
    """
    n = len(df)
    cp_idx = df.index[df[cp_col] == 1].astype(int).tolist()
    bkps = sorted(set([i for i in cp_idx if 0 < i <= n]) | {n})
    starts = [0] + bkps[:-1]
    return list(zip(starts, bkps))  # [(s,e), ...], covering [0, n)

# ------------- Per-dataset processing → DataFrame -------------
def process_one(ts_id: int) -> pd.DataFrame | None:
    path = DATA_PATTERN.format(ts_id=ts_id)
    if not os.path.exists(path):
        print(f"[WARN] Missing: {path}")
        return None

    df_raw = pd.read_csv(path)
    if VALUE_COL not in df_raw.columns or LABEL_COL not in df_raw.columns:
        print(f"[WARN] {path} missing required columns")
        return None
    cp_col = _resolve_cp_column(df_raw)

    df = pd.DataFrame({
        "value":   pd.to_numeric(df_raw[VALUE_COL], errors="coerce"),
        "anomaly": pd.to_numeric(df_raw[LABEL_COL], errors="coerce").fillna(0).astype(int),
        "cp":      pd.to_numeric(df_raw[cp_col], errors="coerce").fillna(0).astype(int),
    }).interpolate(limit_direction="both").ffill().bfill().reset_index(drop=True)

    n = len(df)
    segments = _segments_from_cp(df, "cp")

    # Prepare output frame for ALL points (0..n-1)
    out_cols = {
        "loadeddataset": [f"ts{ts_id}"] * n,
        "segment": 0,
        "point":   np.arange(n, dtype=int),
        "is_anomaly": df["anomaly"].astype(int).values,
        "y": df["value"].astype(float).values,  # original time-series value
    }
    # Reserve NaNs for features at points without a complete window ending there
    for name in FEATURE_NAMES:
        out_cols[name] = np.nan
    out = pd.DataFrame(out_cols)

    # Tag each point with its segment id
    seg_id_arr = np.zeros(n, dtype=int)
    for sid, (s, e) in enumerate(segments, start=1):
        seg_id_arr[s:e] = sid
    out["segment"] = seg_id_arr

    # Compute features per window and assign to right-edge index (global coords)
    x = df["value"].to_numpy(dtype=float)
    W = window_size
    for sid, (s, e) in enumerate(segments, start=1):
        seg_x = x[s:e]
        if len(seg_x) < W:
            continue

        n_windows = (len(seg_x) - W) + 1  # stride=1
        D0_list = []
        for i in range(n_windows):
            w = seg_x[i:i+W]
            emb = takens_embed(w, time_delay, dimension)  # (m, dim)
            dgms = ripser(emb, maxdim=0)["dgms"]
            D0_list.append(_clean_diag_h(dgms[0] if len(dgms) else None))

        # Compute raw features per window
        feat_dict = extract_h0_features_series(D0_list)
        df_feat = pd.DataFrame({"window": np.arange(n_windows, dtype=int)})
        for k, v in feat_dict.items(): df_feat[k] = v

        # Map window to GLOBAL right-edge index
        t_global = s + df_feat["window"].astype(int) + (W - 1)

        # Write RAW feature values directly (no VAAD, no normalization)
        for name in FEATURE_NAMES:
            if name in df_feat.columns:
                out.loc[t_global.values, name] = df_feat[name].to_numpy(dtype=float)

    return out

# ------------- Batch over datasets → single CSV -------------
all_rows = []
for ts in range(1, 101):  # TS1..TS100 (adjust upper bound as needed)
    try:
        df_one = process_one(ts)
        if df_one is not None:
            all_rows.append(df_one)
            print(f"[OK] ts{ts}: rows={len(df_one)}")
    except Exception as e:
        print(f"[ERROR] ts{ts}: {e}")

if all_rows:
    df_all = pd.concat(all_rows, ignore_index=True)
    # Ensure dtypes are clean
    df_all["y"] = pd.to_numeric(df_all["y"], errors="coerce").astype(float)
    df_all["is_anomaly"] = pd.to_numeric(df_all["is_anomaly"], errors="coerce").fillna(0).astype(int)
    df_all.to_csv(OUT_CSV, index=False)
    print(f"Saved: {OUT_CSV} (rows={len(df_all)})")
else:
    print("No data written.")


[OK] ts1: rows=1680
[OK] ts2: rows=1680
[OK] ts3: rows=1680
[OK] ts4: rows=1680
[OK] ts5: rows=1680
[OK] ts6: rows=1680
[OK] ts7: rows=1680
[OK] ts8: rows=1680
[OK] ts9: rows=1680
[OK] ts10: rows=1680
[OK] ts11: rows=1680
[OK] ts12: rows=1680
[OK] ts13: rows=1680
[OK] ts14: rows=1680
[OK] ts15: rows=1680
[OK] ts16: rows=1680
[OK] ts17: rows=1680
[OK] ts18: rows=1680
[OK] ts19: rows=1680
[OK] ts20: rows=1680
[OK] ts21: rows=1680
[OK] ts22: rows=1680
[OK] ts23: rows=1680
[OK] ts24: rows=1680
[OK] ts25: rows=1680
[OK] ts26: rows=1680
[OK] ts27: rows=1680
[OK] ts28: rows=1680
[OK] ts29: rows=1680
[OK] ts30: rows=1680
[OK] ts31: rows=1680
[OK] ts32: rows=1680
[OK] ts33: rows=1680
[OK] ts34: rows=1680
[OK] ts35: rows=1680
[OK] ts36: rows=1680
[OK] ts37: rows=1680
[OK] ts38: rows=1680
[OK] ts39: rows=1680
[OK] ts40: rows=1680
[OK] ts41: rows=1680
[OK] ts42: rows=1680
[OK] ts43: rows=1680
[OK] ts44: rows=1680
[OK] ts45: rows=1680
[OK] ts46: rows=1680
[OK] ts47: rows=1680
[OK] ts48: rows=1680
[

In [3]:
# @title TDA Feature Extractor and Anomaly Scoring Algorithm (VAAD)
# Segmentation-aware TDA anomaly scores → single CSV (all datasets, all points)
import os
import numpy as np
import pandas as pd
from ripser import ripser

# ---------------- Tunables ----------------
DATA_PATTERN = "/content/A3Benchmark-TS{ts_id}.csv"  # change if needed
OUT_CSV      = "/content/anomaly_scores_a3.csv"     # final output CSV

VALUE_COL = "value"
LABEL_COL = "anomaly"
CP_CANDIDATES = ["new_cp"]

# Sliding window + Takens
window_size = 14
time_delay  = 1
dimension   = 7
assert window_size - (dimension - 1) * time_delay > 0, \
    "Choose smaller time_delay*dimension or larger window_size."

# V×A scoring knobs
KV = 3.5
KA = 3.5
MODE = "strict"  # "strict" | "plateau" | "abs_plateau" | "none"

# Which TDA feature streams to export (-> anomaly score columns)
FEATURES = [
    ("H0_ratio_auc_L1_to_sum",  "anomalyscore_h0_auc"),
    ("H0_ratio_auc_to_max",     "anomalyscore_h0_auc_over_max"),
    ("H0_ratio_auc_to_l2",      "anomalyscore_h0_auc_over_l2"),

    ("H0_bottleneck",           "anomalyscore_bottleneck"),
    ("tail_share_q90",          "anomalyscore_tail_q90"),
    ("H0_sum_centroid",         "anomalyscore_sum_centroid"),
    ("H0_L2_norm",              "anomalyscore_h0_l2norm"),
    ("PETE_p1.6_q0.5",          "anomalyscore_pete"),

    ("H0_energy_concentration", "anomalyscore_h0_energy_conc"),   # L2 / sumL
    ("H0_dominance_share",      "anomalyscore_h0_dom_share"),     # maxL / sumL
    ("H0_tail_curvature_80_90", "anomalyscore_h0_tail_curve"),    # tail90 - tail80
    ("H0_centroid_to_energy",   "anomalyscore_h0_cen_to_energy"), # centroid / L2
    ("H0_gini",                 "anomalyscore_h0_gini"),          # inequality of lifetimes
]

# ------------- Utilities (TDA + features) -------------
def takens_embed(window: np.ndarray, time_delay: int, dimension: int) -> np.ndarray:
    m = len(window) - (dimension - 1) * time_delay
    if m <= 0:
        raise ValueError("Takens params too large for this window_size.")
    return np.stack([window[j : j + m * time_delay : time_delay] for j in range(dimension)], axis=1)

def _clean_diag_h(diag_h):
    if diag_h is None:
        return np.empty((0, 2), dtype=float)
    arr = np.asarray(diag_h, dtype=float)
    if arr.ndim != 2 or arr.shape[1] != 2 or arr.size == 0:
        return np.empty((0, 2), dtype=float)
    finite_mask = np.isfinite(arr).all(axis=1)
    arr = arr[finite_mask]
    if arr.size == 0:
        return np.empty((0, 2), dtype=float)
    b, d = arr[:, 0], arr[:, 1]
    ok = np.isfinite(d) & (d > b)
    if not np.any(ok):
        return np.empty((0, 2), dtype=float)
    return np.stack([b[ok], d[ok]], axis=1)

try:
    _trapz = np.trapezoid
except AttributeError:
    _trapz = np.trapz

_EPS = 1e-12
def _safe_div(a, b, eps=_EPS): return float(a) / float(b + eps)

def _build_grid_for_time_funcs(arr, n_grid=128):
    if arr.size == 0: return np.linspace(0.0, 1.0, num=n_grid)
    lo = float(np.min(arr[:, 0])); hi = float(np.max(arr[:, 1]))
    if not np.isfinite(lo) or not np.isfinite(hi) or hi <= lo: lo, hi = 0.0, 1.0
    return np.linspace(lo, hi, num=n_grid)

def _lifetimes(arr):
    if arr.size == 0:
        return np.empty((0,), dtype=float)
    return np.maximum(arr[:, 1] - arr[:, 0], 0.0)

def _bottleneck_amp(diag_h):
    """Max lifetime."""
    arr = _clean_diag_h(diag_h)
    L = _lifetimes(arr)
    if L.size == 0: return 0.0
    return float(np.max(L))

def h0_l2_norm(diag_h):
    """sqrt(sum L_i^2)."""
    arr = _clean_diag_h(diag_h)
    L = _lifetimes(arr)
    if L.size == 0:
        return 0.0
    return float(np.sqrt(np.sum(L**2)))

def _auc_tri_max(diag_h):
    arr = _clean_diag_h(diag_h)
    if arr.size == 0:
        return 0.0
    # Fast path: one bar → exact
    if arr.shape[0] == 1:
        L = float(arr[0,1] - arr[0,0])
        return 0.25 * (L * L)

    # Smaller grid is enough; adaptive to time span
    grid = _build_grid_for_time_funcs(arr, n_grid=64)
    lam1 = np.zeros_like(grid, dtype=float)
    b, d = arr[:,0], arr[:,1]
    for j in range(b.size):
        bj, dj = b[j], d[j]
        m  = 0.5 * (bj + dj)
        h  = 0.5 * (dj - bj)
        if not (np.isfinite(h) and h > 0):
            continue
        tri = np.zeros_like(grid, dtype=float)
        left  = (grid >= bj) & (grid <= m)
        right = (grid >= m)  & (grid <= dj)
        if left.any():  tri[left]  = (grid[left]  - bj) * (h / max(m - bj, 1e-12))
        if right.any(): tri[right] = (dj - grid[right]) * (h / max(dj - m, 1e-12))
        lam1 = np.maximum(lam1, tri)
    return float(_trapz(lam1, grid))


# --- Base H0 features (AUC-family) ---
def h0_ratio_auc_l1_to_sum(diag_h):
    """∫ max triangular envelope / sum lifetimes."""
    arr = _clean_diag_h(diag_h)
    if arr.size == 0: return 0.0
    L = _lifetimes(arr); S = float(L.sum())
    if S <= _EPS: return 0.0
    A = _auc_tri_max(arr)
    return _safe_div(A, S)

def h0_ratio_auc_to_max(diag_h):
    """∫ max triangular envelope / max lifetime."""
    arr = _clean_diag_h(diag_h)
    if arr.size == 0: return 0.0
    A = _auc_tri_max(arr)
    M = _bottleneck_amp(arr)
    return _safe_div(A, M)

def h0_ratio_auc_to_l2(diag_h):
    """∫ max triangular envelope / L2(lifetimes)."""
    arr = _clean_diag_h(diag_h)
    if arr.size == 0: return 0.0
    A  = _auc_tri_max(arr)
    L2 = h0_l2_norm(arr)
    return _safe_div(A, L2)

# --- Other base stats + “streaming-friendly” single-window stats ---
def pete(diag_h, p=1.6, q=0.5):
    """Sum (L^p * |(b+d)/√2|^q) / sum L."""
    arr = _clean_diag_h(diag_h)
    if arr.size == 0: return 0.0
    b, d = arr[:, 0], arr[:, 1]
    L = np.maximum(d - b, 0.0); S = float(L.sum())
    if S <= _EPS: return 0.0
    radial = (b + d) / np.sqrt(2.0)
    num = np.sum((L ** p) * (np.abs(radial) ** q))
    return _safe_div(num, S)

def _tail_share_q(diag_h, q=0.90):
    arr = _clean_diag_h(diag_h)
    L = _lifetimes(arr)
    if L.size == 0: return 0.0
    S = float(L.sum())
    if S <= _EPS: return 0.0
    qv = float(np.quantile(L, q))
    return _safe_div(float(L[L >= qv].sum()), S)

def _tail_share_q90(diag_h): return _tail_share_q(diag_h, q=0.90)
def _tail_share_q80(diag_h): return _tail_share_q(diag_h, q=0.80)

def _sum_centroid(diag_h):
    """Lifetime-weighted |(b+d)/√2| mean / sum lifetimes."""
    arr = _clean_diag_h(diag_h)
    if arr.size == 0: return 0.0
    b, d = arr[:, 0], arr[:, 1]
    L = np.maximum(d - b, 0.0); S = float(L.sum())
    if S <= _EPS: return 0.0
    radial = np.abs((b + d) / np.sqrt(2.0))
    return _safe_div(float(np.sum(radial * L)), S)

def _sumL(diag_h):
    arr = _clean_diag_h(diag_h); L = _lifetimes(arr)
    return float(np.sum(L)) if L.size > 0 else 0.0

def H0_energy_concentration(diag_h):   # L2 / sumL
    L2 = h0_l2_norm(diag_h); S = _sumL(diag_h)
    return _safe_div(L2, S)

def H0_dominance_share(diag_h):        # maxL / sumL
    mx = _bottleneck_amp(diag_h); S = _sumL(diag_h)
    return _safe_div(mx, S)

def H0_tail_curvature_80_90(diag_h):   # tail90 - tail80
    return float(_tail_share_q90(diag_h) - _tail_share_q80(diag_h))

def H0_centroid_to_energy(diag_h):     # centroid / L2
    cen = _sum_centroid(diag_h); L2 = h0_l2_norm(diag_h)
    return _safe_div(cen, L2)

def _gini_from_array(x):
    """Gini of nonnegative array x (0 if empty)."""
    x = np.asarray(x, dtype=float)
    x = x[np.isfinite(x)]
    x = x[x >= 0.0]
    if x.size == 0 or np.allclose(x.sum(), 0.0):
        return 0.0
    xs = np.sort(x); n = xs.size
    cumx = np.cumsum(xs)
    return float(1.0 + 1.0/n - 2.0 * (cumx.sum() / (n * xs.sum())))

def H0_gini(diag_h):
    arr = _clean_diag_h(diag_h)
    L = _lifetimes(arr)
    return _gini_from_array(L)

# ------------- Feature extraction per-window -------------
def extract_h0_features_series(D0_list):
    """Return dict with requested H0 features (one value per window)."""
    n = len(D0_list)
    feats = {
        # AUC-family
        "H0_ratio_auc_L1_to_sum":  np.zeros(n, dtype=float),
        "H0_ratio_auc_to_max":     np.zeros(n, dtype=float),
        "H0_ratio_auc_to_l2":      np.zeros(n, dtype=float),
        # base stats
        "H0_bottleneck":           np.zeros(n, dtype=float),
        "tail_share_q90":          np.zeros(n, dtype=float),
        "H0_sum_centroid":         np.zeros(n, dtype=float),
        "H0_L2_norm":              np.zeros(n, dtype=float),
        "PETE_p1.6_q0.5":          np.zeros(n, dtype=float),
        # streaming-friendly
        "H0_energy_concentration": np.zeros(n, dtype=float),
        "H0_dominance_share":      np.zeros(n, dtype=float),
        "H0_tail_curvature_80_90": np.zeros(n, dtype=float),
        "H0_centroid_to_energy":   np.zeros(n, dtype=float),
        "H0_gini":                 np.zeros(n, dtype=float),
    }
    for i, D0 in enumerate(D0_list):
        clean = _clean_diag_h(D0)
        # AUC-family
        feats["H0_ratio_auc_L1_to_sum"][i] = h0_ratio_auc_l1_to_sum(clean)
        feats["H0_ratio_auc_to_max"][i]    = h0_ratio_auc_to_max(clean)
        feats["H0_ratio_auc_to_l2"][i]     = h0_ratio_auc_to_l2(clean)
        # base stats
        feats["H0_bottleneck"][i]          = _bottleneck_amp(clean)
        feats["tail_share_q90"][i]         = _tail_share_q90(clean)
        feats["H0_sum_centroid"][i]        = _sum_centroid(clean)
        feats["H0_L2_norm"][i]             = h0_l2_norm(clean)
        feats["PETE_p1.6_q0.5"][i]         = pete(clean, p=1.6, q=0.5)
        # streaming-friendly
        feats["H0_energy_concentration"][i] = H0_energy_concentration(clean)
        feats["H0_dominance_share"][i]      = H0_dominance_share(clean)
        feats["H0_tail_curvature_80_90"][i] = H0_tail_curvature_80_90(clean)
        feats["H0_centroid_to_energy"][i]   = H0_centroid_to_energy(clean)
        feats["H0_gini"][i]                 = H0_gini(clean)
    return feats

# ------------- Scoring helpers -------------
def _mad_with_median(x):
    x = np.asarray(x, dtype=float)
    med = np.nanmedian(x); mad = np.nanmedian(np.abs(x - med))
    return med, (mad + 1e-12)


# old version of code
#def velocity_accel_score(series, kv=KV, ka=KA, mode=MODE):
#    s = pd.to_numeric(pd.Series(series, dtype=float), errors="coerce").interpolate(limit_direction="both")
#    v = s.diff(1); a = v.diff(1)
#    v_med, v_mad = _mad_with_median(v.values)
#    a_med, a_mad = _mad_with_median(a.values)
#    zv = np.maximum(0.0, (v.values - v_med) / v_mad)
#    za = np.maximum(0.0, (a.values - a_med) / a_mad)
#    return np.nan_to_num(zv * za, nan=0.0, posinf=0.0, neginf=0.0)

def velocity_accel_score(series, kv=KV, ka=KA, mode=MODE):
    s = pd.to_numeric(pd.Series(series, dtype=float), errors="coerce").interpolate(limit_direction="both")
    v = s.diff(1); a = v.diff(1)

    def _zm(x):
        med = np.nanmedian(x); mad = np.nanmedian(np.abs(x - med)) + 1e-12
        return (x - med) / mad

    zv = _zm(v.values); za = _zm(a.values)

    mode = (mode or "strict").lower()
    if mode == "strict":
        zv = np.maximum(0.0, zv); za = np.maximum(0.0, za)
    elif mode == "plateau":
        # allow small negatives to survive (flattened spikes)
        zv = np.where(zv > -0.25, zv, 0.0)
        za = np.where(za > -0.25, za, 0.0)
    elif mode == "abs_plateau":
        zv = np.abs(zv); za = np.abs(za)
    elif mode == "none":
        pass
    else:
        # fallback
        zv = np.maximum(0.0, zv); za = np.maximum(0.0, za)

    score = (kv * zv) * (ka * za)
    return np.nan_to_num(score, nan=0.0, posinf=0.0, neginf=0.0)


def minmax01(x):
    x = np.asarray(x, dtype=float)
    if x.size == 0: return x
    lo, hi = float(np.nanmin(x)), float(np.nanmax(x))
    if not np.isfinite(lo) or not np.isfinite(hi) or hi <= lo:
        return np.zeros_like(x, dtype=float)
    return (x - lo) / (hi - lo)

# ------------- Segmentation helpers -------------
def _resolve_cp_column(df: pd.DataFrame) -> str:
    cols_lower = {c.lower(): c for c in df.columns}
    for cand in CP_CANDIDATES:
        if cand in cols_lower: return cols_lower[cand]
    raise ValueError(f"No change-point column found. Looked for {CP_CANDIDATES}. Found: {list(df.columns)}")

def _segments_from_cp(df: pd.DataFrame, cp_col: str):
    n = len(df)
    cp_idx = df.index[df[cp_col] == 1].astype(int).tolist()
    bkps = sorted(set([i for i in cp_idx if 0 < i <= n]) | {n})
    starts = [0] + bkps[:-1]
    return list(zip(starts, bkps))  # [(s,e), ...]

# ------------- Per-dataset processing → DataFrame -------------
def process_one(ts_id: int) -> pd.DataFrame | None:
    path = DATA_PATTERN.format(ts_id=ts_id)
    if not os.path.exists(path):
        print(f"[WARN] Missing: {path}")
        return None

    df_raw = pd.read_csv(path)
    if VALUE_COL not in df_raw.columns or LABEL_COL not in df_raw.columns:
        print(f"[WARN] {path} missing required columns")
        return None
    cp_col = _resolve_cp_column(df_raw)

    df = pd.DataFrame({
        "value":   pd.to_numeric(df_raw[VALUE_COL], errors="coerce"),
        "anomaly": pd.to_numeric(df_raw[LABEL_COL], errors="coerce").fillna(0).astype(int),
        "cp":      pd.to_numeric(df_raw[cp_col], errors="coerce").fillna(0).astype(int),
    }).ffill().bfill().reset_index(drop=True)

    n = len(df)
    segments = _segments_from_cp(df, "cp")

    # Prepare output frame for ALL points (0..n-1)
    out_cols = {
        "loadeddataset": [f"ts{ts_id}"] * n,
        "segment": 0,
        "point":   np.arange(n, dtype=int),
        "is_anomaly": df["anomaly"].astype(int).values,
        "y": df["value"].astype(float).values,  # original time-series value
    }
    for _, col_name in FEATURES:
        out_cols[col_name] = np.nan
    out = pd.DataFrame(out_cols)

    # Tag each point with its segment id
    seg_id_arr = np.zeros(n, dtype=int)
    for sid, (s, e) in enumerate(segments, start=1):
        seg_id_arr[s:e] = sid
    out["segment"] = seg_id_arr

    # Fill scores only at window right-edges inside each segment
    x = df["value"].to_numpy(float)
    for sid, (s, e) in enumerate(segments, start=1):
        seg_x = x[s:e]
        if len(seg_x) < window_size:
            continue

        n_windows = (len(seg_x) - window_size) + 1  # stride=1
        D0_list = []
        for i in range(n_windows):
            w = seg_x[i:i+window_size]
            emb = takens_embed(w, time_delay, dimension)
            dgms = ripser(emb, maxdim=0)["dgms"]
            D0_list.append(_clean_diag_h(dgms[0] if len(dgms) else None))

        # Compute raw features per window
        feat_dict = extract_h0_features_series(D0_list)
        df_feat = pd.DataFrame({"window": np.arange(n_windows, dtype=int)})
        for k, v in feat_dict.items(): df_feat[k] = v

        # Map window to GLOBAL right-edge index
        t_global = s + df_feat["window"].astype(int) + (window_size - 1)

        # Turn each feature into an anomaly-score series (min–max 0..1 within segment)
        for feat_name, col_name in FEATURES:
            if feat_name not in df_feat.columns:
                continue
            fvals  = df_feat[feat_name].to_numpy(float)
            score  = velocity_accel_score(fvals, kv=KV, ka=KA, mode=MODE)
            scoreN = minmax01(score)
            out.loc[t_global.values, col_name] = scoreN

    return out

# ------------- Batch over datasets → single CSV -------------
all_rows = []
for ts in range(1, 101):  # TS1..TS101
    try:
        df_one = process_one(ts)
        if df_one is not None:
            all_rows.append(df_one)
            print(f"[OK] ts{ts}: rows={len(df_one)}")
    except Exception as e:
        print(f"[ERROR] ts{ts}: {e}")

if all_rows:
    df_all = pd.concat(all_rows, ignore_index=True)
    df_all.to_csv(OUT_CSV, index=False)
    print(f"Saved: {OUT_CSV} (rows={len(df_all)})")
else:
    print("No data written.")


[OK] ts1: rows=1680
[OK] ts2: rows=1680
[OK] ts3: rows=1680
[OK] ts4: rows=1680
[OK] ts5: rows=1680
[OK] ts6: rows=1680
[OK] ts7: rows=1680
[OK] ts8: rows=1680
[OK] ts9: rows=1680
[OK] ts10: rows=1680
[OK] ts11: rows=1680
[OK] ts12: rows=1680
[OK] ts13: rows=1680
[OK] ts14: rows=1680
[OK] ts15: rows=1680
[OK] ts16: rows=1680
[OK] ts17: rows=1680
[OK] ts18: rows=1680
[OK] ts19: rows=1680
[OK] ts20: rows=1680
[OK] ts21: rows=1680
[OK] ts22: rows=1680
[OK] ts23: rows=1680
[OK] ts24: rows=1680
[OK] ts25: rows=1680
[OK] ts26: rows=1680
[OK] ts27: rows=1680
[OK] ts28: rows=1680
[OK] ts29: rows=1680
[OK] ts30: rows=1680
[OK] ts31: rows=1680
[OK] ts32: rows=1680
[OK] ts33: rows=1680
[OK] ts34: rows=1680
[OK] ts35: rows=1680
[OK] ts36: rows=1680
[OK] ts37: rows=1680
[OK] ts38: rows=1680
[OK] ts39: rows=1680
[OK] ts40: rows=1680
[OK] ts41: rows=1680
[OK] ts42: rows=1680
[OK] ts43: rows=1680
[OK] ts44: rows=1680
[OK] ts45: rows=1680
[OK] ts46: rows=1680
[OK] ts47: rows=1680
[OK] ts48: rows=1680
[

# ANOMALY SCORES OF THE TIME SERIES

In [4]:
# Load CSV (adjust path if needed)
import pandas as pd
df = pd.read_csv("/content/anomaly_scores_a3.csv")
df

Unnamed: 0,loadeddataset,segment,point,is_anomaly,y,anomalyscore_h0_auc,anomalyscore_h0_auc_over_max,anomalyscore_h0_auc_over_l2,anomalyscore_bottleneck,anomalyscore_tail_q90,anomalyscore_sum_centroid,anomalyscore_h0_l2norm,anomalyscore_pete,anomalyscore_h0_energy_conc,anomalyscore_h0_dom_share,anomalyscore_h0_tail_curve,anomalyscore_h0_cen_to_energy,anomalyscore_h0_gini
0,ts1,1,0,0,-363.278909,,,,,,,,,,,,,
1,ts1,1,1,0,320.888590,,,,,,,,,,,,,
2,ts1,1,2,0,891.727422,,,,,,,,,,,,,
3,ts1,1,3,0,1174.652287,,,,,,,,,,,,,
4,ts1,1,4,0,1712.290261,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167995,ts100,1,1675,0,-143.374166,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.00000,0.000000
167996,ts100,1,1676,0,-213.830741,0.000010,0.000097,0.000009,0.000097,0.000000,0.000704,0.001063,0.000603,0.00019,0.000000,0.000000,0.00019,0.001116
167997,ts100,1,1677,0,-191.906215,0.000000,0.000000,0.000000,0.000000,0.000000,0.001147,0.002574,0.000953,0.00000,0.000000,0.002241,0.00000,0.000000
167998,ts100,1,1678,0,-158.322475,0.000000,0.000027,0.000000,0.000027,0.000000,0.000706,0.002008,0.000584,0.00000,0.000000,0.000000,0.00000,0.000000


In [5]:
# Replace all NaN with 0
df = df.fillna(0)
print("\nAny NaNs left?:", df.isna().sum().sum())


Any NaNs left?: 0


In [6]:
# Ensure is_anomaly is integer
df["is_anomaly"] = pd.to_numeric(df["is_anomaly"], errors="coerce").fillna(0).astype(int)
# Print total anomalies
print("Total anomalies (is_anomaly=1):", df["is_anomaly"].sum())

Total anomalies (is_anomaly=1): 943


In [7]:
df.columns

Index(['loadeddataset', 'segment', 'point', 'is_anomaly', 'y',
       'anomalyscore_h0_auc', 'anomalyscore_h0_auc_over_max',
       'anomalyscore_h0_auc_over_l2', 'anomalyscore_bottleneck',
       'anomalyscore_tail_q90', 'anomalyscore_sum_centroid',
       'anomalyscore_h0_l2norm', 'anomalyscore_pete',
       'anomalyscore_h0_energy_conc', 'anomalyscore_h0_dom_share',
       'anomalyscore_h0_tail_curve', 'anomalyscore_h0_cen_to_energy',
       'anomalyscore_h0_gini'],
      dtype='object')

In [8]:
# @title descriptive statistics of anomaly scores
# assume df is already loaded
# detect anomaly score columns automatically
anomaly_cols = [c for c in df.columns if c.startswith("anomalyscore")]

results = {}

for col in anomaly_cols:
    print(f"\n=== {col} ===")
    # 1. Descriptive statistics
    stats = df[col].describe()
    print(stats)

    # 2. Count of GT anomaly points (is_anomaly==1) where value < 0.001
    mask = (df["is_anomaly"] == 1) & (df[col] < 0.001)
    count_below = mask.sum()
    print(f"Ground truth anomalies with {col} < 0.001: {count_below}")

    results[col] = {
        "stats": stats.to_dict(),
        "count_below_0.001": count_below
    }

# If you want a compact summary table instead:
summary = pd.DataFrame({
    col: {
        "mean": df[col].mean(),
        "std": df[col].std(),
        "min": df[col].min(),
        "25%": df[col].quantile(0.25),
        "50%": df[col].median(),
        "75%": df[col].quantile(0.75),
        "max": df[col].max(),
        "count_below_0.001 (GT=1)": ((df["is_anomaly"]==1) & (df[col] < 0.001)).sum()
    }
    for col in anomaly_cols
}).T

print("\n\n=== Summary Table ===")
print(summary)



=== anomalyscore_h0_auc ===
count    168000.000000
mean          0.002958
std           0.038789
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000030
max           1.000000
Name: anomalyscore_h0_auc, dtype: float64
Ground truth anomalies with anomalyscore_h0_auc < 0.001: 67

=== anomalyscore_h0_auc_over_max ===
count    168000.000000
mean          0.003667
std           0.042684
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: anomalyscore_h0_auc_over_max, dtype: float64
Ground truth anomalies with anomalyscore_h0_auc_over_max < 0.001: 65

=== anomalyscore_h0_auc_over_l2 ===
count    168000.000000
mean          0.003261
std           0.040650
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000039
max           1.000000
Name: anomalyscore_h0_auc_over_l2, dtype: float64
Ground truth anomalies with anomalyscore_h0_auc_over_l2 < 0.001: 65

===

# VISUALIZATION OF TIME SERIES, ANOMALY SCORES AND ANOMALY POINTS

In [None]:
# @title plotting (with "all" option for dataset and feature)
# -*- coding: utf-8 -*-
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ========= CONFIG =========
CSV_PATH = "/content/anomaly_scores_a4.csv"

# Fixed schema per your description
DATASET_COL = "loadeddataset"
VALUE_COL   = "y"
ANOM_COL    = "is_anomaly"
SEG_COL     = "segment"
POINT_COL   = "point"  # x-axis if present, else fallback to index

# Allowed feature columns
FEATURE_COLUMNS = [
    "anomalyscore_h0_auc",
    "anomalyscore_h0_auc_over_max",
    "anomalyscore_h0_auc_over_l2",
    "anomalyscore_bottleneck",
    "anomalyscore_tail_q90",
    "anomalyscore_sum_centroid",
    "anomalyscore_h0_l2norm",
    "anomalyscore_pete",
    "anomalyscore_h0_energy_conc",
    "anomalyscore_h0_dom_share",
    "anomalyscore_h0_tail_curve",
    "anomalyscore_h0_cen_to_energy",
    "anomalyscore_h0_gini",
]

ALL_TOKEN = "__ALL__"

# ========= HELPERS =========
def read_csv_safely(path: str) -> pd.DataFrame:
    if not os.path.exists(path):
        raise FileNotFoundError(f"CSV not found: {path}")
    try:
        return pd.read_csv(path)
    except Exception:
        return pd.read_csv(path, engine="python")

def normalize_boolish(series: pd.Series) -> pd.Series:
    """Convert anomaly flags to 0/1 integers robustly."""
    s = series.copy()
    if s.dtype == bool:
        return s.astype(int)
    if s.dtype == object:
        return s.astype(str).str.lower().isin(["1","true","t","yes","y"]).astype(int)
    return (pd.to_numeric(s, errors="coerce").fillna(0) != 0).astype(int)

def derive_segment_starts_from_segment_col(seg_series: pd.Series) -> np.ndarray:
    """Return row indices where a new segment starts (0 included)."""
    seg = seg_series.values
    starts = [0]
    for i in range(1, len(seg)):
        if seg[i] != seg[i-1]:
            starts.append(i)
    return np.array(starts, dtype=int)

def list_datasets(df: pd.DataFrame) -> list:
    return df[DATASET_COL].astype(str).dropna().unique().tolist()

def pick_dataset(df: pd.DataFrame) -> str:
    """Ask user for dataset name (or 'all')."""
    if DATASET_COL not in df.columns:
        raise KeyError(f"Required column '{DATASET_COL}' not found. Columns: {list(df.columns)}")
    sample = list_datasets(df)
    preview = sample[:20]
    print(f"\nDetected {len(sample)} dataset ids in '{DATASET_COL}'. Examples:\n  {preview}")
    ds = input("Enter dataset name (exact/basename/substring) or type 'all': ").strip()
    if not ds:
        raise ValueError("Dataset name cannot be empty.")
    if ds.lower() == "all":
        return ALL_TOKEN
    return ds

def filter_rows_by_dataset(df: pd.DataFrame, dataset_name: str) -> pd.DataFrame:
    """Exact match, else basename match, else substring (case-insensitive)."""
    col = df[DATASET_COL].astype(str)
    sub = df[col == dataset_name]
    if len(sub) > 0:
        return sub
    # basename
    bn = os.path.basename(dataset_name)
    sub = df[col.map(os.path.basename) == bn]
    if len(sub) > 0:
        return sub
    # substring (case-insensitive)
    mask = col.str.contains(re.escape(bn), case=False, na=False)
    sub = df[mask]
    if len(sub) > 0:
        return sub
    raise ValueError(
        f"No rows match dataset name '{dataset_name}' in column '{DATASET_COL}'. "
        f"Try one of: {df[DATASET_COL].astype(str).head(10).tolist()}"
    )

def pick_feature() -> str:
    """Ask user for feature name (or 'all')."""
    print("\nAvailable feature columns:")
    for c in FEATURE_COLUMNS:
        print("  -", c)
    feat = input("Enter FEATURE (anomaly score) column, or type 'all': ").strip()
    if feat.lower() == "all":
        return ALL_TOKEN
    if feat not in FEATURE_COLUMNS:
        raise KeyError(f"Feature '{feat}' is not in the allowed list above.")
    return feat

def plot_one(sub: pd.DataFrame, dataset_name: str, feature_col: str):
    # x-axis
    if POINT_COL in sub.columns:
        x = pd.to_numeric(sub[POINT_COL], errors="coerce").values
        x_is_index = False
    else:
        sub = sub.copy()
        sub["__idx"] = np.arange(len(sub))
        x = sub["__idx"].values
        x_is_index = True

    # y and feature
    y = pd.to_numeric(sub[VALUE_COL], errors="coerce").astype(float).values
    f = pd.to_numeric(sub[feature_col], errors="coerce").astype(float).values

    # anomalies (GT)
    anomalies = normalize_boolish(sub[ANOM_COL])
    anom_x_positions = list(np.where(anomalies.values == 1)[0])

    # segment starts from segment changes
    seg_starts = derive_segment_starts_from_segment_col(sub[SEG_COL])

    # ---- Plot ----
    fig, ax_left = plt.subplots(figsize=(14, 6))
    ax_right = ax_left.twinx()

    # Left axis: y
    line_y, = ax_left.plot(x, y, linewidth=1.2, alpha=0.9, label=VALUE_COL)

    # Right axis: feature
    line_f, = ax_right.plot(x, f, linewidth=1.2, alpha=0.9, label=feature_col, color="red")

    # Anomaly dashed lines
    for i in anom_x_positions:
        ax_left.axvline(x=x[i], linestyle="--", linewidth=0.85, alpha=0.75, color="black")

    # Segment start solid red lines
    for i in seg_starts:
        ax_left.axvline(x=x[i], linestyle="-", linewidth=1.0, alpha=0.9, color="yellow")

    # Labels / legend
    ttl_ds = os.path.basename(str(dataset_name))
    x_name = POINT_COL if not x_is_index else "row_index"
    ax_left.set_title(f"{ttl_ds}: y (left) vs {feature_col} (right) — anomalies (dashed), segments (red)")
    ax_left.set_xlabel(x_name)
    ax_left.set_ylabel(VALUE_COL)
    ax_right.set_ylabel(feature_col)

    from matplotlib.lines import Line2D
    legend_elems = [line_y, line_f]
    if len(anom_x_positions) > 0:
        legend_elems.append(Line2D([0], [0], color="black", lw=1.0, ls="--", label="GT anomaly"))
    if len(seg_starts) > 0:
        legend_elems.append(Line2D([0], [0], color="red", lw=1.0, ls="-", label="Segment start"))
    ax_left.legend(handles=legend_elems, loc="upper left")

    ax_left.grid(True, which="both", linestyle=":", linewidth=0.6, alpha=0.6)
    fig.tight_layout()

    print(f"Plotted dataset='{dataset_name}', feature='{feature_col}', rows={len(sub)}, "
          f"anomalies={int(anomalies.sum())}, segments≈{len(seg_starts)}")
    plt.show()

# ========= MAIN =========
def main():
    # Load
    df = read_csv_safely(CSV_PATH)
    if df.empty:
        raise RuntimeError("The CSV is empty.")
    for req in [DATASET_COL, VALUE_COL, ANOM_COL, SEG_COL]:
        if req not in df.columns:
            raise KeyError(f"Required column '{req}' not found. Columns: {list(df.columns)}")

    # Print all options up front
    all_ds = list_datasets(df)
    print(f"\n=== OPTIONS ===")
    print(f"- Datasets ({len(all_ds)}): {all_ds[:30]}{' ...' if len(all_ds) > 30 else ''}")
    print(f"- Features ({len(FEATURE_COLUMNS)}): {FEATURE_COLUMNS}")
    print("Type 'all' at prompts to select all datasets or all features.\n")

    # Inputs
    dataset_name = pick_dataset(df)   # string or ALL_TOKEN
    feature_sel  = pick_feature()     # string or ALL_TOKEN

    if dataset_name == ALL_TOKEN and feature_sel == ALL_TOKEN:
        print("\nYou selected ALL datasets and ALL features.")
        print("Aborting plotting to avoid generating a huge number of figures.")
        print("Pick either a single dataset with 'all' features, or 'all' datasets with a single feature.")
        return

    if dataset_name == ALL_TOKEN:
        # Plot single feature for every dataset
        if feature_sel == ALL_TOKEN:
            # handled above
            return
        print(f"\nPlotting feature '{feature_sel}' for ALL datasets...")
        for ds in all_ds:
            try:
                sub = filter_rows_by_dataset(df, ds).copy().reset_index(drop=True)
                plot_one(sub, ds, feature_sel)
            except Exception as e:
                print(f"[WARN] Skipped '{ds}': {e}")
        return

    # Single dataset path
    sub_all = filter_rows_by_dataset(df, dataset_name).copy().reset_index(drop=True)

    if feature_sel == ALL_TOKEN:
        print(f"\nPlotting ALL features for dataset '{dataset_name}' ...")
        for feat in FEATURE_COLUMNS:
            try:
                plot_one(sub_all, dataset_name, feat)
            except Exception as e:
                print(f"[WARN] Skipped feature '{feat}': {e}")
        return

    # Single dataset + single feature
    plot_one(sub_all, dataset_name, feature_sel)

# Run
main()


In [None]:
# @title plotting the anomaly points and neighbourhoods
# ========= NEW: window plot around a chosen anomaly =========
WINDOW = 10  # +/- points to show
DEFAULT_FEATURE = "anomalyscore_h0_auc"  # used if you choose 'all' features here

def _ensure_point_index(sub: pd.DataFrame) -> pd.DataFrame:
    """Guarantee we have a positional row index and a numeric POINT_COL."""
    sub = sub.copy().reset_index(drop=True)
    if POINT_COL in sub.columns:
        sub[POINT_COL] = pd.to_numeric(sub[POINT_COL], errors="coerce")
    else:
        sub[POINT_COL] = np.arange(len(sub), dtype=float)
    return sub

def _segment_of_row(sub: pd.DataFrame, row_i: int) -> int:
    """Return the segment value at row_i."""
    return int(pd.to_numeric(sub.loc[row_i, SEG_COL], errors="coerce"))

def _rows_for_window(center_i: int, n_rows: int, w: int = WINDOW) -> slice:
    lo = max(0, center_i - w)
    hi = min(n_rows, center_i + w + 1)
    return slice(lo, hi)

def _pick_point_via_segment(sub: pd.DataFrame) -> int:
    """Interactive: list segments and anomalies; return a chosen row index."""
    sub = sub.copy().reset_index(drop=True)
    seg_vals = sub[SEG_COL].astype(int).values
    anom = normalize_boolish(sub[ANOM_COL]).values

    # Group anomaly row indices by segment
    seg_to_rows = {}
    for i, (s, a) in enumerate(zip(seg_vals, anom)):
        if a == 1:
            seg_to_rows.setdefault(s, []).append(i)

    # Show a summary
    print("\nSegments with anomalies:")
    if not seg_to_rows:
        raise RuntimeError("No ground-truth anomalies in the selected dataset.")
    for s, rows in sorted(seg_to_rows.items()):
        pts = [int(sub.loc[i, POINT_COL]) if not pd.isna(sub.loc[i, POINT_COL]) else i for i in rows]
        print(f"  Segment {s}: {len(rows)} anomalies — points {pts[:20]}{' ...' if len(pts) > 20 else ''}")

    # Choose a segment
    while True:
        try:
            seg_sel = int(input("Enter a SEGMENT number to inspect: ").strip())
            if seg_sel in seg_to_rows:
                break
            print("Segment not in list; try again.")
        except Exception:
            print("Please enter an integer segment id.")

    # Choose an anomaly point inside that segment
    rows = seg_to_rows[seg_sel]
    # Display full list to choose by ordinal
    print(f"\nSegment {seg_sel} anomaly choices (index in this list → row_i → point):")
    for k, i in enumerate(rows):
        pval = sub.loc[i, POINT_COL]
        print(f"  [{k}] row={i}  point={int(pval) if pd.notna(pval) else i}")

    while True:
        try:
            k = int(input("Pick anomaly by its [index] above: ").strip())
            if 0 <= k < len(rows):
                return rows[k]
            print("Out of range; try again.")
        except Exception:
            print("Please enter a valid integer index.")

def _find_row_by_point_value(sub: pd.DataFrame, point_value: int) -> int:
    """Find the FIRST row whose POINT_COL equals point_value; fallback: nearest by absolute diff."""
    sub = sub.copy().reset_index(drop=True)
    if POINT_COL in sub.columns:
        hits = np.where(sub[POINT_COL].astype(float).values == float(point_value))[0]
        if len(hits) > 0:
            return int(hits[0])
        # nearest fallback
        diffs = np.abs(sub[POINT_COL].astype(float).values - float(point_value))
        return int(np.nanargmin(diffs))
    return int(point_value)  # if POINT_COL missing, treat as row index

def plot_window_around_anomaly(sub: pd.DataFrame, row_i: int, feature_col: str = DEFAULT_FEATURE, w: int = WINDOW):
    """Plot y + feature around a selected row_i with ±w window; mark anomalies and the chosen one."""
    sub = _ensure_point_index(sub)
    if feature_col not in sub.columns:
        raise KeyError(f"Feature '{feature_col}' not in dataframe columns.")

    # Determine window slice
    n = len(sub)
    win = _rows_for_window(row_i, n, w=w)
    subw = sub.iloc[win].copy()

    # Axis vectors
    x = pd.to_numeric(subw[POINT_COL], errors="coerce").astype(float).values
    y = pd.to_numeric(subw[VALUE_COL], errors="coerce").astype(float).values
    f = pd.to_numeric(subw[feature_col], errors="coerce").astype(float).values

    # Flags
    anom = normalize_boolish(subw[ANOM_COL]).values
    chosen_x = float(sub.loc[row_i, POINT_COL])

    # Get full-series segment starts (yellow) within window for context
    seg_change_rows = derive_segment_starts_from_segment_col(sub[SEG_COL])
    seg_change_points = [float(sub.loc[i, POINT_COL]) for i in seg_change_rows if win.start <= i < win.stop]

    # ---- Plot ----
    fig, ax_left = plt.subplots(figsize=(12, 5))
    ax_right = ax_left.twinx()

    ax_left.plot(x, y, linewidth=1.2, alpha=0.9, label=VALUE_COL)
    ax_right.plot(x, f, linewidth=1.0, alpha=0.9, label=feature_col, color="red")

    # All GT anomalies in window (dashed)
    for xi, ai in zip(x, anom):
        if ai == 1:
            ax_left.axvline(x=xi, linestyle="--", linewidth=0.9, alpha=0.7, color="black")

    # Segment starts (yellow)
    for xp in seg_change_points:
        ax_left.axvline(x=xp, linestyle="-", linewidth=1.0, alpha=0.9, color="yellow")

    # Chosen anomaly (big X)
    ax_left.scatter([chosen_x], [np.interp(chosen_x, x, y)], marker="x", s=120, linewidths=2.0, zorder=5)

    # Titles / labels
    seg_id = _segment_of_row(sub, row_i)
    ax_left.set_title(f"Segment {seg_id} — window ±{w} around point={int(chosen_x)} (row {row_i})\n"
                      f"y (left) vs {feature_col} (right)")
    ax_left.set_xlabel(POINT_COL)
    ax_left.set_ylabel(VALUE_COL)
    ax_right.set_ylabel(feature_col)
    ax_left.grid(True, which="both", linestyle=":", linewidth=0.6, alpha=0.6)

    from matplotlib.lines import Line2D
    legend_elems = [
        Line2D([0], [0], color="C0", lw=1.2, label=VALUE_COL),
        Line2D([0], [0], color="red", lw=1.0, label=feature_col),
        Line2D([0], [0], color="black", lw=1.0, ls="--", label="GT anomaly"),
        Line2D([0], [0], marker="x", color="black", lw=0, markersize=8, label="Chosen anomaly"),
        Line2D([0], [0], color="yellow", lw=1.0, label="Segment start"),
    ]
    ax_left.legend(handles=legend_elems, loc="upper left")
    fig.tight_layout()
    plt.show()

def interactive_window_plot():
    # Load & sanity
    df = read_csv_safely(CSV_PATH)
    for req in [DATASET_COL, VALUE_COL, ANOM_COL, SEG_COL]:
        if req not in df.columns:
            raise KeyError(f"Required column '{req}' not found. Columns: {list(df.columns)}")

    # Pick dataset
    all_ds = list_datasets(df)
    print(f"\nDatasets found ({len(all_ds)}): {all_ds[:30]}{' ...' if len(all_ds) > 30 else ''}")
    ds = pick_dataset(df)
    if ds == ALL_TOKEN:
        raise RuntimeError("Please pick a single dataset for window plotting.")

    sub = filter_rows_by_dataset(df, ds).copy().reset_index(drop=True)
    sub = _ensure_point_index(sub)

    # Offer feature choice (or keep default)
    print("\nAvailable feature columns:")
    for c in FEATURE_COLUMNS:
        print("  -", c)
    feat = input(f"Feature to plot (Enter for default '{DEFAULT_FEATURE}'): ").strip()
    if not feat:
        feat = DEFAULT_FEATURE
    if feat not in sub.columns:
        raise KeyError(f"Feature '{feat}' not in dataframe for this dataset.")

    # Option A: user already knows a point number
    raw = input("\nEnter an anomaly POINT value to jump to (or press Enter to browse by segment): ").strip()
    if raw:
        try:
            point_value = int(float(raw))
        except Exception:
            raise ValueError("Please enter a numeric point value.")
        row_i = _find_row_by_point_value(sub, point_value)
        seg_id = _segment_of_row(sub, row_i)

        # Show segment and anomalies in that segment
        seg_mask = (sub[SEG_COL].astype(int) == seg_id)
        seg_rows = sub[seg_mask].reset_index(drop=True)
        seg_anom_rows = np.where(normalize_boolish(seg_rows[ANOM_COL]).values == 1)[0]
        seg_anom_points = [int(seg_rows.loc[i, POINT_COL]) for i in seg_anom_rows]
        print(f"\nPoint {point_value} is in SEGMENT {seg_id}.")
        print(f"Segment {seg_id} anomaly points: {seg_anom_points[:50]}{' ...' if len(seg_anom_points) > 50 else ''}")

        # Confirm or pick one of the listed anomalies to plot
        confirm = input("Plot the window around this point? (y to confirm, anything else to pick from segment): ").strip().lower()
        if confirm == "y":
            plot_window_around_anomaly(sub, row_i=row_i, feature_col=feat, w=WINDOW)
            return
        else:
            # Let user pick from anomalies of this segment
            if not len(seg_anom_rows):
                raise RuntimeError("Selected segment has no GT anomalies.")
            print("Pick an index among the listed segment anomalies.")
            while True:
                try:
                    p = int(input("Enter a POINT value from the list: ").strip())
                    row_i = _find_row_by_point_value(sub, p)
                    break
                except Exception:
                    print("Please enter a valid integer POINT.")
            plot_window_around_anomaly(sub, row_i=row_i, feature_col=feat, w=WINDOW)
            return

    # Option B: browse segments → anomalies
    print("\nNo point entered; browsing by segment…")
    row_i = _pick_point_via_segment(sub)
    plot_window_around_anomaly(sub, row_i=row_i, feature_col=feat, w=WINDOW)

# Run the interactive flow
if __name__ == "__main__":
    interactive_window_plot()


# WHAT IS OUR MAXIMUM ANOMALY DETECTION CAPABILITY BY K TOP ANALYSIS

In [None]:
# @title by ground truth K value
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

# --- Settings ---
CSV_PATH = "/content/anomaly_scores_a4.csv"   # standard CSV
NTOL = 3                                # tolerance window for SCORING ONLY
FEATURES = [
    "anomalyscore_h0_auc",            # ∫max-tri / sumL
    "anomalyscore_h0_auc_over_max",   # ∫max-tri / maxL
    "anomalyscore_h0_auc_over_l2",    # ∫max-tri / L2(L)
 'anomalyscore_bottleneck',
       'anomalyscore_tail_q90', 'anomalyscore_sum_centroid',
       'anomalyscore_h0_l2norm', 'anomalyscore_pete',
       'anomalyscore_h0_energy_conc', 'anomalyscore_h0_dom_share',
       'anomalyscore_h0_tail_curve', 'anomalyscore_h0_cen_to_energy',    'anomalyscore_h0_gini'
]

GROUP_COLS = ["loadeddataset", "segment"]

# --- Load (simple) ---
df = pd.read_csv(CSV_PATH).fillna(0)
df["is_anomaly"] = pd.to_numeric(df["is_anomaly"], errors="coerce").fillna(0).astype(int)
for c in FEATURES:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0.0)

y_true = df["is_anomaly"].values
N = len(df)
total_gt = int(y_true.sum())
print("Total ground-truth anomalies:", total_gt)

# Precompute groups once
groups = df.groupby(GROUP_COLS).indices  # dict: (ds, seg) -> ndarray of row indices (global)

def topk_pred_indices_for_feature(feat: str) -> np.ndarray:
    """Return global indices predicted as 1 by taking K=GT per segment, top-K by 'feat'."""
    y_pred = np.zeros(N, dtype=int)
    for ix in groups.values():
        ix = np.asarray(ix, dtype=int)
        K = int(df.loc[ix, "is_anomaly"].sum())
        if K <= 0:
            continue
        scores = df.loc[ix, feat].values
        if K >= len(ix):
            chosen_local = np.arange(len(ix))
        else:
            chosen_local = np.argpartition(-scores, K-1)[:K]  # top-K unsorted
        y_pred[ix[chosen_local]] = 1
    return np.where(y_pred == 1)[0]

def score_with_tolerance(pred_idx: np.ndarray, true_idx: np.ndarray, ntol: int):
    """
    Greedy matching for tolerance:
    - A prediction i is TP if there exists an *unmatched* ground-truth t with |i - t| <= ntol.
    - Each ground-truth can be matched at most once.
    """
    if ntol < 0:
        ntol = 0
    pred_idx = np.sort(pred_idx)
    true_idx = np.sort(true_idx)

    tp = 0
    used_true = np.zeros(true_idx.size, dtype=bool)
    t_ptr = 0  # pointer over true_idx

    for i in pred_idx:
        # advance pointer to first true >= i-ntol
        while t_ptr < true_idx.size and true_idx[t_ptr] < i - ntol:
            t_ptr += 1
        # check current and maybe previous one (nearest within window)
        matched = False
        # try current t_ptr
        if t_ptr < true_idx.size and (not used_true[t_ptr]) and abs(true_idx[t_ptr] - i) <= ntol:
            used_true[t_ptr] = True
            tp += 1
            matched = True
        # also try left neighbor if exists and closer
        if not matched and t_ptr > 0:
            left = t_ptr - 1
            if (not used_true[left]) and abs(true_idx[left] - i) <= ntol:
                used_true[left] = True
                tp += 1
                matched = True
        # if not matched, it's FP; handled later by counts

    fp = int(pred_idx.size - tp)
    fn = int(true_idx.size - tp)
    tn = int(N - tp - fp - fn)
    return tp, fp, fn, tn

rows = []
true_idx = np.where(y_true == 1)[0]
sum_K_over_segments = sum(int(df.loc[ix, "is_anomaly"].sum()) for ix in groups.values())
print("Sum of K over segments:", sum_K_over_segments)

for feat in FEATURES:
    pred_idx = topk_pred_indices_for_feature(feat)    # predictions (indices) — EXACTLY K per segment
    print(f"{feat}: predicted ones (no tolerance applied to predictions) = {pred_idx.size}")

    tp, fp, fn, tn = score_with_tolerance(pred_idx, true_idx, NTOL)

    # Metrics
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall    = tp / (tp + fn) if (tp + fn) else 0.0
    f1        = (2*precision*recall)/(precision+recall) if (precision+recall) else 0.0

    rows.append({
        "feature": feat,
        "TP": tp, "FP": fp, "FN": fn, "TN": tn,
        "Precision": precision, "Recall": recall, "F1": f1,
        "pred_ones": int(pred_idx.size)
    })

out = pd.DataFrame(rows).sort_values("F1", ascending=False).reset_index(drop=True)
print(f"\n=== Top-K per segment (K from GT) — Scored with ±{NTOL} tolerance (predictions unchanged) ===")
print(out.to_string(index=False))


In [None]:
# @title individual results of anomaly points
# ==== TP/FP for predictions AND TP/FN for ground truth (NO TNs, print ALL rows) ====

DATASET_COL = "loadeddataset"
SEG_COL     = "segment"
POINT_COL   = "point"
ANOM_COL    = "is_anomaly"

def _greedy_match_with_tolerance(pred_idx: np.ndarray,
                                 true_idx: np.ndarray,
                                 ntol: int):
    """
    Greedy one-to-one matching between predictions and ground-truth positives (anomalies),
    using ±ntol tolerance on indices.

    Returns:
      pred_to_label: dict[pred_i] = ("TP", matched_true_i) or ("FP", None)
      true_to_hit:   dict[true_i] = matched_pred_i or None  (None => FN)
    """
    pred_idx = np.sort(np.asarray(pred_idx, dtype=int))
    true_idx = np.sort(np.asarray(true_idx, dtype=int))

    pred_to_label = {}
    true_to_hit = {int(t): None for t in true_idx}

    used_true = np.zeros(true_idx.size, dtype=bool)
    t_ptr = 0

    for i in pred_idx:
        while t_ptr < true_idx.size and true_idx[t_ptr] < i - ntol:
            t_ptr += 1

        matched = False
        # try current
        if t_ptr < true_idx.size and (not used_true[t_ptr]) and abs(true_idx[t_ptr] - i) <= ntol:
            used_true[t_ptr] = True
            pred_to_label[i] = ("TP", int(true_idx[t_ptr]))
            true_to_hit[int(true_idx[t_ptr])] = int(i)
            matched = True

        # try left neighbor
        if not matched and t_ptr > 0:
            left = t_ptr - 1
            if (not used_true[left]) and abs(true_idx[left] - i) <= ntol:
                used_true[left] = True
                pred_to_label[i] = ("TP", int(true_idx[left]))
                true_to_hit[int(true_idx[left])] = int(i)
                matched = True

        if not matched:
            pred_to_label[i] = ("FP", None)

    return pred_to_label, true_to_hit

def _mk_addr_table(idxs: np.ndarray, extra_cols: dict = None) -> pd.DataFrame:
    """Build (dataset, segment, point, global_index) table; optionally add extra cols."""
    if idxs.size == 0:
        base = pd.DataFrame(columns=[DATASET_COL, SEG_COL, POINT_COL, "global_index"])
    else:
        base = df.loc[idxs, [DATASET_COL, SEG_COL, POINT_COL]].copy()
        base["global_index"] = idxs
    if extra_cols:
        for k, v in extra_cols.items():
            base[k] = v
    return base.sort_values([DATASET_COL, SEG_COL, POINT_COL, "global_index"]).reset_index(drop=True)

def list_tp_fp_and_gt_status_for_feature(feature_name: str):
    if feature_name not in FEATURES:
        raise ValueError(f"Feature '{feature_name}' not in FEATURES.\nAvailable: {FEATURES}")

    # 1) Predictions for this feature (Top-K per segment)
    pred_idx = topk_pred_indices_for_feature(feature_name)

    # 2) Ground truth indices (positives only)
    true_idx = np.where(y_true == 1)[0]

    # 3) Greedy matching with ±NTOL
    pred_labels, true_hits = _greedy_match_with_tolerance(pred_idx, true_idx, NTOL)

    # ---- PREDICTIONS table (TP/FP) ----
    statuses = []
    matched_true_idx = []
    for i in pred_idx:
        lab, t_i = pred_labels[i]
        statuses.append(lab)               # "TP" or "FP"
        matched_true_idx.append(t_i)       # matched GT index if TP else None

    pred_tbl = _mk_addr_table(
        pred_idx,
        extra_cols={
            "status": statuses,
            "matched_true_index": matched_true_idx,
            "matched_true_point": [
                (None if t is None else df.loc[int(t), POINT_COL]) for t in matched_true_idx
            ],
            "score": df.loc[pred_idx, feature_name].values
        }
    )

    # ---- GROUND TRUTH table (TP/FN only) ----
    gt_status = []
    matched_pred_idx = []
    for t in true_idx:
        p = true_hits[int(t)]
        if p is None:
            gt_status.append("FN")
            matched_pred_idx.append(None)
        else:
            gt_status.append("TP")
            matched_pred_idx.append(int(p))

    gt_tbl = _mk_addr_table(
        true_idx,
        extra_cols={
            "gt_status": gt_status,                 # "TP" or "FN"
            "matched_pred_index": matched_pred_idx,
            "matched_pred_point": [
                (None if p is None else df.loc[int(p), POINT_COL]) for p in matched_pred_idx
            ]
        }
    )

    # ---- Print ALL rows (no head/limit) ----
    print(f"\n=== ALL Predictions for '{feature_name}' labeled (±{NTOL}) ===")
    if len(pred_tbl):
        print(pred_tbl.to_string(index=False))
    else:
        print("(no predictions)")

    print(f"\n=== ALL Ground-truth anomalies labeled TP/FN (±{NTOL}) ===")
    if len(gt_tbl):
        print(gt_tbl.to_string(index=False))
    else:
        print("(no ground-truth anomalies)")

    # ---- Summary counts (no TN reported) ----
    tp_pred = int((pred_tbl["status"] == "TP").sum())
    fp_pred = int((pred_tbl["status"] == "FP").sum())
    tp_gt   = int((gt_tbl["gt_status"] == "TP").sum())
    fn_gt   = int((gt_tbl["gt_status"] == "FN").sum())
    print(f"\nCounts → TP: {tp_gt} | FP: {fp_pred} | FN: {fn_gt}")

    return pred_tbl, gt_tbl

# ---- Run interactively ----
SELECTED_FEATURE = input(f"\nEnter a feature to list ALL PRED (TP/FP) and ALL GT (TP/FN):\n{FEATURES}\n> ").strip() or FEATURES[0]
pred_table, gt_table = list_tp_fp_and_gt_status_for_feature(SELECTED_FEATURE)


# HOW WE CAN DETECT ANOMALIES WITH UNSUPERVISED METHODS

In [None]:
# @title thresholding by quantile
import pandas as pd
import numpy as np
import sys

def run_all_score_analyses(file_path='/content/anomaly_scores_a3.csv'):
    """
    This script performs adaptive anomaly detection and NTOL evaluation
    for EVERY 'anomalyscore_' column in the provided CSV file.

    The strategy is as follows:
    1.  Load the dataset.
    2.  Identify all columns starting with 'anomalyscore_'.
    3.  Initialize a DataFrame to store the final performance metrics
        for comparison.
    4.  Loop through each identified anomaly score column:
        a.  Print which score is being processed.
        b.  Clean the data by dropping rows where *this specific score*
            is NaN.
        c.  Calculate adaptive 99th percentile thresholds for this score,
            grouped by 'loadeddataset' and 'segment'.
        d.  Generate 'predicted_anomaly' based on this score and its
            group-specific thresholds.
        e.  Apply the NTOL=3 logic by dilating the 'is_anomaly'
            (ground truth) and the new 'predicted_anomaly' columns.
        f.  Calculate the custom point-adjusted TP, FN, FP, and TN.
        g.  Calculate and print the F1, Precision, and Recall for this
            specific score.
        h.  Store these metrics in the summary DataFrame.
    5.  After the loop finishes, print the final summary DataFrame,
        sorted by F1 score, to show which score performed best.
    """

    # --- 1. Configuration ---
    data_file = file_path
    quantile_threshold = 0.99
    NTOL = 3
    window_size = 2 * NTOL + 1

    print(f"Starting adaptive anomaly detection process for ALL scores...")
    print(f"Using data file: {data_file}")
    print(f"Using quantile threshold: {quantile_threshold} (99th percentile)")
    print(f"Using evaluation NTOL: {NTOL} (Window size: {window_size})\n")

    try:
        # --- 2. Load Data ---
        df = pd.read_csv(data_file)
        if 'is_anomaly' not in df.columns:
            print("Error: 'is_anomaly' column not found.")
            return

    except FileNotFoundError:
        print(f"Error: The file '{data_file}' was not found.")
        return
    except Exception as e:
        print(f"An unexpected error occurred during loading: {e}")
        return

    # --- 3. Identify Score Columns ---
    all_score_columns = [col for col in df.columns if col.startswith('anomalyscore_')]

    if not all_score_columns:
        print("Error: No columns found starting with 'anomalyscore_'.")
        return

    print(f"Found {len(all_score_columns)} score columns to analyze:")
    print(all_score_columns)

    # --- 4. Initialize Summary ---
    performance_summary = []

    # Pre-calculate the dilated ground truth once
    # This is efficient as it doesn't change
    print("\nPre-calculating dilated ground truth...")
    group_keys = ['loadeddataset', 'segment']
    dilated_true_series = df.groupby(group_keys)['is_anomaly'] \
                              .rolling(window=window_size, center=True, min_periods=1) \
                              .max()
    df['y_true_dilated'] = dilated_true_series.reset_index(level=group_keys, drop=True) \
                                              .fillna(0).astype(int)
    print("Dilated ground truth calculated.")

    # --- 5. Loop Through Each Score Column ---
    for score_column in all_score_columns:
        print(f"\n{'-'*20} Analyzing: {score_column} {'-'*20}")

        try:
            # --- 5a. Clean Data (specific to this column) ---
            # We use a copy to ensure each loop is independent
            df_cleaned = df.dropna(subset=[score_column]).copy()

            if len(df_cleaned) == 0:
                print(f"Skipping {score_column}: All values are NaN.")
                continue

            # --- 5b. Calculate Adaptive Thresholds & Predictions ---
            print("Calculating thresholds and making predictions...")
            df_cleaned['adaptive_threshold'] = df_cleaned.groupby(group_keys)[score_column] \
                                                         .transform(lambda x: x.quantile(quantile_threshold))

            df_cleaned['predicted_anomaly'] = (df_cleaned[score_column] > df_cleaned['adaptive_threshold']).astype(int)

            # --- 5c. Apply NTOL Logic to Predictions ---
            print("Applying NTOL to predictions...")
            dilated_pred_series = df_cleaned.groupby(group_keys)['predicted_anomaly'] \
                                            .rolling(window=window_size, center=True, min_periods=1) \
                                            .max()
            df_cleaned['y_pred_dilated'] = dilated_pred_series.reset_index(level=group_keys, drop=True) \
                                                              .fillna(0).astype(int)

            # --- 5d. Calculate Custom Metrics ---
            y_true = df_cleaned['is_anomaly']
            y_pred = df_cleaned['predicted_anomaly']
            y_true_dilated = df_cleaned['y_true_dilated'] # from pre-computation
            y_pred_dilated = df_cleaned['y_pred_dilated']

            TP = ((y_true == 1) & (y_pred_dilated == 1)).sum()
            FN = ((y_true == 1) & (y_pred_dilated == 0)).sum()
            FP = ((y_pred == 1) & (y_true_dilated == 0)).sum()
            TN = ((y_true == 0) & (y_pred_dilated == 0)).sum()

            precision = TP / (TP + FP) if (TP + FP) > 0 else 0
            recall = TP / (TP + FN) if (TP + FN) > 0 else 0
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

            # --- 5e. Print Individual Report ---
            print(f"--- Results for {score_column} ---")
            print(f"Total Actual Anomaly Points: {y_true.sum()}")
            print(f"Total Predicted Anomaly Points: {y_pred.sum()}\n")

            print("Custom Confusion Matrix (Point-Adjusted):")
            print("                 Predicted Negative | Predicted Positive")
            print(f"Actual Negative: {TN:>17} | {FP:>17}")
            print(f"Actual Positive: {FN:>17} | {TP:>17}\n")

            print("Performance Metrics (Point-Adjusted):")
            print(f"  F1 Score:  {f1:.6f}")
            print(f"  Precision: {precision:.6f}")
            print(f"  Recall:    {recall:.6f}")

            # --- 5f. Store in Summary ---
            performance_summary.append({
                'score_column': score_column,
                'f1_score': f1,
                'precision': precision,
                'recall': recall,
                'TP': TP,
                'FN': FN,
                'FP': FP,
                'total_predicted': y_pred.sum()
            })

        except Exception as e:
            print(f"!!! Error processing {score_column}: {e} !!!")
            import traceback
            traceback.print_exc()

    # --- 6. Print Final Summary Table ---
    print(f"\n\n{'-'*20} FINAL PERFORMANCE SUMMARY {'-'*20}")

    if performance_summary:
        summary_df = pd.DataFrame(performance_summary)
        summary_df = summary_df.sort_values(by='f1_score', ascending=False)
        summary_df = summary_df.set_index('score_column')

        print(summary_df.to_string(float_format="%.6f"))


    else:
        print("No score columns were successfully processed.")

# --- Run the function ---
if __name__ == "__main__":
    run_all_score_analyses()

In [None]:
# @title POT
import pandas as pd
import numpy as np
from scipy.signal import find_peaks
import sys
import warnings

def find_anomalies_for_segment_series(segment_scores, k_val, distance):
    """
    This is the core helper function that applies the adaptive peak-finding
    logic to a single time series segment (passed as a pandas Series).

    Rules:
    1. Find all peaks and their prominences.
    2. Calculate an adaptive prominence threshold: mean(prominences) + k * std(prominences).
    3. Run find_peaks again, using this adaptive threshold and a minimum distance.
    4. Return a series of 0s (not anomaly) and 1s (anomaly).
    """

    # --- 1. Handle edge cases ---
    # If the segment is flat or has no variance, there are no peaks.
    if segment_scores.nunique() <= 1:
        return pd.Series(0, index=segment_scores.index, dtype=int)

    try:
        # --- 2. Find all peaks to get their prominence distribution ---
        all_peaks_indices, all_props = find_peaks(segment_scores,
                                                  prominence=(None, None))

        # If no peaks are found at all, return all zeros.
        if len(all_peaks_indices) == 0:
            return pd.Series(0, index=segment_scores.index, dtype=int)

        prominences = all_props['prominences']

        # --- 3. Calculate adaptive prominence threshold ---
        if len(prominences) == 1:
            # If there's only one peak, it is by definition an anomaly
            adaptive_prominence_threshold = 0
        else:
            prom_mean = np.mean(prominences)
            prom_std = np.std(prominences)

            # If all prominences are identical, std is 0. Threshold is just the mean.
            if prom_std == 0:
                adaptive_prominence_threshold = prom_mean
            else:
                adaptive_prominence_threshold = prom_mean + (k_val * prom_std)

        # --- 4. Run find_peaks a final time with the full rule set ---
        # We find peaks that are:
        # a) Above the adaptive prominence threshold
        # b) At least 'distance' points apart
        final_peak_indices, _ = find_peaks(segment_scores,
                                           prominence=(adaptive_prominence_threshold, None),
                                           distance=distance)

        # --- 5. Create the final prediction series ---
        predicted_anomalies = pd.Series(0, index=segment_scores.index, dtype=int)

        # Set the indices of the found peaks to 1
        predicted_anomalies.iloc[final_peak_indices] = 1
        return predicted_anomalies

    except Exception as e:
        # Catch any errors during peak finding for a segment
        print(f"Warning: Error in peak finding for a segment: {e}", file=sys.stderr)
        return pd.Series(0, index=segment_scores.index, dtype=int)


def run_peak_based_analysis_for_all_scores(file_path='/content/anomaly_scores_a3.csv'):
    """
    Main script to run the advanced peak-based adaptive anomaly detection
    for EVERY 'anomalyscore_' column in the dataset.
    """

    # --- 1. Configuration ---
    data_file = file_path

    # --- Evaluation Config ---
    NTOL = 3
    window_size = 2 * NTOL + 1

    # --- Peak-Finding Rule Set Config ---
    # k-value for mean + k * std(prominence)
    PEAK_K_VALUE = 2.2
    # Distance (in points) between anomalies
    PEAK_DISTANCE = 5

    print(f"Starting PEAK-BASED adaptive anomaly detection...")
    print(f"Using data file: {data_file}")
    print(f"Evaluation NTOL: {NTOL}")
    print(f"Peak Rules: Prominence > (mean + {PEAK_K_VALUE}*std), Distance = {PEAK_DISTANCE}\n")

    # Suppress warnings from scipy/numpy during processing
    warnings.filterwarnings('ignore', category=RuntimeWarning)

    try:
        # --- 2. Load Data ---
        df = pd.read_csv(data_file)
        if 'is_anomaly' not in df.columns:
            print("Error: 'is_anomaly' column not found.")
            return

    except FileNotFoundError:
        print(f"Error: The file '{data_file}' was not found.")
        return
    except Exception as e:
        print(f"An unexpected error occurred during loading: {e}")
        return

    # --- 3. Identify Score Columns ---
    all_score_columns = [col for col in df.columns if col.startswith('anomalyscore_')]

    if not all_score_columns:
        print("Error: No columns found starting with 'anomalyscore_'.")
        return

    print(f"Found {len(all_score_columns)} score columns to analyze.")

    # --- 4. Initialize Summary ---
    performance_summary = []
    group_keys = ['loadeddataset', 'segment']

    # --- 5. Pre-calculate Dilated Ground Truth (Optimization) ---
    print("\nPre-calculating dilated ground truth...")
    dilated_true_series = df.groupby(group_keys)['is_anomaly'] \
                              .rolling(window=window_size, center=True, min_periods=1) \
                              .max()
    df['y_true_dilated'] = dilated_true_series.reset_index(level=group_keys, drop=True) \
                                              .fillna(0).astype(int)
    print("Dilated ground truth calculated.")

    # --- 6. Loop Through Each Score Column ---
    for score_column in all_score_columns:
        print(f"\n{'-'*25} Analyzing: {score_column} {'-'*25}")

        try:
            # --- 6a. Clean Data (specific to this column) ---
            df_cleaned = df.dropna(subset=[score_column]).copy()

            if len(df_cleaned) == 0:
                print(f"Skipping {score_column}: All values are NaN.")
                continue

            # --- 6b. Apply Peak-Finding Logic ---
            print("Applying peak-finding logic to all segments...")

            # This is the core step. We group, select the score column,
            # and apply our custom function to each segment (which is a Series).
            predicted_series = df_cleaned.groupby(group_keys)[score_column].apply(
                find_anomalies_for_segment_series,
                k_val=PEAK_K_VALUE,
                distance=PEAK_DISTANCE
            )

            # The result needs to be aligned back to the df_cleaned index
            df_cleaned['predicted_anomaly'] = predicted_series.reset_index(level=group_keys, drop=True)
            # Fill any potential NaNs from failed groups (should be rare)
            df_cleaned['predicted_anomaly'] = df_cleaned['predicted_anomaly'].fillna(0).astype(int)

            print("Prediction generation complete.")

            # --- 6c. Apply NTOL Logic to Predictions ---
            print("Applying NTOL to predictions...")
            dilated_pred_series = df_cleaned.groupby(group_keys)['predicted_anomaly'] \
                                            .rolling(window=window_size, center=True, min_periods=1) \
                                            .max()
            df_cleaned['y_pred_dilated'] = dilated_pred_series.reset_index(level=group_keys, drop=True) \
                                                              .fillna(0).astype(int)

            # --- 6d. Calculate Custom Metrics (NTOL) ---
            y_true = df_cleaned['is_anomaly']
            y_pred = df_cleaned['predicted_anomaly']
            y_true_dilated = df_cleaned['y_true_dilated']
            y_pred_dilated = df_cleaned['y_pred_dilated']

            TP = ((y_true == 1) & (y_pred_dilated == 1)).sum()
            FN = ((y_true == 1) & (y_pred_dilated == 0)).sum()
            FP = ((y_pred == 1) & (y_true_dilated == 0)).sum()
            TN = ((y_true == 0) & (y_pred_dilated == 0)).sum()

            precision = TP / (TP + FP) if (TP + FP) > 0 else 0
            recall = TP / (TP + FN) if (TP + FN) > 0 else 0
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

            # --- 6e. Print Individual Report ---
            print(f"--- Results for {score_column} ---")
            print(f"Total Actual Anomaly Points: {y_true.sum()}")
            print(f"Total Predicted Anomaly Points (Peaks): {y_pred.sum()}\n")

            print("Custom Confusion Matrix (Point-Adjusted):")
            print("                 Predicted Negative | Predicted Positive")
            print(f"Actual Negative: {TN:>17} | {FP:>17}")
            print(f"Actual Positive: {FN:>17} | {TP:>17}\n")

            print("Performance Metrics (Point-Adjusted):")
            print(f"  F1 Score:  {f1:.6f}")
            print(f"  Precision: {precision:.6f}")
            print(f"  Recall:    {recall:.6f}")

            # --- 6f. Store in Summary ---
            performance_summary.append({
                'score_column': score_column,
                'f1_score': f1,
                'precision': precision,
                'recall': recall,
                'TP': TP,
                'FN': FN,
                'FP': FP,
                'total_predicted_peaks': y_pred.sum()
            })

        except Exception as e:
            print(f"!!! Error processing {score_column}: {e} !!!", file=sys.stderr)
            import traceback
            traceback.print_exc()

    # --- 7. Print Final Summary Table ---
    print(f"\n\n{'-'*25} FINAL PEAK-BASED PERFORMANCE SUMMARY {'-'*25}")

    if performance_summary:
        summary_df = pd.DataFrame(performance_summary)
        summary_df = summary_df.sort_values(by='f1_score', ascending=False)
        summary_df = summary_df.set_index('score_column')

        print(summary_df.to_string(float_format="%.6f"))


    else:
        print("No score columns were successfully processed.")

    # Reset warnings
    warnings.filterwarnings('default')

# --- Run the function ---
if __name__ == "__main__":
    run_peak_based_analysis_for_all_scores()

In [9]:
# @title EVT a3
import pandas as pd
import numpy as np
from scipy.stats import genpareto
import sys
import warnings

def get_evt_threshold_for_segment(segment_scores, gate_quantile, final_quantile):
    """
    This is the core helper function that applies the EVT/POT
    logic to a single time series segment (passed as a pandas Series).

    1. Sets a "gate" threshold 'u' at the 'gate_quantile'.
    2. Collects all 'exceedances' (points > u).
    3. Tries to fit a Generalized Pareto Distribution (GPD) to these exceedances.
    4. Uses the fitted GPD to calculate the final, much rarer 'final_quantile' threshold.
    5. If the GPD fit fails or is unstable (e.g., too few exceedances),
       it safely falls back to just using the 'final_quantile' directly.
    """
    try:
        # --- 1. Set the "gate" threshold ---
        u = segment_scores.quantile(gate_quantile)

        # --- 2. Collect exceedances ---
        # We need to drop NaNs and any values <= u
        exceedances = segment_scores[segment_scores > u].dropna() - u

        # --- 5. Fallback Logic ---
        # If we have too few points (e.g., < 10) to fit a stable GPD,
        # or if all values are identical (nunique=1), fall back to a simple quantile.
        if len(exceedances) < 10 or exceedances.nunique() == 1:
            return segment_scores.quantile(final_quantile)

        # --- 3. Fit GPD ---
        # Fit the GPD to the exceedances.
        # We fix location 'floc=0' as exceedances start at 0.
        c, loc, scale = genpareto.fit(exceedances, floc=0)

        # --- 4. Calculate Final Threshold ---
        # We need to find the threshold 'T' such that P(X > T) = (1 - final_quantile)
        # T = u + GPD.isf( P(X > T) / P(X > u) )

        prob_gate = 1.0 - gate_quantile
        prob_final = 1.0 - final_quantile

        # Probability to find *within the tail*
        prob_target_in_tail = prob_final / prob_gate

        # If our target is "less extreme" than the gate, this is invalid.
        if prob_target_in_tail >= 1.0:
            return segment_scores.quantile(final_quantile)

        # Use Inverse Survival Function (isf) to find the exceedance value
        y_p = genpareto.isf(prob_target_in_tail, c, loc=loc, scale=scale)

        # The final threshold is the gate + the calculated exceedance
        threshold = u + y_p

        # Final sanity check for nan/inf
        if not np.isfinite(threshold):
            return segment_scores.quantile(final_quantile)

        return threshold

    except Exception:
        # If *anything* goes wrong (e.g., failed convergence), fall back.
        return segment_scores.quantile(final_quantile)


def run_evt_based_analysis_for_all_scores(file_path='/content/anomaly_scores_a3.csv'):
    """
    Main script to run an adaptive anomaly detection based on
    Extreme Value Theory (EVT) for EVERY 'anomalyscore_' column.
    """

    # --- 1. Configuration ---
    data_file = file_path

    # --- Evaluation Config ---
    NTOL = 3
    window_size = 2 * NTOL + 1

    # --- EVT Rule Set Config ---
    GATE_QUANTILE = 0.87  # 80th percentile "gate"
    FINAL_QUANTILE = 0.995 # 99.5th percentile we want to find

    print(f"Starting EXTREME VALUE THEORY (EVT) adaptive detection...")
    print(f"Using data file: {data_file}")
    print(f"Evaluation NTOL: {NTOL}")
    print(f"Rule: Gate at {GATE_QUANTILE}, calculate threshold for {FINAL_QUANTILE}\n")

    # Suppress warnings from scipy.stats.fit
    warnings.filterwarnings('ignore', category=Warning)

    try:
        # --- 2. Load Data ---
        df = pd.read_csv(data_file)
        if 'is_anomaly' not in df.columns:
            print("Error: 'is_anomaly' column not found.")
            return

    except FileNotFoundError:
        print(f"Error: The file '{data_file}' was not found.")
        return
    except Exception as e:
        print(f"An unexpected error occurred during loading: {e}")
        return

    # --- 3. Identify Score Columns ---
    all_score_columns = [col for col in df.columns if col.startswith('anomalyscore_')]
    #all_score_columns = [col for col in df.columns if col.startswith('H0_')]
    if not all_score_columns:
        print("Error: No columns found starting with 'anomalyscore_'.")
        return

    print(f"Found {len(all_score_columns)} score columns to analyze.")

    # --- 4. Initialize Summary ---
    performance_summary = []
    group_keys = ['loadeddataset', 'segment']

    # --- 5. Pre-calculate Dilated Ground Truth (Optimization) ---
    print("\nPre-calculating dilated ground truth...")
    dilated_true_series = df.groupby(group_keys)['is_anomaly'] \
                              .rolling(window=window_size, center=True, min_periods=1) \
                              .max()
    df['y_true_dilated'] = dilated_true_series.reset_index(level=group_keys, drop=True) \
                                              .fillna(0).astype(int)
    print("Dilated ground truth calculated.")

    # --- 6. Loop Through Each Score Column ---
    for score_column in all_score_columns:
        print(f"\n{'-'*25} Analyzing: {score_column} {'-'*25}")

        try:
            # --- 6a. Clean Data (specific to this column) ---
            df_cleaned = df.dropna(subset=[score_column]).copy()

            if len(df_cleaned) == 0:
                print(f"Skipping {score_column}: All values are NaN.")
                continue

            # --- 6b. Apply EVT Threshold Logic ---
            print("Calculating adaptive EVT thresholds...")

            # We use .transform() to apply our custom function to each group
            # 'transform' will pass the entire segment Series to our function
            df_cleaned['adaptive_threshold'] = df_cleaned.groupby(group_keys)[score_column] \
                                                         .transform(get_evt_threshold_for_segment,
                                                                    gate_quantile=GATE_QUANTILE,
                                                                    final_quantile=FINAL_QUANTILE)

            # --- 6c. Generate Predictions ---
            df_cleaned['predicted_anomaly'] = (df_cleaned[score_column] > df_cleaned['adaptive_threshold']).astype(int)

            print("Prediction generation complete.")

            # --- 6d. Apply NTOL Logic to Predictions ---
            print("Applying NTOL to predictions...")
            dilated_pred_series = df_cleaned.groupby(group_keys)['predicted_anomaly'] \
                                            .rolling(window=window_size, center=True, min_periods=1) \
                                            .max()
            df_cleaned['y_pred_dilated'] = dilated_pred_series.reset_index(level=group_keys, drop=True) \
                                                              .fillna(0).astype(int)

            # --- 6e. Calculate Custom Metrics (NTOL) ---
            y_true = df_cleaned['is_anomaly']
            y_pred = df_cleaned['predicted_anomaly']
            y_true_dilated = df_cleaned['y_true_dilated']
            y_pred_dilated = df_cleaned['y_pred_dilated']

            TP = ((y_true == 1) & (y_pred_dilated == 1)).sum()
            FN = ((y_true == 1) & (y_pred_dilated == 0)).sum()
            FP = ((y_pred == 1) & (y_true_dilated == 0)).sum()
            TN = ((y_true == 0) & (y_pred_dilated == 0)).sum()

            precision = TP / (TP + FP) if (TP + FP) > 0 else 0
            recall = TP / (TP + FN) if (TP + FN) > 0 else 0
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

            # --- 6f. Print Individual Report ---
            print(f"--- Results for {score_column} (EVT) ---")
            print(f"Total Actual Anomaly Points: {y_true.sum()}")
            print(f"Total Predicted Anomaly Points: {y_pred.sum()}\n")

            print("Custom Confusion Matrix (Point-Adjusted):")
            print("                 Predicted Negative | Predicted Positive")
            print(f"Actual Negative: {TN:>17} | {FP:>17}")
            print(f"Actual Positive: {FN:>17} | {TP:>17}\n")

            print("Performance Metrics (Point-Adjusted):")
            print(f"  F1 Score:  {f1:.6f}")
            print(f"  Precision: {precision:.6f}")
            print(f"  Recall:    {recall:.6f}")

            # --- 6g. Store in Summary ---
            performance_summary.append({
                'score_column': score_column,
                'f1_score': f1,
                'precision': precision,
                'recall': recall,
                'TP': TP,
                'FN': FN,
                'FP': FP,
                'total_predicted': y_pred.sum()
            })

        except Exception as e:
            # This will catch errors during the loop for one score
            # and allow the analysis to continue with the next score.
            print(f"!!! Error processing {score_column}: {e} !!!", file=sys.stderr)
            import traceback
            traceback.print_exc(file=sys.stderr)

    # --- 7. Print Final Summary Table ---
    print(f"\n\n{'-'*25} FINAL EVT (q={FINAL_QUANTILE}) PERFORMANCE SUMMARY {'-'*25}")

    if performance_summary:
        summary_df = pd.DataFrame(performance_summary)
        summary_df = summary_df.sort_values(by='f1_score', ascending=False)
        summary_df = summary_df.set_index('score_column')

        print(summary_df.to_string(float_format="%.6f"))



    else:
        print("No score columns were successfully processed.")

    # Reset warnings
    warnings.filterwarnings('default')

# --- Run the function ---
if __name__ == "__main__":
    run_evt_based_analysis_for_all_scores()

Starting EXTREME VALUE THEORY (EVT) adaptive detection...
Using data file: /content/anomaly_scores_a3.csv
Evaluation NTOL: 3
Rule: Gate at 0.87, calculate threshold for 0.995

Found 13 score columns to analyze.

Pre-calculating dilated ground truth...
Dilated ground truth calculated.

------------------------- Analyzing: anomalyscore_h0_auc -------------------------
Calculating adaptive EVT thresholds...
Prediction generation complete.
Applying NTOL to predictions...
--- Results for anomalyscore_h0_auc (EVT) ---
Total Actual Anomaly Points: 934
Total Predicted Anomaly Points: 893

Custom Confusion Matrix (Point-Adjusted):
                 Predicted Negative | Predicted Positive
Actual Negative:            160493 |                74
Actual Positive:               157 |               777

Performance Metrics (Point-Adjusted):
  F1 Score:  0.870588
  Precision: 0.913043
  Recall:    0.831906

------------------------- Analyzing: anomalyscore_h0_auc_over_max -------------------------
Calcu

  return datetime.utcnow().replace(tzinfo=utc)


In [None]:
# @title EVT a4
import pandas as pd
import numpy as np
from scipy.stats import genpareto
import sys
import warnings

def get_evt_threshold_for_segment(segment_scores, gate_quantile, final_quantile):
    """
    This is the core helper function that applies the EVT/POT
    logic to a single time series segment (passed as a pandas Series).

    1. Sets a "gate" threshold 'u' at the 'gate_quantile'.
    2. Collects all 'exceedances' (points > u).
    3. Tries to fit a Generalized Pareto Distribution (GPD) to these exceedances.
    4. Uses the fitted GPD to calculate the final, much rarer 'final_quantile' threshold.
    5. If the GPD fit fails or is unstable (e.g., too few exceedances),
       it safely falls back to just using the 'final_quantile' directly.
    """
    try:
        # --- 1. Set the "gate" threshold ---
        u = segment_scores.quantile(gate_quantile)

        # --- 2. Collect exceedances ---
        # We need to drop NaNs and any values <= u
        exceedances = segment_scores[segment_scores > u].dropna() - u

        # --- 5. Fallback Logic ---
        # If we have too few points (e.g., < 10) to fit a stable GPD,
        # or if all values are identical (nunique=1), fall back to a simple quantile.
        if len(exceedances) < 10 or exceedances.nunique() == 1:
            return segment_scores.quantile(final_quantile)

        # --- 3. Fit GPD ---
        # Fit the GPD to the exceedances.
        # We fix location 'floc=0' as exceedances start at 0.
        c, loc, scale = genpareto.fit(exceedances, floc=0)

        # --- 4. Calculate Final Threshold ---
        # We need to find the threshold 'T' such that P(X > T) = (1 - final_quantile)
        # T = u + GPD.isf( P(X > T) / P(X > u) )

        prob_gate = 1.0 - gate_quantile
        prob_final = 1.0 - final_quantile

        # Probability to find *within the tail*
        prob_target_in_tail = prob_final / prob_gate

        # If our target is "less extreme" than the gate, this is invalid.
        if prob_target_in_tail >= 1.0:
            return segment_scores.quantile(final_quantile)

        # Use Inverse Survival Function (isf) to find the exceedance value
        y_p = genpareto.isf(prob_target_in_tail, c, loc=loc, scale=scale)

        # The final threshold is the gate + the calculated exceedance
        threshold = u + y_p

        # Final sanity check for nan/inf
        if not np.isfinite(threshold):
            return segment_scores.quantile(final_quantile)

        return threshold

    except Exception:
        # If *anything* goes wrong (e.g., failed convergence), fall back.
        return segment_scores.quantile(final_quantile)


def run_evt_based_analysis_for_all_scores(file_path='/content/anomaly_scores_a4.csv'):
    """
    Main script to run an adaptive anomaly detection based on
    Extreme Value Theory (EVT) for EVERY 'anomalyscore_' column.
    """

    # --- 1. Configuration ---
    data_file = file_path

    # --- Evaluation Config ---
    NTOL = 3
    window_size = 2 * NTOL + 1

    # --- EVT Rule Set Config ---
    GATE_QUANTILE = 0.78  # 80th percentile "gate"
    FINAL_QUANTILE = 0.993 # 99.3th percentile we want to find

    print(f"Starting EXTREME VALUE THEORY (EVT) adaptive detection...")
    print(f"Using data file: {data_file}")
    print(f"Evaluation NTOL: {NTOL}")
    print(f"Rule: Gate at {GATE_QUANTILE}, calculate threshold for {FINAL_QUANTILE}\n")

    # Suppress warnings from scipy.stats.fit
    warnings.filterwarnings('ignore', category=Warning)

    try:
        # --- 2. Load Data ---
        df = pd.read_csv(data_file)
        if 'is_anomaly' not in df.columns:
            print("Error: 'is_anomaly' column not found.")
            return

    except FileNotFoundError:
        print(f"Error: The file '{data_file}' was not found.")
        return
    except Exception as e:
        print(f"An unexpected error occurred during loading: {e}")
        return

    # --- 3. Identify Score Columns ---
    all_score_columns = [col for col in df.columns if col.startswith('anomalyscore_')]

    if not all_score_columns:
        print("Error: No columns found starting with 'anomalyscore_'.")
        return

    print(f"Found {len(all_score_columns)} score columns to analyze.")

    # --- 4. Initialize Summary ---
    performance_summary = []
    group_keys = ['loadeddataset', 'segment']

    # --- 5. Pre-calculate Dilated Ground Truth (Optimization) ---
    print("\nPre-calculating dilated ground truth...")
    dilated_true_series = df.groupby(group_keys)['is_anomaly'] \
                              .rolling(window=window_size, center=True, min_periods=1) \
                              .max()
    df['y_true_dilated'] = dilated_true_series.reset_index(level=group_keys, drop=True) \
                                              .fillna(0).astype(int)
    print("Dilated ground truth calculated.")

    # --- 6. Loop Through Each Score Column ---
    for score_column in all_score_columns:
        print(f"\n{'-'*25} Analyzing: {score_column} {'-'*25}")

        try:
            # --- 6a. Clean Data (specific to this column) ---
            df_cleaned = df.dropna(subset=[score_column]).copy()

            if len(df_cleaned) == 0:
                print(f"Skipping {score_column}: All values are NaN.")
                continue

            # --- 6b. Apply EVT Threshold Logic ---
            print("Calculating adaptive EVT thresholds...")

            # We use .transform() to apply our custom function to each group
            # 'transform' will pass the entire segment Series to our function
            df_cleaned['adaptive_threshold'] = df_cleaned.groupby(group_keys)[score_column] \
                                                         .transform(get_evt_threshold_for_segment,
                                                                    gate_quantile=GATE_QUANTILE,
                                                                    final_quantile=FINAL_QUANTILE)

            # --- 6c. Generate Predictions ---
            df_cleaned['predicted_anomaly'] = (df_cleaned[score_column] > df_cleaned['adaptive_threshold']).astype(int)

            print("Prediction generation complete.")

            # --- 6d. Apply NTOL Logic to Predictions ---
            print("Applying NTOL to predictions...")
            dilated_pred_series = df_cleaned.groupby(group_keys)['predicted_anomaly'] \
                                            .rolling(window=window_size, center=True, min_periods=1) \
                                            .max()
            df_cleaned['y_pred_dilated'] = dilated_pred_series.reset_index(level=group_keys, drop=True) \
                                                              .fillna(0).astype(int)

            # --- 6e. Calculate Custom Metrics (NTOL) ---
            y_true = df_cleaned['is_anomaly']
            y_pred = df_cleaned['predicted_anomaly']
            y_true_dilated = df_cleaned['y_true_dilated']
            y_pred_dilated = df_cleaned['y_pred_dilated']

            TP = ((y_true == 1) & (y_pred_dilated == 1)).sum()
            FN = ((y_true == 1) & (y_pred_dilated == 0)).sum()
            FP = ((y_pred == 1) & (y_true_dilated == 0)).sum()
            TN = ((y_true == 0) & (y_pred_dilated == 0)).sum()

            precision = TP / (TP + FP) if (TP + FP) > 0 else 0
            recall = TP / (TP + FN) if (TP + FN) > 0 else 0
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

            # --- 6f. Print Individual Report ---
            print(f"--- Results for {score_column} (EVT) ---")
            print(f"Total Actual Anomaly Points: {y_true.sum()}")
            print(f"Total Predicted Anomaly Points: {y_pred.sum()}\n")

            print("Custom Confusion Matrix (Point-Adjusted):")
            print("                 Predicted Negative | Predicted Positive")
            print(f"Actual Negative: {TN:>17} | {FP:>17}")
            print(f"Actual Positive: {FN:>17} | {TP:>17}\n")

            print("Performance Metrics (Point-Adjusted):")
            print(f"  F1 Score:  {f1:.6f}")
            print(f"  Precision: {precision:.6f}")
            print(f"  Recall:    {recall:.6f}")

            # --- 6g. Store in Summary ---
            performance_summary.append({
                'score_column': score_column,
                'f1_score': f1,
                'precision': precision,
                'recall': recall,
                'TP': TP,
                'FN': FN,
                'FP': FP,
                'total_predicted': y_pred.sum()
            })

        except Exception as e:
            # This will catch errors during the loop for one score
            # and allow the analysis to continue with the next score.
            print(f"!!! Error processing {score_column}: {e} !!!", file=sys.stderr)
            import traceback
            traceback.print_exc(file=sys.stderr)

    # --- 7. Print Final Summary Table ---
    print(f"\n\n{'-'*25} FINAL EVT (q={FINAL_QUANTILE}) PERFORMANCE SUMMARY {'-'*25}")

    if performance_summary:
        summary_df = pd.DataFrame(performance_summary)
        summary_df = summary_df.sort_values(by='f1_score', ascending=False)
        summary_df = summary_df.set_index('score_column')

        print(summary_df.to_string(float_format="%.6f"))



    else:
        print("No score columns were successfully processed.")

    # Reset warnings
    warnings.filterwarnings('default')

# --- Run the function ---
if __name__ == "__main__":
    run_evt_based_analysis_for_all_scores()

In [None]:
# @title finding the best parameters
import pandas as pd
import numpy as np
from scipy.stats import genpareto
import sys
import warnings

def get_evt_threshold_for_segment(segment_scores, gate_quantile, final_quantile):
    """
    This is the core helper function that applies the EVT/POT
    logic to a single time series segment (passed as a pandas Series).

    1. Sets a "gate" threshold 'u' at the 'gate_quantile'.
    2. Collects all 'exceedances' (points > u).
    3. Tries to fit a Generalized Pareto Distribution (GPD) to these exceedances.
    4. Uses the fitted GPD to calculate the final, much rarer 'final_quantile' threshold.
    5. If the GPD fit fails or is unstable (e.g., too few exceedances),
       it safely falls back to just using the 'final_quantile' directly.
    """
    try:
        # --- 1. Set the "gate" threshold ---
        u = segment_scores.quantile(gate_quantile)

        # --- 2. Collect exceedances ---
        # We need to drop NaNs and any values <= u
        exceedances = segment_scores[segment_scores > u].dropna() - u

        # --- 5. Fallback Logic ---
        # If we have too few points (e.g., < 10) to fit a stable GPD,
        # or if all values are identical (nunique=1), fall back to a simple quantile.
        if len(exceedances) < 10 or exceedances.nunique() <= 1:
            return segment_scores.quantile(final_quantile)

        # --- 3. Fit GPD ---
        # Fit the GPD to the exceedances.
        # We fix location 'floc=0' as exceedances start at 0.
        c, loc, scale = genpareto.fit(exceedances, floc=0)

        # --- 4. Calculate Final Threshold ---
        # T = u + GPD.isf( P(X > T) / P(X > u) )

        prob_gate = 1.0 - gate_quantile
        prob_final = 1.0 - final_quantile

        # Probability to find *within the tail*
        prob_target_in_tail = prob_final / prob_gate

        # If our target is "less extreme" than the gate, this is invalid.
        if prob_target_in_tail >= 1.0 or prob_target_in_tail <= 0:
            return segment_scores.quantile(final_quantile)

        # Use Inverse Survival Function (isf) to find the exceedance value
        y_p = genpareto.isf(prob_target_in_tail, c, loc=loc, scale=scale)

        # The final threshold is the gate + the calculated exceedance
        threshold = u + y_p

        # Final sanity check for nan/inf or negative thresholds
        if not np.isfinite(threshold) or threshold < 0:
            return segment_scores.quantile(final_quantile)

        return threshold

    except Exception:
        # If *anything* goes wrong (e.g., failed convergence), fall back.
        return segment_scores.quantile(final_quantile)


def run_evt_grid_search(file_path='anomaly_scores_a3.csv'):
    """
    Main script to run a grid search for the best EVT parameters
    for EVERY 'anomalyscore_' column.

    MODIFIED: This version only prints the *single best* feature
    for each parameter set, instead of all 13.
    """

    # --- 1. Configuration ---
    # Using the original, correct filename
    data_file = file_path

    # --- Evaluation Config ---
    NTOL = 3
    window_size = 2 * NTOL + 1

    # --- EVT Grid Search Parameters ---
    GATE_QUANTILE_LIST = [0.75,0.77,0.79,0.80,0.81,0.83,0.85,0.86,0.87,0.88,0.89,0.90,0.91,0.93,0.95,0.99]
    FINAL_QUANTILE_LIST = [ 0.995]

    print(f"Starting EVT PARAMETER GRID SEARCH...")
    print(f"Using data file: {data_file}")
    print(f"Evaluation NTOL: {NTOL}")
    print(f"Gate Quantiles to test: {GATE_QUANTILE_LIST}")
    print(f"Final Quantiles to test: {FINAL_QUANTILE_LIST}\n")

    # Suppress warnings from scipy.stats.fit
    warnings.filterwarnings('ignore', category=Warning)

    try:
        # --- 2. Load Data ---
        df = pd.read_csv(data_file)
        if 'is_anomaly' not in df.columns:
            print("Error: 'is_anomaly' column not found.")
            return

    except FileNotFoundError:
        print(f"Error: The file '{data_file}' was not found.")
        return
    except Exception as e:
        print(f"An unexpected error occurred during loading: {e}")
        return

    # --- 3. Identify Score Columns ---
    all_score_columns = [col for col in df.columns if col.startswith('anomalyscore_')]

    if not all_score_columns:
        print("Error: No columns found starting with 'anomalyscore_'.")
        return

    print(f"Found {len(all_score_columns)} score columns to analyze.")

    # --- 4. Initialize Summary ---
    master_performance_summary = []

    group_keys = ['loadeddataset', 'segment']

    # --- 5. Pre-calculate Dilated Ground Truth (Optimization) ---
    print("\nPre-calculating dilated ground truth...")
    dilated_true_series = df.groupby(group_keys)['is_anomaly'] \
                              .rolling(window=window_size, center=True, min_periods=1) \
                              .max()
    # This is the line you provided, ensures robust handling
    df['y_true_dilated'] = dilated_true_series.reset_index(level=group_keys, drop=True) \
                                              .fillna(0).astype(int)
    print("Dilated ground truth calculated.")

    # --- 6. Run the Grid Search Loop ---

    # This list will store only the single best result from each set
    best_results_by_param_set = []

    for gate_q in GATE_QUANTILE_LIST:
        for final_q in FINAL_QUANTILE_LIST:

            print(f"\n--- Analyzing Parameter Set: GateQ={gate_q}, FinalQ={final_q} ---")

            # This list will hold the 13 results for this *specific* parameter set
            current_param_set_results = []

            for score_column in all_score_columns:

                # Removed the chatty "Analyzing..." print here

                try:
                    # --- 6a. Clean Data ---
                    df_cleaned = df.dropna(subset=[score_column]).copy()

                    if len(df_cleaned) == 0:
                        continue # Skip this score if it's all NaN

                    # --- 6b. Apply EVT Threshold Logic ---
                    df_cleaned['adaptive_threshold'] = df_cleaned.groupby(group_keys)[score_column] \
                                                                 .transform(get_evt_threshold_for_segment,
                                                                            gate_quantile=gate_q,
                                                                            final_quantile=final_q)

                    # --- 6c. Generate Predictions ---
                    df_cleaned['predicted_anomaly'] = (df_cleaned[score_column] > df_cleaned['adaptive_threshold']).astype(int)

                    # --- 6d. Apply NTOL Logic to Predictions ---
                    dilated_pred_series = df_cleaned.groupby(group_keys)['predicted_anomaly'] \
                                                    .rolling(window=window_size, center=True, min_periods=1) \
                                                    .max()
                    df_cleaned['y_pred_dilated'] = dilated_pred_series.reset_index(level=group_keys, drop=True) \
                                                                      .fillna(0).astype(int)

                    # --- 6e. Calculate Custom Metrics (NTOL) ---
                    y_true = df_cleaned['is_anomaly']
                    y_pred = df_cleaned['predicted_anomaly']
                    y_true_dilated = df_cleaned['y_true_dilated']
                    y_pred_dilated = df_cleaned['y_pred_dilated']

                    TP = ((y_true == 1) & (y_pred_dilated == 1)).sum()
                    FN = ((y_true == 1) & (y_pred_dilated == 0)).sum()
                    FP = ((y_pred == 1) & (y_true_dilated == 0)).sum()
                    TN = ((y_true == 0) & (y_pred_dilated == 0)).sum()

                    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
                    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
                    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

                    # --- 6f. Store in *temporary* and *master* lists ---
                    result_details = {
                        'score_column': score_column,
                        'gate_quantile': gate_q,
                        'final_quantile': final_q,
                        'f1_score': f1,
                        'precision': precision,
                        'recall': recall,
                        'TP': TP,
                        'FN': FN,
                        'FP': FP,
                        'total_predicted': y_pred.sum()
                    }
                    current_param_set_results.append(result_details)
                    master_performance_summary.append(result_details)

                except Exception as e:
                    print(f"!!! Error processing {score_column} with G={gate_q}, F={final_q}: {e} !!!", file=sys.stderr)
                    import traceback
                    traceback.print_exc(file=sys.stderr)

            # --- End of score_column loop ---
            # Now, find the best result *from this parameter set* and print it
            if current_param_set_results:
                best_for_set = max(current_param_set_results, key=lambda x: x['f1_score'])
                best_results_by_param_set.append(best_for_set) # Store for final summary

                print(f"    -> Best F1 for this set: {best_for_set['f1_score']:.6f} "
                      f"(using {best_for_set['score_column']})")
            else:
                print("    -> No results for this parameter set.")

    # --- 7. Print Final Summary Table ---
    print(f"\n\n{'-'*25} FINAL EVT GRID SEARCH SUMMARY {'-'*25}")

    if master_performance_summary:
        # Create DataFrames from our collected results
        full_summary_df = pd.DataFrame(master_performance_summary)
        best_set_summary_df = pd.DataFrame(best_results_by_param_set)

        # Sort the "best of each set" list to find the overall winners
        best_set_summary_df = best_set_summary_df.sort_values(by='f1_score', ascending=False)

        print("\n--- TOP 20 BEST PARAMETER SETS (and their best feature) ---")
        print(best_set_summary_df.head(20).to_string(float_format="%.6f",
                                                    columns=['gate_quantile', 'final_quantile', 'score_column', 'f1_score', 'precision', 'recall']))

        print("\n\n--- !!! BEST OVERALL RESULT !!! ---")
        best_overall = best_set_summary_df.iloc[0]
        print(best_overall)


    else:
        print("No combinations were successfully processed.")

    # Reset warnings
    warnings.filterwarnings('default')

# --- Run the function ---
if __name__ == "__main__":
    run_evt_grid_search()

In [None]:
# @title finding the best parameters
import pandas as pd
import numpy as np
from scipy.stats import genpareto
import sys
import warnings

def get_evt_threshold_for_segment(segment_scores, gate_quantile, final_quantile):
    """
    This is the core helper function that applies the EVT/POT
    logic to a single time series segment (passed as a pandas Series).

    1. Sets a "gate" threshold 'u' at the 'gate_quantile'.
    2. Collects all 'exceedances' (points > u).
    3. Tries to fit a Generalized Pareto Distribution (GPD) to these exceedances.
    4. Uses the fitted GPD to calculate the final, much rarer 'final_quantile' threshold.
    5. If the GPD fit fails or is unstable (e.g., too few exceedances),
       it safely falls back to just using the 'final_quantile' directly.
    """
    try:
        # --- 1. Set the "gate" threshold ---
        u = segment_scores.quantile(gate_quantile)

        # --- 2. Collect exceedances ---
        # We need to drop NaNs and any values <= u
        exceedances = segment_scores[segment_scores > u].dropna() - u

        # --- 5. Fallback Logic ---
        # If we have too few points (e.g., < 10) to fit a stable GPD,
        # or if all values are identical (nunique=1), fall back to a simple quantile.
        if len(exceedances) < 10 or exceedances.nunique() <= 1:
            return segment_scores.quantile(final_quantile)

        # --- 3. Fit GPD ---
        # Fit the GPD to the exceedances.
        # We fix location 'floc=0' as exceedances start at 0.
        c, loc, scale = genpareto.fit(exceedances, floc=0)

        # --- 4. Calculate Final Threshold ---
        # T = u + GPD.isf( P(X > T) / P(X > u) )

        prob_gate = 1.0 - gate_quantile
        prob_final = 1.0 - final_quantile

        # Probability to find *within the tail*
        prob_target_in_tail = prob_final / prob_gate

        # If our target is "less extreme" than the gate, this is invalid.
        if prob_target_in_tail >= 1.0 or prob_target_in_tail <= 0:
            return segment_scores.quantile(final_quantile)

        # Use Inverse Survival Function (isf) to find the exceedance value
        y_p = genpareto.isf(prob_target_in_tail, c, loc=loc, scale=scale)

        # The final threshold is the gate + the calculated exceedance
        threshold = u + y_p

        # Final sanity check for nan/inf or negative thresholds
        if not np.isfinite(threshold) or threshold < 0:
            return segment_scores.quantile(final_quantile)

        return threshold

    except Exception:
        # If *anything* goes wrong (e.g., failed convergence), fall back.
        return segment_scores.quantile(final_quantile)


def run_evt_grid_search(file_path='anomaly_scores_a4.csv'):
    """
    Main script to run a grid search for the best EVT parameters
    for EVERY 'anomalyscore_' column.

    MODIFIED: This version only prints the *single best* feature
    for each parameter set, instead of all 13.
    """

    # --- 1. Configuration ---
    # Using the original, correct filename
    data_file = file_path

    # --- Evaluation Config ---
    NTOL = 3
    window_size = 2 * NTOL + 1

    # --- EVT Grid Search Parameters ---
    GATE_QUANTILE_LIST = np.round(np.arange(0.60, 0.901, 0.01), 2)
    FINAL_QUANTILE_LIST = np.round(np.arange(0.980, 0.9971, 0.001), 3)

    print(f"Starting EVT PARAMETER GRID SEARCH...")
    print(f"Using data file: {data_file}")
    print(f"Evaluation NTOL: {NTOL}")
    print(f"Gate Quantiles to test: {GATE_QUANTILE_LIST}")
    print(f"Final Quantiles to test: {FINAL_QUANTILE_LIST}\n")

    # Suppress warnings from scipy.stats.fit
    warnings.filterwarnings('ignore', category=Warning)

    try:
        # --- 2. Load Data ---
        df = pd.read_csv(data_file)
        if 'is_anomaly' not in df.columns:
            print("Error: 'is_anomaly' column not found.")
            return

    except FileNotFoundError:
        print(f"Error: The file '{data_file}' was not found.")
        return
    except Exception as e:
        print(f"An unexpected error occurred during loading: {e}")
        return

    # --- 3. Identify Score Columns ---
    all_score_columns = [col for col in df.columns if col.startswith('anomalyscore_')]

    if not all_score_columns:
        print("Error: No columns found starting with 'anomalyscore_'.")
        return

    print(f"Found {len(all_score_columns)} score columns to analyze.")

    # --- 4. Initialize Summary ---
    master_performance_summary = []

    group_keys = ['loadeddataset', 'segment']

    # --- 5. Pre-calculate Dilated Ground Truth (Optimization) ---
    print("\nPre-calculating dilated ground truth...")
    dilated_true_series = df.groupby(group_keys)['is_anomaly'] \
                              .rolling(window=window_size, center=True, min_periods=1) \
                              .max()
    # This is the line you provided, ensures robust handling
    df['y_true_dilated'] = dilated_true_series.reset_index(level=group_keys, drop=True) \
                                              .fillna(0).astype(int)
    print("Dilated ground truth calculated.")

    # --- 6. Run the Grid Search Loop ---

    # This list will store only the single best result from each set
    best_results_by_param_set = []

    for gate_q in GATE_QUANTILE_LIST:
        for final_q in FINAL_QUANTILE_LIST:

            print(f"\n--- Analyzing Parameter Set: GateQ={gate_q}, FinalQ={final_q} ---")

            # This list will hold the 13 results for this *specific* parameter set
            current_param_set_results = []

            for score_column in all_score_columns:

                # Removed the chatty "Analyzing..." print here

                try:
                    # --- 6a. Clean Data ---
                    df_cleaned = df.dropna(subset=[score_column]).copy()

                    if len(df_cleaned) == 0:
                        continue # Skip this score if it's all NaN

                    # --- 6b. Apply EVT Threshold Logic ---
                    df_cleaned['adaptive_threshold'] = df_cleaned.groupby(group_keys)[score_column] \
                                                                 .transform(get_evt_threshold_for_segment,
                                                                            gate_quantile=gate_q,
                                                                            final_quantile=final_q)

                    # --- 6c. Generate Predictions ---
                    df_cleaned['predicted_anomaly'] = (df_cleaned[score_column] > df_cleaned['adaptive_threshold']).astype(int)

                    # --- 6d. Apply NTOL Logic to Predictions ---
                    dilated_pred_series = df_cleaned.groupby(group_keys)['predicted_anomaly'] \
                                                    .rolling(window=window_size, center=True, min_periods=1) \
                                                    .max()
                    df_cleaned['y_pred_dilated'] = dilated_pred_series.reset_index(level=group_keys, drop=True) \
                                                                      .fillna(0).astype(int)

                    # --- 6e. Calculate Custom Metrics (NTOL) ---
                    y_true = df_cleaned['is_anomaly']
                    y_pred = df_cleaned['predicted_anomaly']
                    y_true_dilated = df_cleaned['y_true_dilated']
                    y_pred_dilated = df_cleaned['y_pred_dilated']

                    TP = ((y_true == 1) & (y_pred_dilated == 1)).sum()
                    FN = ((y_true == 1) & (y_pred_dilated == 0)).sum()
                    FP = ((y_pred == 1) & (y_true_dilated == 0)).sum()
                    TN = ((y_true == 0) & (y_pred_dilated == 0)).sum()

                    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
                    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
                    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

                    # --- 6f. Store in *temporary* and *master* lists ---
                    result_details = {
                        'score_column': score_column,
                        'gate_quantile': gate_q,
                        'final_quantile': final_q,
                        'f1_score': f1,
                        'precision': precision,
                        'recall': recall,
                        'TP': TP,
                        'FN': FN,
                        'FP': FP,
                        'total_predicted': y_pred.sum()
                    }
                    current_param_set_results.append(result_details)
                    master_performance_summary.append(result_details)

                except Exception as e:
                    print(f"!!! Error processing {score_column} with G={gate_q}, F={final_q}: {e} !!!", file=sys.stderr)
                    import traceback
                    traceback.print_exc(file=sys.stderr)

            # --- End of score_column loop ---
            # Now, find the best result *from this parameter set* and print it
            if current_param_set_results:
                best_for_set = max(current_param_set_results, key=lambda x: x['f1_score'])
                best_results_by_param_set.append(best_for_set) # Store for final summary

                print(f"    -> Best F1 for this set: {best_for_set['f1_score']:.6f} "
                      f"(using {best_for_set['score_column']})")
            else:
                print("    -> No results for this parameter set.")

    # --- 7. Print Final Summary Table ---
    print(f"\n\n{'-'*25} FINAL EVT GRID SEARCH SUMMARY {'-'*25}")

    if master_performance_summary:
        # Create DataFrames from our collected results
        full_summary_df = pd.DataFrame(master_performance_summary)
        best_set_summary_df = pd.DataFrame(best_results_by_param_set)

        # Sort the "best of each set" list to find the overall winners
        best_set_summary_df = best_set_summary_df.sort_values(by='f1_score', ascending=False)

        print("\n--- TOP 20 BEST PARAMETER SETS (and their best feature) ---")
        print(best_set_summary_df.head(20).to_string(float_format="%.6f",
                                                    columns=['gate_quantile', 'final_quantile', 'score_column', 'f1_score', 'precision', 'recall']))

        print("\n\n--- !!! BEST OVERALL RESULT !!! ---")
        best_overall = best_set_summary_df.iloc[0]
        print(best_overall)


    else:
        print("No combinations were successfully processed.")

    # Reset warnings
    warnings.filterwarnings('default')

# --- Run the function ---
if __name__ == "__main__":
    run_evt_grid_search()

In [None]:
# @title a sample of auc
import numpy as np
import matplotlib.pyplot as plt

def tri_hat(b, d, t):
    m = (b + d) / 2.0
    h = (d - b) / 2.0
    y = np.zeros_like(t, dtype=float)
    left  = (t >= b) & (t <= m)
    right = (t > m) & (t <= d)
    y[left]  = (t[left]  - b) / (m - b) * h
    y[right] = (d - t[right]) / (d - m) * h
    return y

bars = [(0,4), (10,12)]
t = np.linspace(-1, 13, 2000)
hats = [tri_hat(b, d, t) for (b, d) in bars]
envelope = np.max(np.vstack(hats), axis=0)

plt.figure(figsize=(7,3))
for (b,d), h in zip(bars, hats):
    plt.plot(t, h, linewidth=2, label=f"hat({b},{d})")
plt.plot(t, envelope, 'k--', linewidth=2, label="max-envelope $\Lambda(t)$")
plt.xlabel("filtration parameter $t$")
plt.ylabel("height")
plt.legend()
plt.tight_layout()
plt.savefig("toy_auc_l2.png", dpi=300)
plt.show()
