<a href="https://colab.research.google.com/github/korkutanapa/ANOMALY_DETECTION_TDA_YAHOO_DATASET/blob/main/TDA_CODES_for_NAB_ORIGINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import shutil
import glob
import numpy as np
import pandas as pd

# ============================================================
# 1. CLEAN START & CLONE NAB
# ============================================================
print("--- 1. CLEAN START ---")
# Start from /content so the clone below always lands in the same place.
os.chdir("/content")

# Remove old NAB clone if exists
if os.path.exists("NAB"):
    shutil.rmtree("NAB")

# Clone NAB repository
!git clone https://github.com/numenta/NAB.git

# Install ripser for TDA
!pip install -q ripser

# Every relative path used from here on (config/, my_algo.py) resolves inside the clone.
os.chdir("/content/NAB")

# Ensure config folder + empty thresholds.json (optimize will fill it)
# An existing thresholds.json is left untouched; only a missing one is seeded with "{}".
os.makedirs("config", exist_ok=True)
thr_path = os.path.join("config", "thresholds.json")
if not os.path.exists(thr_path):
    with open(thr_path, "w") as f:
        f.write("{}")


# ============================================================
# 3. WRITE TDA_VEAD_METHOD (my_algo.py)
# ============================================================
print("--- 3. WRITING TDA_VEAD_METHOD DETECTOR ---")

tda_code = """
import os
import glob
import numpy as np
import pandas as pd
from ripser import ripser
import warnings

warnings.filterwarnings("ignore")

# Used for the results sub-directory and as the prefix of every output file name.
DETECTOR_NAME = "TDA_VEAD_Method"
# Directory (relative to the working directory) searched recursively for input CSVs.
INPUT_DIR = "data"
OUTPUT_DIR = os.path.join("results", DETECTOR_NAME)

# Number of samples in each sliding window handed to the feature extractor.
WINDOW_SIZE = 20
# Delay and embedding dimension passed to takens_embed().
TAU = 1
DIMENSION = 6
# Quantities at or below this value are treated as zero before dividing by them.
_EPS = 1e-12

# ==========================================================
# 1. TDA FEATURE NAMES (H0-BASED)
# ==========================================================
# Keys of the dict returned by compute_h0_features_for_window().
# Order matters: run() lets the user pick a feature by its position in this list.
FEATURE_NAMES = [
    "H0_ratio_auc_L1_to_sum",
    "H0_ratio_auc_to_max",
    "H0_ratio_auc_to_l2",
    "H0_bottleneck",
    "tail_share_q90",
    "H0_sum_centroid",
    "H0_L2_norm",
    "PETE_p1.6_q0.5",
    "H0_energy_concentration",
    "H0_dominance_share",
    "H0_tail_curvature_80_90",
    "H0_centroid_to_energy",
    "H0_gini",
]


# ==========================================================
# 2. TDA UTILITY FUNCTIONS (ADAPTED FROM YOUR CODE)
# ==========================================================
def takens_embed(window: np.ndarray, tau: int, m: int) -> np.ndarray:
    '''
    Build the delay (Takens) embedding of a 1D window.

    Row i of the result is the point
    (window[i], window[i + tau], ..., window[i + (m - 1) * tau]).

    Args:
        window: 1D sequence of length N.
        tau: delay between consecutive coordinates (positive integer).
        m: embedding dimension, i.e. number of coordinates per point.

    Returns:
        Array of shape (L, m) with L = N - (m - 1) * tau, one embedded point
        per row, or None if the window is too short (L <= 0).
    '''
    L = len(window) - (m - 1) * tau
    if L <= 0:
        return None
    # Column j is the window shifted by j * tau. Every column has exactly L
    # entries, so the stack stays rectangular for any tau >= 1 (the previous
    # strided slice produced ragged columns, and wrong points, when tau > 1).
    return np.stack([window[j * tau : j * tau + L] for j in range(m)], axis=1)


def _clean_diag_h(diag_h):
    '''
    Normalise a persistence diagram to a float array of shape (k, 2).

    Rows containing a non-finite entry and rows whose death is not strictly
    greater than their birth are dropped. None, an input that is not a
    non-empty two-column 2D array, or an input with no surviving rows gives
    an empty (0, 2) array.
    '''
    nothing = np.empty((0, 2), dtype=float)
    if diag_h is None:
        return nothing

    pairs = np.asarray(diag_h, dtype=float)
    if pairs.ndim != 2 or pairs.shape[1] != 2 or pairs.size == 0:
        return nothing

    # Discard bars with an infinite or NaN endpoint.
    pairs = pairs[np.isfinite(pairs).all(axis=1)]
    if pairs.size == 0:
        return nothing

    births = pairs[:, 0]
    deaths = pairs[:, 1]
    keep = np.isfinite(deaths) & (deaths > births)
    if not keep.any():
        return nothing
    return np.column_stack((births[keep], deaths[keep]))


# Trapezoid-rule integrator: use np.trapezoid when this NumPy provides it,
# otherwise fall back to np.trapz.
_trapz = np.trapezoid if hasattr(np, "trapezoid") else np.trapz


def _lifetimes(arr):
    '''
    Return death minus birth for every row of a (k, 2) diagram, floored at 0.

    An empty input gives an empty 1D float array.
    '''
    if not arr.size:
        return np.empty((0,), float)
    births = arr[:, 0]
    deaths = arr[:, 1]
    return np.maximum(deaths - births, 0.0)


def _bottleneck_amp(arr):
    '''Return the largest lifetime in the diagram as a float (0.0 when empty).'''
    spans = _lifetimes(arr)
    if spans.size == 0:
        return 0.0
    return float(spans.max())


def h0_l2_norm(arr):
    '''Return the Euclidean norm of the diagram's lifetimes (0.0 when empty).'''
    spans = _lifetimes(arr)
    if not spans.size:
        return 0.0
    sum_of_squares = np.sum(spans ** 2)
    return float(np.sqrt(sum_of_squares))


def _auc_tri_max(arr):
    '''
    Area under the upper envelope of the diagram's triangle ("tent") functions.

    Each (birth, death) row defines a triangle over [birth, death] that peaks
    at the midpoint with height (death - birth) / 2. The pointwise maximum of
    all triangles is sampled on a 64-point grid spanning the smallest birth to
    the largest death and integrated with the trapezoid rule. A single bar is
    handled in closed form; an empty diagram gives 0.0.
    '''
    if arr.size == 0:
        return 0.0
    if arr.shape[0] == 1:
        # Exact area of one triangle: 0.5 * base * height with height = base / 2.
        return 0.25 * ((arr[0, 1] - arr[0, 0])**2)

    births, deaths = arr[:, 0], arr[:, 1]

    # Simple grid approximation for AUC
    grid = np.linspace(float(np.min(births)), float(np.max(deaths)), num=64)
    envelope = np.zeros_like(grid, float)

    for birth, death in zip(births, deaths):
        half = 0.5 * (death - birth)
        if half <= 0:
            continue
        mid = 0.5 * (birth + death)

        inside = (grid >= birth) & (grid <= death)
        if not inside.any():
            continue

        # Rising edge: from the birth up to the midpoint.
        rising = inside & (grid <= mid)
        if rising.any():
            ramp_up = (grid[rising] - birth) * (half / (mid - birth + _EPS))
            envelope[rising] = np.maximum(envelope[rising], ramp_up)

        # Falling edge: from the midpoint down to the death.
        falling = inside & (grid > mid)
        if falling.any():
            ramp_down = (death - grid[falling]) * (half / (death - mid + _EPS))
            envelope[falling] = np.maximum(envelope[falling], ramp_down)

    return float(_trapz(envelope, grid))


def compute_h0_features_for_window(window: np.ndarray) -> dict:
    '''
    Compute all H0-based TDA features for one window.

    The window is delay-embedded with takens_embed(window, TAU, DIMENSION),
    its H0 persistence diagram is computed with ripser, bars that are
    non-finite or have non-positive lifetime are dropped, and summary
    statistics of the remaining lifetimes are returned.

    Args:
        window: 1D array of consecutive samples.

    Returns:
        Dict mapping every name in FEATURE_NAMES to a builtin float. All
        values are 0.0 when the window is too short to embed, when the
        diagram has no usable bars, or when any step raises.
    '''
    zeros = {name: 0.0 for name in FEATURE_NAMES}
    try:
        emb = takens_embed(window, TAU, DIMENSION)
        if emb is None:
            # Not enough points
            return zeros

        dgms = ripser(emb, maxdim=0)["dgms"]
        D0 = _clean_diag_h(dgms[0] if len(dgms) else None)

        L = _lifetimes(D0)
        S = float(L.sum())                   # total persistence (L1)
        A = float(_auc_tri_max(D0))          # cast: the single-bar path yields a NumPy scalar
        L2 = h0_l2_norm(D0)
        mx = float(np.max(L)) if L.size else 0.0

        # Start from zeros so every feature name is present whatever happens below.
        feats = dict(zeros)

        # Features that do not divide by the total persistence.
        feats["H0_ratio_auc_to_max"] = 0.0 if mx <= _EPS else A / mx
        feats["H0_ratio_auc_to_l2"] = 0.0 if L2 <= _EPS else A / L2
        feats["H0_bottleneck"] = mx
        feats["H0_L2_norm"] = L2

        # Everything else is normalised by S; S > _EPS also implies L is non-empty.
        if S > _EPS:
            feats["H0_ratio_auc_L1_to_sum"] = A / S

            # Share of total persistence carried by bars at or above a quantile.
            tail90 = float(L[L >= np.quantile(L, 0.90)].sum()) / S
            tail80 = float(L[L >= np.quantile(L, 0.80)].sum()) / S
            feats["tail_share_q90"] = tail90
            feats["H0_tail_curvature_80_90"] = tail90 - tail80

            # Lifetime-weighted mean of |birth + death| / sqrt(2).
            radial = np.abs((D0[:, 0] + D0[:, 1]) / np.sqrt(2.0))
            feats["H0_sum_centroid"] = float(np.sum(radial * L)) / S

            # PETE with exponents p = 1.6 (lifetime) and q = 0.5 (radial term).
            feats["PETE_p1.6_q0.5"] = float(
                np.sum((L ** 1.6) * (radial ** 0.5)) / S
            )

            feats["H0_energy_concentration"] = L2 / S
            feats["H0_dominance_share"] = mx / S

            # Gini coefficient of the lifetimes.
            xs = np.sort(L)
            n = xs.size
            cumx = np.cumsum(xs)
            feats["H0_gini"] = float(1.0 + 1.0 / n - 2.0 * (cumx.sum() / (n * S)))

        feats["H0_centroid_to_energy"] = (
            feats["H0_sum_centroid"] / L2
        ) if L2 > _EPS else 0.0

        return feats

    except Exception:
        # Deliberate best effort: a window that cannot be processed contributes
        # zeros instead of aborting the whole series.
        return zeros


# ==========================================================
# 3. MAIN: ASK USER FOR FEATURE, THEN RUN NAB DETECTOR
# ==========================================================
def _choose_feature():
    '''
    Ask on stdin which entry of FEATURE_NAMES to use and return its name.

    Accepts either an index into FEATURE_NAMES or an exact feature name.
    Empty input, an unrecognised value, or a stdin that cannot be read
    (EOFError) selects "H0_bottleneck".
    '''
    print("\\nAvailable TDA H0 features:")
    for idx, name in enumerate(FEATURE_NAMES):
        print(f"  {idx:2d} -> {name}")

    try:
        choice = input("Select feature by index or name (default: H0_bottleneck): ").strip()
    except EOFError:
        # No interactive stdin: fall back to the default instead of crashing.
        print()
        choice = ""

    selected_feature = "H0_bottleneck"

    if choice == "":
        pass
    elif choice.isdigit():
        idx = int(choice)
        if 0 <= idx < len(FEATURE_NAMES):
            selected_feature = FEATURE_NAMES[idx]
        else:
            print(f"Index {idx} out of range, using default H0_bottleneck.")
    elif choice in FEATURE_NAMES:
        selected_feature = choice
    else:
        print(f"Feature '{choice}' not recognized, using default H0_bottleneck.")

    return selected_feature


def _vead_scores(vals, selected_feature):
    '''
    Turn a 1D array of values into a pandas Series of anomaly scores in [0, 1].

    For every index i >= WINDOW_SIZE the selected feature is computed on
    vals[i - WINDOW_SIZE : i], i.e. on the samples preceding i. The raw score
    is (3.5 * |first difference|) * (3.5 * |second difference|) of that
    feature series, divided by its maximum.

    The first WINDOW_SIZE positions have no feature value. They are kept as
    NaN while differencing so that the step from "no value" to the first real
    value is not scored as a change; the affected positions get score 0.
    '''
    feature = np.full(len(vals), np.nan)
    for i in range(WINDOW_SIZE, len(vals)):
        feats = compute_h0_features_for_window(vals[i - WINDOW_SIZE : i])
        value = float(feats.get(selected_feature, 0.0))
        # A NaN feature inside the valid region counts as 0.0.
        feature[i] = 0.0 if np.isnan(value) else value
    feature = pd.Series(feature, dtype=float)

    # VEAD-like step: "velocity" and "acceleration" of the feature series.
    v = feature.diff()
    a = v.diff()
    raw_final = ((3.5 * np.abs(v)) * (3.5 * np.abs(a))).fillna(0.0)

    # Normalize anomaly scores into [0, 1]
    max_val = raw_final.max()
    if pd.isna(max_val) or max_val <= 0:
        return raw_final * 0.0
    return (raw_final / max_val).clip(lower=0.0, upper=1.0)


def run():
    '''
    Score every CSV under INPUT_DIR with one user-selected TDA feature.

    Prompts for the feature, then for each CSV that has "timestamp" and
    "value" columns (names compared after stripping and lower-casing) writes
    OUTPUT_DIR/<category>/<DETECTOR_NAME>_<file>.csv with the columns
    "timestamp" and "anomaly_score". Files under .ipynb_checkpoints and files
    missing either column are skipped. A failure on one file is printed and
    processing continues with the next file.
    '''
    selected_feature = _choose_feature()
    print(f"\\n>>> Using TDA feature: {selected_feature}\\n")

    # --- Process all NAB CSV files ---
    files = glob.glob(os.path.join(INPUT_DIR, "**", "*.csv"), recursive=True)
    print(f"   Found {len(files)} data files in '{INPUT_DIR}'")

    for filepath in files:
        if ".ipynb_checkpoints" in filepath:
            continue

        try:
            df = pd.read_csv(filepath)
            df.columns = [c.strip().lower() for c in df.columns]

            if "value" not in df.columns or "timestamp" not in df.columns:
                continue

            vals = df["value"].astype(float).values
            final_scores = _vead_scores(vals, selected_feature)

            # Build output path: results/TDA_VEAD_Method/<category>/TDA_VEAD_Method_<file>.csv
            rel = os.path.relpath(filepath, INPUT_DIR)
            category = os.path.dirname(rel)
            base_name = os.path.basename(rel)

            out_dir = os.path.join(OUTPUT_DIR, category)
            os.makedirs(out_dir, exist_ok=True)
            out_name = f"{DETECTOR_NAME}_" + base_name
            out_path = os.path.join(out_dir, out_name)

            # Write result: timestamp + anomaly_score
            out_df = pd.DataFrame({
                "timestamp": df["timestamp"],
                "anomaly_score": final_scores.values
            })
            out_df.to_csv(out_path, index=False)
            print(f"   -> Wrote: {out_path}")

        except Exception as e:
            # Per-file boundary: report the problem and keep going.
            print(f"   !! Error processing {filepath}: {e}")
            continue


if __name__ == "__main__":
    run()
"""

# Persist the detector source so it can be launched as a standalone script.
# The encoding is explicit so the written file does not depend on the locale default.
with open("my_algo.py", "w", encoding="utf-8") as f:
    f.write(tda_code)

print("✅ my_algo.py written.")




# ============================================================
# 4. RUN YOUR DETECTOR ON ALL NAB DATA
# ============================================================
print("--- 4. RUNNING TDA_VEAD_Method ON ALL DATASETS ---")
# my_algo.py prompts on stdin for the feature to use before it processes any file.
!python my_algo.py

# ============================================================
# 5. RUN NAB OPTIMIZE + SCORE FOR THIS DETECTOR
# ============================================================
print("--- 5. RUNNING NAB OPTIMIZE + SCORE ---")
# The detector name is the same as DETECTOR_NAME in my_algo.py, under which the result files were written.
!python run.py --optimize --score --detectors TDA_VEAD_Method --normalize


--- 1. CLEAN START ---
Cloning into 'NAB'...
remote: Enumerating objects: 7119, done.[K
remote: Counting objects: 100% (713/713), done.[K
remote: Compressing objects: 100% (168/168), done.[K
remote: Total 7119 (delta 601), reused 545 (delta 545), pack-reused 6406 (from 1)[K
Receiving objects: 100% (7119/7119), 86.73 MiB | 23.73 MiB/s, done.
Resolving deltas: 100% (5015/5015), done.
Updating files: 100% (1186/1186), done.
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m842.1/842.1 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.6/48.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for hopcroftkarp (setup.py) ... [?25l[?25hdone
--- 3. WRITING TDA_VEAD_METHOD DETECTOR ---
✅ my_algo.py written.
--- 4. RUNNING TDA_VEAD_Method ON ALL DATASETS ---

Available TDA H0 features:
   0 -> H0_ratio_auc_L1_to_sum
   1 -> H0_ratio_auc_to_max
   