<a href="https://colab.research.google.com/github/korkutanapa/ANOMALY_DETECTION_TDA_YAHOO_DATASET/blob/main/batch_running_nab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os, re, shutil, subprocess, textwrap

# =========================
# USER CONFIG
# =========================
# Working directory for the batch; the NAB repository is cloned beneath it.
BASE_DIR = "/content"
NAB_DIR  = os.path.join(BASE_DIR, "NAB")

USE_VEAD = True            # <-- set False to run WITHOUT VEAD
# Multipliers applied to the velocity / acceleration z-scores in the
# template's _vead_series (score = (kv * zv) * (ka * za)).
KV, KA   = 3.5, 3.5
MODE     = "abs_plateau"   # "strict" | "plateau" | "abs_plateau"

# Sliding-window length, delay-embedding parameters and ripser's maxdim;
# all four are substituted into the generated my_algo.py.
WINDOW_SIZE = 14
TAU         = 1
DIMENSION   = 7
MAXDIM      = 1            # H0 + H1

# =========================
# FEATURE LIST (77 names in total)
# Only the uncommented names are iterated by the main loop; the
# commented-out names are kept for reference and are skipped.
# =========================
ROBUST_FEATURES = [
    #"H0_Carlsson_f1","H0_Carlsson_f3","H0_Carlsson_f5",
    #"H0_L1_lifetime","H0_L1_norm","H0_L2_lifetime","H0_L2_norm",
    #"H0_Linf_lifetime","H0_Linf_norm","H0_bottleneck","H0_centroid_to_energy",
    #"H0_centroid_y","H0_dominance_share","H0_energy_concentration","H0_gini",
    #"H0_max_diag_dist","H0_max_lifetime","H0_mean_death","H0_mean_diag_dist",
    #"H0_mean_lifetime","H0_median_lifetime","H0_min_lifetime","H0_persistence_entropy",
    #"H0_q50","H0_q75","H0_q90","H0_q95","H0_q99","H0_ratio_auc_L1_to_sum",
    #"H0_ratio_auc_to_l2","H0_ratio_auc_to_max","H0_std_death","H0_std_lifetime",
    #"H0_sum_centroid","H0_sum_diag_dist","H0_sum_lifetime","H0_tail_curvature_80_90",
    #"H0_tail_share_q80","H0_tail_share_q90","H0_tail_share_q95",
    #"H1_Carlsson_f1","H1_Carlsson_f2","H1_Carlsson_f3",
    #"H1_L1_lifetime","H1_L1_norm","H1_L2_lifetime","H1_L2_norm",
    #"H1_Linf_lifetime","H1_Linf_norm","H1_betti","H1_count_lifetime",
    "H1_dominance_share","H1_energy_concentration","H1_gini",
    "H1_max_diag_dist","H1_max_lifetime","H1_mean_diag_dist","H1_mean_lifetime",
    "H1_median_lifetime","H1_min_lifetime","H1_persistence_entropy",
    "H1_q50","H1_q75","H1_q90","H1_q95","H1_q99",
    "H1_std_birth","H1_std_death","H1_std_lifetime",
    "H1_sum_diag_dist","H1_sum_lifetime",
    "H1_tail_share_q80","H1_tail_share_q90","H1_tail_share_q95",
    "H1_to_H0_betti_ratio","H1_to_H0_entropy_ratio",
    # Un-prefixed name: the template computes this one from the H0 diagram.
    "PETE_p1.6_q0.5"
]

# =========================
# helpers
# =========================
def safe_name(s: str) -> str:
    """Replace every run of characters outside ``[A-Za-z0-9_]`` with one underscore."""
    unsafe_run = re.compile(r"[^A-Za-z0-9_]+")
    return unsafe_run.sub("_", s)

def run_cmd(cmd, cwd=None, check=True):
    """Execute ``cmd`` and return ``(returncode, output)``.

    stderr is merged into stdout and the combined text is captured.  When
    ``check`` is true, a non-zero exit status raises ``RuntimeError`` whose
    message carries the captured output; otherwise the status is returned
    to the caller unchanged.
    """
    proc = subprocess.run(
        cmd,
        cwd=cwd,
        check=False,  # failures are raised below so the output can be attached
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    failed = proc.returncode != 0
    if failed and check:
        raise RuntimeError(f"Command failed ({proc.returncode}): {' '.join(cmd)}\n\n{proc.stdout}")
    return proc.returncode, proc.stdout

def extract_score_block(text: str) -> str:
    """
    Condense captured NAB console output into a short report.

    The report always ends with the last 60 lines of ``text`` under an
    "Output tail" heading.  If any line contains one of the score-related
    keywords, the last 40 such lines are listed first under "Key lines".
    """
    keywords = ("Score", "Standard", "LowFP", "LowFN", "AUC", "Overall", "profile", "TOTAL")
    all_lines = text.splitlines()
    matched = [ln for ln in all_lines if any(word in ln for word in keywords)]
    tail_section = "---- Output tail ----\n" + "\n".join(all_lines[-60:])
    if not matched:
        return tail_section
    return "---- Key lines ----\n" + "\n".join(matched[-40:]) + "\n\n" + tail_section

# =========================
# TEMPLATE my_algo.py
# - no user input
# - outputs selected feature
# - optional VEAD (switch)
# =========================
MY_ALGO_TEMPLATE = r"""
import os, glob
import numpy as np
import pandas as pd
from ripser import ripser
import warnings
warnings.filterwarnings("ignore")

DETECTOR_NAME     = "{{DETECTOR_NAME}}"
SELECTED_FEATURE  = "{{SELECTED_FEATURE}}"

USE_VEAD = {{USE_VEAD}}
KV       = {{KV}}
KA       = {{KA}}
MODE     = "{{MODE}}"

INPUT_DIR  = "data"
OUTPUT_DIR = os.path.join("results", DETECTOR_NAME)

WINDOW_SIZE = {{WINDOW_SIZE}}
TAU         = {{TAU}}
DIMENSION   = {{DIMENSION}}
MAXDIM      = {{MAXDIM}}
_EPS        = 1e-12

# Velocity/acceleration anomaly score for a 1-D feature series.
#   raw_vals : sequence of feature values; coerced to float, NaN gaps interpolated
#   kv, ka   : multipliers applied to the velocity / acceleration z-scores
#   mode     : "strict" | "plateau" | "abs_plateau" (case-insensitive, None -> "strict");
#              any other value leaves the z-scores unmodified
# Returns a NumPy array with one score per input value; NaN/inf become 0.
def _vead_series(raw_vals, kv=KV, ka=KA, mode=MODE):
    s = pd.to_numeric(pd.Series(raw_vals, dtype=float), errors="coerce").interpolate(limit_direction="both")
    # First difference ("velocity") and second difference ("acceleration").
    # The leading entries of both are NaN and end up as 0 in the final score.
    v = s.diff(1)
    a = v.diff(1)

    def _zmad(x):
        # Robust z-score: distance from the median in units of the median
        # absolute deviation (unscaled); _EPS keeps the division finite.
        x = np.asarray(x, dtype=float)
        med = np.nanmedian(x)
        mad = np.nanmedian(np.abs(x - med)) + _EPS
        return (x - med) / mad

    zv = _zmad(v.values)
    za = _zmad(a.values)

    mode = (mode or "strict").lower()
    if mode == "strict":
        # Keep only positive deviations.
        zv = np.maximum(0.0, zv); za = np.maximum(0.0, za)
    elif mode == "plateau":
        # Keep values above -0.25, zero everything at or below it.
        zv = np.where(zv > -0.25, zv, 0.0); za = np.where(za > -0.25, za, 0.0)
    elif mode == "abs_plateau":
        # Use magnitudes, so deviations in either direction count.
        zv = np.abs(zv); za = np.abs(za)

    # The score is the product of the two scaled z-scores.
    score = (kv * zv) * (ka * za)
    return np.nan_to_num(score, nan=0.0, posinf=0.0, neginf=0.0)

# Time-delay (Takens) embedding of a 1-D window.
#   window     : 1-D sequence of samples
#   time_delay : lag (in samples) between consecutive coordinates
#   dimension  : number of coordinates per embedded point
# Returns an (m, dimension) array whose row i is
#   (window[i], window[i + time_delay], ..., window[i + (dimension - 1) * time_delay])
# with m = len(window) - (dimension - 1) * time_delay, or None when the window
# is too short to form a single point.
def takens_embed(window, time_delay, dimension):
    m = len(window) - (dimension - 1) * time_delay
    if m <= 0:
        return None
    # Column j is the window shifted by j * time_delay. The previous slice
    # (start j, step time_delay) only agreed with this for time_delay == 1;
    # for larger delays it produced columns of unequal length, so np.stack
    # raised, or rows that were not delay vectors.
    return np.stack(
        [window[j * time_delay : j * time_delay + m] for j in range(dimension)],
        axis=1,
    )

def _clean_diag(diag):
    if diag is None:
        return np.empty((0, 2), dtype=float)
    arr = np.asarray(diag, dtype=float)
    if arr.ndim != 2 or arr.shape[1] != 2 or arr.size == 0:
        return np.empty((0, 2), dtype=float)
    b, d = arr[:, 0], arr[:, 1]
    mask = np.isfinite(b) & np.isfinite(d) & (d > b)
    if not np.any(mask):
        return np.empty((0, 2), dtype=float)
    return np.stack([b[mask], d[mask]], axis=1)

# Lifetimes (death - birth, floored at 0) of the cleaned diagram as a 1-D
# float array; empty when the diagram has no valid pairs.
def _lifetimes(diag):
    pairs = _clean_diag(diag)
    if pairs.size != 0:
        return np.maximum(pairs[:, 1] - pairs[:, 0], 0.0)
    return np.empty(0, dtype=float)

# Divide a by b after shifting the denominator by _EPS, so a zero
# denominator gives a large finite value instead of raising.
def _safe_div(a, b):
    numerator = float(a)
    denominator = float(b + _EPS)
    return numerator / denominator

try:
    _trapz = np.trapezoid
except AttributeError:
    _trapz = np.trapz

# Area under the upper envelope of the triangular ("tent") functions spanned
# by each (birth, death) pair, sampled on a 64-point grid and integrated with
# the trapezoid rule. Returns 0.0 for a diagram without valid pairs.
# NOTE(review): the envelope has the shape of a first persistence landscape;
# confirm that this is the intended quantity.
def _auc_tri_max(diag):
    arr = _clean_diag(diag)
    if arr.size == 0:
        return 0.0
    b_all, d_all = arr[:, 0], arr[:, 1]
    # Defensive guard against a zero-width grid; _clean_diag already enforces
    # death > birth for every retained pair.
    if b_all.min() == d_all.max():
        return 0.0
    grid = np.linspace(b_all.min(), d_all.max(), 64)
    lam1 = np.zeros_like(grid)
    for b, d in arr:
        m = 0.5 * (b + d)   # midpoint: where the tent peaks
        h = 0.5 * (d - b)   # peak height: half the lifetime
        if h <= 0:
            continue
        left  = (grid >= b) & (grid <= m)
        right = (grid >= m) & (grid <= d)
        # Rising and falling edges of the tent; keep the pointwise maximum
        # over all pairs. max(..., _EPS) protects the slope's denominator.
        lam1[left]  = np.maximum(lam1[left],  (grid[left]  - b) * (h / max(m - b, _EPS)))
        lam1[right] = np.maximum(lam1[right], (d - grid[right]) * (h / max(d - m, _EPS)))
    return float(_trapz(lam1, grid))

# Shannon entropy (natural log) of the lifetimes normalised to sum to one.
# Returns 0.0 when there are no lifetimes or their total is not positive.
def _persistence_entropy(diag):
    lifetimes = _lifetimes(diag)
    if lifetimes.size == 0:
        return 0.0
    total = lifetimes.sum()
    if total <= 0:
        return 0.0
    probs = lifetimes / (total + _EPS)
    # _EPS inside the log keeps zero-probability terms finite.
    weighted_logs = probs * np.log(probs + _EPS)
    return float(-np.sum(weighted_logs))

def _gini_from_lifetimes(L):
    L = np.sort(L); n = len(L)
    if n == 0: return 0.0
    S = L.sum()
    if S <= 0: return 0.0
    cumL = np.cumsum(L)
    return float(1 + 1/n - 2*np.sum(cumL/(n*S)))

# Fraction of the total lifetime carried by lifetimes at or above the q-th
# quantile (q in [0, 1]). Returns 0.0 for a diagram without lifetimes.
def _tail_share_q(diag, q):
    lifetimes = _lifetimes(diag)
    if lifetimes.size == 0:
        return 0.0
    cutoff = np.quantile(lifetimes, q)
    in_tail = lifetimes >= cutoff
    return _safe_div(lifetimes[in_tail].sum(), lifetimes.sum())

# Mean and population standard deviation (ddof=0) of the birth and death
# coordinates of the cleaned diagram; all four values are 0.0 when it is empty.
def _birth_death_stats(diag):
    pairs = _clean_diag(diag)
    if pairs.size == 0:
        return dict.fromkeys(("mean_birth", "mean_death", "std_birth", "std_death"), 0.0)
    births = pairs[:, 0]
    deaths = pairs[:, 1]
    return {
        "mean_birth": float(births.mean()),
        "mean_death": float(deaths.mean()),
        "std_birth": float(births.std(ddof=0)),
        "std_death": float(deaths.std(ddof=0)),
    }

# Mean, maximum and sum of (death - birth) / sqrt(2) over the cleaned
# diagram; all three values are 0.0 when it is empty.
def _diag_distance_stats(diag):
    pairs = _clean_diag(diag)
    if pairs.size == 0:
        return dict.fromkeys(("mean_diag_dist", "max_diag_dist", "sum_diag_dist"), 0.0)
    births = pairs[:, 0]
    deaths = pairs[:, 1]
    dist = (deaths - births) / np.sqrt(2.0)
    return {
        "mean_diag_dist": float(dist.mean()),
        "max_diag_dist": float(dist.max()),
        "sum_diag_dist": float(dist.sum()),
    }

# Lifetime-weighted centroid of the diagram: centroid_x averages births and
# centroid_y averages deaths, each weighted by lifetime. Both are 0.0 when
# the diagram is empty or the total lifetime is not positive.
def _centroid_xy(diag):
    pairs = _clean_diag(diag)
    if pairs.size == 0:
        return {"centroid_x": 0.0, "centroid_y": 0.0}
    births = pairs[:, 0]
    deaths = pairs[:, 1]
    weights = np.maximum(deaths - births, 0.0)
    total = weights.sum()
    if total <= 0:
        return {"centroid_x": 0.0, "centroid_y": 0.0}
    centroid_x = float(np.sum(births * weights) / (total + _EPS))
    centroid_y = float(np.sum(deaths * weights) / (total + _EPS))
    return {"centroid_x": centroid_x, "centroid_y": centroid_y}

# Summary statistics of the lifetimes: count, sum, mean, median, population
# std, min, max and the L1 / L2 / Linf norms. "count" is an int, every other
# value a float; everything is zero for a diagram without lifetimes.
def _lifetimes_stats(diag):
    lifetimes = _lifetimes(diag)
    if lifetimes.size == 0:
        zeros = dict.fromkeys(
            ("sum", "mean", "median", "std", "min", "max", "L1", "L2", "Linf"), 0.0
        )
        return {"count": 0, **zeros}
    return {
        "count": int(lifetimes.size),
        "sum": float(lifetimes.sum()),
        "mean": float(lifetimes.mean()),
        "median": float(np.median(lifetimes)),
        "std": float(lifetimes.std(ddof=0)),
        "min": float(lifetimes.min()),
        "max": float(lifetimes.max()),
        "L1": float(np.sum(np.abs(lifetimes))),
        "L2": float(np.sqrt(np.sum(lifetimes**2))),
        "Linf": float(np.max(np.abs(lifetimes))),
    }

# The 50th, 75th, 90th, 95th and 99th percentiles of the lifetimes, keyed
# "q50" ... "q99"; all 0.0 for a diagram without lifetimes.
def _lifetimes_quantiles(diag):
    levels = (("q50", 0.50), ("q75", 0.75), ("q90", 0.90), ("q95", 0.95), ("q99", 0.99))
    lifetimes = _lifetimes(diag)
    if lifetimes.size == 0:
        return {name: 0.0 for name, _ in levels}
    return {name: float(np.quantile(lifetimes, level)) for name, level in levels}

# Five lifetime-weighted moments of the diagram, keyed "f1" ... "f5":
#   f1 = sum(L), f2 = sum(b*L), f3 = sum(d*L), f4 = sum(b^2*L), f5 = sum(d^2*L)
# where L is the lifetime of each (b, d) pair. All are 0.0 when the diagram
# is empty or the total lifetime is not positive.
def _carlsson_coordinates(diag):
    pairs = _clean_diag(diag)
    if pairs.size == 0:
        return {f"f{k}": 0.0 for k in range(1, 6)}
    births = pairs[:, 0]
    deaths = pairs[:, 1]
    weights = np.maximum(deaths - births, 0.0)
    if weights.sum() <= 0:
        return {f"f{k}": 0.0 for k in range(1, 6)}
    return {
        "f1": float(weights.sum()),
        "f2": float(np.sum(births * weights)),
        "f3": float(np.sum(deaths * weights)),
        "f4": float(np.sum(births**2 * weights)),
        "f5": float(np.sum(deaths**2 * weights)),
    }

# Lifetime-weighted average of |birth + death| / sqrt(2) over the diagram.
# Returns 0.0 when the diagram is empty or the total lifetime is not positive.
def _sum_centroid_radial(diag):
    pairs = _clean_diag(diag)
    if pairs.size == 0:
        return 0.0
    births = pairs[:, 0]
    deaths = pairs[:, 1]
    weights = np.maximum(deaths - births, 0.0)
    total = weights.sum()
    if total <= 0:
        return 0.0
    radial = (births + deaths) / np.sqrt(2.0)
    weighted = np.abs(radial) * weights
    return _safe_div(np.sum(weighted), total)

# Sum over the diagram of lifetime**p * |(birth + death) / sqrt(2)|**q,
# divided by the total lifetime. Returns 0.0 when the diagram is empty or
# the total lifetime is not positive.
def _pete(diag, p=1.6, q=0.5):
    pairs = _clean_diag(diag)
    if pairs.size == 0:
        return 0.0
    births = pairs[:, 0]
    deaths = pairs[:, 1]
    lifetimes = np.maximum(deaths - births, 0.0)
    total = lifetimes.sum()
    if total <= 0:
        return 0.0
    radial = (births + deaths) / np.sqrt(2.0)
    terms = (lifetimes**p) * (np.abs(radial)**q)
    return _safe_div(np.sum(terms), total)

# Build the feature dictionary for one persistence diagram.
#   diag   : persistence diagram (anything _clean_diag accepts)
#   prefix : string prepended to every per-diagram key, e.g. "H0_" or "H1_"
# Returns a dict mapping feature name -> float. When prefix is exactly "H0_"
# a few extra H0-only features are added, including the un-prefixed
# "PETE_p1.6_q0.5".
def compute_features_for_diag(diag, prefix):
    feats = {}
    Ls = _lifetimes_stats(diag)

    # Basic lifetime statistics.
    feats[f"{prefix}count_lifetime"] = float(Ls["count"])
    feats[f"{prefix}sum_lifetime"]   = float(Ls["sum"])
    feats[f"{prefix}mean_lifetime"]  = float(Ls["mean"])
    feats[f"{prefix}median_lifetime"]= float(Ls["median"])
    feats[f"{prefix}std_lifetime"]   = float(Ls["std"])
    feats[f"{prefix}min_lifetime"]   = float(Ls["min"])
    feats[f"{prefix}max_lifetime"]   = float(Ls["max"])

    feats[f"{prefix}L1_lifetime"]    = float(Ls["L1"])
    feats[f"{prefix}L2_lifetime"]    = float(Ls["L2"])
    feats[f"{prefix}Linf_lifetime"]  = float(Ls["Linf"])

    # The *_norm keys carry the same values as the *_lifetime norms above.
    feats[f"{prefix}L1_norm"]        = float(Ls["L1"])
    feats[f"{prefix}L2_norm"]        = float(Ls["L2"])
    feats[f"{prefix}Linf_norm"]      = float(Ls["Linf"])

    # "betti" is the number of valid pairs, i.e. the same value as count_lifetime.
    feats[f"{prefix}betti"]          = float(Ls["count"])
    feats[f"{prefix}energy_concentration"] = _safe_div(Ls["L2"], Ls["L1"])
    feats[f"{prefix}dominance_share"]      = _safe_div(Ls["Linf"], Ls["L1"])

    feats[f"{prefix}persistence_entropy"]  = _persistence_entropy(diag)

    # Birth/death means and standard deviations.
    bd = _birth_death_stats(diag)
    for k, v in bd.items():
        feats[f"{prefix}{k}"] = float(v)

    # Distance-to-diagonal statistics.
    dd = _diag_distance_stats(diag)
    for k, v in dd.items():
        feats[f"{prefix}{k}"] = float(v)

    cxy = _centroid_xy(diag)
    feats[f"{prefix}centroid_x"] = float(cxy["centroid_x"])
    feats[f"{prefix}centroid_y"] = float(cxy["centroid_y"])

    # Lifetime quantiles q50 ... q99.
    q = _lifetimes_quantiles(diag)
    for k, v in q.items():
        feats[f"{prefix}{k}"] = float(v)

    tail80 = _tail_share_q(diag, 0.80)
    tail90 = _tail_share_q(diag, 0.90)
    tail95 = _tail_share_q(diag, 0.95)
    feats[f"{prefix}tail_share_q80"] = float(tail80)
    feats[f"{prefix}tail_share_q90"] = float(tail90)
    feats[f"{prefix}tail_share_q95"] = float(tail95)
    feats[f"{prefix}tail_curvature_80_90"] = float(tail90 - tail80)

    L = _lifetimes(diag)
    feats[f"{prefix}gini"] = float(_gini_from_lifetimes(L))

    cc = _carlsson_coordinates(diag)
    feats[f"{prefix}Carlsson_f1"] = float(cc["f1"])
    feats[f"{prefix}Carlsson_f2"] = float(cc["f2"])
    feats[f"{prefix}Carlsson_f3"] = float(cc["f3"])
    feats[f"{prefix}Carlsson_f4"] = float(cc["f4"])
    feats[f"{prefix}Carlsson_f5"] = float(cc["f5"])

    if prefix == "H0_":
        # H0-only extras built on the tent-envelope area A.
        A = _auc_tri_max(diag)
        feats["H0_ratio_auc_L1_to_sum"] = _safe_div(A, Ls["sum"])
        feats["H0_ratio_auc_to_max"]    = _safe_div(A, Ls["max"])
        feats["H0_ratio_auc_to_l2"]     = _safe_div(A, Ls["L2"])
        feats["H0_bottleneck"]          = float(Ls["max"])
        feats["H0_sum_centroid"]        = float(_sum_centroid_radial(diag))
        # Stored without a prefix and computed from the H0 diagram only.
        feats["PETE_p1.6_q0.5"]         = float(_pete(diag, p=1.6, q=0.5))
        # The next three keys were already set above via the generic prefix
        # path; they are rewritten here with Ls["sum"] in place of Ls["L1"]
        # (both are sums of the same non-negative lifetimes).
        feats["H0_energy_concentration"]= _safe_div(Ls["L2"], Ls["sum"])
        feats["H0_dominance_share"]     = _safe_div(Ls["Linf"], Ls["sum"])
        feats["H0_tail_curvature_80_90"]= float(tail90 - tail80)
        feats["H0_centroid_to_energy"]  = _safe_div(feats["H0_sum_centroid"], Ls["L2"])
        # Re-stores the gini value computed above unchanged.
        feats["H0_gini"]                = float(feats.get("H0_gini", 0.0))

    return feats

# Ratios that compare the H1 diagram with the H0 diagram: number of pairs
# ("betti") and persistence entropy. Missing keys count as 0.0.
def compute_cross_dim_features(feats_H0, feats_H1):
    ratios = {}
    ratios["H1_to_H0_betti_ratio"] = _safe_div(
        float(feats_H1.get("H1_betti", 0.0)),
        float(feats_H0.get("H0_betti", 0.0)),
    )
    ratios["H1_to_H0_entropy_ratio"] = _safe_div(
        float(feats_H1.get("H1_persistence_entropy", 0.0)),
        float(feats_H0.get("H0_persistence_entropy", 0.0)),
    )
    return ratios

def _minmax_01(x):
    x = np.asarray(x, dtype=float)
    x = np.nan_to_num(x, nan=0.0, posinf=0.0, neginf=0.0)
    mn, mx = float(np.min(x)), float(np.max(x))
    if (mx - mn) <= _EPS:
        return np.zeros_like(x, dtype=float)
    return (x - mn) / (mx - mn + _EPS)

# Score every CSV under INPUT_DIR and write one result file per input to
# OUTPUT_DIR/<category>/<DETECTOR_NAME>_<file name>, with the columns
# "timestamp" and "anomaly_score".
def run():
    files = glob.glob(os.path.join(INPUT_DIR, "**", "*.csv"), recursive=True)
    print(f"Found {len(files)} data files in '{INPUT_DIR}'")

    for filepath in files:
        # Skip Jupyter checkpoint copies.
        if ".ipynb_checkpoints" in filepath:
            continue

        df = pd.read_csv(filepath)
        df.columns = [c.strip().lower() for c in df.columns]
        # Files without both required columns are silently skipped.
        if "value" not in df.columns or "timestamp" not in df.columns:
            continue

        vals = pd.to_numeric(df["value"], errors="coerce").astype(float).to_numpy()
        n = len(vals)
        # Per-row value of SELECTED_FEATURE; the first WINDOW_SIZE - 1 rows
        # have no full window and stay at 0.
        series = np.zeros(n, dtype=float)

        for i in range(WINDOW_SIZE - 1, n):
            # Trailing window that ends at row i (inclusive).
            w = vals[i - WINDOW_SIZE + 1 : i + 1]
            try:
                emb = takens_embed(w, TAU, DIMENSION)
                dgms = ripser(emb, maxdim=MAXDIM)["dgms"]
            except Exception:
                # Best effort: any failure in embedding or ripser is replaced
                # by empty diagrams, so all features of this window become 0.
                dgms = [np.empty((0,2)), np.empty((0,2))]

            D0 = dgms[0] if len(dgms) > 0 else np.empty((0,2))
            D1 = dgms[1] if (MAXDIM >= 1 and len(dgms) > 1) else np.empty((0,2))

            feats_H0 = compute_features_for_diag(D0, "H0_")
            feats_H1 = compute_features_for_diag(D1, "H1_")
            cross    = compute_cross_dim_features(feats_H0, feats_H1)

            merged = {}
            merged.update(feats_H0); merged.update(feats_H1); merged.update(cross)

            # An unknown feature name yields 0.0 rather than an error.
            series[i] = float(merged.get(SELECTED_FEATURE, 0.0))

        # Either score the feature's velocity/acceleration or the raw feature
        # itself; both paths are rescaled to [0, 1].
        if USE_VEAD:
            raw = _vead_series(series, kv=KV, ka=KA, mode=MODE)
            final_scores = _minmax_01(raw)
        else:
            final_scores = _minmax_01(series)

        # (disabled) Keep only the top-k scores and zero the rest; k is 2 below.
        #k = 2
        #n_scores = len(final_scores)
        #if n_scores > 0 and np.max(final_scores) > 0:
        #    k_eff = min(k, n_scores)
        #    topk_idx = np.argpartition(final_scores, -k_eff)[-k_eff:]
        #    sparse_scores = np.zeros_like(final_scores, dtype=float)
        #    sparse_scores[topk_idx] = final_scores[topk_idx]
        #    final_scores = sparse_scores

        # Mirror the input's sub-directory below OUTPUT_DIR.
        rel = os.path.relpath(filepath, INPUT_DIR)
        category = os.path.dirname(rel)
        base_name = os.path.basename(rel)

        out_dir = os.path.join(OUTPUT_DIR, category)
        os.makedirs(out_dir, exist_ok=True)
        out_name = f"{DETECTOR_NAME}_" + base_name
        out_path = os.path.join(out_dir, out_name)

        out_df = pd.DataFrame({"timestamp": df["timestamp"], "anomaly_score": final_scores})
        out_df.to_csv(out_path, index=False)

if __name__ == "__main__":
    run()
"""

# =========================
# MAIN LOOP (one NAB run per active feature)
# For each name in ROBUST_FEATURES: clone NAB afresh, write a my_algo.py
# specialised to that feature, run it to produce detector results, then run
# NAB's optimize + score steps and print a condensed summary.
# =========================
print(f"USE_VEAD = {USE_VEAD}  (KV={KV}, KA={KA}, MODE={MODE})")

# The progress banner and the closing message report how many features are
# actually enabled in ROBUST_FEATURES instead of a hard-coded 77.
n_features = len(ROBUST_FEATURES)

# ripser is installed into the Python environment, not into the NAB checkout,
# so a single install before the loop is sufficient.
run_cmd(["pip", "install", "-q", "ripser"], cwd=BASE_DIR, check=True)

for k, feat in enumerate(ROBUST_FEATURES, 1):
    print("\n" + "="*100)
    print(f"[{k:02d}/{n_features:02d}] FEATURE = {feat}")
    print("="*100)

    # 1) Fresh NAB clone
    os.chdir(BASE_DIR)
    if os.path.exists(NAB_DIR):
        shutil.rmtree(NAB_DIR)

    run_cmd(["git", "clone", "https://github.com/numenta/NAB.git"], cwd=BASE_DIR, check=True)

    os.chdir(NAB_DIR)

    # ensure config/thresholds.json
    os.makedirs("config", exist_ok=True)
    thr_path = os.path.join("config", "thresholds.json")
    if not os.path.exists(thr_path):
        with open(thr_path, "w") as f:
            f.write("{}")

    # 2) unique detector per feature (prevents overwriting)
    # NOTE(review): the captured output reports the detector as 'TDAF' with the
    # rest of the name folded into the profile label; presumably NAB splits
    # names on underscores. Confirm before parsing scores by detector name.
    detector = f"TDAF__{safe_name(feat)}" + ("__VEAD" if USE_VEAD else "__NOV")
    detector = detector[:120]  # safety for long filenames on some FS

    # 3) write my_algo.py
    algo_code = (MY_ALGO_TEMPLATE
        .replace("{{DETECTOR_NAME}}", detector)
        .replace("{{SELECTED_FEATURE}}", feat)
        .replace("{{USE_VEAD}}", "True" if USE_VEAD else "False")
        .replace("{{KV}}", str(KV))
        .replace("{{KA}}", str(KA))
        .replace("{{MODE}}", MODE)
        .replace("{{WINDOW_SIZE}}", str(WINDOW_SIZE))
        .replace("{{TAU}}", str(TAU))
        .replace("{{DIMENSION}}", str(DIMENSION))
        .replace("{{MAXDIM}}", str(MAXDIM))
    )
    with open("my_algo.py", "w") as f:
        f.write(algo_code)

    # 4) run detector to produce results/<detector>/*
    # A failure here raises RuntimeError (check=True) and stops the batch.
    run_cmd(["python", "my_algo.py"], cwd=NAB_DIR, check=True)

    # 5) run optimize+score WITHOUT interactive confirmation
    print("\n--- NAB OPTIMIZE + SCORE (no prompt) ---")
    rc, out_score = run_cmd(
        ["python", "run.py",
         "--optimize", "--score",
         "--detectors", detector,
         "--normalize",
         "--skipConfirmation"],
        cwd=NAB_DIR,
        check=False  # keep going even if one feature fails
    )

    print("\n--- RESULT SUMMARY FOR THIS FEATURE ---")
    print(f"Detector : {detector}")
    print(f"Feature  : {feat}")
    print(f"Status   : {'OK' if rc == 0 else f'FAILED (rc={rc})'}")
    print(extract_score_block(out_score))

print(f"\n✅ DONE: ran {n_features} feature(s) with fresh NAB clone each time, no user intervention.")


USE_VEAD = True  (KV=3.5, KA=3.5, MODE=abs_plateau)

[01/77] FEATURE = H1_dominance_share

--- NAB OPTIMIZE + SCORE (no prompt) ---

--- RESULT SUMMARY FOR THIS FEATURE ---
Detector : TDAF__H1_dominance_share__VEAD
Feature  : H1_dominance_share
Status   : OK
---- Key lines ----
Final score for 'TDAF' detector on '_H1_dominance_share__VEAD_standard' profile = 9.69
Final score for 'TDAF' detector on '_H1_dominance_share__VEAD_reward_low_FP_rate' profile = 3.08
Final score for 'TDAF' detector on '_H1_dominance_share__VEAD_reward_low_FN_rate' profile = 15.49

---- Output tail ----

Running optimize step
Optimizer found a max score of -93.52269701181761 with anomaly threshold 0.9999999999997778.
Optimizer found a max score of -108.86412779713909 with anomaly threshold 0.9999999999999992.
Optimizer found a max score of -178.1117463168683 with anomaly threshold 0.9999999999995244.

Running scoring step
TDAF__H1_dominance_share__VEAD detector benchmark scores written to /content/NAB/results/TD