<a href="https://colab.research.google.com/github/hannapalya/anomaly_detection_syndromic/blob/main/Ensemble_3way_voting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python3
"""
3-Way Voting Ensemble: LSTM-AE + Isolation Forest + One-Class SVM (FIXED - END ALIGNMENT)

Voting Logic:
  - Each model gets its own tuned threshold on ALIGNED validation data to achieve >= SPEC_TARGET
  - Alignment is done from the END to ensure all models cover the same time period
  - Final prediction: At least 2 of 3 models must agree (majority voting)

Outputs:
  - Ensemble_3way_voting_per_sim_val.csv
  - Ensemble_3way_voting_per_sim_test.csv
  - Ensemble_3way_voting_results.csv
"""

import os, sys, numpy as np, pandas as pd
from typing import Any, Dict, List, Tuple
from sklearn.metrics import confusion_matrix

# ===== USER CONFIG =====
# Directory that load_data() reads the simulated_*_sig{N}.csv files from.
DATA_DIR      = "/content"
# Signal ids processed by the main loop (1..16 inclusive).
SIGNALS       = list(range(1, 17))
# 364 = 52 * 7, i.e. a whole number of weeks.
DAYS_PER_YEAR = 364
TRAIN_YEARS   = 6
# 6 * 364 = 2184 days; a simulation needs TRAIN_DAYS + VALID_DAYS points to be used.
TRAIN_DAYS    = TRAIN_YEARS * DAYS_PER_YEAR
# 343 days: the tail of each simulation's labels used for validation/test evaluation.
VALID_DAYS    = 49 * 7
# Seed for the NumPy RNGs; also passed to every adapter as rng_state.
RNG_STATE     = 42

# Threshold tuning
# Specificity that tune_threshold() tries to reach on validation data.
SPEC_TARGET = 0.95
# Minimum validation sensitivity a candidate threshold must reach to be accepted.
SENS_FLOOR  = 0.00
# NOTE(review): BETA is not referenced anywhere else in this cell.
BETA        = 0.5

# ===== IMPORT ADAPTERS =====
import importlib.util, importlib, pathlib

def import_adapter(path_str: str, func_name: str, module_name: str):
    """Execute a Python source file as a fresh module and return one attribute of it.

    Args:
        path_str: Path of the adapter source file.
        func_name: Name of the attribute (normally a function) to return.
        module_name: Name under which the module is registered in ``sys.modules``.

    Returns:
        The attribute ``func_name`` of the freshly executed module.

    Raises:
        FileNotFoundError: ``path_str`` does not exist.
        ImportError: no import spec/loader can be built for the file.
        AttributeError: the module does not define ``func_name``.
    """
    p = pathlib.Path(path_str)
    if not p.exists():
        raise FileNotFoundError(f"Adapter not found at: {p}")
    spec = importlib.util.spec_from_file_location(module_name, str(p))
    # spec_from_file_location returns None for files it cannot map to a loader.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot build an import spec for adapter: {p}")
    mod = importlib.util.module_from_spec(spec)
    # Register before executing (replacing any stale copy) so code inside the
    # adapter that looks itself up via sys.modules keeps working.
    sys.modules[module_name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        # Do not leave a half-initialised module behind.
        sys.modules.pop(module_name, None)
        raise
    if not hasattr(mod, func_name):
        raise AttributeError(f"{p.name} is missing `{func_name}()`")
    return getattr(mod, func_name)

# Load the three model adapters. The paths are relative, so the files must be in
# the current working directory. Each file is executed on load and must define
# `fit_and_score`; the main loop calls it as
# fit_and_score(S, train_sims, val_sims, test_sims, rng_state=RNG_STATE) and reads
# "val_labels", "val_scores", "window" and "test_scores_splits" from the result.
lstm_fit_and_score = import_adapter("LSTM_AE_curr.py", "fit_and_score", module_name="adapter_lstm")
iso_fit_and_score  = import_adapter("IsolationForest_tuned.py", "fit_and_score", module_name="adapter_if")
ocsvm_fit_and_score = import_adapter("OneClassSVM_adapter.py", "fit_and_score", module_name="adapter_ocsvm")

# ===== HELPERS =====
def load_data(sig: int):
    """Load the totals and binarised outbreak labels for one signal.

    Reads ``simulated_totals_sig{sig}.csv`` and ``simulated_outbreaks_sig{sig}.csv``
    from ``DATA_DIR``. In each frame the first column named ``date``, ``Date``,
    ``ds`` or ``timestamp`` (if any) is dropped. The date column is removed from
    the outbreak frame *before* the ``> 0`` comparison, because comparing a
    string date column with 0 raises ``TypeError``.

    Args:
        sig: Signal id used in the file names.

    Returns:
        ``(X, Y)``: X holds the totals; Y holds 0/1 ints (1 where outbreak > 0).
    """
    date_names = ("date", "Date", "ds", "timestamp")

    def _drop_date(frame):
        # Drop the first recognised date column, if the frame has one.
        col = next((c for c in date_names if c in frame.columns), None)
        return frame.drop(columns=[col]) if col else frame

    X = _drop_date(pd.read_csv(os.path.join(DATA_DIR, f"simulated_totals_sig{sig}.csv")))
    Y = _drop_date(pd.read_csv(os.path.join(DATA_DIR, f"simulated_outbreaks_sig{sig}.csv")))
    Y = (Y > 0).astype(int)
    return X, Y

def cross_sim_split(sims: List[dict], rng: np.random.RandomState, train_frac=0.6):
    """Randomly split simulations into a training part and a held-out part.

    The caller's list is left untouched: a shallow copy is shuffled instead.
    The RNG is consumed exactly as ``rng.shuffle(sims)`` would, so the resulting
    order is the same as an in-place shuffle with the same RNG state.

    Args:
        sims: Simulation records.
        rng: Random state used for shuffling.
        train_frac: Fraction of simulations (rounded down) placed in the training part.

    Returns:
        ``(train, held_out)`` lists.
    """
    shuffled = list(sims)
    rng.shuffle(shuffled)
    n_train = int(len(shuffled) * train_frac)
    return shuffled[:n_train], shuffled[n_train:]

def sens_spec(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[float,float]:
    """Return ``(sensitivity, specificity)`` of binary predictions.

    Either value is NaN when its denominator is zero; both are NaN for empty input.
    """
    if len(y_true) == 0:
        return np.nan, np.nan
    # Rows are true classes, columns predicted classes, both ordered [0, 1].
    (tn, fp), (fn, tp) = confusion_matrix(y_true, y_pred, labels=[0,1])
    n_pos = tp + fn
    n_neg = tn + fp
    sensitivity = tp / n_pos if n_pos > 0 else np.nan
    specificity = tn / n_neg if n_neg > 0 else np.nan
    return sensitivity, specificity

def tune_threshold(y_val: np.ndarray, scores: np.ndarray,
                   spec_target=SPEC_TARGET, sens_floor=SENS_FLOOR, p_max=20.0) -> float:
    """Pick a score threshold from upper percentiles of the negative-class scores.

    Candidates are the ``100 - p`` percentiles of the scores with label 0 (all
    scores if there are none) for 200 values of ``p`` in ``[0.5, p_max]``. A point
    is flagged when ``score >= threshold``.

    Selection:
      1. Among candidates with specificity >= ``spec_target`` and sensitivity >=
         ``sens_floor``, take the one maximising ``(sensitivity, specificity)``.
      2. Otherwise take the candidate whose specificity is closest to
         ``spec_target`` (ties: higher specificity, then higher sensitivity).
      3. If specificity is undefined for every candidate (no negatives in
         ``y_val``), return the highest candidate threshold instead of ``None``.

    NaN sensitivities/specificities never satisfy the comparisons in step 1, so a
    validation set without positives always falls through to step 2.

    Returns:
        The chosen threshold as a float; ``inf`` when ``scores`` is empty.
    """
    if len(scores) == 0:
        return float("inf")
    neg_mask = (y_val == 0)
    base = scores[neg_mask] if neg_mask.any() else scores

    # Evaluate each candidate once; both selection passes reuse the results.
    candidates = []
    for p in np.linspace(0.5, p_max, num=200):
        thr = np.percentile(base, 100 - p)
        s, sp = sens_spec(y_val, (scores >= thr).astype(int))
        candidates.append((float(thr), s, sp))

    # Pass 1: best candidate that meets both constraints.
    best_t = None
    best_key = None
    for thr, s, sp in candidates:
        if (sp >= spec_target) and (s >= sens_floor):
            key = (s, sp)
            if (best_key is None) or (key > best_key):
                best_key = key
                best_t = thr
    if best_t is not None:
        return best_t

    # Pass 2: closest specificity to the target.
    best_gap = 1e9; best_s = -1.0; best_sp = -1.0
    for thr, s, sp in candidates:
        gap = abs(sp - spec_target)
        if (gap < best_gap) or (gap == best_gap and (sp > best_sp or (sp == best_sp and s > best_s))):
            best_t, best_gap, best_sp, best_s = thr, gap, sp, s

    if best_t is None:
        # Specificity was NaN everywhere; fall back to the most conservative candidate.
        best_t = max(thr for thr, _, _ in candidates)
    return best_t

def metric_pod_anyhit(yhat: np.ndarray, ytrue: np.ndarray) -> float:
    """Return 1.0 if any outbreak point is flagged, 0.0 if none is, NaN if there is no outbreak."""
    outbreak = (ytrue == 1)
    if not outbreak.any():
        return np.nan
    detected = np.logical_and(yhat == 1, outbreak).any()
    if detected:
        return 1.0
    return 0.0

def timeliness_single(yhat: np.ndarray, ytrue: np.ndarray) -> float:
    """Return the detection delay as a fraction of the outbreak span.

    The span runs from the first to the last index with ``ytrue > 0``. The result
    is 0 when the first outbreak point is flagged, and 1.0 when there is no
    outbreak or no outbreak point is flagged.
    """
    outbreak = ytrue > 0
    outbreak_idx = np.nonzero(outbreak)[0]
    if outbreak_idx.size == 0:
        return 1.0
    hit_idx = np.nonzero(outbreak & (yhat > 0))[0]
    if hit_idx.size == 0:
        return 1.0
    first = int(outbreak_idx[0])
    last = int(outbreak_idx[-1])
    first_hit = int(hit_idx[0])
    span = last - first + 1
    return (first_hit - first) / span

def split_by_lengths(arr: np.ndarray, lengths: List[int]) -> List[np.ndarray]:
    """Cut ``arr`` into consecutive chunks of the given lengths.

    One chunk is returned per entry of ``lengths``. A chunk that would run past
    the end is truncated, and chunks starting at or beyond the end are empty.
    """
    chunks = []
    start = 0
    n = len(arr)
    for size in lengths:
        if start < n:
            stop = start + size
            chunks.append(arr[start:min(stop, n)])
            start = stop
        else:
            chunks.append(arr[:0])
    return chunks

def val_lengths_for_window(val_sims: List[dict], window: int) -> List[int]:
    """Return, per simulation, the length of its last ``VALID_DAYS`` labels minus ``window - 1`` (never negative)."""
    lost = window - 1
    return [max(0, len(sim["y"][-VALID_DAYS:]) - lost) for sim in val_sims]

def _align_tail(O, T):
    return O[-T:] if len(O) >= T else np.pad(O, (T-len(O), 0))

def metric_sensitivity(A, O):
    """Pooled sensitivity TP / (TP + FN) of alarms ``A`` against labels ``O``; NaN without positives."""
    outbreak = _align_tail(O, A.shape[0]) > 0
    true_pos = np.logical_and(A == 1, outbreak).sum()
    false_neg = np.logical_and(A == 0, outbreak).sum()
    n_pos = true_pos + false_neg
    if n_pos > 0:
        return true_pos / n_pos
    return np.nan

def metric_specificity(A, O):
    """Pooled specificity TN / (TN + FP) of alarms ``A`` against labels ``O``; NaN without negatives."""
    quiet = _align_tail(O, A.shape[0]) == 0
    true_neg = np.logical_and(A == 0, quiet).sum()
    false_pos = np.logical_and(A == 1, quiet).sum()
    n_neg = true_neg + false_pos
    if n_neg > 0:
        return true_neg / n_neg
    return np.nan

def metric_pod(A, O):
    """Probability of detection over the columns of ``A`` that contain an outbreak.

    A column counts as detected when at least one outbreak point (label > 0) is
    flagged (alarm == 1). Columns without any outbreak are excluded from the
    mean, matching ``metric_pod_anyhit`` combined with ``nanmean`` as used for
    the validation POD. Returns NaN when no column contains an outbreak.
    """
    outbreak = _align_tail(O, A.shape[0]) > 0
    has_outbreak = np.atleast_1d(outbreak.any(axis=0))
    if not has_outbreak.any():
        return np.nan
    detected = np.atleast_1d(np.logical_and(A == 1, outbreak).any(axis=0))
    return np.mean(detected[has_outbreak])

def metric_timeliness(A, O):
    """Mean of ``timeliness_single`` over the columns of the 2-D alarm matrix ``A``."""
    labels = _align_tail(O, A.shape[0])
    _, n_cols = A.shape
    total = sum((timeliness_single(A[:, j], labels[:, j]) for j in range(n_cols)), 0.0)
    return total / n_cols

# ===== MAIN =====
np.random.seed(RNG_STATE)
rng = np.random.RandomState(RNG_STATE)

summary = {}
rows_val: List[Dict[str, Any]] = []
rows_test: List[Dict[str, Any]] = []


def _concat_tails(parts, keep_lengths):
    """Keep the last ``keep`` points of each per-simulation piece and re-concatenate."""
    tails = [part[len(part) - keep:] for part, keep in zip(parts, keep_lengths)]
    return np.concatenate(tails) if tails else np.zeros(0)


for S in SIGNALS:
    print(f"\n--- Signal {S} (3-Way Voting: LSTM + IF + OCSVM) ---")
    Xsig, Ysig = load_data(S)

    # Build sim dicts; only simulations long enough for training + validation are kept.
    sims: List[dict] = []
    for sim_idx, col in enumerate(Xsig.columns):
        x = Xsig[col].to_numpy(np.float32, copy=False)
        y = Ysig[col].to_numpy(np.int32, copy=False)
        if len(x) >= TRAIN_DAYS + VALID_DAYS:
            sims.append(dict(sim=f"sig{S}_sim{sim_idx}", x=x, y=y))
    if not sims:
        print("  No complete sims; skip."); continue

    train_sims, held_sims = cross_sim_split(sims, rng, train_frac=0.6)
    mid = max(1, len(held_sims)//2)
    val_sims  = held_sims[:mid]
    # NOTE: with a single held-out simulation, validation and test share it.
    test_sims = held_sims[mid:] if len(held_sims) > 1 else held_sims
    print(f"  Using {len(train_sims)} train, {len(val_sims)} val, {len(test_sims)} test sims")

    # === Get results from all 3 models ===
    print("  Running LSTM-AE...")
    lstm_res = lstm_fit_and_score(S, train_sims, val_sims, test_sims, rng_state=RNG_STATE)
    print("  Running Isolation Forest...")
    iso_res = iso_fit_and_score(S, train_sims, val_sims, test_sims, rng_state=RNG_STATE)
    print("  Running One-Class SVM...")
    ocsvm_res = ocsvm_fit_and_score(S, train_sims, val_sims, test_sims, rng_state=RNG_STATE)

    models = [
        ("LSTM-AE", lstm_res),
        ("IF", iso_res),
        ("OCSVM", ocsvm_res)
    ]

    # === STEP 1: Split each model's concatenated validation output per simulation ===
    # The adapters return validation scores concatenated over val_sims, with
    # len(tail) - (window - 1) points per simulation. Models with different
    # windows therefore have different per-simulation lengths, so alignment must
    # be done per simulation; a single tail slice of the concatenation would
    # cross simulation boundaries.
    model_full_preds = {}
    for name, res in models:
        yv_concat = res["val_labels"].astype(np.int32)
        sv_concat = res["val_scores"].astype(np.float32)
        own_lengths = val_lengths_for_window(val_sims, res["window"])
        if sum(own_lengths) != len(sv_concat):
            print(f"    WARNING: {name} val_length={len(sv_concat)} but {sum(own_lengths)} "
                  f"expected from its window; per-sim alignment may be off")
        model_full_preds[name] = {
            "score_parts": split_by_lengths(sv_concat, own_lengths),
            "label_parts": split_by_lengths(yv_concat, own_lengths),
            "window": res["window"],
            "res": res
        }
        print(f"    {name}: window={res['window']}, val_length={len(sv_concat)}")

    # === STEP 2: Common per-simulation length (shortest model for each simulation) ===
    Ls_val = [
        min(len(info["score_parts"][i]) for info in model_full_preds.values())
        for i in range(len(val_sims))
    ]
    min_len = sum(Ls_val)
    print(f"  Aligning all models to min_len={min_len} (from END)")

    # === STEP 3 + 4: Align each simulation FROM THE END, then tune thresholds ===
    thresholds = {}
    val_preds = {}

    for name, info in model_full_preds.items():
        sv_aligned = _concat_tails(info["score_parts"], Ls_val)
        yv_aligned = _concat_tails(info["label_parts"], Ls_val)

        thr = tune_threshold(yv_aligned, sv_aligned, SPEC_TARGET, SENS_FLOOR)
        yhat = (sv_aligned >= thr).astype(int)
        s, sp = sens_spec(yv_aligned, yhat)

        thresholds[name] = thr
        val_preds[name] = {
            "scores": sv_aligned,
            "preds": yhat,
            "labels": yv_aligned,
            "window": info["window"],
            "res": info["res"]
        }

        print(f"    {name}: thr={thr:.6f}, val_sens={s:.3f}, val_spec={sp:.3f}")

    # === STEP 5: Apply 3-way voting on validation (aligned per simulation) ===
    lstm_pred = val_preds["LSTM-AE"]["preds"]
    if_pred = val_preds["IF"]["preds"]
    ocsvm_pred = val_preds["OCSVM"]["preds"]
    val_labels = val_preds["LSTM-AE"]["labels"]  # same days for every model after alignment

    # Voting: sum >= 2 means at least 2 models detected
    vote_sum = lstm_pred + if_pred + ocsvm_pred
    ensemble_pred = (vote_sum >= 2).astype(int)

    # Compute validation metrics
    val_sens, val_spec = sens_spec(val_labels, ensemble_pred)

    # The largest window determines how many points survive the alignment.
    window_used = max(val_preds["LSTM-AE"]["window"],
                     val_preds["IF"]["window"],
                     val_preds["OCSVM"]["window"])

    # Split ensemble predictions by sim using the common per-sim lengths
    ens_splits = split_by_lengths(ensemble_pred, Ls_val)
    lab_splits = split_by_lengths(val_labels, Ls_val)

    val_pods = []
    for sim_d, yh, yv in zip(val_sims, ens_splits, lab_splits):
        if len(yh) != len(yv):
            L = min(len(yh), len(yv)); yh, yv = yh[:L], yv[:L]
        s_i, sp_i = sens_spec(yv, yh) if len(yh) else (np.nan, np.nan)
        pod_i = metric_pod_anyhit(yh, yv) if len(yh) else np.nan
        tim_i = timeliness_single(yh, yv) if len(yh) else np.nan
        val_pods.append(pod_i)

        rows_val.append(dict(
            split="val", signal=S, sim=sim_d["sim"], model="3-Way-Voting",
            window=int(window_used), sens=s_i, spec=sp_i, pod=pod_i,
            timeliness=tim_i, n_points=int(len(yh))
        ))

    val_pod = float(np.nanmean(val_pods)) if val_pods else 0.0
    print(f"  VALIDATION → Voting sens={val_sens:.3f}, spec={val_spec:.3f}, POD={val_pod:.3f}")

    # === Apply 3-way voting on test ===
    lstm_test = val_preds["LSTM-AE"]["res"]
    if_test = val_preds["IF"]["res"]
    ocsvm_test = val_preds["OCSVM"]["res"]

    test_preds_ensemble = []
    test_labels_list = []

    for idx, sim_d in enumerate(test_sims):
        y_tail = sim_d["y"][-VALID_DAYS:]

        # Threshold each model's scores for this simulation.
        lstm_yh = (lstm_test["test_scores_splits"][idx] >= thresholds["LSTM-AE"]).astype(int)
        if_yh = (if_test["test_scores_splits"][idx] >= thresholds["IF"]).astype(int)
        ocsvm_yh = (ocsvm_test["test_scores_splits"][idx] >= thresholds["OCSVM"]).astype(int)

        # Align to minimum length from the END so every model votes on the same days.
        min_test_len = min(len(lstm_yh), len(if_yh), len(ocsvm_yh))
        if min_test_len == 0:
            continue

        lstm_yh = lstm_yh[-min_test_len:]
        if_yh = if_yh[-min_test_len:]
        ocsvm_yh = ocsvm_yh[-min_test_len:]
        test_labels = y_tail[-min_test_len:]

        # Voting
        vote_sum_test = lstm_yh + if_yh + ocsvm_yh
        ens_yh = (vote_sum_test >= 2).astype(int)

        test_preds_ensemble.append(ens_yh)
        test_labels_list.append(test_labels)

        # Per-sim test metrics
        s_i, sp_i = sens_spec(test_labels, ens_yh)
        pod_i = metric_pod_anyhit(ens_yh, test_labels)
        tim_i = timeliness_single(ens_yh, test_labels)

        rows_test.append(dict(
            split="test", signal=S, sim=sim_d["sim"], model="3-Way-Voting",
            window=int(window_used), sens=s_i, spec=sp_i, pod=pod_i,
            timeliness=tim_i, n_points=int(min_test_len)
        ))

    # Aggregate test metrics (tail slices, consistent with the end alignment above)
    if test_preds_ensemble:
        min_test_len_agg = min(len(p) for p in test_preds_ensemble)
        A = np.stack([p[-min_test_len_agg:] for p in test_preds_ensemble], axis=1)
        O = np.stack([l[-min_test_len_agg:] for l in test_labels_list], axis=1)

        test_sens = metric_sensitivity(A, O)
        test_spec = metric_specificity(A, O)
        test_pod = metric_pod(A, O)
        test_tim = metric_timeliness(A, O)

        print(f"  TEST → Voting sens={test_sens:.3f}, spec={test_spec:.3f}, POD={test_pod:.3f}, tim={test_tim:.3f}")

        summary[S] = dict(
            model="3-Way-Voting",
            window=int(window_used),
            val_sens=float(val_sens), val_spec=float(val_spec), val_pod=float(val_pod),
            sensitivity=float(test_sens), specificity=float(test_spec),
            pod=float(test_pod), timeliness=float(test_tim)
        )

# ===== SAVE RESULTS =====
# Per-signal summary: print it with column means, then write it to CSV.
if len(summary) > 0:
    df = pd.DataFrame.from_dict(summary, orient="index")
    print("\n=== 3-WAY VOTING ENSEMBLE SUMMARY ===")
    print(df)
    test_means = df[["sensitivity","specificity","pod","timeliness"]].mean(numeric_only=True)
    print("\nMeans:\n", test_means)
    df.to_csv("Ensemble_3way_voting_results.csv", index=True)
    print("Saved: Ensemble_3way_voting_results.csv")

# Per-simulation validation rows, ordered by signal then simulation name.
if len(rows_val) > 0:
    dfv = pd.DataFrame(rows_val).sort_values(["signal","sim"])
    dfv.to_csv("Ensemble_3way_voting_per_sim_val.csv", index=False)
    print("Saved: Ensemble_3way_voting_per_sim_val.csv")

# Per-simulation test rows, same ordering.
if len(rows_test) > 0:
    dft = pd.DataFrame(rows_test).sort_values(["signal","sim"])
    dft.to_csv("Ensemble_3way_voting_per_sim_test.csv", index=False)
    print("Saved: Ensemble_3way_voting_per_sim_test.csv")

print("\n=== 3-WAY VOTING ENSEMBLE COMPLETE ===")

OCSVM Adapter: Testing 9 configurations per signal

--- Signal 1 (3-Way Voting: LSTM + IF + OCSVM) ---
  Using 60 train, 20 val, 20 test sims
  Running LSTM-AE...
  Running Isolation Forest...
  Running One-Class SVM...
    OCSVM (CPU): Signal 1...
      1/9: WIN=7, NU=0.03... s=0.29 sp=0.96
      2/9: WIN=7, NU=0.05... s=0.30 sp=0.96
      3/9: WIN=7, NU=0.07... s=0.30 sp=0.96
      4/9: WIN=14, NU=0.03... s=0.27 sp=0.96
      5/9: WIN=14, NU=0.05... s=0.27 sp=0.96
      6/9: WIN=14, NU=0.07... s=0.27 sp=0.96
      7/9: WIN=21, NU=0.03... s=0.19 sp=0.96
      8/9: WIN=21, NU=0.05... s=0.16 sp=0.98
      9/9: WIN=21, NU=0.07... s=0.16 sp=0.98
    ✓ Best: WIN=7, NU=0.07 (123.5s)
    LSTM-AE: window=14, val_length=6600
    IF: window=14, val_length=6600
    OCSVM: window=7, val_length=6740
  Aligning all models to min_len=6600 (from END)
    LSTM-AE: thr=0.026116, val_sens=0.819, val_spec=0.952
    IF: thr=0.002043, val_sens=0.679, val_spec=0.951
    OCSVM: thr=5.687780, val_sens=0.320, 

In [None]:
#!/usr/bin/env python3
"""
3-Way Voting Ensemble: LSTM-AE + Isolation Forest + One-Class SVM

Voting Logic:
  - Each model gets its own tuned threshold on validation to achieve >= SPEC_TARGET
  - Final prediction: At least 2 of 3 models must agree (majority voting)

Outputs:
  - Ensemble_3way_voting_per_sim_val.csv
  - Ensemble_3way_voting_per_sim_test.csv
  - Ensemble_3way_voting_results.csv
"""

import os, sys, numpy as np, pandas as pd
from typing import Any, Dict, List, Tuple
from sklearn.metrics import confusion_matrix

# ===== USER CONFIG =====
# Directory that load_data() reads the simulated_*_sig{N}.csv files from.
DATA_DIR      = "/content"
# Signal ids processed by the main loop (1..16 inclusive).
SIGNALS       = list(range(1, 17))
# 364 = 52 * 7, i.e. a whole number of weeks.
DAYS_PER_YEAR = 364
TRAIN_YEARS   = 6
# 6 * 364 = 2184 days; a simulation needs TRAIN_DAYS + VALID_DAYS points to be used.
TRAIN_DAYS    = TRAIN_YEARS * DAYS_PER_YEAR
# 343 days: the tail of each simulation's labels used for validation/test evaluation.
VALID_DAYS    = 49 * 7
# Seed for the NumPy RNGs; also passed to every adapter as rng_state.
RNG_STATE     = 42

# Threshold tuning
# Specificity that tune_threshold() tries to reach on validation data.
SPEC_TARGET = 0.97
# Minimum validation sensitivity a candidate threshold must reach to be accepted.
SENS_FLOOR  = 0.00
# NOTE(review): BETA is not referenced anywhere else in this cell.
BETA        = 0.5

# ===== IMPORT ADAPTERS =====
import importlib.util, importlib, pathlib

def import_adapter(path_str: str, func_name: str, module_name: str):
    """Execute a Python source file as a fresh module and return one attribute of it.

    Args:
        path_str: Path of the adapter source file.
        func_name: Name of the attribute (normally a function) to return.
        module_name: Name under which the module is registered in ``sys.modules``.

    Returns:
        The attribute ``func_name`` of the freshly executed module.

    Raises:
        FileNotFoundError: ``path_str`` does not exist.
        ImportError: no import spec/loader can be built for the file.
        AttributeError: the module does not define ``func_name``.
    """
    p = pathlib.Path(path_str)
    if not p.exists():
        raise FileNotFoundError(f"Adapter not found at: {p}")
    spec = importlib.util.spec_from_file_location(module_name, str(p))
    # spec_from_file_location returns None for files it cannot map to a loader.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot build an import spec for adapter: {p}")
    mod = importlib.util.module_from_spec(spec)
    # Register before executing (replacing any stale copy) so code inside the
    # adapter that looks itself up via sys.modules keeps working.
    sys.modules[module_name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        # Do not leave a half-initialised module behind.
        sys.modules.pop(module_name, None)
        raise
    if not hasattr(mod, func_name):
        raise AttributeError(f"{p.name} is missing `{func_name}()`")
    return getattr(mod, func_name)

# Load the three model adapters. The paths are relative, so the files must be in
# the current working directory. Each file is executed on load and must define
# `fit_and_score`; the main loop calls it as
# fit_and_score(S, train_sims, val_sims, test_sims, rng_state=RNG_STATE) and reads
# "val_labels", "val_scores", "window" and "test_scores_splits" from the result.
lstm_fit_and_score = import_adapter("LSTM_AE_curr.py", "fit_and_score", module_name="adapter_lstm")
iso_fit_and_score  = import_adapter("IsolationForest_tuned.py", "fit_and_score", module_name="adapter_if")
ocsvm_fit_and_score = import_adapter("OneClassSVM_tuned.py", "fit_and_score", module_name="adapter_ocsvm")

# ===== HELPERS =====
def load_data(sig: int):
    """Load the totals and binarised outbreak labels for one signal.

    Reads ``simulated_totals_sig{sig}.csv`` and ``simulated_outbreaks_sig{sig}.csv``
    from ``DATA_DIR``. In each frame the first column named ``date``, ``Date``,
    ``ds`` or ``timestamp`` (if any) is dropped. The date column is removed from
    the outbreak frame *before* the ``> 0`` comparison, because comparing a
    string date column with 0 raises ``TypeError``.

    Args:
        sig: Signal id used in the file names.

    Returns:
        ``(X, Y)``: X holds the totals; Y holds 0/1 ints (1 where outbreak > 0).
    """
    date_names = ("date", "Date", "ds", "timestamp")

    def _drop_date(frame):
        # Drop the first recognised date column, if the frame has one.
        col = next((c for c in date_names if c in frame.columns), None)
        return frame.drop(columns=[col]) if col else frame

    X = _drop_date(pd.read_csv(os.path.join(DATA_DIR, f"simulated_totals_sig{sig}.csv")))
    Y = _drop_date(pd.read_csv(os.path.join(DATA_DIR, f"simulated_outbreaks_sig{sig}.csv")))
    Y = (Y > 0).astype(int)
    return X, Y

def cross_sim_split(sims: List[dict], rng: np.random.RandomState, train_frac=0.6):
    """Randomly split simulations into a training part and a held-out part.

    The caller's list is left untouched: a shallow copy is shuffled instead.
    The RNG is consumed exactly as ``rng.shuffle(sims)`` would, so the resulting
    order is the same as an in-place shuffle with the same RNG state.

    Args:
        sims: Simulation records.
        rng: Random state used for shuffling.
        train_frac: Fraction of simulations (rounded down) placed in the training part.

    Returns:
        ``(train, held_out)`` lists.
    """
    shuffled = list(sims)
    rng.shuffle(shuffled)
    n_train = int(len(shuffled) * train_frac)
    return shuffled[:n_train], shuffled[n_train:]

def sens_spec(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[float,float]:
    """Return ``(sensitivity, specificity)`` of binary predictions.

    Either value is NaN when its denominator is zero; both are NaN for empty input.
    """
    if len(y_true) == 0:
        return np.nan, np.nan
    # Rows are true classes, columns predicted classes, both ordered [0, 1].
    (tn, fp), (fn, tp) = confusion_matrix(y_true, y_pred, labels=[0,1])
    n_pos = tp + fn
    n_neg = tn + fp
    sensitivity = tp / n_pos if n_pos > 0 else np.nan
    specificity = tn / n_neg if n_neg > 0 else np.nan
    return sensitivity, specificity

def tune_threshold(y_val: np.ndarray, scores: np.ndarray,
                   spec_target=SPEC_TARGET, sens_floor=SENS_FLOOR, p_max=20.0) -> float:
    """Pick a score threshold from upper percentiles of the negative-class scores.

    Candidates are the ``100 - p`` percentiles of the scores with label 0 (all
    scores if there are none) for 200 values of ``p`` in ``[0.5, p_max]``. A point
    is flagged when ``score >= threshold``.

    Selection:
      1. Among candidates with specificity >= ``spec_target`` and sensitivity >=
         ``sens_floor``, take the one maximising ``(sensitivity, specificity)``.
      2. Otherwise take the candidate whose specificity is closest to
         ``spec_target`` (ties: higher specificity, then higher sensitivity).
      3. If specificity is undefined for every candidate (no negatives in
         ``y_val``), return the highest candidate threshold instead of ``None``.

    NaN sensitivities/specificities never satisfy the comparisons in step 1, so a
    validation set without positives always falls through to step 2.

    Returns:
        The chosen threshold as a float; ``inf`` when ``scores`` is empty.
    """
    if len(scores) == 0:
        return float("inf")
    neg_mask = (y_val == 0)
    base = scores[neg_mask] if neg_mask.any() else scores

    # Evaluate each candidate once; both selection passes reuse the results.
    candidates = []
    for p in np.linspace(0.5, p_max, num=200):
        thr = np.percentile(base, 100 - p)
        s, sp = sens_spec(y_val, (scores >= thr).astype(int))
        candidates.append((float(thr), s, sp))

    # Pass 1: best candidate that meets both constraints.
    best_t = None
    best_key = None
    for thr, s, sp in candidates:
        if (sp >= spec_target) and (s >= sens_floor):
            key = (s, sp)
            if (best_key is None) or (key > best_key):
                best_key = key
                best_t = thr
    if best_t is not None:
        return best_t

    # Pass 2: closest specificity to the target.
    best_gap = 1e9; best_s = -1.0; best_sp = -1.0
    for thr, s, sp in candidates:
        gap = abs(sp - spec_target)
        if (gap < best_gap) or (gap == best_gap and (sp > best_sp or (sp == best_sp and s > best_s))):
            best_t, best_gap, best_sp, best_s = thr, gap, sp, s

    if best_t is None:
        # Specificity was NaN everywhere; fall back to the most conservative candidate.
        best_t = max(thr for thr, _, _ in candidates)
    return best_t

def metric_pod_anyhit(yhat: np.ndarray, ytrue: np.ndarray) -> float:
    """Return 1.0 if any outbreak point is flagged, 0.0 if none is, NaN if there is no outbreak."""
    outbreak = (ytrue == 1)
    if not outbreak.any():
        return np.nan
    detected = np.logical_and(yhat == 1, outbreak).any()
    if detected:
        return 1.0
    return 0.0

def timeliness_single(yhat: np.ndarray, ytrue: np.ndarray) -> float:
    """Return the detection delay as a fraction of the outbreak span.

    The span runs from the first to the last index with ``ytrue > 0``. The result
    is 0 when the first outbreak point is flagged, and 1.0 when there is no
    outbreak or no outbreak point is flagged.
    """
    outbreak = ytrue > 0
    outbreak_idx = np.nonzero(outbreak)[0]
    if outbreak_idx.size == 0:
        return 1.0
    hit_idx = np.nonzero(outbreak & (yhat > 0))[0]
    if hit_idx.size == 0:
        return 1.0
    first = int(outbreak_idx[0])
    last = int(outbreak_idx[-1])
    first_hit = int(hit_idx[0])
    span = last - first + 1
    return (first_hit - first) / span

def split_by_lengths(arr: np.ndarray, lengths: List[int]) -> List[np.ndarray]:
    """Cut ``arr`` into consecutive chunks of the given lengths.

    One chunk is returned per entry of ``lengths``. A chunk that would run past
    the end is truncated, and chunks starting at or beyond the end are empty.
    """
    chunks = []
    start = 0
    n = len(arr)
    for size in lengths:
        if start < n:
            stop = start + size
            chunks.append(arr[start:min(stop, n)])
            start = stop
        else:
            chunks.append(arr[:0])
    return chunks

def val_lengths_for_window(val_sims: List[dict], window: int) -> List[int]:
    """Return, per simulation, the length of its last ``VALID_DAYS`` labels minus ``window - 1`` (never negative)."""
    lost = window - 1
    return [max(0, len(sim["y"][-VALID_DAYS:]) - lost) for sim in val_sims]

def _align_tail(O, T):
    return O[-T:] if len(O) >= T else np.pad(O, (T-len(O), 0))

def metric_sensitivity(A, O):
    """Pooled sensitivity TP / (TP + FN) of alarms ``A`` against labels ``O``; NaN without positives."""
    outbreak = _align_tail(O, A.shape[0]) > 0
    true_pos = np.logical_and(A == 1, outbreak).sum()
    false_neg = np.logical_and(A == 0, outbreak).sum()
    n_pos = true_pos + false_neg
    if n_pos > 0:
        return true_pos / n_pos
    return np.nan

def metric_specificity(A, O):
    """Pooled specificity TN / (TN + FP) of alarms ``A`` against labels ``O``; NaN without negatives."""
    quiet = _align_tail(O, A.shape[0]) == 0
    true_neg = np.logical_and(A == 0, quiet).sum()
    false_pos = np.logical_and(A == 1, quiet).sum()
    n_neg = true_neg + false_pos
    if n_neg > 0:
        return true_neg / n_neg
    return np.nan

def metric_pod(A, O):
    """Probability of detection over the columns of ``A`` that contain an outbreak.

    A column counts as detected when at least one outbreak point (label > 0) is
    flagged (alarm == 1). Columns without any outbreak are excluded from the
    mean, matching ``metric_pod_anyhit`` combined with ``nanmean`` as used for
    the validation POD. Returns NaN when no column contains an outbreak.
    """
    outbreak = _align_tail(O, A.shape[0]) > 0
    has_outbreak = np.atleast_1d(outbreak.any(axis=0))
    if not has_outbreak.any():
        return np.nan
    detected = np.atleast_1d(np.logical_and(A == 1, outbreak).any(axis=0))
    return np.mean(detected[has_outbreak])

def metric_timeliness(A, O):
    """Mean of ``timeliness_single`` over the columns of the 2-D alarm matrix ``A``."""
    labels = _align_tail(O, A.shape[0])
    _, n_cols = A.shape
    total = sum((timeliness_single(A[:, j], labels[:, j]) for j in range(n_cols)), 0.0)
    return total / n_cols

# ===== MAIN =====
np.random.seed(RNG_STATE)
rng = np.random.RandomState(RNG_STATE)

summary = {}
rows_val: List[Dict[str, Any]] = []
rows_test: List[Dict[str, Any]] = []


def _concat_tails(parts, keep_lengths):
    """Keep the last ``keep`` points of each per-simulation piece and re-concatenate."""
    tails = [part[len(part) - keep:] for part, keep in zip(parts, keep_lengths)]
    return np.concatenate(tails) if tails else np.zeros(0)


for S in SIGNALS:
    print(f"\n--- Signal {S} (3-Way Voting: LSTM + IF + OCSVM) ---")
    Xsig, Ysig = load_data(S)

    # Build sim dicts; only simulations long enough for training + validation are kept.
    sims: List[dict] = []
    for sim_idx, col in enumerate(Xsig.columns):
        x = Xsig[col].to_numpy(np.float32, copy=False)
        y = Ysig[col].to_numpy(np.int32, copy=False)
        if len(x) >= TRAIN_DAYS + VALID_DAYS:
            sims.append(dict(sim=f"sig{S}_sim{sim_idx}", x=x, y=y))
    if not sims:
        print("  No complete sims; skip."); continue

    train_sims, held_sims = cross_sim_split(sims, rng, train_frac=0.6)
    mid = max(1, len(held_sims)//2)
    val_sims  = held_sims[:mid]
    # NOTE: with a single held-out simulation, validation and test share it.
    test_sims = held_sims[mid:] if len(held_sims) > 1 else held_sims
    print(f"  Using {len(train_sims)} train, {len(val_sims)} val, {len(test_sims)} test sims")

    # === Get results from all 3 models ===
    print("  Running LSTM-AE...")
    lstm_res = lstm_fit_and_score(S, train_sims, val_sims, test_sims, rng_state=RNG_STATE)
    print("  Running Isolation Forest...")
    iso_res = iso_fit_and_score(S, train_sims, val_sims, test_sims, rng_state=RNG_STATE)
    print("  Running One-Class SVM...")
    ocsvm_res = ocsvm_fit_and_score(S, train_sims, val_sims, test_sims, rng_state=RNG_STATE)

    models = [
        ("LSTM-AE", lstm_res),
        ("IF", iso_res),
        ("OCSVM", ocsvm_res)
    ]

    # === Tune thresholds for each model on its own full validation output ===
    thresholds = {}
    val_preds = {}

    for name, res in models:
        yv_concat = res["val_labels"].astype(np.int32)
        sv_concat = res["val_scores"].astype(np.float32)
        thr = tune_threshold(yv_concat, sv_concat, SPEC_TARGET, SENS_FLOOR)
        yhat = (sv_concat >= thr).astype(int)
        s, sp = sens_spec(yv_concat, yhat)

        # Per-simulation pieces of the concatenated output. Each simulation
        # contributes len(tail) - (window - 1) points, so models with different
        # windows have different piece lengths.
        own_lengths = val_lengths_for_window(val_sims, res["window"])
        if sum(own_lengths) != len(sv_concat):
            print(f"    WARNING: {name} val_length={len(sv_concat)} but {sum(own_lengths)} "
                  f"expected from its window; per-sim alignment may be off")

        thresholds[name] = thr
        val_preds[name] = {
            "scores": sv_concat,
            "preds": yhat,
            "labels": yv_concat,
            "window": res["window"],
            "res": res,
            "pred_parts": split_by_lengths(yhat, own_lengths),
            "label_parts": split_by_lengths(yv_concat, own_lengths)
        }

        print(f"    {name}: thr={thr:.6f}, val_sens={s:.3f}, val_spec={sp:.3f}")

    # === Apply 3-way voting on validation ===
    # Align per simulation FROM THE END so every model votes on the same days.
    # Slicing the concatenated arrays from the start would pair predictions
    # that refer to different days and different simulations.
    Ls_val = [
        min(len(info["pred_parts"][i]) for info in val_preds.values())
        for i in range(len(val_sims))
    ]
    lstm_pred = _concat_tails(val_preds["LSTM-AE"]["pred_parts"], Ls_val)
    if_pred = _concat_tails(val_preds["IF"]["pred_parts"], Ls_val)
    ocsvm_pred = _concat_tails(val_preds["OCSVM"]["pred_parts"], Ls_val)
    val_labels = _concat_tails(val_preds["LSTM-AE"]["label_parts"], Ls_val)

    # Voting: sum >= 2 means at least 2 models detected
    vote_sum = lstm_pred + if_pred + ocsvm_pred
    ensemble_pred = (vote_sum >= 2).astype(int)

    # Compute validation metrics
    val_sens, val_spec = sens_spec(val_labels, ensemble_pred)

    # The largest window determines how many points survive the alignment.
    window_used = max(val_preds["LSTM-AE"]["window"],
                     val_preds["IF"]["window"],
                     val_preds["OCSVM"]["window"])

    # Split ensemble predictions by sim using the common per-sim lengths
    ens_splits = split_by_lengths(ensemble_pred, Ls_val)
    lab_splits = split_by_lengths(val_labels, Ls_val)

    val_pods = []
    for sim_d, yh, yv in zip(val_sims, ens_splits, lab_splits):
        if len(yh) != len(yv):
            L = min(len(yh), len(yv)); yh, yv = yh[:L], yv[:L]
        s_i, sp_i = sens_spec(yv, yh) if len(yh) else (np.nan, np.nan)
        pod_i = metric_pod_anyhit(yh, yv) if len(yh) else np.nan
        tim_i = timeliness_single(yh, yv) if len(yh) else np.nan
        val_pods.append(pod_i)

        rows_val.append(dict(
            split="val", signal=S, sim=sim_d["sim"], model="3-Way-Voting",
            window=int(window_used), sens=s_i, spec=sp_i, pod=pod_i,
            timeliness=tim_i, n_points=int(len(yh))
        ))

    val_pod = float(np.nanmean(val_pods)) if val_pods else 0.0
    print(f"  VALIDATION → Voting sens={val_sens:.3f}, spec={val_spec:.3f}, POD={val_pod:.3f}")

    # === Apply 3-way voting on test ===
    lstm_test = val_preds["LSTM-AE"]["res"]
    if_test = val_preds["IF"]["res"]
    ocsvm_test = val_preds["OCSVM"]["res"]

    test_preds_ensemble = []
    test_labels_list = []

    for idx, sim_d in enumerate(test_sims):
        y_tail = sim_d["y"][-VALID_DAYS:]

        # Threshold each model's scores for this simulation.
        lstm_yh = (lstm_test["test_scores_splits"][idx] >= thresholds["LSTM-AE"]).astype(int)
        if_yh = (if_test["test_scores_splits"][idx] >= thresholds["IF"]).astype(int)
        ocsvm_yh = (ocsvm_test["test_scores_splits"][idx] >= thresholds["OCSVM"]).astype(int)

        # Align to minimum length from the END so every model votes on the same days.
        min_test_len = min(len(lstm_yh), len(if_yh), len(ocsvm_yh))
        if min_test_len == 0:
            continue

        lstm_yh = lstm_yh[-min_test_len:]
        if_yh = if_yh[-min_test_len:]
        ocsvm_yh = ocsvm_yh[-min_test_len:]
        test_labels = y_tail[-min_test_len:]

        # Voting
        vote_sum_test = lstm_yh + if_yh + ocsvm_yh
        ens_yh = (vote_sum_test >= 2).astype(int)

        test_preds_ensemble.append(ens_yh)
        test_labels_list.append(test_labels)

        # Per-sim test metrics
        s_i, sp_i = sens_spec(test_labels, ens_yh)
        pod_i = metric_pod_anyhit(ens_yh, test_labels)
        tim_i = timeliness_single(ens_yh, test_labels)

        rows_test.append(dict(
            split="test", signal=S, sim=sim_d["sim"], model="3-Way-Voting",
            window=int(window_used), sens=s_i, spec=sp_i, pod=pod_i,
            timeliness=tim_i, n_points=int(min_test_len)
        ))

    # Aggregate test metrics (tail slices, consistent with the end alignment above)
    if test_preds_ensemble:
        min_test_len_agg = min(len(p) for p in test_preds_ensemble)
        A = np.stack([p[-min_test_len_agg:] for p in test_preds_ensemble], axis=1)
        O = np.stack([l[-min_test_len_agg:] for l in test_labels_list], axis=1)

        test_sens = metric_sensitivity(A, O)
        test_spec = metric_specificity(A, O)
        test_pod = metric_pod(A, O)
        test_tim = metric_timeliness(A, O)

        print(f"  TEST → Voting sens={test_sens:.3f}, spec={test_spec:.3f}, POD={test_pod:.3f}, tim={test_tim:.3f}")

        summary[S] = dict(
            model="3-Way-Voting",
            window=int(window_used),
            val_sens=float(val_sens), val_spec=float(val_spec), val_pod=float(val_pod),
            sensitivity=float(test_sens), specificity=float(test_spec),
            pod=float(test_pod), timeliness=float(test_tim)
        )

# ===== SAVE RESULTS =====
# Per-signal summary: print it with column means, then write it to CSV.
if len(summary) > 0:
    df = pd.DataFrame.from_dict(summary, orient="index")
    print("\n=== 3-WAY VOTING ENSEMBLE SUMMARY ===")
    print(df)
    test_means = df[["sensitivity","specificity","pod","timeliness"]].mean(numeric_only=True)
    print("\nMeans:\n", test_means)
    df.to_csv("Ensemble_3way_voting_results.csv", index=True)
    print("Saved: Ensemble_3way_voting_results.csv")

# Per-simulation validation rows, ordered by signal then simulation name.
if len(rows_val) > 0:
    dfv = pd.DataFrame(rows_val).sort_values(["signal","sim"])
    dfv.to_csv("Ensemble_3way_voting_per_sim_val.csv", index=False)
    print("Saved: Ensemble_3way_voting_per_sim_val.csv")

# Per-simulation test rows, same ordering.
if len(rows_test) > 0:
    dft = pd.DataFrame(rows_test).sort_values(["signal","sim"])
    dft.to_csv("Ensemble_3way_voting_per_sim_test.csv", index=False)
    print("Saved: Ensemble_3way_voting_per_sim_test.csv")

print("\n=== 3-WAY VOTING ENSEMBLE COMPLETE ===")


--- Signal 1 (Ensemble: LSTM vs IF) ---
  Using 60 train, 20 val (CSV), 20 test (CSV)


KeyboardInterrupt: 