In [12]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

DATASET_VERSION = 'v2'

# Known dataset versions -> (metrics CSV, incident-interval JSON).
# An explicit lookup is used so that a mistyped version fails loudly instead of
# silently loading the v2 files and labelling the saved results with the wrong name.
DATASET_FILES = {
    'baseline': (
        '../data/synthetic_data_baseline.csv',
        '../src/incident_intervals_baseline.json',
    ),
    'v2': (
        '../data/synthetic_data_v2_improved.csv',
        '../src/incident_intervals_v2_improved.json',
    ),
}

if DATASET_VERSION not in DATASET_FILES:
    raise ValueError(
        f"Unknown DATASET_VERSION {DATASET_VERSION!r}; expected one of {sorted(DATASET_FILES)}"
    )
dataset_path, intervals_path = DATASET_FILES[DATASET_VERSION]

df = pd.read_csv(dataset_path)
df['timestamp'] = pd.to_datetime(df['timestamp'])

with open(intervals_path, 'r', encoding='utf-8') as f:
    incident_intervals = json.load(f)

# The last 15% of the rows (by position) form the test split.
test_start = int(len(df) * 0.85)
df_test = df.iloc[test_start:].reset_index(drop=True)

# Count incident-labelled rows once and reuse the value for both figures below.
n_test_incidents = (df_test['incident_label'] == 1).sum()

print(f"\nTest set: {len(df_test)} timesteps")
print(f"Test incidents: {n_test_incidents} timesteps ({n_test_incidents/len(df_test)*100:.2f}%)")


Test set: 3000 timesteps
Test incidents: 275 timesteps (9.17%)


In [27]:
# Bounds of the test window, expressed as row positions in the full frame.
test_start = int(len(df) * 0.85)
test_end = len(df)

# Keep every incident whose [start_idx, end_idx) span overlaps the test window,
# i.e. drop those that end at/before the window start or begin at/after its end.
events_test = [
    inc for inc in incident_intervals
    if not (int(inc["end_idx"]) <= test_start or int(inc["start_idx"]) >= test_end)
]

# (start, end, scenario) triples for the first few events, for a quick visual check.
preview = [
    (e["start_idx"], e["end_idx"], e.get("scenario",""))
    for e in events_test[:5]
]

print("Test incident EVENTS:", len(events_test))
print("First 5 events:", preview)



Test incident EVENTS: 4
First 5 events: [(17416, 17459, 'service_crash'), (17968, 17996, 'traffic_spike'), (18651, 18839, 'traffic_spike'), (19671, 19687, 'backend_failure')]


In [63]:
class BaselineRule:
    """
    - Alert if ANY metric exceeds its threshold
    - Optional: require k consecutive violations (stability filter)
    """

    def __init__(
        self,
        thresholds_up: dict,
        k_consecutive: int = 1,
        request_rate_drop: dict | None = None
    ):
        self.thresholds_up = thresholds_up
        self.k_consecutive = k_consecutive
        self.request_rate_drop = request_rate_drop

    def predict(self, df):
        n = len(df)
        violations = np.zeros(n, dtype=int)
        for metric, thr in self.thresholds_up.items():
            if metric in df.columns:
                violations += (df[metric].to_numpy() > thr).astype(int)

        alerts = (violations > 0)

        # DOWN detector for request_rate
        if self.request_rate_drop is not None:
            col = self.request_rate_drop.get("column", "request_rate")
            window = int(self.request_rate_drop.get("window", 60))
            ratio = float(self.request_rate_drop.get("ratio", 0.4))

            if col in df.columns:
                rr = df[col].to_numpy(dtype=float)

                # rolling median baseline (shifted by 1 to avoid looking at current point)
                rr_med = (
                    pd.Series(rr)
                      .rolling(window=window, min_periods=window)
                      .median()
                      .shift(1)
                      .to_numpy()
                )

                rr_drop = (rr_med > 0) & (rr < ratio * rr_med)
                alerts = alerts | rr_drop  # OR with existing alerts

        # k-consecutive confirmation (optional)
        alerts = alerts.astype(int)

        if self.k_consecutive > 1:
            filtered = np.zeros(n, dtype=int)
            for i in range(n - self.k_consecutive + 1):
                if alerts[i:i + self.k_consecutive].all():
                    filtered[i] = 1
            alerts = filtered

        return np.asarray(alerts, dtype=int)

    def __repr__(self):
        up = ", ".join([f"{k}>{v}" for k, v in self.thresholds_up.items()])
        if self.request_rate_drop:
            d = self.request_rate_drop
            drop_str = f"{d.get('column','request_rate')}<({d.get('ratio',0.4)}×median_{d.get('window',60)}m)"
            return f"BaselineRule({up}, +DOWN:{drop_str}, k={self.k_consecutive})"
        return f"BaselineRule({up}, k={self.k_consecutive})"


# Upper limits for the simple rule: a timestep alerts as soon as any one of
# these metrics is strictly above its limit.
thresholds_simple = dict(
    cpu_utilization=85,
    memory_usage=85,
    request_latency=250,
    error_rate=10,
)

# DOWN detector settings: alert when request_rate falls below 0.4x the median
# of the preceding 60 rows.
request_rate_drop_cfg = dict(column="request_rate", window=60, ratio=0.4)

baseline_simple = BaselineRule(
    thresholds_up=thresholds_simple,
    k_consecutive=1,
    request_rate_drop=request_rate_drop_cfg,
)

print("Baseline Rule (Simple Threshold):")
print(f"  {baseline_simple}")
print("\nThresholds:")
for metric_name, limit in thresholds_simple.items():
    print(f"  {metric_name}: > {limit}")

Baseline Rule (Simple Threshold):
  BaselineRule(cpu_utilization>85, memory_usage>85, request_latency>250, error_rate>10, +DOWN:request_rate<(0.4×median_60m), k=1)

Thresholds:
  cpu_utilization: > 85
  memory_usage: > 85
  request_latency: > 250
  error_rate: > 10


In [64]:
class BaselineRuleVoting:
    """
    Hybrid rule combining critical metrics with a vote across all metrics.

    - A violation of any critical metric alerts immediately
    - Otherwise at least ``votes_required`` metrics must violate their thresholds
    - The optional ``k_consecutive`` confirmation applies to the vote only
    """

    def __init__(self, thresholds, votes_required=2, k_consecutive=1,
                 critical_metrics=("error_rate", "request_latency")):
        self.thresholds = thresholds
        self.votes_required = votes_required
        self.k_consecutive = k_consecutive
        self.critical_metrics = set(critical_metrics)

    def predict(self, df: pd.DataFrame) -> np.ndarray:
        """Return a 0/1 integer array with one alert flag per row of ``df``."""
        n_rows = len(df)
        vote_count = np.zeros(n_rows, dtype=int)
        any_critical = np.zeros(n_rows, dtype=bool)

        # Only metrics that are actually present in the frame take part.
        active = [(name, limit) for name, limit in self.thresholds.items()
                  if name in df.columns]

        for name, limit in active:
            exceeded = df[name].to_numpy() > limit
            vote_count = vote_count + exceeded.astype(int)
            if name in self.critical_metrics:
                any_critical = any_critical | exceeded

        enough_votes = vote_count >= self.votes_required

        # Trailing-window confirmation for the vote (critical alerts bypass it):
        # the rolling sum reaches k only when the last k votes all passed.
        if self.k_consecutive > 1:
            trailing = pd.Series(enough_votes.astype(int)).rolling(self.k_consecutive).sum()
            enough_votes = (trailing == self.k_consecutive).to_numpy()

        return (any_critical | enough_votes).astype(int)

    def __repr__(self):
        limits = ', '.join(f'{k}>{v}' for k, v in self.thresholds.items())
        criticals = ', '.join(sorted(self.critical_metrics))
        head = f"BaselineRuleVoting({limits}, votes≥{self.votes_required}, "
        tail = f"k={self.k_consecutive}, critical=[{criticals}])"
        return head + tail



# Per-metric limits for the voting rule; slightly lower than the simple rule's.
thresholds_voting = dict(
    cpu_utilization=80,
    memory_usage=80,
    request_latency=200,
    error_rate=8,
    # request_rate=1800,  (upper limit on request_rate left disabled)
)

baseline_voting = BaselineRuleVoting(
    thresholds=thresholds_voting,
    votes_required=2,
    k_consecutive=2,
    critical_metrics=("error_rate", "request_latency"),
)

# One-line, human-readable description of the decision logic.
logic_summary = (
    f"\nLogic: Alert if (critical metric) OR (≥{baseline_voting.votes_required} metrics) "
    f"for ≥{baseline_voting.k_consecutive} consecutive minutes"
)

print("Baseline Rule (Voting System):")
print(f"  {baseline_voting}")
print(logic_summary)

Baseline Rule (Voting System):
  BaselineRuleVoting(cpu_utilization>80, memory_usage>80, request_latency>200, error_rate>8, votes≥2, k=2, critical=[error_rate, request_latency])

Logic: Alert if (critical metric) OR (≥2 metrics) for ≥2 consecutive minutes


In [69]:
def count_alert_events(y_pred) -> int:
    """
    Count alert events in a 0/1 prediction sequence.

    An event starts wherever a 1 directly follows a 0; a sequence that begins
    with 1 contributes one additional event. Empty input yields 0.
    """
    flags = np.asarray(y_pred, dtype=int)
    if not len(flags):
        return 0

    previous, current = flags[:-1], flags[1:]
    rising_edges = np.count_nonzero((current == 1) & (previous == 0))
    starts_active = 1 if flags[0] == 1 else 0
    return int(rising_edges + starts_active)

def alert_events_per_day(y_pred: np.ndarray, timesteps_per_day: float = 1440) -> float:
    """
    Average number of alert events per day.

    Parameters
    ----------
    y_pred : array-like of 0/1 alert flags, one per timestep.
    timesteps_per_day : number of timesteps that make up one day. The default
        of 1440 presumably corresponds to one row per minute — confirm against
        the dataset's sampling interval.

    Returns
    -------
    float
        Events per day; 0.0 for an empty sequence (instead of dividing by zero).
    """
    n_timesteps = len(y_pred)
    if n_timesteps == 0:
        return 0.0
    return count_alert_events(y_pred) / n_timesteps * timesteps_per_day

def event_recall_from_intervals(y_pred, incident_intervals_json, test_start, test_end):
    """
    Event-level recall: the share of incidents with at least one alert.

    Parameters
    ----------
    y_pred : array-like of 0/1 alert flags for the test window; position 0
        corresponds to global row ``test_start``. Lists are accepted as well
        as arrays.
    incident_intervals_json : iterable of dicts carrying "start_idx" and
        "end_idx" as global row positions; the interval is treated as
        half-open, [start_idx, end_idx).
    test_start, test_end : global bounds of the test window, [test_start, test_end).

    Returns
    -------
    (detected, total, recall)
        Incidents not overlapping the test window are ignored. ``recall`` is
        0.0 when no incident overlaps the window.
    """
    # Coerce once so slicing + .any() work for plain lists too.
    alerts = np.asarray(y_pred)

    detected = 0
    total = 0

    for inc in incident_intervals_json:
        s = int(inc["start_idx"])
        e = int(inc["end_idx"])
        if e <= test_start or s >= test_end:
            continue

        total += 1
        # Translate to window-local positions, clipped to the window so an
        # incident straddling either edge is still checked on its visible part.
        local_s = max(0, s - test_start)
        local_e = min(test_end - test_start, e - test_start)

        if alerts[local_s:local_e].any():
            detected += 1

    return detected, total, detected / max(total, 1)

def evaluate_baseline_clean(rule, df, incident_intervals_json, test_start, test_end, name):
    """
    Evaluate ``rule`` on rows [test_start, test_end) of ``df`` and print a report.

    Computes timestep-level precision/recall/F1 and the confusion matrix
    against the ``incident_label`` column, the alert volume per day (alerting
    timesteps and distinct alert events, both scaled by 1440), and the
    event-level recall over the incidents in ``incident_intervals_json``.

    Returns a dict of plain Python floats/ints with all of these figures.
    """
    window = df.iloc[test_start:test_end].copy()
    labels = window["incident_label"].to_numpy()
    preds = np.asarray(rule.predict(window), dtype=int)

    # Timestep-level classification quality; undefined ratios are reported as 0.
    score_options = {"zero_division": 0}
    precision = precision_score(labels, preds, **score_options)
    recall = recall_score(labels, preds, **score_options)
    f1 = f1_score(labels, preds, **score_options)
    tn, fp, fn, tp = confusion_matrix(labels, preds, labels=[0,1]).ravel()

    # Alert volume.
    alert_minutes_day = preds.sum() / len(preds) * 1440
    n_alert_events = count_alert_events(preds)
    alert_events_day = alert_events_per_day(preds)

    # Incident-level detection.
    n_detected, n_incidents, event_recall = event_recall_from_intervals(
        preds, incident_intervals_json, test_start, test_end
    )

    separator = "="*70
    report_lines = [
        "\n" + separator,
        f"EVALUATING: {name}",
        separator,
        f"Rule: {rule}",
        f"Precision={precision:.3f} Recall(timestep)={recall:.3f} F1={f1:.3f}",
        f"TN={tn} FP={fp} FN={fn} TP={tp}",
        "\nAlert rate:",
        f"  Minutes with alert/day: {alert_minutes_day:.1f}",
        f"  Alert EVENTS/day:       {alert_events_day:.2f} (events={n_alert_events})",
        "\nIncident EVENT detection:",
        f"  Detected incidents: {n_detected}/{n_incidents} = {event_recall:.3f}",
    ]
    for line in report_lines:
        print(line)

    return dict(
        precision=float(precision),
        recall_timestep=float(recall),
        f1=float(f1),
        tn=int(tn),
        fp=int(fp),
        fn=int(fn),
        tp=int(tp),
        minute_alerts_per_day=float(alert_minutes_day),
        alert_events_per_day=float(alert_events_day),
        n_alert_events=int(n_alert_events),
        incident_event_recall=float(event_recall),
        incident_events_detected=int(n_detected),
        incident_events_total=int(n_incidents),
    )

# Evaluate both baselines on the same test window and incident list.
res_simple = evaluate_baseline_clean(
    rule=baseline_simple,
    df=df,
    incident_intervals_json=incident_intervals,
    test_start=test_start,
    test_end=test_end,
    name="Simple ANY metric",
)

res_voting = evaluate_baseline_clean(
    rule=baseline_voting,
    df=df,
    incident_intervals_json=incident_intervals,
    test_start=test_start,
    test_end=test_end,
    name="Voting hybrid",
)


EVALUATING: Simple ANY metric
Rule: BaselineRule(cpu_utilization>85, memory_usage>85, request_latency>250, error_rate>10, +DOWN:request_rate<(0.4×median_60m), k=1)
Precision=0.942 Recall(timestep)=0.891 F1=0.916
TN=2710 FP=15 FN=30 TP=245

Alert rate:
  Minutes with alert/day: 124.8
  Alert EVENTS/day:       3.36 (events=7)

Incident EVENT detection:
  Detected incidents: 4/4 = 1.000

EVALUATING: Voting hybrid
Rule: BaselineRuleVoting(cpu_utilization>80, memory_usage>80, request_latency>200, error_rate>8, votes≥2, k=2, critical=[error_rate, request_latency])
Precision=0.880 Recall(timestep)=0.804 F1=0.840
TN=2695 FP=30 FN=54 TP=221

Alert rate:
  Minutes with alert/day: 120.5
  Alert EVENTS/day:       4.32 (events=9)

Incident EVENT detection:
  Detected incidents: 4/4 = 1.000


In [68]:
# Per-incident breakdown for the simple rule: was each test incident hit by at least one alert?
y_pred_simple = np.asarray(baseline_simple.predict(df.iloc[test_start:test_end]), dtype=int)

for inc in incident_intervals:
    s, e, sc = int(inc["start_idx"]), int(inc["end_idx"]), inc.get("scenario","")
    if e <= test_start or s >= test_end:
        continue
    # Clip to the test window. Without the lower clip, an incident that starts
    # before test_start produces a negative slice start, which indexes from the
    # end of the array and would wrongly report MISS.
    ls = max(0, s - test_start)
    le = min(len(y_pred_simple), e - test_start)
    hit = y_pred_simple[ls:le].any()
    print(sc, s, e, "HIT" if hit else "MISS")


service_crash 17416 17459 HIT
traffic_spike 17968 17996 HIT
traffic_spike 18651 18839 HIT
backend_failure 19671 19687 HIT


In [70]:
def _run_summary(res: dict) -> dict:
    """Reshape one evaluation result into the "metrics" / "confusion_matrix" layout of the results file."""
    return {
        "metrics": {
            "precision": res["precision"],
            "recall_timestep": res["recall_timestep"],
            "f1": res["f1"],
            # stored under a different key than the evaluation dict uses
            "alert_minutes_per_day": res["minute_alerts_per_day"],
            "alert_events_per_day": res["alert_events_per_day"],
            "incident_event_recall": res["incident_event_recall"],
            "incident_events_detected": res["incident_events_detected"],
            "incident_events_total": res["incident_events_total"],
        },
        "confusion_matrix": {
            "tp": res["tp"],
            "fp": res["fp"],
            "tn": res["tn"],
            "fn": res["fn"],
        },
    }


# Rule configuration plus evaluation figures for both baselines. Building both
# entries through _run_summary keeps their layout identical by construction.
baseline_results = {
    "dataset_version": str(DATASET_VERSION),
    "test_timesteps": int(len(df_test)),
    "test_incident_timesteps": int((df_test["incident_label"] == 1).sum()),

    "simple_threshold_plus_drop": {
        "thresholds_up": thresholds_simple,
        "k_consecutive": int(baseline_simple.k_consecutive),
        "request_rate_drop": baseline_simple.request_rate_drop,
        **_run_summary(res_simple),
    },

    "voting_hybrid": {
        "thresholds": thresholds_voting,
        "votes_required": int(baseline_voting.votes_required),
        "k_consecutive": int(baseline_voting.k_consecutive),
        "critical_metrics": sorted(baseline_voting.critical_metrics),
        **_run_summary(res_voting),
    },
}

output_file = f"../results/baseline_results_{DATASET_VERSION}.json"
# Create the results directory on first run so the write below cannot fail
# after all the evaluation work has already been done.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(baseline_results, f, indent=2, ensure_ascii=False)

print(f"Saved: {output_file}")

# The "best" baseline is the one with the higher timestep-level F1; ties go to the simple rule.
f1_simple = baseline_results["simple_threshold_plus_drop"]["metrics"]["f1"]
f1_voting = baseline_results["voting_hybrid"]["metrics"]["f1"]
best = "simple_threshold_plus_drop" if f1_simple >= f1_voting else "voting_hybrid"
m = baseline_results[best]["metrics"]
print("FINAL SUMMARY")
print(f"Best baseline: {best}")
print(f"  Recall(timestep): {m['recall_timestep']:.3f}")
print(f"  Precision:        {m['precision']:.3f}")
print(f"  Alert events/day: {m['alert_events_per_day']:.2f}")
print(f"  Event recall:     {m['incident_event_recall']:.3f}")

Saved: baseline_results_v2.json
FINAL SUMMARY
Best baseline: simple_threshold_plus_drop
  Recall(timestep): 0.891
  Precision:        0.942
  Alert events/day: 3.36
  Event recall:     1.000
