In [None]:
# cell1
import numpy as np

def window_vector_from_values(values):
    """
    Summarise a daily-mean time series as a 5-feature vector.

    Parameters
    ----------
    values : array-like
        Sequence convertible to a float NumPy array.

    Returns
    -------
    dict
        Keys ``mean``, ``std``, ``amplitude``, ``diff_mean`` and ``diff_std``
        (in that order). Every value is NaN when fewer than two samples are
        supplied, because no first difference can be formed.
    """
    series = np.asarray(values, dtype=float)

    # Guard clause: not enough points to compute differences.
    if len(series) < 2:
        return dict.fromkeys(
            ("mean", "std", "amplitude", "diff_mean", "diff_std"), np.nan
        )

    steps = np.diff(series)  # day-to-day changes
    highest = np.max(series)
    lowest = np.min(series)

    features = {}
    features["mean"] = float(np.mean(series))
    features["std"] = float(np.std(series, ddof=0))  # population std
    features["amplitude"] = float(highest - lowest)
    features["diff_mean"] = float(np.mean(steps))
    features["diff_std"] = float(np.std(steps, ddof=0))
    return features

print("[OK] window_vector_from_values loaded")

In [None]:
# cell2
import numpy as np

def window_vector_from_values(values):
    """
    Convert a daily-mean time series into a 5-dimensional feature vector.

    Returns a dict with ``mean``, ``std`` (population), ``amplitude``
    (max - min), ``diff_mean`` and ``diff_std`` (statistics of the first
    differences). All five are NaN when the input has fewer than two samples.
    """
    # NOTE(review): this definition repeats the one in cell1 line for line;
    # running this cell only rebinds the same name.
    arr = np.asarray(values, dtype=float)
    has_enough_points = len(arr) >= 2

    if not has_enough_points:
        nan = np.nan
        return {
            "mean": nan,
            "std": nan,
            "amplitude": nan,
            "diff_mean": nan,
            "diff_std": nan,
        }

    deltas = np.diff(arr)
    return dict(
        mean=float(np.mean(arr)),
        std=float(np.std(arr, ddof=0)),
        amplitude=float(np.max(arr) - np.min(arr)),
        diff_mean=float(np.mean(deltas)),
        diff_std=float(np.std(deltas, ddof=0)),
    )

print("[OK] window_vector_from_values loaded")

In [None]:
# cell3
# Predictive Maintenance - Normal Reference Builder (Integrated Cell)
# - Query a2_fct_table.fct_table for (station, step_desc, date range)
# - Aggregate daily mean (MMDD) + sample_amount
# - Build 5D feature vector (mean/std/amplitude/diff_mean/diff_std)
# - Save JSONB reference pattern into e4_predictive_maintenance.predictive_maintenance
#   with UPSERT on (station, step_description, pattern_name)
# ============================================

import json
import numpy as np
import pandas as pd
import urllib.parse

from sqlalchemy import create_engine, text
from IPython.display import display

# =========================
# [0] DB 엔진 생성 (필수)
# =========================
DB_CONFIG = {
    "host": "100.105.75.47",
    "port": 5432,
    "dbname": "postgres",
    "user": "postgres",
    # Left blank in the notebook on purpose (security). When blank,
    # get_engine() falls back to the PGPASSWORD environment variable.
    "password": "",
}

def get_engine(cfg=DB_CONFIG):
    """
    Build a SQLAlchemy engine for the PostgreSQL database described by ``cfg``.

    Parameters
    ----------
    cfg : dict
        Must provide ``user``, ``host``, ``port`` and ``dbname``. ``password``
        may be missing or empty, in which case the ``PGPASSWORD`` environment
        variable is used (empty string if that is unset too).

    Returns
    -------
    sqlalchemy.engine.Engine
        Engine created with ``pool_pre_ping=True``.
    """
    import os  # local import so this cell does not depend on another cell's imports

    # Both credentials are URL-escaped so characters such as '@', ':' or '/'
    # cannot break the connection URL.
    user = urllib.parse.quote_plus(str(cfg["user"]))
    raw_password = cfg.get("password") or os.environ.get("PGPASSWORD", "")
    password = urllib.parse.quote_plus(raw_password)
    host = cfg["host"]
    port = cfg["port"]
    dbname = cfg["dbname"]
    conn_str = f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{dbname}"
    return create_engine(conn_str, pool_pre_ping=True)

engine = get_engine()
print("[OK] SQLAlchemy engine created")

# =========================
# [1] Features + Vector 함수 (필수)
# =========================
FEATURES = ["mean", "std", "amplitude", "diff_mean", "diff_std"]

def window_vector_from_values(values):
    """
    Turn a daily-mean time series into the 5-feature vector
    (mean, std, amplitude, diff_mean, diff_std).

    ``std`` and ``diff_std`` are population standard deviations (``ddof=0``);
    ``amplitude`` is max minus min; the ``diff_*`` entries describe the first
    differences of the series. With fewer than two samples every entry is NaN.
    """
    samples = np.asarray(values, dtype=float)
    # Key order is spelled out locally so the function does not depend on the
    # module-level FEATURES name.
    names = ("mean", "std", "amplitude", "diff_mean", "diff_std")

    if len(samples) < 2:
        return {name: np.nan for name in names}

    changes = np.diff(samples)
    computed = (
        np.mean(samples),
        np.std(samples, ddof=0),
        np.max(samples) - np.min(samples),
        np.mean(changes),
        np.std(changes, ddof=0),
    )
    return {name: float(stat) for name, stat in zip(names, computed)}

print("[OK] window_vector_from_values loaded")

# =========================
# [2] 조건
# =========================
# Source table that holds the raw measurements.
SCHEMA = "a2_fct_table"
TABLE  = "fct_table"

# Station and test step the normal reference is built from.
station   = "FCT2"
step_desc = "1.34_Test_VUSB_Type-C_A(ELoad2=1.35A)vol"

# Date bounds are compared as YYYYMMDD strings (both ends inclusive via BETWEEN).
start_day = "20251229"
end_day   = "20260115"

# Destination table and the pattern name used as part of the UPSERT key.
TARGET_SCHEMA = "e4_predictive_maintenance"
TARGET_TABLE  = "predictive_maintenance"
pattern_name  = "pd_board_normal_ref"

# =========================
# NaN -> None 변환(JSONB용)
# =========================
def sanitize_for_json(obj):
    """
    Recursively replace values that strict JSON / JSONB cannot hold.

    - NaN and +/-inf floats become ``None``.
    - NumPy scalars are unwrapped to their Python equivalents first, so
      ``np.float32`` NaN is caught too and ``np.int64`` becomes ``int``.
    - NumPy arrays, lists and tuples become lists; dict values are sanitised
      (keys are left untouched).

    Any other object is returned unchanged.
    """
    # NumPy containers are not JSON-serialisable: convert to nested lists.
    if isinstance(obj, np.ndarray):
        return sanitize_for_json(obj.tolist())
    # np.float32 etc. are not ``float`` subclasses, so unwrap before the NaN test.
    if isinstance(obj, np.generic):
        obj = obj.item()
    if isinstance(obj, float) and (np.isnan(obj) or np.isinf(obj)):
        return None
    if isinstance(obj, dict):
        return {k: sanitize_for_json(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [sanitize_for_json(v) for v in obj]
    return obj

# =========================
# 1) DB 조회 (end_day를 YYYYMMDD로 변환해서 비교)
# =========================
# Fetch (day, station, value) rows for one station / step within the date range.
# end_day is cast to text and stripped of '-' so it can be compared against the
# YYYYMMDD bounds as a string. Schema/table names are interpolated from the
# constants above; the filter values are bound parameters.
sql = text(f"""
SELECT
  replace(CAST(end_day AS text), '-', '') AS end_day_yyyymmdd,
  station,
  value
FROM {SCHEMA}.{TABLE}
WHERE station = :station
  AND replace(CAST(end_day AS text), '-', '') BETWEEN :start_day AND :end_day
  AND step_description = :step_desc
ORDER BY end_day_yyyymmdd
""")

with engine.begin() as conn:
    raw = pd.read_sql(sql, conn, params={
        "station": station,
        "start_day": start_day,
        "end_day": end_day,
        "step_desc": step_desc
    })

# =========================
# 2) Guard against an empty result
# =========================
if raw.empty:
    raise ValueError(
        f"[ERROR] No rows found. Check end_day format/type in DB. "
        f"station={station}, range={start_day}~{end_day}, step_desc={step_desc}"
    )

# Non-numeric values become NaN and are dropped.
raw["value_num"] = pd.to_numeric(raw["value"], errors="coerce")
raw = raw.dropna(subset=["value_num"]).copy()

if raw.empty:
    raise ValueError("[ERROR] value column has no numeric rows after conversion.")

# Characters 4..7 of the day string, i.e. MMDD when the string is YYYYMMDD.
raw["mmdd"] = raw["end_day_yyyymmdd"].str.slice(4, 8)

# =========================
# 3) MMDD별 평균(2dp) + 표본수
# =========================
# Per-day mean (rounded to 2 dp) and sample count.
# Grouping and sorting use the full YYYYMMDD key rather than MMDD alone:
# an MMDD sort puts January before December when the range crosses a year
# boundary (e.g. 20251229~20260115), which scrambles the time order and
# corrupts the order-dependent features (diff_mean / diff_std).
avg_df_post = (
    raw.groupby(["station", "end_day_yyyymmdd", "mmdd"], as_index=False)
       .agg(value_avg=("value_num", "mean"),
            sample_amount=("value_num", "count"))
       .sort_values(["station", "end_day_yyyymmdd"])
       .reset_index(drop=True)
)
avg_df_post["value_avg"] = avg_df_post["value_avg"].round(2)

# Number of distinct calendar days that contributed to the baseline.
baseline_days = int(avg_df_post["end_day_yyyymmdd"].nunique())
if baseline_days < 2:
    raise ValueError(f"[ERROR] Not enough days to build vector. baseline_days={baseline_days}")

# =========================
# 4) Normal reference vector (one vector for the whole range)
# =========================
values = avg_df_post["value_avg"].values  # chronological daily means
v_normal_post = window_vector_from_values(values)

# Full-precision vector plus a 2-dp copy for display.
ref_pattern = {k: float(v_normal_post[k]) for k in FEATURES}
ref_pattern_2dp = {k: round(ref_pattern[k], 2) for k in FEATURES}

# =========================
# 5) DB 저장(JSONB)
# =========================
# Payload stored in the euclid_graph JSONB column: the reference vector plus
# the provenance (date range, number of baseline days, source table/station/step).
euclid_graph = {
    "type": "normal_reference",
    "features": FEATURES,
    "reference_pattern": ref_pattern,
    "reference_pattern_2dp": ref_pattern_2dp,
    "data_range": {"start_day": start_day, "end_day": end_day},
    "baseline_days": baseline_days,
    "source": {
        "schema": SCHEMA,
        "table": TABLE,
        "station": station,
        "step_description": step_desc
    },
}

# JSONB rejects NaN: replace NaN/inf with None first, then serialise with
# allow_nan=False so any leftover NaN raises instead of being written.
euclid_graph_clean = sanitize_for_json(euclid_graph)
euclid_graph_json = json.dumps(euclid_graph_clean, ensure_ascii=False, allow_nan=False)

# DDL is idempotent (IF NOT EXISTS), so re-running the cell is safe.
create_schema_sql = text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_SCHEMA};")

create_table_sql = text(f"""
CREATE TABLE IF NOT EXISTS {TARGET_SCHEMA}.{TARGET_TABLE} (
    station               TEXT NOT NULL,
    step_description      TEXT NOT NULL,
    pattern_name          TEXT NOT NULL,
    window_size           INTEGER NOT NULL,
    normal_start_mmdd     TEXT,
    normal_end_mmdd       TEXT,
    abnormal_start_mmdd   TEXT,
    abnormal_end_mmdd     TEXT,
    euclid_graph          JSONB NOT NULL,
    updated_at            TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    PRIMARY KEY (station, step_description, pattern_name)
);
""")

# Insert the pattern, or overwrite the existing row with the same
# (station, step_description, pattern_name) key. The abnormal_* columns are
# written as NULL on insert and are not touched on update.
upsert_sql = text(f"""
INSERT INTO {TARGET_SCHEMA}.{TARGET_TABLE} (
    station, step_description, pattern_name,
    window_size,
    normal_start_mmdd, normal_end_mmdd,
    abnormal_start_mmdd, abnormal_end_mmdd,
    euclid_graph, updated_at
) VALUES (
    :station, :step_description, :pattern_name,
    :window_size,
    :normal_start_mmdd, :normal_end_mmdd,
    NULL, NULL,
    CAST(:euclid_graph AS JSONB), NOW()
)
ON CONFLICT (station, step_description, pattern_name)
DO UPDATE SET
    window_size = EXCLUDED.window_size,
    normal_start_mmdd = EXCLUDED.normal_start_mmdd,
    normal_end_mmdd = EXCLUDED.normal_end_mmdd,
    euclid_graph = EXCLUDED.euclid_graph,
    updated_at = NOW();
""")

# All three statements run inside one engine.begin() block.
with engine.begin() as conn:
    conn.execute(create_schema_sql)
    conn.execute(create_table_sql)
    conn.execute(upsert_sql, {
        "station": station,
        "step_description": step_desc,
        "pattern_name": pattern_name,
        "window_size": baseline_days,
        "normal_start_mmdd": start_day[4:8],  # MMDD part of start_day, e.g. "1229"
        "normal_end_mmdd": end_day[4:8],      # MMDD part of end_day, e.g. "0115"
        "euclid_graph": euclid_graph_json,
    })

print("[OK] Saved:", pattern_name, "| baseline_days =", baseline_days)
display(avg_df_post)
print("Normal ref vector (2dp):", ref_pattern_2dp)


In [None]:
# cell4 (FCT2 포함 버전)
import pandas as pd
import numpy as np
from sqlalchemy import text

# Source table (same values as in cell3; redefined so the cell reads on its own).
SCHEMA = "a2_fct_table"
TABLE  = "fct_table"

# Stations to compare; FCT2 is included.
stations  = ["FCT1", "FCT2", "FCT3", "FCT4"]

# Inclusive YYYYMMDD bounds and the test step to analyse.
start_day = "20251229"
end_day   = "20260115"
step_desc = "1.34_Test_VUSB_Type-C_A(ELoad2=1.35A)vol"

# NOTE(review): this query uses trim(end_day) while cell3 uses
# replace(CAST(end_day AS text), '-', ''); presumably end_day is stored as a
# YYYYMMDD text column — confirm, since the two forms only agree in that case.
sql = text(f"""
SELECT
  trim(end_day) AS end_day_yyyymmdd,
  trim(station) AS station,
  step_description,
  value
FROM {SCHEMA}.{TABLE}
WHERE trim(station) = ANY(:stations)
  AND trim(end_day) BETWEEN :start_day AND :end_day
  AND step_description = :step_desc
ORDER BY station, end_day_yyyymmdd
""")

# Relies on `engine` created in cell3.
with engine.begin() as conn:
    raw = pd.read_sql(sql, conn, params={
        "stations": stations,
        "start_day": start_day,
        "end_day": end_day,
        "step_desc": step_desc
    })

print(f"[DEBUG] raw rows = {len(raw):,}")
if raw.empty:
    raise ValueError("[ERROR] raw is empty. (station/date/step_description 조건 불일치)")

print("[DEBUG] value sample (top10):", raw["value"].astype(str).head(10).tolist())

# First attempt: direct numeric conversion (unparseable values become NaN).
raw["value_num"] = pd.to_numeric(raw["value"], errors="coerce")
print(f"[DEBUG] numeric rows after to_numeric = {raw['value_num'].notna().sum():,} / {len(raw):,}")

# Fallback, used only when NOT A SINGLE row parsed: pull the first
# integer/decimal number out of each value string and convert that.
if raw["value_num"].notna().sum() == 0:
    raw["value_num"] = (
        raw["value"].astype(str)
        .str.extract(r"(-?\d+(?:\.\d+)?)", expand=False)
    )
    raw["value_num"] = pd.to_numeric(raw["value_num"], errors="coerce")
    print(f"[DEBUG] numeric rows after regex extract = {raw['value_num'].notna().sum():,} / {len(raw):,}")

raw = raw.dropna(subset=["value_num"]).copy()
if raw.empty:
    raise ValueError("[ERROR] value_num is empty after numeric parsing. value 포맷 확인 필요.")

# mmdd = characters 4..7 of the day string; date = full parsed date.
# Rows whose day string is not a valid YYYYMMDD date are dropped.
raw["mmdd"] = raw["end_day_yyyymmdd"].str.slice(4, 8)
raw["date"] = pd.to_datetime(raw["end_day_yyyymmdd"], format="%Y%m%d", errors="coerce")
raw = raw.dropna(subset=["date"]).copy()

# One row per (station, mmdd): mean value, sample count and earliest date.
# NOTE(review): the result is ordered by mmdd, which is not chronological when
# the range crosses a year boundary (e.g. "0102" sorts before "1229").
avg_df_all = (
    raw.groupby(["station", "mmdd"], as_index=False)
       .agg(
           value_avg=("value_num", "mean"),
           sample_amount=("value_num", "count"),
           date=("date", "min")
       )
       .sort_values(["station", "mmdd"])
       .reset_index(drop=True)
)

avg_df_all["value_avg"] = avg_df_all["value_avg"].round(2)

print(f"[OK] avg_df_all rows = {len(avg_df_all):,} | stations={sorted(avg_df_all['station'].unique().tolist())}")
display(avg_df_all.head(10))

In [None]:
# cell5
# ============================================
# [CELL 0] Analysis Date Range Definition (FIXED)
# - 2025 고정 변환 제거
# - START_DAY/END_DAY 기반으로 연도 자동 적용
# - 가능하면 end_day_yyyymmdd / date 컬럼을 우선 사용
# ============================================

import pandas as pd

# -----------------------------
# 1) 분석 기간 (여기만 바꿔서 사용)
# -----------------------------
# Analysis period: edit only these two values (YYYYMMDD strings).
START_DAY = "20251229"   # YYYYMMDD
END_DAY   = "20260115"   # YYYYMMDD

# Timestamp versions used for every date comparison below.
START_DT = pd.to_datetime(START_DAY, format="%Y%m%d")
END_DT   = pd.to_datetime(END_DAY, format="%Y%m%d")

print(f"[INFO] Analysis Period = {START_DT.date()} ~ {END_DT.date()}")

# -----------------------------
# 2) 안전 변환 함수들
# -----------------------------
def yyyymmdd_to_date(yyyymmdd: str) -> pd.Timestamp:
    """
    Parse an 8-digit ``YYYYMMDD`` value into a timestamp.

    The input is converted to ``str`` and stripped of surrounding whitespace
    first; anything that does not parse yields ``NaT`` instead of raising.
    """
    cleaned = str(yyyymmdd).strip()
    parsed = pd.to_datetime(cleaned, format="%Y%m%d", errors="coerce")
    return parsed

def mmdd_to_date_by_range(mmdd: str, start_dt: pd.Timestamp, end_dt: pd.Timestamp) -> pd.Timestamp:
    """
    Convert a 4-digit ``MMDD`` value to a date, picking the year from the range.

    Parameters
    ----------
    mmdd : str
        Month/day as four digits (surrounding whitespace is ignored).
    start_dt, end_dt : pd.Timestamp
        Inclusive bounds of the analysis period.

    Returns
    -------
    pd.Timestamp
        - ``NaT`` when ``mmdd`` is not exactly four digits.
        - The date in ``start_dt.year`` by default.
        - When the period spans two different years and the start-year date is
          invalid or outside the period, the date in ``end_dt.year`` if that
          one lies inside the period
          (e.g. start=20251228, end=20260105, mmdd=0102 -> 2026-01-02).
        - Otherwise the start-year candidate, which may be ``NaT``.
    """
    mmdd = str(mmdd).strip()
    if len(mmdd) != 4 or not mmdd.isdigit():
        return pd.NaT

    y_start = start_dt.year
    y_end   = end_dt.year

    # First guess: attach the start year.
    dt1 = pd.to_datetime(f"{y_start}{mmdd}", format="%Y%m%d", errors="coerce")

    if y_start != y_end:
        in1 = pd.notna(dt1) and (start_dt <= dt1 <= end_dt)
        if not in1:
            # The start-year candidate is unusable. That includes NaT, e.g.
            # "0229" when only the end year is a leap year, so the end year is
            # tried in that case as well.
            dt2 = pd.to_datetime(f"{y_end}{mmdd}", format="%Y%m%d", errors="coerce")
            if pd.notna(dt2) and (start_dt <= dt2 <= end_dt):
                return dt2

    return dt1

# -----------------------------
# 3) avg_df_all 기간 필터
# -----------------------------
# avg_df_all must already exist (it is built in cell4).
assert "avg_df_all" in globals(), "[ERROR] avg_df_all must be created before CELL 0"

avg_df_all = avg_df_all.copy()

# Resolve a `date` column, in order of preference:
# (1) an existing `date` column is re-parsed as datetime
if "date" in avg_df_all.columns:
    avg_df_all["date"] = pd.to_datetime(avg_df_all["date"], errors="coerce")

# (2) otherwise derive it from a full YYYYMMDD column
elif "end_day_yyyymmdd" in avg_df_all.columns:
    avg_df_all["date"] = avg_df_all["end_day_yyyymmdd"].apply(yyyymmdd_to_date)

# (3) last resort: only MMDD is available, so infer the year from START_DT/END_DT
elif "mmdd" in avg_df_all.columns:
    avg_df_all["date"] = avg_df_all["mmdd"].apply(lambda x: mmdd_to_date_by_range(x, START_DT, END_DT))

else:
    raise ValueError("[ERROR] avg_df_all must have one of ['date', 'end_day_yyyymmdd', 'mmdd']")

# Drop rows whose date could not be resolved.
avg_df_all = avg_df_all.dropna(subset=["date"]).copy()

# Keep only rows inside the analysis period (both ends inclusive).
# Note: this rebinds avg_df_all, so later cells see the filtered frame.
avg_df_all = avg_df_all[
    (avg_df_all["date"] >= START_DT) &
    (avg_df_all["date"] <= END_DT)
].reset_index(drop=True)

print(f"[OK] avg_df_all filtered rows = {len(avg_df_all):,}")
print(f"[OK] stations = {sorted(avg_df_all['station'].unique().tolist()) if len(avg_df_all) else []}")
display(avg_df_all.head(10))

In [None]:
# cell6
import json
from sqlalchemy import text

TARGET_SCHEMA = "e4_predictive_maintenance"
TARGET_TABLE  = "predictive_maintenance"

def load_pattern(station, step_desc, pattern_name):
    """
    Fetch one stored pattern (the ``euclid_graph`` column) from the target table.

    Parameters
    ----------
    station, step_desc, pattern_name : str
        Key of the row to read.

    Returns
    -------
    The ``euclid_graph`` value; when the driver hands it back as a string it
    is decoded with ``json.loads`` first.

    Raises
    ------
    ValueError
        If no row matches the key.
    """
    lookup = text(f"""
SELECT euclid_graph
    FROM {TARGET_SCHEMA}.{TARGET_TABLE}
    WHERE station=:station AND step_description=:step_desc AND pattern_name=:pattern_name
    """)
    bind_params = {
        "station": station,
        "step_desc": step_desc,
        "pattern_name": pattern_name,
    }

    with engine.begin() as conn:
        record = conn.execute(lookup, bind_params).fetchone()

    if record is None:
        raise ValueError(f"[ERROR] Pattern not found: {station} / {pattern_name}")

    graph = record[0]
    return json.loads(graph) if isinstance(graph, str) else graph

# 저장해 둔 패턴 로드
# Load the stored patterns for FCT2.
# NOTE(review): "pd_board_degradation_ref" is not written anywhere in the
# visible notebook; presumably it is saved by another notebook — confirm,
# otherwise load_pattern raises ValueError here.
g_normal = load_pattern("FCT2", step_desc, "pd_board_normal_ref")
g_abn    = load_pattern("FCT2", step_desc, "pd_board_degradation_ref")

# Feature order comes from the stored normal pattern (this rebinds FEATURES).
# `np` is not imported in this cell; it relies on an earlier cell's import.
FEATURES = g_normal["features"]
V_normal = np.array([g_normal["reference_pattern"][k] for k in FEATURES], dtype=float)
A_ref    = np.array([g_abn["reference_pattern"][k] for k in FEATURES], dtype=float)

# The 2-dp rounding below is for display only; V_normal / A_ref keep full precision.
print("[OK] FEATURES =", FEATURES)
print("[OK] V_normal(2dp) =", {k: round(float(g_normal["reference_pattern"][k]), 2) for k in FEATURES})
print("[OK] A_ref(2dp)    =", {k: round(float(g_abn["reference_pattern"][k]), 2) for k in FEATURES})

In [None]:
# cell8 (FCT2 포함 버전)
import numpy as np
import pandas as pd

WINDOW = 5

def cosine_sim(a, b, eps=1e-12):
    """
    Cosine similarity between vectors ``a`` and ``b``.

    Returns NaN when either vector's Euclidean norm is below ``eps``
    (the direction of a near-zero vector is undefined); otherwise a Python
    float equal to ``dot(a, b) / (|a| * |b|)``.
    """
    norm_a, norm_b = np.linalg.norm(a), np.linalg.norm(b)

    # Near-zero vector on either side: no meaningful angle.
    if any(length < eps for length in (norm_a, norm_b)):
        return np.nan

    similarity = np.dot(a, b) / (norm_a * norm_b)
    return float(similarity)

# Preconditions: avg_df_all exists and carries the columns the loop below reads.
assert "avg_df_all" in globals(), "[ERROR] avg_df_all 이(가) 없습니다. 이전 셀 실행 순서를 확인하세요."
need_cols = {"station", "mmdd", "value_avg"}
missing = need_cols - set(avg_df_all.columns)
assert not missing, f"[ERROR] avg_df_all 컬럼 누락: {missing}"

print(f"[INFO] avg_df_all rows={len(avg_df_all):,} | stations={sorted(avg_df_all['station'].unique().tolist())}")

# Stations to score (FCT2 included), with their row counts for a quick sanity check.
st_targets = ["FCT1", "FCT2", "FCT3", "FCT4"]
lens = {st: int((avg_df_all["station"] == st).sum()) for st in st_targets}
print("[INFO] rows per station:", lens, "| WINDOW =", WINDOW)

# One record per (station, sliding window): distance from the normal vector
# and similarity of the deviation to the stored anomaly deviation A_ref.
rows = []

for st in st_targets:
    df_st = avg_df_all[avg_df_all["station"] == st].copy()
    df_st = df_st.dropna(subset=["mmdd", "value_avg"]).copy()
    df_st["mmdd"] = df_st["mmdd"].astype(str)

    # Windows must follow time order. Sorting by MMDD alone puts January
    # before December when the period crosses a year boundary, so prefer the
    # real `date` column when it is available and fall back to MMDD otherwise.
    order_col = "date" if "date" in df_st.columns else "mmdd"
    df_st = df_st.sort_values(order_col, kind="mergesort").reset_index(drop=True)

    if len(df_st) < WINDOW:
        print(f"[WARN] skip {st}: len(df_st)={len(df_st)} < WINDOW({WINDOW})")
        continue

    for i in range(len(df_st) - WINDOW + 1):
        chunk = df_st.iloc[i:i+WINDOW]
        mmdd_end = chunk["mmdd"].iloc[-1]  # the window is labelled by its last day

        v = window_vector_from_values(chunk["value_avg"].values)
        V_t = np.array([v[k] for k in FEATURES], dtype=float)

        # Deviation of this window's feature vector from the normal reference.
        A_t = V_t - V_normal

        rows.append({
            "station": st,
            "mmdd": mmdd_end,
            "score_from_normal": float(np.linalg.norm(A_t)),
            "cos_sim_to_ref": cosine_sim(A_t, A_ref),
            "dist_to_ref": float(np.linalg.norm(A_t - A_ref)),
        })

# Build compare_df. When no window was produced, create an empty frame with
# the expected columns so later cells still find them, and print likely causes.
if not rows:
    compare_df = pd.DataFrame(columns=["station", "mmdd", "score_from_normal", "cos_sim_to_ref", "dist_to_ref"])
    print("[ERROR] compare_df 생성 실패: rows가 비어 있습니다.")
    print("        원인 후보:")
    print("        1) avg_df_all이 기간 필터(START_DAY/END_DAY)로 비어짐")
    print("        2) 대상 스테이션 데이터가 WINDOW(5)일 미만")
    print("        3) value_avg가 NaN으로 전부 drop됨")
else:
    # Rows are ordered by (station, mmdd); metrics are rounded for display.
    compare_df = pd.DataFrame(rows).sort_values(["station", "mmdd"]).reset_index(drop=True)
    compare_df["score_from_normal"] = compare_df["score_from_normal"].round(2)
    compare_df["cos_sim_to_ref"] = compare_df["cos_sim_to_ref"].round(3)
    compare_df["dist_to_ref"] = compare_df["dist_to_ref"].round(2)

display(compare_df.head(10))
print("[OK] compare_df rows =", len(compare_df))

In [None]:
# cell9 =========================
# 코사인 유사도 임계값
# =========================
# Cosine-similarity threshold used by the classification and plots below.
COS_TH = 0.70   # at or above 0.7 the deviation is treated as pointing the same way as the anomaly pattern

print("[OK] COS_TH defined:", COS_TH)

In [None]:
# cell10
import plotly.express as px
import pandas as pd

dfp = compare_df.copy()

def mmdd_to_date(mmdd):
    """
    Convert an MMDD value to a date, taking the year from START_DT / END_DT.

    The value is left-padded with zeros to four characters. The start year is
    tried first; when the period spans two different years and that candidate
    is invalid (NaT) or outside the period, the end-year date is returned if it
    lies inside the period. Otherwise the start-year candidate is returned,
    which may be NaT.
    """
    mmdd = str(mmdd).zfill(4)
    y_start = START_DT.year
    y_end   = END_DT.year

    d1 = pd.to_datetime(f"{y_start}{mmdd}", format="%Y%m%d", errors="coerce")
    # pd.isna(d1) is checked explicitly: comparisons against NaT are always
    # False, so an invalid start-year date (e.g. "0229" in a non-leap start
    # year) would otherwise skip the end-year attempt.
    if (y_start != y_end) and (pd.isna(d1) or d1 < START_DT or d1 > END_DT):
        d2 = pd.to_datetime(f"{y_end}{mmdd}", format="%Y%m%d", errors="coerce")
        if pd.notna(d2) and (START_DT <= d2 <= END_DT):
            return d2
    return d1

dfp["date"] = dfp["mmdd"].apply(mmdd_to_date)

# Sort chronologically per station so the lines connect in time order
# (Plotly joins points in row order).
dfp = (dfp.sort_values(['station','date'], kind='mergesort')
          .reset_index(drop=True))

# 1) 코사인 유사도 추이
# 1) Cosine similarity to the anomaly deviation, one line per station,
#    with the COS_TH threshold drawn as a dashed horizontal line.
fig1 = px.line(
    dfp,
    x="date",
    y="cos_sim_to_ref",
    color="station",
    markers=True,
    title=f"Cosine Similarity to Anomaly Pattern (A_ref) | TH={COS_TH}",
    labels={"date":"Date", "cos_sim_to_ref":"cosine(A_t, A_ref)", "station":"Station"},
)
fig1.add_hline(y=COS_TH, line_dash="dash")
fig1.update_xaxes(tickformat="%m-%d")
fig1.update_layout(width=1100, height=420)
fig1.show()

# 2) Distance from the normal baseline (norm of the deviation), one line per station.
fig2 = px.line(
    dfp,
    x="date",
    y="score_from_normal",
    color="station",
    markers=True,
    title="Score from Normal Baseline (||A_t||)",
    labels={"date":"Date", "score_from_normal":"||A_t||", "station":"Station"},
)
fig2.update_xaxes(tickformat="%m-%d")
fig2.update_layout(width=1100, height=420)
fig2.show()

In [None]:
# cell11 (연도 꼬임 제거 버전: START_DT/END_DT 기반)
import numpy as np
import pandas as pd
import plotly.express as px

K_MAD = 4.0
MIN_SAMPLES_FOR_ROBUST = 8

def mad(x: pd.Series) -> float:
    """
    Median absolute deviation of ``x`` (missing values ignored).

    Returns NaN for a series with no non-missing values.
    """
    observed = x.dropna().astype(float).values
    if observed.size == 0:
        return np.nan
    center = np.median(observed)
    deviations = np.abs(observed - center)
    return float(np.median(deviations))

def robust_threshold(x: pd.Series, k=K_MAD):
    """
    Upper threshold for a score series.

    - Fewer than ``MIN_SAMPLES_FOR_ROBUST`` non-missing values: 110% of the
      90th percentile (NaN when the series is empty or the percentile is not
      finite).
    - Otherwise ``median + k * MAD``; when the MAD is zero or not finite the
      95th percentile is used instead.
    """
    scores = x.dropna().astype(float)
    n_scores = len(scores)

    # Small-sample branch: too few points for a median/MAD estimate.
    if n_scores < MIN_SAMPLES_FOR_ROBUST:
        if not n_scores:
            return np.nan
        p90 = float(np.percentile(scores, 90))
        if np.isfinite(p90):
            return p90 * 1.10
        return np.nan

    center = float(np.median(scores))
    spread = mad(scores)
    if np.isfinite(spread) and spread != 0:
        return center + k * spread
    # Degenerate spread: fall back to a plain percentile.
    return float(np.percentile(scores, 95))

# mmdd → date (연도 자동)
def mmdd_to_date(mmdd: str) -> pd.Timestamp:
    """
    Convert an MMDD value to a date, taking the year from START_DT / END_DT.

    The value is left-padded with zeros to four characters. The start year is
    tried first; when the period spans two different years and that candidate
    is invalid (NaT) or outside the period, the end-year date is returned if it
    lies inside the period. Otherwise the start-year candidate is returned,
    which may be NaT.
    """
    mmdd = str(mmdd).zfill(4)
    y_start = START_DT.year
    y_end   = END_DT.year

    d1 = pd.to_datetime(f"{y_start}{mmdd}", format="%Y%m%d", errors="coerce")
    # An invalid start-year date (NaT, e.g. "0229" in a non-leap start year)
    # must also trigger the end-year attempt, not only an out-of-range one.
    if (y_start != y_end) and (pd.isna(d1) or (d1 < START_DT) or (d1 > END_DT)):
        d2 = pd.to_datetime(f"{y_end}{mmdd}", format="%Y%m%d", errors="coerce")
        if pd.notna(d2) and (START_DT <= d2 <= END_DT):
            return d2
    return d1

# 1) station별 threshold
# 1) Per-station threshold on score_from_normal (see robust_threshold).
th_df = (
    compare_df.groupby("station")["score_from_normal"]
    .apply(lambda s: robust_threshold(s, k=K_MAD))
    .reset_index(name="th_score")
)
display(th_df)

# Attach each station's threshold to its rows and rebuild a real date from mmdd.
dfi = compare_df.merge(th_df, on="station", how="left").copy()
dfi["date"] = dfi["mmdd"].apply(mmdd_to_date)

# 2) Boolean flags feeding the status classification.
# A NaN on either side of `>=` compares as False, so rows with an undefined
# cosine similarity or threshold are never flagged.
dfi["is_cos_like"] = dfi["cos_sim_to_ref"] >= COS_TH
dfi["is_score_high"] = dfi["score_from_normal"] >= dfi["th_score"]

def classify(row):
    """
    Map the two boolean flags of a row to a status label.

    ================  =================  ==========
    is_cos_like       is_score_high      status
    ================  =================  ==========
    True              True               CRITICAL
    True              False              WARNING
    False             True               WATCH
    False             False              OK
    ================  =================  ==========
    """
    resembles_anomaly = row["is_cos_like"]
    far_from_normal = row["is_score_high"]

    if resembles_anomaly:
        return "CRITICAL" if far_from_normal else "WARNING"
    return "WATCH" if far_from_normal else "OK"

# Row-wise classification into OK / WATCH / WARNING / CRITICAL.
dfi["status"] = dfi.apply(classify, axis=1)

# 3) 지속성(연속 경보일)
ALERT_LEVELS = {"WARNING", "CRITICAL"}

def add_consecutive_alerts(df_station: pd.DataFrame) -> pd.DataFrame:
    """
    Return a date-sorted copy of ``df_station`` with an ``alert_streak`` column.

    ``alert_streak`` counts how many consecutive rows, up to and including the
    current one, have a status in ``ALERT_LEVELS``; it resets to 0 on any
    other status. The input frame is not modified.
    """
    ordered = df_station.sort_values("date").copy()

    streaks = []
    run_length = 0
    for status in ordered["status"]:
        run_length = run_length + 1 if status in ALERT_LEVELS else 0
        streaks.append(run_length)

    ordered["alert_streak"] = streaks
    return ordered

# Compute alert streaks per station.
# include_groups=False keeps the grouping column out of the frame passed to
# add_consecutive_alerts; reset_index(level=0) then brings `station` back as a
# column from the group index, and the second reset discards the old row index.
dfi = (
    dfi.groupby("station", as_index=True, group_keys=True)
       .apply(add_consecutive_alerts, include_groups=False)
       .reset_index(level=0)
       .reset_index(drop=True)
)

# 4) Key summary tables
# Last N_LAST windows per station, in date order.
N_LAST = 5
latest_df = (
    dfi.sort_values(["station", "date"])
       .groupby("station")
       .tail(N_LAST)
       .sort_values(["station", "date"])
)
display(latest_df[[
    "station","mmdd","score_from_normal","th_score","cos_sim_to_ref","dist_to_ref",
    "status","alert_streak"
]])

# One row per station: the latest values (rows are date-sorted first, so
# "last" is the most recent window) plus maxima and alert counts.
summary = (
    dfi.sort_values(["station","date"])
       .groupby("station")
       .agg(
           last_date=("date","max"),
           last_status=("status","last"),
           last_score=("score_from_normal","last"),
           th_score=("th_score","last"),
           last_cos=("cos_sim_to_ref","last"),
           max_score=("score_from_normal","max"),
           max_cos=("cos_sim_to_ref","max"),
           max_streak=("alert_streak","max"),
           crit_days=("status", lambda s: int((s=="CRITICAL").sum())),
           warn_days=("status", lambda s: int((s=="WARNING").sum())),
       )
       .reset_index()
)
display(summary)

# 5) 최종 판단 텍스트
def interpret_station(row):
    """
    Render one line of human-readable interpretation for a ``summary`` row.

    Reads ``station``, ``last_status``, ``last_score``, ``th_score``,
    ``last_cos`` and ``max_streak`` from ``row`` and returns the message
    matching the station's latest status (CRITICAL, WARNING, WATCH, else OK).
    """
    st, last_status = row["station"], row["last_status"]
    score, th = row["last_score"], row["th_score"]
    cosv, streak = row["last_cos"], row["max_streak"]

    if last_status == "CRITICAL":
        message = (f"- {st}: 현재 CRITICAL. cos={cosv:.3f}(>=TH {COS_TH}), "
                   f"||A_t||={score:.2f}(>=th {th:.2f}). 최대 연속 경보 {streak}일. "
                   f"즉시 점검/교체 타이밍 검토 권장.")
    elif last_status == "WARNING":
        message = (f"- {st}: 현재 WARNING(전조). cos={cosv:.3f}(>=TH {COS_TH})로 방향성 정렬. "
                   f"||A_t||={score:.2f}, th={th:.2f}는 미초과/경계. "
                   f"추세 지속 시 CRITICAL 전이 가능.")
    elif last_status == "WATCH":
        message = (f"- {st}: 현재 WATCH. ||A_t||={score:.2f}(>=th {th:.2f})로 이탈은 있으나 "
                   f"cos={cosv:.3f}(<TH {COS_TH})로 ref 패턴 불일치. "
                   f"다른 원인 가능 → FAIL%, 다른 step 교차검증 권장.")
    else:
        message = (f"- {st}: 현재 OK. cos={cosv:.3f}, ||A_t||={score:.2f}(th {th:.2f}) 범위 내.")
    return message

# Print one interpretation line per station, in station order.
print("\n[INTERPRETATION]")
for _, r in summary.sort_values("station").iterrows():
    print(interpret_station(r))

# 6) Status timeline: map each status to a numeric level so it can be plotted.
status_map = {"OK":0, "WATCH":1, "WARNING":2, "CRITICAL":3}
dfv = dfi.copy()
dfv["status_level"] = dfv["status"].map(status_map)

fig_status = px.line(
    dfv,
    x="date",
    y="status_level",
    color="station",
    markers=True,
    title=f"Station Status Timeline (0=OK,1=WATCH,2=WARNING,3=CRITICAL) | COS_TH={COS_TH}, K_MAD={K_MAD}",
    labels={"date":"Date","status_level":"Status Level","station":"Station"},
)
fig_status.update_xaxes(tickformat="%m-%d")
# Integer ticks only, with a small margin around the 0..3 range.
fig_status.update_yaxes(dtick=1, range=[-0.2, 3.2])
fig_status.update_layout(width=1100, height=420)
fig_status.show()