<a href="https://colab.research.google.com/github/hwangho-kim/Utility-OAC/blob/main/Daily_FDC_Monitoring_with_Simplified_Isolation_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import joblib
from datetime import date, timedelta

# --- Korean font setup for plots ---
# koreanize_matplotlib is an optional dependency; presumably importing it
# configures matplotlib with a Hangul-capable font as a side effect (TODO confirm).
# When the package is not installed the import error is deliberately ignored so
# the rest of the notebook still runs (plot labels may then render without Hangul).
try:
    import koreanize_matplotlib
except ImportError:
    pass

# --- 데이터 생성 및 특징 추출 함수들 (이전과 동일) ---
def create_long_format_csv(filepath, num_wafers, anomaly_info=None,
                           num_steps=10, num_sensors=10, time_points=10, seed=42):
    """Generate a synthetic long-format sensor trace, write it to CSV and return it.

    One row is produced per (wafer, step, time index). Sensor ``i`` is Gaussian
    noise with standard deviation ``i * 0.1 + 0.5`` centred on ``i * 5 + step_id``.

    Args:
        filepath: Destination of the CSV file (written without the index).
        num_wafers: Number of wafers; wafer ids run from 1 to ``num_wafers``.
        anomaly_info: Optional dict describing an injected fault. Keys:
            ``wafer_id`` - affect wafers whose id is >= this value (all wafers
            when missing or None); ``step_id`` - affect only this step (all
            steps when missing or None); ``c_drift`` - offset added to
            ``Sensor_C`` (default 0); ``g_noise`` - factor multiplied into
            ``Sensor_G`` (default 1). An empty dict or None injects nothing.
        num_steps: Number of process steps per wafer (ids start at 1).
        num_sensors: Number of sensors, named ``Sensor_A``, ``Sensor_B``, ...
        time_points: Number of samples per (wafer, step).
        seed: Seed passed to ``np.random.seed`` before generating; ``None``
            leaves the global NumPy random state untouched.

    Returns:
        The generated DataFrame with columns ``wafer_id``, ``step_id``, one
        column per sensor, and ``time`` (in that order).
    """
    print(f"'{filepath}' 이름으로 샘플 CSV 파일을 생성합니다...")
    if seed is not None:
        np.random.seed(seed)

    sensor_names = [f'Sensor_{chr(65 + i)}' for i in range(num_sensors)]
    info = anomaly_info or {}
    first_affected_wafer = info.get('wafer_id')
    affected_step = info.get('step_id')

    records = []
    for wafer_id in range(1, num_wafers + 1):
        for step_id in range(1, num_steps + 1):
            # Sensors are drawn in name order so a given seed always yields the
            # same traces.
            traces = {
                name: np.random.randn(time_points) * (i * 0.1 + 0.5) + (i * 5 + step_id)
                for i, name in enumerate(sensor_names)
            }

            wafer_hit = first_affected_wafer is None or wafer_id >= first_affected_wafer
            step_hit = affected_step is None or step_id == affected_step
            if info and wafer_hit and step_hit:
                # The fault model targets two fixed sensors; skip whichever one
                # does not exist when fewer sensors were requested.
                if 'Sensor_C' in traces:
                    traces['Sensor_C'] = traces['Sensor_C'] + info.get('c_drift', 0)
                if 'Sensor_G' in traces:
                    traces['Sensor_G'] = traces['Sensor_G'] * info.get('g_noise', 1)

            for t_idx in range(time_points):
                row = {'wafer_id': wafer_id, 'step_id': step_id}
                row.update((name, traces[name][t_idx]) for name in sensor_names)
                row['time'] = t_idx
                records.append(row)

    # Explicit columns keep the header intact even when no rows were generated.
    df = pd.DataFrame(records, columns=['wafer_id', 'step_id', *sensor_names, 'time'])
    df.to_csv(filepath, index=False)
    print("샘플 파일 생성 완료.")
    return df

def find_columns(df):
    """Locate the wafer/step identifier columns and the numeric sensor columns.

    Identifier columns are matched case-insensitively against the fixed names
    ``wafer_id``, ``step_id`` and ``time``. The first matching column wins and
    is excluded from the sensor list. The time column is detected only so that
    it is not mistaken for a sensor; it is not returned.

    Args:
        df: Long-format DataFrame to inspect.

    Returns:
        Tuple ``(wafer_col, step_col, sensor_cols)``. ``wafer_col`` and
        ``step_col`` are ``None`` when no matching column exists;
        ``sensor_cols`` lists every remaining column with a numeric dtype.
    """
    id_candidates = {'wafer': ['wafer_id'], 'step': ['step_id'], 'time': ['time']}
    detected_cols = {}
    remaining_cols = list(df.columns)
    for id_type, candidates in id_candidates.items():
        # str() guards against non-string labels (e.g. integer column names),
        # which have no .lower() method.
        match = next(
            (col for col in remaining_cols if str(col).lower() in candidates),
            None,
        )
        detected_cols[id_type] = match
        if match is not None:
            remaining_cols.remove(match)
    sensor_cols = [col for col in remaining_cols if pd.api.types.is_numeric_dtype(df[col])]
    return detected_cols['wafer'], detected_cols['step'], sensor_cols

def extract_and_pivot_features(df, wafer_id_col, step_id_col, sensor_cols):
    """Summarise each sensor per (wafer, step) and spread the result to one row per wafer.

    For every sensor column eight statistics are computed within each
    (wafer, step) group: mean, std, max, min, median, skew, kurt and rms.
    The per-step statistics are then pivoted so that each wafer becomes a single
    row whose columns are named ``S<step>_<sensor>_<statistic>``.

    Args:
        df: Long-format DataFrame holding the raw traces.
        wafer_id_col: Name of the wafer identifier column.
        step_id_col: Name of the step identifier column.
        sensor_cols: Sensor columns to summarise.

    Returns:
        Wide DataFrame with the wafer id as a regular column followed by the
        feature columns; missing values are replaced by 0.
    """
    def rms(x):
        # The function's name ends up as the statistic suffix in the column labels.
        return np.sqrt(np.mean(x**2))

    statistics = ['mean', 'std', 'max', 'min', 'median', 'skew', pd.Series.kurt, rms]
    grouped = df.groupby([wafer_id_col, step_id_col])[sensor_cols]
    per_step = grouped.agg(statistics).reset_index()

    # Flatten the (sensor, statistic) MultiIndex into "<sensor>_<statistic>".
    flat_names = [wafer_id_col, step_id_col]
    flat_names.extend(
        f"{sensor}_{stat if isinstance(stat, str) else stat.__name__}"
        for sensor, stat in per_step.columns[2:]
    )
    per_step.columns = flat_names

    # One row per wafer; the step id moves into the column name as an "S<step>_" prefix.
    wide = per_step.pivot(index=wafer_id_col, columns=step_id_col)
    wide.columns = [f"S{int(step)}_{feature}" for feature, step in wide.columns.values]
    wide = wide.fillna(0)
    return wide.reset_index()

# --- Wafer 진단 리포트 함수 ---
def diagnose_top_wafers(df_daily, normal_stats, wafer_id_col, feature_cols, top_n=3):
    """Print the features that deviate most for the wafers with the highest Health Index.

    The ``top_n`` rows of ``df_daily`` with the largest ``health_index`` are
    selected. For each of them every feature is converted to a z-score against
    the reference statistics, and the three features with the largest absolute
    z-score are reported. Features whose reference standard deviation is not
    greater than 1e-6 are skipped to avoid dividing by (almost) zero.

    Args:
        df_daily: Wide per-wafer DataFrame containing ``wafer_id_col``, every
            column in ``feature_cols`` and a ``health_index`` column.
        normal_stats: DataFrame indexed by ``'mean'`` and ``'std'`` with one
            column per feature (reference statistics).
        wafer_id_col: Name of the wafer identifier column.
        feature_cols: Feature column names, expected in the form
            ``S<step>_<sensor>_<statistic>``.
        top_n: Number of wafers to report.

    Returns:
        A list with one dict per reported wafer (highest Health Index first),
        each holding ``wafer_id``, ``health_index`` and ``top_features`` - a
        list of ``(feature, z_score)`` tuples in the order they were printed.
    """
    print(f"\n--- 당일 Health Index 상위 {top_n}개 Wafer 상세 진단 ---")
    top = df_daily.sort_values(by='health_index', ascending=False).head(top_n)

    # Values are looked up by column label rather than through
    # itertuples()/getattr: pandas renames namedtuple fields that are not valid
    # Python identifiers, which made such feature names unreachable.
    feature_cols = list(feature_cols)
    means = normal_stats.loc['mean', feature_cols].to_numpy(dtype=float)
    stds = normal_stats.loc['std', feature_cols].to_numpy(dtype=float)
    usable = stds > 1e-6
    usable_names = [name for name, keep in zip(feature_cols, usable) if keep]
    values = top[feature_cols].to_numpy(dtype=float)
    z_matrix = (values[:, usable] - means[usable]) / stds[usable]

    diagnoses = []
    rows = zip(top[wafer_id_col].tolist(), top['health_index'].tolist(), z_matrix)
    for rank, (wafer_id, health_index, z_row) in enumerate(rows, start=1):
        ranked = sorted(
            zip(usable_names, z_row.tolist()),
            key=lambda item: abs(item[1]),
            reverse=True,
        )
        top_features = ranked[:3]

        print(f"\n[{rank}] Wafer ID: {wafer_id} (Health Index: {health_index:.4f})")
        print("  > 상위 원인 특징 (Z-score 기준):")
        for feature, score in top_features:
            # Feature names look like "S<step>_<sensor>_<statistic>"; the sensor
            # name itself may contain underscores.
            parts = feature.split('_')
            step_info, stat_info = parts[0].replace('S', ''), parts[-1]
            sensor_info = '_'.join(parts[1:-1])
            print(f"    - Step: {step_info}, Sensor: {sensor_info}, Statistic: {stat_info}, Z-score: {score:.2f}")

        diagnoses.append({
            'wafer_id': wafer_id,
            'health_index': health_index,
            'top_features': top_features,
        })
    return diagnoses

# ======================================================================================
# Phase 1: 기준 모델 생성 (최초 1회 실행)
# ======================================================================================
def train_and_save_reference_model(golden_data_path='golden_data.csv'):
    """Phase 1: fit the reference Isolation Forest on golden data and persist it.

    A 100-wafer sample data set is generated at ``golden_data_path``, read back,
    converted to per-wafer features, standardised and used to fit an
    ``IsolationForest``. The highest decision score observed on the golden data
    is stored as the zero point of the Health Index
    (``health_index = max_score_baseline - decision_score``).

    The following files are written to the current working directory:
    ``iforest_model.joblib``, ``scaler_final.joblib``, ``feature_cols.joblib``,
    ``max_score_baseline.joblib`` and ``normal_stats.joblib``.

    Args:
        golden_data_path: Where the generated golden CSV is written and read from.
    """
    banner = "=" * 25
    print("\n" + banner + " Phase 1: 기준 모델 생성 (Simplified Isolation Forest) " + banner)

    # Build the golden data set and turn it into one feature row per wafer.
    create_long_format_csv(golden_data_path, num_wafers=100)
    golden_long = pd.read_csv(golden_data_path)
    wafer_id_col, step_id_col, sensor_cols = find_columns(golden_long)
    golden_wide = extract_and_pivot_features(golden_long, wafer_id_col, step_id_col, sensor_cols)
    feature_cols = [name for name in golden_wide.columns if name != wafer_id_col]
    golden_features = golden_wide[feature_cols]

    scaler_final = StandardScaler()
    golden_features_scaled = scaler_final.fit_transform(golden_features)

    iforest = IsolationForest(contamination='auto', random_state=42)
    print("\n아이솔레이션 포레스트 모델 학습 시작...")
    iforest.fit(golden_features_scaled)
    print("모델 학습 완료.")

    # The best (largest) decision score on the golden data defines Health Index 0.
    decision_scores = iforest.decision_function(golden_features_scaled)
    max_score_baseline = decision_scores.max()
    print(f"\n가장 좋은 상태의 기준 점수(0점 기준): {max_score_baseline:.6f}")

    # Self-check: mean Health Index of the golden data itself.
    golden_health = max_score_baseline - decision_scores
    print(f"[자가 진단] 골든 데이터셋 자체의 평균 Health Index: {golden_health.mean():.4f}")

    # Unscaled per-feature mean/std, used later for z-score based diagnosis.
    normal_stats = golden_features.agg(['mean', 'std'])

    artifacts = {
        'iforest_model.joblib': iforest,
        'scaler_final.joblib': scaler_final,
        'feature_cols.joblib': feature_cols,
        'max_score_baseline.joblib': max_score_baseline,
        'normal_stats.joblib': normal_stats,
    }
    for filename, artifact in artifacts.items():
        joblib.dump(artifact, filename)

    print("\n기준 모델 생성 및 저장 완료.")
    print("=" * 70)

# ======================================================================================
# Phase 2: 일일 모니터링 (매일 반복 실행)
# ======================================================================================
def run_daily_monitoring(days_to_monitor=10):
    """Phase 2: score simulated daily data against the saved reference model.

    The artifacts written by ``train_and_save_reference_model`` are loaded from
    the current working directory. For each simulated day a 20-wafer data set
    with gradually increasing degradation (``c_drift = day * 0.05``,
    ``g_noise = 1 + day * 0.01``) is generated, converted to features and
    scored. A wafer's Health Index is ``max_score_baseline - decision_score``
    (lower is better) and the Daily Index is the mean over that day's wafers.
    The worst wafers of each day are diagnosed with ``diagnose_top_wafers``.

    The daily indices are written to ``daily_health_log.csv`` and plotted as a
    trend line. If the artifacts are missing, a message is printed and the
    function returns without doing anything else.

    Args:
        days_to_monitor: Number of consecutive days to simulate, starting today.
    """
    print("\n" + "="*25 + " Phase 2: 일일 모니터링 시작 " + "="*25)

    try:
        iforest = joblib.load('iforest_model.joblib')
        scaler_final = joblib.load('scaler_final.joblib')
        feature_cols = joblib.load('feature_cols.joblib')
        max_score_baseline = joblib.load('max_score_baseline.joblib')
        normal_stats = joblib.load('normal_stats.joblib')
        print("저장된 기준 모델을 성공적으로 불러왔습니다.")
    except FileNotFoundError as e:
        print(f"오류: 기준 모델 파일(.joblib)을 찾을 수 없습니다. Phase 1을 먼저 실행해주세요. ({e})")
        return

    feature_cols = list(feature_cols)
    daily_log = []
    start_date = date.today()

    for day in range(days_to_monitor):
        current_date = start_date + timedelta(days=day)
        print(f"\n--- {current_date} 데이터 모니터링 ---")

        # Simulated data: the degradation grows linearly with the day number.
        daily_data_path = f'daily_data_day_{day+1}.csv'
        degradation_info = {'c_drift': day * 0.05, 'g_noise': 1 + day * 0.01}
        create_long_format_csv(daily_data_path, num_wafers=20, anomaly_info=degradation_info)
        df_long_daily = pd.read_csv(daily_data_path)

        wafer_id_col, step_id_col, sensor_cols = find_columns(df_long_daily)
        df_wide_daily = extract_and_pivot_features(df_long_daily, wafer_id_col, step_id_col, sensor_cols)

        # Align to the training layout: same features, same order. reindex
        # drops unknown columns, adds any feature missing from today's data as 0
        # (the fill value the feature extraction itself uses for gaps) and
        # returns a new frame, so adding 'health_index' below does not write
        # into a slice of another DataFrame.
        df_wide_daily = df_wide_daily.reindex(
            columns=[wafer_id_col] + feature_cols, fill_value=0
        )

        daily_features_scaled = scaler_final.transform(df_wide_daily[feature_cols])

        # Health Index: distance below the best golden score (0 = as good as
        # the best golden wafer, larger = more anomalous).
        decision_scores = iforest.decision_function(daily_features_scaled)
        raw_health_indexes = max_score_baseline - decision_scores
        df_wide_daily['health_index'] = raw_health_indexes

        daily_index = np.mean(raw_health_indexes)

        print(f"{current_date}의 Daily Index: {daily_index:.4f}")
        daily_log.append({'date': current_date, 'daily_index': daily_index})

        diagnose_top_wafers(df_wide_daily, normal_stats, wafer_id_col, feature_cols, top_n=3)

    # Explicit columns keep the CSV header even when no day was monitored.
    log_df = pd.DataFrame(daily_log, columns=['date', 'daily_index'])
    log_df.to_csv('daily_health_log.csv', index=False)
    print("\n일일 모니터링 로그 저장 완료: daily_health_log.csv")

    if log_df.empty:
        # Nothing to draw (days_to_monitor <= 0).
        print("모니터링된 일자가 없어 추세 그래프를 생략합니다.")
        return

    plt.figure(figsize=(12, 6))
    sns.lineplot(x='date', y='daily_index', data=log_df, marker='o')
    plt.title('Daily Equipment Health Index Trend (Simplified)', fontsize=16)
    plt.xlabel('Date')
    plt.ylabel('Daily Health Index (Lower is Better)')
    plt.grid(True, linestyle='--')
    plt.tight_layout()
    plt.show()


# Script entry point: build the reference model first (Phase 1), because the
# monitoring phase loads the .joblib artifacts that Phase 1 writes.
if __name__ == '__main__':
    train_and_save_reference_model()
    run_daily_monitoring(days_to_monitor=10)