In [4]:
# 현재 커널/버전 확인(선택)
import sys
print("Python:", sys.version)
print("Exec  :", sys.executable)

# numpy/pandas 재설치 (호환 버전 고정)
%pip install -U --force-reinstall "numpy==1.26.4" "pandas==2.2.2"

# (선택) 추가로 자주 쓰는 것들
%pip install -U scikit-learn matplotlib


Python: 3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0]
Exec  : /opt/conda/bin/python
Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting pandas==2.2.2
  Using cached pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting python-dateutil>=2.8.2 (from pandas==2.2.2)
  Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting pytz>=2020.1 (from pandas==2.2.2)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas==2.2.2)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting six>=1.5 (from python-dateutil>=2.8.2->pandas==2.2.2)
  Using cached six-1.17.0-py2.py3-none-any.whl.metadata (1.7 kB)
Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Using cached pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manyli

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
MLflow-친화 파이프라인 (v3, 개선판)
- EDA -> Feature Engineering -> 여러 모델(Linear/RF/(XGB)) 학습/검증/미래예측
- 변경점:
  * mlflow 모델 로깅: artifact_path -> name (MLflow 3.x 경고 제거)
  * 모델 로깅 가속: pip_requirements 최소화 또는 비활성화
  * Permutation Importance 경량화: 테스트셋 샘플링(<=500), n_repeats=3
"""

from pathlib import Path
import os, warnings, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")
# Only sets the variable when it is not already defined in the environment.
os.environ.setdefault("GIT_PYTHON_REFRESH", "quiet")  # hide the git warning

# ---------------- MLflow: module-level imports (do not rebind inside functions) ----------------
import mlflow
import mlflow.data
try:
    import mlflow.sklearn as mlflow_sklearn
except Exception:
    # Sentinel checked by the helpers below: None means "sklearn flavor unavailable".
    mlflow_sklearn = None

# Autolog is enabled by default; it can be turned off if it proves too slow.
try:
    if mlflow_sklearn is not None:
        mlflow_sklearn.autolog(log_models=False, log_input_examples=True, silent=True)
        # log_models=False: automatic model logging is disabled; models are logged
        # manually (and faster) further down in this file.
except Exception:
    # Best effort: a failure to enable autolog must not stop the script.
    pass

# ---------------- Paths / settings ----------------
PROC = Path("/root/covid_processed.csv")      # input (already preprocessed)
FEAT = Path("/root/covid_features.csv")       # feature-engineering output
OUTDIR = Path("/root")
TARGET = "new_cases"                           # change if needed
HORIZON = 30                                   # number of future days to forecast
TEST_DAYS = 60                                 # test window length (counted back from the last date)
LOOKBACKS = [1, 7, 14]                         # lag offsets
ROLLS = [7, 14, 28]                            # rolling-window sizes
EXPERIMENT_NAME = "covid_timeseries_prophet_lstm"  # kept identical to the existing experiment

# Module-level side effect: selects the active MLflow experiment by name.
mlflow.set_experiment(EXPERIMENT_NAME)

# ---------------- 로깅 도우미 ----------------
def log_df_preview_md(df: pd.DataFrame, name: str, n: int = 20):
    """Log the first ``n`` rows of ``df`` to MLflow as a Markdown table.

    The rendered table is stored as the text artifact ``<name>_preview.md``.
    The row index is omitted from the table.
    """
    preview_md = df.head(n).to_markdown(index=False)
    artifact_file = f"{name}_preview.md"
    mlflow.log_text(preview_md, artifact_file)

def log_json(d: dict, name: str):
    """Log ``d`` to MLflow as the pretty-printed JSON text artifact ``<name>.json``.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) with a
    two-space indent.
    """
    payload = json.dumps(d, indent=2, ensure_ascii=False)
    artifact_file = f"{name}.json"
    mlflow.log_text(payload, artifact_file)

def lineplot(dates, series, title, fname):
    """Plot ``series`` against ``dates`` and log the figure to MLflow as ``fname``.

    Args:
        dates: x-axis values.
        series: y-axis values, same length as ``dates``.
        title: Figure title.
        fname: Artifact file name passed to ``mlflow.log_figure``.
    """
    fig = plt.figure()
    try:
        plt.plot(dates, series)
        plt.title(title)
        plt.xlabel("date")
        plt.ylabel("value")
        mlflow.log_figure(fig, fname)
    finally:
        # Close even when plotting or logging raises, so a failed call does not
        # leave an open figure behind in pyplot's figure registry.
        plt.close(fig)

def dualplot(dates, y_true, y_pred, title, fname):
    """Plot actual and predicted values on one axis and log the figure to MLflow.

    Args:
        dates: Shared x-axis values.
        y_true: Observed values (legend label ``actual``).
        y_pred: Predicted values (legend label ``pred``).
        title: Figure title.
        fname: Artifact file name passed to ``mlflow.log_figure``.
    """
    fig = plt.figure()
    try:
        plt.plot(dates, y_true, label="actual")
        plt.plot(dates, y_pred, label="pred")
        plt.legend()
        plt.title(title)
        plt.xlabel("date")
        plt.ylabel("value")
        mlflow.log_figure(fig, fname)
    finally:
        # Close even when plotting or logging raises, so a failed call does not
        # leave an open figure behind in pyplot's figure registry.
        plt.close(fig)

def residplot(y_true, y_pred, title, fname):
    """Scatter residuals (``y_true - y_pred``) against predictions and log to MLflow.

    Args:
        y_true: Observed values.
        y_pred: Predicted values, same length as ``y_true``.
        title: Figure title.
        fname: Artifact file name passed to ``mlflow.log_figure``.
    """
    res = np.array(y_true) - np.array(y_pred)
    fig = plt.figure()
    try:
        plt.scatter(y_pred, res, s=8)
        plt.axhline(0, color="gray")  # zero-residual reference line
        plt.title(title)
        plt.xlabel("pred")
        plt.ylabel("residuals")
        mlflow.log_figure(fig, fname)
    finally:
        # Close even when plotting or logging raises, so a failed call does not
        # leave an open figure behind in pyplot's figure registry.
        plt.close(fig)

def corr_heatmap(df: pd.DataFrame, fname="eda_corr_heatmap.png", max_cols=30):
    """Log a correlation heatmap of the first ``max_cols`` numeric columns to MLflow.

    This is a best-effort EDA helper and never raises:
    - if seaborn is not installed the heatmap is skipped quietly;
    - if fewer than two numeric columns exist nothing is plotted;
    - any other failure is reported through ``logging`` instead of being
      silently discarded.

    Args:
        df: Data to correlate; only numeric columns are considered.
        fname: Artifact file name passed to ``mlflow.log_figure``.
        max_cols: Maximum number of numeric columns (taken in column order).
    """
    import logging

    fig = None
    try:
        import seaborn as sns  # optional dependency

        cols = df.select_dtypes(include=[np.number]).columns.tolist()[:max_cols]
        if len(cols) >= 2:
            fig = plt.figure(figsize=(6, 5))
            sns.heatmap(df[cols].corr(), cmap="coolwarm", center=0)
            plt.title("Correlation (numeric, head)")
            mlflow.log_figure(fig, fname)
    except ImportError:
        # seaborn is optional; without it there is simply no heatmap.
        pass
    except Exception:
        # Keep the best-effort contract, but leave a trace of what went wrong.
        logging.getLogger(__name__).warning(
            "Correlation heatmap could not be logged.", exc_info=True
        )
    finally:
        # Release the figure on both the success and the failure path.
        if fig is not None:
            plt.close(fig)

# ---------------- 피처 엔지니어링 ----------------
def add_time_features(df: pd.DataFrame, date_col="date"):
    """Add calendar features derived from a date column (modifies ``df`` in place).

    The parsed dates are written to the ``"date"`` column (even when
    ``date_col`` has a different name), and these columns are added:
    ``dow``, ``weekofyear``, ``dayofyear``, ``dow_sin``, ``dow_cos`` and,
    only when a ``month`` column already exists, ``month_sin``/``month_cos``.

    Unparseable dates become ``NaT`` (``errors="coerce"``). In that case
    ``weekofyear`` is stored as float with ``NaN`` for those rows; when every
    date parses it stays an integer column.

    Args:
        df: Input frame; it is mutated and also returned.
        date_col: Name of the column holding the dates.

    Returns:
        The same ``df`` object. If ``date_col`` is missing, ``df`` is returned
        unchanged.
    """
    if date_col not in df.columns:
        return df

    df["date"] = pd.to_datetime(df[date_col], errors="coerce")
    dates = df["date"].dt

    df["dow"] = dates.dayofweek
    iso_week = dates.isocalendar().week
    # isocalendar() yields a nullable integer column; casting it to int raises
    # as soon as one date is NaT, so fall back to float/NaN in that case.
    if iso_week.notna().all():
        df["weekofyear"] = iso_week.astype(int)
    else:
        df["weekofyear"] = iso_week.astype("float64")
    df["dayofyear"] = dates.dayofyear

    # Cyclical encoding so that day 6 and day 0 end up adjacent.
    dow_angle = 2 * np.pi * df["dow"] / 7
    df["dow_sin"] = np.sin(dow_angle)
    df["dow_cos"] = np.cos(dow_angle)

    if "month" in df.columns:
        month_angle = 2 * np.pi * df["month"] / 12
        df["month_sin"] = np.sin(month_angle)
        df["month_cos"] = np.cos(month_angle)
    return df

def add_lag_roll(df: pd.DataFrame, target: str, lags, rolls):
    """Return a date-sorted copy of ``df`` with lag, rolling and change features.

    For the ``target`` column this adds:
    - ``<target>_lag<k>`` for every ``k`` in ``lags``;
    - ``<target>_rollmean<w>`` / ``<target>_rollstd<w>`` for every ``w`` in
      ``rolls``, computed on the series shifted by one row so the current
      row's value is not part of its own window;
    - ``<target>_diff1`` (first difference) and ``<target>_pct`` (percentage
      change with +/-inf replaced by NaN).

    Remaining gaps in the whole frame are filled backward, then forward.
    """
    ordered = df.sort_values("date").reset_index(drop=True)
    series = ordered[target]

    for lag in lags:
        ordered[f"{target}_lag{lag}"] = series.shift(lag)

    previous = series.shift(1)
    for window in rolls:
        roller = previous.rolling(window, min_periods=1)
        ordered[f"{target}_rollmean{window}"] = roller.mean()
        ordered[f"{target}_rollstd{window}"] = roller.std()

    ordered[f"{target}_diff1"] = series.diff(1)
    pct = series.pct_change()
    ordered[f"{target}_pct"] = pct.replace([np.inf, -np.inf], np.nan)

    filled = ordered.bfill()
    return filled.ffill()

def feature_engineer(in_path: Path, target: str, out_path: Path) -> pd.DataFrame:
    """Read the processed CSV, build features, write them to ``out_path`` and return them.

    Steps: parse ``date`` (unparseable values become NaT), add calendar
    features, add lag/rolling features using the module-level ``LOOKBACKS``
    and ``ROLLS``, linearly interpolate numeric columns in both directions,
    then back/forward fill whatever is still missing.

    Raises:
        ValueError: If the CSV has no ``date`` column.
    """
    frame = pd.read_csv(in_path)
    if "date" not in frame.columns:
        raise ValueError("Need 'date' column.")

    frame["date"] = pd.to_datetime(frame["date"], errors="coerce")
    frame = add_lag_roll(add_time_features(frame), target, LOOKBACKS, ROLLS)

    numeric = frame.select_dtypes(include=[np.number]).columns
    frame[numeric] = frame[numeric].interpolate("linear", limit_direction="both")
    frame = frame.bfill().ffill()

    # Create the destination directory on demand before writing.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    frame.to_csv(out_path, index=False)
    return frame

# ---------------- 데이터셋/스플릿 ----------------
def time_split(df: pd.DataFrame, test_days: int):
    """Split ``df`` chronologically into ``(train, test)`` copies.

    The test part covers the last ``test_days`` calendar days, inclusive of
    the latest date; everything strictly earlier is the training part. Rows
    whose date is NaT satisfy neither comparison and land in neither part.
    """
    ordered = df.sort_values("date").reset_index(drop=True)
    dates = ordered["date"]
    cutoff = dates.max() - pd.Timedelta(days=test_days - 1)

    before_cutoff = dates < cutoff
    from_cutoff = dates >= cutoff
    return ordered[before_cutoff].copy(), ordered[from_cutoff].copy()

def build_xy(df: pd.DataFrame, target: str):
    """Build a one-step-ahead supervised dataset from a feature frame.

    The label is the next row's ``target`` value after sorting by ``date``;
    the last row, which has no next value, is dropped. Features are all
    numeric columns except ``target`` and the label itself.

    Returns:
        ``(X, y, feats, idx)``: float32 feature matrix, float32 label vector,
        the list of feature column names, and the original index labels of
        the rows that were kept.
    """
    ordered = df.sort_values("date")
    ordered["y_next"] = ordered[target].shift(-1)  # 1-step-ahead label

    excluded = (target, "y_next")
    numeric_cols = ordered.select_dtypes(include=[np.number]).columns
    feats = [col for col in numeric_cols if col not in excluded]

    usable = ordered.dropna(subset=["y_next"])
    features = usable[feats].values.astype(np.float32)
    labels = usable["y_next"].values.astype(np.float32)
    # Index labels (not positions), so callers must look rows up with .loc.
    return features, labels, feats, usable.index

# ---------------- 평가 ----------------
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
def reg_metrics(y_true, y_pred):
    """Return MAE, RMSE, MAPE (in percent) and R2 as a dict of Python floats.

    MAPE divides by ``max(1e-9, |y_true|)`` so zero targets do not cause a
    division by zero. ``y_true`` and ``y_pred`` must support element-wise
    subtraction (e.g. NumPy arrays).
    """
    abs_pct_err = np.abs((y_true - y_pred) / np.maximum(1e-9, np.abs(y_true)))
    return {
        "MAE": float(mean_absolute_error(y_true, y_pred)),
        "RMSE": float(np.sqrt(mean_squared_error(y_true, y_pred))),
        # Convert the mean to a Python float before scaling to percent.
        "MAPE": float(float(np.mean(abs_pct_err)) * 100.0),
        "R2": float(r2_score(y_true, y_pred)),
    }

# ---------------- 재귀 예측(미래 H일) ----------------
def recursive_forecast(df_feat: pd.DataFrame, feats: list, model, horizon: int, target: str) -> pd.DataFrame:
    """Forecast ``horizon`` future days from the most recent feature row.

    NOTE: despite the name, the input is NOT updated between steps. The
    features of the latest row (by ``date``) are used for every future day,
    so the forecast is a flat line. Because the input never changes, the
    model is queried once and the value is repeated rather than calling
    ``predict`` ``horizon`` times. This assumes ``model.predict`` is
    deterministic.

    Args:
        df_feat: Feature frame with a ``date`` column and all ``feats`` columns.
        feats: Feature column names, in the order the model was trained on.
        model: Fitted estimator exposing ``predict(X)`` for a 2-D input.
        horizon: Number of daily steps to forecast after the last date.
        target: Unused; kept for interface compatibility.

    Returns:
        DataFrame with columns ``date`` (daily, starting the day after the
        last date in ``df_feat``) and ``yhat``.

    Raises:
        ValueError: If ``df_feat`` has no rows.
    """
    df = df_feat.sort_values("date")
    if df.empty:
        raise ValueError("df_feat is empty; cannot build a forecast.")

    first_day = df["date"].max() + pd.Timedelta(days=1)
    out_dates = pd.date_range(first_day, periods=horizon, freq="D")

    preds = []
    if horizon > 0:
        x_last = df[feats].iloc[-1].values.reshape(1, -1)  # fixed input row
        yhat = float(model.predict(x_last)[0])
        preds = [yhat] * horizon
    return pd.DataFrame({"date": out_dates, "yhat": preds})

# ---------------- Permutation Importance (경량화) ----------------
def permutation_importance_light(model, X, y, feats, n_repeats=3, max_rows=500):
    """Compute a cheap permutation importance table, or ``None`` on any failure.

    When ``X`` has more than ``max_rows`` rows, a fixed-seed random subset of
    ``max_rows`` rows (without replacement) is used to limit the cost.

    Returns:
        DataFrame with columns ``feature`` and ``importance`` (mean over
        repeats), sorted by importance descending; ``None`` if scikit-learn
        cannot be imported or the computation raises.
    """
    try:
        from sklearn.inspection import permutation_importance as sk_perm

        X_eval, y_eval = X, y
        if len(X) > max_rows:
            # Sub-sample with a fixed seed so repeated runs pick the same rows.
            picked = np.random.RandomState(42).choice(len(X), size=max_rows, replace=False)
            X_eval, y_eval = X[picked], y[picked]

        result = sk_perm(model, X_eval, y_eval, n_repeats=n_repeats, random_state=42, n_jobs=-1)
        table = pd.DataFrame({"feature": feats, "importance": result.importances_mean})
        return table.sort_values("importance", ascending=False)
    except Exception:
        return None

# ---------------- 안전한 모델 로깅 (MLflow 3.x 호환) ----------------
def safe_log_sklearn_model(model, name: str, signature=None, input_example=None, pip_reqs_min=True):
    """Log a scikit-learn model to MLflow, preferring the ``name=`` keyword.

    Does nothing when the module-level ``mlflow_sklearn`` is ``None``.
    ``signature`` and ``input_example`` are forwarded only when provided.
    With ``pip_reqs_min`` a minimal requirements list is passed explicitly;
    per the original author's note this avoids a slow environment snapshot.
    If ``log_model`` rejects the call with ``TypeError``, the call is retried
    with the older ``artifact_path=`` keyword.
    """
    if mlflow_sklearn is None:
        return

    optional = {"signature": signature, "input_example": input_example}
    kwargs = {key: value for key, value in optional.items() if value is not None}
    if pip_reqs_min:
        # Minimal requirement list instead of an inferred environment.
        kwargs["pip_requirements"] = ["mlflow", "scikit-learn"]

    try:
        mlflow_sklearn.log_model(sk_model=model, name=name, **kwargs)
    except TypeError:
        # Compatibility path for versions without ``name=`` (warning accepted).
        mlflow_sklearn.log_model(model, artifact_path=name, **kwargs)

def safe_log_xgb_model(model, name: str, signature=None, input_example=None, pip_reqs_min=True):
    """Log an XGBoost model to MLflow, preferring the ``name=`` keyword.

    Returns without doing anything when ``mlflow.xgboost`` cannot be
    imported. ``signature`` and ``input_example`` are forwarded only when
    provided; with ``pip_reqs_min`` a minimal requirements list is passed.
    If ``log_model`` rejects the call with ``TypeError``, the call is retried
    with the older ``artifact_path=`` keyword.
    """
    try:
        import mlflow.xgboost as mlflow_xgb
    except Exception:
        return

    optional = {"signature": signature, "input_example": input_example}
    kwargs = {key: value for key, value in optional.items() if value is not None}
    if pip_reqs_min:
        kwargs["pip_requirements"] = ["mlflow", "xgboost"]

    try:
        mlflow_xgb.log_model(model, name=name, **kwargs)
    except TypeError:
        # Compatibility path for versions without ``name=``.
        mlflow_xgb.log_model(model, artifact_path=name, **kwargs)

# ==================== main ====================
def _fit_eval_log(*, run_name, tag, csv_stem, label, params, make_model,
                  log_model, with_importance, ctx):
    """Train, evaluate and log one model inside a nested MLflow run.

    Args:
        run_name: Nested run name; also used as the logged model name.
        tag: Short prefix for plot/importance artifact names (e.g. ``"rf"``).
        csv_stem: Prefix of the forecast CSV written to ``OUTDIR``.
        label: Human-readable model label used in plot titles.
        params: Parameters logged to MLflow.
        make_model: Zero-argument callable returning an unfitted estimator.
        log_model: Model-logging helper called as
            ``log_model(model, name=..., signature=..., input_example=..., pip_reqs_min=True)``.
        with_importance: Whether to compute and log permutation importance.
        ctx: Dict with the shared data: ``train_ds``, ``test_ds``, ``X_train``,
            ``y_train``, ``X_test``, ``y_test``, ``dt_test``, ``feats``, ``df_feat``.
    """
    import logging

    logger = logging.getLogger(__name__)
    X_train, y_train = ctx["X_train"], ctx["y_train"]
    X_test, y_test = ctx["X_test"], ctx["y_test"]
    feats = ctx["feats"]

    with mlflow.start_run(run_name=run_name, nested=True):
        mlflow.log_params(params)
        mlflow.log_input(ctx["train_ds"], context="training")
        mlflow.log_input(ctx["test_ds"], context="testing")

        model = make_model()
        model.fit(X_train, y_train)
        pred_test = model.predict(X_test)

        scores = reg_metrics(y_test, pred_test)
        mlflow.log_metrics({f"test_{k}": v for k, v in scores.items()})

        dualplot(ctx["dt_test"], y_test, pred_test,
                 f"{label}: actual vs pred (test)", f"{tag}_test_pred.png")
        residplot(y_test, pred_test, f"{label}: residuals (test)", f"{tag}_test_resid.png")

        if with_importance:
            imp = permutation_importance_light(model, X_test, y_test, feats, n_repeats=3, max_rows=500)
            if imp is not None:
                mlflow.log_text(imp.head(30).to_markdown(index=False), f"{tag}_perm_importance.md")

        df_fore = recursive_forecast(ctx["df_feat"], feats, model, HORIZON, TARGET)
        fore_path = OUTDIR / f"{csv_stem}_forecast.csv"
        df_fore.to_csv(fore_path, index=False)
        mlflow.log_artifact(str(fore_path))
        lineplot(df_fore["date"], df_fore["yhat"], f"{label} forecast (future)", f"{tag}_forecast.png")

        # Model logging is best effort: a failure here must not discard the
        # metrics and artifacts already logged, but it should be visible.
        try:
            sig = mlflow.models.infer_signature(X_train[:5], model.predict(X_train[:5]))
            log_model(model, name=run_name, signature=sig, input_example=X_train[:5], pip_reqs_min=True)
        except Exception:
            logger.warning("Model logging failed for %s; continuing.", run_name, exc_info=True)


def main():
    """Run the full pipeline under one parent MLflow run.

    Nested runs, in order: ``data_and_eda``, ``feature_engineering``,
    ``model_linear``, ``model_rf`` and, when xgboost can be imported,
    ``model_xgb``. Each model run logs test metrics, diagnostic plots, a
    future forecast CSV/plot and the fitted model.

    Raises:
        FileNotFoundError: If the processed CSV (``PROC``) does not exist.
        ValueError: If the CSV lacks the ``date`` column or the target column.
    """
    import logging

    logger = logging.getLogger(__name__)

    # Explicit check instead of ``assert``, which is stripped under ``python -O``.
    if not PROC.exists():
        raise FileNotFoundError(f"Processed CSV not found: {PROC}")
    base_df = pd.read_csv(PROC)
    if "date" not in base_df.columns or TARGET not in base_df.columns:
        raise ValueError("Input must have 'date' and target.")

    with mlflow.start_run(run_name="experiment_run"):

        # ----- 0) EDA -----
        with mlflow.start_run(run_name="data_and_eda", nested=True):
            ds = mlflow.data.from_pandas(base_df, source=str(PROC), name="raw_processed")
            mlflow.log_input(ds, context="training")
            mlflow.log_metrics({
                "rows": len(base_df),
                "cols": base_df.shape[1],
                "missing_total": int(base_df.isnull().sum().sum())
            })
            # EDA plots are best effort; report failures instead of hiding them.
            try:
                dfp = base_df.copy()
                dfp["date"] = pd.to_datetime(dfp["date"], errors="coerce")
                dfp = dfp.dropna(subset=["date"]).sort_values("date")
                lineplot(dfp["date"], dfp[TARGET], f"{TARGET} over time", "eda_target.png")
                corr_heatmap(dfp, "eda_corr_heatmap.png", max_cols=30)
            except Exception:
                logger.warning("EDA plots could not be logged; continuing.", exc_info=True)
            log_df_preview_md(base_df, "raw_preview", 20)

        # ----- 1) Feature engineering -----
        with mlflow.start_run(run_name="feature_engineering", nested=True):
            df_feat = feature_engineer(PROC, TARGET, FEAT)
            ds = mlflow.data.from_pandas(df_feat, source=str(FEAT), name="feature_engineered")
            mlflow.log_input(ds, context="training")
            mlflow.log_metrics({"fe_rows": len(df_feat), "fe_cols": df_feat.shape[1]})
            mlflow.log_artifact(str(FEAT))
            log_df_preview_md(df_feat, "fe_preview", 20)

        # ----- 2) Model training / validation / future forecast -----
        # Features are re-read from the CSV written by the previous step.
        df_feat = pd.read_csv(FEAT)
        df_feat["date"] = pd.to_datetime(df_feat["date"], errors="coerce")
        train_df, test_df = time_split(df_feat, TEST_DAYS)

        # Dataset lineage objects shared by every model run.
        train_ds = mlflow.data.from_pandas(train_df, name="train_dataset")
        test_ds = mlflow.data.from_pandas(test_df, name="test_dataset")

        # Shared X / y for all models.
        X_train, y_train, feats, _ = build_xy(train_df, TARGET)
        X_test, y_test, _, idx_test = build_xy(test_df, TARGET)

        ctx = {
            "train_ds": train_ds, "test_ds": test_ds,
            "X_train": X_train, "y_train": y_train,
            "X_test": X_test, "y_test": y_test,
            # idx_test holds index labels, hence .loc rather than .iloc.
            "dt_test": test_df.loc[idx_test, "date"],
            "feats": feats, "df_feat": df_feat,
        }

        # ---- Model 1: Linear Regression
        from sklearn.linear_model import LinearRegression
        _fit_eval_log(
            run_name="model_linear", tag="lin", csv_stem="linear", label="Linear",
            params={"algo":"LinearRegression","horizon":HORIZON,"test_days":TEST_DAYS,
                    "lookbacks":",".join(map(str,LOOKBACKS)),"rolls":",".join(map(str,ROLLS))},
            make_model=LinearRegression,
            log_model=safe_log_sklearn_model, with_importance=True, ctx=ctx,
        )

        # ---- Model 2: RandomForest
        from sklearn.ensemble import RandomForestRegressor
        rf_params = {"algo":"RandomForestRegressor","n_estimators":400,"max_depth":12,"min_samples_leaf":2,
                     "n_jobs":-1,"random_state":42,"horizon":HORIZON,"test_days":TEST_DAYS}
        _fit_eval_log(
            run_name="model_rf", tag="rf", csv_stem="rf", label="RF",
            params=rf_params,
            make_model=lambda: RandomForestRegressor(
                n_estimators=rf_params["n_estimators"],
                max_depth=rf_params["max_depth"],
                min_samples_leaf=rf_params["min_samples_leaf"],
                n_jobs=-1, random_state=42
            ),
            log_model=safe_log_sklearn_model, with_importance=True, ctx=ctx,
        )

        # ---- (Optional) Model 3: XGBoost, only when it can be imported
        try:
            from xgboost import XGBRegressor
        except Exception:
            # Optional dependency missing or unloadable: skip this model.
            logger.info("xgboost is not available; skipping model_xgb.")
        else:
            xgb_params = {"algo":"XGBRegressor","n_estimators":600,"max_depth":6,"learning_rate":0.05,
                          "subsample":0.9,"colsample_bytree":0.9,"random_state":42,
                          "reg_lambda":1.0,"horizon":HORIZON,"test_days":TEST_DAYS}
            # Keep the optional model best effort, but make failures visible
            # rather than indistinguishable from "xgboost not installed".
            try:
                _fit_eval_log(
                    run_name="model_xgb", tag="xgb", csv_stem="xgb", label="XGB",
                    params=xgb_params,
                    make_model=lambda: XGBRegressor(
                        n_estimators=xgb_params["n_estimators"], max_depth=xgb_params["max_depth"],
                        learning_rate=xgb_params["learning_rate"], subsample=xgb_params["subsample"],
                        colsample_bytree=xgb_params["colsample_bytree"], random_state=42, n_jobs=-1,
                        reg_lambda=xgb_params["reg_lambda"], tree_method="hist"
                    ),
                    log_model=safe_log_xgb_model, with_importance=False, ctx=ctx,
                )
            except Exception:
                logger.warning("model_xgb run failed; continuing.", exc_info=True)

# Run the pipeline only when this file is executed as a script (or as a
# notebook cell, where __name__ is also "__main__"), not when imported.
if __name__ == "__main__":
    main()
