In [13]:
%pip -q install pandas numpy scikit-learn matplotlib python-dateutil xgboost lightgbm tensorflow==2.*

zsh:1: no matches found: tensorflow==2.*
Note: you may need to restart the kernel to use updated packages.


In [14]:
import os, sys, warnings
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

BASE_DIR = "/Users/llouis/Documents/model_test"
DATA_DIR = os.path.join(BASE_DIR, "data")

sys.path.insert(0, BASE_DIR)
from pipeline import run_pipeline

print("BASE_DIR:", BASE_DIR)
print("DATA_DIR:", DATA_DIR)

BASE_DIR: /Users/llouis/Documents/model_test
DATA_DIR: /Users/llouis/Documents/model_test/data


In [15]:
def read_csv_smart(path):
    for enc in ["utf-8", "cp949", "euc-kr", "latin1"]:
        try:
            return pd.read_csv(path, encoding=enc)
        except Exception:
            pass
    raise RuntimeError(f"CSV 인코딩 해석 실패: {path}")


ds1 = read_csv_smart(os.path.join(DATA_DIR, "big_data_set1_f.csv"))
ds2 = read_csv_smart(os.path.join(DATA_DIR, "ds2_monthly_usage.csv"))
ds3 = read_csv_smart(os.path.join(DATA_DIR, "ds3_monthly_customers.csv"))
print(ds1.shape, ds2.shape, ds3.shape)

(4185, 9) (86590, 15) (86590, 17)


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

KEY_MCT, KEY_YM = "ENCODED_MCT", "TA_YM"


def to_month(s):
    dt = pd.to_datetime(s.astype(str), errors="coerce")
    return pd.to_datetime(dt.dt.to_period("M").astype(str))


def build_labels_robust(ds1, ds2, ds3, k_months=3, topq=0.10):
    df = ds2.merge(ds3, on=[KEY_MCT, KEY_YM], how="outer")
    df[KEY_YM] = to_month(df[KEY_YM])
    df[KEY_MCT] = df[KEY_MCT].astype(str)
    df = df.sort_values([KEY_MCT, KEY_YM]).reset_index(drop=True)
    df["y"] = 0

    # 1) Real label (폐업일)
    if "MCT_ME_D" in ds1.columns:
        tmp = ds1[[KEY_MCT, "MCT_ME_D"]].copy()
        tmp[KEY_MCT] = tmp[KEY_MCT].astype(str)
        tmp["MCT_ME_D"] = pd.to_datetime(tmp["MCT_ME_D"], errors="coerce")
        df = df.merge(tmp, on=KEY_MCT, how="left")
        t0 = df[KEY_YM]
        tK = t0 + pd.offsets.MonthEnd(0) + pd.DateOffset(months=k_months)
        cond = (df["MCT_ME_D"].notna()) & (df["MCT_ME_D"] > t0) & (df["MCT_ME_D"] <= tK)
        df.loc[cond, "y"] = 1

    # 2) Proxy label (급락/해지 과열)
    if df["y"].nunique() < 2:
        def bin2num(s):
            s = s.astype(str)
            m = s.str.extract(r"(\d+)", expand=False)
            return pd.to_numeric(m, errors="coerce")

        df["RC_SAA_num"] = bin2num(df.get("RC_M1_SAA", ""))
        df["RC_CUS_num"] = bin2num(df.get("RC_M1_UE_CUS_CN", ""))
        df["dSAA"] = df.groupby(KEY_MCT)["RC_SAA_num"].diff()
        df["dCUS"] = df.groupby(KEY_MCT)["RC_CUS_num"].diff()
        cxl = pd.to_numeric(df.get("APV_CE_RAT", 0), errors="coerce")
        indme = pd.to_numeric(df.get("M12_SME_RY_ME_MCT_RAT", 0), errors="coerce")
        bznme = pd.to_numeric(df.get("M12_SME_BZN_ME_MCT_RAT", 0), errors="coerce")
        sig = (df["dSAA"] <= -10).astype(int) + (df["dCUS"] <= -10).astype(int) + (cxl >= 90).astype(int) + (
                indme >= 80).astype(int) + (bznme >= 80).astype(int)
        df["y_proxy"] = (sig >= 2).astype(int)
        if df["y_proxy"].nunique() >= 2 and df["y_proxy"].sum() > 0:
            df["y"] = df["y_proxy"]

    # 3) 여전히 한 클래스면 risk 기반 topq
    if df["y"].nunique() < 2:
        out = run_pipeline(ds1, ds2, ds3, preds=None)
        outj = out.merge(df[[KEY_MCT, KEY_YM]], on=[KEY_MCT, KEY_YM], how="right")
        pf = pd.to_numeric(outj["p_final"], errors="coerce").fillna(0)
        thr = pf.quantile(1 - topq)
        df["y"] = (pf >= thr).astype(int)
    return df


robust_df = build_labels_robust(ds1, ds2, ds3, k_months=3, topq=0.10)
print("Label counts:", robust_df["y"].value_counts(dropna=False).to_dict())

Label counts: {1: 86590}


In [17]:
num_cols = [
    "M1_SME_RY_SAA_RAT", "M1_SME_RY_CNT_RAT",
    "M12_SME_RY_SAA_PCE_RT", "M12_SME_BZN_SAA_PCE_RT",
    "M12_SME_RY_ME_MCT_RAT", "M12_SME_BZN_ME_MCT_RAT",
    "DLV_SAA_RAT", "MCT_UE_CLN_REU_RAT", "MCT_UE_CLN_NEW_RAT"
]
cat_cols = [c for c in ["HPSN_MCT_ZCD_NM", "HPSN_MCT_BZN_CD_NM"] if c in robust_df.columns]

X = robust_df[num_cols + cat_cols].copy()
y = robust_df["y"].astype(int)

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

num_transform = Pipeline([("imp", SimpleImputer(strategy="median"))])
ct = ColumnTransformer([
    ("num", num_transform, num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
], remainder="drop")

In [18]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import ClassifierMixin
import copy


def stratified_split_force(X, y, test_size=0.25, random_state=42):
    y = y.reset_index(drop=True)
    idx_pos = np.where(y == 1)[0]
    idx_neg = np.where(y == 0)[0]

    if len(idx_pos) >= 2 and len(idx_neg) >= 2:
        rs = np.random.RandomState(random_state)
        t_pos = max(1, int(round(test_size * len(idx_pos))))
        t_neg = max(1, int(round(test_size * len(idx_neg))))
        test_idx = np.r_[rs.choice(idx_pos, t_pos, replace=False),
        rs.choice(idx_neg, t_neg, replace=False)]
        train_mask = np.ones(len(y), dtype=bool)
        train_mask[test_idx] = False
        return X.iloc[train_mask], X.iloc[test_idx], y.iloc[train_mask], y.iloc[test_idx]
    else:
        return X, X, y, y


def safe_proba(pipe, X, pos_label=1):
    clf = None
    try:
        clf = pipe.named_steps.get("clf", None)
    except Exception:
        pass

    if hasattr(pipe, "predict_proba"):
        proba = pipe.predict_proba(X)
        classes_ = None
        if clf is not None and hasattr(clf, "classes_"):
            classes_ = clf.classes_
        elif hasattr(pipe, "classes_"):
            classes_ = pipe.classes_

        if proba.shape[1] == 1:
            cls = classes_[0] if classes_ is not None and len(classes_) == 1 else None
            if cls == pos_label:
                return np.ones(proba.shape[0])
            else:
                return np.zeros(proba.shape[0])
        else:
            if classes_ is not None:
                idx = int(np.where(classes_ == pos_label)[0][0])
            else:
                idx = 1
            return proba[:, idx]

    if hasattr(pipe, "decision_function"):
        s = pipe.decision_function(X)
        return 1.0 / (1.0 + np.exp(-s))

    pred = pipe.predict(X)
    return pred.astype(float)

In [19]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, IsolationForest
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
import xgboost as xgb
import lightgbm as lgb
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import copy
import numpy as np


def metrics_safe(y_true, p):
    if hasattr(y_true, "nunique"):
        nuniq = y_true.nunique()
    else:
        nuniq = len(np.unique(y_true))
    if nuniq < 2:
        return {"note": "single-class test; AUC/PR 미정", "roc_auc": None, "pr_auc": None}
    return {
        "roc_auc": float(roc_auc_score(y_true, p)),
        "pr_auc": float(average_precision_score(y_true, p)),
    }


Xtr, Xte, ytr, yte = stratified_split_force(X, y, test_size=0.25, random_state=42)

# ====== A. 전체가 한 클래스인 경우 → 비지도(IF) 경로로 통째 처리 ======
if y.nunique() < 2:
    print("Global single-class detected → IsolationForest-only fallback.")

    ct_dl = copy.deepcopy(ct)
    pipe_if = Pipeline([("prep", ct_dl), ("clf", IsolationForest(
        n_estimators=400, contamination=0.10, random_state=42, n_jobs=-1
    ))])
    pipe_if.fit(X)
    s = pipe_if["clf"].score_samples(pipe_if["prep"].transform(X))
    s = (s - s.min()) / (s.max() - s.min() + 1e-9)
    pe = 1 - s

    prf_f = pgb_f = pxgb_f = plgb_f = pdl_f = pe

    preds_full = robust_df[["ENCODED_MCT", "TA_YM"]].copy()
    preds_full["ENCODED_MCT"] = preds_full["ENCODED_MCT"].astype(str)
    preds_full["TA_YM"] = pd.to_datetime(preds_full["TA_YM"], errors="coerce").dt.to_period("M").dt.to_timestamp()
    preds_full["pred_xgb"] = pxgb_f
    preds_full["pred_lgbm"] = plgb_f
    preds_full["pred_rf"] = prf_f
    preds_full["pred_gb"] = pgb_f
    preds_full["pred_dl"] = pdl_f
    preds_full = preds_full.dropna(subset=["ENCODED_MCT", "TA_YM"])
    preds_path = os.path.join(DATA_DIR, "preds.csv")
    preds_full.to_csv(preds_path, index=False, encoding="utf-8")
    print("Saved preds:", preds_path)

else:
    # ====== B. 지도 경로(두 클래스 존재) — 개별 모델 안전 학습 ======
    def fit_or_dummy(pipe_ctor, Xtr, ytr, Xte, label_name):
        try:
            if ytr.nunique() < 2:
                raise ValueError("train has single class")
            pipe = pipe_ctor()
            pipe.fit(Xtr, ytr)
            p = safe_proba(pipe, Xte)
            return pipe, p, False
        except Exception as e:
            print(f"[{label_name}] fallback to DummyClassifier due to: {e}")
            dummy = Pipeline([
                ("prep", ct),
                ("clf", DummyClassifier(strategy="prior"))
            ])
            dummy.fit(Xtr, ytr)
            p = safe_proba(dummy, Xte)
            return dummy, p, True


    # 1) RF
    rf_ctor = lambda: Pipeline([("prep", ct),
                                ("clf", RandomForestClassifier(
                                    n_estimators=400, random_state=42, n_jobs=-1, class_weight="balanced"))])
    rf, prf, rf_dummy = fit_or_dummy(rf_ctor, Xtr, ytr, Xte, "RF")
    print("RF :", metrics_safe(yte, prf))

    # 2) GB
    gb_ctor = lambda: Pipeline([("prep", ct),
                                ("clf", GradientBoostingClassifier(random_state=42))])
    gb, pgb, gb_dummy = fit_or_dummy(gb_ctor, Xtr, ytr, Xte, "GB")
    print("GB :", metrics_safe(yte, pgb))

    # 3) XGB
    xgb_ctor = lambda: Pipeline([("prep", ct),
                                 ("clf", xgb.XGBClassifier(
                                     n_estimators=400, max_depth=5, learning_rate=0.05,
                                     subsample=0.8, colsample_bytree=0.8,
                                     eval_metric="logloss", random_state=42, tree_method="hist"))])
    xgb_clf, pxgb, xgb_dummy = fit_or_dummy(xgb_ctor, Xtr, ytr, Xte, "XGB")
    print("XGB:", metrics_safe(yte, pxgb))

    # 4) LGB
    lgb_ctor = lambda: Pipeline([("prep", ct),
                                 ("clf", lgb.LGBMClassifier(
                                     n_estimators=500, max_depth=-1, num_leaves=31, learning_rate=0.05,
                                     subsample=0.8, colsample_bytree=0.8,
                                     objective="binary", random_state=42))])
    lgb_clf, plgb, lgb_dummy = fit_or_dummy(lgb_ctor, Xtr, ytr, Xte, "LGB")
    print("LGB:", metrics_safe(yte, plgb))

    # 5) DL (별도 전처리 복사본 사용)
    ct_dl = copy.deepcopy(ct)
    Xd_tr = ct_dl.fit_transform(Xtr)
    Xd_te = ct_dl.transform(Xte)

    dl_model = None
    try:
        if ytr.nunique() < 2:
            raise ValueError("train has single class")
        inp = keras.Input(shape=(Xd_tr.shape[1],))
        h = layers.Dense(128, activation="relu")(inp)
        h = layers.Dropout(0.2)(h)
        h = layers.Dense(64, activation="relu")(h)
        outp = layers.Dense(1, activation="sigmoid")(h)
        dl_model = keras.Model(inp, outp)
        dl_model.compile(optimizer=keras.optimizers.Adam(1e-3), loss="binary_crossentropy")
        dl_model.fit(Xd_tr, ytr, epochs=10, batch_size=256, verbose=0)
        pdl = dl_model.predict(Xd_te, verbose=0).ravel()
        print("DL :", metrics_safe(yte, pdl))
        dl_dummy = False
    except Exception as e:
        print(f"[DL] fallback to Dummy due to: {e}")
        dummy_dl = Pipeline([("prep", ct), ("clf", DummyClassifier(strategy="prior"))])
        dummy_dl.fit(Xtr, ytr)
        pdl = safe_proba(dummy_dl, Xte)
        dl_dummy = True

    # ===== 앙상블 + 보정 (이용 가능한 모델만 사용해 가중 재정규화) =====
    w = {"xgb": 0.25, "lgb": 0.25, "rf": 0.25, "gb": 0.15, "dl": 0.10}
    parts, used = [], []
    if pxgb is not None: parts.append((w["xgb"], pxgb)); used.append("xgb")
    if plgb is not None: parts.append((w["lgb"], plgb)); used.append("lgb")
    if prf is not None: parts.append((w["rf"], prf));  used.append("rf")
    if pgb is not None: parts.append((w["gb"], pgb));  used.append("gb")
    if pdl is not None: parts.append((w["dl"], pdl));  used.append("dl")
    sw = sum(w for w, _ in parts)
    stack = np.sum([(w_ / sw) * p for (w_, p) in parts], axis=0)

    from sklearn.linear_model import LogisticRegression

    if yte.nunique() >= 2:
        pl = LogisticRegression(max_iter=200)
        pl.fit(stack.reshape(-1, 1), yte)
        stack_cal = pl.predict_proba(stack.reshape(-1, 1))[:, 1]
        print("Ensemble raw:", metrics_safe(yte, stack))
        print("Ensemble cal:", metrics_safe(yte, stack_cal))
    else:
        stack_cal = stack


    # ===== 전체 데이터 예측 → preds.csv =====
    def predict_full_models():
        prf_f = safe_proba(rf, X) if rf is not None else stack
        pgb_f = safe_proba(gb, X) if gb is not None else stack
        pxgb_f = safe_proba(xgb_clf, X) if xgb_clf is not None else stack
        plgb_f = safe_proba(lgb_clf, X) if lgb_clf is not None else stack
        Xd_full = ct_dl.transform(X)
        if dl_model is not None and not dl_dummy:
            pdl_f = dl_model.predict(Xd_full, verbose=0).ravel()
        else:
            pdl_f = stack
        if yte.nunique() >= 2:
            stack_f = (0.25 * pxgb_f + 0.25 * plgb_f + 0.25 * prf_f + 0.15 * pgb_f + 0.10 * pdl_f)
            pcal_f = pl.predict_proba(stack_f.reshape(-1, 1))[:, 1]
        else:
            pcal_f = (0.25 * pxgb_f + 0.25 * plgb_f + 0.25 * prf_f + 0.15 * pgb_f + 0.10 * pdl_f)
        return prf_f, pgb_f, pxgb_f, plgb_f, pdl_f, pcal_f


    prf_f, pgb_f, pxgb_f, plgb_f, pdl_f, pcal_f = predict_full_models()

    preds_full = robust_df[["ENCODED_MCT", "TA_YM"]].copy()
    preds_full["ENCODED_MCT"] = preds_full["ENCODED_MCT"].astype(str)
    preds_full["TA_YM"] = pd.to_datetime(preds_full["TA_YM"], errors="coerce").dt.to_period("M").dt.to_timestamp()
    preds_full["pred_xgb"] = pxgb_f
    preds_full["pred_lgbm"] = plgb_f
    preds_full["pred_rf"] = prf_f
    preds_full["pred_gb"] = pgb_f
    preds_full["pred_dl"] = pdl_f
    preds_full = preds_full.dropna(subset=["ENCODED_MCT", "TA_YM"])
    preds_path = os.path.join(DATA_DIR, "preds.csv")
    preds_full.to_csv(preds_path, index=False, encoding="utf-8")
    print("Saved preds:", preds_path)

Global single-class detected → IsolationForest-only fallback.
Saved preds: /Users/llouis/Documents/model_test/data/preds.csv


In [20]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.utils.validation import check_is_fitted


def _fallback_split(X, y, test_size=0.25, random_state=42):
    try:
        from sklearn.model_selection import train_test_split
        return train_test_split(X, y, stratify=y if getattr(y, "nunique", lambda: 2)() > 1 else None,
                                test_size=test_size, random_state=random_state)
    except Exception as e:
        raise RuntimeError("X/y로 테스트 분할을 만들 수 없습니다. 상위 셀에서 X, y를 먼저 정의하세요.") from e


def _is_fitted_estimator(est) -> bool:
    try:
        check_is_fitted(est)
        return True
    except Exception:
        return False


def _is_fitted_pipeline(pipe) -> bool:
    if pipe is None:
        return False
    est = getattr(pipe, "named_steps", {}).get("clf", pipe)
    return _is_fitted_estimator(est)


def _safe_proba(pipe, X, pos_label=1):
    n = len(X)
    if pipe is None or not _is_fitted_pipeline(pipe):
        return np.zeros(n)

    clf = getattr(pipe, "named_steps", {}).get("clf", None)

    if hasattr(pipe, "predict_proba"):
        try:
            proba = pipe.predict_proba(X)
            if proba.ndim == 1:
                return proba.astype(float)
            if proba.shape[1] > 1:
                classes_ = getattr(clf, "classes_", getattr(pipe, "classes_", None))
                if classes_ is not None and pos_label in list(classes_):
                    idx = int(np.where(classes_ == pos_label)[0][0])
                else:
                    idx = 1
                return proba[:, idx]
            else:
                classes_ = getattr(clf, "classes_", getattr(pipe, "classes_", [0]))
                return np.ones(n) if (len(classes_) == 1 and classes_[0] == pos_label) else np.zeros(n)
        except Exception:
            pass

    if hasattr(pipe, "decision_function"):
        try:
            s = pipe.decision_function(X)
            return 1.0 / (1.0 + np.exp(-s))
        except Exception:
            pass

    try:
        pred = pipe.predict(X)
        return pred.astype(float)
    except Exception:
        return np.zeros(n)


if "Xte" not in globals() or "yte" not in globals():
    if "X" not in globals() or "y" not in globals():
        raise RuntimeError("앙상블/보정을 위해서는 상위 셀에서 X, y가 먼저 정의되어 있어야 합니다.")
    if "stratified_split_force" in globals():
        Xtr, Xte, ytr, yte = stratified_split_force(X, y, test_size=0.25, random_state=42)
    else:
        Xtr, Xte, ytr, yte = _fallback_split(X, y, test_size=0.25, random_state=42)

prf = _safe_proba(globals().get("rf", None), Xte)
pgb = _safe_proba(globals().get("gb", None), Xte)
pxgb = _safe_proba(globals().get("xgb_clf", None), Xte)
plgb = _safe_proba(globals().get("lgb_clf", None), Xte)

if "dl_model" in globals() and "ct_dl" in globals() and dl_model is not None:
    try:
        Xd_te = ct_dl.transform(Xte)
        pdl = dl_model.predict(Xd_te, verbose=0).ravel()
    except Exception:
        pdl = np.zeros(len(Xte))
else:
    pdl = np.zeros(len(Xte))


def metrics_safe(y_true, p):
    y_true = pd.Series(y_true)
    if y_true.nunique() < 2:
        return {"note": "single-class test; AUC/PR 미정", "roc_auc": None, "pr_auc": None}
    return {
        "roc_auc": float(roc_auc_score(y_true, p)),
        "pr_auc": float(average_precision_score(y_true, p)),
    }


def platt_calibrate(scores, y, max_iter=200):
    scores = np.asarray(scores, dtype="float64")
    y = np.asarray(y)

    mask = np.isfinite(scores)
    if not mask.all():
        med = np.nanmedian(scores[mask]) if mask.any() else 0.5
        scores[~mask] = med

    if np.allclose(scores, scores[0]):
        return scores, {"note": "constant scores; skip calibration"}
    if pd.Series(y).nunique() < 2:
        return scores, {"note": "single-class; skip calibration"}

    lr = LogisticRegression(max_iter=max_iter)
    lr.fit(scores.reshape(-1, 1), y)
    calibrated = lr.predict_proba(scores.reshape(-1, 1))[:, 1]
    info = {"coef": float(lr.coef_[0, 0]), "intercept": float(lr.intercept_[0])}
    return calibrated, info


w = {"xgb": 0.25, "lgb": 0.25, "rf": 0.25, "gb": 0.15, "dl": 0.10}
stack = w["xgb"] * pxgb + w["lgb"] * plgb + w["rf"] * prf + w["gb"] * pgb + w["dl"] * pdl

stack_cal, pl_info = platt_calibrate(stack, yte, max_iter=200)
print("Ensemble raw:", metrics_safe(yte, stack))
print("Ensemble cal:", metrics_safe(yte, stack_cal))
print("Platt info:", pl_info)

Ensemble raw: {'note': 'single-class test; AUC/PR 미정', 'roc_auc': None, 'pr_auc': None}
Ensemble cal: {'note': 'single-class test; AUC/PR 미정', 'roc_auc': None, 'pr_auc': None}
Platt info: {'note': 'constant scores; skip calibration'}


In [22]:
import numpy as np


def predict_full_models_safe(Xf):
    prf_f = _safe_proba(globals().get("rf", None), Xf)
    pgb_f = _safe_proba(globals().get("gb", None), Xf)
    pxgb_f = _safe_proba(globals().get("xgb_clf", None), Xf)
    plgb_f = _safe_proba(globals().get("lgb_clf", None), Xf)

    # DL
    if "dl_model" in globals() and "ct_dl" in globals() and dl_model is not None:
        try:
            Xd_full = ct_dl.transform(Xf)
            pdl_f = dl_model.predict(Xd_full, verbose=0).ravel()
        except Exception:
            pdl_f = np.zeros(len(Xf))
    else:
        pdl_f = np.zeros(len(Xf))

    w = {"xgb": 0.25, "lgb": 0.25, "rf": 0.25, "gb": 0.15, "dl": 0.10}
    stack_f = (w["xgb"] * pxgb_f + w["lgb"] * plgb_f + w["rf"] * prf_f + w["gb"] * pgb_f + w["dl"] * pdl_f)

    use_platt = ("pl" in globals()) and (callable(getattr(pl, "predict_proba", None))) \
                and ("yte" in globals()) and (pd.Series(yte).nunique() >= 2)
    if use_platt:
        try:
            pcal_f = pl.predict_proba(stack_f.reshape(-1, 1))[:, 1]
        except Exception:
            pcal_f = stack_f
    else:
        pcal_f = stack_f

    return prf_f, pgb_f, pxgb_f, plgb_f, pdl_f, pcal_f

In [23]:
p = pd.read_csv(preds_path)
p["ENCODED_MCT"] = p["ENCODED_MCT"].astype(str)
p["TA_YM"] = pd.to_datetime(p["TA_YM"], errors="coerce").dt.to_period("M").dt.to_timestamp()
p = p.dropna(subset=["ENCODED_MCT", "TA_YM"])

out = run_pipeline(ds1, ds2, ds3, preds=p)
out_path = os.path.join(BASE_DIR, "risk_output_trained.csv")
out.to_csv(out_path, index=False, encoding="utf-8")
print("Saved:", out_path)

Saved: /Users/llouis/Documents/model_test/risk_output_trained.csv
