In [8]:
%pip install -q pandas numpy scikit-learn matplotlib python-dateutil
import os, sys, warnings

warnings.filterwarnings("ignore")

BASE_DIR = "/Users/llouis/Documents/model_test"
DATA_DIR = os.path.join(BASE_DIR, "data")

sys.path.insert(0, BASE_DIR)

from pipeline import run_pipeline

print("BASE_DIR:", BASE_DIR)
print("DATA_DIR:", DATA_DIR)

Note: you may need to restart the kernel to use updated packages.
BASE_DIR: /Users/llouis/Documents/model_test
DATA_DIR: /Users/llouis/Documents/model_test/data


In [9]:
import pandas as pd
import numpy as np


def read_csv_smart(path):
    for enc in ["utf-8", "cp949", "euc-kr", "latin1"]:
        try:
            return pd.read_csv(path, encoding=enc)
        except Exception:
            pass
    raise RuntimeError(f"CSV 인코딩 해석 실패: {path}")


ds1 = read_csv_smart(os.path.join(DATA_DIR, "big_data_set1_f.csv"))
ds2 = read_csv_smart(os.path.join(DATA_DIR, "ds2_monthly_usage.csv"))
ds3 = read_csv_smart(os.path.join(DATA_DIR, "ds3_monthly_customers.csv"))

print(ds1.shape, ds2.shape, ds3.shape)
print(ds2.head(2))
print(ds3.head(2))

(4185, 9) (86590, 15) (86590, 17)
  ENCODED_MCT   TA_YM MCT_OPE_MS_CN           RC_M1_SAA      RC_M1_TO_UE_CT  \
0  000F03E44A  202404      4_50-75%            5_75-90%            5_75-90%   
1  000F03E44A  202312      4_50-75%  6_90%초과(하위 10% 이하)  6_90%초과(하위 10% 이하)   

      RC_M1_UE_CUS_CN      RC_M1_AV_NP_AT APV_CE_RAT  DLV_SAA_RAT  \
0            5_75-90%            5_75-90%    1_상위1구간    -999999.9   
1  6_90%초과(하위 10% 이하)  6_90%초과(하위 10% 이하)        NaN    -999999.9   

   M1_SME_RY_SAA_RAT  M1_SME_RY_CNT_RAT  M12_SME_RY_SAA_PCE_RT  \
0                2.6               10.6                   93.8   
1                0.0                0.0                   94.8   

   M12_SME_BZN_SAA_PCE_RT  M12_SME_RY_ME_MCT_RAT  M12_SME_BZN_ME_MCT_RAT  
0                    71.5                   16.7                     7.8  
1                    73.4                   16.6                     7.2  
  ENCODED_MCT   TA_YM  M12_MAL_1020_RAT  M12_MAL_30_RAT  M12_MAL_40_RAT  \
0  0305234DDB  202311

## 라벨 생성 전략
1) **실라벨(권장)**: `ds1`에 `MCT_ME_D`(폐업일)가 있으면, 기준월 `t`에서 이후 `K`개월 내 폐업 발생 여부를 라벨로 사용합니다.
2) **프록시 라벨**: `ds2/3`만 있는 경우, 급락 시나리오(예: 매출 구간/고객수 구간 MoM 큰 하락) 조합으로 이벤트를 정의합니다.


In [10]:
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta

KEY_MCT = "ENCODED_MCT"
KEY_YM = "TA_YM"
K = 3


def to_month(dt_series):
    dt = pd.to_datetime(dt_series.astype(str), errors="coerce")
    return pd.to_datetime(dt.dt.to_period("M").astype(str))


df = ds2.merge(ds3, on=[KEY_MCT, KEY_YM], how="outer")
df[KEY_YM] = to_month(df[KEY_YM])
df = df.sort_values([KEY_MCT, KEY_YM])

use_real_label = False
if "MCT_ME_D" in ds1.columns:
    tmp = ds1[[KEY_MCT, "MCT_ME_D"]].copy()
    tmp["MCT_ME_D"] = pd.to_datetime(tmp["MCT_ME_D"], errors="coerce")
    df = df.merge(tmp, on=KEY_MCT, how="left")
    df["y"] = 0
    me = df["MCT_ME_D"]
    t0 = df[KEY_YM]
    tK = t0 + pd.offsets.MonthEnd(0) + pd.DateOffset(months=K)
    cond = (me.notna()) & (me > t0) & (me <= tK)
    df.loc[cond, "y"] = 1
    use_real_label = df["y"].sum() > 0

if not use_real_label:
    def bin2num(s):
        s = s.astype(str)
        m = s.str.extract(r"(\d+)", expand=False)
        return pd.to_numeric(m, errors="coerce")


    df["RC_SAA_num"] = bin2num(df.get("RC_M1_SAA", ""))
    df["RC_CUS_num"] = bin2num(df.get("RC_M1_UE_CUS_CN", ""))

    df = df.sort_values([KEY_MCT, KEY_YM])
    df["dSAA"] = df.groupby(KEY_MCT)["RC_SAA_num"].diff()
    df["dCUS"] = df.groupby(KEY_MCT)["RC_CUS_num"].diff()

    cxl = pd.to_numeric(df.get("APV_CE_RAT", 0), errors="coerce")
    ind_me = pd.to_numeric(df.get("M12_SME_RY_ME_MCT_RAT", 0), errors="coerce")
    bzn_me = pd.to_numeric(df.get("M12_SME_BZN_ME_MCT_RAT", 0), errors="coerce")

    bad = (
            (df["dSAA"] <= -10) | (df["dCUS"] <= -10) |
            (cxl >= 90) | (ind_me >= 90) | (bzn_me >= 90)
    )
    df["y"] = bad.astype(int)

print("Positive ratio:", df["y"].mean())

Positive ratio: 0.0


In [11]:
from copy import deepcopy


def build_labels_robust(ds1, ds2, ds3, base_dir, k_months=3, topq=0.10):
    KEY_MCT, KEY_YM = "ENCODED_MCT", "TA_YM"

    def to_month(dt_series):
        dt = pd.to_datetime(dt_series.astype(str), errors="coerce")
        return pd.to_datetime(dt.dt.to_period("M").astype(str))

    df = ds2.merge(ds3, on=[KEY_MCT, KEY_YM], how="outer")
    df[KEY_YM] = to_month(df[KEY_YM])
    df = df.sort_values([KEY_MCT, KEY_YM]).reset_index(drop=True)
    df["y"] = 0

    used_real = False
    if "MCT_ME_D" in ds1.columns:
        tmp = ds1[[KEY_MCT, "MCT_ME_D"]].copy()
        tmp["MCT_ME_D"] = pd.to_datetime(tmp["MCT_ME_D"], errors="coerce")
        df = df.merge(tmp, on=KEY_MCT, how="left")
        t0 = df[KEY_YM]
        tK = t0 + pd.offsets.MonthEnd(0) + pd.DateOffset(months=k_months)
        cond = (df["MCT_ME_D"].notna()) & (df["MCT_ME_D"] > t0) & (df["MCT_ME_D"] <= tK)
        df.loc[cond, "y"] = 1
        used_real = df["y"].sum() > 0

    if df["y"].nunique() < 2:
        def bin2num(s):
            s = s.astype(str)
            m = s.str.extract(r"(\d+)", expand=False)
            return pd.to_numeric(m, errors="coerce")

        df["RC_SAA_num"] = bin2num(df.get("RC_M1_SAA", ""))
        df["RC_CUS_num"] = bin2num(df.get("RC_M1_UE_CUS_CN", ""))

        df["dSAA"] = df.groupby(KEY_MCT)["RC_SAA_num"].diff()
        df["dCUS"] = df.groupby(KEY_MCT)["RC_CUS_num"].diff()

        cxl = pd.to_numeric(df.get("APV_CE_RAT", 0), errors="coerce")
        indme = pd.to_numeric(df.get("M12_SME_RY_ME_MCT_RAT", 0), errors="coerce")
        bznme = pd.to_numeric(df.get("M12_SME_BZN_ME_MCT_RAT", 0), errors="coerce")

        sig = 0
        sig += (df["dSAA"] <= -10).astype(int)
        sig += (df["dCUS"] <= -10).astype(int)
        sig += (cxl >= 90).astype(int)
        sig += (indme >= 80).astype(int)
        sig += (bznme >= 80).astype(int)
        df["y_proxy"] = (sig >= 2).astype(int)

        if df["y_proxy"].nunique() >= 2 and df["y_proxy"].sum() > 0:
            df["y"] = df["y_proxy"]

    if df["y"].nunique() < 2:
        sys.path.insert(0, base_dir)
        from pipeline import run_pipeline

        def read_csv_smart(path):
            for enc in ["utf-8", "cp949", "euc-kr", "latin1"]:
                try:
                    return pd.read_csv(path, encoding=enc)
                except Exception:
                    pass
            raise RuntimeError(f"CSV 인코딩 해석 실패: {path}")

        ds1_ = deepcopy(ds1)
        ds2_ = deepcopy(ds2)
        ds3_ = deepcopy(ds3)

        out = run_pipeline(ds1_, ds2_, ds3_, preds=None)
        outj = out.merge(df[[KEY_MCT, KEY_YM]], on=[KEY_MCT, KEY_YM], how="right")
        pf = pd.to_numeric(outj["p_final"], errors="coerce").fillna(0)
        thr = pf.quantile(1 - topq)
        df["y_q"] = (pf >= thr).astype(int)

        if df["y_q"].nunique() >= 2 and df["y_q"].sum() > 0:
            df["y"] = df["y_q"]

    if df["y"].nunique() < 2:
        if "p_final" not in df.columns:
            df["p_final"] = 0.0
        idx = df["p_final"].nlargest(min(50, len(df))).index
        df.loc[idx, "y"] = 1

    return df


robust_df = build_labels_robust(ds1, ds2, ds3, BASE_DIR, k_months=3, topq=0.10)
print("Label distribution:", robust_df["y"].value_counts(dropna=False).to_dict())

KEY_MCT, KEY_YM = "ENCODED_MCT", "TA_YM"
num_cols = [
    "M1_SME_RY_SAA_RAT", "M1_SME_RY_CNT_RAT",
    "M12_SME_RY_SAA_PCE_RT", "M12_SME_BZN_SAA_PCE_RT",
    "M12_SME_RY_ME_MCT_RAT", "M12_SME_BZN_ME_MCT_RAT",
    "DLV_SAA_RAT", "MCT_UE_CLN_REU_RAT", "MCT_UE_CLN_NEW_RAT"
]
cat_cols = [c for c in ["HPSN_MCT_ZCD_NM", "HPSN_MCT_BZN_CD_NM"] if c in robust_df.columns]

X = robust_df[num_cols + cat_cols].copy()
y = robust_df["y"].astype(int)

Label distribution: {0: 86540, 1: 50}


## 특징 생성 & 학습/평가

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, IsolationForest
import numpy as np

num_transform = Pipeline([("imp", SimpleImputer(strategy="median"))])
ct = ColumnTransformer([
    ("num", num_transform, num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
], remainder="drop")


def fit_supervised(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y if y.nunique() > 1 else None,
        test_size=0.25, random_state=42
    )
    models = {
        "logit": LogisticRegression(max_iter=200, class_weight="balanced"),
        "rf": RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1, class_weight="balanced"),
        "gb": GradientBoostingClassifier(random_state=42)
    }
    res, pipes, probs = {}, {}, {}
    for name, clf in models.items():
        pipe = Pipeline([("prep", ct), ("clf", clf)])
        pipe.fit(X_train, y_train)
        if hasattr(pipe, "predict_proba"):
            p = pipe.predict_proba(X_test)[:, 1]
        else:
            s = pipe.decision_function(X_test)
            p = 1 / (1 + np.exp(-s))
        auc = roc_auc_score(y_test, p)
        ap = average_precision_score(y_test, p)
        res[name] = {"roc_auc": float(auc), "pr_auc": float(ap)}
        pipes[name] = pipe
        probs[name] = (X_test.index, p, y_test)
    return res, pipes, probs


def fit_oneclass(X):
    pipe = Pipeline([("prep", ct), ("clf", IsolationForest(
        n_estimators=400, contamination=0.10, random_state=42, n_jobs=-1
    ))])
    pipe.fit(X)
    s = pipe["clf"].score_samples(pipe["prep"].transform(X))
    s = (s - s.min()) / (s.max() - s.min() + 1e-9)
    p = 1 - s
    return {"oneclass": {"note": "IsolationForest"}}, {"oneclass": pipe}, {"oneclass": (X.index, p, None)}


if y.nunique() >= 2 and y.sum() > 0:
    res, pipes, probs = fit_supervised(X, y)
    print("Supervised metrics:", res)
else:
    print("Only one class detected → using IsolationForest fallback.")
    res, pipes, probs = fit_oneclass(X)

keys = list(pipes.keys())


def predict_full(pipe_dict, Xf):
    P = []
    for k in pipe_dict:
        pipe = pipe_dict[k]
        if "IsolationForest" in str(pipe):
            s = pipe["clf"].score_samples(pipe["prep"].transform(Xf))
            s = (s - s.min()) / (s.max() - s.min() + 1e-9)
            p = 1 - s
        else:
            if hasattr(pipe, "predict_proba"):
                p = pipe.predict_proba(Xf)[:, 1]
            else:
                sc = pipe.decision_function(Xf)
                p = 1 / (1 + np.exp(-sc))
        P.append(p)
    P = np.vstack(P).T
    return P.mean(axis=1)


X_full = robust_df[num_cols + cat_cols].copy()
pe = predict_full(pipes, X_full)

preds_full["ENCODED_MCT"] = preds_full["ENCODED_MCT"].astype(str)
preds_full["TA_YM"] = pd.to_datetime(preds_full["TA_YM"], errors="coerce").dt.to_period("M").dt.to_timestamp()
preds_full["pred_xgb"] = np.nan
preds_full["pred_lgbm"] = np.nan
preds_full["pred_rf"] = pe
preds_full["pred_gb"] = np.nan
preds_full["pred_dl"] = np.nan

preds_path = os.path.join(DATA_DIR, "preds.csv")
preds_full.to_csv(preds_path, index=False, encoding="utf-8")
print("Saved preds:", preds_path)

out = run_pipeline(ds1, ds2, ds3, preds=pd.read_csv(preds_path))
out_path = os.path.join(BASE_DIR, "risk_output_trained.csv")
out.to_csv(out_path, index=False, encoding="utf-8")
print("Saved:", out_path)

Supervised metrics: {'logit': {'roc_auc': 0.8744768981884766, 'pr_auc': 0.0028350413288156994}, 'rf': {'roc_auc': 0.9997155606122559, 'pr_auc': 0.8314446237523161}, 'gb': {'roc_auc': 0.6852002631064336, 'pr_auc': 0.05085954503504265}}
Saved preds: /Users/llouis/Documents/model_test/data/preds.csv


ValueError: You are trying to merge on datetime64[ns] and float64 columns for key 'TA_YM'. If you wish to proceed you should use pd.concat

### 앙상블 & 확률 보정(Platt)

In [None]:

import numpy as np

w = {"rf": 0.25, "gb": 0.15, "logit": 0.25}
ws = np.array([w.get(k, 0.0) for k in pred_dict.keys()], dtype="float64")
if ws.sum() == 0:
    ws = np.ones_like(ws) / len(ws)
else:
    ws = ws / ws.sum()

probs = np.zeros_like(list(pred_dict.values())[0][1])
for j, k in enumerate(pred_dict.keys()):
    probs += ws[j] * pred_dict[k][1]

from sklearn.linear_model import LogisticRegression

pl = LogisticRegression(max_iter=200)
pl.fit(probs.reshape(-1, 1), y_test)
p_cal = pl.predict_proba(probs.reshape(-1, 1))[:, 1]

print("Ensemble ROC-AUC (raw):", roc_auc_score(y_test, probs))
print("Ensemble ROC-AUC (cal):", roc_auc_score(y_test, p_cal))
print("Ensemble PR-AUC (cal):", average_precision_score(y_test, p_cal))

## preds.csv 생성 (전체 데이터 대상 예측)

In [None]:

def predict_full(pipe_dict, df_full):
    Xf = df_full[num_cols + cat_cols].copy()
    P = []
    keys = list(pipe_dict.keys())
    for k in keys:
        pipe = pipe_dict[k][0]
        p = pipe.predict_proba(Xf)[:, 1] if hasattr(pipe, "predict_proba") else pipe.decision_function(Xf)
        P.append(p)
    P = np.vstack(P).T
    m = P.shape[1]
    if len(ws) != m:
        w2 = np.ones(m) / m
    else:
        w2 = ws
    pe = (P * w2).sum(axis=1)
    pc = pl.predict_proba(pe.reshape(-1, 1))[:, 1]
    return pc


df_full = df[["ENCODED_MCT", "TA_YM"]].copy()
df_full["TA_YM"] = pd.to_datetime(df_full["TA_YM"], errors="coerce").dt.strftime("%Y-%m")
df_full["pred_xgb"] = np.nan
df_full["pred_lgbm"] = np.nan
df_full["pred_rf"] = predict_full({"rf": pred_dict["rf"]}, df)
df_full["pred_gb"] = predict_full({"gb": pred_dict["gb"]}, df)
df_full["pred_dl"] = predict_full({"logit": pred_dict["logit"]}, df)

preds_path = os.path.join(DATA_DIR, "preds.csv")
df_full.to_csv(preds_path, index=False, encoding="utf-8")
preds_path

## 파이프라인 재실행 → 최종 경보

In [None]:

out = run_pipeline(ds1, ds2, ds3, preds=read_csv_smart(preds_path))
out_path = os.path.join(BASE_DIR, "risk_output_trained.csv")
out.to_csv(out_path, index=False, encoding="utf-8")
out.head(10)