In [1]:
# 필수 패키지
%pip -q install pandas numpy scikit-learn matplotlib python-dateutil

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os, sys, warnings, pandas as pd, numpy as np

warnings.filterwarnings("ignore")

BASE_DIR = "/Users/llouis/Documents/model_test"
DATA_DIR = os.path.join(BASE_DIR, "data")

sys.path.insert(0, BASE_DIR)
from pipeline import run_pipeline

print("BASE_DIR:", BASE_DIR)
print("DATA_DIR:", DATA_DIR)

BASE_DIR: /Users/llouis/Documents/model_test
DATA_DIR: /Users/llouis/Documents/model_test/data


In [3]:
def read_csv_smart(path):
    for enc in ["utf-8", "cp949", "euc-kr", "latin1"]:
        try:
            return pd.read_csv(path, encoding=enc)
        except Exception:
            pass
    raise RuntimeError(f"CSV 인코딩 해석 실패: {path}")


ds1 = read_csv_smart(os.path.join(DATA_DIR, "big_data_set1_f.csv"))
ds2 = read_csv_smart(os.path.join(DATA_DIR, "ds2_monthly_usage.csv"))
ds3 = read_csv_smart(os.path.join(DATA_DIR, "ds3_monthly_customers.csv"))

print(ds1.shape, ds2.shape, ds3.shape)

(4185, 9) (86590, 15) (86590, 17)


In [4]:
from dateutil.relativedelta import relativedelta

KEY_MCT, KEY_YM = "ENCODED_MCT", "TA_YM"


def to_month(s):
    dt = pd.to_datetime(s.astype(str), errors="coerce")
    return pd.to_datetime(dt.dt.to_period("M").astype(str))


def build_labels_robust(ds1, ds2, ds3, k_months=3, topq=0.10):
    # 1) 베이스 프레임(월 정규화)
    df = ds2.merge(ds3, on=[KEY_MCT, KEY_YM], how="outer")
    df[KEY_YM] = to_month(df[KEY_YM])
    df[KEY_MCT] = df[KEY_MCT].astype(str)
    df = df.sort_values([KEY_MCT, KEY_YM]).reset_index(drop=True)
    df["y"] = 0

    # 2) 실라벨 (폐업일)
    used_real = False
    if "MCT_ME_D" in ds1.columns:
        tmp = ds1[[KEY_MCT, "MCT_ME_D"]].copy()
        tmp[KEY_MCT] = tmp[KEY_MCT].astype(str)
        tmp["MCT_ME_D"] = pd.to_datetime(tmp["MCT_ME_D"], errors="coerce")
        df = df.merge(tmp, on=KEY_MCT, how="left")
        t0 = df[KEY_YM]
        tK = t0 + pd.offsets.MonthEnd(0) + pd.DateOffset(months=k_months)
        cond = (df["MCT_ME_D"].notna()) & (df["MCT_ME_D"] > t0) & (df["MCT_ME_D"] <= tK)
        df.loc[cond, "y"] = 1
        used_real = df["y"].sum() > 0

    # 3) 프록시 라벨 (급락/취소/해지 과열)
    if df["y"].nunique() < 2:
        def bin2num(s):
            s = s.astype(str)
            m = s.str.extract(r"(\d+)", expand=False)
            return pd.to_numeric(m, errors="coerce")

        df["RC_SAA_num"] = bin2num(df.get("RC_M1_SAA", ""))
        df["RC_CUS_num"] = bin2num(df.get("RC_M1_UE_CUS_CN", ""))

        df["dSAA"] = df.groupby(KEY_MCT)["RC_SAA_num"].diff()
        df["dCUS"] = df.groupby(KEY_MCT)["RC_CUS_num"].diff()

        cxl = pd.to_numeric(df.get("APV_CE_RAT", 0), errors="coerce")
        indme = pd.to_numeric(df.get("M12_SME_RY_ME_MCT_RAT", 0), errors="coerce")
        bznme = pd.to_numeric(df.get("M12_SME_BZN_ME_MCT_RAT", 0), errors="coerce")

        sig = 0
        sig += (df["dSAA"] <= -10).astype(int)
        sig += (df["dCUS"] <= -10).astype(int)
        sig += (cxl >= 90).astype(int)
        sig += (indme >= 80).astype(int)
        sig += (bznme >= 80).astype(int)
        df["y_proxy"] = (sig >= 2).astype(int)

        if df["y_proxy"].nunique() >= 2 and df["y_proxy"].sum() > 0:
            df["y"] = df["y_proxy"]

    if df["y"].nunique() < 2:
        out = run_pipeline(ds1, ds2, ds3, preds=None)
        outj = out.merge(df[[KEY_MCT, KEY_YM]], on=[KEY_MCT, KEY_YM], how="right")
        pf = pd.to_numeric(outj["p_final"], errors="coerce").fillna(0)
        thr = pf.quantile(1 - topq)
        df["y"] = (pf >= thr).astype(int)

    return df


robust_df = build_labels_robust(ds1, ds2, ds3, k_months=3, topq=0.10)
print("Label counts:", robust_df["y"].value_counts(dropna=False).to_dict())

Label counts: {1: 86590}


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, IsolationForest
from sklearn.metrics import roc_auc_score, average_precision_score
import numpy as np

num_cols = [
    "M1_SME_RY_SAA_RAT", "M1_SME_RY_CNT_RAT",
    "M12_SME_RY_SAA_PCE_RT", "M12_SME_BZN_SAA_PCE_RT",
    "M12_SME_RY_ME_MCT_RAT", "M12_SME_BZN_ME_MCT_RAT",
    "DLV_SAA_RAT", "MCT_UE_CLN_REU_RAT", "MCT_UE_CLN_NEW_RAT"
]
cat_cols = [c for c in ["HPSN_MCT_ZCD_NM", "HPSN_MCT_BZN_CD_NM"] if c in robust_df.columns]

X = robust_df[num_cols + cat_cols].copy()
y = robust_df["y"].astype(int)

num_transform = Pipeline([("imp", SimpleImputer(strategy="median"))])
ct = ColumnTransformer([
    ("num", num_transform, num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
], remainder="drop")


def fit_supervised(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y if y.nunique() > 1 else None, test_size=0.25, random_state=42
    )
    models = {
        "logit": LogisticRegression(max_iter=200, class_weight="balanced"),
        "rf": RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1, class_weight="balanced"),
        "gb": GradientBoostingClassifier(random_state=42)
    }
    res, pipes = {}, {}
    for name, clf in models.items():
        pipe = Pipeline([("prep", ct), ("clf", clf)])
        pipe.fit(X_train, y_train)
        p = pipe.predict_proba(X_test)[:, 1] if hasattr(pipe, "predict_proba") else 1 / (
                    1 + np.exp(-pipe.decision_function(X_test)))
        res[name] = {"roc_auc": float(roc_auc_score(y_test, p)), "pr_auc": float(average_precision_score(y_test, p))}
        pipes[name] = pipe
    return res, pipes


def fit_oneclass(X):
    pipe = Pipeline(
        [("prep", ct), ("clf", IsolationForest(n_estimators=400, contamination=0.10, random_state=42, n_jobs=-1))])
    pipe.fit(X)
    s = pipe["clf"].score_samples(pipe["prep"].transform(X))
    s = (s - s.min()) / (s.max() - s.min() + 1e-9)
    p = 1 - s
    return {"oneclass": {"note": "IsolationForest"}}, {"oneclass": pipe}


if y.nunique() >= 2 and y.sum() > 0:
    res, pipes = fit_supervised(X, y)
    print("Supervised metrics:", res)
else:
    print("Only one class → IsolationForest fallback.")
    res, pipes = fit_oneclass(X)

Only one class → IsolationForest fallback.


In [6]:
def predict_full(pipe_dict, Xf):
    P = []
    for name, pipe in pipe_dict.items():
        if isinstance(pipe.named_steps.get("clf"), IsolationForest):
            s = pipe["clf"].score_samples(pipe["prep"].transform(Xf))
            s = (s - s.min()) / (s.max() - s.min() + 1e-9)
            p = 1 - s
        else:
            if hasattr(pipe, "predict_proba"):
                p = pipe.predict_proba(Xf)[:, 1]
            else:
                sc = pipe.decision_function(Xf);
                p = 1 / (1 + np.exp(-sc))
        P.append(p)
    P = np.vstack(P).T
    return P.mean(axis=1)


X_full = robust_df[num_cols + cat_cols].copy()
pe = predict_full(pipes, X_full)

preds_full = robust_df[["ENCODED_MCT", "TA_YM"]].copy()
preds_full["ENCODED_MCT"] = preds_full["ENCODED_MCT"].astype(str)
preds_full["TA_YM"] = pd.to_datetime(preds_full["TA_YM"], errors="coerce").dt.to_period("M").dt.to_timestamp()

preds_full["pred_xgb"] = np.nan
preds_full["pred_lgbm"] = np.nan
preds_full["pred_rf"] = pe
preds_full["pred_gb"] = np.nan
preds_full["pred_dl"] = np.nan

preds_full = preds_full.dropna(subset=["ENCODED_MCT", "TA_YM"])  # 키 결측 제거
preds_path = os.path.join(DATA_DIR, "preds.csv")
preds_full.to_csv(preds_path, index=False, encoding="utf-8")
print("Saved preds:", preds_path)
print(preds_full.dtypes[["ENCODED_MCT", "TA_YM"]])

Saved preds: /Users/llouis/Documents/model_test/data/preds.csv
ENCODED_MCT            object
TA_YM          datetime64[ns]
dtype: object


In [7]:
preds = pd.read_csv(preds_path)
preds["ENCODED_MCT"] = preds["ENCODED_MCT"].astype(str)
preds["TA_YM"] = pd.to_datetime(preds["TA_YM"], errors="coerce").dt.to_period("M").dt.to_timestamp()
preds = preds.dropna(subset=["ENCODED_MCT", "TA_YM"])

out = run_pipeline(ds1, ds2, ds3, preds=preds)
out_path = os.path.join(BASE_DIR, "risk_output_trained.csv")
out.to_csv(out_path, index=False, encoding="utf-8")
print("Saved:", out_path)
out.head(10)

Saved: /Users/llouis/Documents/model_test/risk_output_trained.csv


Unnamed: 0,ENCODED_MCT,TA_YM,Sales_Risk,Customer_Risk,Market_Risk,RiskScore,p_model,p_final,Alert
0,000F03E44A,2023-01-01,0.0296,0.0,0.55645,0.178775,,0.07151,GREEN
1,000F03E44A,2023-02-01,0.0296,0.2,0.56075,0.240065,,0.096026,GREEN
2,000F03E44A,2023-03-01,0.0298,0.0,0.5615,0.18037,,0.072148,GREEN
3,000F03E44A,2023-04-01,0.0312,0.0,0.559725,0.180397,,0.072159,GREEN
4,000F03E44A,2023-05-01,0.0309,0.03334,0.563,0.191262,,0.076505,GREEN
5,000F03E44A,2023-06-01,0.0317,0.01666,0.562325,0.186375,,0.07455,GREEN
6,000F03E44A,2023-07-01,0.0312,0.05,0.564,0.19668,,0.078672,GREEN
7,000F03E44A,2023-08-01,0.0313,0.0,0.563,0.18142,,0.072568,GREEN
8,000F03E44A,2023-09-01,0.0308,0.0,0.559725,0.180237,,0.072095,GREEN
9,000F03E44A,2023-10-01,0.0307,0.04,0.56125,0.192655,,0.077062,GREEN
