In [5]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    precision_recall_curve
)

import matplotlib.pyplot as plt

from xgboost import XGBClassifier


In [8]:
# Relative path of the VIP CSV partition this notebook works on.
file_path = '../../DATA/VIP/VIP_combined_part0.csv'

# Echo the path so the rendered output records which file was read.
print('file_path =', file_path)

# Load the whole CSV into memory with pandas' default parsing options.
# NOTE(review): the rendered preview shows an 'Unnamed: 0' column, presumably a
# row index written by an earlier to_csv call - confirm and consider index_col=0.
df = pd.read_csv(file_path)
print('[로드 완료]', df.shape)
# `display` is the IPython rich-display helper available inside notebooks.
display(df.head())


file_path = ../../DATA/VIP/VIP_combined_part0.csv
[로드 완료] (600000, 891)


Unnamed: 0,발급회원번호,기준년월,컨택건수_카드론_TM_B0M,컨택건수_리볼빙_TM_B0M,컨택건수_CA_TM_B0M,컨택건수_이용유도_TM_B0M,컨택건수_신용발급_TM_B0M,컨택건수_부대서비스_TM_B0M,컨택건수_포인트소진_TM_B0M,컨택건수_보험_TM_B0M,...,할인금액_제휴연회비_B0M,청구금액_기본연회비_B0M,청구금액_제휴연회비_B0M,상품관련면제카드수_B0M,임직원면제카드수_B0M,우수회원면제카드수_B0M,기타면제카드수_B0M,카드신청건수,Life_Stage,최종카드발급경과월
0,SYN_0,201807,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,5.자녀성장기(2),22
1,SYN_8,201807,0,0,0,0,0,0,0,0,...,0,5001,0,0,0,0,0,0,1.Single,44
2,SYN_9,201807,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3.자녀출산기,14
3,SYN_17,201807,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7.노령,41
4,SYN_18,201807,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4.자녀성장기(1),17


In [None]:
# ==============================
# 2) Build the VIP churn target (existing logic kept)
#   - current-month total spend = credit_B0M + check_B0M
#   - previous 3-month average  = (credit_R3M + check_R3M) / 3
#   - churn (1) when current-month spend < previous average * 0.8
#   - excluded (NaN) when the previous average is <= 0 or NaN
# ==============================

# Order rows by member and month, then attach both derived spend columns in a
# single chained expression. Each lambda receives the already-sorted frame.
df = (
    df.sort_values(by=['발급회원번호', '기준년월'])
      .assign(**{
          # current-month total spend
          '당월_총_이용금액': lambda d: d['이용금액_신용_B0M'] + d['이용금액_체크_B0M'],
          # average spend derived from the R3M columns
          '직전_3M_평균_이용금액': lambda d: (d['이용금액_신용_R3M'] + d['이용금액_체크_R3M']) / 3,
      })
)

def define_churn_strict(row):
    """Label one row as churned (1), retained (0) or undecidable (NaN).

    Parameters
    ----------
    row : mapping-like
        Must expose the keys '직전_3M_평균_이용금액' (baseline average spend)
        and '당월_총_이용금액' (current-month total spend).

    Returns
    -------
    int or float
        ``np.nan`` when the baseline is missing or not positive, ``1`` when the
        current spend is strictly below 80% of the baseline, otherwise ``0``.
    """
    baseline = row['직전_3M_평균_이용금액']

    # Without a positive baseline there is nothing to compare against.
    if pd.isna(baseline) or baseline <= 0:
        return np.nan

    return 1 if row['당월_총_이용금액'] < (baseline * 0.8) else 0

# Vectorised labelling. It produces the same labels as applying
# define_churn_strict row by row, but avoids a Python-level call per row,
# which is very slow on a frame with hundreds of thousands of wide rows.
prev_avg = df['직전_3M_평균_이용금액']
cur_total = df['당월_총_이용금액']

# Rows without a positive baseline cannot be judged and stay NaN.
undecidable = prev_avg.isna() | (prev_avg <= 0)
# A comparison against NaN evaluates to False, so a missing current-month
# spend is labelled 0, exactly as in the row-wise rule.
churned = cur_total < (prev_avg * 0.8)

df['이탈_타겟'] = np.where(undecidable, np.nan, churned.astype(float))

# Rows with a usable label form the modelling set.
train_df = df[df['이탈_타겟'].notna()].copy()
train_df['이탈_타겟'] = train_df['이탈_타겟'].astype(int)

# Rows that could not be judged are kept separately.
dormant_new_df = df[df['이탈_타겟'].isna()].copy()

print('--- [데이터 분리 결과] ---')
print('1) 학습 가능(0/1):', len(train_df))
print('   - 유지(0):', int((train_df['이탈_타겟']==0).sum()))
print('   - 이탈(1):', int((train_df['이탈_타겟']==1).sum()))
print('2) 판단 제외(NaN):', len(dormant_new_df))


In [None]:
# ==============================
# 3) Remove leakage columns + split features / target
#   - Columns used to build the target and every B0M (current-month) column
#     are removed to be safe
#   - 'Unnamed: N' columns (row indices written by a CSV round-trip) are removed
#     as well, because they carry row order rather than customer information
# ==============================

leakage_cols = [
    '당월_총_이용금액',
    '직전_3M_평균_이용금액',
    '이용금액_신용_B0M', '이용금액_체크_B0M',
    '이용금액_신용_R3M', '이용금액_체크_R3M'
]

# Every current-month column, matched by the 'B0M' name fragment.
b0m_cols = [col for col in train_df.columns if 'B0M' in col]

# Index artifacts such as 'Unnamed: 0' that pandas creates when a CSV was saved
# with its index and read back without index_col.
index_artifact_cols = [col for col in train_df.columns if str(col).startswith('Unnamed:')]

drop_candidates = set(
    ['발급회원번호', '기준년월', '이탈_타겟'] + leakage_cols + b0m_cols + index_artifact_cols
)
# Keep only names that really exist, in column order, so the list is
# deterministic from run to run.
drop_cols = [c for c in train_df.columns if c in drop_candidates]

X = train_df.drop(columns=drop_cols)
y = train_df['이탈_타겟']

print('X shape:', X.shape)
print('y mean(이탈 비중):', round(float(y.mean()), 4))


In [None]:
# ==============================
# 4) Train/test split by member ID (existing approach kept)
# ==============================

# Split the distinct member IDs, not the rows, so that all rows of one member
# land on the same side of the split.
member_ids = train_df['발급회원번호']
unique_ids = member_ids.unique()
train_ids, test_ids = train_test_split(unique_ids, test_size=0.2, random_state=42)

# Row-level membership masks, computed once and reused for X and y.
in_train = member_ids.isin(train_ids)
in_test = member_ids.isin(test_ids)

X_train, y_train = X[in_train].copy(), y[in_train].copy()
X_test, y_test = X[in_test].copy(), y[in_test].copy()

print('Train:', X_train.shape, ' Test:', X_test.shape)
print('Train 이탈 비중:', round(float(y_train.mean()), 4))
print('Test  이탈 비중:', round(float(y_test.mean()), 4))


In [None]:
# ==============================
# 5) XGBoost 입력 전처리
#   - (A) XGBoost native categorical 가능하면: object -> category 변환 + enable_categorical=True
#   - (B) 미지원이면: object 컬럼 one-hot 인코딩으로 자동 fallback
# ==============================

import xgboost as xgb

def _xgb_supports_native_categorical():
    """Return True when the installed xgboost reports version 1.6 or newer.

    Only the first two dot-separated components of ``xgb.__version__`` are
    inspected. Any failure while reading or parsing the version is treated as
    "not supported" and yields False.
    """
    # Rough rule of thumb: categorical support is practically usable from 1.6.
    try:
        head = xgb.__version__.split('.')[:2]
        major, minor = (int(part) for part in head)
    except Exception:
        return False
    return (major, minor) >= (1, 6)

def prepare_for_xgb(X_tr, X_te):
    """Encode object-dtype columns of the train/test frames for XGBoost.

    Both inputs are copied, so the caller's frames are left untouched. Object
    columns are detected on the training frame and converted to ``category``.
    The test frame is cast to the *training* column's categorical dtype, so both
    frames share one category list and therefore one integer coding. Test values
    never seen in training become NaN (missing).

    Parameters
    ----------
    X_tr, X_te : pandas.DataFrame
        Training and test feature frames with the same columns.

    Returns
    -------
    tuple
        ``(X_tr_out, X_te_out, cat_cols, use_native)`` where ``cat_cols`` is the
        list of object columns found in ``X_tr`` and ``use_native`` is True when
        the frames still carry ``category`` columns (or had none to encode) and
        False when they were one-hot encoded instead.
    """
    X_tr = X_tr.copy()
    X_te = X_te.copy()

    cat_cols = X_tr.select_dtypes(include=['object']).columns.tolist()

    if not cat_cols:
        # Nothing to encode, so there is no categorical concern at all.
        return X_tr, X_te, cat_cols, True

    for col in cat_cols:
        X_tr[col] = X_tr[col].astype('category')
        # Reuse the train dtype. Casting the test column on its own would build
        # a separate category list, and the integer codes would then mean
        # different values in train and test.
        X_te[col] = X_te[col].astype(X_tr[col].dtype)

    if _xgb_supports_native_categorical():
        # Native categorical handling: pass the category columns through.
        return X_tr, X_te, cat_cols, True

    # Fallback: one-hot encoding. With shared categories both frames produce the
    # same dummy columns; align is kept as a safety net for any other
    # column differences.
    X_tr_enc = pd.get_dummies(X_tr, columns=cat_cols, dummy_na=True)
    X_te_enc = pd.get_dummies(X_te, columns=cat_cols, dummy_na=True)
    X_tr_enc, X_te_enc = X_tr_enc.align(X_te_enc, join='outer', axis=1, fill_value=0)
    return X_tr_enc, X_te_enc, cat_cols, False

# Encode the train/test feature frames. The helper returns the processed frames,
# the list of object-dtype columns it found, and a flag that is later passed to
# the classifier as `enable_categorical`.
X_train_p, X_test_p, cat_cols, use_native_cat = prepare_for_xgb(X_train, X_test)

# Summary of the encoding outcome for the rendered notebook output.
print('범주형(object) 컬럼 수:', len(cat_cols))
print('native categorical 사용 여부:', use_native_cat)
print('X_train_p shape:', X_train_p.shape, 'X_test_p shape:', X_test_p.shape)


In [None]:
# ==============================
# 6) Train XGBoost
#   - Class imbalance: scale_pos_weight = negatives / positives
#   - Early stopping is monitored on a validation fold carved out of the
#     training members, so the test set is not used to pick the number of trees
# ==============================

EARLY_STOPPING_ROUNDS = 100
VAL_FRACTION = 0.1

# Hold out a share of the *training* member IDs for early stopping. Splitting by
# member ID keeps all rows of one member on the same side, mirroring the
# train/test split. Monitoring the test set here would make the later test
# metrics optimistically biased.
train_member_ids = train_df.loc[X_train_p.index, '발급회원번호']
_, val_ids = train_test_split(
    train_member_ids.unique(), test_size=VAL_FRACTION, random_state=42
)
is_val = train_member_ids.isin(val_ids).to_numpy()

X_fit, y_fit = X_train_p[~is_val], y_train[~is_val]
X_val, y_val = X_train_p[is_val], y_train[is_val]

# Weight positives by the negative/positive ratio of the rows actually fitted.
pos = float(y_fit.sum())
neg = float(len(y_fit) - pos)
scale_pos_weight = (neg / pos) if pos > 0 else 1.0

print('scale_pos_weight:', round(scale_pos_weight, 4))

# `early_stopping_rounds` moved from fit() to the estimator constructor in
# XGBoost 1.6, and newer releases reject it in fit(). Pick the location that the
# installed version accepts.
try:
    xgb_version = tuple(int(part) for part in xgb.__version__.split('.')[:2])
except ValueError:
    xgb_version = (0, 0)

model_params = dict(
    n_estimators=5000,
    learning_rate=0.03,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=1,
    reg_alpha=0.1,
    reg_lambda=1.0,
    gamma=0.0,
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight,
    enable_categorical=use_native_cat,
)
fit_params = {}
if xgb_version >= (1, 6):
    model_params['early_stopping_rounds'] = EARLY_STOPPING_ROUNDS
else:
    fit_params['early_stopping_rounds'] = EARLY_STOPPING_ROUNDS

xgb_model = XGBClassifier(**model_params)

xgb_model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    verbose=200,
    **fit_params
)

print('best_iteration:', getattr(xgb_model, 'best_iteration', None))


In [None]:
# ==============================
# 7) Baseline evaluation (threshold = 0.5) + AUC
# ==============================

# Probability of the positive (churn) class for every test row.
y_proba = xgb_model.predict_proba(X_test_p)[:, 1]
y_pred_05 = (y_proba >= 0.5).astype(int)

print('--- Threshold = 0.5 ---')
print(classification_report(y_test, y_pred_05, digits=4))

# Best-effort AUC: report the failure instead of stopping the notebook.
try:
    auc = roc_auc_score(y_test, y_proba)
    print('ROC-AUC:', round(float(auc), 4))
except Exception as e:
    print('ROC-AUC 계산 실패:', e)

cm = confusion_matrix(y_test, y_pred_05)
# The newline is written as an escape sequence; a literal line break inside a
# single-quoted string is a syntax error.
print('Confusion Matrix:\n', cm)


In [None]:
# ==============================
# 8) Threshold tuning towards a target recall (existing approach kept)
#   - e.g. target_recall = 0.86
# ==============================
# NOTE(review): the threshold is selected and evaluated on the same test set,
# so the tuned metrics below are optimistic. Consider tuning on a validation fold.

target_recall = 0.86

precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba)

# `recalls` has one more element than `thresholds`, so the last one is dropped
# before looking up the index whose recall is closest to the target.
idx = (np.abs(recalls[:-1] - target_recall)).argmin()
best_thr = float(thresholds[idx])

y_pred_thr = (y_proba >= best_thr).astype(int)

print(f'✅ 목표 Recall {target_recall} 근처 Threshold = {best_thr:.6f}')
print(classification_report(y_test, y_pred_thr, digits=4))

cm_thr = confusion_matrix(y_test, y_pred_thr)
# The newline is written as an escape sequence; a literal line break inside a
# single-quoted string is a syntax error.
print('Confusion Matrix (tuned):\n', cm_thr)

# Visualisation: heat map of the tuned confusion matrix with the counts drawn
# in each cell.
plt.figure(figsize=(5, 4))
plt.imshow(cm_thr, interpolation='nearest')
plt.title('Confusion Matrix (Tuned Threshold)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks([0, 1]); plt.yticks([0, 1])
for (i, j), v in np.ndenumerate(cm_thr):
    plt.text(j, i, str(v), ha='center', va='center')
plt.tight_layout()
plt.show()


In [None]:
# ==============================
# 9) Top 20 feature importance (by gain)
# ==============================

booster = xgb_model.get_booster()
score = booster.get_score(importance_type='gain')

if not score:
    # An empty mapping means no feature received a gain score.
    print('importance score가 비어 있습니다. (학습이 정상인지 확인 필요)')
else:
    imp = pd.Series(score).sort_values(ascending=False)
    top20 = imp.head(20)

    # Reverse the order so the largest gain is drawn at the top of the chart.
    bars = top20.iloc[::-1]

    fig, ax = plt.subplots(figsize=(10, 7))
    ax.barh(bars.index, bars.values)
    ax.set_title('Top 20 Feature Importance (gain)')
    fig.tight_layout()
    plt.show()

    print('Top20 변수 목록:')
    print(top20.index.tolist())


In [None]:
# ==============================
# 10) (Optional) Save the model
# ==============================

# Save as JSON (recommended for good compatibility across environments).
# The file is written relative to the notebook's working directory.
model_out_path = 'VIP_XGBoost_model.json'
xgb_model.save_model(model_out_path)
print('saved:', model_out_path)
