# 02. 불균형 데이터 처리

## 목표
- Class Weights 조정
- SMOTE 오버샘플링
- 임계값 최적화
- Recall/F1 개선

In [None]:
import sys
sys.path.append('../utils')
sys.path.append('../features')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, recall_score, f1_score, precision_recall_curve
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import pickle

from common_utils import load_data, evaluate_model, plot_roc_pr_curves, plot_confusion_matrix
from feature_engineering import build_a_features, build_b_features, merge_features_with_labels
from preprocessing import FeaturePreprocessor

# Silence every warning so the cell output stays readable.
# NOTE(review): a blanket 'ignore' also hides deprecation and numerical
# warnings raised by the libraries used below - consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

# Plot appearance shared by all figures drawn in this notebook.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# IPython magic (not plain Python): render matplotlib figures inline.
%matplotlib inline

print("✅ 라이브러리 로드 완료")

## 1. 데이터 로드 및 전처리

In [None]:
# Load the raw tables through the shared project helper.
train, test, train_a, train_b, test_a, test_b = load_data()

# Build the A and B feature tables from the training inputs and join them
# with the labels to obtain the feature matrix X and the target y.
features_a = build_a_features(train_a)
features_b = build_b_features(train_b)
X, y = merge_features_with_labels(train, features_a, features_b)

# NOTE(review): the preprocessor is fitted on the full X, i.e. before the
# train/validation split performed further down. If FeaturePreprocessor learns
# any statistics from the data (means, scalers, encodings) this leaks
# validation information into training - confirm, and if so fit it on the
# training portion only.
preprocessor = FeaturePreprocessor().fit(X)
X = preprocessor.transform(X)

# Report the dataset size and how imbalanced the target is.
# y.mean() is used as the positive-class share (assumes y is 0/1 - TODO confirm),
# so (1 - mean) / mean is the negative-to-positive ratio.
print(f"\n최종 데이터: X {X.shape}, y {y.shape}")
print(f"클래스 분포:")
print(y.value_counts())
print(f"\n위험군 비율: {y.mean():.2%}")
print(f"불균형 비율: {(1-y.mean())/y.mean():.1f}:1")

In [None]:
# Hold out 20% of the rows for validation. Stratifying on y keeps the
# positive-class share the same in both parts; the fixed seed makes the
# split reproducible.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

print(f"Train: {X_train.shape}, Valid: {X_val.shape}")
print(f"Train 위험군: {y_train.mean():.2%}")
print(f"Valid 위험군: {y_val.mean():.2%}")

## 2. Baseline (불균형 처리 없음)

In [None]:
print("Baseline CatBoost 학습 (불균형 처리 없음)...")

# Reference model: CatBoost with no imbalance handling. Its metrics are the
# yardstick the other approaches in this notebook are compared against.
baseline_params = dict(
    iterations=300,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    verbose=False,
)
model_baseline = CatBoostClassifier(**baseline_params)

# The validation split doubles as the early-stopping set.
model_baseline.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=30,
    verbose=False,
)

# Second predict_proba column = score for class 1; evaluated at the default
# 0.5 cut-off.
y_pred_baseline = model_baseline.predict_proba(X_val)[:, 1]
metrics_baseline = evaluate_model(
    y_val,
    y_pred_baseline,
    threshold=0.5,
    model_name='Baseline',
)

print(f"\n⚠️ Recall: {metrics_baseline['recall']:.4f} (낮음!)")

## 3. 방법 1: Class Weights 조정

In [None]:
print("CatBoost with Auto Class Weights...")

# Same hyper-parameters as the baseline, plus CatBoost's built-in
# 'Balanced' class weighting to counter the class imbalance.
weighted_params = dict(
    iterations=300,
    learning_rate=0.05,
    depth=6,
    auto_class_weights='Balanced',
    random_seed=42,
    verbose=False,
)
model_weights = CatBoostClassifier(**weighted_params)

# The validation split doubles as the early-stopping set.
model_weights.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=30,
    verbose=False,
)

# Second predict_proba column = score for class 1; evaluated at the default
# 0.5 cut-off.
y_pred_weights = model_weights.predict_proba(X_val)[:, 1]
metrics_weights = evaluate_model(
    y_val,
    y_pred_weights,
    threshold=0.5,
    model_name='Class Weights',
)

print(f"\n✅ Recall 개선: {metrics_baseline['recall']:.4f} → {metrics_weights['recall']:.4f}")

## 4. 방법 2: SMOTE 오버샘플링

In [None]:
print("SMOTE 적용...")

# SMOTE synthesizes additional minority-class rows. With a float
# sampling_strategy, imbalanced-learn targets a minority/majority ratio of 0.3
# after resampling - i.e. the minority ends up at roughly 23% of the resampled
# set, not 30%.
# Only the training split is resampled; the validation split is not passed in
# and therefore keeps its original class distribution.
smote = SMOTE(sampling_strategy=0.3, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Compare size and positive-class share before and after resampling.
print(f"\n원본 Train: {X_train.shape}, 위험군 {y_train.mean():.2%}")
print(f"SMOTE Train: {X_train_smote.shape}, 위험군 {y_train_smote.mean():.2%}")

In [None]:
print("\nCatBoost with SMOTE...")

# Baseline hyper-parameters, trained on the SMOTE-resampled training data.
smote_params = dict(
    iterations=300,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    verbose=False,
)
model_smote = CatBoostClassifier(**smote_params)

# Early stopping still monitors the original (non-resampled) validation split.
model_smote.fit(
    X_train_smote,
    y_train_smote,
    eval_set=(X_val, y_val),
    early_stopping_rounds=30,
    verbose=False,
)

# Second predict_proba column = score for class 1; evaluated at the default
# 0.5 cut-off.
y_pred_smote = model_smote.predict_proba(X_val)[:, 1]
metrics_smote = evaluate_model(
    y_val,
    y_pred_smote,
    threshold=0.5,
    model_name='SMOTE',
)

print(f"\n✅ Recall 개선: {metrics_baseline['recall']:.4f} → {metrics_smote['recall']:.4f}")

## 5. 방법 3: 임계값 최적화

In [None]:
# Search for the decision threshold that maximizes the chosen metric.
def find_best_threshold(y_true, y_pred_proba, metric='f1'):
    """Scan candidate thresholds and return the best one for ``metric``.

    Candidate thresholds run from 0.01 upwards in steps of 0.01 (generated by
    ``np.arange(0.01, 0.99, 0.01)``). For each candidate the scores are
    binarized with ``>=`` and evaluated against ``y_true``.

    Args:
        y_true: Ground-truth binary labels.
        y_pred_proba: Predicted scores for the positive class; any array-like
            (list, NumPy array, pandas Series) is accepted.
        metric: ``'f1'`` or ``'recall'``.

    Returns:
        ``(best_threshold, best_score)``. When no candidate scores above 0 the
        defaults ``(0.5, 0)`` are returned. Ties keep the lowest threshold
        because only a strictly better score replaces the current best.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
    """
    # Validate up front: previously an unknown metric left `score` unbound and
    # failed inside the loop with an unrelated UnboundLocalError.
    if metric not in ('f1', 'recall'):
        raise ValueError(f"metric must be 'f1' or 'recall', got {metric!r}")
    scorer = f1_score if metric == 'f1' else recall_score

    # Plain lists do not support an elementwise `>=`, so coerce once here.
    proba = np.asarray(y_pred_proba)

    best_score = 0
    best_threshold = 0.5

    for thresh in np.arange(0.01, 0.99, 0.01):
        y_pred = (proba >= thresh).astype(int)
        score = scorer(y_true, y_pred, zero_division=0)

        if score > best_score:
            best_score = score
            best_threshold = thresh

    return best_threshold, best_score

# Tune the decision threshold on the class-weights model's validation scores,
# once per metric.
# NOTE(review): recall never increases as the threshold rises, so the
# recall-maximizing search is expected to land on the lowest candidate
# threshold - treat that result as a reference point, not a usable cut-off.
threshold_search = {
    metric_name: find_best_threshold(y_val, y_pred_weights, metric=metric_name)
    for metric_name in ('f1', 'recall')
}
best_thresh_f1, best_f1 = threshold_search['f1']
best_thresh_recall, best_recall = threshold_search['recall']

rule = "=" * 60
print(rule)
print("최적 임계값 탐색 결과 (Class Weights 모델)")
print(rule)
print(f"F1 최대화 임계값: {best_thresh_f1:.3f} (F1 = {best_f1:.4f})")
print(f"Recall 최대화 임계값: {best_thresh_recall:.3f} (Recall = {best_recall:.4f})")

In [None]:
# Re-score the class-weights predictions, this time cutting at the
# F1-maximizing threshold instead of the default 0.5.
metrics_optimized = evaluate_model(
    y_val,
    y_pred_weights,
    threshold=best_thresh_f1,
    model_name='Class Weights + Optimized Threshold',
)

# Side-by-side view of the untuned baseline and the tuned model.
print(f"\n✅ 개선 효과:")
print(f"  Baseline (thresh=0.5): Recall {metrics_baseline['recall']:.4f}, F1 {metrics_baseline['f1']:.4f}")
print(f"  최적화 (thresh={best_thresh_f1:.3f}): Recall {metrics_optimized['recall']:.4f}, F1 {metrics_optimized['f1']:.4f}")

## 6. 모델 비교

In [None]:
# Gather every experiment's metrics into one table, one row per model.
all_metrics = [metrics_baseline, metrics_weights, metrics_smote, metrics_optimized]
comparison = pd.DataFrame(all_metrics)

print("\n" + "=" * 60)
print("모델 성능 비교")
print("=" * 60)
print(comparison.to_string(index=False))

# Two side-by-side bar charts: ROC AUC on the left, Recall/F1 on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_auc, ax_scores = axes

# Left panel: ROC AUC per model.
comparison.plot(x='model', y='roc_auc', kind='bar', ax=ax_auc, color='steelblue', legend=False)
ax_auc.set_title('ROC AUC 비교', fontsize=14, fontweight='bold')
ax_auc.set_ylabel('ROC AUC')
ax_auc.set_ylim([0.5, 1.0])

# Right panel: Recall and F1 per model, grouped bars.
score_table = comparison[['model', 'recall', 'f1']].set_index('model')
score_table.plot(kind='bar', ax=ax_scores)
ax_scores.set_title('Recall & F1 비교', fontsize=14, fontweight='bold')
ax_scores.set_ylabel('Score')
ax_scores.set_ylim([0, 1.0])
ax_scores.legend(['Recall', 'F1'])

# Cosmetics common to both panels.
for ax in axes:
    ax.grid(alpha=0.3)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('../output/figures/imbalance_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n✅ 비교 차트 저장: ../output/figures/imbalance_comparison.png")

## 7. 최적 모델 저장

In [None]:
# Select the best row by ROC AUC, breaking ties by F1.
# The 'Class Weights' and 'Class Weights + Optimized Threshold' rows are
# computed from the same probabilities, so they presumably tie on ROC AUC
# (assuming evaluate_model derives it from the raw scores). A plain idxmax
# always returned the first of the tied rows, which made the tuned threshold
# unreachable; the F1 tie-break lets it win when it is actually better.
ranked = comparison.sort_values(['roc_auc', 'f1'], ascending=False)
best_idx = ranked.index[0]
best_model_name = comparison.loc[best_idx, 'model']

print(f"최고 성능 모델: {best_model_name}")
print(f"ROC AUC: {comparison.loc[best_idx, 'roc_auc']:.4f}")

# Map the winning row back to the fitted estimator and the threshold it was
# evaluated with.
if 'SMOTE' in best_model_name:
    best_model = model_smote
    best_threshold = 0.5
elif 'Optimized' in best_model_name:
    best_model = model_weights
    best_threshold = best_thresh_f1
elif 'Baseline' in best_model_name:
    # Previously fell through to the class-weights model, so the pickled
    # estimator did not match the model reported as best.
    best_model = model_baseline
    best_threshold = 0.5
else:
    best_model = model_weights
    best_threshold = 0.5

# Persist the estimator and its decision threshold as separate pickles.
with open('../output/models/imbalance_best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

with open('../output/models/best_threshold.pkl', 'wb') as f:
    pickle.dump(best_threshold, f)

print(f"\n✅ 모델 저장: ../output/models/imbalance_best_model.pkl")
print(f"✅ 최적 임계값 저장: ../output/models/best_threshold.pkl (threshold={best_threshold:.3f})")

# Keep the full comparison table next to the saved model for later reference.
comparison.to_csv('../output/imbalance_results.csv', index=False)
print(f"✅ 결과 저장: ../output/imbalance_results.csv")

## 8. 요약 및 권장사항

In [None]:
# Final recap: baseline vs. selected model, followed by next-step guidance.
# Deltas are computed once and printed with an explicit sign ('+' format flag)
# so that a regression shows up as "-0.0123" instead of the misleading
# "+-0.0123" produced by a hard-coded "+" prefix.
auc_gain = comparison.loc[best_idx, 'roc_auc'] - metrics_baseline['roc_auc']
recall_gain = comparison.loc[best_idx, 'recall'] - metrics_baseline['recall']

print("\n" + "=" * 60)
print("불균형 처리 요약")
print("=" * 60)
print(f"✅ Baseline ROC AUC: {metrics_baseline['roc_auc']:.4f}")
print(f"✅ 최적 모델 ROC AUC: {comparison.loc[best_idx, 'roc_auc']:.4f}")
print(f"✅ 개선: {auc_gain:+.4f}")
print(f"\n✅ Baseline Recall: {metrics_baseline['recall']:.4f}")
print(f"✅ 최적 모델 Recall: {comparison.loc[best_idx, 'recall']:.4f}")
print(f"✅ 개선: {recall_gain:+.4f}")
print("\n권장사항:")
print("  - ROC AUC 0.78+ 달성 시: 다음 단계(Stacking)로 진행")
print("  - ROC AUC < 0.75: 피처 엔지니어링 재검토 (02_feature_validation.ipynb)")
print("  - Recall < 0.5: SMOTE sampling_strategy 증가 (0.5~0.7)")
print("=" * 60)