 언더샘플링 기반 LightGBM 모델

In [5]:
# ▒▒ 1. 필수 라이브러리 임포트 ▒▒
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.pipeline import Pipeline  # ✅ imblearn Pipeline 사용
from imblearn.under_sampling import RandomUnderSampler
from lightgbm import LGBMClassifier

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, fbeta_score, roc_auc_score,
    classification_report, confusion_matrix,
    make_scorer  # ✅ F2-score용
)

# ▒▒ 2. 전처리 완료된 데이터 로드 ▒▒
print("📂 Loading preprocessed dataset...")
df = pd.read_csv('C:/ITStudy/bank-marketing/data.csv')
print(f"✅ Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")

# ▒▒ 3. 데이터 분할 ▒▒
X = df.drop(columns='y')
y = df['y']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
print(f"🧪 Train shape: {X_train.shape}, Validation shape: {X_val.shape}")

# ▒▒ 4. 언더샘플링 + LightGBM 파이프라인 구성 ▒▒
pipeline = Pipeline([
    ('undersample', RandomUnderSampler()),
    ('lgbm', LGBMClassifier(random_state=42))
])

# ▒▒ 5. 하이퍼파라미터 그리드 정의 ▒▒
param_grid = {
    'undersample__sampling_strategy': [0.5, 0.8],
    'lgbm__n_estimators': [50, 100, 200],
    'lgbm__max_depth': [-1, 3, 5],
    'lgbm__learning_rate': [0.01, 0.05, 0.1],
    'lgbm__subsample': [0.8, 1.0],
}

# ▒▒ 6. GridSearchCV 실행 (F2-score 기준) ▒▒
print("🔍 Running GridSearchCV (scoring = F2-score)...")
f2_scorer = make_scorer(fbeta_score, beta=2)

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    scoring=f2_scorer,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

# ▒▒ 7. 최적 모델 예측 수행 ▒▒
best_model = grid_search.best_estimator_
y_pred_prob = best_model.predict_proba(X_val)[:, 1]
y_pred = best_model.predict(X_val)

# ▒▒ 8. 평가 지표 함수 정의 및 출력 ▒▒
def print_metrics(y_true, y_pred, y_score):
    print("\n📊 Evaluation Metrics on Validation Set")
    print(f"Accuracy     : {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision    : {precision_score(y_true, y_pred):.4f}")
    print(f"Recall       : {recall_score(y_true, y_pred):.4f}")
    print(f"F1 Score     : {f1_score(y_true, y_pred):.4f}")
    print(f"F2 Score     : {fbeta_score(y_true, y_pred, beta=2):.4f}")
    print(f"ROC AUC Score: {roc_auc_score(y_true, y_score):.4f}")
    print("\n📄 Classification Report:")
    print(classification_report(y_true, y_pred))
    print("\n🔍 Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

print_metrics(y_val, y_pred, y_pred_prob)

# ▒▒ 9. 최종 모델 저장 ▒▒
model_dir = "C:/ITStudy/bank-marketing/model"
model_path = os.path.join(model_dir, "lgbm_under_f2.pkl")

# 디렉토리 없으면 생성
os.makedirs(model_dir, exist_ok=True)

# 모델 저장
with open(model_path, "wb") as model_file:
    pickle.dump(best_model, model_file)

print(f"\n💾 Best model saved to: {model_path}")


📂 Loading preprocessed dataset...
✅ Dataset loaded: 41188 rows, 43 columns
🧪 Train shape: (28831, 42), Validation shape: (12357, 42)
🔍 Running GridSearchCV (scoring = F2-score)...
Fitting 3 folds for each of 108 candidates, totalling 324 fits
[LightGBM] [Info] Number of positive: 3248, number of negative: 4060
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002179 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 418
[LightGBM] [Info] Number of data points in the train set: 7308, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.444444 -> initscore=-0.223144
[LightGBM] [Info] Start training from score -0.223144

📊 Evaluation Metrics on Validation Set
Accuracy     : 0.8657
Precision    : 0.4329
Recall       : 0.6185
F1 Score     : 0.5093
F2 Score     : 0.5697
ROC AUC Score: 0.8100

📄 Classification Report:
     

XGBoost 언더샘플링

In [7]:
# ▒▒ 1. 필수 라이브러리 임포트 ▒▒
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, fbeta_score, roc_auc_score,
    classification_report, confusion_matrix,
    make_scorer
)

# ▒▒ 2. 전처리 완료된 데이터 로드 ▒▒
print("📂 Loading preprocessed dataset...")
df = pd.read_csv('C:/ITStudy/bank-marketing/data.csv')
print(f"✅ Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")

# ▒▒ 3. 데이터 분할 ▒▒
X = df.drop(columns='y')
y = df['y']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
print(f"🧪 Train shape: {X_train.shape}, Validation shape: {X_val.shape}")

# ▒▒ 4. 언더샘플링 + XGBoost 파이프라인 구성 ▒▒
pipeline = Pipeline([
    ('undersample', RandomUnderSampler(random_state=42)),
    ('xgb', XGBClassifier(
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss',      # XGBoost 경고 제거
        tree_method='hist'          # ❗ GPU 대신 CPU 기반 학습 설정
    ))
])

# ▒▒ 5. 하이퍼파라미터 그리드 정의 ▒▒
param_grid = {
    'undersample__sampling_strategy': [0.5, 0.8],
    'xgb__n_estimators': [50, 100, 200],
    'xgb__max_depth': [3, 5, 8],
    'xgb__learning_rate': [0.01, 0.05, 0.1],
    'xgb__subsample': [0.8, 1.0],
}

# ▒▒ 6. GridSearchCV 실행 (F2-score 기준) ▒▒
print("🔍 Running GridSearchCV (scoring = F2-score)...")
f2_scorer = make_scorer(fbeta_score, beta=2)

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    scoring=f2_scorer,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

# ▒▒ 7. 최적 모델 예측 수행 ▒▒
best_model = grid_search.best_estimator_
y_pred_prob = best_model.predict_proba(X_val)[:, 1]
y_pred = best_model.predict(X_val)

# ▒▒ 8. 평가 지표 함수 정의 및 출력 ▒▒
def print_metrics(y_true, y_pred, y_score):
    print("\n📊 Evaluation Metrics on Validation Set")
    print(f"Accuracy     : {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision    : {precision_score(y_true, y_pred):.4f}")
    print(f"Recall       : {recall_score(y_true, y_pred):.4f}")
    print(f"F1 Score     : {f1_score(y_true, y_pred):.4f}")
    print(f"F2 Score     : {fbeta_score(y_true, y_pred, beta=2):.4f}")
    print(f"ROC AUC Score: {roc_auc_score(y_true, y_score):.4f}")
    print("\n📄 Classification Report:")
    print(classification_report(y_true, y_pred))
    print("\n🔍 Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

print_metrics(y_val, y_pred, y_pred_prob)

# ▒▒ 9. 최종 모델 저장 ▒▒
model_dir = "C:/ITStudy/bank-marketing/model"
model_path = os.path.join(model_dir, "xgb_under_f2.pkl")

os.makedirs(model_dir, exist_ok=True)

with open(model_path, "wb") as model_file:
    pickle.dump(best_model, model_file)

print(f"\n💾 Best model saved to: {model_path}")


📂 Loading preprocessed dataset...
✅ Dataset loaded: 41188 rows, 43 columns
🧪 Train shape: (28831, 42), Validation shape: (12357, 42)
🔍 Running GridSearchCV (scoring = F2-score)...
Fitting 3 folds for each of 108 candidates, totalling 324 fits


Parameters: { "use_label_encoder" } are not used.




📊 Evaluation Metrics on Validation Set
Accuracy     : 0.8647
Precision    : 0.4303
Recall       : 0.6214
F1 Score     : 0.5085
F2 Score     : 0.5707
ROC AUC Score: 0.8114

📄 Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.90      0.92     10965
           1       0.43      0.62      0.51      1392

    accuracy                           0.86     12357
   macro avg       0.69      0.76      0.72     12357
weighted avg       0.89      0.86      0.88     12357


🔍 Confusion Matrix:
[[9820 1145]
 [ 527  865]]

💾 Best model saved to: C:/ITStudy/bank-marketing/model\xgb_under_f2.pkl
