In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek

from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix

In [58]:
df = pd.read_csv('/content/drive/MyDrive/bank_marketing/bank/bank-additional-full.csv',sep=';')

In [59]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

# 타겟 라벨링
df['y'] = df['y'].map({'yes': 1, 'no': 0})
# 'job' 컬럼에서 원핫 인코딩
df = pd.get_dummies(df, columns=['job'], drop_first=False)
# 'marital' 컬럼에서 원핫 인코딩
df = pd.get_dummies(df, columns=['marital'], drop_first=False)
# OrdinalEncoder 사용 (순서 지정)
encoder_edu = OrdinalEncoder(categories=[['unknown', 'illiterate', 'basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'university.degree', 'professional.course']])
df['education'] = encoder_edu.fit_transform(df[['education']])
df['education'] = df['education'].astype(int)
# 'yes' 1인 사람이 3명, 컬럼 사용 X
df = df.drop(columns=['default'])
# 'housing' 컬럼에서 원핫 인코딩
df = pd.get_dummies(df, columns=['housing'], drop_first=False)
# 'loan' 컬럼에서 원핫 인코딩
df = pd.get_dummies(df, columns=['loan'], drop_first=False)
# 'contact' 컬럼에서 원핫 인코딩
df['contact'] = df['contact'].map({'cellular': 1, 'telephone': 0})
# 'month' 컬럼을 Ordinal로 변환
month_mapping = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
}
df['month'] = df['month'].replace(month_mapping)
# day_of_week
df = pd.get_dummies(df, columns=['day_of_week'], drop_first=False)
# duration 사용 X
df = df.drop(columns=['duration'])
# pdays -> 999는 0으로, 나머지 값은 1로 인코딩
df['contacted_before'] = df['pdays'].apply(lambda x: 0 if x == 999 else 1)
df = df.drop(columns=['pdays'])
# poutcome
df = pd.get_dummies(df, columns=['poutcome'], drop_first=False)
# int -> float 컬럼 지정
cols_to_convert = ['age', 'campaign', 'previous']
# 지정된 컬럼을 float64로 변환
df[cols_to_convert] = df[cols_to_convert].astype('float64')
# StandardScaler 객체 생성
scaler = StandardScaler()
cols_to_scaling = ['age', 'campaign', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
# 지정된 컬럼들에 대해 스케일링
df[cols_to_scaling] = scaler.fit_transform(df[cols_to_scaling])

  df['month'] = df['month'].replace(month_mapping)


In [60]:
X = df.drop(columns='y')
y = df['y']

In [61]:
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.3, random_state=42, stratify=y)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(28831, 42) (12357, 42) (28831,) (12357,)


In [62]:
def print_metrics(y_true, y_pred, y_prob):
    print("\n📌 Classification Report:")
    print(classification_report(y_true, y_pred))
    print(f"✅ Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"✅ Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"✅ Recall: {recall_score(y_true, y_pred):.4f}")
    print(f"✅ F1-score: {f1_score(y_true, y_pred):.4f}")
    print(f"✅ ROC AUC: {roc_auc_score(y_true, y_prob):.4f}")
    print("\n📌 Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

1. 오버샘플 없이

In [63]:
param_grid = {
    'n_estimators': [50, 100, 200],            # 트리 개수
    'max_depth': [3, 5, 8],                     # 트리 깊이
    'learning_rate': [0.01, 0.05, 0.1],         # 학습률
    'subsample': [0.8, 1.0],                    # 샘플링 비율
    #'colsample_bytree': [0.8, 1.0],             # 트리별 특성 샘플링 비율
    #'min_child_weight': [1, 5, 10]              # 리프 노드 최소 가중치
}

grid_search = GridSearchCV(
    XGBClassifier(random_state=42, tree_method='gpu_hist', gpu_id=0),
    param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

print("최적 하이퍼파라미터:", grid_search.best_params_)
print("최고 ROC AUC:", grid_search.best_score_)

best_xgb = grid_search.best_estimator_
y_pred_prob = best_xgb.predict_proba(X_val)[:, 1]
y_pred_best = best_xgb.predict(X_val)


print_metrics(y_val,y_pred_best,y_pred_prob)

Fitting 3 folds for each of 54 candidates, totalling 162 fits



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



최적 하이퍼파라미터: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100, 'subsample': 1.0}
최고 ROC AUC: 0.7970746896684563

📌 Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     10965
           1       0.68      0.23      0.35      1392

    accuracy                           0.90     12357
   macro avg       0.80      0.61      0.65     12357
weighted avg       0.88      0.90      0.88     12357

✅ Accuracy: 0.9014
✅ Precision: 0.6843
✅ Recall: 0.2320
✅ F1-score: 0.3466
✅ ROC AUC: 0.8113

📌 Confusion Matrix:
[[10816   149]
 [ 1069   323]]


In [64]:
import pickle

# 최적 모델 저장
best_xgb = grid_search.best_estimator_
with open("/content/drive/MyDrive/bank_marketing/model/xgb_base.pkl", "wb") as model_file:
    pickle.dump(best_xgb, model_file)

2. 오버샘플링

In [65]:
# XGBoost
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('xgb', XGBClassifier(random_state=42, tree_method='gpu_hist', gpu_id=0))
])

param_grid = {
    'smote__sampling_strategy': [0.5, 0.7, 1.0],  # 오버샘플링 비율
    'xgb__n_estimators': [50, 100, 200],
    'xgb__max_depth': [3, 5, 8],
    'xgb__learning_rate': [0.01, 0.05, 0.1],
    'xgb__subsample': [0.8, 1.0],
    #'xgb__colsample_bytree': [0.8, 1.0],
    #'xgb__min_child_weight': [1, 5, 10],
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

print("최적 하이퍼파라미터:", grid_search.best_params_)
print("최고 ROC AUC:", grid_search.best_score_)

best_xgb = grid_search.best_estimator_
y_pred_prob = best_xgb.predict_proba(X_val)[:, 1]
y_pred_best = best_xgb.predict(X_val)


print_metrics(y_val,y_pred_best,y_pred_prob)


Fitting 3 folds for each of 162 candidates, totalling 486 fits



    E.g. tree_method = "hist", device = "cuda"



최적 하이퍼파라미터: {'smote__sampling_strategy': 0.5, 'xgb__learning_rate': 0.05, 'xgb__max_depth': 5, 'xgb__n_estimators': 200, 'xgb__subsample': 0.8}
최고 ROC AUC: 0.7879521436555034

📌 Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95     10965
           1       0.61      0.31      0.41      1392

    accuracy                           0.90     12357
   macro avg       0.77      0.64      0.68     12357
weighted avg       0.88      0.90      0.88     12357

✅ Accuracy: 0.9001
✅ Precision: 0.6142
✅ Recall: 0.3053
✅ F1-score: 0.4079
✅ ROC AUC: 0.8040

📌 Confusion Matrix:
[[10698   267]
 [  967   425]]



    E.g. tree_method = "hist", device = "cuda"



In [66]:
best_xgb = grid_search.best_estimator_
with open("/content/drive/MyDrive/bank_marketing/model/xgb_over.pkl", "wb") as model_file:
    pickle.dump(best_xgb, model_file)

3. 언더샘플링

In [67]:
# XGBoost
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

pipeline = Pipeline([
    ('undersample', RandomUnderSampler(random_state=42)),
    ('xgb', XGBClassifier(random_state=42, tree_method='gpu_hist', gpu_id=0))
])

param_grid = {
    'undersample__sampling_strategy': [0.5, 0.8],  # 언더샘플링 비율
    'xgb__n_estimators': [50, 100, 200],
    'xgb__max_depth': [3, 5, 8],
    'xgb__learning_rate': [0.01, 0.05, 0.1],
    'xgb__subsample': [0.8, 1.0],
    #'xgb__colsample_bytree': [0.8, 1.0],
    #'xgb__min_child_weight': [1, 5, 10],
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

print("최적 하이퍼파라미터:", grid_search.best_params_)
print("최고 ROC AUC:", grid_search.best_score_)

best_xgb = grid_search.best_estimator_
y_pred_prob = best_xgb.predict_proba(X_val)[:, 1]
y_pred_best = best_xgb.predict(X_val)


print_metrics(y_val,y_pred_best,y_pred_prob)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
최적 하이퍼파라미터: {'undersample__sampling_strategy': 0.5, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 5, 'xgb__n_estimators': 50, 'xgb__subsample': 1.0}
최고 ROC AUC: 0.7937872782654777

📌 Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.91      0.93     10965
           1       0.46      0.58      0.51      1392

    accuracy                           0.88     12357
   macro avg       0.70      0.75      0.72     12357
weighted avg       0.89      0.88      0.88     12357

✅ Accuracy: 0.8755
✅ Precision: 0.4586
✅ Recall: 0.5812
✅ F1-score: 0.5127
✅ ROC AUC: 0.8132

📌 Confusion Matrix:
[[10010   955]
 [  583   809]]



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



In [68]:
best_xgb = grid_search.best_estimator_
with open("/content/drive/MyDrive/bank_marketing/model/xgb_under.pkl", "wb") as model_file:
    pickle.dump(best_xgb, model_file)

4. 오버 & 언더

In [69]:
# XGBoost
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('undersample', RandomUnderSampler(random_state=42)),
    ('xgb', XGBClassifier(random_state=42, tree_method='gpu_hist', gpu_id=0))
])


param_grid = {
    'smote__sampling_strategy': [0.5, 0.7, 1.0],  # 오버샘플링 비율
    'undersample__sampling_strategy': [0.5, 0.8],  # 언더샘플링 비율
    'xgb__n_estimators': [50, 100, 200],
    'xgb__max_depth': [3, 5, 8],
    'xgb__learning_rate': [0.01, 0.05, 0.1],
    'xgb__subsample': [0.8, 1.0],
    #'xgb__min_child_weight': [1, 5, 10],
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

print("최적 하이퍼파라미터:", grid_search.best_params_)
print("최고 ROC AUC:", grid_search.best_score_)

best_xgb = grid_search.best_estimator_
y_pred_prob = best_xgb.predict_proba(X_val)[:, 1]
y_pred_best = best_xgb.predict(X_val)


print_metrics(y_val,y_pred_best,y_pred_prob)

Fitting 3 folds for each of 324 candidates, totalling 972 fits


486 fits failed out of a total of 972.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
486 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/imblearn/pipeline.py", line 518, in fit
    Xt, yt = self._fit(X, y, routed_params, raw_params=params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/l

최적 하이퍼파라미터: {'smote__sampling_strategy': 0.5, 'undersample__sampling_strategy': 0.5, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 5, 'xgb__n_estimators': 200, 'xgb__subsample': 1.0}
최고 ROC AUC: 0.7874926459050604

📌 Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95     10965
           1       0.63      0.29      0.40      1392

    accuracy                           0.90     12357
   macro avg       0.77      0.63      0.67     12357
weighted avg       0.88      0.90      0.88     12357

✅ Accuracy: 0.9006
✅ Precision: 0.6277
✅ Recall: 0.2895
✅ F1-score: 0.3963
✅ ROC AUC: 0.8037

📌 Confusion Matrix:
[[10726   239]
 [  989   403]]



    E.g. tree_method = "hist", device = "cuda"



In [70]:
best_xgb = grid_search.best_estimator_
with open("/content/drive/MyDrive/bank_marketing/model/xgb_over+under.pkl", "wb") as model_file:
    pickle.dump(best_xgb, model_file)

### 임시

In [71]:
param_grid = {
    'n_estimators': [100, 200, 500],            # 트리 개수
    'max_depth': [3, 5, 8],                     # 트리 깊이
    'learning_rate': [0.01, 0.05, 0.1],         # 학습률
    'subsample': [0.8, 1.0],                    # 샘플링 비율
    'colsample_bytree': [0.8, 1.0],             # 트리별 특성 샘플링 비율
    'min_child_weight': [1, 5, 10]              # 리프 노드 최소 가중치
}

grid_search1 = GridSearchCV(
    XGBClassifier(random_state=42, tree_method='gpu_hist', gpu_id=0),
    param_grid,
    cv=2,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

grid_search1.fit(X_train, y_train)

print("최적 하이퍼파라미터:", grid_search1.best_params_)
print("최고 F1-score:", grid_search1.best_score_)

best_xgb1 = grid_search1.best_estimator_
y_pred_prob = best_xgb1.predict_proba(X_val)[:, 1]
y_pred_best = best_xgb1.predict(X_val)

'''accuracy = accuracy_score(y_val, y_pred_best)
precision = precision_score(y_val, y_pred_best)
recall = recall_score(y_val, y_pred_best)
f1 = f1_score(y_val, y_pred_best)
roc_auc = roc_auc_score(y_val, y_pred_prob)'''

print_metrics(y_val,y_pred_best,y_pred_prob)

'''# 평가 지표 출력
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC-ROC: {roc_auc:.4f}")
print("\n분류 보고서:\n", classification_report(y_val, y_pred_best))'''


Fitting 2 folds for each of 324 candidates, totalling 648 fits



    E.g. tree_method = "hist", device = "cuda"



최적 하이퍼파라미터: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 8, 'min_child_weight': 5, 'n_estimators': 500, 'subsample': 1.0}
최고 F1-score: 0.38577502412107584

📌 Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.94     10965
           1       0.59      0.30      0.39      1392

    accuracy                           0.90     12357
   macro avg       0.75      0.63      0.67     12357
weighted avg       0.88      0.90      0.88     12357

✅ Accuracy: 0.8973
✅ Precision: 0.5877
✅ Recall: 0.2960
✅ F1-score: 0.3937
✅ ROC AUC: 0.7761

📌 Confusion Matrix:
[[10676   289]
 [  980   412]]



    E.g. tree_method = "hist", device = "cuda"



'# 평가 지표 출력\nprint(f"Accuracy: {accuracy:.4f}")\nprint(f"Precision: {precision:.4f}")\nprint(f"Recall: {recall:.4f}")\nprint(f"F1 Score: {f1:.4f}")\nprint(f"AUC-ROC: {roc_auc:.4f}")\nprint("\n분류 보고서:\n", classification_report(y_val, y_pred_best))'