In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek

from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix

In [40]:
df = pd.read_csv('/content/drive/MyDrive/bank_marketing/bank/bank-additional-full.csv',sep=';')

In [41]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

# 타겟 라벨링
df['y'] = df['y'].map({'yes': 1, 'no': 0})
# 'job' 컬럼에서 원핫 인코딩
df = pd.get_dummies(df, columns=['job'], drop_first=False)
# 'marital' 컬럼에서 원핫 인코딩
df = pd.get_dummies(df, columns=['marital'], drop_first=False)
# OrdinalEncoder 사용 (순서 지정)
encoder_edu = OrdinalEncoder(categories=[['unknown', 'illiterate', 'basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'university.degree', 'professional.course']])
df['education'] = encoder_edu.fit_transform(df[['education']])
df['education'] = df['education'].astype(int)
# 'yes' 1인 사람이 3명, 컬럼 사용 X
df = df.drop(columns=['default'])
# 'housing' 컬럼에서 원핫 인코딩
df = pd.get_dummies(df, columns=['housing'], drop_first=False)
# 'loan' 컬럼에서 원핫 인코딩
df = pd.get_dummies(df, columns=['loan'], drop_first=False)
# 'contact' 컬럼에서 원핫 인코딩
df['contact'] = df['contact'].map({'cellular': 1, 'telephone': 0})
# 'month' 컬럼을 Ordinal로 변환
month_mapping = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
}
df['month'] = df['month'].replace(month_mapping)
# day_of_week
df = pd.get_dummies(df, columns=['day_of_week'], drop_first=False)
# duration 사용 X
df = df.drop(columns=['duration'])
# pdays -> 999는 0으로, 나머지 값은 1로 인코딩
df['contacted_before'] = df['pdays'].apply(lambda x: 0 if x == 999 else 1)
df = df.drop(columns=['pdays'])
# poutcome
df = pd.get_dummies(df, columns=['poutcome'], drop_first=False)
# int -> float 컬럼 지정
cols_to_convert = ['age', 'campaign', 'previous']
# 지정된 컬럼을 float64로 변환
df[cols_to_convert] = df[cols_to_convert].astype('float64')
# StandardScaler 객체 생성
scaler = StandardScaler()
cols_to_scaling = ['age', 'campaign', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
# 지정된 컬럼들에 대해 스케일링
df[cols_to_scaling] = scaler.fit_transform(df[cols_to_scaling])

  df['month'] = df['month'].replace(month_mapping)


In [42]:
X = df.drop(columns='y')
y = df['y']

In [43]:
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.3, random_state=42, stratify=y)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(28831, 42) (12357, 42) (28831,) (12357,)


1. 오버샘플 없이

In [44]:
def print_metrics(y_true, y_pred, y_prob):
    print("\n📌 Classification Report:")
    print(classification_report(y_true, y_pred))
    print(f"✅ Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"✅ Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"✅ Recall: {recall_score(y_true, y_pred):.4f}")
    print(f"✅ F1-score: {f1_score(y_true, y_pred):.4f}")
    print(f"✅ ROC AUC: {roc_auc_score(y_true, y_prob):.4f}")
    print("\n📌 Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

1. 오버샘플 없이

In [45]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [-1, 3, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    #'colsample_bytree': [0.8, 1.0]
}


grid_search = GridSearchCV(
    LGBMClassifier(random_state=42),
    param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

print("최적 하이퍼파라미터:", grid_search.best_params_)
print("최고 ROC AUC:", grid_search.best_score_)

best_xgb = grid_search.best_estimator_
y_pred_prob = best_xgb.predict_proba(X_val)[:, 1]
y_pred_best = best_xgb.predict(X_val)


print_metrics(y_val,y_pred_best,y_pred_prob)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
[LightGBM] [Info] Number of positive: 3248, number of negative: 25583
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002704 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 408
[LightGBM] [Info] Number of data points in the train set: 28831, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.112657 -> initscore=-2.063889
[LightGBM] [Info] Start training from score -2.063889
최적 하이퍼파라미터: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50, 'subsample': 0.8}
최고 ROC AUC: 0.7959410027297343

📌 Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     10965
           1       0.68      0.23      0.35      1392

    accuracy                           0.90     12357
   macro avg      

In [46]:
import pickle

# 최적 모델 저장
best_xgb = grid_search.best_estimator_
with open("/content/drive/MyDrive/bank_marketing/model/lgbm_base.pkl", "wb") as model_file:
    pickle.dump(best_xgb, model_file)

'''# 모델 불러오기
with open("xgb_base.pkl", "rb") as model_file:
    loaded_xgb = pickle.load(model_file)'''

'# 모델 불러오기\nwith open("xgb_base.pkl", "rb") as model_file:\n    loaded_xgb = pickle.load(model_file)'

2. 오버샘플링

In [47]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('lgbm', LGBMClassifier(random_state=42))
])

param_grid = {
    'smote__sampling_strategy': [0.5, 0.7, 1.0],  # 오버샘플링 비율
    'lgbm__n_estimators': [50, 100, 200],
    'lgbm__max_depth': [-1, 3, 5],  # LightGBM에서는 -1이 자동
    'lgbm__learning_rate': [0.01, 0.05, 0.1],
    'lgbm__subsample': [0.8, 1.0],
    #'lgbm__colsample_bytree': [0.8, 1.0]
}


grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

print("최적 하이퍼파라미터:", grid_search.best_params_)
print("최고 ROC AUC:", grid_search.best_score_)

best_xgb = grid_search.best_estimator_
y_pred_prob = best_xgb.predict_proba(X_val)[:, 1]
y_pred_best = best_xgb.predict(X_val)


print_metrics(y_val,y_pred_best,y_pred_prob)


Fitting 3 folds for each of 162 candidates, totalling 486 fits
[LightGBM] [Info] Number of positive: 12791, number of negative: 25583
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012033 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1830
[LightGBM] [Info] Number of data points in the train set: 38374, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333325 -> initscore=-0.693186
[LightGBM] [Info] Start training from score -0.693186
최적 하이퍼파라미터: {'lgbm__learning_rate': 0.1, 'lgbm__max_depth': -1, 'lgbm__n_estimators': 100, 'lgbm__subsample': 0.8, 'smote__sampling_strategy': 0.5}
최고 ROC AUC: 0.7902337708960068

📌 Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95     10965
           1       0.62      0.30      0.40      1392

    accura

In [48]:
best_xgb = grid_search.best_estimator_
with open("/content/drive/MyDrive/bank_marketing/model/lgbm_over.pkl", "wb") as model_file:
    pickle.dump(best_xgb, model_file)

3. 언더샘플링

In [49]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

pipeline = Pipeline([
    ('undersample', RandomUnderSampler(random_state=42)),
    ('lgbm', LGBMClassifier(random_state=42))
])

param_grid = {
    'undersample__sampling_strategy': [0.5, 0.8],  # 언더샘플링 비율
    'lgbm__n_estimators': [50, 100, 200],
    'lgbm__max_depth': [-1, 3, 5],  # LightGBM에서는 -1이 자동
    'lgbm__learning_rate': [0.01, 0.05, 0.1],
    'lgbm__subsample': [0.8, 1.0],
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

print("최적 하이퍼파라미터:", grid_search.best_params_)
print("최고 ROC AUC:", grid_search.best_score_)

best_xgb = grid_search.best_estimator_
y_pred_prob = best_xgb.predict_proba(X_val)[:, 1]
y_pred_best = best_xgb.predict(X_val)


print_metrics(y_val,y_pred_best,y_pred_prob)


Fitting 3 folds for each of 108 candidates, totalling 324 fits
[LightGBM] [Info] Number of positive: 3248, number of negative: 6496
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001136 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 417
[LightGBM] [Info] Number of data points in the train set: 9744, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333333 -> initscore=-0.693147
[LightGBM] [Info] Start training from score -0.693147
최적 하이퍼파라미터: {'lgbm__learning_rate': 0.05, 'lgbm__max_depth': 5, 'lgbm__n_estimators': 100, 'lgbm__subsample': 0.8, 'undersample__sampling_strategy': 0.5}
최고 ROC AUC: 0.794597076335927

📌 Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.91      0.93     10965
           1       0.46      0.58      0.51      1392

    accur

In [50]:
best_xgb = grid_search.best_estimator_
with open("/content/drive/MyDrive/bank_marketing/model/lgbm_under.pkl", "wb") as model_file:
    pickle.dump(best_xgb, model_file)

4. 오버 & 언더

In [51]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('undersample', RandomUnderSampler(random_state=42)),
    ('lgbm', LGBMClassifier(random_state=42))
])

param_grid = {
    'smote__sampling_strategy': [0.5, 0.7, 1.0],  # 오버샘플링 비율
    'undersample__sampling_strategy': [0.5, 0.8],  # 언더샘플링 비율
    'lgbm__n_estimators': [50, 100, 200],
    'lgbm__max_depth': [-1, 3, 5],  # LightGBM에서는 -1이 자동
    'lgbm__learning_rate': [0.01, 0.05, 0.1],
    'lgbm__subsample': [0.8, 1.0],
}


grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

print("최적 하이퍼파라미터:", grid_search.best_params_)
print("최고 ROC AUC:", grid_search.best_score_)

best_xgb = grid_search.best_estimator_
y_pred_prob = best_xgb.predict_proba(X_val)[:, 1]
y_pred_best = best_xgb.predict(X_val)


print_metrics(y_val,y_pred_best,y_pred_prob)


Fitting 3 folds for each of 324 candidates, totalling 972 fits


486 fits failed out of a total of 972.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
486 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/imblearn/pipeline.py", line 518, in fit
    Xt, yt = self._fit(X, y, routed_params, raw_params=params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/l

[LightGBM] [Info] Number of positive: 12791, number of negative: 25582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011618 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1830
[LightGBM] [Info] Number of data points in the train set: 38373, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333333 -> initscore=-0.693147
[LightGBM] [Info] Start training from score -0.693147
최적 하이퍼파라미터: {'lgbm__learning_rate': 0.1, 'lgbm__max_depth': 5, 'lgbm__n_estimators': 200, 'lgbm__subsample': 0.8, 'smote__sampling_strategy': 0.5, 'undersample__sampling_strategy': 0.5}
최고 ROC AUC: 0.7909586899888222

📌 Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.98      0.95     10965
           1       0.62      0.27      0.38      1392

    accuracy                       

In [52]:
best_xgb = grid_search.best_estimator_
with open("/content/drive/MyDrive/bank_marketing/model/lgbm_over+under.pkl", "wb") as model_file:
    pickle.dump(best_xgb, model_file)